From abad7da850420e01eb5709d4b0e8740005e67214 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 12 May 2013 10:18:35 +0530
Subject: [PATCH] pep8

---
 src/calibre/ebooks/conversion/preprocess.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 7e5873edd2..91f91c8b3d 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -14,7 +14,7 @@ SVG_NS = 'http://www.w3.org/2000/svg'
 XLINK_NS = 'http://www.w3.org/1999/xlink'
 
 convert_entities = functools.partial(entity_to_unicode,
-        result_exceptions = {
+        result_exceptions={
             u'<' : '&lt;',
             u'>' : '&gt;',
             u"'" : '&apos;',
@@ -144,9 +144,9 @@ class DocAnalysis(object):
         percent is the percentage of lines that should be in a single bucket to return true
         The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
         '''
-        minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
-        maxLineLength=1900 # Discard larger than this to stay in range
-        buckets=20 # Each line is divided into a bucket based on length
+        minLineLength=20  # Ignore lines under 20 chars (typical of spaces)
+        maxLineLength=1900  # Discard larger than this to stay in range
+        buckets=20  # Each line is divided into a bucket based on length
 
         #print "there are "+str(len(lines))+" lines"
         #max = 0
@@ -156,7 +156,7 @@ class DocAnalysis(object):
         #        max = l
         #print "max line found is "+str(max)
         # Build the line length histogram
-        hRaw = [ 0 for i in range(0,buckets) ]
+        hRaw = [0 for i in range(0,buckets)]
         for line in self.lines:
             l = len(line)
             if l > minLineLength and l < maxLineLength:
@@ -167,7 +167,7 @@ class DocAnalysis(object):
         # Normalize the histogram into percents
         totalLines = len(self.lines)
         if totalLines > 0:
-            h = [ float(count)/totalLines for count in hRaw ]
+            h = [float(count)/totalLines for count in hRaw]
         else:
             h = []
         #print "\nhRaw histogram lengths are: "+str(hRaw)
@@ -200,7 +200,7 @@ class Dehyphenator(object):
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
         # only remove if it's not already the point of hyphenation
-        self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
+        self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"  # noqa
         self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
         self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
@@ -265,19 +265,18 @@ class Dehyphenator(object):
         self.html = html
         self.format = format
         if format == 'html':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)  # noqa
         elif format == 'pdf':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'txt':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)  # noqa
         elif format == 'individual_words':
             intextmatch = re.compile(u'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)')
         elif format == 'html_cleanup':
-            intextmatch = re.compile(u'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+            intextmatch = re.compile(u'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')  # noqa
         elif format == 'txt_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\W\-]+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
 
-
         html = intextmatch.sub(self.dehyphenate, html)
         return html
 
@@ -581,7 +580,7 @@ class HTMLPreProcessor(object):
                 end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),  # noqa
                 )
 
         for rule in self.PREPROCESS + start_rules:
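
Note (reviewer aside, not part of the patch): the -144/-156/-167 hunks only reformat DocAnalysis.line_histogram, which sorts line lengths into 20 buckets and normalises the counts to decide whether a document uses hard line breaks. A minimal standalone sketch of that bucketing idea follows; the names are invented for illustration and the final comparison step is a guess, since the patch context does not show how the percentages are ultimately used.

def line_histogram(lines, percent=0.85):
    # Same constants as the hunk context above
    min_line_length = 20    # ignore lines under 20 chars (typical of spaces)
    max_line_length = 1900  # discard longer lines to stay in range
    buckets = 20            # each line falls into a bucket based on its length
    h_raw = [0 for _ in range(buckets)]
    for line in lines:
        l = len(line)
        if min_line_length < l < max_line_length:
            h_raw[int(l / 100)] += 1
    total = len(lines)
    # Normalise the histogram into percentages of all lines
    h = [float(count) / total for count in h_raw] if total > 0 else []
    # Hard-wrapped documents concentrate most lines in one or two buckets
    return any(p >= percent for p in h)

For example, line_histogram(text.splitlines(), 0.85) would return True for text wrapped at a fixed column, because nearly every line lands in the same length bucket.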