From 1272988089814321248ffe0c58232f1d061a67a3 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 16 Jan 2011 20:11:52 +0800
Subject: [PATCH] enabled hyphen removal across the entire document text,
 refactored logic to reduce false positives, added verbose debug output

---
 src/calibre/ebooks/conversion/preprocess.py | 47 +++++++++-----
 src/calibre/ebooks/conversion/utils.py      | 69 +++++++++++----------
 src/calibre/ebooks/txt/input.py             |  4 +-
 3 files changed, 72 insertions(+), 48 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index abaff77f33..9dedd05e33 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -174,13 +174,19 @@ class Dehyphenator(object):
     retain hyphens.
     '''
 
-    def __init__(self):
+    def __init__(self, verbose=0, log=None):
+        self.log = default_log if log is None else log
+        self.verbose = verbose
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
+        # only remove if it's not already the point of hyphenation
+        self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
+        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
+        self.prefix_string = '^(dis|re|un|in|ex)'
+        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
 
     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
@@ -191,31 +197,44 @@ class Dehyphenator(object):
             wraptags = ''
         hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
         dehyphenated = unicode(firsthalf) + unicode(secondhalf)
-        lookupword = self.removesuffixes.sub('', dehyphenated)
-        if self.prefixes.match(firsthalf) is None:
+        if self.suffixes.match(secondhalf) is None:
+            lookupword = self.removesuffixes.sub('', dehyphenated)
+        else:
+            lookupword = dehyphenated
+        if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        if self.verbose > 2:
+            self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
        try:
            searchresult = self.html.find(lookupword.lower())
        except:
            return hyphenated
        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
            if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log(" Cleanup:returned dehyphenated word: " + str(dehyphenated))
                return dehyphenated
            elif self.html.find(hyphenated) != -1:
-                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log(" Cleanup:returned hyphenated word: " + str(hyphenated))
                return hyphenated
            else:
-                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                if self.verbose > 2:
+                    self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
                return firsthalf+u'\u2014'+wraptags+secondhalf
        else:
+            if len(firsthalf) <= 2 and len(secondhalf) <= 2:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + str(hyphenated))
+                return hyphenated
            if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log(" returned dehyphenated word: " + str(dehyphenated))
                return dehyphenated
            else:
-                #print " returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log(" returned hyphenated word: " + str(hyphenated))
                return hyphenated
 
     def __call__(self, html, format, length=1):
@@ -228,7 +247,7 @@ class Dehyphenator(object):
         elif format == 'txt':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)') # for later, not called anywhere yet
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
         elif format == 'txt_cleanup':
@@ -512,7 +531,7 @@ class HTMLPreProcessor(object):
 
         if is_pdftohtml and length > -1:
             # Dehyphenate
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
             html = dehyphenator(html,'html', length)
 
         if is_pdftohtml:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 96a9a4783d..4a118d423c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -322,11 +322,11 @@ class HeuristicProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Delete microsoft 'smart' tags
         html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Get rid of empty span, bold, font, & italics tags
-        html = re.sub(r'\s*<span[^>]*>\s*</span>\s*', '', html)
+        # Get rid of empty span, bold, font, em, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
         self.deleted_nbsps = True
         return html
@@ -376,27 +376,31 @@ class HeuristicProcessor(object):
         except:
             self.log("Can't get wordcount")
 
-        if 0 < self.totalwords < 50:
+        print "found "+unicode(self.totalwords)+" words in the flow"
+        if self.totalwords < 50:
             self.log("flow is too short, not running heuristics")
             return html
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
 
-        ###### Check Markup ######
-        #
-        # some lit files don't have any <p> tags or equivalent (generally just plain text between
-        # <pre> tags), check and  mark up line endings if required before proceeding
-        if self.no_markup(html, 0.1):
-            self.log("not enough paragraph markers, adding now")
-            # markup using text processing
-            html = self.markup_pre(html)
+        if self.cleanup_required():
+            ###### Check Markup ######
+            #
+            # some lit files don't have any <p> tags or equivalent (generally just plain text between
+            # <pre> tags), check and  mark up line endings if required before proceeding
+            # fix indents must run after this step
+            if self.no_markup(html, 0.1):
+                self.log("not enough paragraph markers, adding now")
+                # markup using text processing
+                html = self.markup_pre(html)
 
         # Replace series of non-breaking spaces with text-indent
         if getattr(self.extra_opts, 'fix_indents', False):
             html = self.fix_nbsp_indents(html)
 
         if self.cleanup_required():
+            # fix indents must run before this step, as it removes non-breaking spaces
             html = self.cleanup_markup(html)
 
        # ADE doesn't render <br />, change to empty paragraphs
@@ -420,26 +424,26 @@
             self.log("deleting blank lines")
             html = self.multi_blank.sub('\n<p> </p>', html)
             html = self.blankreg.sub('', html)
+
+        # Determine line ending type
+        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+        # span are used for hard line breaks, p for new paragraphs. Determine which is used so
+        # that lines can be un-wrapped across page boundaries
+        format = self.analyze_line_endings(html)
+
+        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+        # more of the lines break in the same region of the document then unwrapping is required
+        docanalysis = DocAnalysis(format, html)
+        hardbreaks = docanalysis.line_histogram(.50)
+        self.log("Hard line breaks check returned "+unicode(hardbreaks))
+
+        # Calculate Length
+        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+        length = docanalysis.line_length(unwrap_factor)
+        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
 
        ###### Unwrap lines ######
        if getattr(self.extra_opts, 'unwrap_lines', False):
-            # Determine line ending type
-            # Some OCR sourced files have line breaks in the html using a combination of span & p tags
-            # span are used for hard line breaks, p for new paragraphs. Determine which is used so
-            # that lines can be un-wrapped across page boundaries
-            format = self.analyze_line_endings(html)
-
-            # Check Line histogram to determine if the document uses hard line breaks, If 50% or
-            # more of the lines break in the same region of the document then unwrapping is required
-            docanalysis = DocAnalysis(format, html)
-            hardbreaks = docanalysis.line_histogram(.50)
-            self.log("Hard line breaks check returned "+unicode(hardbreaks))
-
-            # Calculate Length
-            unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
-            length = docanalysis.line_length(unwrap_factor)
-            self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
-
            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
            if hardbreaks or unwrap_factor < 0.4:
                self.log("Unwrapping required, unwrapping Lines")
@@ -447,15 +451,16 @@
                dehyphenator = Dehyphenator()
                html = dehyphenator(html,'html', length)
                html = self.punctuation_unwrap(length, html, 'html')
-                #check any remaining hyphens, but only unwrap if there is a match
-                dehyphenator = Dehyphenator()
+                # unwrap remaining hyphens based on line length, but only remove if there is a match
+                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                html = dehyphenator(html,'html_cleanup', length)
 
        if getattr(self.extra_opts, 'dehyphenate', False):
            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
            self.log("Fixing hyphenated content")
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html,'html_cleanup', length)
+            html = dehyphenator(html, 'individual_words', length)
 
        # If still no sections after unwrapping mark split points on lines with no punctuation
        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 5cffbafe21..8bf33c4837 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -106,7 +106,7 @@ class TXTInput(InputFormatPlugin):
             log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
 
         # Dehyphenate
-        dehyphenator = Dehyphenator()
+        dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
         txt = dehyphenator(txt,'txt', length)
 
         # We don't check for block because the processor assumes block.
@@ -138,7 +138,7 @@ class TXTInput(InputFormatPlugin):
             setattr(options, 'dehyphenate', True)
 
         # Dehyphenate in cleanup mode for missed txt and markdown conversion
-        dehyphenator = Dehyphenator()
+        dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
         html = dehyphenator(html,'txt_cleanup', length)
         html = dehyphenator(html,'html_cleanup', length)
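
Note (not part of the patch): the Dehyphenator uses the document itself as
its dictionary, so a word broken across a line ending is only joined when
the joined form, or its stem after the new suffix/prefix handling, occurs
elsewhere in the text. Below is a minimal Python 2 sketch of how one might
exercise the patched class directly; the sample text, verbosity level, and
length value are illustrative assumptions, and it presumes default_log
(referenced by the new constructor) is importable in preprocess.py:

    from calibre.ebooks.conversion.preprocess import Dehyphenator

    # 'calibrated' occurs intact later in the flow, so the line-break
    # hyphen in 'cali-\nbrated' should be removed; a word whose joined
    # form never occurs elsewhere keeps its hyphen.
    txt = u'The instrument was cali-\nbrated twice; calibrated readings follow.'

    # verbose > 2 enables the self.log() debug output added by this patch
    dehyphenator = Dehyphenator(verbose=3)
    print dehyphenator(txt, 'txt', length=4)
    # prints: The instrument was calibrated twice; calibrated readings follow.

Because the lookup is keyed off the document's own vocabulary, words that
legitimately keep their hyphens (rare, made-up, or scientific terms that
never appear joined) are left alone, which is the false-positive reduction
this commit is after.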