From 1272988089814321248ffe0c58232f1d061a67a3 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 16 Jan 2011 20:11:52 +0800
Subject: [PATCH] enabled hyphen removal across the entire document text,
 refactored logic to reduce false positives, added verbose debug output

---
 src/calibre/ebooks/conversion/preprocess.py | 47 +++++++++-----
 src/calibre/ebooks/conversion/utils.py      | 69 +++++++++++----------
 src/calibre/ebooks/txt/input.py             |  4 +-
 3 files changed, 72 insertions(+), 48 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index abaff77f33..9dedd05e33 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -174,13 +174,19 @@ class Dehyphenator(object):
     retain hyphens.
     '''
 
-    def __init__(self):
+    def __init__(self, verbose=0, log=None):
+        self.log = default_log if log is None else log
+        self.verbose = verbose
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
+        # only remove if it's not already the point of hyphenation
+        self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
+        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
+        self.prefix_string = '^(dis|re|un|in|ex)'
+        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
 
     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
@@ -191,31 +197,44 @@ class Dehyphenator(object):
             wraptags = ''
         hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
         dehyphenated = unicode(firsthalf) + unicode(secondhalf)
-        lookupword = self.removesuffixes.sub('', dehyphenated)
-        if self.prefixes.match(firsthalf) is None:
+        if self.suffixes.match(secondhalf) is None:
+            lookupword = self.removesuffixes.sub('', dehyphenated)
+        else:
+            lookupword = dehyphenated
+        if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        if self.verbose > 2:
+            self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
        try:
            searchresult = self.html.find(lookupword.lower())
        except:
            return hyphenated
        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
            if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log(" Cleanup:returned dehyphenated word: " + str(dehyphenated))
                return dehyphenated
            elif self.html.find(hyphenated) != -1:
-                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log(" Cleanup:returned hyphenated word: " + str(hyphenated))
                return hyphenated
            else:
-                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                if self.verbose > 2:
+                    self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
                return firsthalf+u'\u2014'+wraptags+secondhalf
        else:
+            if len(firsthalf) <= 2 and len(secondhalf) <= 2:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + str(hyphenated))
+                return hyphenated
            if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log(" returned dehyphenated word: " + str(dehyphenated))
                return dehyphenated
            else:
-                #print " returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log(" returned hyphenated word: " + str(hyphenated))
                return hyphenated
 
     def __call__(self, html, format, length=1):
@@ -228,7 +247,7 @@ class Dehyphenator(object):
         elif format == 'txt':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)') # for later, not called anywhere yet
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
         elif format == 'txt_cleanup':
@@ -512,7 +531,7 @@ class HTMLPreProcessor(object):
 
         if is_pdftohtml and length > -1:
             # Dehyphenate
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
             html = dehyphenator(html,'html', length)
 
         if is_pdftohtml:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 96a9a4783d..4a118d423c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -322,11 +322,11 @@ class HeuristicProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Delete microsoft 'smart' tags
         html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Get rid of empty span, bold, font, & italics tags
-        html = re.sub(r'\s*<span[^>]*>\s*</span>\s*', '', html)
+        # Get rid of empty span, bold, font, em, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
         self.deleted_nbsps = True
         return html
@@ -376,27 +376,31 @@ class HeuristicProcessor(object):
         except:
             self.log("Can't get wordcount")
 
-        if 0 < self.totalwords < 50:
+        print "found "+unicode(self.totalwords)+" words in the flow"
+        if self.totalwords < 50:
             self.log("flow is too short, not running heuristics")
             return html
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
 
-        ###### Check Markup ######
-        #
-        # some lit files don't have any <p> tags or equivalent (generally just plain text between
-        # <pre> tags), check and  mark up line endings if required before proceeding
-        if self.no_markup(html, 0.1):
-            self.log("not enough paragraph markers, adding now")
-            # markup using text processing
-            html = self.markup_pre(html)
+        if self.cleanup_required():
+            ###### Check Markup ######
+            #
+            # some lit files don't have any <p> tags or equivalent (generally just plain text between
+            # <pre> tags), check and  mark up line endings if required before proceeding
+            # fix indents must run after this step
+            if self.no_markup(html, 0.1):
+                self.log("not enough paragraph markers, adding now")
+                # markup using text processing
+                html = self.markup_pre(html)
 
         # Replace series of non-breaking spaces with text-indent
         if getattr(self.extra_opts, 'fix_indents', False):
             html = self.fix_nbsp_indents(html)
 
         if self.cleanup_required():
+            # fix indents must run before this step, as it removes non-breaking spaces
             html = self.cleanup_markup(html)
 
        # ADE doesn't render <br />, change to empty paragraphs
@@ -420,26 +424,26 @@
             self.log("deleting blank lines")
             html = self.multi_blank.sub('\n<p> </p>', html)
             html = self.blankreg.sub('', html)
+
+        # Determine line ending type
+        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+        # span are used for hard line breaks, p for new paragraphs. Determine which is used so
+        # that lines can be un-wrapped across page boundaries
+        format = self.analyze_line_endings(html)
+
+        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+        # more of the lines break in the same region of the document then unwrapping is required
+        docanalysis = DocAnalysis(format, html)
+        hardbreaks = docanalysis.line_histogram(.50)
+        self.log("Hard line breaks check returned "+unicode(hardbreaks))
+
+        # Calculate Length
+        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+        length = docanalysis.line_length(unwrap_factor)
+        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
 
        ###### Unwrap lines ######
        if getattr(self.extra_opts, 'unwrap_lines', False):
-            # Determine line ending type
-            # Some OCR sourced files have line breaks in the html using a combination of span & p tags
-            # span are used for hard line breaks, p for new paragraphs. Determine which is used so
-            # that lines can be un-wrapped across page boundaries
-            format = self.analyze_line_endings(html)
-
-            # Check Line histogram to determine if the document uses hard line breaks, If 50% or
-            # more of the lines break in the same region of the document then unwrapping is required
-            docanalysis = DocAnalysis(format, html)
-            hardbreaks = docanalysis.line_histogram(.50)
-            self.log("Hard line breaks check returned "+unicode(hardbreaks))
-
-            # Calculate Length
-            unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
-            length = docanalysis.line_length(unwrap_factor)
-            self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
-
            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
            if hardbreaks or unwrap_factor < 0.4:
                self.log("Unwrapping required, unwrapping Lines")
@@ -447,15 +451,16 @@
                dehyphenator = Dehyphenator()
                html = dehyphenator(html,'html', length)
                html = self.punctuation_unwrap(length, html, 'html')
-                #check any remaining hyphens, but only unwrap if there is a match
-                dehyphenator = Dehyphenator()
+                # unwrap remaining hyphens based on line length, but only remove if there is a match
+                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                html = dehyphenator(html,'html_cleanup', length)
 
        if getattr(self.extra_opts, 'dehyphenate', False):
            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
            self.log("Fixing hyphenated content")
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html,'html_cleanup', length)
+            html = dehyphenator(html, 'individual_words', length)
 
        # If still no sections after unwrapping mark split points on lines with no punctuation
        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 5cffbafe21..8bf33c4837 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -106,7 +106,7 @@ class TXTInput(InputFormatPlugin):
             log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
 
         # Dehyphenate
-        dehyphenator = Dehyphenator()
+        dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
         txt = dehyphenator(txt,'txt', length)
 
         # We don't check for block because the processor assumes block.
@@ -138,7 +138,7 @@ class TXTInput(InputFormatPlugin):
             setattr(options, 'dehyphenate', True)
 
         # Dehyphenate in cleanup mode for missed txt and markdown conversion
-        dehyphenator = Dehyphenator()
+        dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
         html = dehyphenator(html,'txt_cleanup', length)
         html = dehyphenator(html,'html_cleanup', length)
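
Note (not part of the patch): the Dehyphenator uses the document itself as
its dictionary, so a word broken across a line ending is only joined when
the joined form, or its stem after the new suffix/prefix handling, occurs
elsewhere in the text. Below is a minimal Python 2 sketch of how one might
exercise the patched class directly; the sample text, verbosity level, and
length value are illustrative assumptions, and it presumes default_log
(referenced by the new constructor) is importable in preprocess.py:

    from calibre.ebooks.conversion.preprocess import Dehyphenator

    # 'calibrated' occurs intact later in the flow, so the line-break
    # hyphen in 'cali-\nbrated' should be removed; a word whose joined
    # form never occurs elsewhere keeps its hyphen.
    txt = u'The instrument was cali-\nbrated twice; calibrated readings follow.'

    # verbose > 2 enables the self.log() debug output added by this patch
    dehyphenator = Dehyphenator(verbose=3)
    print dehyphenator(txt, 'txt', length=4)
    # prints: The instrument was calibrated twice; calibrated readings follow.

Because the lookup is keyed off the document's own vocabulary, words that
legitimately keep their hyphens (rare, made-up, or scientific terms that
never appear joined) are left alone, which is the false-positive reduction
this commit is after.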