Various improvements to preprocessing code. Also replace a couple of unicode punctuation characters that the Sony readers don't support when the output profile is Sony.

Kovid Goyal 2010-09-28 15:43:22 -06:00
commit f54ee65bcc
6 changed files with 265 additions and 123 deletions

View File

@@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en'
 import datetime
+from calibre.web.feeds.news import BasicNewsRecipe
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     now = datetime.datetime.now()
     title = 'The AJC'

View File

@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Nealz Nuze'
     language = 'en'

View File

@@ -1,5 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
+import re
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Popular Science'

View File

@ -1,3 +1,4 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement from __future__ import with_statement
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
@@ -251,6 +252,9 @@ class OutputProfile(Plugin):
     #: The character used to represent a star in ratings
     ratings_char = u'*'
+    #: Unsupported unicode characters to be replaced during preprocessing
+    unsupported_unicode_chars = []

     @classmethod
     def tags_to_string(cls, tags):
         return escape(', '.join(tags))
@@ -422,6 +426,8 @@ class SonyReaderOutput(OutputProfile):
     dpi = 168.451
     fbase = 12
     fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
+    unsupported_unicode_chars = [u'\u201f', u'\u201b']

 class KoboReaderOutput(OutputProfile):
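
Any device profile can opt in to the same replacement by listing the code points its renderer cannot display. A minimal sketch (the profile name here is hypothetical; it simply mirrors the SonyReaderOutput change above):

    class MyReaderOutput(OutputProfile):
        name = 'My Reader'  # hypothetical profile for illustration
        # U+201F and U+201B are reversed quotation marks that this
        # hypothetical device cannot render; they will be degraded to ASCII.
        unsupported_unicode_chars = [u'\u201f', u'\u201b']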

View File

@@ -62,49 +62,104 @@ def wrap_lines(match):
     else:
         return ital+' '

-def line_length(format, raw, percent):
+class DocAnalysis(object):
     '''
-    raw is the raw text to find the line length to use for wrapping.
-    percentage is a decimal number, 0 - 1 which is used to determine
-    how far in the list of line lengths to use. The list of line lengths is
-    ordered smallest to largest and does not include duplicates. 0.5 is the
-    median value.
+    Provides various text analysis functions to determine how the document is structured.
+    format is the type of document that analysis will be done against.
+    raw is the raw text to determine the line length to use for wrapping.
+    Blank lines are excluded from analysis
     '''
-    raw = raw.replace('&nbsp;', ' ')
-    if format == 'html':
-        linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
-    elif format == 'pdf':
-        linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
-    elif format == 'spanned_html':
-        linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
-    lines = linere.findall(raw)

-    lengths = []
-    for line in lines:
-        if len(line) > 0:
-            lengths.append(len(line))
+    def __init__(self, format='html', raw=''):
+        raw = raw.replace('&nbsp;', ' ')
+        if format == 'html':
+            linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
+        elif format == 'pdf':
+            linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
+        elif format == 'spanned_html':
+            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        self.lines = linere.findall(raw)

-    if not lengths:
-        return 0
+    def line_length(self, percent):
+        '''
+        Analyses the document to find the median line length.
+        percentage is a decimal number, 0 - 1 which is used to determine
+        how far in the list of line lengths to use. The list of line lengths is
+        ordered smallest to largest and does not include duplicates. 0.5 is the
+        median value.
+        '''
+        lengths = []
+        for line in self.lines:
+            if len(line) > 0:
+                lengths.append(len(line))

-    lengths = list(set(lengths))
-    total = sum(lengths)
-    avg = total / len(lengths)
-    max_line = avg * 2
+        if not lengths:
+            return 0

-    lengths = sorted(lengths)
-    for i in range(len(lengths) - 1, -1, -1):
-        if lengths[i] > max_line:
-            del lengths[i]
+        lengths = list(set(lengths))
+        total = sum(lengths)
+        avg = total / len(lengths)
+        max_line = avg * 2

-    if percent > 1:
-        percent = 1
-    if percent < 0:
-        percent = 0
+        lengths = sorted(lengths)
+        for i in range(len(lengths) - 1, -1, -1):
+            if lengths[i] > max_line:
+                del lengths[i]

-    index = int(len(lengths) * percent) - 1
+        if percent > 1:
+            percent = 1
+        if percent < 0:
+            percent = 0

-    return lengths[index]
+        index = int(len(lengths) * percent) - 1
+        return lengths[index]
+
+    def line_histogram(self, percent):
+        '''
+        Creates a broad histogram of the document to determine whether it incorporates hard
+        line breaks. Lines are sorted into 20 'buckets' based on length.
+        percent is the percentage of lines that should be in a single bucket to return true
+        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
+        '''
+        minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
+        maxLineLength=1900 # Discard larger than this to stay in range
+        buckets=20 # Each line is divided into a bucket based on length
+
+        #print "there are "+str(len(lines))+" lines"
+        #max = 0
+        #for line in self.lines:
+        #    l = len(line)
+        #    if l > max:
+        #        max = l
+        #print "max line found is "+str(max)
+        # Build the line length histogram
+        hRaw = [ 0 for i in range(0,buckets) ]
+        for line in self.lines:
+            l = len(line)
+            if l > minLineLength and l < maxLineLength:
+                l = int(l/100)
+                #print "adding "+str(l)
+                hRaw[l]+=1
+
+        # Normalize the histogram into percents
+        totalLines = len(self.lines)
+        h = [ float(count)/totalLines for count in hRaw ]
+        #print "\nhRaw histogram lengths are: "+str(hRaw)
+        #print "              percents are: "+str(h)+"\n"
+
+        # Find the biggest bucket
+        maxValue = 0
+        for i in range(0,len(h)):
+            if h[i] > maxValue:
+                maxValue = h[i]
+
+        if maxValue < percent:
+            #print "Line lengths are too variable. Not unwrapping."
+            return False
+        else:
+            #print str(maxValue)+" of the lines were in one bucket"
+            return True
class Dehyphenator(object):
    '''
@@ -117,42 +172,62 @@ class Dehyphenator(object):
     def __init__(self):
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
+        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
+        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)

     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
         secondhalf = match.group('secondpart')
+        try:
+            wraptags = match.group('wraptags')
+        except:
+            wraptags = ''
         hyphenated = str(firsthalf) + "-" + str(secondhalf)
         dehyphenated = str(firsthalf) + str(secondhalf)
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
         #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
-        match = booklookup.search(self.html)
-        if match:
-            #print "returned dehyphenated word: " + str(dehyphenated)
-            return dehyphenated
-        else:
-            #print "returned hyphenated word: " + str(hyphenated)
-            return hyphenated
+        try:
+            searchresult = self.html.find(str.lower(lookupword))
+        except:
+            return hyphenated
+        if self.format == 'html_cleanup':
+            if self.html.find(lookupword) != -1 or searchresult != -1:
+                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                return dehyphenated
+            elif self.html.find(hyphenated) != -1:
+                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                return hyphenated
+            else:
+                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                return firsthalf+u'\u2014'+wraptags+secondhalf
+        else:
+            if self.html.find(lookupword) != -1 or searchresult != -1:
+                #print "returned dehyphenated word: " + str(dehyphenated)
+                return dehyphenated
+            else:
+                #print " returned hyphenated word: " + str(hyphenated)
+                return hyphenated

     def __call__(self, html, format, length=1):
         self.html = html
+        self.format = format
         if format == 'html':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(<p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+        elif format == 'html_cleanup':
+            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')

         html = intextmatch.sub(self.dehyphenate, html)
         return html
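
For reference, the calling convention, taken from how the PreProcessor drives this class later in the commit: 'html' mode only joins a hyphen that sits at least `length` characters into a line, while the new 'html_cleanup' mode has no length guard and falls back to an em dash plus the captured wrap tags when neither the joined nor the hyphenated form occurs elsewhere in the text.

    dehyphenator = Dehyphenator()
    # primary pass over wrapped lines
    html = dehyphenator(html, 'html', length)
    # follow-up pass to catch leftovers from earlier conversions/edits
    html = dehyphenator(html, 'html_cleanup', length)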
class CSSPreProcessor(object):

    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
@@ -286,7 +361,7 @@ class HTMLPreProcessor(object):
                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),

                  # Detect Chapters to match default XPATH in GUI
-                 (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
+                 (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
                  # Cover the case where every letter in a chapter title is separated by a space
                  (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
@@ -374,10 +449,8 @@ class HTMLPreProcessor(object):
                 print 'Failed to parse remove_footer regexp'
                 traceback.print_exc()

-        # unwrap em/en dashes, delete soft hyphens - moved here so it's executed after header/footer removal
+        # delete soft hyphens - moved here so it's executed after header/footer removal
         if is_pdftohtml:
-            # unwrap em/en dashes
-            end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens
             end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens with formatting
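
To illustrate what the soft-hyphen rule matches, a standalone check (sample string invented; \xad is the same soft hyphen written as an escape):

    import re
    rule = re.compile(u'[\xad](\s*<p>)+\s*(?=[[a-z\d])')
    print rule.sub('', u'incompre\xad<p>hensible')
    # prints: incomprehensible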
@@ -391,12 +464,15 @@ class HTMLPreProcessor(object):
             length = -1
             if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
-                length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
+                docanalysis = DocAnalysis('pdf', html)
+                length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
                 if length:
-                    # print "The pdf line length returned is " + str(length)
+                    #print "The pdf line length returned is " + str(length)
+                    # unwrap em/en dashes
+                    end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                     end_rules.append(
                         # Unwrap using punctuation
-                        (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                        (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                     )

         for rule in self.PREPROCESS + start_rules:
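
To see what the re-introduced dash rule does, a standalone check (sample string invented; the %i lookbehind is hardcoded to 10 here, where the real rule substitutes the computed median line length):

    import re
    rule = re.compile(u'(?<=.{10}[–—])\s*<p>\s*(?=[[a-z\d])')
    print rule.sub('', u'It was almost dark—<p>the lamps were being lit.')
    # prints: It was almost dark—the lamps were being lit.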
@@ -454,6 +530,14 @@ class HTMLPreProcessor(object):
         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = self.smarten_punctuation(html)

+        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
+        if unsupported_unicode_chars:
+            from calibre.ebooks.unidecode.unidecoder import Unidecoder
+            unidecoder = Unidecoder()
+            for char in unsupported_unicode_chars:
+                asciichar = unidecoder.decode(char)
+                html = html.replace(char, asciichar)
+
         return html
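
Roughly, for the Sony profile this loop degrades the two reversed quotation marks to ASCII look-alikes. A sketch of the effect (the exact replacement characters come from Unidecoder's transliteration tables, so the output here is not asserted):

    from calibre.ebooks.unidecode.unidecoder import Unidecoder
    unidecoder = Unidecoder()
    for char in [u'\u201f', u'\u201b']:  # the SonyReaderOutput list above
        print repr(char), '->', repr(unidecoder.decode(char))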
    def smarten_punctuation(self, html):

View File

@@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

 import re
-from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log

 class PreProcessor(object):
@@ -77,13 +77,18 @@ class PreProcessor(object):
     def __call__(self, html):
         self.log("********* Preprocessing HTML *********")
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = re.sub(r"\s*</p>", "</p>\n", html)
+        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
         # <pre> tags), check and mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
             self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt procesor to mark up if so
+            # check if content is in pre tags, use txt processor to mark up if so
             pre = re.compile(r'<pre>', re.IGNORECASE)
             if len(pre.findall(html)) == 1:
                 self.log("Running Text Processing")
@@ -113,47 +118,77 @@ class PreProcessor(object):
         # Get rid of empty <o:p> tags to simplify other processing
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Get rid of empty span, bold, & italics tags
-        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)

-        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+        # If more than 40% of the lines are empty paragraphs and the user has enabled remove
+        # paragraph spacing then delete blank lines to clean up spacing
         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
+        blanks_between_paragraphs = False
         if len(lines) > 1:
             self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
             if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
                     'remove_paragraph_spacing', False):
                 self.log("deleting blank lines")
                 html = blankreg.sub('', html)
-            # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-            html = re.sub(r"\s*</p>", "</p>\n", html)
-            html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+            elif float(len(blanklines)) / float(len(lines)) > 0.40:
+                blanks_between_paragraphs = True
+                #print "blanks between paragraphs is marked True"
+            else:
+                blanks_between_paragraphs = False
+        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
         # detect chapters/sections to match xpath or splitting logic
+        #
+        # Build the Regular Expressions in pieces
+        lookahead = "(?=<(p|div))"
+        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        chapter_header_open = r"(?P<chap>"
+        chapter_header_close = ")\s*"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
+        if blanks_between_paragraphs:
+            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
+        else:
+            blank_lines = ""
+        opt_title_open = "("
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
+        title_header_open = "(?P<title>"
+        title_header_close = ")\s*"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
+        opt_title_close = ")?"
+        default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
+        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
+        numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
+        uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+
+        chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+        #print chapter_marker
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
         #
         # Start with most typical chapter headings, get more aggressive until one works
         if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
+            chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect2.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)

         ###### Unwrap lines ######
         #
-        self.log("Unwrapping Lines")
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags
         # span are used for hard line breaks, p for new paragraphs. Determine which is used so
         # that lines can be un-wrapped across page boundaries
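
Schematically, the assembled chapter_marker is meant to capture markup like the following (invented sample; plain tags with no nested font/span wrappers):

    # <p>Chapter 12</p>          -> group 'chap', via typical_chapters
    # <p[^>]*></p>               -> up to two blank paragraphs, consumed only when
    #                               blanks_between_paragraphs was detected above
    # <p>The Long Road Home</p>  -> optional group 'title', via default_title
    #
    # The numeric and uppercase passes swap only the middle alternation
    # (numeric_chapters / uppercase_chapters); every other piece is reused.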
@@ -168,25 +203,40 @@ class PreProcessor(object):
             format = 'html'
         else:
             format = 'html'
+        # Check Line histogram to determine if the document uses hard line breaks. If 50% or
+        # more of the lines break in the same region of the document then unwrapping is required
+        docanalysis = DocAnalysis(format, html)
+        hardbreaks = docanalysis.line_histogram(.50)
+        self.log("Hard line breaks check returned "+str(hardbreaks))
         # Calculate Length
-        length = line_length(format, html, getattr(self.extra_opts,
-            'html_unwrap_factor', 0.4))
+        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+        length = docanalysis.line_length(unwrap_factor)
         self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
-        max_length = length * 1.4
-        min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})")
-        #
-        # Unwrap em/en dashes, delete soft-hyphens
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
-        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
-        html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
-        # Dehyphenate
-        dehyphenator = Dehyphenator()
-        html = dehyphenator(html,'html', length)
-        # Unwrap lines using punctuation and line length
-        unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        html = unwrap.sub(' ', html)
+        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
+        if hardbreaks or unwrap_factor < 0.4:
+            self.log("Unwrapping required, unwrapping Lines")
+            # Unwrap em/en dashes
+            html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
+            # Dehyphenate
+            self.log("Unwrapping/Removing hyphens")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html', length)
+            self.log("Done dehyphenating")
+            # Unwrap lines using punctuation and line length
+            unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+            html = unwrap.sub(' ', html)
+            # check any remaining hyphens, but only unwrap if there is a match
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+        else:
+            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+            self.log("Cleaning up hyphenation")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+            self.log("Done dehyphenating")
+        # delete soft hyphens
+        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)

         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < 10: