Preprocessing Updates

2025-07-09 03:04:10 -04:00 · 2010-09-04 15:12:29 +10:00 · 2010-09-04 15:12:29 +10:00 · 5c951fb962
commit 5c951fb962
parent 132df9b6c8
5 changed files with 132 additions and 14 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -62,6 +62,7 @@ def wrap_lines(match):
    else:
               return ital+' '
 def line_length(format, raw, percent):
    '''
    raw is the raw text to find the line length to use for wrapping.
@ -191,32 +192,36 @@ class HTMLPreProcessor(object):
                  (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
                  (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),
                  # If pdf printed from a browser then the header/footer has a reliable pattern
                  (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
                  # Center separator lines
                  (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
                  # Remove page links
                  (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
                  # Remove <hr> tags
                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
                  # Replace <br><br> with <p>
-                  (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
+                  # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),
-                  # Remove hyphenation
+                  # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
-                  (re.compile(r'-<br.*?>\n\r?'), lambda match: ''),
+                  (re.compile(r'(?<=[-–])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
                  # Remove gray background
                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
                  # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
+                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+(\s\w+)?)?\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
                  # Have paragraphs show better
                  (re.compile(r'<br.*?>'), lambda match : '<p>'),
                  # Clean up spaces
                  (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
                  # Connect paragraphs split by -
                  (re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''),
                  # Add space before and after italics
                  (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
                  (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
                 ]
    # Fix Book Designer markup
@ -293,6 +298,13 @@ class HTMLPreProcessor(object):
                import traceback
                print 'Failed to parse remove_footer regexp'
                traceback.print_exc()
        # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
        if getattr(self.extra_opts, 'preprocess_html', None):
            if is_pdftohtml:
                end_rules.append(
                    (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
                )
        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@ -494,7 +494,7 @@ class HTMLInput(InputFormatPlugin):
        if not hasattr(self, 'log'):
            from calibre.utils.logging import default_log
            self.log = default_log
-		self.log("*********  Preprocessing HTML  *********")
+		self.log("*********  Preprocessing HTML - HTML Input plugin *********")
 		# Detect Chapters to match the xpath in the GUI
 		chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
 		html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@ -11,12 +11,14 @@ import re
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.conversion.preprocess import line_length
 class LITInput(InputFormatPlugin):
    name        = 'LIT Input'
    author      = 'Marshall T. Vandegrift'
    description = 'Convert LIT files to HTML'
    file_types  = set(['lit'])
    html_preprocess_sections = 0
    def convert(self, stream, options, file_ext, log,
                accelerators):
@ -55,14 +57,104 @@ class LITInput(InputFormatPlugin):
 	def preprocess_html(self, html):
        def chapter_head(match):
            '''
            Replacement callback for the chapter detection regexes.

            Wraps the text captured by the 'chap' group in <h2> and, when the
            optional 'title' group also matched, wraps the title in <h3>.
            Every call increments self.html_preprocess_sections and logs
            what was marked.
            '''
            heading = match.group('chap')
            subtitle = match.group('title')
            # Each substitution counts as one section, with or without a title
            self.html_preprocess_sections += 1
            marked = "marked " + str(self.html_preprocess_sections)
            if subtitle:
                self.log(marked + " chapters & titles. - " + str(heading) + ", " + str(subtitle))
                return '<h2>'+heading+'</h2>\n<h3>'+subtitle+'</h3>\n'
            self.log(marked + " chapters. - " + str(heading))
            return '<h2>'+heading+'</h2>\n'
        def chapter_link(match):
            '''
            Replacement callback for the named-anchor (chaplink) regex.

            Always emits a <br> carrying a page-break-before style.  When the
            'sectionlink' group captured text, that text is additionally
            emitted as an <h2> heading after the break.  Every call
            increments self.html_preprocess_sections and logs the new count.
            '''
            link_text = match.group('sectionlink')
            self.html_preprocess_sections += 1
            marked = "marked " + str(self.html_preprocess_sections) + " section markers based on links"
            if link_text:
                self.log(marked + ". - " + str(link_text))
                return '<br clear="all" style="page-break-before:always">\n<h2>'+link_text+'</h2>'
            self.log(marked)
            return '<br style="page-break-before:always">'
        def no_markup(raw, percent):
            '''
            Report whether raw has too few marked-up line endings.

            Counts the closing </p> tags and the line breaks in raw.  Returns
            True when the number of </p> tags is below percent times the
            number of line breaks, and False otherwise.  percent is a
            fraction; values outside the range 0..1 are clamped to it.
            '''
            htm_end_ere = re.compile('</p>')
            # \r\n must be tried first: alternatives are tested left to right,
            # so with a bare \r listed earlier every Windows line ending
            # would be counted as two line feeds.
            line_end_ere = re.compile('(\r\n|\n|\r)')
            tot_htm_ends = len(htm_end_ere.findall(raw))
            tot_ln_fds = len(line_end_ere.findall(raw))
            self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
            # Clamp the threshold fraction into 0..1
            percent = min(max(percent, 0), 1)
            min_lns = tot_ln_fds * percent
            self.log("There must be more than " + str(min_lns) + " unmarked lines to be true")
            # Explicit boolean instead of falling off the end with None
            return min_lns > tot_htm_ends
 		self.log("*********  Preprocessing HTML  *********")
-		# Detect Chapters to match the xpath in the GUI
+		# remove non-breaking spaces
-		chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
+		html = re.sub(ur'\u00a0', ' ', html)
-		html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
+		# Get rid of empty <o:p> tags to simplify other processing
-		# Unwrap lines using punctation if the median length of all lines is less than 150
+		html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
 		# Get rid of empty span tags
        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
 		linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
        blanklines = blankreg.findall(html)
        lines = linereg.findall(html)
        if len(lines) > 1:
            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
            if float(len(blanklines)) / float(len(lines)) > 0.40:
                self.log("deleting blank lines")
                html = blankreg.sub('', html)
 		# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
 		html = re.sub(r"\s*</p>", "</p>\n", html)
 		# some lit files don't have any <p> tags or equivalent, check and 
 		# mark up line endings if required before proceeding
 		if no_markup(html, 0.1):
 		     self.log("not enough paragraph markers, adding now")
             add_markup = re.compile('(?<!>)(\n)')
             html = add_markup.sub('</p>\n<p>', html)
 		# detect chapters/sections to match xpath or splitting logic
 		#
-		# Insert extra line feeds so the line length regex functions properly
+		# Mark split points based on embedded links
-		html = re.sub(r"</p>", "</p>\n", html)
+		chaplink = re.compile(r'<a\sname[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE)
        html = chaplink.sub(chapter_link, html)
        # Continue with alternate patterns, start with most typical chapter headings
 		if self.html_preprocess_sections < 10:        
            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
            html = chapdetect.sub(chapter_head, html)
 		if self.html_preprocess_sections < 10:
 		    self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
 		    html = chapdetect2.sub(chapter_head, html)
        # search for places where a first or second level heading is immediately followed by another
        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
        # headings and titles, images, etc
        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
 		#    
 		# Unwrap lines using punctuation if the median length of all lines is less than 150
 		length = line_length('html', html, 0.4)
 		self.log("*** Median length is " + str(length) + " ***")
 		unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@ -3,6 +3,7 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import re
 from calibre.customize.conversion import InputFormatPlugin
 class MOBIInput(InputFormatPlugin):
@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin):
                    include_meta_content_type=False))
                accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
        return mr.created_opf_path
    def preprocess_html(self, html):
        '''
        Demote a heading that directly follows another top level heading.

        When an <h1> or <h2> element is followed by a second <h1> or <h2>,
        with nothing in between except whitespace and tags that do not open
        a heading, the second one is rewritten as <h3>.  This prevents
        splitting between chapter headings and titles, images, etc.

        Returns the modified html; text without such a heading pair is
        returned unchanged.
        '''
        doubleheading = re.compile(
            r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)'
            r'<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
        # Raw string: \g<name> is a regex group reference, not a string
        # escape, and non-raw '\g' is flagged as an invalid escape sequence.
        return doubleheading.sub(r'\g<firsthead><h3\g<secondhead></h3>', html)
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@ -408,6 +408,10 @@ class Page(object):
    # Fraction of text height that two strings' bottoms can differ by
    # for them to be considered to be part of the same text fragment
    LINE_FACTOR = 0.4
    # Percentage of the page height which should be considered header
    # or footer to be discarded from reflow considerations
    HEAD_FOOTER_MARGIN
    # Multiplies the average line height when determining row height
    # of a particular element to detect columns.