Preprocessing Updates

2025-07-09 03:04:10 -04:00 · 2010-09-04 15:12:29 +10:00 · 2010-09-04 15:12:29 +10:00 · 5c951fb962
commit 5c951fb962
parent 132df9b6c8
5 changed files with 132 additions and 14 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -62,6 +62,7 @@ def wrap_lines(match):
    else:
               return ital+' '

+
 def line_length(format, raw, percent):
    '''
    raw is the raw text to find the line length to use for wrapping.
@ -191,32 +192,36 @@ class HTMLPreProcessor(object):
                  (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
                  (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),

+                  # If pdf printed from a browser then the header/footer has a reliable pattern
+                  (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
+
+                  # Center separator lines
+                  (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
+
                  # Remove page links
                  (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
                  # Remove <hr> tags
                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
                  # Replace <br><br> with <p>
-                  (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
+                  # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),

-                  # Remove hyphenation
-                  (re.compile(r'-<br.*?>\n\r?'), lambda match: ''),
+                  # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
+                  (re.compile(r'(?<=[-–])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),

                  # Remove gray background
                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),

                  # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
+                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+(\s\w+)?)?\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),

                  # Have paragraphs show better
                  (re.compile(r'<br.*?>'), lambda match : '<p>'),
                  # Clean up spaces
                  (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
-                  # Connect paragraphs split by -
-                  (re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''),
                  # Add space before and after italics
                  (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
                  (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
+                                   
                 ]

    # Fix Book Designer markup
@ -294,6 +299,13 @@ class HTMLPreProcessor(object):
                print 'Failed to parse remove_footer regexp'
                traceback.print_exc()
        
+        # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
+        if getattr(self.extra_opts, 'preprocess_html', None):
+            if is_pdftohtml:
+                end_rules.append(
+                    (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
+                )
+
        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
            if length:
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@ -494,7 +494,7 @@ class HTMLInput(InputFormatPlugin):
        if not hasattr(self, 'log'):
            from calibre.utils.logging import default_log
            self.log = default_log
-		self.log("*********  Preprocessing HTML  *********")
+		self.log("*********  Preprocessing HTML - HTML Input plugin *********")
 		# Detect Chapters to match the xpath in the GUI
 		chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
 		html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@ -11,12 +11,14 @@ import re
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.conversion.preprocess import line_length

+
 class LITInput(InputFormatPlugin):

    name        = 'LIT Input'
    author      = 'Marshall T. Vandegrift'
    description = 'Convert LIT files to HTML'
    file_types  = set(['lit'])
+    html_preprocess_sections = 0

    def convert(self, stream, options, file_ext, log,
                accelerators):
@ -55,14 +57,104 @@ class LITInput(InputFormatPlugin):


 	def preprocess_html(self, html):
+
+        def chapter_head(match):
+            chap = match.group('chap')
+            title = match.group('title')
+            if not title:
+                       self.html_preprocess_sections = self.html_preprocess_sections + 1
+                       self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+                       return '<h2>'+chap+'</h2>\n'
+            else:
+                       self.html_preprocess_sections = self.html_preprocess_sections + 1
+                       self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+                       return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
+
+        def chapter_link(match):
+            chap = match.group('sectionlink')
+            if not chap:
+                       self.html_preprocess_sections = self.html_preprocess_sections + 1
+                       self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
+                       return '<br style="page-break-before:always">'
+            else:
+                       self.html_preprocess_sections = self.html_preprocess_sections + 1
+                       self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
+                       return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>'
+
+
+        def no_markup(raw, percent):
+            '''
+            Detects total marked up line endings in the file. raw is the text to 
+            inspect.  Percent is the minimum percent of line endings which should 
+            be marked up to return true.
+            '''
+            htm_end_ere = re.compile('</p>', re.DOTALL)
+            line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
+            htm_end = htm_end_ere.findall(raw)
+            line_end = line_end_ere.findall(raw)
+            tot_htm_ends = len(htm_end)
+            tot_ln_fds = len(line_end)
+            self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+    
+            if percent > 1:
+                percent = 1
+            if percent < 0:
+                percent = 0    
+    
+            min_lns = tot_ln_fds * percent
+            self.log("There must be more than " + str(min_lns) + " unmarked lines to be true")
+            if min_lns > tot_htm_ends:
+                return True
+                
 		self.log("*********  Preprocessing HTML  *********")
-		# Detect Chapters to match the xpath in the GUI
-		chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
-		html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
-		# Unwrap lines using punctation if the median length of all lines is less than 150
+		# remove non-breaking spaces
+		html = re.sub(ur'\u00a0', ' ', html)
+		# Get rid of empty <o:p> tags to simplify other processing
+		html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
+		# Get rid of empty span tags
+        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
+        
+        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+		linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
+        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
+        blanklines = blankreg.findall(html)
+        lines = linereg.findall(html)
+        if len(lines) > 1:
+            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+            if float(len(blanklines)) / float(len(lines)) > 0.40:
+                self.log("deleting blank lines")
+                html = blankreg.sub('', html)
+		# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+		html = re.sub(r"\s*</p>", "</p>\n", html)
+		
+		# some lit files don't have any <p> tags or equivalent, check and 
+		# mark up line endings if required before proceeding
+		if no_markup(html, 0.1):
+		     self.log("not enough paragraph markers, adding now")
+             add_markup = re.compile('(?<!>)(\n)')
+             html = add_markup.sub('</p>\n<p>', html)
+        
+		# detect chapters/sections to match xpath or splitting logic
 		#
-		# Insert extra line feeds so the line length regex functions properly
-		html = re.sub(r"</p>", "</p>\n", html)
+		# Mark split points based on embedded links
+		chaplink = re.compile(r'<a\sname[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE)
+        html = chaplink.sub(chapter_link, html)
+        # Continue with alternate patterns, start with most typical chapter headings
+		if self.html_preprocess_sections < 10:        
+            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
+            html = chapdetect.sub(chapter_head, html)
+		if self.html_preprocess_sections < 10:
+		    self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+		    html = chapdetect2.sub(chapter_head, html)
+		    
+        # search for places where a first or second level heading is immediately followed by another
+        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
+        # headings and titles, images, etc
+        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
+		#    
+		# Unwrap lines using punctation if the median length of all lines is less than 150		
 		length = line_length('html', html, 0.4)
 		self.log("*** Median length is " + str(length) + " ***")
 		unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@ -3,6 +3,7 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

+import re
 from calibre.customize.conversion import InputFormatPlugin

 class MOBIInput(InputFormatPlugin):
@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin):
                    include_meta_content_type=False))
                accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
        return mr.created_opf_path
+
+    def preprocess_html(self, html):
+        # search for places where a first or second level heading is immediately followed by another
+        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
+        # headings and titles, images, etc
+        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
+        return html
+
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@ -409,6 +409,10 @@ class Page(object):
    # for them to be considered to be part of the same text fragment
    LINE_FACTOR = 0.4
    
+    # Percentage of the page heigth which should be considered header
+    # or footer to be discarded from reflow considerations
+    HEAD_FOOTER_MARGIN
+
    # Multiplies the average line height when determining row height
    # of a particular element to detect columns.
    YFUZZ = 1.5