minor tweaks to preprocessing, backed out reflow change

2025-07-09 03:04:10 -04:00 · 2010-09-12 11:17:49 +10:00 · 2010-09-12 11:17:49 +10:00 · 9a06996b16
commit 9a06996b16
parent f6de0bef13
3 changed files with 21 additions and 23 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -319,8 +319,8 @@ class HTMLPreProcessor(object):
      
        # unwrap hyphenation - moved here so it's executed after header/footer removal
        if is_pdftohtml:
-            # unwrap visible dashes and hyphens - don't delete as 50% or more of the time these
-            # hyphens are for compound words, formatting, etc
+            # unwrap visible dashes and hyphens - don't delete they are often hyphens for
+            # for compound words, formatting, etc
            end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
            # unwrap/delete soft hyphens
            end_rules.append((re.compile(u'[](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -29,16 +29,12 @@ class PreProcessor(object):
                   self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
                   return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'

-    def chapter_link(self, match):
-        chap = match.group('sectionlink')
-        if not chap:
+    def chapter_break(self, match):
+        chap = match.group('section')
+        styles = match.group('styles')
        self.html_preprocess_sections = self.html_preprocess_sections + 1
-                   self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
-                   return '<br style="page-break-before:always">'
-        else:
-                   self.html_preprocess_sections = self.html_preprocess_sections + 1
-                   self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
-                   return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>'
+        self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
+        return '<'+styles+' style="page-break-before:always">'+chap

    def no_markup(self, raw, percent):
        '''
@ -74,7 +70,7 @@ class PreProcessor(object):
        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
        
        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
-        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
+        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
        blanklines = blankreg.findall(html)
        lines = linereg.findall(html)
@ -100,8 +96,13 @@ class PreProcessor(object):
        chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
        html = chapdetect.sub(self.chapter_head, html)
        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            html = chapdetect2.sub(self.chapter_head, html)    
+
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
            html = chapdetect2.sub(self.chapter_head, html)    
        #    
        # Unwrap lines using punctation if the median length of all lines is less than 200        
@ -111,12 +112,13 @@ class PreProcessor(object):
        if length < 200:
            self.log("Unwrapping Lines")
            html = unwrap.sub(' ', html)
+            
        # If still no sections after unwrapping lines break on lines with no punctuation
        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
+            self.log(str(self.html_preprocess_sections) + " split points marked, matching based on punctuation")
            #self.log(html)
-            chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
-            html = chapdetect3.sub(self.chapter_head, html)        
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</(i|b|u)>){0,2}\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            html = chapdetect3.sub(self.chapter_break, html)      
        # search for places where a first or second level heading is immediately followed by another
        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
        # headings and titles, images, etc
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@ -409,10 +409,6 @@ class Page(object):
    # for them to be considered to be part of the same text fragment
    LINE_FACTOR = 0.4

-    # Percentage of the page heigth which should be considered header
-    # or footer to be discarded from reflow considerations
-    HEAD_FOOTER_MARGIN
-
    # Multiplies the average line height when determining row height
    # of a particular element to detect columns.
    YFUZZ = 1.5