preprocess updates for lit, html, and pdf

ldolse 2010-09-11 21:02:44 +10:00
parent 480eccb0b0
commit cf7cc4de4d
6 changed files with 129 additions and 142 deletions


@@ -214,7 +214,6 @@ class HTMLPreProcessor(object):
 (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
 (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
 # If pdf printed from a browser then the header/footer has a reliable pattern
 (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
@@ -225,13 +224,6 @@ class HTMLPreProcessor(object):
 (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
 # Remove <hr> tags
 (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
-# Replace <br><br> with <p>
-# (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),
-# unwrap hyphenation - don't delete the hyphen (often doesn't split words)
-#(re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
-# unwrap/delete soft hyphens
-#(re.compile(u'[­]\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
 # Remove gray background
 (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
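For context, here is a minimal standalone sketch of how rules like the ones in this hunk are applied: the preprocessor keeps a list of (compiled regex, substitution handler) pairs and runs them over the HTML in order. The two patterns are copied from the diff above; the harness around them (the rule list name and apply_rules) is illustrative only, not the real HTMLPreProcessor API.

import re

# Illustrative harness only; the real HTMLPreProcessor holds a much longer
# list of (regex, handler) pairs. These two entries are copied from the hunk.
PDF_CLEANUP_RULES = [
    # Strip browser print header/footer lines such as file:///C:/...<br>
    (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
    # Downgrade <hr> tags to a simple line break
    (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
]

def apply_rules(html, rules=PDF_CLEANUP_RULES):
    for pattern, handler in rules:
        html = pattern.sub(handler, html)
    return html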


@@ -3,4 +3,124 @@
 __license__ = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
+
+import re
+from calibre.ebooks.conversion.preprocess import line_length
+from calibre.utils.logging import default_log
+from lxml import etree
+
+
+class PreProcessor(object):
+    html_preprocess_sections = 0
+
+    def __init__(self, args):
+        self.args = args
+        self.log = default_log
+
+    def chapter_head(self, match):
+        chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+            return '<h2>'+chap+'</h2>\n'
+        else:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+            return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
+
+    def chapter_link(self, match):
+        chap = match.group('sectionlink')
+        if not chap:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
+            return '<br style="page-break-before:always">'
+        else:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
+            return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>'
+
+    def no_markup(self, raw, percent):
+        '''
+        Detects total marked up line endings in the file. raw is the text to
+        inspect. Percent is the minimum percent of line endings which should
+        be marked up to return true.
+        '''
+        htm_end_ere = re.compile('</p>', re.DOTALL)
+        line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
+        htm_end = htm_end_ere.findall(raw)
+        line_end = line_end_ere.findall(raw)
+        tot_htm_ends = len(htm_end)
+        tot_ln_fds = len(line_end)
+        self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+        if percent > 1:
+            percent = 1
+        if percent < 0:
+            percent = 0
+        min_lns = tot_ln_fds * percent
+        self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true")
+        if min_lns > tot_htm_ends:
+            return True
+
+    def __call__(self, html):
+        self.log("********* Preprocessing HTML *********")
+        # remove non-breaking spaces
+        html = re.sub(ur'\u00a0', ' ', html)
+        # Get rid of empty <o:p> tags to simplify other processing
+        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
+        # Get rid of empty span tags
+        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
+        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
+        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
+        blanklines = blankreg.findall(html)
+        lines = linereg.findall(html)
+        if len(lines) > 1:
+            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+            if float(len(blanklines)) / float(len(lines)) > 0.40:
+                self.log("deleting blank lines")
+                html = blankreg.sub('', html)
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = re.sub(r"\s*</p>", "</p>\n", html)
+        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+        # some lit files don't have any <p> tags or equivalent, check and
+        # mark up line endings if required before proceeding
+        if self.no_markup(html, 0.1):
+            self.log("not enough paragraph markers, adding now")
+            add_markup = re.compile('(?<!>)(\n)')
+            html = add_markup.sub('</p>\n<p>', html)
+        # detect chapters/sections to match xpath or splitting logic
+        #
+        # Start with most typical chapter headings
+        chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
+        html = chapdetect.sub(self.chapter_head, html)
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            html = chapdetect2.sub(self.chapter_head, html)
+        #
+        # Unwrap lines using punctation if the median length of all lines is less than 200
+        length = line_length('html', html, 0.4)
+        self.log("*** Median line length is " + str(length) + " ***")
+        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+        if length < 200:
+            self.log("Unwrapping Lines")
+            html = unwrap.sub(' ', html)
+        # If still no sections after unwrapping lines break on lines with no punctuation
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
+            #self.log(html)
+            chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
+            html = chapdetect3.sub(self.chapter_head, html)
+        # search for places where a first or second level heading is immediately followed by another
+        # top level heading. demote the second heading to h3 to prevent splitting between chapter
+        # headings and titles, images, etc
+        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
+        return html
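Both input-plugin diffs below reduce their preprocess_html hooks to the same two-line call into this new class. A minimal sketch of the wiring, assuming only what those hunks show (the host class here is hypothetical; the constructor argument is stored as self.args and is not otherwise used by __call__):

from calibre.ebooks.conversion.utils import PreProcessor

class ExampleInput(object):
    # Hypothetical host class for illustration; LITInput and HTMLInput below
    # gain exactly this preprocess_html body.
    def preprocess_html(self, html):
        preprocessor = PreProcessor(html)
        html = preprocessor(html)
        return html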


@@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
 from calibre import unicode_path
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class Link(object):
     '''
@@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin):
         return (None, raw)
 
     def preprocess_html(self, html):
-        if not hasattr(self, 'log'):
-            from calibre.utils.logging import default_log
-            self.log = default_log
-        self.log("********* Preprocessing HTML - HTML Input plugin *********")
-        # Detect Chapters to match the xpath in the GUI
-        chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
-        html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
-        # Unwrap lines using punctation if the median length of all lines is less than 150
-        #
-        # Insert extra line feeds so the line length regex functions properly
-        html = re.sub(r"</p>", "</p>\n", html)
-        length = line_length('html', html, 0.4)
-        self.log.debug("*** Median length is " + str(length) + " ***")
-        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        if length < 150:
-            html = unwrap.sub(' ', html)
+        preprocessor = PreProcessor(html)
+        html = preprocessor(html)
         return html


@@ -6,10 +6,8 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import re
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class LITInput(InputFormatPlugin):
@@ -18,7 +16,6 @@ class LITInput(InputFormatPlugin):
     author = 'Marshall T. Vandegrift'
     description = 'Convert LIT files to HTML'
     file_types = set(['lit'])
-    html_preprocess_sections = 0
 
     def convert(self, stream, options, file_ext, log,
                 accelerators):
@@ -57,115 +54,7 @@ class LITInput(InputFormatPlugin):
     def preprocess_html(self, html):
-
-        def chapter_head(match):
-            chap = match.group('chap')
-            title = match.group('title')
-            if not title:
-                self.html_preprocess_sections = self.html_preprocess_sections + 1
-                self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
-                return '<h2>'+chap+'</h2>\n'
-            else:
-                self.html_preprocess_sections = self.html_preprocess_sections + 1
-                self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
-                return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
-
-        def chapter_link(match):
-            chap = match.group('sectionlink')
-            if not chap:
-                self.html_preprocess_sections = self.html_preprocess_sections + 1
-                self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
-                return '<br style="page-break-before:always">'
-            else:
-                self.html_preprocess_sections = self.html_preprocess_sections + 1
-                self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
-                return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>'
-
-        def no_markup(raw, percent):
-            '''
-            Detects total marked up line endings in the file. raw is the text to
-            inspect. Percent is the minimum percent of line endings which should
-            be marked up to return true.
-            '''
-            htm_end_ere = re.compile('</p>', re.DOTALL)
-            line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
-            htm_end = htm_end_ere.findall(raw)
-            line_end = line_end_ere.findall(raw)
-            tot_htm_ends = len(htm_end)
-            tot_ln_fds = len(line_end)
-            self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
-            if percent > 1:
-                percent = 1
-            if percent < 0:
-                percent = 0
-            min_lns = tot_ln_fds * percent
-            self.log("There must be more than " + str(min_lns) + " unmarked lines to return true")
-            if min_lns > tot_htm_ends:
-                return True
-
-        self.log("********* Preprocessing HTML *********")
-        # remove non-breaking spaces
-        html = re.sub(ur'\u00a0', ' ', html)
-        # Get rid of empty <o:p> tags to simplify other processing
-        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
-        # Get rid of empty span tags
-        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
-        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
-        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
-        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
-        blanklines = blankreg.findall(html)
-        lines = linereg.findall(html)
-        if len(lines) > 1:
-            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
-            if float(len(blanklines)) / float(len(lines)) > 0.40:
-                self.log("deleting blank lines")
-                html = blankreg.sub('', html)
-        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</p>", "</p>\n", html)
-        # some lit files don't have any <p> tags or equivalent, check and
-        # mark up line endings if required before proceeding
-        if no_markup(html, 0.1):
-            self.log("not enough paragraph markers, adding now")
-            add_markup = re.compile('(?<!>)(\n)')
-            html = add_markup.sub('</p>\n<p>', html)
-        # detect chapters/sections to match xpath or splitting logic
-        #
-        # Mark split points based on embedded links
-        chaplink = re.compile(r'<a\sname[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE)
-        html = chaplink.sub(chapter_link, html)
-        # Continue with alternate patterns, start with most typical chapter headings
-        if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
-            html = chapdetect.sub(chapter_head, html)
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
-            html = chapdetect2.sub(chapter_head, html)
-        #
-        # Unwrap lines using punctation if the median length of all lines is less than 150
-        length = line_length('html', html, 0.4)
-        self.log("*** Median line length is " + str(length) + " ***")
-        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        if length < 150:
-            self.log("Unwrapping Lines")
-            html = unwrap.sub(' ', html)
-        # If still no sections after unwrapping lines break on lines with no punctuation
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
-            #self.log(html)
-            chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
-            html = chapdetect3.sub(chapter_head, html)
-        # search for places where a first or second level heading is immediately followed by another
-        # top level heading. demote the second heading to h3 to prevent splitting between chapter
-        # headings and titles, images, etc
-        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
-        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
+        preprocessor = PreProcessor(html)
+        html = preprocessor(html)
         return html


@@ -21,7 +21,7 @@ class Reader(FormatReader):
         self.options = options
         setattr(self.options, 'new_pdf_engine', False)
         setattr(self.options, 'no_images', False)
-        setattr(self.options, 'unwrap_factor', 0.5)
+        setattr(self.options, 'unwrap_factor', 0.45)
 
     def extract_content(self, output_dir):
         self.log.info('Extracting PDF...')


@@ -25,7 +25,7 @@ class PDFInput(InputFormatPlugin):
         OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
            help=_('Scale used to determine the length at which a line should '
            'be unwrapped. Valid values are a decimal between 0 and 1. The '
-           'default is 0.45, this is the median line length.')),
+           'default is 0.45, just below the median line length.')),
         OptionRecommendation(name='new_pdf_engine', recommended_value=False,
            help=_('Use the new PDF conversion engine.'))
     ])
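The reworded help text reflects how the factor is consumed: it is passed as the percent argument to line_length(), so the 0.45 default yields a threshold just below the median line length used by the unwrapping regexes above. A hedged sketch of that relationship, reusing the 'html' form of the call shown earlier because this diff does not include the PDF call site (the helper name is hypothetical):

from calibre.ebooks.conversion.preprocess import line_length

def unwrap_threshold(html, unwrap_factor=0.45):
    # line_length() takes a percent argument; 0.45 picks a length just below
    # the median, and the preprocessor only unwraps when the measured length
    # falls under its cutoff (200 in the new utils.py path, 150 in the old
    # LIT and HTML input paths).
    return line_length('html', html, unwrap_factor)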