From 5c951fb9628617133f17ead6d1393ea84b7c6412 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 4 Sep 2010 15:12:29 +1000 Subject: [PATCH] Preprocessing Updates --- src/calibre/ebooks/conversion/preprocess.py | 26 +++-- src/calibre/ebooks/html/input.py | 2 +- src/calibre/ebooks/lit/input.py | 104 ++++++++++++++++++-- src/calibre/ebooks/mobi/input.py | 10 ++ src/calibre/ebooks/pdf/reflow.py | 4 + 5 files changed, 132 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 957418f1fd..2954fd7c26 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -62,6 +62,7 @@ def wrap_lines(match): else: return ital+' ' + def line_length(format, raw, percent): ''' raw is the raw text to find the line length to use for wrapping. @@ -191,32 +192,36 @@ class HTMLPreProcessor(object): (re.compile(u'¸\s*()*\s*c', re.UNICODE), lambda match: u'ç'), (re.compile(u'¸\s*()*\s*C', re.UNICODE), lambda match: u'Ç'), + # If pdf printed from a browser then the header/footer has a reliable pattern + (re.compile(r'((?<=)\s*file:////?[A-Z].*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''), + + # Center separator lines + (re.compile(u'
\s*(?P([*#•]+\s*)+)\s*
'), lambda match: '

\n

' + match.group(1) + '

'), + # Remove page links (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove
tags (re.compile(r'', re.IGNORECASE), lambda match: '
'), # Replace

with

- (re.compile(r'\s*', re.IGNORECASE), lambda match: '

'), + # (re.compile(r'
\s*
', re.IGNORECASE), lambda match: '\n

'), - # Remove hyphenation - (re.compile(r'-\n\r?'), lambda match: ''), + # unwrap hyphenation - don't delete the hyphen (often doesn't split words) + (re.compile(r'(?<=[-–])\s*
\s*(?=[[a-z\d])'), lambda match: ''), # Remove gray background (re.compile(r']+>'), lambda match : ''), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(()?)?)]*>\s*(?P(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+(\s\w+)?)?\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), # Clean up spaces (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), - # Connect paragraphs split by - - (re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''), # Add space before and after italics (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), + ] # Fix Book Designer markup @@ -293,6 +298,13 @@ class HTMLPreProcessor(object): import traceback print 'Failed to parse remove_footer regexp' traceback.print_exc() + + # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives + if getattr(self.extra_opts, 'preprocess_html', None): + if is_pdftohtml: + end_rules.append( + (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + ) if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index d57bfddd3e..35a8a1a9bc 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -494,7 +494,7 @@ class HTMLInput(InputFormatPlugin): if not hasattr(self, 'log'): from calibre.utils.logging import default_log self.log = default_log - self.log("********* Preprocessing HTML *********") + self.log("********* Preprocessing HTML - HTML Input plugin *********") # Detect Chapters to match the xpath in the GUI chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 9bf20fb1d4..f7bb0fbfd9 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -11,12 +11,14 @@ import re from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.conversion.preprocess import line_length + class LITInput(InputFormatPlugin): name = 'LIT Input' author = 'Marshall T. Vandegrift' description = 'Convert LIT files to HTML' file_types = set(['lit']) + html_preprocess_sections = 0 def convert(self, stream, options, file_ext, log, accelerators): @@ -55,14 +57,104 @@ class LITInput(InputFormatPlugin): def preprocess_html(self, html): + + def chapter_head(match): + chap = match.group('chap') + title = match.group('title') + if not title: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) + return '<h2>'+chap+'</h2>\n' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) + return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' + + def chapter_link(match): + chap = match.group('sectionlink') + if not chap: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links") + return '<br style="page-break-before:always">' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap)) + return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>' + + + def no_markup(raw, percent): + ''' + Detects total marked up line endings in the file. raw is the text to + inspect. Percent is the minimum percent of line endings which should + be marked up to return true. + ''' + htm_end_ere = re.compile('</p>', re.DOTALL) + line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) + htm_end = htm_end_ere.findall(raw) + line_end = line_end_ere.findall(raw) + tot_htm_ends = len(htm_end) + tot_ln_fds = len(line_end) + self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***") + + if percent > 1: + percent = 1 + if percent < 0: + percent = 0 + + min_lns = tot_ln_fds * percent + self.log("There must be more than " + str(min_lns) + " unmarked lines to be true") + if min_lns > tot_htm_ends: + return True + self.log("********* Preprocessing HTML *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) - html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) - # Unwrap lines using punctation if the median length of all lines is less than 150 + # remove non-breaking spaces + html = re.sub(ur'\u00a0', ' ', html) + # Get rid of empty <o:p> tags to simplify other processing + html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) + # Get rid of empty span tags + html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html) + + # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing + linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE) + blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE) + blanklines = blankreg.findall(html) + lines = linereg.findall(html) + if len(lines) > 1: + self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") + if float(len(blanklines)) / float(len(lines)) > 0.40: + self.log("deleting blank lines") + html = blankreg.sub('', html) + # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly + html = re.sub(r"\s*</p>", "</p>\n", html) + + # some lit files don't have any <p> tags or equivalent, check and + # mark up line endings if required before proceeding + if no_markup(html, 0.1): + self.log("not enough paragraph markers, adding now") + add_markup = re.compile('(?<!>)(\n)') + html = add_markup.sub('</p>\n<p>', html) + + # detect chapters/sections to match xpath or splitting logic # - # Insert extra line feeds so the line length regex functions properly - html = re.sub(r"</p>", "</p>\n", html) + # Mark split points based on embedded links + chaplink = re.compile(r'<a\sname[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE) + html = chaplink.sub(chapter_link, html) + # Continue with alternate patterns, start with most typical chapter headings + if self.html_preprocess_sections < 10: + chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + html = chapdetect.sub(chapter_head, html) + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern") + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = chapdetect2.sub(chapter_head, html) + + # search for places where a first or second level heading is immediately followed by another + # top level heading. demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) + html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + # + # Unwrap lines using punctation if the median length of all lines is less than 150 length = line_length('html', html, 0.4) self.log("*** Median length is " + str(length) + " ***") unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index 487e70c04f..b8dc7a9560 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -3,6 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' +import re from calibre.customize.conversion import InputFormatPlugin class MOBIInput(InputFormatPlugin): @@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin): include_meta_content_type=False)) accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]' return mr.created_opf_path + + def preprocess_html(self, html): + # search for places where a first or second level heading is immediately followed by another + # top level heading. demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) + html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + return html + diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 584d631d0b..36848ddb8b 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -408,6 +408,10 @@ class Page(object): # Fraction of text height that two strings' bottoms can differ by # for them to be considered to be part of the same text fragment LINE_FACTOR = 0.4 + + # Percentage of the page heigth which should be considered header + # or footer to be discarded from reflow considerations + HEAD_FOOTER_MARGIN # Multiplies the average line height when determining row height # of a particular element to detect columns.