From 0ad1f3c088f2ff0872de49171fd99a91a50a031a Mon Sep 17 00:00:00 2001 From: ldolse Date: Wed, 25 Aug 2010 10:49:42 +1000 Subject: [PATCH 01/12] preprocessing regex tweaks --- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/rtf/input.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index da652c1a38..940c27344b 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -209,7 +209,7 @@ class HTMLPreProcessor(object): (re.compile(ur'\u00a0'), lambda match : ' '), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(()?)?)]*>\s*(?P(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), # Have paragraphs show better diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index dcffbe68ca..eaba28e429 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -231,12 +231,12 @@ class RTFInput(InputFormatPlugin): if self.options.preprocess_html: print "********* Preprocessing HTML *********\n" # Detect Chapters to match the xpath in the GUI - chapdetect = 
re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)\s*</span>\s*</p>', re.IGNORECASE) + chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(<(/i|b)>)?)?)\s*</span>\s*</p>', re.IGNORECASE) res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res) # Unwrap lines using punctation if the median length of all lines is less than 150 length = line_length('html', res, 0.4) print "*** Median length is " + str(length) + " ***\n" - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*(</p>)?\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*(<span[^>]*>)?\s*" % length, re.UNICODE) + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*</p>\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*<span[^>]*>\s*" % length, re.UNICODE) if length < 150: res = unwrap.sub(' ', res) f.write(res) From 5c951fb9628617133f17ead6d1393ea84b7c6412 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 4 Sep 2010 15:12:29 +1000 Subject: [PATCH 02/12] Preprocessing Updates --- src/calibre/ebooks/conversion/preprocess.py | 26 +++-- src/calibre/ebooks/html/input.py | 2 +- src/calibre/ebooks/lit/input.py | 104 ++++++++++++++++++-- src/calibre/ebooks/mobi/input.py | 10 ++ src/calibre/ebooks/pdf/reflow.py | 4 + 5 files changed, 132 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 957418f1fd..2954fd7c26 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -62,6 +62,7 @@ def wrap_lines(match): else: return ital+' ' + def line_length(format, raw, percent): ''' raw is the raw text to find the line length to use for wrapping. 
@@ -191,32 +192,36 @@ class HTMLPreProcessor(object): (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'), (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'), + # If pdf printed from a browser then the header/footer has a reliable pattern + (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), + + # Center separator lines + (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'), + # Remove page links (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), # Remove <hr> tags (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'), # Replace <br><br> with <p> - (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'), + # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'), - # Remove hyphenation - (re.compile(r'-<br.*?>\n\r?'), lambda match: ''), + # unwrap hyphenation - don't delete the hyphen (often doesn't split words) + (re.compile(r'(?<=[-–])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), # Remove gray background (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + 
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+(\s\w+)?)?\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), # Clean up spaces (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), - # Connect paragraphs split by - - (re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''), # Add space before and after italics (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), + ] # Fix Book Designer markup @@ -293,6 +298,13 @@ class HTMLPreProcessor(object): import traceback print 'Failed to parse remove_footer regexp' traceback.print_exc() + + # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives + if getattr(self.extra_opts, 'preprocess_html', None): + if is_pdftohtml: + end_rules.append( + (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + ) if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index d57bfddd3e..35a8a1a9bc 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -494,7 +494,7 @@ class HTMLInput(InputFormatPlugin): if not hasattr(self, 'log'): from calibre.utils.logging import default_log self.log = default_log - self.log("********* Preprocessing HTML *********") + self.log("********* 
Preprocessing HTML - HTML Input plugin *********") # Detect Chapters to match the xpath in the GUI chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 9bf20fb1d4..f7bb0fbfd9 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -11,12 +11,14 @@ import re from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.conversion.preprocess import line_length + class LITInput(InputFormatPlugin): name = 'LIT Input' author = 'Marshall T. Vandegrift' description = 'Convert LIT files to HTML' file_types = set(['lit']) + html_preprocess_sections = 0 def convert(self, stream, options, file_ext, log, accelerators): @@ -55,14 +57,104 @@ class LITInput(InputFormatPlugin): def preprocess_html(self, html): + + def chapter_head(match): + chap = match.group('chap') + title = match.group('title') + if not title: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) + return '<h2>'+chap+'</h2>\n' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. 
- " + str(chap) + ", " + str(title)) + return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' + + def chapter_link(match): + chap = match.group('sectionlink') + if not chap: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links") + return '<br style="page-break-before:always">' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap)) + return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>' + + + def no_markup(raw, percent): + ''' + Detects total marked up line endings in the file. raw is the text to + inspect. Percent is the minimum percent of line endings which should + be marked up to return true. + ''' + htm_end_ere = re.compile('</p>', re.DOTALL) + line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) + htm_end = htm_end_ere.findall(raw) + line_end = line_end_ere.findall(raw) + tot_htm_ends = len(htm_end) + tot_ln_fds = len(line_end) + self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***") + + if percent > 1: + percent = 1 + if percent < 0: + percent = 0 + + min_lns = tot_ln_fds * percent + self.log("There must be more than " + str(min_lns) + " unmarked lines to be true") + if min_lns > tot_htm_ends: + return True + self.log("********* Preprocessing HTML *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) - html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) - # Unwrap lines using punctation if the median length of all lines is less than 150 + # remove non-breaking spaces + html = 
re.sub(ur'\u00a0', ' ', html) + # Get rid of empty <o:p> tags to simplify other processing + html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) + # Get rid of empty span tags + html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html) + + # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing + linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE) + blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE) + blanklines = blankreg.findall(html) + lines = linereg.findall(html) + if len(lines) > 1: + self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") + if float(len(blanklines)) / float(len(lines)) > 0.40: + self.log("deleting blank lines") + html = blankreg.sub('', html) + # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly + html = re.sub(r"\s*</p>", "</p>\n", html) + + # some lit files don't have any <p> tags or equivalent, check and + # mark up line endings if required before proceeding + if no_markup(html, 0.1): + self.log("not enough paragraph markers, adding now") + add_markup = re.compile('(?<!>)(\n)') + html = add_markup.sub('</p>\n<p>', html) + + # detect chapters/sections to match xpath or splitting logic # - # Insert extra line feeds so the line length regex functions properly - html = re.sub(r"</p>", "</p>\n", html) + # Mark split points based on embedded links + chaplink = re.compile(r'<a\sname[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE) + html = chaplink.sub(chapter_link, html) + # Continue with alternate patterns, start with most typical chapter headings + if self.html_preprocess_sections < 10: + chapdetect = 
re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + html = chapdetect.sub(chapter_head, html) + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern") + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = chapdetect2.sub(chapter_head, html) + + # search for places where a first or second level heading is immediately followed by another + # top level heading. 
demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) + html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + # + # Unwrap lines using punctation if the median length of all lines is less than 150 length = line_length('html', html, 0.4) self.log("*** Median length is " + str(length) + " ***") unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index 487e70c04f..b8dc7a9560 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -3,6 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' +import re from calibre.customize.conversion import InputFormatPlugin class MOBIInput(InputFormatPlugin): @@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin): include_meta_content_type=False)) accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]' return mr.created_opf_path + + def preprocess_html(self, html): + # search for places where a first or second level heading is immediately followed by another + # top level heading. 
demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) + html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + return html + diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 584d631d0b..36848ddb8b 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -408,6 +408,10 @@ class Page(object): # Fraction of text height that two strings' bottoms can differ by # for them to be considered to be part of the same text fragment LINE_FACTOR = 0.4 + + # Percentage of the page heigth which should be considered header + # or footer to be discarded from reflow considerations + HEAD_FOOTER_MARGIN # Multiplies the average line height when determining row height # of a particular element to detect columns. From 4c7373026b9ee8a618dccf8602740d6a7d578aa2 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 11 Sep 2010 12:10:49 +1000 Subject: [PATCH 03/12] preprocessing changes for lit & pdf, added utils.py, changed default unwrap_factor --- src/calibre/ebooks/conversion/preprocess.py | 15 ++++++++--- src/calibre/ebooks/conversion/utils.py | 6 +++++ src/calibre/ebooks/lit/input.py | 29 +++++++++++++-------- src/calibre/ebooks/pdf/input.py | 4 +-- 4 files changed, 37 insertions(+), 17 deletions(-) create mode 100644 src/calibre/ebooks/conversion/utils.py diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 2954fd7c26..452a322d95 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -77,6 +77,7 @@ def line_length(format, raw, percent): elif format == 'pdf': linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL) lines = linere.findall(raw) + print "percent is " + 
str(percent) lengths = [] for line in lines: @@ -165,6 +166,11 @@ class HTMLPreProcessor(object): (re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'), (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'), (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'), + + #(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'), + #(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'), + #(re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'), + #(re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'), (re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'), (re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'), @@ -206,13 +212,13 @@ class HTMLPreProcessor(object): # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'), # unwrap hyphenation - don't delete the hyphen (often doesn't split words) - (re.compile(r'(?<=[-–])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), + (re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), # Remove gray background (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+(\s\w+)?)?\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), @@ -303,15 +309,16 @@ class 
HTMLPreProcessor(object): if getattr(self.extra_opts, 'preprocess_html', None): if is_pdftohtml: end_rules.append( - (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), ) if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) if length: + print "The pdf line length returned is " + str(length) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py new file mode 100644 index 0000000000..52be473372 --- /dev/null +++ b/src/calibre/ebooks/conversion/utils.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' \ No newline at end of file diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index f7bb0fbfd9..35dad501be 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -102,7 +102,7 @@ class 
LITInput(InputFormatPlugin): percent = 0 min_lns = tot_ln_fds * percent - self.log("There must be more than " + str(min_lns) + " unmarked lines to be true") + self.log("There must be more than " + str(min_lns) + " unmarked lines to return true") if min_lns > tot_htm_ends: return True @@ -141,24 +141,31 @@ class LITInput(InputFormatPlugin): html = chaplink.sub(chapter_link, html) # Continue with alternate patterns, start with most typical chapter headings if self.html_preprocess_sections < 10: - chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) html = chapdetect.sub(chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern") chapdetect2 = 
re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) - html = chapdetect2.sub(chapter_head, html) - + html = chapdetect2.sub(chapter_head, html) + # + # Unwrap lines using punctation if the median length of all lines is less than 150 + length = line_length('html', html, 0.4) + self.log("*** Median line length is " + str(length) + " ***") + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + if length < 150: + self.log("Unwrapping Lines") + html = unwrap.sub(' ', html) + # If still no sections after unwrapping lines break on lines with no punctuation + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation") + #self.log(html) + chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE) + html = chapdetect3.sub(chapter_head, html) # search for places where a first or second level heading is immediately followed by another # top level heading. 
demote the second heading to h3 to prevent splitting between chapter # headings and titles, images, etc doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) - # - # Unwrap lines using punctation if the median length of all lines is less than 150 - length = line_length('html', html, 0.4) - self.log("*** Median length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - html = unwrap.sub(' ', html) + return html diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 64a089281e..113c3d99d8 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -22,10 +22,10 @@ class PDFInput(InputFormatPlugin): options = set([ OptionRecommendation(name='no_images', recommended_value=False, help=_('Do not extract images from the document')), - OptionRecommendation(name='unwrap_factor', recommended_value=0.5, + OptionRecommendation(name='unwrap_factor', recommended_value=0.45, help=_('Scale used to determine the length at which a line should ' 'be unwrapped. Valid values are a decimal between 0 and 1. 
The ' - 'default is 0.5, this is the median line length.')), + 'default is 0.45, this is the median line length.')), OptionRecommendation(name='new_pdf_engine', recommended_value=False, help=_('Use the new PDF conversion engine.')) ]) From faf15b2f3d611594352721d4d06407025fea1320 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 11 Sep 2010 13:09:23 +1000 Subject: [PATCH 04/12] preprocess merge gone wrong, fixing --- src/calibre/ebooks/conversion/preprocess.py | 25 ++++++--------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index e2364d961f..24a389e65c 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -168,7 +168,6 @@ class HTMLPreProcessor(object): (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'), (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'), (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'), -<<<<<<< TREE (re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'), (re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'), (re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'), @@ -176,13 +175,6 @@ class HTMLPreProcessor(object): (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'), (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'), - #(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'), - #(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'), - #(re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'), - #(re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'), -======= ->>>>>>> MERGE-SOURCE - # ´ (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'), (re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'), @@ -218,14 +210,7 @@ class HTMLPreProcessor(object): # ¸ 
(re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'), (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'), - -<<<<<<< TREE - # If pdf printed from a browser then the header/footer has a reliable pattern - (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), - - # Center separator lines - (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'), -======= + # ˛ (re.compile(u'˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'), (re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'), @@ -235,8 +220,12 @@ class HTMLPreProcessor(object): # ˙ (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'), (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'), - ->>>>>>> MERGE-SOURCE + + # If pdf printed from a browser then the header/footer has a reliable pattern + (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), + + # Center separator lines + (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'), # Remove page links (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), From 2a906184ad4c56d3018806c03bf2647bd8ecc242 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 11 Sep 2010 13:17:21 +1000 Subject: [PATCH 05/12] preprocess merge gone wrong, merged original accent code back --- src/calibre/ebooks/conversion/preprocess.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 24a389e65c..f2b19efa9b 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -168,13 +168,7 @@ class HTMLPreProcessor(object): 
(re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'), (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'), (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'), - (re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'), - (re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'), - (re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'), - (re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'), - (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'), - (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'), - + # ´ (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'), (re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'), @@ -210,7 +204,7 @@ class HTMLPreProcessor(object): # ¸ (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'), (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'), - + # ˛ (re.compile(u'˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'), (re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'), @@ -221,6 +215,7 @@ class HTMLPreProcessor(object): (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'), (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'), + # If pdf printed from a browser then the header/footer has a reliable pattern (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), From 480eccb0b0c3921fd356d329e6d601b9207c2d26 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 11 Sep 2010 15:33:10 +1000 Subject: [PATCH 06/12] Fixed unwrapping for various hyphen and dash types, other minor tweaks to pdf --- src/calibre/ebooks/conversion/preprocess.py | 28 ++++++++++++++------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 
f2b19efa9b..c120f0a560 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -77,7 +77,6 @@ def line_length(format, raw, percent): elif format == 'pdf': linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL) lines = linere.findall(raw) - print "percent is " + str(percent) lengths = [] for line in lines: @@ -230,14 +229,17 @@ class HTMLPreProcessor(object): # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'), # unwrap hyphenation - don't delete the hyphen (often doesn't split words) - (re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), + #(re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), + # unwrap/delete soft hyphens + #(re.compile(u'[­]\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), # Remove gray background (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), # Detect Chapters to match default XPATH in GUI (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), - + (re.compile(r'<br\s*/?>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?'), chap_head), + # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), # Clean up spaces @@ -322,21 +324,29 @@ class HTMLPreProcessor(object): import traceback print 'Failed to parse remove_footer regexp' traceback.print_exc() + + # unwrap hyphenation - moved here so it's executed after header/footer removal + if is_pdftohtml: + # unwrap visible dashes and hyphens - don't delete as 50% or more of the time these + # hyphens are for compound words, formatting, etc + 
end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: '')) + # unwrap/delete soft hyphens + end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: '')) + # unwrap/delete soft hyphens with formatting + end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives if getattr(self.extra_opts, 'preprocess_html', None): if is_pdftohtml: - end_rules.append( - (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), - ) - + end_rules.append((re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head)) + if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) if length: - print "The pdf line length returned is " + str(length) + # print "The pdf line length returned is " + str(length) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: From cf7cc4de4d9b9fa5e4b22c5ce2cb63c099165589 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 11 
Sep 2010 21:02:44 +1000 Subject: [PATCH 07/12] preprocess updates for lit, html, and pdf --- src/calibre/ebooks/conversion/preprocess.py | 8 -- src/calibre/ebooks/conversion/utils.py | 122 +++++++++++++++++++- src/calibre/ebooks/html/input.py | 20 +--- src/calibre/ebooks/lit/input.py | 117 +------------------ src/calibre/ebooks/pdb/pdf/reader.py | 2 +- src/calibre/ebooks/pdf/input.py | 2 +- 6 files changed, 129 insertions(+), 142 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index c120f0a560..6123577191 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -214,7 +214,6 @@ class HTMLPreProcessor(object): (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'), (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'), - # If pdf printed from a browser then the header/footer has a reliable pattern (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), @@ -225,13 +224,6 @@ class HTMLPreProcessor(object): (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), # Remove <hr> tags (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'), - # Replace <br><br> with <p> - # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'), - - # unwrap hyphenation - don't delete the hyphen (often doesn't split words) - #(re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), - # unwrap/delete soft hyphens - #(re.compile(u'[­]\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), # Remove gray background (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 52be473372..68cebb3a11 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -3,4 +3,124 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid 
Goyal <kovid@kovidgoyal.net>' -__docformat__ = 'restructuredtext en' \ No newline at end of file +__docformat__ = 'restructuredtext en' + +import re +from calibre.ebooks.conversion.preprocess import line_length +from calibre.utils.logging import default_log +from lxml import etree + +class PreProcessor(object): + html_preprocess_sections = 0 + + def __init__(self, args): + self.args = args + self.log = default_log + + def chapter_head(self, match): + chap = match.group('chap') + title = match.group('title') + if not title: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) + return '<h2>'+chap+'</h2>\n' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) + return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' + + def chapter_link(self, match): + chap = match.group('sectionlink') + if not chap: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links") + return '<br style="page-break-before:always">' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap)) + return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>' + + def no_markup(self, raw, percent): + ''' + Detects total marked up line endings in the file. raw is the text to + inspect. Percent is the minimum percent of line endings which should + be marked up to return true. 
+ ''' + htm_end_ere = re.compile('</p>', re.DOTALL) + line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) + htm_end = htm_end_ere.findall(raw) + line_end = line_end_ere.findall(raw) + tot_htm_ends = len(htm_end) + tot_ln_fds = len(line_end) + self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***") + + if percent > 1: + percent = 1 + if percent < 0: + percent = 0 + + min_lns = tot_ln_fds * percent + self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true") + if min_lns > tot_htm_ends: + return True + + def __call__(self, html): + self.log("********* Preprocessing HTML *********") + # remove non-breaking spaces + html = re.sub(ur'\u00a0', ' ', html) + # Get rid of empty <o:p> tags to simplify other processing + html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) + # Get rid of empty span tags + html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html) + + # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing + linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE) + blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE) + blanklines = blankreg.findall(html) + lines = linereg.findall(html) + if len(lines) > 1: + self.log("There are " + str(len(blanklines)) + " blank lines. 
" + str(float(len(blanklines)) / float(len(lines))) + " percent blank") + if float(len(blanklines)) / float(len(lines)) > 0.40: + self.log("deleting blank lines") + html = blankreg.sub('', html) + # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly + html = re.sub(r"\s*</p>", "</p>\n", html) + html = re.sub(r"\s*<p>\s*", "\n<p>", html) + + # some lit files don't have any <p> tags or equivalent, check and + # mark up line endings if required before proceeding + if self.no_markup(html, 0.1): + self.log("not enough paragraph markers, adding now") + add_markup = re.compile('(?<!>)(\n)') + html = add_markup.sub('</p>\n<p>', html) + + # detect chapters/sections to match xpath or splitting logic + # + # Start with most typical chapter headings + chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + html = chapdetect.sub(self.chapter_head, html) + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern") + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = 
chapdetect2.sub(self.chapter_head, html) + # + # Unwrap lines using punctuation if the median length of all lines is less than 200 + length = line_length('html', html, 0.4) + self.log("*** Median line length is " + str(length) + " ***") + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + if length < 200: + self.log("Unwrapping Lines") + html = unwrap.sub(' ', html) + # If still no sections after unwrapping lines break on lines with no punctuation + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation") + #self.log(html) + chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE) + html = chapdetect3.sub(self.chapter_head, html) + # search for places where a first or second level heading is immediately followed by another + # top level heading.
demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) + html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + + return html \ No newline at end of file diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 35a8a1a9bc..e83216ae1f 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows from calibre import unicode_path from calibre.utils.localization import get_lang from calibre.utils.filenames import ascii_filename -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor class Link(object): ''' @@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin): return (None, raw) def preprocess_html(self, html): - if not hasattr(self, 'log'): - from calibre.utils.logging import default_log - self.log = default_log - self.log("********* Preprocessing HTML - HTML Input plugin *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) - html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) - # Unwrap lines using punctation if the median length of all lines is less than 150 - # - # Insert extra line feeds so the line length regex functions properly - html = re.sub(r"</p>", "</p>\n", html) - length = line_length('html', html, 0.4) - self.log.debug("*** Median length is " + str(length) + " ***") - unwrap = 
re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - html = unwrap.sub(' ', html) + preprocessor = PreProcessor(html) + html = preprocessor(html) return html diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 35dad501be..58e7bc84bf 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -6,10 +6,8 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import re - from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor class LITInput(InputFormatPlugin): @@ -18,7 +16,6 @@ class LITInput(InputFormatPlugin): author = 'Marshall T. Vandegrift' description = 'Convert LIT files to HTML' file_types = set(['lit']) - html_preprocess_sections = 0 def convert(self, stream, options, file_ext, log, accelerators): @@ -57,115 +54,7 @@ class LITInput(InputFormatPlugin): def preprocess_html(self, html): - - def chapter_head(match): - chap = match.group('chap') - title = match.group('title') - if not title: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) - return '<h2>'+chap+'</h2>\n' - else: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. 
- " + str(chap) + ", " + str(title)) - return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' - - def chapter_link(match): - chap = match.group('sectionlink') - if not chap: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links") - return '<br style="page-break-before:always">' - else: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap)) - return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>' - - - def no_markup(raw, percent): - ''' - Detects total marked up line endings in the file. raw is the text to - inspect. Percent is the minimum percent of line endings which should - be marked up to return true. - ''' - htm_end_ere = re.compile('</p>', re.DOTALL) - line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) - htm_end = htm_end_ere.findall(raw) - line_end = line_end_ere.findall(raw) - tot_htm_ends = len(htm_end) - tot_ln_fds = len(line_end) - self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***") - - if percent > 1: - percent = 1 - if percent < 0: - percent = 0 - - min_lns = tot_ln_fds * percent - self.log("There must be more than " + str(min_lns) + " unmarked lines to return true") - if min_lns > tot_htm_ends: - return True - - self.log("********* Preprocessing HTML *********") - # remove non-breaking spaces - html = re.sub(ur'\u00a0', ' ', html) - # Get rid of empty <o:p> tags to simplify other processing - html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) - # Get rid of empty span tags - html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html) - - # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing - linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE) - blankreg = 
re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE) - blanklines = blankreg.findall(html) - lines = linereg.findall(html) - if len(lines) > 1: - self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") - if float(len(blanklines)) / float(len(lines)) > 0.40: - self.log("deleting blank lines") - html = blankreg.sub('', html) - # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly - html = re.sub(r"\s*</p>", "</p>\n", html) - - # some lit files don't have any <p> tags or equivalent, check and - # mark up line endings if required before proceeding - if no_markup(html, 0.1): - self.log("not enough paragraph markers, adding now") - add_markup = re.compile('(?<!>)(\n)') - html = add_markup.sub('</p>\n<p>', html) - - # detect chapters/sections to match xpath or splitting logic - # - # Mark split points based on embedded links - chaplink = re.compile(r'<a\sname[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE) - html = chaplink.sub(chapter_link, html) - # Continue with alternate patterns, start with most typical chapter headings - if self.html_preprocess_sections < 10: - chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) - html = chapdetect.sub(chapter_head, html) - if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + 
str(self.html_preprocess_sections) + ", trying a more aggressive pattern") - chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) - html = chapdetect2.sub(chapter_head, html) - # - # Unwrap lines using punctation if the median length of all lines is less than 150 - length = line_length('html', html, 0.4) - self.log("*** Median line length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - self.log("Unwrapping Lines") - html = unwrap.sub(' ', html) - # If still no sections after unwrapping lines break on lines with no punctuation - if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation") - #self.log(html) - chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE) - html = chapdetect3.sub(chapter_head, html) - # search for places where a first or second level heading is immediately followed by another - # top level heading. 
demote the second heading to h3 to prevent splitting between chapter - # headings and titles, images, etc - doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) - html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) - + preprocessor = PreProcessor(html) + html = preprocessor(html) return html diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py index 3ae9f8ccca..c151551866 100644 --- a/src/calibre/ebooks/pdb/pdf/reader.py +++ b/src/calibre/ebooks/pdb/pdf/reader.py @@ -21,7 +21,7 @@ class Reader(FormatReader): self.options = options setattr(self.options, 'new_pdf_engine', False) setattr(self.options, 'no_images', False) - setattr(self.options, 'unwrap_factor', 0.5) + setattr(self.options, 'unwrap_factor', 0.45) def extract_content(self, output_dir): self.log.info('Extracting PDF...') diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 113c3d99d8..14b3552b04 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -25,7 +25,7 @@ class PDFInput(InputFormatPlugin): OptionRecommendation(name='unwrap_factor', recommended_value=0.45, help=_('Scale used to determine the length at which a line should ' 'be unwrapped. Valid values are a decimal between 0 and 1. 
The ' - 'default is 0.45, this is the median line length.')), + 'default is 0.45, just below the median line length.')), OptionRecommendation(name='new_pdf_engine', recommended_value=False, help=_('Use the new PDF conversion engine.')) ]) From f6de0bef13d7d1001b951d465cff3135aad616ed Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 11 Sep 2010 22:15:09 +1000 Subject: [PATCH 08/12] replaced messed up rtf file --- src/calibre/ebooks/rtf/preprocess.py | 624 +++++++++++++-------------- 1 file changed, 289 insertions(+), 335 deletions(-) diff --git a/src/calibre/ebooks/rtf/preprocess.py b/src/calibre/ebooks/rtf/preprocess.py index ee45da697f..a3076651fd 100644 --- a/src/calibre/ebooks/rtf/preprocess.py +++ b/src/calibre/ebooks/rtf/preprocess.py @@ -1,390 +1,344 @@ #!/usr/bin/env python # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement __license__ = 'GPL v3' -__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' +__copyright__ = '2010, Gerendi Sandor Attila' __docformat__ = 'restructuredtext en' -import functools, re +""" +RTF tokenizer and token parser. v.1.0 (1/17/2010) +Author: Gerendi Sandor Attila -from calibre import entity_to_unicode +At this point this will tokenize a RTF file then rebuild it from the tokens. +In the process the UTF8 tokens are altered to be supported by the RTF2XML and also remain RTF specification compliant.
+""" -XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') -SVG_NS = 'http://www.w3.org/2000/svg' -XLINK_NS = 'http://www.w3.org/1999/xlink' +class tokenDelimitatorStart(): + def __init__(self): + pass + def toRTF(self): + return b'{' + def __repr__(self): + return '{' -convert_entities = functools.partial(entity_to_unicode, - result_exceptions = { - u'<' : '<', - u'>' : '>', - u"'" : ''', - u'"' : '"', - u'&' : '&', - }) -_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE) +class tokenDelimitatorEnd(): + def __init__(self): + pass + def toRTF(self): + return b'}' + def __repr__(self): + return '}' -LIGATURES = { -# u'\u00c6': u'AE', -# u'\u00e6': u'ae', -# u'\u0152': u'OE', -# u'\u0153': u'oe', -# u'\u0132': u'IJ', -# u'\u0133': u'ij', -# u'\u1D6B': u'ue', - u'\uFB00': u'ff', - u'\uFB01': u'fi', - u'\uFB02': u'fl', - u'\uFB03': u'ffi', - u'\uFB04': u'ffl', - u'\uFB05': u'ft', - u'\uFB06': u'st', - } +class tokenControlWord(): + def __init__(self, name, separator = ''): + self.name = name + self.separator = separator + def toRTF(self): + return self.name + self.separator + def __repr__(self): + return self.name + self.separator -_ligpat = re.compile(u'|'.join(LIGATURES)) +class tokenControlWordWithNumericArgument(): + def __init__(self, name, argument, separator = ''): + self.name = name + self.argument = argument + self.separator = separator + def toRTF(self): + return self.name + repr(self.argument) + self.separator + def __repr__(self): + return self.name + repr(self.argument) + self.separator -def sanitize_head(match): - x = match.group(1) - x = _span_pat.sub('', x) - return '<head>\n%s\n</head>' % x +class tokenControlSymbol(): + def __init__(self, name): + self.name = name + def toRTF(self): + return self.name + def __repr__(self): + return self.name -def chap_head(match): - chap = match.group('chap') - title = match.group('title') - if not title: - return '<h1>'+chap+'</h1><br/>\n' - else: - return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n' +class 
tokenData(): + def __init__(self, data): + self.data = data + def toRTF(self): + return self.data + def __repr__(self): + return self.data -def wrap_lines(match): - ital = match.group('ital') - if not ital: - return ' ' - else: - return ital+' ' +class tokenBinN(): + def __init__(self, data, separator = ''): + self.data = data + self.separator = separator + def toRTF(self): + return "\\bin" + repr(len(self.data)) + self.separator + self.data + def __repr__(self): + return "\\bin" + repr(len(self.data)) + self.separator + self.data + +class token8bitChar(): + def __init__(self, data): + self.data = data + def toRTF(self): + return "\\'" + self.data + def __repr__(self): + return "\\'" + self.data + +class tokenUnicode(): + def __init__(self, data, separator = '', current_ucn = 1, eqList = []): + self.data = data + self.separator = separator + self.current_ucn = current_ucn + self.eqList = eqList + def toRTF(self): + result = '\\u' + repr(self.data) + ' ' + ucn = self.current_ucn + if len(self.eqList) < ucn: + ucn = len(self.eqList) + result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result + i = 0 + for eq in self.eqList: + if i >= ucn: + break + result = result + eq.toRTF() + return result + def __repr__(self): + return '\\u' + repr(self.data) -def line_length(format, raw, percent): - ''' - raw is the raw text to find the line length to use for wrapping. - percentage is a decimal number, 0 - 1 which is used to determine - how far in the list of line lengths to use. The list of line lengths is - ordered smallest to larged and does not include duplicates. 0.5 is the - median value. 
- ''' - raw = raw.replace(' ', ' ') - if format == 'html': - linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL) - elif format == 'pdf': - linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL) - lines = linere.findall(raw) - print "percent is " + str(percent) +def isAsciiLetter(value): + return ((value >= 'a') and (value <= 'z')) or ((value >= 'A') and (value <= 'Z')) - lengths = [] - for line in lines: - if len(line) > 0: - lengths.append(len(line)) +def isDigit(value): + return (value >= '0') and (value <= '9') - if not lengths: - return 0 +def isChar(value, char): + return value == char - lengths = list(set(lengths)) - total = sum(lengths) - avg = total / len(lengths) - max_line = avg * 2 - - lengths = sorted(lengths) - for i in range(len(lengths) - 1, -1, -1): - if lengths[i] > max_line: - del lengths[i] - - if percent > 1: - percent = 1 - if percent < 0: - percent = 0 - - index = int(len(lengths) * percent) - 1 - - return lengths[index] +def isString(buffer, string): + return buffer == string -class CSSPreProcessor(object): +class RtfTokenParser(): + def __init__(self, tokens): + self.tokens = tokens + self.process() + self.processUnicode() - PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') + def process(self): + i = 0 + newTokens = [] + while i < len(self.tokens): + if isinstance(self.tokens[i], tokenControlSymbol): + if isString(self.tokens[i].name, "\\'"): + i = i + 1 + if not isinstance(self.tokens[i], tokenData): + raise Exception('Error: token8bitChar without data.') + if len(self.tokens[i].data) < 2: + raise Exception('Error: token8bitChar without data.') + newTokens.append(token8bitChar(self.tokens[i].data[0:2])) + if len(self.tokens[i].data) > 2: + newTokens.append(tokenData(self.tokens[i].data[2:])) + i = i + 1 + continue - def __call__(self, data, add_namespace=False): - from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE - data = self.PAGE_PAT.sub('', data) - if not add_namespace: - return data - ans, namespaced = [], False - for line in 
data.splitlines(): - ll = line.lstrip() - if not (namespaced or ll.startswith('@import') or - ll.startswith('@charset')): - ans.append(XHTML_CSS_NAMESPACE.strip()) - namespaced = True - ans.append(line) + newTokens.append(self.tokens[i]) + i = i + 1 - return u'\n'.join(ans) + self.tokens = list(newTokens) -class HTMLPreProcessor(object): + def processUnicode(self): + i = 0 + newTokens = [] + ucNbStack = [1] + while i < len(self.tokens): + if isinstance(self.tokens[i], tokenDelimitatorStart): + ucNbStack.append(ucNbStack[len(ucNbStack) - 1]) + newTokens.append(self.tokens[i]) + i = i + 1 + continue + if isinstance(self.tokens[i], tokenDelimitatorEnd): + ucNbStack.pop() + newTokens.append(self.tokens[i]) + i = i + 1 + continue + if isinstance(self.tokens[i], tokenControlWordWithNumericArgument): + if isString(self.tokens[i].name, '\\uc'): + ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument + newTokens.append(self.tokens[i]) + i = i + 1 + continue + if isString(self.tokens[i].name, '\\u'): + x = i + j = 0 + i = i + 1 + replace = [] + partialData = None + ucn = ucNbStack[len(ucNbStack) - 1] + while (i < len(self.tokens)) and (j < ucn): + if isinstance(self.tokens[i], tokenDelimitatorStart): + break + if isinstance(self.tokens[i], tokenDelimitatorEnd): + break + if isinstance(self.tokens[i], tokenData): + if len(self.tokens[i].data) >= ucn - j: + replace.append(tokenData(self.tokens[i].data[0 : ucn - j])) + if len(self.tokens[i].data) > ucn - j: + partialData = tokenData(self.tokens[i].data[ucn - j:]) + i = i + 1 + break + else: + replace.append(self.tokens[i]) + j = j + len(self.tokens[i].data) + i = i + 1 + continue + if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN): + replace.append(self.tokens[i]) + i = i + 1 + j = j + 1 + continue + raise Exception('Error: incorect utf replacement.') - PREPROCESS = [ - # Some idiotic HTML generators (Frontpage I'm looking at you) - # Put all sorts of crap into <head>. 
This messes up lxml - (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL), - sanitize_head), - # Convert all entities, since lxml doesn't handle them well - (re.compile(r'&(\S+?);'), convert_entities), - # Remove the <![if/endif tags inserted by everybody's darling, MS Word - (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), - lambda match: ''), - ] + #calibre rtf2xml does not support utfreplace + replace = [] - # Fix pdftohtml markup - PDFTOHTML = [ - # Fix umlauts - # ¨ - (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'), - (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'), - (re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'), - (re.compile(u'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ë'), - (re.compile(u'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ï'), - (re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'), - (re.compile(u'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ö'), - (re.compile(u'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ö'), - (re.compile(u'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ü'), - (re.compile(u'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ü'), + newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace)) + if partialData != None: + newTokens.append(partialData) + continue - # Fix accents - # ` - (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'), - (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'), - (re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'), - (re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'), - (re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'), - (re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'), - (re.compile(u'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ò'), - (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 
u'Ò'), - (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'), - (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'), + newTokens.append(self.tokens[i]) + i = i + 1 - # ´ - (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'), - (re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'), - (re.compile(u'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ć'), - (re.compile(u'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ć'), - (re.compile(u'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'é'), - (re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'), - (re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'), - (re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'), - (re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'), - (re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'), - (re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'), - (re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'), - (re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'), - (re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'), - (re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'), - (re.compile(u'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ú'), - (re.compile(u'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ź'), - (re.compile(u'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ź'), + self.tokens = list(newTokens) - # ˆ - (re.compile(u'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'â'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Â'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ê'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ê'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'î'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Î'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), 
lambda match: u'ô'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ô'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'û'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Û'), - # ¸ - (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'), - (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'), + def toRTF(self): + result = [] + for token in self.tokens: + result.append(token.toRTF()) + return "".join(result) - # ˛ - (re.compile(u'˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'), - (re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'), - (re.compile(u'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ę'), - (re.compile(u'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ę'), - - # ˙ - (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'), - (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'), - - # If pdf printed from a browser then the header/footer has a reliable pattern - (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), +class RtfTokenizer(): + def __init__(self, rtfData): + self.rtfData = [] + self.tokens = [] + self.rtfData = rtfData + self.tokenize() - # Center separator lines - (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'), + def tokenize(self): + i = 0 + lastDataStart = -1 + while i < len(self.rtfData): - # Remove page links - (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), - # Remove <hr> tags - (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'), - # Replace <br><br> with <p> - # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'), + if isChar(self.rtfData[i], '{'): + if lastDataStart > -1: + self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) + lastDataStart = -1 + self.tokens.append(tokenDelimitatorStart()) + 
i = i + 1 + continue - # unwrap hyphenation - don't delete the hyphen (often doesn't split words) - (re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), + if isChar(self.rtfData[i], '}'): + if lastDataStart > -1: + self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) + lastDataStart = -1 + self.tokens.append(tokenDelimitatorEnd()) + i = i + 1 + continue - # Remove gray background - (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), + if isChar(self.rtfData[i], '\\'): + if i + 1 >= len(self.rtfData): + raise Exception('Error: Control character found at the end of the document.') - # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), + if lastDataStart > -1: + self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) + lastDataStart = -1 - # Have paragraphs show better - (re.compile(r'<br.*?>'), lambda match : '<p>'), - # Clean up spaces - (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), - # Add space before and after italics - (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), - (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), - - ] + tokenStart = i + i = i + 1 - # Fix Book Designer markup - BOOK_DESIGNER = [ - # HR - (re.compile('<hr>', re.IGNORECASE), - lambda match : '<span style="page-break-after:always"> </span>'), - # Create header tags - (re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), - lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))), - (re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), - 
lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))), - (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), - lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)), - (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), - lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)), - ] - def __init__(self, input_plugin_preprocess, plugin_preprocess, - extra_opts=None): - self.input_plugin_preprocess = input_plugin_preprocess - self.plugin_preprocess = plugin_preprocess - self.extra_opts = extra_opts + #Control Words + if isAsciiLetter(self.rtfData[i]): + #consume <ASCII Letter Sequence> + consumed = False + while i < len(self.rtfData): + if not isAsciiLetter(self.rtfData[i]): + tokenEnd = i + consumed = True + break + i = i + 1 - def is_baen(self, src): - return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"', - re.IGNORECASE).search(src) is not None + if not consumed: + raise Exception('Error (at:%d): Control Word without end.'%(tokenStart)) - def is_book_designer(self, raw): - return re.search('<H2[^><]*id=BookTitle', raw) is not None + #we have numeric argument before delimiter + if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]): + #consume the numeric argument + consumed = False + l = 0 + while i < len(self.rtfData): + if not isDigit(self.rtfData[i]): + consumed = True + break + l = l + 1 + i = i + 1 + if l > 10 : + raise Exception('Error (at:%d): Too many digits in control word numeric argument.'%[tokenStart]) - def is_pdftohtml(self, src): - return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] + if not consumed: + raise Exception('Error (at:%d): Control Word without numeric argument end.'%[tokenStart]) - def __call__(self, html, remove_special_chars=None, - get_preprocess_html=False): - if remove_special_chars is not None: - html = remove_special_chars.sub('', html) - html = 
html.replace('\0', '') - is_pdftohtml = self.is_pdftohtml(html) - if self.is_baen(html): - rules = [] - elif self.is_book_designer(html): - rules = self.BOOK_DESIGNER - elif is_pdftohtml: - rules = self.PDFTOHTML - else: - rules = [] + separator = '' + if isChar(self.rtfData[i], ' '): + separator = ' ' - start_rules = [] - if is_pdftohtml: - # Remove non breaking spaces - start_rules.append((re.compile(ur'\u00a0'), lambda match : ' ')) + controlWord = self.rtfData[tokenStart: tokenEnd] + if tokenEnd < i: + value = int(self.rtfData[tokenEnd: i]) + if isString(controlWord, "\\bin"): + i = i + value + self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator)) + else: + self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator)) + else: + self.tokens.append(tokenControlWord(controlWord, separator)) + #space delimiter, we should discard it + if self.rtfData[i] == ' ': + i = i + 1 - if not getattr(self.extra_opts, 'keep_ligatures', False): - html = _ligpat.sub(lambda m:LIGATURES[m.group()], html) + #Control Symbol + else: + self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1])) + i = i + 1 + continue - end_rules = [] - if getattr(self.extra_opts, 'remove_header', None): - try: - rules.insert(0, - (re.compile(self.extra_opts.header_regex), lambda match : '') - ) - except: - import traceback - print 'Failed to parse remove_header regexp' - traceback.print_exc() + if lastDataStart < 0: + lastDataStart = i + i = i + 1 - if getattr(self.extra_opts, 'remove_footer', None): - try: - rules.insert(0, - (re.compile(self.extra_opts.footer_regex), lambda match : '') - ) - except: - import traceback - print 'Failed to parse remove_footer regexp' - traceback.print_exc() + def toRTF(self): + result = [] + for token in self.tokens: + result.append(token.toRTF()) + return "".join(result) - # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives - if getattr(self.extra_opts, 
'preprocess_html', None): - if is_pdftohtml: - end_rules.append( - (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), - ) - if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: - length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) - if length: - print "The pdf line length returned is " + str(length) - end_rules.append( - # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), - ) +if __name__ == "__main__": + import sys + if len(sys.argv) < 2: + print ("Usage %prog rtfFileToConvert") + sys.exit() + f = open(sys.argv[1], 'rb') + data = f.read() + f.close() - for rule in self.PREPROCESS + start_rules: - html = rule[0].sub(rule[1], html) + tokenizer = RtfTokenizer(data) + parsedTokens = RtfTokenParser(tokenizer.tokens) - if get_preprocess_html: - return html + data = parsedTokens.toRTF() - def dump(raw, where): - import os - dp = getattr(self.extra_opts, 'debug_pipeline', None) - if dp and os.path.exists(dp): - odir = os.path.join(dp, 'input') - if os.path.exists(odir): - odir = os.path.join(odir, where) - if not os.path.exists(odir): - os.makedirs(odir) - name, i = None, 0 - while not name or os.path.exists(os.path.join(odir, name)): - i += 1 - name = '%04d.html'%i - with open(os.path.join(odir, name), 'wb') as f: - f.write(raw.encode('utf-8')) + f = open(sys.argv[1], 'w') + f.write(data) + f.close() - #dump(html, 'pre-preprocess') - - for rule in rules + end_rules: - html = rule[0].sub(rule[1], html) - - #dump(html, 'post-preprocess') - - # Handle broken XHTML w/ SVG (ugh) - if 'svg:' in html and SVG_NS not in html: - html = html.replace( - '<html', '<html 
xmlns:svg="%s"' % SVG_NS, 1) - if 'xlink:' in html and XLINK_NS not in html: - html = html.replace( - '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1) - - html = XMLDECL_RE.sub('', html) - - if getattr(self.extra_opts, 'asciiize', False): - from calibre.ebooks.unidecode.unidecoder import Unidecoder - unidecoder = Unidecoder() - html = unidecoder.decode(html) - - if self.plugin_preprocess: - html = self.input_plugin_preprocess(html) - - return html From 9a06996b16486a3511e4055535a6be48f484a90a Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sun, 12 Sep 2010 11:17:49 +1000 Subject: [PATCH 09/12] minor tweaks to preprocessing, backed out reflow change --- src/calibre/ebooks/conversion/preprocess.py | 4 +-- src/calibre/ebooks/conversion/utils.py | 36 +++++++++++---------- src/calibre/ebooks/pdf/reflow.py | 4 --- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 6123577191..46308b2ea0 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -319,8 +319,8 @@ class HTMLPreProcessor(object): # unwrap hyphenation - moved here so it's executed after header/footer removal if is_pdftohtml: - # unwrap visible dashes and hyphens - don't delete as 50% or more of the time these - # hyphens are for compound words, formatting, etc + # unwrap visible dashes and hyphens - don't delete they are often hyphens for + # for compound words, formatting, etc end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: '')) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 68cebb3a11..fb683bdb12 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -29,16 +29,12 @@ class PreProcessor(object): 
self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' - def chapter_link(self, match): - chap = match.group('sectionlink') - if not chap: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links") - return '<br style="page-break-before:always">' - else: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap)) - return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>' + def chapter_break(self, match): + chap = match.group('section') + styles = match.group('styles') + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap)) + return '<'+styles+' style="page-break-before:always">'+chap def no_markup(self, raw, percent): ''' @@ -74,7 +70,7 @@ class PreProcessor(object): html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html) # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing - linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE) + linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL) blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE) blanklines = blankreg.findall(html) lines = linereg.findall(html) @@ -100,8 +96,13 @@ class PreProcessor(object): chapdetect = 
re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) html = chapdetect.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern") - chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters") + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = chapdetect2.sub(self.chapter_head, html) + + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words") + chapdetect2 = 
re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) # # Unwrap lines using punctation if the median length of all lines is less than 200 @@ -110,13 +111,14 @@ class PreProcessor(object): unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) if length < 200: self.log("Unwrapping Lines") - html = unwrap.sub(' ', html) + html = unwrap.sub(' ', html) + # If still no sections after unwrapping lines break on lines with no punctuation if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation") + self.log(str(self.html_preprocess_sections) + " split points marked, matching based on punctuation") #self.log(html) - chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE) - html = chapdetect3.sub(self.chapter_head, html) + chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</(i|b|u)>){0,2}\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) + html = chapdetect3.sub(self.chapter_break, html) # search for places where a first or second level heading is immediately followed by 
another # top level heading. demote the second heading to h3 to prevent splitting between chapter # headings and titles, images, etc diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 36848ddb8b..584d631d0b 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -408,10 +408,6 @@ class Page(object): # Fraction of text height that two strings' bottoms can differ by # for them to be considered to be part of the same text fragment LINE_FACTOR = 0.4 - - # Percentage of the page heigth which should be considered header - # or footer to be discarded from reflow considerations - HEAD_FOOTER_MARGIN # Multiplies the average line height when determining row height # of a particular element to detect columns. From cdb696f63bc39b9327abe809fa71e94baa6e0b86 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Mon, 13 Sep 2010 00:12:21 +1000 Subject: [PATCH 10/12] enhanced preprocessing class - looking pretty good --- src/calibre/ebooks/conversion/preprocess.py | 18 ++-- src/calibre/ebooks/conversion/utils.py | 98 +++++++++++++++------ 2 files changed, 82 insertions(+), 34 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 46308b2ea0..f6277956c8 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -62,7 +62,6 @@ def wrap_lines(match): else: return ital+' ' - def line_length(format, raw, percent): ''' raw is the raw text to find the line length to use for wrapping. 
@@ -76,6 +75,8 @@ def line_length(format, raw, percent): linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL) elif format == 'pdf': linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL) + elif format == 'spanned_html': + linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL) lines = linere.findall(raw) lengths = [] @@ -223,14 +224,15 @@ class HTMLPreProcessor(object): # Remove page links (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), # Remove <hr> tags - (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'), + (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br>'), # Remove gray background (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), - (re.compile(r'<br\s*/?>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?'), chap_head), + (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head), + # Cover the case where every letter in a chapter title is separated by a space + (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head), # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), @@ -238,8 +240,7 @@ class HTMLPreProcessor(object): (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ 
]*(?=<)'), lambda match: ' '), # Add space before and after italics (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), - (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), - + (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), ] # Fix Book Designer markup @@ -327,10 +328,11 @@ class HTMLPreProcessor(object): # unwrap/delete soft hyphens with formatting end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) - # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives + # Make the more aggressive chapter marking regex optional with the preprocess option to + # reduce false positives and move after header/footer removal if getattr(self.extra_opts, 'preprocess_html', None): if is_pdftohtml: - end_rules.append((re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head)) + end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),) if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index fb683bdb12..abfa43e7ed 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -8,10 +8,10 @@ __docformat__ = 'restructuredtext en' import re from calibre.ebooks.conversion.preprocess import line_length from calibre.utils.logging import default_log -from lxml import etree class PreProcessor(object): html_preprocess_sections = 0 + found_indents = 0 def 
__init__(self, args): self.args = args @@ -22,11 +22,11 @@ class PreProcessor(object): title = match.group('title') if not title: self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) + self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) return '<h2>'+chap+'</h2>\n' else: self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) + self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' def chapter_break(self, match): @@ -35,7 +35,22 @@ class PreProcessor(object): self.html_preprocess_sections = self.html_preprocess_sections + 1 self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap)) return '<'+styles+' style="page-break-before:always">'+chap - + + def insert_indent(self, match): + pstyle = match.group('formatting') + span = match.group('span') + self.found_indents = self.found_indents + 1 + if pstyle: + if not span: + return '<p '+pstyle+' style="text-indent:3%">' + else: + return '<p '+pstyle+' style="text-indent:3%">'+span + else: + if not span: + return '<p style="text-indent:3%">' + else: + return '<p style="text-indent:3%">'+span + def no_markup(self, raw, percent): ''' Detects total marked up line endings in the file. 
raw is the text to @@ -48,7 +63,7 @@ class PreProcessor(object): line_end = line_end_ere.findall(raw) tot_htm_ends = len(htm_end) tot_ln_fds = len(line_end) - self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***") + self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings") if percent > 1: percent = 1 @@ -56,13 +71,18 @@ class PreProcessor(object): percent = 0 min_lns = tot_ln_fds * percent - self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true") + self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup") if min_lns > tot_htm_ends: return True def __call__(self, html): self.log("********* Preprocessing HTML *********") - # remove non-breaking spaces + # Replace series of non-breaking spaces with text-indent + txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE) + html = txtindent.sub(self.insert_indent, html) + if self.found_indents > 1: + self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles") + # remove remaining non-breaking spaces html = re.sub(ur'\u00a0', ' ', html) # Get rid of empty <o:p> tags to simplify other processing html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) @@ -83,41 +103,67 @@ class PreProcessor(object): html = re.sub(r"\s*</p>", "</p>\n", html) html = re.sub(r"\s*<p>\s*", "\n<p>", html) - # some lit files don't have any <p> tags or equivalent, check and - # mark up line endings if required before proceeding + # some lit files don't have any <p> tags or equivalent (generally just plain text between + # <pre> tags), check and mark up line endings if required before proceeding if self.no_markup(html, 0.1): self.log("not enough paragraph markers, adding now") add_markup = re.compile('(?<!>)(\n)') html = add_markup.sub('</p>\n<p>', html) # detect chapters/sections to match xpath or splitting 
logic + heading = re.compile('<h(1|2)[^>]*>', re.IGNORECASE) + self.html_preprocess_sections = len(heading.findall(html)) + self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings") # - # Start with most typical chapter headings - chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) - html = chapdetect.sub(self.chapter_head, html) + # Start with most typical chapter headings, get more aggressive until one works + if self.html_preprocess_sections < 10: + chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + html = chapdetect.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters") - chapdetect2 = 
re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words") - chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) - html = chapdetect2.sub(self.chapter_head, html) - # - # Unwrap lines using punctation if the median length of all lines is less than 200 - length = line_length('html', html, 0.4) - self.log("*** Median line length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 200: - 
self.log("Unwrapping Lines") - html = unwrap.sub(' ', html) + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = chapdetect2.sub(self.chapter_head, html) - # If still no sections after unwrapping lines break on lines with no punctuation + # Unwrap lines + # + self.log("Unwrapping Lines") + # Some OCR sourced files have line breaks in the html using a combination of span & p tags + # span are used for hard line breaks, p for new paragraphs. Determine which is used so + # that lines can be wrapped across page boundaries + paras_reg = re.compile('<p[^>]*>', re.IGNORECASE) + spans_reg = re.compile('<span[^>]*>', re.IGNORECASE) + paras = len(paras_reg.findall(html)) + spans = len(spans_reg.findall(html)) + if spans > 1: + if float(paras) / float(spans) < 0.75: + format = 'spanned_html' + else: + format = 'html' + else: + format = 'html' + + # Calculate Length + length = line_length(format, html, 0.4) + self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***") + # + # Unwrap and/or delete soft-hyphens, hyphens + html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) + html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) + + # Unwrap lines using punctation if the median length of all lines is less than 200 + unwrap = 
re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + html = unwrap.sub(' ', html) + + # If still no sections after unwrapping mark split points on lines with no punctuation if self.html_preprocess_sections < 10: - self.log(str(self.html_preprocess_sections) + " split points marked, matching based on punctuation") + self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections)) #self.log(html) - chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</(i|b|u)>){0,2}\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) + chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) html = chapdetect3.sub(self.chapter_break, html) # search for places where a first or second level heading is immediately followed by another # top level heading. 
demote the second heading to h3 to prevent splitting between chapter From 548417ea6b6157faf1688b3b082f3eac5476636f Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Mon, 13 Sep 2010 09:18:45 +1000 Subject: [PATCH 11/12] comments and minor tweak --- src/calibre/ebooks/conversion/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index abfa43e7ed..ecf030b27d 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -111,7 +111,7 @@ class PreProcessor(object): html = add_markup.sub('</p>\n<p>', html) # detect chapters/sections to match xpath or splitting logic - heading = re.compile('<h(1|2)[^>]*>', re.IGNORECASE) + heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings") # @@ -134,7 +134,7 @@ class PreProcessor(object): self.log("Unwrapping Lines") # Some OCR sourced files have line breaks in the html using a combination of span & p tags # span are used for hard line breaks, p for new paragraphs. 
Determine which is used so - # that lines can be wrapped across page boundaries + # that lines can be un-wrapped across page boundaries paras_reg = re.compile('<p[^>]*>', re.IGNORECASE) spans_reg = re.compile('<span[^>]*>', re.IGNORECASE) paras = len(paras_reg.findall(html)) From b73e1b3da50810e151d10a5d62251754a077e605 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Tue, 14 Sep 2010 02:56:56 +1000 Subject: [PATCH 12/12] tweaked preprocess for $, added rtf to new preprocess logic, changed last pdf default --- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/rtf/input.py | 13 +++---------- src/calibre/gui2/convert/pdf_input.ui | 2 +- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f6277956c8..9464be1210 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -340,7 +340,7 @@ class HTMLPreProcessor(object): # print "The pdf line length returned is " + str(length) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 216ccf591d..d229b80c16 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -8,6 +8,7 @@ from lxml import etree from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor class InlineClass(etree.XSLTExtension): @@ -229,16 +230,8 @@ class RTFInput(InputFormatPlugin): res = transform.tostring(result) res = 
res[:100].replace('xmlns:html', 'xmlns') + res[100:] if self.options.preprocess_html: - self.log("********* Preprocessing HTML *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(<(/i|b)>)?)?)\s*</span>\s*</p>', re.IGNORECASE) - res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res) - # Unwrap lines using punctation if the median length of all lines is less than 150 - length = line_length('html', res, 0.4) - self.log("*** Median length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*</p>\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*<span[^>]*>\s*" % length, re.UNICODE) - if length < 150: - res = unwrap.sub(' ', res) + preprocessor = PreProcessor(res) + res = preprocessor(res) f.write(res) self.write_inline_css(inline_class) stream.seek(0) diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui index 626c68ea63..b2ee421922 100644 --- a/src/calibre/gui2/convert/pdf_input.ui +++ b/src/calibre/gui2/convert/pdf_input.ui @@ -46,7 +46,7 @@ <double>0.010000000000000</double> </property> <property name="value"> - <double>0.500000000000000</double> + <double>0.450000000000000</double> </property> </widget> </item>