From 2fc9cd28c997e4db658e3a4c28a6a15950018e34 Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 17 Sep 2010 21:31:07 +0800 Subject: [PATCH 1/4] Passing options to preprocess, other tweaks to preprocess code --- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/conversion/utils.py | 67 ++++++++++++++------- src/calibre/ebooks/html/input.py | 4 +- src/calibre/ebooks/lit/input.py | 5 +- src/calibre/ebooks/lrf/input.py | 4 +- src/calibre/ebooks/mobi/input.py | 4 +- src/calibre/ebooks/rtf/input.py | 2 +- 7 files changed, 57 insertions(+), 31 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index e72e15c3d9..3e5de26766 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -399,7 +399,7 @@ class HTMLPreProcessor(object): html = unidecoder.decode(html) if self.plugin_preprocess: - html = self.input_plugin_preprocess(html) + html = self.input_plugin_preprocess(self.extra_opts, html) if getattr(self.extra_opts, 'smarten_punctuation', False): html = self.smarten_punctuation(html) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 3fe6ce0ed4..874e157063 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -8,10 +8,11 @@ __docformat__ = 'restructuredtext en' import re from calibre.ebooks.conversion.preprocess import line_length from calibre.utils.logging import default_log +from calibre import entity_to_unicode class PreProcessor(object): - def __init__(self, log=None, extra_opts=None): + def __init__(self, extra_opts=None, log=None): self.log = default_log if log is None else log self.html_preprocess_sections = 0 self.found_indents = 0 @@ -77,6 +78,32 @@ class PreProcessor(object): def __call__(self, html): self.log("********* Preprocessing HTML *********") + ###### Check Markup ###### + # + # some lit files don't have any

<p> tags or equivalent (generally just plain text between
+        # <pre> tags), check and mark up line endings if required before proceeding
+        if self.no_markup(html, 0.1):
+             self.log("not enough paragraph markers, adding now")
+             # check if content is in pre tags, use txt processor to mark up if so
+             pre = re.compile(r'
', re.IGNORECASE)
+             if len(pre.findall(html)) == 1:
+                 self.log("Running Text Processing")
+                 from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+                 separate_paragraphs_single_line
+                 outerhtml = re.compile(r'.*?(?<=
)(?P.*)(?=
).*', re.IGNORECASE|re.DOTALL) + html = outerhtml.sub('\g', html) + html = separate_paragraphs_single_line(html) + html = preserve_spaces(html) + html = convert_basic(html, epub_split_size_kb=0) + else: + # Add markup naively + # TODO - find out if there are cases where there are more than one
<pre> tag or
+                 # other types of unmarked html and handle them in some better fashion
+                 add_markup = re.compile('(?)(\n)')
+                 html = add_markup.sub('

\n

', html) + + ###### Mark Indents/Cleanup ###### + # # Replace series of non-breaking spaces with text-indent txtindent = re.compile(ur'[^>]*)>\s*(?P(]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE) html = txtindent.sub(self.insert_indent, html) @@ -86,31 +113,27 @@ class PreProcessor(object): html = re.sub(ur'\u00a0', ' ', html) # Get rid of empty tags to simplify other processing html = re.sub(ur'\s*\s*', ' ', html) - # Get rid of empty span tags - html = re.sub(r"\s*]*>\s*", " ", html) + # Get rid of empty span, bold, & italics tags + html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html) + html = re.sub(r"\s*<[ibu]>\s*(<[ibu]>\s*\s*){0,2}\s*", " ", html) + html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html) # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) - blankreg = re.compile(r'\s*]*>\s*(<(b|i|u)>)?\s*()?\s*

', re.IGNORECASE) + blankreg = re.compile(r'\s*(?P]*>)\s*(<(b|i|u)>)?\s*()?\s*(?P

)', re.IGNORECASE) #multi_blank = re.compile(r'(\s*]*>\s*(<(b|i|u)>)?\s*()?\s*

){2,}', re.IGNORECASE) blanklines = blankreg.findall(html) lines = linereg.findall(html) if len(lines) > 1: self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") - if float(len(blanklines)) / float(len(lines)) > 0.40: + if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts, + 'remove_paragraph_spacing', False): self.log("deleting blank lines") html = blankreg.sub('', html) # Arrange line feeds and

tags so the line_length and no_markup functions work correctly html = re.sub(r"\s*

", "

\n", html) html = re.sub(r"\s*

\s*", "\n

", html) - - # some lit files don't have any

<p> tags or equivalent (generally just plain text between
-        # <pre> tags), check and  mark up line endings if required before proceeding
-        if self.no_markup(html, 0.1):
-             self.log("not enough paragraph markers, adding now")
-             add_markup = re.compile('(?)(\n)')
-             html = add_markup.sub('

\n

', html) - + #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") # detect chapters/sections to match xpath or splitting logic heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) @@ -118,7 +141,7 @@ class PreProcessor(object): # # Start with most typical chapter headings, get more aggressive until one works if self.html_preprocess_sections < 10: - chapdetect = re.compile(r'(?=]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}s*(]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(){0,2})\s*()?s*()?\s*(){0,2}\s*()\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE) html = chapdetect.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters") @@ -127,10 +150,10 @@ class PreProcessor(object): if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words") - chapdetect2 = 
re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) - # Unwrap lines + ###### Unwrap lines ###### # self.log("Unwrapping Lines") # Some OCR sourced files have line breaks in the html using a combination of span & p tags @@ -149,9 +172,9 @@ class PreProcessor(object): format = 'html' # Calculate Length - length = line_length('pdf', html, getattr(self.extra_opts, + length = line_length(format, html, getattr(self.extra_opts, 'html_unwrap_factor', 0.4)) - self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***") + self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***") # # Unwrap and/or delete soft-hyphens, hyphens html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) @@ -164,13 +187,15 @@ class PreProcessor(object): # If still no sections after unwrapping mark split points on lines with no punctuation if self.html_preprocess_sections < 10: self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections)) - #self.log(html) chapdetect3 = 
re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) html = chapdetect3.sub(self.chapter_break, html) # search for places where a first or second level heading is immediately followed by another # top level heading. demote the second heading to h3 to prevent splitting between chapter # headings and titles, images, etc doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) - html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) - + html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html) + + # put back non-breaking spaces in empty paragraphs to preserve original formatting + html = blankreg.sub('\n'+'\g<openline>'+' '+'\g<closeline>', html) + return html diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 084d48e54b..8b94fd83ec 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -490,7 +490,7 @@ class HTMLInput(InputFormatPlugin): return (None, None) return (None, raw) - def preprocess_html(self, html): - preprocessor = PreProcessor(log=getattr(self, 'log', None)) + def preprocess_html(self, options, html): + preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) return preprocessor(html) diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 65f5c607a2..4c0beebdd9 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -53,7 +53,8 @@ class LITInput(InputFormatPlugin): pre.append(ne) - def preprocess_html(self, html): - preprocessor = PreProcessor(log=getattr(self, 'log', None)) + def preprocess_html(self, options, html): + self.options = options + preprocessor = 
PreProcessor(self.options, log=getattr(self, 'log', None)) return preprocessor(html) diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py index c54f3b071f..cdc8fa75c2 100644 --- a/src/calibre/ebooks/lrf/input.py +++ b/src/calibre/ebooks/lrf/input.py @@ -420,8 +420,8 @@ class LRFInput(InputFormatPlugin): styles.write() return os.path.abspath('content.opf') - def preprocess_html(self, html): - preprocessor = PreProcessor(log=getattr(self, 'log', None)) + def preprocess_html(self, options, html): + preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) return preprocessor(html) diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index b8dc7a9560..9ab7996a74 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -39,11 +39,11 @@ class MOBIInput(InputFormatPlugin): accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]' return mr.created_opf_path - def preprocess_html(self, html): + def preprocess_html(self, options, html): # search for places where a first or second level heading is immediately followed by another # top level heading. 
demote the second heading to h3 to prevent splitting between chapter # headings and titles, images, etc doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) - html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html) return html diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 000c603c1c..078b30627f 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -229,7 +229,7 @@ class RTFInput(InputFormatPlugin): res = transform.tostring(result) res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] if self.options.preprocess_html: - preprocessor = PreProcessor(log=getattr(self, 'log', None)) + preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) res = preprocessor(res) f.write(res) self.write_inline_css(inline_class) From 58ce70509040b32e40ce5642791b8801d697bf3a Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Fri, 17 Sep 2010 22:14:16 +0800 Subject: [PATCH 2/4] ... --- src/calibre/ebooks/conversion/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 874e157063..ba9feff4f7 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en' import re from calibre.ebooks.conversion.preprocess import line_length from calibre.utils.logging import default_log -from calibre import entity_to_unicode class PreProcessor(object): From d0d1ff06dcf129c30efaf1a4a46d38d61c29a4b7 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Fri, 17 Sep 2010 22:16:27 +0800 Subject: [PATCH 3/4] ... 
--- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index ba9feff4f7..91de2dc259 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -119,7 +119,7 @@ class PreProcessor(object): # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL) - blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*(?P<closeline></p>)', re.IGNORECASE) + blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE) #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE) blanklines = blankreg.findall(html) lines = linereg.findall(html) From dbb35b5823e9486817dc5e4e689dbfcacb8c29ef Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Fri, 17 Sep 2010 23:42:41 +0800 Subject: [PATCH 4/4] fixed option definition for lrf & html input plugins --- src/calibre/ebooks/html/input.py | 1 + src/calibre/ebooks/lrf/input.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 8b94fd83ec..e29ebd4554 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -491,6 +491,7 @@ class HTMLInput(InputFormatPlugin): return (None, raw) def preprocess_html(self, options, html): + self.options = options preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) return preprocessor(html) diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py index cdc8fa75c2..70529c0a04 100644 --- a/src/calibre/ebooks/lrf/input.py +++ b/src/calibre/ebooks/lrf/input.py @@ -421,6 +421,7 @@ class LRFInput(InputFormatPlugin): return os.path.abspath('content.opf') def preprocess_html(self, options, html): + 
self.options = options preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) return preprocessor(html)