diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index ec83600a49..a9e573ffa0 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -160,7 +160,7 @@ class InputFormatPlugin(Plugin): ''' raise NotImplementedError() - def preprocess_html(self, opts, html): + def heuristics(self, opts, html): ''' This method is called by the conversion pipeline on all HTML before it is parsed. It is meant to be used to do any required preprocessing on diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 50d0646c7d..a40c17a743 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -1012,8 +1012,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None, Create an OEBBook. ''' from calibre.ebooks.oeb.base import OEBBook - html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, - opts.preprocess_html, opts) + html_preprocessor = HTMLPreProcessor(input_plugin.heuristics, + opts.enable_heuristics, opts) if not encoding: encoding = None oeb = OEBBook(log, html_preprocessor, diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index dac93fa2e2..44d4235b6c 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -113,6 +113,11 @@ class PreProcessor(object): return wordcount.words def markup_chapters(self, html, wordcount, blanks_between_paragraphs): + ''' + Searches for common chapter headings throughout the document + attempts multiple patterns based on likelihood of a match + with minimum false positives. Exits after finding a successful pattern + ''' # Typical chapters are between 2000 and 7000 words, use the larger number to decide the # minimum of chapters to search for self.min_chapters = 1 @@ -185,6 +190,10 @@ class PreProcessor(object): return html def punctuation_unwrap(self, length, content, format): + ''' + Unwraps lines based on line length and punctuation + supports range of potential html markup and text files + ''' # define the pieces of the regex lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*((p|span|div)>)?" @@ -201,53 +210,38 @@ class PreProcessor(object): return content - def __call__(self, html): - self.log("********* Preprocessing HTML *********") + def text_process_pre(self, html): + pre = re.compile(r'
', re.IGNORECASE) + if len(pre.findall(html)) == 1: + self.log("Running Text Processing") + from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \ + separate_paragraphs_single_line + outerhtml = re.compile(r'.*?(?<=)(?P).*', re.IGNORECASE|re.DOTALL) + html = outerhtml.sub('\g.*)(?= ', html) + html = separate_paragraphs_single_line(html) + html = preserve_spaces(html) + html = convert_basic(html, epub_split_size_kb=0) + else: + # Add markup naively + # TODO - find out if there are cases where there are more than one tag or + # other types of unmarked html and handle them in some better fashion + add_markup = re.compile('(?)(\n)') + html = add_markup.sub('\n', html) + return html - # Count the words in the document to estimate how many chapters to look for and whether - # other types of processing are attempted - totalwords = 0 - totalwords = self.get_word_count(html) - - if totalwords < 50: - self.log("not enough text, not preprocessing") - return html - - # Arrange line feeds and
tags so the line_length and no_markup functions work correctly + def arrange_htm_line_endings(self, html): html = re.sub(r"\s*(?Pp|div)>", ""+"\g "+">\n", html) html = re.sub(r"\s*<(?P p|div)(?P