From 90c978bb1076e7afb7843df96959cc365d17332d Mon Sep 17 00:00:00 2001
From: ldolse
Date: Fri, 14 Jan 2011 21:33:47 +0800
Subject: [PATCH] tied enable heuristics to preprocess, moved various pieces to
functions
---
src/calibre/customize/conversion.py | 2 +-
src/calibre/ebooks/conversion/plumber.py | 4 +-
src/calibre/ebooks/conversion/utils.py | 117 ++++++++++++++---------
src/calibre/ebooks/lit/input.py | 2 +-
4 files changed, 78 insertions(+), 47 deletions(-)
diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index ec83600a49..a9e573ffa0 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -160,7 +160,7 @@ class InputFormatPlugin(Plugin):
'''
raise NotImplementedError()
- def preprocess_html(self, opts, html):
+ def heuristics(self, opts, html):
'''
This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 50d0646c7d..a40c17a743 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -1012,8 +1012,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
Create an OEBBook.
'''
from calibre.ebooks.oeb.base import OEBBook
- html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
- opts.preprocess_html, opts)
+ html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
+ opts.enable_heuristics, opts)
if not encoding:
encoding = None
oeb = OEBBook(log, html_preprocessor,
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index dac93fa2e2..44d4235b6c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -113,6 +113,11 @@ class PreProcessor(object):
return wordcount.words
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+ '''
+ Searches for common chapter headings throughout the document.
+ Attempts multiple patterns based on likelihood of a match
+ with minimal false positives. Exits after finding a successful pattern.
+ '''
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the
# minimum of chapters to search for
self.min_chapters = 1
@@ -185,6 +190,10 @@ class PreProcessor(object):
return html
def punctuation_unwrap(self, length, content, format):
+ '''
+ Unwraps lines based on line length and punctuation.
+ Supports a range of potential html markup and text files.
+ '''
# define the pieces of the regex
lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*((p|span|div)>)?"
@@ -201,53 +210,38 @@ class PreProcessor(object):
return content
- def __call__(self, html):
- self.log("********* Preprocessing HTML *********")
+ def text_process_pre(self, html):
+ pre = re.compile(r'', re.IGNORECASE)
+ if len(pre.findall(html)) == 1:
+ self.log("Running Text Processing")
+ from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+ separate_paragraphs_single_line
+ outerhtml = re.compile(r'.*?(?<=)(?P.*)(?=
).*', re.IGNORECASE|re.DOTALL)
+ html = outerhtml.sub('\g', html)
+ html = separate_paragraphs_single_line(html)
+ html = preserve_spaces(html)
+ html = convert_basic(html, epub_split_size_kb=0)
+ else:
+ # Add markup naively
+ # TODO - find out if there are cases where there is more than one tag or
+ # other types of unmarked html and handle them in some better fashion
+ add_markup = re.compile('(?)(\n)')
+ html = add_markup.sub('
\n', html)
+ return html
- # Count the words in the document to estimate how many chapters to look for and whether
- # other types of processing are attempted
- totalwords = 0
- totalwords = self.get_word_count(html)
-
- if totalwords < 50:
- self.log("not enough text, not preprocessing")
- return html
-
- # Arrange line feeds and
tags so the line_length and no_markup functions work correctly
+ def arrange_htm_line_endings(self, html):
html = re.sub(r"\s*(?Pp|div)>", ""+"\g"+">\n", html)
html = re.sub(r"\s*<(?Pp|div)(?P