From 8676ddd30fba0df90eb62e7c1c84c3fd3dc13f39 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Fri, 14 Jan 2011 18:12:17 +0800
Subject: [PATCH 1/6] updated heuristics help messages
---
src/calibre/ebooks/conversion/plumber.py | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 3ec4e104f9..50d0646c7d 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -491,8 +491,8 @@ OptionRecommendation(name='enable_heuristics',
OptionRecommendation(name='markup_chapter_headings',
recommended_value=False, level=OptionRecommendation.LOW,
- help=_('Detect chapter headings and sub headings. Change '
- 'them to h1 and h2 tags.')),
+ help=_('Detect unformatted chapter headings and sub headings. Change '
+ 'them to h2 and h3 tags.')),
OptionRecommendation(name='italicize_common_cases',
recommended_value=False, level=OptionRecommendation.LOW,
@@ -508,26 +508,30 @@ OptionRecommendation(name='html_unwrap_factor',
recommended_value=0.40, level=OptionRecommendation.LOW,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
- 'default is 0.4, just below the median line length.')),
+ 'default is 0.4, just below the median line length. If only a '
+ 'few lines in the document require unwrapping this value should '
+ 'be reduced.')),
OptionRecommendation(name='unwrap_lines',
recommended_value=False, level=OptionRecommendation.LOW,
- help=_('Unwrap lines.')),
+ help=_('Unwrap lines using punctuation and other formatting clues.')),
OptionRecommendation(name='delete_blank_paragraphs',
recommended_value=True, level=OptionRecommendation.LOW,
- help=_('Remove empyt paragraphs from the document')),
+ help=_('Remove empty paragraphs from the document when they exist between '
+ 'every other paragraph')),
OptionRecommendation(name='format_scene_breaks',
recommended_value=False, level=OptionRecommendation.LOW,
- help=_('Replace soft scene breaks that use multiple blank lines '
- 'with horizontal rules.')),
+ help=_('Detects left aligned scene break markers and center aligns them. '
+ 'Replace soft scene breaks that use multiple blank lines with '
+ 'horizontal rules.')),
OptionRecommendation(name='dehyphenate',
recommended_value=True, level=OptionRecommendation.LOW,
- help=_('Combine words that are separated by a hyphen. '
- 'This is for cases where a word is hyphenated across '
- 'two lines to denote the characters from a single word.')),
+ help=_('Analyses hyphenated words throughout the document. The '
+ 'document itself is used as a dictionary to determine whether hyphens '
+ 'should be retained or removed.')),
OptionRecommendation(name='sr1_search',
recommended_value='', level=OptionRecommendation.LOW,
From 90c978bb1076e7afb7843df96959cc365d17332d Mon Sep 17 00:00:00 2001
From: ldolse
Date: Fri, 14 Jan 2011 21:33:47 +0800
Subject: [PATCH 2/6] tied enable heuristics to preprocess, moved various
pieces to functions
---
src/calibre/customize/conversion.py | 2 +-
src/calibre/ebooks/conversion/plumber.py | 4 +-
src/calibre/ebooks/conversion/utils.py | 117 ++++++++++++++---------
src/calibre/ebooks/lit/input.py | 2 +-
4 files changed, 78 insertions(+), 47 deletions(-)
diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index ec83600a49..a9e573ffa0 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -160,7 +160,7 @@ class InputFormatPlugin(Plugin):
'''
raise NotImplementedError()
- def preprocess_html(self, opts, html):
+ def heuristics(self, opts, html):
'''
This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 50d0646c7d..a40c17a743 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -1012,8 +1012,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
Create an OEBBook.
'''
from calibre.ebooks.oeb.base import OEBBook
- html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
- opts.preprocess_html, opts)
+ html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
+ opts.enable_heuristics, opts)
if not encoding:
encoding = None
oeb = OEBBook(log, html_preprocessor,
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index dac93fa2e2..44d4235b6c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -113,6 +113,11 @@ class PreProcessor(object):
return wordcount.words
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+ '''
+ Searches for common chapter headings throughout the document
+ attempts multiple patterns based on likelihood of a match
+ with minimum false positives. Exits after finding a successful pattern
+ '''
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the
# minimum of chapters to search for
self.min_chapters = 1
@@ -185,6 +190,10 @@ class PreProcessor(object):
return html
def punctuation_unwrap(self, length, content, format):
+ '''
+ Unwraps lines based on line length and punctuation
+ supports range of potential html markup and text files
+ '''
# define the pieces of the regex
lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*((p|span|div)>)?"
@@ -201,53 +210,38 @@ class PreProcessor(object):
return content
- def __call__(self, html):
- self.log("********* Preprocessing HTML *********")
+ def text_process_pre(self, html):
+ pre = re.compile(r'', re.IGNORECASE)
+ if len(pre.findall(html)) == 1:
+ self.log("Running Text Processing")
+ from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+ separate_paragraphs_single_line
+ outerhtml = re.compile(r'.*?(?<=)(?P.*)(?=
).*', re.IGNORECASE|re.DOTALL)
+ html = outerhtml.sub('\g', html)
+ html = separate_paragraphs_single_line(html)
+ html = preserve_spaces(html)
+ html = convert_basic(html, epub_split_size_kb=0)
+ else:
+ # Add markup naively
+ # TODO - find out if there are cases where there are more than one tag or
+ # other types of unmarked html and handle them in some better fashion
+ add_markup = re.compile('(?)(\n)')
+ html = add_markup.sub('
\n', html)
+ return html
- # Count the words in the document to estimate how many chapters to look for and whether
- # other types of processing are attempted
- totalwords = 0
- totalwords = self.get_word_count(html)
-
- if totalwords < 50:
- self.log("not enough text, not preprocessing")
- return html
-
- # Arrange line feeds and
tags so the line_length and no_markup functions work correctly
+ def arrange_htm_line_endings(self, html):
html = re.sub(r"\s*(?Pp|div)>", ""+"\g"+">\n", html)
html = re.sub(r"\s*<(?Pp|div)(?P