From b0a9c9659cda37d6cda41b22cd765713fb29f308 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Thu, 13 Jan 2011 19:58:09 -0500
Subject: [PATCH 01/54] Add heuristic options. Remove options that they
replace.
---
src/calibre/ebooks/conversion/cli.py | 23 ++++-
src/calibre/ebooks/conversion/plumber.py | 117 ++++++++++++++---------
2 files changed, 92 insertions(+), 48 deletions(-)
diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 3178fe1b43..f825776c9c 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -126,8 +126,21 @@ def add_pipeline_options(parser, plumber):
'margin_top', 'margin_left', 'margin_right',
'margin_bottom', 'change_justification',
'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
- 'asciiize', 'remove_header', 'header_regex',
- 'remove_footer', 'footer_regex',
+ 'asciiize',
+ ]
+ ),
+
+ 'HEURISTICS' : (
+ _('Modify the document text and structure using common patterns.'),
+ [
+ 'enable_heuristics', 'markup_chapter_headings',
+ 'italicize_common_cases', 'fix_indents',
+ 'html_unwrap_factor', 'unwrap_lines',
+ 'delete_blank_paragraphs', 'format_scene_breaks',
+ 'dehyphenate',
+ 'sr1_search', 'sr1_replace',
+ 'sr2_search', 'sr2_replace',
+ 'sr3_search', 'sr3_replace',
]
),
@@ -137,7 +150,6 @@ def add_pipeline_options(parser, plumber):
'chapter', 'chapter_mark',
'prefer_metadata_cover', 'remove_first_image',
'insert_metadata', 'page_breaks_before',
- 'preprocess_html', 'html_unwrap_factor',
]
),
@@ -164,8 +176,9 @@ def add_pipeline_options(parser, plumber):
}
- group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION',
- 'TABLE OF CONTENTS', 'METADATA', 'DEBUG']
+ group_order = ['', 'LOOK AND FEEL', 'HEURISTICS',
+ 'STRUCTURE DETECTION', 'TABLE OF CONTENTS',
+ 'METADATA', 'DEBUG']
for group in group_order:
desc, options = groups[group]
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 9b22fb46ec..3ec4e104f9 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -376,23 +376,6 @@ OptionRecommendation(name='insert_metadata',
)
),
-OptionRecommendation(name='preprocess_html',
- recommended_value=False, level=OptionRecommendation.LOW,
- help=_('Attempt to detect and correct hard line breaks and other '
- 'problems in the source file. This may make things worse, so use '
- 'with care.'
- )
- ),
-
-OptionRecommendation(name='html_unwrap_factor',
- recommended_value=0.40, level=OptionRecommendation.LOW,
- help=_('Scale used to determine the length at which a line should '
- 'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
- 'default is 0.40, just below the median line length. This will unwrap typical books '
- ' with hard line breaks, but should be reduced if the line length is variable.'
- )
- ),
-
OptionRecommendation(name='smarten_punctuation',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Convert plain quotes, dashes and ellipsis to their '
@@ -401,32 +384,6 @@ OptionRecommendation(name='smarten_punctuation',
)
),
-OptionRecommendation(name='remove_header',
- recommended_value=False, level=OptionRecommendation.LOW,
- help=_('Use a regular expression to try and remove the header.'
- )
- ),
-
-OptionRecommendation(name='header_regex',
- recommended_value='(?i)(?<=
)((\s*(()*
\s*)?\d+
\s*.*?\s*)|(\s*(()*
\s*)?.*?
\s*\d+))(?=
)',
- level=OptionRecommendation.LOW,
- help=_('The regular expression to use to remove the header.'
- )
- ),
-
-OptionRecommendation(name='remove_footer',
- recommended_value=False, level=OptionRecommendation.LOW,
- help=_('Use a regular expression to try and remove the footer.'
- )
- ),
-
-OptionRecommendation(name='footer_regex',
- recommended_value='(?i)(?<=
)((\s*(()*
\s*)?\d+
\s*.*?\s*)|(\s*(()*
\s*)?.*?
\s*\d+))(?=
)',
- level=OptionRecommendation.LOW,
- help=_('The regular expression to use to remove the footer.'
- )
- ),
-
OptionRecommendation(name='read_metadata_from_opf',
recommended_value=None, level=OptionRecommendation.LOW,
short_switch='m',
@@ -526,7 +483,81 @@ OptionRecommendation(name='pubdate',
OptionRecommendation(name='timestamp',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the book timestamp (used by the date column in calibre).')),
+
+OptionRecommendation(name='enable_heuristics',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Enable heuristic processing. This option must be set for any '
+ 'heuristic processing to take place.')),
+OptionRecommendation(name='markup_chapter_headings',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Detect chapter headings and sub headings. Change '
+ 'them to h1 and h2 tags.')),
+
+OptionRecommendation(name='italicize_common_cases',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Look for common words and patterns that denote '
+ 'italics and italicize them.')),
+
+OptionRecommendation(name='fix_indents',
+ recommended_value=True, level=OptionRecommendation.LOW,
+ help=_('Turn indentation created from multiple entities '
+ 'into CSS indents.')),
+
+OptionRecommendation(name='html_unwrap_factor',
+ recommended_value=0.40, level=OptionRecommendation.LOW,
+ help=_('Scale used to determine the length at which a line should '
+ 'be unwrapped. Valid values are a decimal between 0 and 1. The '
+ 'default is 0.4, just below the median line length.')),
+
+OptionRecommendation(name='unwrap_lines',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Unwrap lines.')),
+
+OptionRecommendation(name='delete_blank_paragraphs',
+ recommended_value=True, level=OptionRecommendation.LOW,
+ help=_('Remove empyt paragraphs from the document')),
+
+OptionRecommendation(name='format_scene_breaks',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Replace soft scene breaks that use multiple blank lines '
+ 'with horizontal rules.')),
+
+OptionRecommendation(name='dehyphenate',
+ recommended_value=True, level=OptionRecommendation.LOW,
+ help=_('Combine words that are separated by a hyphen. '
+ 'This is for cases where a word is hyphenated across '
+ 'two lines to denote the characters from a single word.')),
+
+OptionRecommendation(name='sr1_search',
+ recommended_value='', level=OptionRecommendation.LOW,
+ help=_('Search pattern (regular expression) to be replaced with '
+ 'sr1-replace.')),
+
+OptionRecommendation(name='sr1_replace',
+ recommended_value='', level=OptionRecommendation.LOW,
+ help=_('Replace characters (can be lambda expression) to '
+ 'replace the text found with sr1-search.')),
+
+OptionRecommendation(name='sr2_search',
+ recommended_value='', level=OptionRecommendation.LOW,
+ help=_('Search pattern (regular expression) to be replaced with '
+ 'sr2-replace.')),
+
+OptionRecommendation(name='sr2_replace',
+ recommended_value='', level=OptionRecommendation.LOW,
+ help=_('Replace characters (can be lambda expression) to '
+ 'replace the text found with sr2-search.')),
+
+OptionRecommendation(name='sr3_search',
+ recommended_value='', level=OptionRecommendation.LOW,
+ help=_('Search pattern (regular expression) to be replaced with '
+ 'sr3-replace.')),
+
+OptionRecommendation(name='sr3_replace',
+ recommended_value='', level=OptionRecommendation.LOW,
+ help=_('Replace characters (can be lambda expression) to '
+ 'replace the text found with sr3-search.')),
]
# }}}
From 8676ddd30fba0df90eb62e7c1c84c3fd3dc13f39 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Fri, 14 Jan 2011 18:12:17 +0800
Subject: [PATCH 02/54] updated heuristics help messages
---
src/calibre/ebooks/conversion/plumber.py | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 3ec4e104f9..50d0646c7d 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -491,8 +491,8 @@ OptionRecommendation(name='enable_heuristics',
OptionRecommendation(name='markup_chapter_headings',
recommended_value=False, level=OptionRecommendation.LOW,
- help=_('Detect chapter headings and sub headings. Change '
- 'them to h1 and h2 tags.')),
+ help=_('Detect unformatted chapter headings and sub headings. Change '
+ 'them to h2 and h3 tags.')),
OptionRecommendation(name='italicize_common_cases',
recommended_value=False, level=OptionRecommendation.LOW,
@@ -508,26 +508,30 @@ OptionRecommendation(name='html_unwrap_factor',
recommended_value=0.40, level=OptionRecommendation.LOW,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
- 'default is 0.4, just below the median line length.')),
+ 'default is 0.4, just below the median line length. If only a '
+ 'few lines in the document require unwrapping, this value should '
+ 'be reduced.')),
OptionRecommendation(name='unwrap_lines',
recommended_value=False, level=OptionRecommendation.LOW,
- help=_('Unwrap lines.')),
+ help=_('Unwrap lines using punctuation and other formatting clues.')),
OptionRecommendation(name='delete_blank_paragraphs',
recommended_value=True, level=OptionRecommendation.LOW,
- help=_('Remove empyt paragraphs from the document')),
+ help=_('Remove empty paragraphs from the document when they exist between '
+ 'every other paragraph.')),
OptionRecommendation(name='format_scene_breaks',
recommended_value=False, level=OptionRecommendation.LOW,
- help=_('Replace soft scene breaks that use multiple blank lines '
- 'with horizontal rules.')),
+ help=_('Detects left aligned scene break markers and center aligns them. '
+ 'Replaces soft scene breaks that use multiple blank lines with '
+ 'horizontal rules.')),
OptionRecommendation(name='dehyphenate',
recommended_value=True, level=OptionRecommendation.LOW,
- help=_('Combine words that are separated by a hyphen. '
- 'This is for cases where a word is hyphenated across '
- 'two lines to denote the characters from a single word.')),
+ help=_('Analyses hyphenated words throughout the document. The '
+ 'document itself is used as a dictionary to determine whether hyphens '
+ 'should be retained or removed.')),
OptionRecommendation(name='sr1_search',
recommended_value='', level=OptionRecommendation.LOW,
From 90c978bb1076e7afb7843df96959cc365d17332d Mon Sep 17 00:00:00 2001
From: ldolse
Date: Fri, 14 Jan 2011 21:33:47 +0800
Subject: [PATCH 03/54] tied enable heuristics to preprocess, moved various
pieces to functions
---
src/calibre/customize/conversion.py | 2 +-
src/calibre/ebooks/conversion/plumber.py | 4 +-
src/calibre/ebooks/conversion/utils.py | 117 ++++++++++++++---------
src/calibre/ebooks/lit/input.py | 2 +-
4 files changed, 78 insertions(+), 47 deletions(-)
diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index ec83600a49..a9e573ffa0 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -160,7 +160,7 @@ class InputFormatPlugin(Plugin):
'''
raise NotImplementedError()
- def preprocess_html(self, opts, html):
+ def heuristics(self, opts, html):
'''
This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 50d0646c7d..a40c17a743 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -1012,8 +1012,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
Create an OEBBook.
'''
from calibre.ebooks.oeb.base import OEBBook
- html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
- opts.preprocess_html, opts)
+ html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
+ opts.enable_heuristics, opts)
if not encoding:
encoding = None
oeb = OEBBook(log, html_preprocessor,
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index dac93fa2e2..44d4235b6c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -113,6 +113,11 @@ class PreProcessor(object):
return wordcount.words
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+ '''
+ Searches for common chapter headings throughout the document,
+ attempting multiple patterns in order of likelihood of a match
+ with minimum false positives. Exits after finding a successful pattern.
+ '''
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the
# minimum of chapters to search for
self.min_chapters = 1
@@ -185,6 +190,10 @@ class PreProcessor(object):
return html
def punctuation_unwrap(self, length, content, format):
+ '''
+ Unwraps lines based on line length and punctuation.
+ Supports a range of potential HTML markup as well as text files.
+ '''
# define the pieces of the regex
lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*((p|span|div)>)?"
@@ -201,53 +210,38 @@ class PreProcessor(object):
return content
- def __call__(self, html):
- self.log("********* Preprocessing HTML *********")
+ def text_process_pre(self, html):
+ pre = re.compile(r'', re.IGNORECASE)
+ if len(pre.findall(html)) == 1:
+ self.log("Running Text Processing")
+ from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+ separate_paragraphs_single_line
+ outerhtml = re.compile(r'.*?(?<=)(?P.*)(?=
).*', re.IGNORECASE|re.DOTALL)
+ html = outerhtml.sub('\g', html)
+ html = separate_paragraphs_single_line(html)
+ html = preserve_spaces(html)
+ html = convert_basic(html, epub_split_size_kb=0)
+ else:
+ # Add markup naively
+ # TODO - find out if there are cases where there is more than one <pre> tag or
+ # other types of unmarked html and handle them in some better fashion
+ add_markup = re.compile('(?)(\n)')
+ html = add_markup.sub('
\n', html)
+ return html
- # Count the words in the document to estimate how many chapters to look for and whether
- # other types of processing are attempted
- totalwords = 0
- totalwords = self.get_word_count(html)
-
- if totalwords < 50:
- self.log("not enough text, not preprocessing")
- return html
-
- # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+ def arrange_htm_line_endings(self, html):
html = re.sub(r"\s*(?Pp|div)>", ""+"\g"+">\n", html)
html = re.sub(r"\s*<(?Pp|div)(?P