diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index ec83600a49..b77ac81587 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin): ''' raise NotImplementedError() - def preprocess_html(self, opts, html): - ''' - This method is called by the conversion pipeline on all HTML before it - is parsed. It is meant to be used to do any required preprocessing on - the HTML, like removing hard line breaks, etc. - - :param html: A unicode string - :return: A unicode string - ''' - return html - - def convert(self, stream, options, file_ext, log, accelerators): ''' This method must be implemented in sub-classes. It must return diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py index c4b124fe98..89efa2b4d1 100644 --- a/src/calibre/ebooks/chm/input.py +++ b/src/calibre/ebooks/chm/input.py @@ -75,7 +75,7 @@ class CHMInput(InputFormatPlugin): def _create_oebbook(self, hhcpath, basedir, opts, log, mi): from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.oeb.base import DirContainer - oeb = create_oebbook(log, None, opts, self, + oeb = create_oebbook(log, None, opts, encoding=opts.input_encoding, populate=False) self.oeb = oeb diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 3178fe1b43..b5c057b0f9 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -126,10 +126,29 @@ def add_pipeline_options(parser, plumber): 'margin_top', 'margin_left', 'margin_right', 'margin_bottom', 'change_justification', 'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size', - 'asciiize', 'remove_header', 'header_regex', - 'remove_footer', 'footer_regex', + 'asciiize', ] ), + + 'HEURISTIC PROCESSING' : ( + _('Modify the document text and structure using common patterns.'), + [ + 'enable_heuristics', 'markup_chapter_headings', + 'italicize_common_cases', 'fix_indents', + 'html_unwrap_factor', 'unwrap_lines', + 'delete_blank_paragraphs', 'format_scene_breaks', + 'dehyphenate', 'renumber_headings', + ] + ), + + 'SEARCH AND REPLACE' : ( + _('Modify the document text and structure using user defined patterns.'), + [ + 'sr1_search', 'sr1_replace', + 'sr2_search', 'sr2_replace', + 'sr3_search', 'sr3_replace', + ] + ), 'STRUCTURE DETECTION' : ( _('Control auto-detection of document structure.'), @@ -137,7 +156,6 @@ def add_pipeline_options(parser, plumber): 'chapter', 'chapter_mark', 'prefer_metadata_cover', 'remove_first_image', 'insert_metadata', 'page_breaks_before', - 'preprocess_html', 'html_unwrap_factor', ] ), @@ -164,7 +182,8 @@ def add_pipeline_options(parser, plumber): } - group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION', + group_order = ['', 'LOOK AND FEEL', 'HEURISTIC PROCESSING', + 'SEARCH AND REPLACE', 'STRUCTURE DETECTION', 'TABLE OF CONTENTS', 'METADATA', 'DEBUG'] for group in group_order: diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 9b22fb46ec..6fdf7ddc68 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -376,23 +376,6 @@ OptionRecommendation(name='insert_metadata', ) ), -OptionRecommendation(name='preprocess_html', - recommended_value=False, level=OptionRecommendation.LOW, - help=_('Attempt to detect and correct hard line breaks and other ' - 'problems in the source file. This may make things worse, so use ' - 'with care.' - ) - ), - -OptionRecommendation(name='html_unwrap_factor', - recommended_value=0.40, level=OptionRecommendation.LOW, - help=_('Scale used to determine the length at which a line should ' - 'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The ' - 'default is 0.40, just below the median line length. This will unwrap typical books ' - ' with hard line breaks, but should be reduced if the line length is variable.' - ) - ), - OptionRecommendation(name='smarten_punctuation', recommended_value=False, level=OptionRecommendation.LOW, help=_('Convert plain quotes, dashes and ellipsis to their ' @@ -401,32 +384,6 @@ OptionRecommendation(name='smarten_punctuation', ) ), -OptionRecommendation(name='remove_header', - recommended_value=False, level=OptionRecommendation.LOW, - help=_('Use a regular expression to try and remove the header.' - ) - ), - -OptionRecommendation(name='header_regex', - recommended_value='(?i)(?<=
\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) - # Make the more aggressive chapter marking regex optional with the preprocess option to - # reduce false positives and move after header/footer removal - if getattr(self.extra_opts, 'preprocess_html', None): - if is_pdftohtml: - end_rules.append((re.compile(r'
\s*(?P \s*(?P )?'), chap_head),)
-
length = -1
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
docanalysis = DocAnalysis('pdf', html)
@@ -512,15 +515,14 @@ class HTMLPreProcessor(object):
if is_pdftohtml and length > -1:
# Dehyphenate
- dehyphenator = Dehyphenator()
+ dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
html = dehyphenator(html,'html', length)
if is_pdftohtml:
- from calibre.ebooks.conversion.utils import PreProcessor
- pdf_markup = PreProcessor(self.extra_opts, None)
+ from calibre.ebooks.conversion.utils import HeuristicProcessor
+ pdf_markup = HeuristicProcessor(self.extra_opts, None)
totalwords = 0
- totalwords = pdf_markup.get_word_count(html)
- if totalwords > 7000:
+ if pdf_markup.get_word_count(html) > 7000:
html = pdf_markup.markup_chapters(html, totalwords, True)
#dump(html, 'post-preprocess')
@@ -540,8 +542,10 @@ class HTMLPreProcessor(object):
unidecoder = Unidecoder()
html = unidecoder.decode(html)
- if self.plugin_preprocess:
- html = self.input_plugin_preprocess(self.extra_opts, html)
+ if getattr(self.extra_opts, 'enable_heuristics', False):
+ from calibre.ebooks.conversion.utils import HeuristicProcessor
+ preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+ html = preprocessor(html)
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = self.smarten_punctuation(html)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 7732bb2b4d..4663eeccdf 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -11,13 +11,22 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj
-class PreProcessor(object):
+class HeuristicProcessor(object):
def __init__(self, extra_opts=None, log=None):
self.log = default_log if log is None else log
self.html_preprocess_sections = 0
self.found_indents = 0
self.extra_opts = extra_opts
+ self.deleted_nbsps = False
+ self.totalwords = 0
+ self.min_chapters = 1
+ self.chapters_no_title = 0
+ self.chapters_with_title = 0
+ self.blanks_deleted = False
+ self.linereg = re.compile('(?<= )', re.IGNORECASE|re.DOTALL)
+ self.blankreg = re.compile(r'\s*(?P ]*>)\s*(?P ]*>\s* ', html)
+ return html
- # Count the words in the document to estimate how many chapters to look for and whether
- # other types of processing are attempted
- totalwords = 0
- totalwords = self.get_word_count(html)
-
- if totalwords < 50:
- self.log("not enough text, not preprocessing")
- return html
-
- # Arrange line feeds and '+chap+'
\n'
else:
self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + unicode(self.html_preprocess_sections) +
+ self.log.debug("marked " + unicode(self.html_preprocess_sections) +
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
return ''+chap+'
\n'+title+'
\n'
@@ -40,10 +49,18 @@ class PreProcessor(object):
chap = match.group('section')
styles = match.group('styles')
self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + unicode(self.html_preprocess_sections) +
+ self.log.debug("marked " + unicode(self.html_preprocess_sections) +
" section markers based on punctuation. - " + unicode(chap))
return '<'+styles+' style="page-break-before:always">'+chap
+ def analyze_title_matches(self, match):
+ chap = match.group('chap')
+ title = match.group('title')
+ if not title:
+ self.chapters_no_title = self.chapters_no_title + 1
+ else:
+ self.chapters_with_title = self.chapters_with_title + 1
+
def insert_indent(self, match):
pstyle = match.group('formatting')
span = match.group('span')
@@ -75,8 +92,8 @@ class PreProcessor(object):
line_end = line_end_ere.findall(raw)
tot_htm_ends = len(htm_end)
tot_ln_fds = len(line_end)
- self.log("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
- unicode(tot_htm_ends) + " marked up endings")
+ #self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
+ # unicode(tot_htm_ends) + " marked up endings")
if percent > 1:
percent = 1
@@ -84,7 +101,7 @@ class PreProcessor(object):
percent = 0
min_lns = tot_ln_fds * percent
- self.log("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
+ #self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
if min_lns > tot_htm_ends:
return True
@@ -112,16 +129,55 @@ class PreProcessor(object):
wordcount = get_wordcount_obj(word_count_text)
return wordcount.words
+ def markup_italicis(self, html):
+ ITALICIZE_WORDS = [
+ 'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+ 'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
+ 'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+ 'Mlle.', 'Mons.', 'PS.', 'PPS.',
+ ]
+
+ ITALICIZE_STYLE_PATS = [
+ r'(?msu)(?<=\s)_(?P', re.IGNORECASE)
+ if len(pre.findall(html)) >= 1:
+ self.log.debug("Running Text Processing")
+ outerhtml = re.compile(r'.*?(?<=
)(?P
', re.IGNORECASE|re.DOTALL)
+ html = outerhtml.sub(self.txt_process, html)
+ else:
+ # Add markup naively
+ # TODO - find out if there are cases where there are more than one tag or
+ # other types of unmarked html and handle them in some better fashion
+ add_markup = re.compile('(?)(\n)')
+ html = add_markup.sub('