merged from user_none, enables heuristics for the entire conversion pipeline

This commit is contained in:
ldolse 2011-01-16 02:01:39 +08:00
commit a8ebb35312
11 changed files with 27 additions and 58 deletions

View File

@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
'''
raise NotImplementedError()
def heuristics(self, opts, html):
'''
This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on
the HTML, like removing hard line breaks, etc.
:param html: A unicode string
:return: A unicode string
'''
return html
def convert(self, stream, options, file_ext, log, accelerators):
'''
This method must be implemented in sub-classes. It must return

View File

@ -899,7 +899,6 @@ OptionRecommendation(name='sr3_replace',
self.opts_to_mi(self.user_metadata)
if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
self.input_plugin,
encoding=self.input_plugin.output_encoding)
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
self.opts.is_image_collection = self.input_plugin.is_image_collection
@ -1009,14 +1008,13 @@ OptionRecommendation(name='sr3_replace',
self.log(self.output_fmt.upper(), 'output written to', self.output)
self.flush()
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
def create_oebbook(log, path_or_stream, opts, reader=None,
encoding='utf-8', populate=True):
'''
Create an OEBBook.
'''
from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
opts.enable_heuristics, opts)
html_preprocessor = HTMLPreProcessor(log, opts)
if not encoding:
encoding = None
oeb = OEBBook(log, html_preprocessor,

View File

@ -397,10 +397,8 @@ class HTMLPreProcessor(object):
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
]
def __init__(self, input_plugin_preprocess, plugin_preprocess,
extra_opts=None):
self.input_plugin_preprocess = input_plugin_preprocess
self.plugin_preprocess = plugin_preprocess
def __init__(self, log=None, extra_opts=None):
self.log = log
self.extra_opts = extra_opts
def is_baen(self, src):
@ -542,8 +540,10 @@ class HTMLPreProcessor(object):
unidecoder = Unidecoder()
html = unidecoder.decode(html)
if self.plugin_preprocess:
html = self.input_plugin_preprocess(self.extra_opts, html)
if getattr(self.extra_opts, 'enable_heuristics', False):
from calibre.ebooks.conversion.utils import HeuristicProcessor
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
html = preprocessor(html)
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = self.smarten_punctuation(html)

View File

@ -11,7 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj
class PreProcessor(object):
class HeuristicProcessor(object):
def __init__(self, extra_opts=None, log=None):
self.log = default_log if log is None else log
@ -367,7 +367,7 @@ class PreProcessor(object):
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
self.log("********* Heuristic processing HTML *********")
# Count the words in the document to estimate how many chapters to look for and whether
# other types of processing are attempted

View File

@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.conversion.utils import PreProcessor
class Link(object):
'''
@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
self.log.exception('Failed to read CSS file: %r'%link)
return (None, None)
return (None, raw)
def heuristics(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
from calibre.ebooks.conversion.utils import HeuristicProcessor
class LITInput(InputFormatPlugin):
@ -51,10 +51,3 @@ class LITInput(InputFormatPlugin):
for elem in body:
ne = copy.deepcopy(elem)
pre.append(ne)
def heuristics(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -12,7 +12,6 @@ from copy import deepcopy
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
from calibre import guess_type
class Canvas(etree.XSLTExtension):
@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
f.write(result)
styles.write()
return os.path.abspath('content.opf')
def heuristics(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -9,7 +9,6 @@ import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
from calibre.ebooks.conversion.utils import PreProcessor
class PDBInput(InputFormatPlugin):

View File

@ -7,7 +7,6 @@ import os, glob, re, textwrap
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
border_style_map = {
'single' : 'solid',

View File

@ -21,7 +21,7 @@ class HeuristicsWidget(Widget, Ui_Form):
'italicize_common_cases', 'fix_indents',
'html_unwrap_factor', 'unwrap_lines',
'delete_blank_paragraphs', 'format_scene_breaks',
'dehyphenate']
'dehyphenate', 'renumber_headings']
)
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)
@ -53,6 +53,7 @@ class HeuristicsWidget(Widget, Ui_Form):
self.opt_delete_blank_paragraphs.setEnabled(state)
self.opt_format_scene_breaks.setEnabled(state)
self.opt_dehyphenate.setEnabled(state)
self.opt_renumber_headings(state)
self.opt_unwrap_lines.setEnabled(state)
if state and self.opt_unwrap_lines.checkState() == Qt.Checked:

View File

@ -6,7 +6,7 @@
<rect>
<x>0</x>
<y>0</y>
<width>657</width>
<width>811</width>
<height>479</height>
</rect>
</property>
@ -80,42 +80,42 @@
</property>
</widget>
</item>
<item row="3" column="0" colspan="2">
<item row="4" column="0" colspan="2">
<widget class="QCheckBox" name="opt_delete_blank_paragraphs">
<property name="text">
<string>Delete blank lines between paragraphs</string>
</property>
</widget>
</item>
<item row="4" column="0" colspan="2">
<item row="5" column="0" colspan="2">
<widget class="QCheckBox" name="opt_format_scene_breaks">
<property name="text">
<string>Ensure scene breaks are consistently formatted</string>
</property>
</widget>
</item>
<item row="5" column="0" colspan="2">
<item row="6" column="0" colspan="2">
<widget class="QCheckBox" name="opt_dehyphenate">
<property name="text">
<string>Remove unnecessary hyphens</string>
</property>
</widget>
</item>
<item row="6" column="0" colspan="2">
<item row="7" column="0" colspan="2">
<widget class="QCheckBox" name="opt_italicize_common_cases">
<property name="text">
<string>Italicize common words and patterns</string>
</property>
</widget>
</item>
<item row="7" column="0" colspan="2">
<item row="8" column="0" colspan="2">
<widget class="QCheckBox" name="opt_fix_indents">
<property name="text">
<string>Replace entity indents with CSS indents</string>
</property>
</widget>
</item>
<item row="8" column="0">
<item row="9" column="0">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
@ -141,6 +141,13 @@
</property>
</spacer>
</item>
<item row="3" column="0">
<widget class="QCheckBox" name="opt_renumber_headings">
<property name="text">
<string>Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>