merged from user_none, enables heuristics for the entire conversion pipeline

This commit is contained in:
ldolse 2011-01-16 02:01:39 +08:00
commit a8ebb35312
11 changed files with 27 additions and 58 deletions

View File

@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
''' '''
raise NotImplementedError() raise NotImplementedError()
def heuristics(self, opts, html):
'''
This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on
the HTML, like removing hard line breaks, etc.
:param html: A unicode string
:return: A unicode string
'''
return html
def convert(self, stream, options, file_ext, log, accelerators): def convert(self, stream, options, file_ext, log, accelerators):
''' '''
This method must be implemented in sub-classes. It must return This method must be implemented in sub-classes. It must return

View File

@ -899,7 +899,6 @@ OptionRecommendation(name='sr3_replace',
self.opts_to_mi(self.user_metadata) self.opts_to_mi(self.user_metadata)
if not hasattr(self.oeb, 'manifest'): if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts, self.oeb = create_oebbook(self.log, self.oeb, self.opts,
self.input_plugin,
encoding=self.input_plugin.output_encoding) encoding=self.input_plugin.output_encoding)
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log) self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
self.opts.is_image_collection = self.input_plugin.is_image_collection self.opts.is_image_collection = self.input_plugin.is_image_collection
@ -1009,14 +1008,13 @@ OptionRecommendation(name='sr3_replace',
self.log(self.output_fmt.upper(), 'output written to', self.output) self.log(self.output_fmt.upper(), 'output written to', self.output)
self.flush() self.flush()
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None, def create_oebbook(log, path_or_stream, opts, reader=None,
encoding='utf-8', populate=True): encoding='utf-8', populate=True):
''' '''
Create an OEBBook. Create an OEBBook.
''' '''
from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor(input_plugin.heuristics, html_preprocessor = HTMLPreProcessor(log, opts)
opts.enable_heuristics, opts)
if not encoding: if not encoding:
encoding = None encoding = None
oeb = OEBBook(log, html_preprocessor, oeb = OEBBook(log, html_preprocessor,

View File

@ -397,10 +397,8 @@ class HTMLPreProcessor(object):
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)), lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
] ]
def __init__(self, input_plugin_preprocess, plugin_preprocess, def __init__(self, log=None, extra_opts=None):
extra_opts=None): self.log = log
self.input_plugin_preprocess = input_plugin_preprocess
self.plugin_preprocess = plugin_preprocess
self.extra_opts = extra_opts self.extra_opts = extra_opts
def is_baen(self, src): def is_baen(self, src):
@ -542,8 +540,10 @@ class HTMLPreProcessor(object):
unidecoder = Unidecoder() unidecoder = Unidecoder()
html = unidecoder.decode(html) html = unidecoder.decode(html)
if self.plugin_preprocess: if getattr(self.extra_opts, 'enable_heuristics', False):
html = self.input_plugin_preprocess(self.extra_opts, html) from calibre.ebooks.conversion.utils import HeuristicProcessor
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
html = preprocessor(html)
if getattr(self.extra_opts, 'smarten_punctuation', False): if getattr(self.extra_opts, 'smarten_punctuation', False):
html = self.smarten_punctuation(html) html = self.smarten_punctuation(html)

View File

@ -11,7 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj from calibre.utils.wordcount import get_wordcount_obj
class PreProcessor(object): class HeuristicProcessor(object):
def __init__(self, extra_opts=None, log=None): def __init__(self, extra_opts=None, log=None):
self.log = default_log if log is None else log self.log = default_log if log is None else log
@ -367,7 +367,7 @@ class PreProcessor(object):
def __call__(self, html): def __call__(self, html):
self.log("********* Preprocessing HTML *********") self.log("********* Heuristic processing HTML *********")
# Count the words in the document to estimate how many chapters to look for and whether # Count the words in the document to estimate how many chapters to look for and whether
# other types of processing are attempted # other types of processing are attempted

View File

@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path from calibre import unicode_path
from calibre.utils.localization import get_lang from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.ebooks.conversion.utils import PreProcessor
class Link(object): class Link(object):
''' '''
@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
self.log.exception('Failed to read CSS file: %r'%link) self.log.exception('Failed to read CSS file: %r'%link)
return (None, None) return (None, None)
return (None, raw) return (None, raw)
def heuristics(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor from calibre.ebooks.conversion.utils import HeuristicProcessor
class LITInput(InputFormatPlugin): class LITInput(InputFormatPlugin):
@ -51,10 +51,3 @@ class LITInput(InputFormatPlugin):
for elem in body: for elem in body:
ne = copy.deepcopy(elem) ne = copy.deepcopy(elem)
pre.append(ne) pre.append(ne)
def heuristics(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -12,7 +12,6 @@ from copy import deepcopy
from lxml import etree from lxml import etree
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
from calibre import guess_type from calibre import guess_type
class Canvas(etree.XSLTExtension): class Canvas(etree.XSLTExtension):
@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
f.write(result) f.write(result)
styles.write() styles.write()
return os.path.abspath('content.opf') return os.path.abspath('content.opf')
def heuristics(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -9,7 +9,6 @@ import os
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
from calibre.ebooks.conversion.utils import PreProcessor
class PDBInput(InputFormatPlugin): class PDBInput(InputFormatPlugin):

View File

@ -7,7 +7,6 @@ import os, glob, re, textwrap
from lxml import etree from lxml import etree
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
border_style_map = { border_style_map = {
'single' : 'solid', 'single' : 'solid',

View File

@ -21,7 +21,7 @@ class HeuristicsWidget(Widget, Ui_Form):
'italicize_common_cases', 'fix_indents', 'italicize_common_cases', 'fix_indents',
'html_unwrap_factor', 'unwrap_lines', 'html_unwrap_factor', 'unwrap_lines',
'delete_blank_paragraphs', 'format_scene_breaks', 'delete_blank_paragraphs', 'format_scene_breaks',
'dehyphenate'] 'dehyphenate', 'renumber_headings']
) )
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)
@ -53,6 +53,7 @@ class HeuristicsWidget(Widget, Ui_Form):
self.opt_delete_blank_paragraphs.setEnabled(state) self.opt_delete_blank_paragraphs.setEnabled(state)
self.opt_format_scene_breaks.setEnabled(state) self.opt_format_scene_breaks.setEnabled(state)
self.opt_dehyphenate.setEnabled(state) self.opt_dehyphenate.setEnabled(state)
self.opt_renumber_headings.setEnabled(state)
self.opt_unwrap_lines.setEnabled(state) self.opt_unwrap_lines.setEnabled(state)
if state and self.opt_unwrap_lines.checkState() == Qt.Checked: if state and self.opt_unwrap_lines.checkState() == Qt.Checked:

View File

@ -6,7 +6,7 @@
<rect> <rect>
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>657</width> <width>811</width>
<height>479</height> <height>479</height>
</rect> </rect>
</property> </property>
@ -80,42 +80,42 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="3" column="0" colspan="2"> <item row="4" column="0" colspan="2">
<widget class="QCheckBox" name="opt_delete_blank_paragraphs"> <widget class="QCheckBox" name="opt_delete_blank_paragraphs">
<property name="text"> <property name="text">
<string>Delete blank lines between paragraphs</string> <string>Delete blank lines between paragraphs</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="4" column="0" colspan="2"> <item row="5" column="0" colspan="2">
<widget class="QCheckBox" name="opt_format_scene_breaks"> <widget class="QCheckBox" name="opt_format_scene_breaks">
<property name="text"> <property name="text">
<string>Ensure scene breaks are consistently formatted</string> <string>Ensure scene breaks are consistently formatted</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="5" column="0" colspan="2"> <item row="6" column="0" colspan="2">
<widget class="QCheckBox" name="opt_dehyphenate"> <widget class="QCheckBox" name="opt_dehyphenate">
<property name="text"> <property name="text">
<string>Remove unnecessary hyphens</string> <string>Remove unnecessary hyphens</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="6" column="0" colspan="2"> <item row="7" column="0" colspan="2">
<widget class="QCheckBox" name="opt_italicize_common_cases"> <widget class="QCheckBox" name="opt_italicize_common_cases">
<property name="text"> <property name="text">
<string>Italicize common words and patterns</string> <string>Italicize common words and patterns</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="7" column="0" colspan="2"> <item row="8" column="0" colspan="2">
<widget class="QCheckBox" name="opt_fix_indents"> <widget class="QCheckBox" name="opt_fix_indents">
<property name="text"> <property name="text">
<string>Replace entity indents with CSS indents</string> <string>Replace entity indents with CSS indents</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="8" column="0"> <item row="9" column="0">
<spacer name="verticalSpacer"> <spacer name="verticalSpacer">
<property name="orientation"> <property name="orientation">
<enum>Qt::Vertical</enum> <enum>Qt::Vertical</enum>
@ -141,6 +141,13 @@
</property> </property>
</spacer> </spacer>
</item> </item>
<item row="3" column="0">
<widget class="QCheckBox" name="opt_renumber_headings">
<property name="text">
<string>Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting</string>
</property>
</widget>
</item>
</layout> </layout>
</widget> </widget>
</item> </item>