mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
merged from user_none, enables heuristics for the entire conversion pipeline
This commit is contained in:
commit
a8ebb35312
@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
|
||||
'''
|
||||
raise NotImplementedError()
|
||||
|
||||
def heuristics(self, opts, html):
|
||||
'''
|
||||
This method is called by the conversion pipeline on all HTML before it
|
||||
is parsed. It is meant to be used to do any required preprocessing on
|
||||
the HTML, like removing hard line breaks, etc.
|
||||
|
||||
:param html: A unicode string
|
||||
:return: A unicode string
|
||||
'''
|
||||
return html
|
||||
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
'''
|
||||
This method must be implemented in sub-classes. It must return
|
||||
|
@ -899,7 +899,6 @@ OptionRecommendation(name='sr3_replace',
|
||||
self.opts_to_mi(self.user_metadata)
|
||||
if not hasattr(self.oeb, 'manifest'):
|
||||
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
|
||||
self.input_plugin,
|
||||
encoding=self.input_plugin.output_encoding)
|
||||
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
|
||||
self.opts.is_image_collection = self.input_plugin.is_image_collection
|
||||
@ -1009,14 +1008,13 @@ OptionRecommendation(name='sr3_replace',
|
||||
self.log(self.output_fmt.upper(), 'output written to', self.output)
|
||||
self.flush()
|
||||
|
||||
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
|
||||
def create_oebbook(log, path_or_stream, opts, reader=None,
|
||||
encoding='utf-8', populate=True):
|
||||
'''
|
||||
Create an OEBBook.
|
||||
'''
|
||||
from calibre.ebooks.oeb.base import OEBBook
|
||||
html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
|
||||
opts.enable_heuristics, opts)
|
||||
html_preprocessor = HTMLPreProcessor(log, opts)
|
||||
if not encoding:
|
||||
encoding = None
|
||||
oeb = OEBBook(log, html_preprocessor,
|
||||
|
@ -397,10 +397,8 @@ class HTMLPreProcessor(object):
|
||||
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
||||
]
|
||||
def __init__(self, input_plugin_preprocess, plugin_preprocess,
|
||||
extra_opts=None):
|
||||
self.input_plugin_preprocess = input_plugin_preprocess
|
||||
self.plugin_preprocess = plugin_preprocess
|
||||
def __init__(self, log=None, extra_opts=None):
|
||||
self.log = log
|
||||
self.extra_opts = extra_opts
|
||||
|
||||
def is_baen(self, src):
|
||||
@ -542,8 +540,10 @@ class HTMLPreProcessor(object):
|
||||
unidecoder = Unidecoder()
|
||||
html = unidecoder.decode(html)
|
||||
|
||||
if self.plugin_preprocess:
|
||||
html = self.input_plugin_preprocess(self.extra_opts, html)
|
||||
if getattr(self.extra_opts, 'enable_heuristics', False):
|
||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
|
||||
html = preprocessor(html)
|
||||
|
||||
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
||||
html = self.smarten_punctuation(html)
|
||||
|
@ -11,7 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||
from calibre.utils.logging import default_log
|
||||
from calibre.utils.wordcount import get_wordcount_obj
|
||||
|
||||
class PreProcessor(object):
|
||||
class HeuristicProcessor(object):
|
||||
|
||||
def __init__(self, extra_opts=None, log=None):
|
||||
self.log = default_log if log is None else log
|
||||
@ -367,7 +367,7 @@ class PreProcessor(object):
|
||||
|
||||
|
||||
def __call__(self, html):
|
||||
self.log("********* Preprocessing HTML *********")
|
||||
self.log("********* Heuristic processing HTML *********")
|
||||
|
||||
# Count the words in the document to estimate how many chapters to look for and whether
|
||||
# other types of processing are attempted
|
||||
|
@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, iswindows
|
||||
from calibre import unicode_path
|
||||
from calibre.utils.localization import get_lang
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
|
||||
class Link(object):
|
||||
'''
|
||||
@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
|
||||
self.log.exception('Failed to read CSS file: %r'%link)
|
||||
return (None, None)
|
||||
return (None, raw)
|
||||
|
||||
def heuristics(self, options, html):
|
||||
self.options = options
|
||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
||||
return preprocessor(html)
|
||||
|
||||
|
@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||
|
||||
|
||||
class LITInput(InputFormatPlugin):
|
||||
@ -51,10 +51,3 @@ class LITInput(InputFormatPlugin):
|
||||
for elem in body:
|
||||
ne = copy.deepcopy(elem)
|
||||
pre.append(ne)
|
||||
|
||||
|
||||
def heuristics(self, options, html):
|
||||
self.options = options
|
||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
||||
return preprocessor(html)
|
||||
|
||||
|
@ -12,7 +12,6 @@ from copy import deepcopy
|
||||
from lxml import etree
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
from calibre import guess_type
|
||||
|
||||
class Canvas(etree.XSLTExtension):
|
||||
@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
|
||||
f.write(result)
|
||||
styles.write()
|
||||
return os.path.abspath('content.opf')
|
||||
|
||||
def heuristics(self, options, html):
|
||||
self.options = options
|
||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
||||
return preprocessor(html)
|
||||
|
||||
|
||||
|
||||
|
@ -9,7 +9,6 @@ import os
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
|
||||
class PDBInput(InputFormatPlugin):
|
||||
|
||||
|
@ -7,7 +7,6 @@ import os, glob, re, textwrap
|
||||
from lxml import etree
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
|
||||
border_style_map = {
|
||||
'single' : 'solid',
|
||||
|
@ -21,7 +21,7 @@ class HeuristicsWidget(Widget, Ui_Form):
|
||||
'italicize_common_cases', 'fix_indents',
|
||||
'html_unwrap_factor', 'unwrap_lines',
|
||||
'delete_blank_paragraphs', 'format_scene_breaks',
|
||||
'dehyphenate']
|
||||
'dehyphenate', 'renumber_headings']
|
||||
)
|
||||
self.db, self.book_id = db, book_id
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
@ -53,6 +53,7 @@ class HeuristicsWidget(Widget, Ui_Form):
|
||||
self.opt_delete_blank_paragraphs.setEnabled(state)
|
||||
self.opt_format_scene_breaks.setEnabled(state)
|
||||
self.opt_dehyphenate.setEnabled(state)
|
||||
self.opt_renumber_headings(state)
|
||||
|
||||
self.opt_unwrap_lines.setEnabled(state)
|
||||
if state and self.opt_unwrap_lines.checkState() == Qt.Checked:
|
||||
|
@ -6,7 +6,7 @@
|
||||
<rect>
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>657</width>
|
||||
<width>811</width>
|
||||
<height>479</height>
|
||||
</rect>
|
||||
</property>
|
||||
@ -80,42 +80,42 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="3" column="0" colspan="2">
|
||||
<item row="4" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_delete_blank_paragraphs">
|
||||
<property name="text">
|
||||
<string>Delete blank lines between paragraphs</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="4" column="0" colspan="2">
|
||||
<item row="5" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_format_scene_breaks">
|
||||
<property name="text">
|
||||
<string>Ensure scene breaks are consistently formatted</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="5" column="0" colspan="2">
|
||||
<item row="6" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_dehyphenate">
|
||||
<property name="text">
|
||||
<string>Remove unnecessary hyphens</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="6" column="0" colspan="2">
|
||||
<item row="7" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_italicize_common_cases">
|
||||
<property name="text">
|
||||
<string>Italicize common words and patterns</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="7" column="0" colspan="2">
|
||||
<item row="8" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_fix_indents">
|
||||
<property name="text">
|
||||
<string>Replace entity indents with CSS indents</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="8" column="0">
|
||||
<item row="9" column="0">
|
||||
<spacer name="verticalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Vertical</enum>
|
||||
@ -141,6 +141,13 @@
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
<item row="3" column="0">
|
||||
<widget class="QCheckBox" name="opt_renumber_headings">
|
||||
<property name="text">
|
||||
<string>Renumber sequences of <h1> or <h2> tags to prevent splitting</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
</item>
|
||||
|
Loading…
x
Reference in New Issue
Block a user