mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-11 09:13:57 -04:00
Search and replace wizard: Fix generated html being slightly different from the actual html in the conversion pipeline for some input formats (mainly HTML, CHM, LIT).
This commit is contained in:
parent
4cf6ea0f12
commit
32de3c16ea
@ -77,7 +77,7 @@ class Plumber(object):
|
|||||||
|
|
||||||
def __init__(self, input, output, log, report_progress=DummyReporter(),
|
def __init__(self, input, output, log, report_progress=DummyReporter(),
|
||||||
dummy=False, merge_plugin_recs=True, abort_after_input_dump=False,
|
dummy=False, merge_plugin_recs=True, abort_after_input_dump=False,
|
||||||
override_input_metadata=False):
|
override_input_metadata=False, for_regex_wizard=False):
|
||||||
'''
|
'''
|
||||||
:param input: Path to input file.
|
:param input: Path to input file.
|
||||||
:param output: Path to output file/directory
|
:param output: Path to output file/directory
|
||||||
@ -87,6 +87,7 @@ class Plumber(object):
|
|||||||
if isbytestring(output):
|
if isbytestring(output):
|
||||||
output = output.decode(filesystem_encoding)
|
output = output.decode(filesystem_encoding)
|
||||||
self.original_input_arg = input
|
self.original_input_arg = input
|
||||||
|
self.for_regex_wizard = for_regex_wizard
|
||||||
self.input = os.path.abspath(input)
|
self.input = os.path.abspath(input)
|
||||||
self.output = os.path.abspath(output)
|
self.output = os.path.abspath(output)
|
||||||
self.log = log
|
self.log = log
|
||||||
@ -123,7 +124,7 @@ OptionRecommendation(name='input_profile',
|
|||||||
'conversion system information on how to interpret '
|
'conversion system information on how to interpret '
|
||||||
'various information in the input document. For '
|
'various information in the input document. For '
|
||||||
'example resolution dependent lengths (i.e. lengths in '
|
'example resolution dependent lengths (i.e. lengths in '
|
||||||
'pixels). Choices are:')+\
|
'pixels). Choices are:')+
|
||||||
', '.join([x.short_name for x in input_profiles()])
|
', '.join([x.short_name for x in input_profiles()])
|
||||||
),
|
),
|
||||||
|
|
||||||
@ -135,7 +136,7 @@ OptionRecommendation(name='output_profile',
|
|||||||
'created document for the specified device. In some cases, '
|
'created document for the specified device. In some cases, '
|
||||||
'an output profile is required to produce documents that '
|
'an output profile is required to produce documents that '
|
||||||
'will work on a device. For example EPUB on the SONY reader. '
|
'will work on a device. For example EPUB on the SONY reader. '
|
||||||
'Choices are:') + \
|
'Choices are:') +
|
||||||
', '.join([x.short_name for x in output_profiles()])
|
', '.join([x.short_name for x in output_profiles()])
|
||||||
),
|
),
|
||||||
|
|
||||||
@ -490,7 +491,7 @@ OptionRecommendation(name='asciiize',
|
|||||||
'cases where there are multiple representations of a character '
|
'cases where there are multiple representations of a character '
|
||||||
'(characters shared by Chinese and Japanese for instance) the '
|
'(characters shared by Chinese and Japanese for instance) the '
|
||||||
'representation based on the current calibre interface language will be '
|
'representation based on the current calibre interface language will be '
|
||||||
'used.')%\
|
'used.')%
|
||||||
u'\u041c\u0438\u0445\u0430\u0438\u043b '
|
u'\u041c\u0438\u0445\u0430\u0438\u043b '
|
||||||
u'\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432'
|
u'\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432'
|
||||||
)
|
)
|
||||||
@ -711,7 +712,6 @@ OptionRecommendation(name='search_replace',
|
|||||||
self.input_fmt = input_fmt
|
self.input_fmt = input_fmt
|
||||||
self.output_fmt = output_fmt
|
self.output_fmt = output_fmt
|
||||||
|
|
||||||
|
|
||||||
self.all_format_options = set()
|
self.all_format_options = set()
|
||||||
self.input_options = set()
|
self.input_options = set()
|
||||||
self.output_options = set()
|
self.output_options = set()
|
||||||
@ -775,7 +775,7 @@ OptionRecommendation(name='search_replace',
|
|||||||
if not html_files:
|
if not html_files:
|
||||||
raise ValueError(_('Could not find an ebook inside the archive'))
|
raise ValueError(_('Could not find an ebook inside the archive'))
|
||||||
html_files = [(f, os.stat(f).st_size) for f in html_files]
|
html_files = [(f, os.stat(f).st_size) for f in html_files]
|
||||||
html_files.sort(cmp = lambda x, y: cmp(x[1], y[1]))
|
html_files.sort(cmp=lambda x, y: cmp(x[1], y[1]))
|
||||||
html_files = [f[0] for f in html_files]
|
html_files = [f[0] for f in html_files]
|
||||||
for q in ('toc', 'index'):
|
for q in ('toc', 'index'):
|
||||||
for f in html_files:
|
for f in html_files:
|
||||||
@ -783,8 +783,6 @@ OptionRecommendation(name='search_replace',
|
|||||||
return f, os.path.splitext(f)[1].lower()[1:]
|
return f, os.path.splitext(f)[1].lower()[1:]
|
||||||
return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:]
|
return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_option_by_name(self, name):
|
def get_option_by_name(self, name):
|
||||||
for group in (self.input_options, self.pipeline_options,
|
for group in (self.input_options, self.pipeline_options,
|
||||||
self.output_options, self.all_format_options):
|
self.output_options, self.all_format_options):
|
||||||
@ -956,7 +954,6 @@ OptionRecommendation(name='search_replace',
|
|||||||
|
|
||||||
self.log.info('Input debug saved to:', out_dir)
|
self.log.info('Input debug saved to:', out_dir)
|
||||||
|
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
'''
|
'''
|
||||||
Run the conversion pipeline
|
Run the conversion pipeline
|
||||||
@ -965,10 +962,12 @@ OptionRecommendation(name='search_replace',
|
|||||||
self.setup_options()
|
self.setup_options()
|
||||||
if self.opts.verbose:
|
if self.opts.verbose:
|
||||||
self.log.filter_level = self.log.DEBUG
|
self.log.filter_level = self.log.DEBUG
|
||||||
|
if self.for_regex_wizard and hasattr(self.opts, 'no_process'):
|
||||||
|
self.opts.no_process = True
|
||||||
self.flush()
|
self.flush()
|
||||||
import cssutils, logging
|
import cssutils, logging
|
||||||
cssutils.log.setLevel(logging.WARN)
|
cssutils.log.setLevel(logging.WARN)
|
||||||
get_types_map() # Ensure the mimetypes module is initialized
|
get_types_map() # Ensure the mimetypes module is initialized
|
||||||
|
|
||||||
if self.opts.debug_pipeline is not None:
|
if self.opts.debug_pipeline is not None:
|
||||||
self.opts.verbose = max(self.opts.verbose, 4)
|
self.opts.verbose = max(self.opts.verbose, 4)
|
||||||
@ -1003,6 +1002,8 @@ OptionRecommendation(name='search_replace',
|
|||||||
self.ui_reporter(0.01, _('Converting input to HTML...'))
|
self.ui_reporter(0.01, _('Converting input to HTML...'))
|
||||||
ir = CompositeProgressReporter(0.01, 0.34, self.ui_reporter)
|
ir = CompositeProgressReporter(0.01, 0.34, self.ui_reporter)
|
||||||
self.input_plugin.report_progress = ir
|
self.input_plugin.report_progress = ir
|
||||||
|
if self.for_regex_wizard:
|
||||||
|
self.input_plugin.for_viewer = True
|
||||||
with self.input_plugin:
|
with self.input_plugin:
|
||||||
self.oeb = self.input_plugin(stream, self.opts,
|
self.oeb = self.input_plugin(stream, self.opts,
|
||||||
self.input_fmt, self.log,
|
self.input_fmt, self.log,
|
||||||
@ -1014,8 +1015,12 @@ OptionRecommendation(name='search_replace',
|
|||||||
if self.input_fmt in ('recipe', 'downloaded_recipe'):
|
if self.input_fmt in ('recipe', 'downloaded_recipe'):
|
||||||
self.opts_to_mi(self.user_metadata)
|
self.opts_to_mi(self.user_metadata)
|
||||||
if not hasattr(self.oeb, 'manifest'):
|
if not hasattr(self.oeb, 'manifest'):
|
||||||
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
|
self.oeb = create_oebbook(
|
||||||
encoding=self.input_plugin.output_encoding)
|
self.log, self.oeb, self.opts,
|
||||||
|
encoding=self.input_plugin.output_encoding,
|
||||||
|
for_regex_wizard=self.for_regex_wizard)
|
||||||
|
if self.for_regex_wizard:
|
||||||
|
return
|
||||||
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
|
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
|
||||||
self.opts.is_image_collection = self.input_plugin.is_image_collection
|
self.opts.is_image_collection = self.input_plugin.is_image_collection
|
||||||
pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
|
pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
|
||||||
@ -1081,7 +1086,6 @@ OptionRecommendation(name='search_replace',
|
|||||||
self.dump_oeb(self.oeb, out_dir)
|
self.dump_oeb(self.oeb, out_dir)
|
||||||
self.log('Structured HTML written to:', out_dir)
|
self.log('Structured HTML written to:', out_dir)
|
||||||
|
|
||||||
|
|
||||||
if self.opts.extra_css and os.path.exists(self.opts.extra_css):
|
if self.opts.extra_css and os.path.exists(self.opts.extra_css):
|
||||||
self.opts.extra_css = open(self.opts.extra_css, 'rb').read()
|
self.opts.extra_css = open(self.opts.extra_css, 'rb').read()
|
||||||
|
|
||||||
@ -1161,13 +1165,20 @@ OptionRecommendation(name='search_replace',
|
|||||||
self.log(self.output_fmt.upper(), 'output written to', self.output)
|
self.log(self.output_fmt.upper(), 'output written to', self.output)
|
||||||
self.flush()
|
self.flush()
|
||||||
|
|
||||||
|
# This has to be global as create_oebbook can be called from other locations
|
||||||
|
# (for example in the html input plugin)
|
||||||
|
regex_wizard_callback = None
|
||||||
|
def set_regex_wizard_callback(f):
|
||||||
|
global regex_wizard_callback
|
||||||
|
regex_wizard_callback = f
|
||||||
|
|
||||||
def create_oebbook(log, path_or_stream, opts, reader=None,
|
def create_oebbook(log, path_or_stream, opts, reader=None,
|
||||||
encoding='utf-8', populate=True):
|
encoding='utf-8', populate=True, for_regex_wizard=False):
|
||||||
'''
|
'''
|
||||||
Create an OEBBook.
|
Create an OEBBook.
|
||||||
'''
|
'''
|
||||||
from calibre.ebooks.oeb.base import OEBBook
|
from calibre.ebooks.oeb.base import OEBBook
|
||||||
html_preprocessor = HTMLPreProcessor(log, opts)
|
html_preprocessor = HTMLPreProcessor(log, opts, regex_wizard_callback=regex_wizard_callback)
|
||||||
if not encoding:
|
if not encoding:
|
||||||
encoding = None
|
encoding = None
|
||||||
oeb = OEBBook(log, html_preprocessor,
|
oeb = OEBBook(log, html_preprocessor,
|
||||||
@ -1182,3 +1193,4 @@ def create_oebbook(log, path_or_stream, opts, reader=None,
|
|||||||
|
|
||||||
reader()(oeb, path_or_stream)
|
reader()(oeb, path_or_stream)
|
||||||
return oeb
|
return oeb
|
||||||
|
|
||||||
|
@ -497,9 +497,11 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||||
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
||||||
]
|
]
|
||||||
def __init__(self, log=None, extra_opts=None):
|
def __init__(self, log=None, extra_opts=None, regex_wizard_callback=None):
|
||||||
self.log = log
|
self.log = log
|
||||||
self.extra_opts = extra_opts
|
self.extra_opts = extra_opts
|
||||||
|
self.regex_wizard_callback = regex_wizard_callback
|
||||||
|
self.current_href = None
|
||||||
|
|
||||||
def is_baen(self, src):
|
def is_baen(self, src):
|
||||||
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
|
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
|
||||||
@ -586,6 +588,9 @@ class HTMLPreProcessor(object):
|
|||||||
for rule in self.PREPROCESS + start_rules:
|
for rule in self.PREPROCESS + start_rules:
|
||||||
html = rule[0].sub(rule[1], html)
|
html = rule[0].sub(rule[1], html)
|
||||||
|
|
||||||
|
if self.regex_wizard_callback is not None:
|
||||||
|
self.regex_wizard_callback(self.current_href, html)
|
||||||
|
|
||||||
if get_preprocess_html:
|
if get_preprocess_html:
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
@ -871,6 +871,7 @@ class Manifest(object):
|
|||||||
orig_data = data
|
orig_data = data
|
||||||
fname = urlunquote(self.href)
|
fname = urlunquote(self.href)
|
||||||
self.oeb.log.debug('Parsing', fname, '...')
|
self.oeb.log.debug('Parsing', fname, '...')
|
||||||
|
self.oeb.html_preprocessor.current_href = self.href
|
||||||
try:
|
try:
|
||||||
data = parse_html(data, log=self.oeb.log,
|
data = parse_html(data, log=self.oeb.log,
|
||||||
decoder=self.oeb.decode,
|
decoder=self.oeb.decode,
|
||||||
@ -1312,9 +1313,9 @@ class Guide(object):
|
|||||||
('notes', __('Notes')),
|
('notes', __('Notes')),
|
||||||
('preface', __('Preface')),
|
('preface', __('Preface')),
|
||||||
('text', __('Main Text'))]
|
('text', __('Main Text'))]
|
||||||
TYPES = set(t for t, _ in _TYPES_TITLES)
|
TYPES = set(t for t, _ in _TYPES_TITLES) # noqa
|
||||||
TITLES = dict(_TYPES_TITLES)
|
TITLES = dict(_TYPES_TITLES)
|
||||||
ORDER = dict((t, i) for i, (t, _) in enumerate(_TYPES_TITLES))
|
ORDER = dict((t, i) for i, (t, _) in enumerate(_TYPES_TITLES)) # noqa
|
||||||
|
|
||||||
def __init__(self, oeb, type, title, href):
|
def __init__(self, oeb, type, title, href):
|
||||||
self.oeb = oeb
|
self.oeb = oeb
|
||||||
|
@ -7,7 +7,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os, re
|
import sys, os, re
|
||||||
|
|
||||||
from calibre.customize.ui import available_input_formats
|
from calibre.customize.ui import available_input_formats
|
||||||
|
|
||||||
@ -26,17 +26,18 @@ def EbookIterator(*args, **kwargs):
|
|||||||
from calibre.ebooks.oeb.iterator.book import EbookIterator
|
from calibre.ebooks.oeb.iterator.book import EbookIterator
|
||||||
return EbookIterator(*args, **kwargs)
|
return EbookIterator(*args, **kwargs)
|
||||||
|
|
||||||
def get_preprocess_html(path_to_ebook, output):
|
def get_preprocess_html(path_to_ebook, output=None):
|
||||||
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
|
from calibre.ebooks.conversion.plumber import set_regex_wizard_callback, Plumber
|
||||||
iterator = EbookIterator(path_to_ebook)
|
from calibre.utils.logging import DevNull
|
||||||
iterator.__enter__(only_input_plugin=True, run_char_count=False,
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
read_anchor_map=False)
|
raw = {}
|
||||||
preprocessor = HTMLPreProcessor(None, False)
|
set_regex_wizard_callback(raw.__setitem__)
|
||||||
with open(output, 'wb') as out:
|
with TemporaryDirectory('_regex_wiz') as tdir:
|
||||||
for path in iterator.spine:
|
pl = Plumber(path_to_ebook, os.path.join(tdir, 'a.epub'), DevNull(), for_regex_wizard=True)
|
||||||
with open(path, 'rb') as f:
|
pl.run()
|
||||||
html = f.read().decode('utf-8', 'replace')
|
items = [raw[item.href] for item in pl.oeb.spine if item.href in raw]
|
||||||
html = preprocessor(html, get_preprocess_html=True)
|
|
||||||
|
with (sys.stdout if output is None else open(output, 'wb')) as out:
|
||||||
|
for html in items:
|
||||||
out.write(html.encode('utf-8'))
|
out.write(html.encode('utf-8'))
|
||||||
out.write(b'\n\n' + b'-'*80 + b'\n\n')
|
out.write(b'\n\n' + b'-'*80 + b'\n\n')
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user