mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Improve header and footer regular expression matching
This commit is contained in:
parent
138a7c53ad
commit
10c9f4032a
@ -228,17 +228,16 @@ class HTMLPreProcessor(object):
|
||||
else:
|
||||
rules = []
|
||||
|
||||
pre_rules = []
|
||||
end_rules = []
|
||||
if getattr(self.extra_opts, 'remove_header', None):
|
||||
pre_rules.append(
|
||||
end_rules.append(
|
||||
(re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
|
||||
)
|
||||
if getattr(self.extra_opts, 'remove_footer', None):
|
||||
pre_rules.append(
|
||||
end_rules.append(
|
||||
(re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
|
||||
)
|
||||
|
||||
end_rules = []
|
||||
|
||||
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
||||
length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
|
||||
if length:
|
||||
@ -247,7 +246,7 @@ class HTMLPreProcessor(object):
|
||||
(re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
|
||||
)
|
||||
|
||||
for rule in self.PREPROCESS + pre_rules + rules + end_rules:
|
||||
for rule in self.PREPROCESS + rules + end_rules:
|
||||
html = rule[0].sub(rule[1], html)
|
||||
|
||||
# Handle broken XHTML w/ SVG (ugh)
|
||||
|
@ -123,7 +123,7 @@ class EbookIterator(object):
|
||||
else:
|
||||
print 'Loaded embedded font:', repr(family)
|
||||
|
||||
def __enter__(self, raw_only=False):
|
||||
def __enter__(self, processed=False):
|
||||
self.delete_on_exit = []
|
||||
self._tdir = TemporaryDirectory('_ebook_iter')
|
||||
self.base = self._tdir.__enter__()
|
||||
@ -140,7 +140,7 @@ class EbookIterator(object):
|
||||
plumber.opts, plumber.input_fmt, self.log,
|
||||
{}, self.base)
|
||||
|
||||
if not raw_only and plumber.input_fmt.lower() in ('pdf', 'rb'):
|
||||
if processed or plumber.input_fmt.lower() in ('pdf', 'rb'):
|
||||
self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts,
|
||||
plumber.input_plugin)
|
||||
if hasattr(self.pathtoopf, 'manifest'):
|
||||
|
@ -87,12 +87,12 @@ class RegexBuilder(QDialog, Ui_RegexBuilder):
|
||||
|
||||
def open_book(self, pathtoebook):
|
||||
self.iterator = EbookIterator(pathtoebook)
|
||||
self.iterator.__enter__(raw_only=True)
|
||||
self.iterator.__enter__(processed=True)
|
||||
text = [u'']
|
||||
for path in self.iterator.spine:
|
||||
html = open(path, 'rb').read().decode(path.encoding, 'replace')
|
||||
html = open(path, 'rb').read().decode('utf-8', 'replace')
|
||||
text.append(html)
|
||||
self.preview.setPlainText('\n\n'.join(text))
|
||||
self.preview.setPlainText('\n---\n'.join(text))
|
||||
|
||||
def button_clicked(self, button):
|
||||
if button == self.button_box.button(QDialogButtonBox.Open):
|
||||
|
Loading…
x
Reference in New Issue
Block a user