Get header and footer regex matching working better.

This commit is contained in:
John Schember 2009-10-26 19:55:10 -04:00
parent abe52807cb
commit 68e3acd43a
3 changed files with 10 additions and 11 deletions

View File

@@ -228,17 +228,16 @@ class HTMLPreProcessor(object):
else: else:
rules = [] rules = []
pre_rules = [] end_rules = []
if getattr(self.extra_opts, 'remove_header', None): if getattr(self.extra_opts, 'remove_header', None):
pre_rules.append( end_rules.append(
(re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '') (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
) )
if getattr(self.extra_opts, 'remove_footer', None): if getattr(self.extra_opts, 'remove_footer', None):
pre_rules.append( end_rules.append(
(re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '') (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
) )
end_rules = []
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
if length: if length:
@@ -247,7 +246,7 @@ class HTMLPreProcessor(object):
(re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
) )
for rule in self.PREPROCESS + pre_rules + rules + end_rules: for rule in self.PREPROCESS + rules + end_rules:
html = rule[0].sub(rule[1], html) html = rule[0].sub(rule[1], html)
# Handle broken XHTML w/ SVG (ugh) # Handle broken XHTML w/ SVG (ugh)

View File

@@ -123,7 +123,7 @@ class EbookIterator(object):
else: else:
print 'Loaded embedded font:', repr(family) print 'Loaded embedded font:', repr(family)
def __enter__(self, raw_only=False): def __enter__(self, processed=False):
self.delete_on_exit = [] self.delete_on_exit = []
self._tdir = TemporaryDirectory('_ebook_iter') self._tdir = TemporaryDirectory('_ebook_iter')
self.base = self._tdir.__enter__() self.base = self._tdir.__enter__()
@@ -140,7 +140,7 @@ class EbookIterator(object):
plumber.opts, plumber.input_fmt, self.log, plumber.opts, plumber.input_fmt, self.log,
{}, self.base) {}, self.base)
if not raw_only and plumber.input_fmt.lower() in ('pdf', 'rb'): if processed or plumber.input_fmt.lower() in ('pdf', 'rb'):
self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts, self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts,
plumber.input_plugin) plumber.input_plugin)
if hasattr(self.pathtoopf, 'manifest'): if hasattr(self.pathtoopf, 'manifest'):

View File

@@ -87,12 +87,12 @@ class RegexBuilder(QDialog, Ui_RegexBuilder):
def open_book(self, pathtoebook): def open_book(self, pathtoebook):
self.iterator = EbookIterator(pathtoebook) self.iterator = EbookIterator(pathtoebook)
self.iterator.__enter__(raw_only=True) self.iterator.__enter__(processed=True)
text = [u''] text = [u'']
for path in self.iterator.spine: for path in self.iterator.spine:
html = open(path, 'rb').read().decode(path.encoding, 'replace') html = open(path, 'rb').read().decode('utf-8', 'replace')
text.append(html) text.append(html)
self.preview.setPlainText('\n\n'.join(text)) self.preview.setPlainText('\n---\n'.join(text))
def button_clicked(self, button): def button_clicked(self, button):
if button == self.button_box.button(QDialogButtonBox.Open): if button == self.button_box.button(QDialogButtonBox.Open):