diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 39ca28e87f..b63c7ca861 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -228,17 +228,16 @@ class HTMLPreProcessor(object): else: rules = [] - pre_rules = [] + end_rules = [] if getattr(self.extra_opts, 'remove_header', None): - pre_rules.append( + end_rules.append( (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '') ) if getattr(self.extra_opts, 'remove_footer', None): - pre_rules.append( + end_rules.append( (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '') ) - - end_rules = [] + if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) if length: @@ -247,7 +246,7 @@ class HTMLPreProcessor(object): (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), ) - for rule in self.PREPROCESS + pre_rules + rules + end_rules: + for rule in self.PREPROCESS + rules + end_rules: html = rule[0].sub(rule[1], html) # Handle broken XHTML w/ SVG (ugh) diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index 762b14c3e5..565ceed519 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -123,7 +123,7 @@ class EbookIterator(object): else: print 'Loaded embedded font:', repr(family) - def __enter__(self, raw_only=False): + def __enter__(self, processed=False): self.delete_on_exit = [] self._tdir = TemporaryDirectory('_ebook_iter') self.base = self._tdir.__enter__() @@ -140,7 +140,7 @@ class EbookIterator(object): plumber.opts, plumber.input_fmt, self.log, {}, self.base) - if not raw_only and plumber.input_fmt.lower() in ('pdf', 'rb'): + if processed or plumber.input_fmt.lower() in ('pdf', 'rb'): self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts, plumber.input_plugin) if hasattr(self.pathtoopf, 'manifest'): diff --git a/src/calibre/gui2/convert/regex_builder.py b/src/calibre/gui2/convert/regex_builder.py index 20da8d7aaf..b1d8fbcbd5 100644 --- a/src/calibre/gui2/convert/regex_builder.py +++ b/src/calibre/gui2/convert/regex_builder.py @@ -87,12 +87,12 @@ class RegexBuilder(QDialog, Ui_RegexBuilder): def open_book(self, pathtoebook): self.iterator = EbookIterator(pathtoebook) - self.iterator.__enter__(raw_only=True) + self.iterator.__enter__(processed=True) text = [u''] for path in self.iterator.spine: - html = open(path, 'rb').read().decode(path.encoding, 'replace') + html = open(path, 'rb').read().decode('utf-8', 'replace') text.append(html) - self.preview.setPlainText('\n\n'.join(text)) + self.preview.setPlainText('\n---\n'.join(text)) def button_clicked(self, button): if button == self.button_box.button(QDialogButtonBox.Open):