diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index da652c1a38..973e508746 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -5,8 +5,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import functools -import re +import functools, re from calibre import entity_to_unicode @@ -73,7 +72,7 @@ def line_length(format, raw, percent): ''' raw = raw.replace(' ', ' ') if format == 'html': - linere = re.compile('(?<=)', re.DOTALL) + linere = re.compile('(?<=)', re.DOTALL) elif format == 'pdf': linere = re.compile('(?<=
).*?(?=
)', re.DOTALL) lines = linere.findall(raw) @@ -205,9 +204,6 @@ class HTMLPreProcessor(object): # Remove gray background (re.compile(r']+>'), lambda match : ''), - # Remove non breaking spaces - (re.compile(ur'\u00a0'), lambda match : ' '), - # Detect Chapters to match default XPATH in GUI (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(()?)?)]*>\s*(?P(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), @@ -254,20 +250,27 @@ class HTMLPreProcessor(object): def is_pdftohtml(self, src): return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] - def __call__(self, html, remove_special_chars=None): + def __call__(self, html, remove_special_chars=None, + get_preprocess_html=False): if remove_special_chars is not None: html = remove_special_chars.sub('', html) html = html.replace('\0', '') + is_pdftohtml = self.is_pdftohtml(html) if self.is_baen(html): rules = [] elif self.is_book_designer(html): rules = self.BOOK_DESIGNER - elif self.is_pdftohtml(html): + elif is_pdftohtml: rules = self.PDFTOHTML else: rules = [] - if not self.extra_opts.keep_ligatures: + start_rules = [] + if is_pdftohtml: + # Remove non breaking spaces + start_rules.append((re.compile(ur'\u00a0'), lambda match : ' ')) + + if not getattr(self.extra_opts, 'keep_ligatures', False): html = _ligpat.sub(lambda m:LIGATURES[m.group()], html) end_rules = [] @@ -299,9 +302,35 @@ class HTMLPreProcessor(object): (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), ) - for rule in self.PREPROCESS + rules + end_rules: + for rule in self.PREPROCESS + start_rules: html = rule[0].sub(rule[1], html) + if get_preprocess_html: + return html + + def dump(raw, where): + import os + dp = getattr(self.extra_opts, 'debug_pipeline', None) + if dp and os.path.exists(dp): + odir = os.path.join(dp, 'input') + if os.path.exists(odir): + odir = os.path.join(odir, where) + if not os.path.exists(odir): + os.makedirs(odir) + name, i = None, 0 + while not name or os.path.exists(os.path.join(odir, name)): + i += 1 + name = '%04d.html'%i + with open(os.path.join(odir, name), 'wb') as f: + f.write(raw.encode('utf-8')) + + #dump(html, 'pre-preprocess') + + for rule in rules + end_rules: + html = rule[0].sub(rule[1], html) + + #dump(html, 'post-preprocess') + # Handle broken XHTML w/ SVG (ugh) if 'svg:' in html and SVG_NS not in html: html = html.replace( diff --git a/src/calibre/gui2/convert/regex_builder.py b/src/calibre/gui2/convert/regex_builder.py index b10772b86c..c5c8d84a88 100644 --- a/src/calibre/gui2/convert/regex_builder.py +++ b/src/calibre/gui2/convert/regex_builder.py @@ -14,7 +14,7 @@ from calibre.gui2.convert.regex_builder_ui import Ui_RegexBuilder from calibre.gui2.convert.xexp_edit_ui import Ui_Form as Ui_Edit from calibre.gui2 import error_dialog, choose_files from calibre.ebooks.oeb.iterator import EbookIterator -from calibre.ebooks.conversion.preprocess import convert_entities +from calibre.ebooks.conversion.preprocess import HTMLPreProcessor from calibre.gui2.dialogs.choose_format import ChooseFormatDialog class RegexBuilder(QDialog, Ui_RegexBuilder): @@ -91,10 +91,10 @@ class RegexBuilder(QDialog, Ui_RegexBuilder): self.iterator = EbookIterator(pathtoebook) self.iterator.__enter__(only_input_plugin=True) text = [u''] - ent_pat = re.compile(r'&(\S+?);') + preprocessor = HTMLPreProcessor(None, False) for path in self.iterator.spine: html = open(path, 'rb').read().decode('utf-8', 'replace') - html = ent_pat.sub(convert_entities, html) + html = preprocessor(html, get_preprocess_html=True) text.append(html) self.preview.setPlainText('\n---\n'.join(text))