Add/remove header wizard: When running on PDF input, replace non breaking spaces with normal spaces, since it is hard to write regexps to match non breaking spaces with the regex builder wizard.

This commit is contained in:
Kovid Goyal 2010-08-25 10:02:13 -06:00
parent 364c6101ec
commit 2635f836fb
2 changed files with 42 additions and 13 deletions

View File

@ -5,8 +5,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import functools import functools, re
import re
from calibre import entity_to_unicode from calibre import entity_to_unicode
@ -73,7 +72,7 @@ def line_length(format, raw, percent):
''' '''
raw = raw.replace('&nbsp;', ' ') raw = raw.replace('&nbsp;', ' ')
if format == 'html': if format == 'html':
linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL) linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
elif format == 'pdf': elif format == 'pdf':
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL) linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
lines = linere.findall(raw) lines = linere.findall(raw)
@ -205,9 +204,6 @@ class HTMLPreProcessor(object):
# Remove gray background # Remove gray background
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
# Remove non breaking spaces
(re.compile(ur'\u00a0'), lambda match : ' '),
# Detect Chapters to match default XPATH in GUI # Detect Chapters to match default XPATH in GUI
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
@ -254,20 +250,27 @@ class HTMLPreProcessor(object):
def is_pdftohtml(self, src): def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
def __call__(self, html, remove_special_chars=None): def __call__(self, html, remove_special_chars=None,
get_preprocess_html=False):
if remove_special_chars is not None: if remove_special_chars is not None:
html = remove_special_chars.sub('', html) html = remove_special_chars.sub('', html)
html = html.replace('\0', '') html = html.replace('\0', '')
is_pdftohtml = self.is_pdftohtml(html)
if self.is_baen(html): if self.is_baen(html):
rules = [] rules = []
elif self.is_book_designer(html): elif self.is_book_designer(html):
rules = self.BOOK_DESIGNER rules = self.BOOK_DESIGNER
elif self.is_pdftohtml(html): elif is_pdftohtml:
rules = self.PDFTOHTML rules = self.PDFTOHTML
else: else:
rules = [] rules = []
if not self.extra_opts.keep_ligatures: start_rules = []
if is_pdftohtml:
# Remove non breaking spaces
start_rules.append((re.compile(ur'\u00a0'), lambda match : ' '))
if not getattr(self.extra_opts, 'keep_ligatures', False):
html = _ligpat.sub(lambda m:LIGATURES[m.group()], html) html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
end_rules = [] end_rules = []
@ -299,9 +302,35 @@ class HTMLPreProcessor(object):
(re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
) )
for rule in self.PREPROCESS + rules + end_rules: for rule in self.PREPROCESS + start_rules:
html = rule[0].sub(rule[1], html) html = rule[0].sub(rule[1], html)
if get_preprocess_html:
return html
def dump(raw, where):
import os
dp = getattr(self.extra_opts, 'debug_pipeline', None)
if dp and os.path.exists(dp):
odir = os.path.join(dp, 'input')
if os.path.exists(odir):
odir = os.path.join(odir, where)
if not os.path.exists(odir):
os.makedirs(odir)
name, i = None, 0
while not name or os.path.exists(os.path.join(odir, name)):
i += 1
name = '%04d.html'%i
with open(os.path.join(odir, name), 'wb') as f:
f.write(raw.encode('utf-8'))
#dump(html, 'pre-preprocess')
for rule in rules + end_rules:
html = rule[0].sub(rule[1], html)
#dump(html, 'post-preprocess')
# Handle broken XHTML w/ SVG (ugh) # Handle broken XHTML w/ SVG (ugh)
if 'svg:' in html and SVG_NS not in html: if 'svg:' in html and SVG_NS not in html:
html = html.replace( html = html.replace(

View File

@ -14,7 +14,7 @@ from calibre.gui2.convert.regex_builder_ui import Ui_RegexBuilder
from calibre.gui2.convert.xexp_edit_ui import Ui_Form as Ui_Edit from calibre.gui2.convert.xexp_edit_ui import Ui_Form as Ui_Edit
from calibre.gui2 import error_dialog, choose_files from calibre.gui2 import error_dialog, choose_files
from calibre.ebooks.oeb.iterator import EbookIterator from calibre.ebooks.oeb.iterator import EbookIterator
from calibre.ebooks.conversion.preprocess import convert_entities from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
from calibre.gui2.dialogs.choose_format import ChooseFormatDialog from calibre.gui2.dialogs.choose_format import ChooseFormatDialog
class RegexBuilder(QDialog, Ui_RegexBuilder): class RegexBuilder(QDialog, Ui_RegexBuilder):
@ -91,10 +91,10 @@ class RegexBuilder(QDialog, Ui_RegexBuilder):
self.iterator = EbookIterator(pathtoebook) self.iterator = EbookIterator(pathtoebook)
self.iterator.__enter__(only_input_plugin=True) self.iterator.__enter__(only_input_plugin=True)
text = [u''] text = [u'']
ent_pat = re.compile(r'&(\S+?);') preprocessor = HTMLPreProcessor(None, False)
for path in self.iterator.spine: for path in self.iterator.spine:
html = open(path, 'rb').read().decode('utf-8', 'replace') html = open(path, 'rb').read().decode('utf-8', 'replace')
html = ent_pat.sub(convert_entities, html) html = preprocessor(html, get_preprocess_html=True)
text.append(html) text.append(html)
self.preview.setPlainText('\n---\n'.join(text)) self.preview.setPlainText('\n---\n'.join(text))