mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add/remove header wizard: When running on PDF input, replace non breaking spaces with normal spaces, since it is hard to write regexps to match non breaking spaces with the regex builder wizard.
This commit is contained in:
parent
364c6101ec
commit
2635f836fb
@ -5,8 +5,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import functools
|
import functools, re
|
||||||
import re
|
|
||||||
|
|
||||||
from calibre import entity_to_unicode
|
from calibre import entity_to_unicode
|
||||||
|
|
||||||
@ -73,7 +72,7 @@ def line_length(format, raw, percent):
|
|||||||
'''
|
'''
|
||||||
raw = raw.replace(' ', ' ')
|
raw = raw.replace(' ', ' ')
|
||||||
if format == 'html':
|
if format == 'html':
|
||||||
linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
|
linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
|
||||||
elif format == 'pdf':
|
elif format == 'pdf':
|
||||||
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
|
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
|
||||||
lines = linere.findall(raw)
|
lines = linere.findall(raw)
|
||||||
@ -205,9 +204,6 @@ class HTMLPreProcessor(object):
|
|||||||
# Remove gray background
|
# Remove gray background
|
||||||
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
||||||
|
|
||||||
# Remove non breaking spaces
|
|
||||||
(re.compile(ur'\u00a0'), lambda match : ' '),
|
|
||||||
|
|
||||||
# Detect Chapters to match default XPATH in GUI
|
# Detect Chapters to match default XPATH in GUI
|
||||||
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
|
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
|
||||||
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
|
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
|
||||||
@ -254,20 +250,27 @@ class HTMLPreProcessor(object):
|
|||||||
def is_pdftohtml(self, src):
|
def is_pdftohtml(self, src):
|
||||||
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
||||||
|
|
||||||
def __call__(self, html, remove_special_chars=None):
|
def __call__(self, html, remove_special_chars=None,
|
||||||
|
get_preprocess_html=False):
|
||||||
if remove_special_chars is not None:
|
if remove_special_chars is not None:
|
||||||
html = remove_special_chars.sub('', html)
|
html = remove_special_chars.sub('', html)
|
||||||
html = html.replace('\0', '')
|
html = html.replace('\0', '')
|
||||||
|
is_pdftohtml = self.is_pdftohtml(html)
|
||||||
if self.is_baen(html):
|
if self.is_baen(html):
|
||||||
rules = []
|
rules = []
|
||||||
elif self.is_book_designer(html):
|
elif self.is_book_designer(html):
|
||||||
rules = self.BOOK_DESIGNER
|
rules = self.BOOK_DESIGNER
|
||||||
elif self.is_pdftohtml(html):
|
elif is_pdftohtml:
|
||||||
rules = self.PDFTOHTML
|
rules = self.PDFTOHTML
|
||||||
else:
|
else:
|
||||||
rules = []
|
rules = []
|
||||||
|
|
||||||
if not self.extra_opts.keep_ligatures:
|
start_rules = []
|
||||||
|
if is_pdftohtml:
|
||||||
|
# Remove non breaking spaces
|
||||||
|
start_rules.append((re.compile(ur'\u00a0'), lambda match : ' '))
|
||||||
|
|
||||||
|
if not getattr(self.extra_opts, 'keep_ligatures', False):
|
||||||
html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
|
html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
|
||||||
|
|
||||||
end_rules = []
|
end_rules = []
|
||||||
@ -299,9 +302,35 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
|
(re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
|
||||||
)
|
)
|
||||||
|
|
||||||
for rule in self.PREPROCESS + rules + end_rules:
|
for rule in self.PREPROCESS + start_rules:
|
||||||
html = rule[0].sub(rule[1], html)
|
html = rule[0].sub(rule[1], html)
|
||||||
|
|
||||||
|
if get_preprocess_html:
|
||||||
|
return html
|
||||||
|
|
||||||
|
def dump(raw, where):
|
||||||
|
import os
|
||||||
|
dp = getattr(self.extra_opts, 'debug_pipeline', None)
|
||||||
|
if dp and os.path.exists(dp):
|
||||||
|
odir = os.path.join(dp, 'input')
|
||||||
|
if os.path.exists(odir):
|
||||||
|
odir = os.path.join(odir, where)
|
||||||
|
if not os.path.exists(odir):
|
||||||
|
os.makedirs(odir)
|
||||||
|
name, i = None, 0
|
||||||
|
while not name or os.path.exists(os.path.join(odir, name)):
|
||||||
|
i += 1
|
||||||
|
name = '%04d.html'%i
|
||||||
|
with open(os.path.join(odir, name), 'wb') as f:
|
||||||
|
f.write(raw.encode('utf-8'))
|
||||||
|
|
||||||
|
#dump(html, 'pre-preprocess')
|
||||||
|
|
||||||
|
for rule in rules + end_rules:
|
||||||
|
html = rule[0].sub(rule[1], html)
|
||||||
|
|
||||||
|
#dump(html, 'post-preprocess')
|
||||||
|
|
||||||
# Handle broken XHTML w/ SVG (ugh)
|
# Handle broken XHTML w/ SVG (ugh)
|
||||||
if 'svg:' in html and SVG_NS not in html:
|
if 'svg:' in html and SVG_NS not in html:
|
||||||
html = html.replace(
|
html = html.replace(
|
||||||
|
@ -14,7 +14,7 @@ from calibre.gui2.convert.regex_builder_ui import Ui_RegexBuilder
|
|||||||
from calibre.gui2.convert.xexp_edit_ui import Ui_Form as Ui_Edit
|
from calibre.gui2.convert.xexp_edit_ui import Ui_Form as Ui_Edit
|
||||||
from calibre.gui2 import error_dialog, choose_files
|
from calibre.gui2 import error_dialog, choose_files
|
||||||
from calibre.ebooks.oeb.iterator import EbookIterator
|
from calibre.ebooks.oeb.iterator import EbookIterator
|
||||||
from calibre.ebooks.conversion.preprocess import convert_entities
|
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
|
||||||
from calibre.gui2.dialogs.choose_format import ChooseFormatDialog
|
from calibre.gui2.dialogs.choose_format import ChooseFormatDialog
|
||||||
|
|
||||||
class RegexBuilder(QDialog, Ui_RegexBuilder):
|
class RegexBuilder(QDialog, Ui_RegexBuilder):
|
||||||
@ -91,10 +91,10 @@ class RegexBuilder(QDialog, Ui_RegexBuilder):
|
|||||||
self.iterator = EbookIterator(pathtoebook)
|
self.iterator = EbookIterator(pathtoebook)
|
||||||
self.iterator.__enter__(only_input_plugin=True)
|
self.iterator.__enter__(only_input_plugin=True)
|
||||||
text = [u'']
|
text = [u'']
|
||||||
ent_pat = re.compile(r'&(\S+?);')
|
preprocessor = HTMLPreProcessor(None, False)
|
||||||
for path in self.iterator.spine:
|
for path in self.iterator.spine:
|
||||||
html = open(path, 'rb').read().decode('utf-8', 'replace')
|
html = open(path, 'rb').read().decode('utf-8', 'replace')
|
||||||
html = ent_pat.sub(convert_entities, html)
|
html = preprocessor(html, get_preprocess_html=True)
|
||||||
text.append(html)
|
text.append(html)
|
||||||
self.preview.setPlainText('\n---\n'.join(text))
|
self.preview.setPlainText('\n---\n'.join(text))
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user