py3: Port preprocess.py

Now uses unicode literals and also fixes various regexps that failed to
compile or raised future warnings under py3.
This commit is contained in:
Kovid Goyal 2019-05-27 09:39:47 +05:30
parent ed9b4fe49b
commit 66c0721ba7
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -1,15 +1,16 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
# flake8: noqa from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import functools, re, json import functools, re, json
from math import ceil
from calibre import entity_to_unicode, as_unicode from calibre import entity_to_unicode, as_unicode
from polyglot.builtins import unicode_type from polyglot.builtins import unicode_type, range
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg' SVG_NS = 'http://www.w3.org/2000/svg'
@ -17,32 +18,32 @@ XLINK_NS = 'http://www.w3.org/1999/xlink'
convert_entities = functools.partial(entity_to_unicode, convert_entities = functools.partial(entity_to_unicode,
result_exceptions={ result_exceptions={
u'<' : '&lt;', '<' : '&lt;',
u'>' : '&gt;', '>' : '&gt;',
u"'" : '&apos;', "'" : '&apos;',
u'"' : '&quot;', '"' : '&quot;',
u'&' : '&amp;', '&' : '&amp;',
}) })
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE) _span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
LIGATURES = { LIGATURES = {
# u'\u00c6': u'AE', # '\u00c6': 'AE',
# u'\u00e6': u'ae', # '\u00e6': 'ae',
# u'\u0152': u'OE', # '\u0152': 'OE',
# u'\u0153': u'oe', # '\u0153': 'oe',
# u'\u0132': u'IJ', # '\u0132': 'IJ',
# u'\u0133': u'ij', # '\u0133': 'ij',
# u'\u1D6B': u'ue', # '\u1D6B': 'ue',
u'\uFB00': u'ff', '\uFB00': 'ff',
u'\uFB01': u'fi', '\uFB01': 'fi',
u'\uFB02': u'fl', '\uFB02': 'fl',
u'\uFB03': u'ffi', '\uFB03': 'ffi',
u'\uFB04': u'ffl', '\uFB04': 'ffl',
u'\uFB05': u'ft', '\uFB05': 'ft',
u'\uFB06': u'st', '\uFB06': 'st',
} }
_ligpat = re.compile(u'|'.join(LIGATURES)) _ligpat = re.compile('|'.join(LIGATURES))
def sanitize_head(match): def sanitize_head(match):
@ -96,9 +97,9 @@ class DocAnalysis(object):
def __init__(self, format='html', raw=''): def __init__(self, format='html', raw=''):
raw = raw.replace('&nbsp;', ' ') raw = raw.replace('&nbsp;', ' ')
if format == 'html': if format == 'html':
linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL) linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
elif format == 'pdf': elif format == 'pdf':
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL) linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html': elif format == 'spanned_html':
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL) linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
elif format == 'txt': elif format == 'txt':
@ -110,7 +111,7 @@ class DocAnalysis(object):
Analyses the document to find the median line length. Analyses the document to find the median line length.
percentage is a decimal number, 0 - 1 which is used to determine percentage is a decimal number, 0 - 1 which is used to determine
how far in the list of line lengths to use. The list of line lengths is how far in the list of line lengths to use. The list of line lengths is
ordered smallest to larged and does not include duplicates. 0.5 is the ordered smallest to largest and does not include duplicates. 0.5 is the
median value. median value.
''' '''
lengths = [] lengths = []
@ -124,7 +125,7 @@ class DocAnalysis(object):
lengths = list(set(lengths)) lengths = list(set(lengths))
total = sum(lengths) total = sum(lengths)
avg = total / len(lengths) avg = total / len(lengths)
max_line = avg * 2 max_line = ceil(avg * 2)
lengths = sorted(lengths) lengths = sorted(lengths)
for i in range(len(lengths) - 1, -1, -1): for i in range(len(lengths) - 1, -1, -1):
@ -163,9 +164,9 @@ class DocAnalysis(object):
for line in self.lines: for line in self.lines:
l = len(line) l = len(line)
if l > minLineLength and l < maxLineLength: if l > minLineLength and l < maxLineLength:
l = int(l/100) l = int(l/100)
# print "adding "+str(l) # print "adding "+str(l)
hRaw[l]+=1 hRaw[l]+=1
# Normalize the histogram into percents # Normalize the histogram into percents
totalLines = len(self.lines) totalLines = len(self.lines)
@ -204,7 +205,10 @@ class Dehyphenator(object):
# Add common suffixes to the regex below to increase the likelihood of a match - # Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex' # don't add suffixes which are also complete words, such as 'able' or 'sex'
# only remove if it's not already the point of hyphenation # only remove if it's not already the point of hyphenation
self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$" # noqa self.suffix_string = (
"((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
"(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
"(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE) self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE) self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation # remove prefixes if the prefix was not already the point of hyphenation
@ -245,7 +249,7 @@ class Dehyphenator(object):
else: else:
if self.verbose > 2: if self.verbose > 2:
self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)) self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
return firsthalf+u'\u2014'+wraptags+secondhalf return firsthalf+'\u2014'+wraptags+secondhalf
else: else:
if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6: if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
@ -269,17 +273,27 @@ class Dehyphenator(object):
self.html = html self.html = html
self.format = format self.format = format
if format == 'html': if format == 'html':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length) # noqa intextmatch = re.compile((
r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?'
r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)'
r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length)
elif format == 'pdf': elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length) intextmatch = re.compile((
r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?P<wraptags><p>|'
r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
elif format == 'txt': elif format == 'txt':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length) # noqa intextmatch = re.compile(
'(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
elif format == 'individual_words': elif format == 'individual_words':
intextmatch = re.compile(u'(?!<)(?P<firstpart>[^\W\-]+)(-|)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE) intextmatch = re.compile(
r'(?!<)(?P<firstpart>[^\W\-]+)(-|)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
elif format == 'html_cleanup': elif format == 'html_cleanup':
intextmatch = re.compile(u'(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)') # noqa intextmatch = re.compile(
r'(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>'
r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
elif format == 'txt_cleanup': elif format == 'txt_cleanup':
intextmatch = re.compile(u'(?P<firstpart>[^\W\-]+)(-|)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)') intextmatch = re.compile(
r'(?P<firstpart>[^\W\-]+)(-|)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
html = intextmatch.sub(self.dehyphenate, html) html = intextmatch.sub(self.dehyphenate, html)
return html return html
@ -316,7 +330,7 @@ class CSSPreProcessor(object):
# are commented lines before the first @import or @charset rule. Since # are commented lines before the first @import or @charset rule. Since
# the conversion will remove all stylesheets anyway, we don't lose # the conversion will remove all stylesheets anyway, we don't lose
# anything # anything
data = re.sub(unicode_type(r'/\*.*?\*/'), u'', data, flags=re.DOTALL) data = re.sub(unicode_type(r'/\*.*?\*/'), '', data, flags=re.DOTALL)
ans, namespaced = [], False ans, namespaced = [], False
for line in data.splitlines(): for line in data.splitlines():
@ -327,7 +341,7 @@ class CSSPreProcessor(object):
namespaced = True namespaced = True
ans.append(line) ans.append(line)
return u'\n'.join(ans) return '\n'.join(ans)
class HTMLPreProcessor(object): class HTMLPreProcessor(object):
@ -350,121 +364,121 @@ class HTMLPreProcessor(object):
# Fix pdftohtml markup # Fix pdftohtml markup
PDFTOHTML = [ PDFTOHTML = [
# Fix umlauts # Fix umlauts
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'), (re.compile(r'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'), (re.compile(r'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'), (re.compile(r'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ë'),
(re.compile(u'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ë'), (re.compile(r'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ë'),
(re.compile(u'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ï'), (re.compile(r'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ï'),
(re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'), (re.compile(r'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ï'),
(re.compile(u'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ö'), (re.compile(r'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ö'),
(re.compile(u'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ö'), (re.compile(r'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ö'),
(re.compile(u'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ü'), (re.compile(r'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ü'),
(re.compile(u'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ü'), (re.compile(r'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ü'),
# Fix accents # Fix accents
# ` # `
(re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'), (re.compile(r'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'à'),
(re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'), (re.compile(r'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'À'),
(re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'), (re.compile(r'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'è'),
(re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'), (re.compile(r'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'È'),
(re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'), (re.compile(r'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ì'),
(re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'), (re.compile(r'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ì'),
(re.compile(u'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ò'), (re.compile(r'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ò'),
(re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'), (re.compile(r'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ò'),
(re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'), (re.compile(r'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ù'),
(re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'), (re.compile(r'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ù'),
# ` with letter before # ` with letter before
(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'), (re.compile(r'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'à'),
(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'), (re.compile(r'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'À'),
(re.compile(u'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'è'), (re.compile(r'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'è'),
(re.compile(u'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'È'), (re.compile(r'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'È'),
(re.compile(u'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ì'), (re.compile(r'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ì'),
(re.compile(u'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ì'), (re.compile(r'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ì'),
(re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'), (re.compile(r'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ò'),
(re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'), (re.compile(r'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ò'),
(re.compile(u'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ù'), (re.compile(r'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ù'),
(re.compile(u'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ù'), (re.compile(r'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ù'),
# ´ # ´
(re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'), (re.compile(r'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'á'),
(re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'), (re.compile(r'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Á'),
(re.compile(u'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ć'), (re.compile(r'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ć'),
(re.compile(u'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ć'), (re.compile(r'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ć'),
(re.compile(u'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'é'), (re.compile(r'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'é'),
(re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'), (re.compile(r'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'É'),
(re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'), (re.compile(r'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'í'),
(re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'), (re.compile(r'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Í'),
(re.compile(u'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ĺ'), (re.compile(r'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ĺ'),
(re.compile(u'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ĺ'), (re.compile(r'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ĺ'),
(re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'), (re.compile(r'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ó'),
(re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'), (re.compile(r'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ó'),
(re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'), (re.compile(r'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ń'),
(re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'), (re.compile(r'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ń'),
(re.compile(u'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ŕ'), (re.compile(r'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ŕ'),
(re.compile(u'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ŕ'), (re.compile(r'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ŕ'),
(re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'), (re.compile(r'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'ś'),
(re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'), (re.compile(r'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Ś'),
(re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'), (re.compile(r'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ú'),
(re.compile(u'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ú'), (re.compile(r'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ú'),
(re.compile(u'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ź'), (re.compile(r'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ź'),
(re.compile(u'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ź'), (re.compile(r'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ź'),
# ˆ # ˆ
(re.compile(u'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'â'), (re.compile(r'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'â'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Â'), (re.compile(r'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Â'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ê'), (re.compile(r'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ê'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ê'), (re.compile(r'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ê'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'î'), (re.compile(r'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'î'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Î'), (re.compile(r'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Î'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ô'), (re.compile(r'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ô'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ô'), (re.compile(r'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ô'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'û'), (re.compile(r'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'û'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Û'), (re.compile(r'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Û'),
# ¸ # ¸
(re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'), (re.compile(r'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ç'),
(re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'), (re.compile(r'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ç'),
# ˛ # ˛
(re.compile(u'\s*˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'), (re.compile(r'\s*˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ą'),
(re.compile(u'\s*˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'), (re.compile(r'\s*˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ą'),
(re.compile(u'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ę'), (re.compile(r'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ę'),
(re.compile(u'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ę'), (re.compile(r'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ę'),
# ˙ # ˙
(re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'), (re.compile(r'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ż'),
(re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'), (re.compile(r'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ż'),
# ˇ # ˇ
(re.compile(u'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'č'), (re.compile(r'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'č'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Č'), (re.compile(r'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Č'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: u'ď'), (re.compile(r'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: 'ď'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: u'Ď'), (re.compile(r'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: 'Ď'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ě'), (re.compile(r'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ě'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ě'), (re.compile(r'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ě'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ľ'), (re.compile(r'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ľ'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ľ'), (re.compile(r'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ľ'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ň'), (re.compile(r'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ň'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ň'), (re.compile(r'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ň'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ř'), (re.compile(r'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ř'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ř'), (re.compile(r'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ř'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'š'), (re.compile(r'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'š'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Š'), (re.compile(r'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Š'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: u'ť'), (re.compile(r'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: 'ť'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: u'Ť'), (re.compile(r'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: 'Ť'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ž'), (re.compile(r'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ž'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ž'), (re.compile(r'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ž'),
# ° # °
(re.compile(u'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ů'), (re.compile(r'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ů'),
(re.compile(u'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ů'), (re.compile(r'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ů'),
# If pdf printed from a browser then the header/footer has a reliable pattern # If pdf printed from a browser then the header/footer has a reliable pattern
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
# Center separator lines # Center separator lines
(re.compile(u'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'), (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
# Remove <hr> tags # Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''), (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
@ -478,9 +492,9 @@ class HTMLPreProcessor(object):
(re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'), (re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'),
# Clean up spaces # Clean up spaces
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics # Add space before and after italics
(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), (re.compile(r'(?<!“)<i>'), lambda match: ' <i>'),
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '), (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
] ]
@ -490,9 +504,9 @@ class HTMLPreProcessor(object):
(re.compile('<hr>', re.IGNORECASE), (re.compile('<hr>', re.IGNORECASE),
lambda match : '<span style="page-break-after:always"> </span>'), lambda match : '<span style="page-break-after:always"> </span>'),
# Create header tags # Create header tags
(re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))), lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))), lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)), lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
@ -570,9 +584,11 @@ class HTMLPreProcessor(object):
# delete soft hyphens - moved here so it's executed after header/footer removal # delete soft hyphens - moved here so it's executed after header/footer removal
if is_pdftohtml: if is_pdftohtml:
# unwrap/delete soft hyphens # unwrap/delete soft hyphens
end_rules.append((re.compile(u'[­](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: '')) end_rules.append((re.compile(
r'[­](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting # unwrap/delete soft hyphens with formatting
end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) end_rules.append((re.compile(
r'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: ''))
length = -1 length = -1
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
@ -581,13 +597,14 @@ class HTMLPreProcessor(object):
if length: if length:
# print "The pdf line length returned is " + str(length) # print "The pdf line length returned is " + str(length)
# unwrap em/en dashes # unwrap em/en dashes
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append((re.compile(
r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
end_rules.append( end_rules.append(
# Un wrap using punctuation # Un wrap using punctuation
(re.compile(( (re.compile((
u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IA\u00DF]' r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IA\u00DF]'
u'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?' r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
u'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines), r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
) )
for rule in self.PREPROCESS + start_rules: for rule in self.PREPROCESS + start_rules:
@ -657,7 +674,7 @@ class HTMLPreProcessor(object):
from calibre.utils.localization import get_udc from calibre.utils.localization import get_udc
from calibre.utils.mreplace import MReplace from calibre.utils.mreplace import MReplace
unihandecoder = get_udc() unihandecoder = get_udc()
mr = MReplace(data={u'«':u'&lt;'*3, u'»':u'&gt;'*3}) mr = MReplace(data={'«':'&lt;'*3, '»':'&gt;'*3})
html = mr.mreplace(html) html = mr.mreplace(html)
html = unihandecoder.decode(html) html = unihandecoder.decode(html)
@ -675,7 +692,7 @@ class HTMLPreProcessor(object):
try: try:
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
except AttributeError: except AttributeError:
unsupported_unicode_chars = u'' unsupported_unicode_chars = ''
if unsupported_unicode_chars: if unsupported_unicode_chars:
from calibre.utils.localization import get_udc from calibre.utils.localization import get_udc
unihandecoder = get_udc() unihandecoder = get_udc()