diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index d791d45aad..846f27198b 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -1,15 +1,16 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-# flake8: noqa
+from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import functools, re, json
+from math import ceil
from calibre import entity_to_unicode, as_unicode
-from polyglot.builtins import unicode_type
+from polyglot.builtins import unicode_type, range
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
@@ -17,32 +18,32 @@ XLINK_NS = 'http://www.w3.org/1999/xlink'
convert_entities = functools.partial(entity_to_unicode,
result_exceptions={
- u'<' : '&lt;',
- u'>' : '&gt;',
- u"'" : '&apos;',
- u'"' : '&quot;',
- u'&' : '&amp;',
+ '<' : '&lt;',
+ '>' : '&gt;',
+ "'" : '&apos;',
+ '"' : '&quot;',
+ '&' : '&amp;',
})
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
LIGATURES = {
-# u'\u00c6': u'AE',
-# u'\u00e6': u'ae',
-# u'\u0152': u'OE',
-# u'\u0153': u'oe',
-# u'\u0132': u'IJ',
-# u'\u0133': u'ij',
-# u'\u1D6B': u'ue',
- u'\uFB00': u'ff',
- u'\uFB01': u'fi',
- u'\uFB02': u'fl',
- u'\uFB03': u'ffi',
- u'\uFB04': u'ffl',
- u'\uFB05': u'ft',
- u'\uFB06': u'st',
+# '\u00c6': 'AE',
+# '\u00e6': 'ae',
+# '\u0152': 'OE',
+# '\u0153': 'oe',
+# '\u0132': 'IJ',
+# '\u0133': 'ij',
+# '\u1D6B': 'ue',
+ '\uFB00': 'ff',
+ '\uFB01': 'fi',
+ '\uFB02': 'fl',
+ '\uFB03': 'ffi',
+ '\uFB04': 'ffl',
+ '\uFB05': 'ft',
+ '\uFB06': 'st',
}
-_ligpat = re.compile(u'|'.join(LIGATURES))
+_ligpat = re.compile('|'.join(LIGATURES))
def sanitize_head(match):
@@ -96,9 +97,9 @@ class DocAnalysis(object):
def __init__(self, format='html', raw=''):
raw = raw.replace('&nbsp;', ' ')
if format == 'html':
- linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
+ linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
elif format == 'pdf':
- linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
+ linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html':
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
elif format == 'txt':
@@ -110,7 +111,7 @@ class DocAnalysis(object):
Analyses the document to find the median line length.
percentage is a decimal number, 0 - 1 which is used to determine
how far in the list of line lengths to use. The list of line lengths is
- ordered smallest to larged and does not include duplicates. 0.5 is the
+ ordered smallest to largest and does not include duplicates. 0.5 is the
median value.
'''
lengths = []
@@ -124,7 +125,7 @@ class DocAnalysis(object):
lengths = list(set(lengths))
total = sum(lengths)
avg = total / len(lengths)
- max_line = avg * 2
+ max_line = ceil(avg * 2)
lengths = sorted(lengths)
for i in range(len(lengths) - 1, -1, -1):
@@ -163,9 +164,9 @@ class DocAnalysis(object):
for line in self.lines:
l = len(line)
if l > minLineLength and l < maxLineLength:
- l = int(l/100)
- # print "adding "+str(l)
- hRaw[l]+=1
+ l = int(l/100)
+ # print "adding "+str(l)
+ hRaw[l]+=1
# Normalize the histogram into percents
totalLines = len(self.lines)
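
For orientation, here is a standalone sketch of the percentile-style line-length pick that line_length() (whose docstring is fixed above) performs; the helper name and the simplified selection logic are illustrative assumptions, not calibre's API:

```python
def typical_line_length(lines, percent=0.5):
    # Illustrative sketch only: de-duplicate and sort the observed line
    # lengths, then take the value the requested fraction of the way
    # through the list ("0.5 is the median value" per the docstring).
    lengths = sorted(set(len(line) for line in lines if line.strip()))
    if not lengths:
        return 0
    index = min(int(len(lengths) * percent), len(lengths) - 1)
    return lengths[index]


sample = ['short', 'a medium sized line', 'a noticeably longer line of text here']
print(typical_line_length(sample, 0.5))  # -> 19 for this sample
```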
@@ -204,7 +205,10 @@ class Dehyphenator(object):
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
# only remove if it's not already the point of hyphenation
- self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$" # noqa
+ self.suffix_string = (
+ "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
+ "(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
+ "(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
@@ -245,7 +249,7 @@ class Dehyphenator(object):
else:
if self.verbose > 2:
self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
- return firsthalf+u'\u2014'+wraptags+secondhalf
+ return firsthalf+'\u2014'+wraptags+secondhalf
else:
if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
@@ -269,17 +273,27 @@ class Dehyphenator(object):
self.html = html
self.format = format
if format == 'html':
- intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length) # noqa
+ intextmatch = re.compile((
+ r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?'
+ r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)'
+ r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length)
elif format == 'pdf':
- intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+ intextmatch = re.compile((
+ r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<wraptags><p>|'
+ r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
elif format == 'txt':
- intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length) # noqa
+ intextmatch = re.compile(
+ '(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
elif format == 'individual_words':
- intextmatch = re.compile(u'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
+ intextmatch = re.compile(
+ r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
elif format == 'html_cleanup':
- intextmatch = re.compile(u'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)') # noqa
+ intextmatch = re.compile(
+ r'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>'
+ r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
elif format == 'txt_cleanup':
- intextmatch = re.compile(u'(?P<firstpart>[^\W\-]+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
+ intextmatch = re.compile(
+ r'(?P<firstpart>[^\W\-]+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
html = intextmatch.sub(self.dehyphenate, html)
return html
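
A rough usage sketch of the dehyphenation entry point reworked above; the constructor arguments (verbose, log) are assumed from how Dehyphenator is instantiated elsewhere in calibre, so treat this as a sketch rather than documented API:

```python
from calibre.ebooks.conversion.preprocess import Dehyphenator

# The joined form ("example") must also occur elsewhere in the document,
# otherwise dehyphenate() keeps the hyphen and only rewraps the text.
html = '<p>For example, this is an exam-</p>\n<p>ple of a wrapped word.</p>'
dehyphenator = Dehyphenator(verbose=0, log=None)  # assumed signature
fixed = dehyphenator(html, 'html', length=10)     # the 'html' branch shown above
print(fixed)
```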
@@ -316,7 +330,7 @@ class CSSPreProcessor(object):
# are commented lines before the first @import or @charset rule. Since
# the conversion will remove all stylesheets anyway, we don't lose
# anything
- data = re.sub(unicode_type(r'/\*.*?\*/'), u'', data, flags=re.DOTALL)
+ data = re.sub(unicode_type(r'/\*.*?\*/'), '', data, flags=re.DOTALL)
ans, namespaced = [], False
for line in data.splitlines():
@@ -327,7 +341,7 @@ class CSSPreProcessor(object):
namespaced = True
ans.append(line)
- return u'\n'.join(ans)
+ return '\n'.join(ans)
class HTMLPreProcessor(object):
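
The CSS comment-stripping substitution touched in the hunk above behaves like this on its own; a standalone illustration, not part of the patch:

```python
import re

css = '''/* legacy banner
   spanning two lines */
@import url("fonts.css");
p { color: black; }  /* inline note */
'''
# re.DOTALL lets '.' cross newlines, so multi-line /* ... */ blocks are removed too.
print(re.sub(r'/\*.*?\*/', '', css, flags=re.DOTALL))
```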
@@ -350,121 +364,121 @@ class HTMLPreProcessor(object):
# Fix pdftohtml markup
PDFTOHTML = [
# Fix umlauts
- (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
- (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
- (re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
- (re.compile(u'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ë'),
- (re.compile(u'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ï'),
- (re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
- (re.compile(u'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ö'),
- (re.compile(u'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ö'),
- (re.compile(u'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ü'),
- (re.compile(u'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ü'),
+ (re.compile(r'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ä'),
+ (re.compile(r'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ä'),
+ (re.compile(r'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ë'),
+ (re.compile(r'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ë'),
+ (re.compile(r'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ï'),
+ (re.compile(r'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ï'),
+ (re.compile(r'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ö'),
+ (re.compile(r'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ö'),
+ (re.compile(r'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ü'),
+ (re.compile(r'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ü'),
# Fix accents
# `
- (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'),
- (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'),
- (re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'),
- (re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'),
- (re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'),
- (re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'),
- (re.compile(u'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ò'),
- (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
- (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
- (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
+ (re.compile(r'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'à'),
+ (re.compile(r'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'À'),
+ (re.compile(r'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'è'),
+ (re.compile(r'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'È'),
+ (re.compile(r'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ì'),
+ (re.compile(r'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ì'),
+ (re.compile(r'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ò'),
+ (re.compile(r'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ò'),
+ (re.compile(r'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ù'),
+ (re.compile(r'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ù'),
# ` with letter before
- (re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
- (re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
- (re.compile(u'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'è'),
- (re.compile(u'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'È'),
- (re.compile(u'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ì'),
- (re.compile(u'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ì'),
- (re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'),
- (re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'),
- (re.compile(u'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ù'),
- (re.compile(u'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ù'),
+ (re.compile(r'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'à'),
+ (re.compile(r'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'À'),
+ (re.compile(r'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'è'),
+ (re.compile(r'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'È'),
+ (re.compile(r'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ì'),
+ (re.compile(r'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ì'),
+ (re.compile(r'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ò'),
+ (re.compile(r'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ò'),
+ (re.compile(r'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ù'),
+ (re.compile(r'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ù'),
# ´
- (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
- (re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'),
- (re.compile(u'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ć'),
- (re.compile(u'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ć'),
- (re.compile(u'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'é'),
- (re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'),
- (re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'),
- (re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'),
- (re.compile(u'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ĺ'),
- (re.compile(u'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ĺ'),
- (re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
- (re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
- (re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'),
- (re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'),
- (re.compile(u'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ŕ'),
- (re.compile(u'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ŕ'),
- (re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'),
- (re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'),
- (re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'),
- (re.compile(u'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ú'),
- (re.compile(u'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ź'),
- (re.compile(u'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ź'),
+ (re.compile(r'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'á'),
+ (re.compile(r'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Á'),
+ (re.compile(r'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ć'),
+ (re.compile(r'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ć'),
+ (re.compile(r'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'é'),
+ (re.compile(r'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'É'),
+ (re.compile(r'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'í'),
+ (re.compile(r'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Í'),
+ (re.compile(r'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ĺ'),
+ (re.compile(r'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ĺ'),
+ (re.compile(r'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ó'),
+ (re.compile(r'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ó'),
+ (re.compile(r'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ń'),
+ (re.compile(r'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ń'),
+ (re.compile(r'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ŕ'),
+ (re.compile(r'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ŕ'),
+ (re.compile(r'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'ś'),
+ (re.compile(r'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Ś'),
+ (re.compile(r'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ú'),
+ (re.compile(r'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ú'),
+ (re.compile(r'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ź'),
+ (re.compile(r'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ź'),
# ˆ
- (re.compile(u'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'â'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Â'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ê'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ê'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'î'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Î'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ô'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ô'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'û'),
- (re.compile(u'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Û'),
+ (re.compile(r'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'â'),
+ (re.compile(r'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Â'),
+ (re.compile(r'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ê'),
+ (re.compile(r'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ê'),
+ (re.compile(r'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'î'),
+ (re.compile(r'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Î'),
+ (re.compile(r'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ô'),
+ (re.compile(r'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ô'),
+ (re.compile(r'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'û'),
+ (re.compile(r'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Û'),
# ¸
- (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
- (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),
+ (re.compile(r'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ç'),
+ (re.compile(r'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ç'),
# ˛
- (re.compile(u'\s*˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'),
- (re.compile(u'\s*˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'),
- (re.compile(u'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ę'),
- (re.compile(u'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ę'),
+ (re.compile(r'\s*˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ą'),
+ (re.compile(r'\s*˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ą'),
+ (re.compile(r'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ę'),
+ (re.compile(r'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ę'),
# ˙
- (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
- (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
+ (re.compile(r'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ż'),
+ (re.compile(r'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ż'),
# ˇ
- (re.compile(u'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'č'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Č'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: u'ď'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: u'Ď'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ě'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ě'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ľ'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ľ'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ň'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ň'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ř'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ř'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'š'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Š'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: u'ť'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: u'Ť'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ž'),
- (re.compile(u'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ž'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'č'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Č'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: 'ď'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: 'Ď'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ě'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ě'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ľ'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ľ'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ň'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ň'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ř'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ř'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'š'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Š'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: 'ť'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: 'Ť'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ž'),
+ (re.compile(r'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ž'),
# °
- (re.compile(u'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ů'),
- (re.compile(u'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ů'),
+ (re.compile(r'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ů'),
+ (re.compile(r'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ů'),
# If pdf printed from a browser then the header/footer has a reliable pattern
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
# Center separator lines
- (re.compile(u'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
+ (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
# Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
@@ -478,9 +492,9 @@ class HTMLPreProcessor(object):
(re.compile(r'\s*