diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index d791d45aad..846f27198b 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -1,15 +1,16 @@ #!/usr/bin/env python2 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai -# flake8: noqa +from __future__ import absolute_import, division, print_function, unicode_literals __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' import functools, re, json +from math import ceil from calibre import entity_to_unicode, as_unicode -from polyglot.builtins import unicode_type +from polyglot.builtins import unicode_type, range XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') SVG_NS = 'http://www.w3.org/2000/svg' @@ -17,32 +18,32 @@ XLINK_NS = 'http://www.w3.org/1999/xlink' convert_entities = functools.partial(entity_to_unicode, result_exceptions={ - u'<' : '<', - u'>' : '>', - u"'" : ''', - u'"' : '"', - u'&' : '&', + '<' : '<', + '>' : '>', + "'" : ''', + '"' : '"', + '&' : '&', }) _span_pat = re.compile('', re.DOTALL|re.IGNORECASE) LIGATURES = { -# u'\u00c6': u'AE', -# u'\u00e6': u'ae', -# u'\u0152': u'OE', -# u'\u0153': u'oe', -# u'\u0132': u'IJ', -# u'\u0133': u'ij', -# u'\u1D6B': u'ue', - u'\uFB00': u'ff', - u'\uFB01': u'fi', - u'\uFB02': u'fl', - u'\uFB03': u'ffi', - u'\uFB04': u'ffl', - u'\uFB05': u'ft', - u'\uFB06': u'st', +# '\u00c6': 'AE', +# '\u00e6': 'ae', +# '\u0152': 'OE', +# '\u0153': 'oe', +# '\u0132': 'IJ', +# '\u0133': 'ij', +# '\u1D6B': 'ue', + '\uFB00': 'ff', + '\uFB01': 'fi', + '\uFB02': 'fl', + '\uFB03': 'ffi', + '\uFB04': 'ffl', + '\uFB05': 'ft', + '\uFB06': 'st', } -_ligpat = re.compile(u'|'.join(LIGATURES)) +_ligpat = re.compile('|'.join(LIGATURES)) def sanitize_head(match): @@ -96,9 +97,9 @@ class DocAnalysis(object): def __init__(self, format='html', raw=''): raw = raw.replace(' ', ' ') if format == 'html': - linere = re.compile('(?<=]*>\s*

).*?(?=

)', re.DOTALL) + linere = re.compile(r'(?<=]*>\s*

).*?(?=

)', re.DOTALL) elif format == 'pdf': - linere = re.compile('(?<=
)(?!\s*
).*?(?=
)', re.DOTALL) + linere = re.compile(r'(?<=
)(?!\s*
).*?(?=
)', re.DOTALL) elif format == 'spanned_html': linere = re.compile('(?<=)', re.DOTALL) elif format == 'txt': @@ -110,7 +111,7 @@ class DocAnalysis(object): Analyses the document to find the median line length. percentage is a decimal number, 0 - 1 which is used to determine how far in the list of line lengths to use. The list of line lengths is - ordered smallest to larged and does not include duplicates. 0.5 is the + ordered smallest to largest and does not include duplicates. 0.5 is the median value. ''' lengths = [] @@ -124,7 +125,7 @@ class DocAnalysis(object): lengths = list(set(lengths)) total = sum(lengths) avg = total / len(lengths) - max_line = avg * 2 + max_line = ceil(avg * 2) lengths = sorted(lengths) for i in range(len(lengths) - 1, -1, -1): @@ -163,9 +164,9 @@ class DocAnalysis(object): for line in self.lines: l = len(line) if l > minLineLength and l < maxLineLength: - l = int(l/100) - # print "adding "+str(l) - hRaw[l]+=1 + l = int(l/100) + # print "adding "+str(l) + hRaw[l]+=1 # Normalize the histogram into percents totalLines = len(self.lines) @@ -204,7 +205,10 @@ class Dehyphenator(object): # Add common suffixes to the regex below to increase the likelihood of a match - # don't add suffixes which are also complete words, such as 'able' or 'sex' # only remove if it's not already the point of hyphenation - self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$" # noqa + self.suffix_string = ( + "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|" + "(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|" + "(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$") self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE) self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE) # remove prefixes if the prefix was not already the point of hyphenation @@ -245,7 +249,7 @@ class Dehyphenator(object): else: if self.verbose > 2: self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)) - return firsthalf+u'\u2014'+wraptags+secondhalf + return firsthalf+'\u2014'+wraptags+secondhalf else: if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6: @@ -269,17 +273,27 @@ class Dehyphenator(object): self.html = html self.format = format if format == 'html': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\W\-]+)(-|‐)\s*(?=<)(?P()?\s*(\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*

\s*)?\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length) # noqa + intextmatch = re.compile(( + r'(?<=.{%i})(?P[^\W\-]+)(-|‐)\s*(?=<)(?P()?' + r'\s*(\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*

\s*)' + r'?\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)') % length) elif format == 'pdf': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\W\-]+)(-|‐)\s*(?P

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) + intextmatch = re.compile(( + r'(?<=.{%i})(?P[^\W\-]+)(-|‐)\s*(?P

|' + r'\s*

\s*<[iub]>)\s*(?P[\w\d]+)')% length) elif format == 'txt': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\W\-]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) # noqa + intextmatch = re.compile( + '(?<=.{%i})(?P[^\\W\\-]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\\w\\d]+)'% length) elif format == 'individual_words': - intextmatch = re.compile(u'(?!<)(?P[^\W\-]+)(-|‐)\s*(?P\w+)(?![^<]*?>)', re.UNICODE) + intextmatch = re.compile( + r'(?!<)(?P[^\W\-]+)(-|‐)\s*(?P\w+)(?![^<]*?>)', re.UNICODE) elif format == 'html_cleanup': - intextmatch = re.compile(u'(?P[^\W\-]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') # noqa + intextmatch = re.compile( + r'(?P[^\W\-]+)(-|‐)\s*(?=<)(?P\s*(' + r'\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') elif format == 'txt_cleanup': - intextmatch = re.compile(u'(?P[^\W\-]+)(-|‐)(?P\s+)(?P[\w\d]+)') + intextmatch = re.compile( + r'(?P[^\W\-]+)(-|‐)(?P\s+)(?P[\w\d]+)') html = intextmatch.sub(self.dehyphenate, html) return html @@ -316,7 +330,7 @@ class CSSPreProcessor(object): # are commented lines before the first @import or @charset rule. Since # the conversion will remove all stylesheets anyway, we don't lose # anything - data = re.sub(unicode_type(r'/\*.*?\*/'), u'', data, flags=re.DOTALL) + data = re.sub(unicode_type(r'/\*.*?\*/'), '', data, flags=re.DOTALL) ans, namespaced = [], False for line in data.splitlines(): @@ -327,7 +341,7 @@ class CSSPreProcessor(object): namespaced = True ans.append(line) - return u'\n'.join(ans) + return '\n'.join(ans) class HTMLPreProcessor(object): @@ -350,121 +364,121 @@ class HTMLPreProcessor(object): # Fix pdftohtml markup PDFTOHTML = [ # Fix umlauts - (re.compile(u'¨\s*()*\s*a', re.UNICODE), lambda match: u'ä'), - (re.compile(u'¨\s*()*\s*A', re.UNICODE), lambda match: u'Ä'), - (re.compile(u'¨\s*()*\s*e', re.UNICODE), lambda match: u'ë'), - (re.compile(u'¨\s*()*\s*E', re.UNICODE), lambda match: u'Ë'), - (re.compile(u'¨\s*()*\s*i', re.UNICODE), lambda match: u'ï'), - (re.compile(u'¨\s*()*\s*I', re.UNICODE), lambda match: u'Ï'), - (re.compile(u'¨\s*()*\s*o', re.UNICODE), lambda match: u'ö'), - (re.compile(u'¨\s*()*\s*O', re.UNICODE), lambda match: u'Ö'), - (re.compile(u'¨\s*()*\s*u', re.UNICODE), lambda match: u'ü'), - (re.compile(u'¨\s*()*\s*U', re.UNICODE), lambda match: u'Ü'), + (re.compile(r'¨\s*()*\s*a', re.UNICODE), lambda match: 'ä'), + (re.compile(r'¨\s*()*\s*A', re.UNICODE), lambda match: 'Ä'), + (re.compile(r'¨\s*()*\s*e', re.UNICODE), lambda match: 'ë'), + (re.compile(r'¨\s*()*\s*E', re.UNICODE), lambda match: 'Ë'), + (re.compile(r'¨\s*()*\s*i', re.UNICODE), lambda match: 'ï'), + (re.compile(r'¨\s*()*\s*I', re.UNICODE), lambda match: 'Ï'), + (re.compile(r'¨\s*()*\s*o', re.UNICODE), lambda match: 'ö'), + (re.compile(r'¨\s*()*\s*O', re.UNICODE), lambda match: 'Ö'), + (re.compile(r'¨\s*()*\s*u', re.UNICODE), lambda match: 'ü'), + (re.compile(r'¨\s*()*\s*U', re.UNICODE), lambda match: 'Ü'), # Fix accents # ` - (re.compile(u'`\s*()*\s*a', re.UNICODE), lambda match: u'à'), - (re.compile(u'`\s*()*\s*A', re.UNICODE), lambda match: u'À'), - (re.compile(u'`\s*()*\s*e', re.UNICODE), lambda match: u'è'), - (re.compile(u'`\s*()*\s*E', re.UNICODE), lambda match: u'È'), - (re.compile(u'`\s*()*\s*i', re.UNICODE), lambda match: u'ì'), - (re.compile(u'`\s*()*\s*I', re.UNICODE), lambda match: u'Ì'), - (re.compile(u'`\s*()*\s*o', re.UNICODE), lambda match: u'ò'), - (re.compile(u'`\s*()*\s*O', re.UNICODE), lambda match: u'Ò'), - (re.compile(u'`\s*()*\s*u', re.UNICODE), lambda match: u'ù'), - (re.compile(u'`\s*()*\s*U', re.UNICODE), lambda match: u'Ù'), + (re.compile(r'`\s*()*\s*a', re.UNICODE), lambda match: 'à'), + (re.compile(r'`\s*()*\s*A', re.UNICODE), lambda match: 'À'), + (re.compile(r'`\s*()*\s*e', re.UNICODE), lambda match: 'è'), + (re.compile(r'`\s*()*\s*E', re.UNICODE), lambda match: 'È'), + (re.compile(r'`\s*()*\s*i', re.UNICODE), lambda match: 'ì'), + (re.compile(r'`\s*()*\s*I', re.UNICODE), lambda match: 'Ì'), + (re.compile(r'`\s*()*\s*o', re.UNICODE), lambda match: 'ò'), + (re.compile(r'`\s*()*\s*O', re.UNICODE), lambda match: 'Ò'), + (re.compile(r'`\s*()*\s*u', re.UNICODE), lambda match: 'ù'), + (re.compile(r'`\s*()*\s*U', re.UNICODE), lambda match: 'Ù'), # ` with letter before - (re.compile(u'a\s*()*\s*`', re.UNICODE), lambda match: u'à'), - (re.compile(u'A\s*()*\s*`', re.UNICODE), lambda match: u'À'), - (re.compile(u'e\s*()*\s*`', re.UNICODE), lambda match: u'è'), - (re.compile(u'E\s*()*\s*`', re.UNICODE), lambda match: u'È'), - (re.compile(u'i\s*()*\s*`', re.UNICODE), lambda match: u'ì'), - (re.compile(u'I\s*()*\s*`', re.UNICODE), lambda match: u'Ì'), - (re.compile(u'o\s*()*\s*`', re.UNICODE), lambda match: u'ò'), - (re.compile(u'O\s*()*\s*`', re.UNICODE), lambda match: u'Ò'), - (re.compile(u'u\s*()*\s*`', re.UNICODE), lambda match: u'ù'), - (re.compile(u'U\s*()*\s*`', re.UNICODE), lambda match: u'Ù'), + (re.compile(r'a\s*()*\s*`', re.UNICODE), lambda match: 'à'), + (re.compile(r'A\s*()*\s*`', re.UNICODE), lambda match: 'À'), + (re.compile(r'e\s*()*\s*`', re.UNICODE), lambda match: 'è'), + (re.compile(r'E\s*()*\s*`', re.UNICODE), lambda match: 'È'), + (re.compile(r'i\s*()*\s*`', re.UNICODE), lambda match: 'ì'), + (re.compile(r'I\s*()*\s*`', re.UNICODE), lambda match: 'Ì'), + (re.compile(r'o\s*()*\s*`', re.UNICODE), lambda match: 'ò'), + (re.compile(r'O\s*()*\s*`', re.UNICODE), lambda match: 'Ò'), + (re.compile(r'u\s*()*\s*`', re.UNICODE), lambda match: 'ù'), + (re.compile(r'U\s*()*\s*`', re.UNICODE), lambda match: 'Ù'), # ´ - (re.compile(u'´\s*()*\s*a', re.UNICODE), lambda match: u'á'), - (re.compile(u'´\s*()*\s*A', re.UNICODE), lambda match: u'Á'), - (re.compile(u'´\s*()*\s*c', re.UNICODE), lambda match: u'ć'), - (re.compile(u'´\s*()*\s*C', re.UNICODE), lambda match: u'Ć'), - (re.compile(u'´\s*()*\s*e', re.UNICODE), lambda match: u'é'), - (re.compile(u'´\s*()*\s*E', re.UNICODE), lambda match: u'É'), - (re.compile(u'´\s*()*\s*i', re.UNICODE), lambda match: u'í'), - (re.compile(u'´\s*()*\s*I', re.UNICODE), lambda match: u'Í'), - (re.compile(u'´\s*()*\s*l', re.UNICODE), lambda match: u'ĺ'), - (re.compile(u'´\s*()*\s*L', re.UNICODE), lambda match: u'Ĺ'), - (re.compile(u'´\s*()*\s*o', re.UNICODE), lambda match: u'ó'), - (re.compile(u'´\s*()*\s*O', re.UNICODE), lambda match: u'Ó'), - (re.compile(u'´\s*()*\s*n', re.UNICODE), lambda match: u'ń'), - (re.compile(u'´\s*()*\s*N', re.UNICODE), lambda match: u'Ń'), - (re.compile(u'´\s*()*\s*r', re.UNICODE), lambda match: u'ŕ'), - (re.compile(u'´\s*()*\s*R', re.UNICODE), lambda match: u'Ŕ'), - (re.compile(u'´\s*()*\s*s', re.UNICODE), lambda match: u'ś'), - (re.compile(u'´\s*()*\s*S', re.UNICODE), lambda match: u'Ś'), - (re.compile(u'´\s*()*\s*u', re.UNICODE), lambda match: u'ú'), - (re.compile(u'´\s*()*\s*U', re.UNICODE), lambda match: u'Ú'), - (re.compile(u'´\s*()*\s*z', re.UNICODE), lambda match: u'ź'), - (re.compile(u'´\s*()*\s*Z', re.UNICODE), lambda match: u'Ź'), + (re.compile(r'´\s*()*\s*a', re.UNICODE), lambda match: 'á'), + (re.compile(r'´\s*()*\s*A', re.UNICODE), lambda match: 'Á'), + (re.compile(r'´\s*()*\s*c', re.UNICODE), lambda match: 'ć'), + (re.compile(r'´\s*()*\s*C', re.UNICODE), lambda match: 'Ć'), + (re.compile(r'´\s*()*\s*e', re.UNICODE), lambda match: 'é'), + (re.compile(r'´\s*()*\s*E', re.UNICODE), lambda match: 'É'), + (re.compile(r'´\s*()*\s*i', re.UNICODE), lambda match: 'í'), + (re.compile(r'´\s*()*\s*I', re.UNICODE), lambda match: 'Í'), + (re.compile(r'´\s*()*\s*l', re.UNICODE), lambda match: 'ĺ'), + (re.compile(r'´\s*()*\s*L', re.UNICODE), lambda match: 'Ĺ'), + (re.compile(r'´\s*()*\s*o', re.UNICODE), lambda match: 'ó'), + (re.compile(r'´\s*()*\s*O', re.UNICODE), lambda match: 'Ó'), + (re.compile(r'´\s*()*\s*n', re.UNICODE), lambda match: 'ń'), + (re.compile(r'´\s*()*\s*N', re.UNICODE), lambda match: 'Ń'), + (re.compile(r'´\s*()*\s*r', re.UNICODE), lambda match: 'ŕ'), + (re.compile(r'´\s*()*\s*R', re.UNICODE), lambda match: 'Ŕ'), + (re.compile(r'´\s*()*\s*s', re.UNICODE), lambda match: 'ś'), + (re.compile(r'´\s*()*\s*S', re.UNICODE), lambda match: 'Ś'), + (re.compile(r'´\s*()*\s*u', re.UNICODE), lambda match: 'ú'), + (re.compile(r'´\s*()*\s*U', re.UNICODE), lambda match: 'Ú'), + (re.compile(r'´\s*()*\s*z', re.UNICODE), lambda match: 'ź'), + (re.compile(r'´\s*()*\s*Z', re.UNICODE), lambda match: 'Ź'), # ˆ - (re.compile(u'ˆ\s*()*\s*a', re.UNICODE), lambda match: u'â'), - (re.compile(u'ˆ\s*()*\s*A', re.UNICODE), lambda match: u'Â'), - (re.compile(u'ˆ\s*()*\s*e', re.UNICODE), lambda match: u'ê'), - (re.compile(u'ˆ\s*()*\s*E', re.UNICODE), lambda match: u'Ê'), - (re.compile(u'ˆ\s*()*\s*i', re.UNICODE), lambda match: u'î'), - (re.compile(u'ˆ\s*()*\s*I', re.UNICODE), lambda match: u'Î'), - (re.compile(u'ˆ\s*()*\s*o', re.UNICODE), lambda match: u'ô'), - (re.compile(u'ˆ\s*()*\s*O', re.UNICODE), lambda match: u'Ô'), - (re.compile(u'ˆ\s*()*\s*u', re.UNICODE), lambda match: u'û'), - (re.compile(u'ˆ\s*()*\s*U', re.UNICODE), lambda match: u'Û'), + (re.compile(r'ˆ\s*()*\s*a', re.UNICODE), lambda match: 'â'), + (re.compile(r'ˆ\s*()*\s*A', re.UNICODE), lambda match: 'Â'), + (re.compile(r'ˆ\s*()*\s*e', re.UNICODE), lambda match: 'ê'), + (re.compile(r'ˆ\s*()*\s*E', re.UNICODE), lambda match: 'Ê'), + (re.compile(r'ˆ\s*()*\s*i', re.UNICODE), lambda match: 'î'), + (re.compile(r'ˆ\s*()*\s*I', re.UNICODE), lambda match: 'Î'), + (re.compile(r'ˆ\s*()*\s*o', re.UNICODE), lambda match: 'ô'), + (re.compile(r'ˆ\s*()*\s*O', re.UNICODE), lambda match: 'Ô'), + (re.compile(r'ˆ\s*()*\s*u', re.UNICODE), lambda match: 'û'), + (re.compile(r'ˆ\s*()*\s*U', re.UNICODE), lambda match: 'Û'), # ¸ - (re.compile(u'¸\s*()*\s*c', re.UNICODE), lambda match: u'ç'), - (re.compile(u'¸\s*()*\s*C', re.UNICODE), lambda match: u'Ç'), + (re.compile(r'¸\s*()*\s*c', re.UNICODE), lambda match: 'ç'), + (re.compile(r'¸\s*()*\s*C', re.UNICODE), lambda match: 'Ç'), # ˛ - (re.compile(u'\s*˛\s*()*\s*a', re.UNICODE), lambda match: u'ą'), - (re.compile(u'\s*˛\s*()*\s*A', re.UNICODE), lambda match: u'Ą'), - (re.compile(u'˛\s*()*\s*e', re.UNICODE), lambda match: u'ę'), - (re.compile(u'˛\s*()*\s*E', re.UNICODE), lambda match: u'Ę'), + (re.compile(r'\s*˛\s*()*\s*a', re.UNICODE), lambda match: 'ą'), + (re.compile(r'\s*˛\s*()*\s*A', re.UNICODE), lambda match: 'Ą'), + (re.compile(r'˛\s*()*\s*e', re.UNICODE), lambda match: 'ę'), + (re.compile(r'˛\s*()*\s*E', re.UNICODE), lambda match: 'Ę'), # ˙ - (re.compile(u'˙\s*()*\s*z', re.UNICODE), lambda match: u'ż'), - (re.compile(u'˙\s*()*\s*Z', re.UNICODE), lambda match: u'Ż'), + (re.compile(r'˙\s*()*\s*z', re.UNICODE), lambda match: 'ż'), + (re.compile(r'˙\s*()*\s*Z', re.UNICODE), lambda match: 'Ż'), # ˇ - (re.compile(u'ˇ\s*()*\s*c', re.UNICODE), lambda match: u'č'), - (re.compile(u'ˇ\s*()*\s*C', re.UNICODE), lambda match: u'Č'), - (re.compile(u'ˇ\s*()*\s*d', re.UNICODE), lambda match: u'ď'), - (re.compile(u'ˇ\s*()*\s*D', re.UNICODE), lambda match: u'Ď'), - (re.compile(u'ˇ\s*()*\s*e', re.UNICODE), lambda match: u'ě'), - (re.compile(u'ˇ\s*()*\s*E', re.UNICODE), lambda match: u'Ě'), - (re.compile(u'ˇ\s*()*\s*l', re.UNICODE), lambda match: u'ľ'), - (re.compile(u'ˇ\s*()*\s*L', re.UNICODE), lambda match: u'Ľ'), - (re.compile(u'ˇ\s*()*\s*n', re.UNICODE), lambda match: u'ň'), - (re.compile(u'ˇ\s*()*\s*N', re.UNICODE), lambda match: u'Ň'), - (re.compile(u'ˇ\s*()*\s*r', re.UNICODE), lambda match: u'ř'), - (re.compile(u'ˇ\s*()*\s*R', re.UNICODE), lambda match: u'Ř'), - (re.compile(u'ˇ\s*()*\s*s', re.UNICODE), lambda match: u'š'), - (re.compile(u'ˇ\s*()*\s*S', re.UNICODE), lambda match: u'Š'), - (re.compile(u'ˇ\s*()*\s*t', re.UNICODE), lambda match: u'ť'), - (re.compile(u'ˇ\s*()*\s*T', re.UNICODE), lambda match: u'Ť'), - (re.compile(u'ˇ\s*()*\s*z', re.UNICODE), lambda match: u'ž'), - (re.compile(u'ˇ\s*()*\s*Z', re.UNICODE), lambda match: u'Ž'), + (re.compile(r'ˇ\s*()*\s*c', re.UNICODE), lambda match: 'č'), + (re.compile(r'ˇ\s*()*\s*C', re.UNICODE), lambda match: 'Č'), + (re.compile(r'ˇ\s*()*\s*d', re.UNICODE), lambda match: 'ď'), + (re.compile(r'ˇ\s*()*\s*D', re.UNICODE), lambda match: 'Ď'), + (re.compile(r'ˇ\s*()*\s*e', re.UNICODE), lambda match: 'ě'), + (re.compile(r'ˇ\s*()*\s*E', re.UNICODE), lambda match: 'Ě'), + (re.compile(r'ˇ\s*()*\s*l', re.UNICODE), lambda match: 'ľ'), + (re.compile(r'ˇ\s*()*\s*L', re.UNICODE), lambda match: 'Ľ'), + (re.compile(r'ˇ\s*()*\s*n', re.UNICODE), lambda match: 'ň'), + (re.compile(r'ˇ\s*()*\s*N', re.UNICODE), lambda match: 'Ň'), + (re.compile(r'ˇ\s*()*\s*r', re.UNICODE), lambda match: 'ř'), + (re.compile(r'ˇ\s*()*\s*R', re.UNICODE), lambda match: 'Ř'), + (re.compile(r'ˇ\s*()*\s*s', re.UNICODE), lambda match: 'š'), + (re.compile(r'ˇ\s*()*\s*S', re.UNICODE), lambda match: 'Š'), + (re.compile(r'ˇ\s*()*\s*t', re.UNICODE), lambda match: 'ť'), + (re.compile(r'ˇ\s*()*\s*T', re.UNICODE), lambda match: 'Ť'), + (re.compile(r'ˇ\s*()*\s*z', re.UNICODE), lambda match: 'ž'), + (re.compile(r'ˇ\s*()*\s*Z', re.UNICODE), lambda match: 'Ž'), # ° - (re.compile(u'°\s*()*\s*u', re.UNICODE), lambda match: u'ů'), - (re.compile(u'°\s*()*\s*U', re.UNICODE), lambda match: u'Ů'), + (re.compile(r'°\s*()*\s*u', re.UNICODE), lambda match: 'ů'), + (re.compile(r'°\s*()*\s*U', re.UNICODE), lambda match: 'Ů'), # If pdf printed from a browser then the header/footer has a reliable pattern (re.compile(r'((?<=)\s*file:/{2,4}[A-Z].*
|file:////?[A-Z].*
(?=\s*


))', re.IGNORECASE), lambda match: ''), # Center separator lines - (re.compile(u'
\s*(?P([*#•✦=] *){3,})\s*
'), lambda match: '

\n

' + match.group('break') + '

'), + (re.compile(r'
\s*(?P([*#•✦=] *){3,})\s*
'), lambda match: '

\n

' + match.group('break') + '

'), # Remove
tags (re.compile(r'', re.IGNORECASE), lambda match: ''), @@ -478,9 +492,9 @@ class HTMLPreProcessor(object): (re.compile(r'\s*'), lambda match : '

\n'), # Clean up spaces - (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), + (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics - (re.compile(u'(?'), lambda match: ' '), + (re.compile(r'(?'), lambda match: ' '), (re.compile(r'(?=\w)'), lambda match: ' '), ] @@ -490,9 +504,9 @@ class HTMLPreProcessor(object): (re.compile('
', re.IGNORECASE), lambda match : ' '), # Create header tags - (re.compile('<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), + (re.compile(r'<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), - (re.compile('<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), + (re.compile(r'<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), (re.compile('<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), lambda match : '

%s

'%(match.group(1),)), @@ -570,9 +584,11 @@ class HTMLPreProcessor(object): # delete soft hyphens - moved here so it's executed after header/footer removal if is_pdftohtml: # unwrap/delete soft hyphens - end_rules.append((re.compile(u'[­](

\s*

\s*)+\s*(?=[[a-z\d])'), lambda match: '')) + end_rules.append((re.compile( + r'[­](

\s*

\s*)+\s*(?=[\[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens with formatting - end_rules.append((re.compile(u'[­]\s*()+(

\s*

\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) + end_rules.append((re.compile( + r'[­]\s*()+(

\s*

\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: '')) length = -1 if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: @@ -581,13 +597,14 @@ class HTMLPreProcessor(object): if length: # print "The pdf line length returned is " + str(length) # unwrap em/en dashes - end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*

\s*(?=[[a-z\d])' % length), lambda match: '')) + end_rules.append((re.compile( + r'(?<=.{%i}[–—])\s*

\s*(?=[\[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation (re.compile(( - u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IA\u00DF]' - u'|(?)?\s*(

\s*

\s*)+\s*(?=(<(i|b|u)>)?' - u'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines), + r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IA\u00DF]' + r'|(?)?\s*(

\s*

\s*)+\s*(?=(<(i|b|u)>)?' + r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: @@ -657,7 +674,7 @@ class HTMLPreProcessor(object): from calibre.utils.localization import get_udc from calibre.utils.mreplace import MReplace unihandecoder = get_udc() - mr = MReplace(data={u'«':u'<'*3, u'»':u'>'*3}) + mr = MReplace(data={'«':'<'*3, '»':'>'*3}) html = mr.mreplace(html) html = unihandecoder.decode(html) @@ -675,7 +692,7 @@ class HTMLPreProcessor(object): try: unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars except AttributeError: - unsupported_unicode_chars = u'' + unsupported_unicode_chars = '' if unsupported_unicode_chars: from calibre.utils.localization import get_udc unihandecoder = get_udc()