mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
py3: Port preprocess.py
Now uses unicode literals and also fix various regexps that failed to compile or raised future warnings under py3
This commit is contained in:
parent
ed9b4fe49b
commit
66c0721ba7
@ -1,15 +1,16 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
# flake8: noqa
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import functools, re, json
|
||||
from math import ceil
|
||||
|
||||
from calibre import entity_to_unicode, as_unicode
|
||||
from polyglot.builtins import unicode_type
|
||||
from polyglot.builtins import unicode_type, range
|
||||
|
||||
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
||||
SVG_NS = 'http://www.w3.org/2000/svg'
|
||||
@ -17,32 +18,32 @@ XLINK_NS = 'http://www.w3.org/1999/xlink'
|
||||
|
||||
convert_entities = functools.partial(entity_to_unicode,
|
||||
result_exceptions={
|
||||
u'<' : '<',
|
||||
u'>' : '>',
|
||||
u"'" : ''',
|
||||
u'"' : '"',
|
||||
u'&' : '&',
|
||||
'<' : '<',
|
||||
'>' : '>',
|
||||
"'" : ''',
|
||||
'"' : '"',
|
||||
'&' : '&',
|
||||
})
|
||||
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
|
||||
|
||||
LIGATURES = {
|
||||
# u'\u00c6': u'AE',
|
||||
# u'\u00e6': u'ae',
|
||||
# u'\u0152': u'OE',
|
||||
# u'\u0153': u'oe',
|
||||
# u'\u0132': u'IJ',
|
||||
# u'\u0133': u'ij',
|
||||
# u'\u1D6B': u'ue',
|
||||
u'\uFB00': u'ff',
|
||||
u'\uFB01': u'fi',
|
||||
u'\uFB02': u'fl',
|
||||
u'\uFB03': u'ffi',
|
||||
u'\uFB04': u'ffl',
|
||||
u'\uFB05': u'ft',
|
||||
u'\uFB06': u'st',
|
||||
# '\u00c6': 'AE',
|
||||
# '\u00e6': 'ae',
|
||||
# '\u0152': 'OE',
|
||||
# '\u0153': 'oe',
|
||||
# '\u0132': 'IJ',
|
||||
# '\u0133': 'ij',
|
||||
# '\u1D6B': 'ue',
|
||||
'\uFB00': 'ff',
|
||||
'\uFB01': 'fi',
|
||||
'\uFB02': 'fl',
|
||||
'\uFB03': 'ffi',
|
||||
'\uFB04': 'ffl',
|
||||
'\uFB05': 'ft',
|
||||
'\uFB06': 'st',
|
||||
}
|
||||
|
||||
_ligpat = re.compile(u'|'.join(LIGATURES))
|
||||
_ligpat = re.compile('|'.join(LIGATURES))
|
||||
|
||||
|
||||
def sanitize_head(match):
|
||||
@ -96,9 +97,9 @@ class DocAnalysis(object):
|
||||
def __init__(self, format='html', raw=''):
|
||||
raw = raw.replace(' ', ' ')
|
||||
if format == 'html':
|
||||
linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
|
||||
linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
|
||||
elif format == 'pdf':
|
||||
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
|
||||
linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
|
||||
elif format == 'spanned_html':
|
||||
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
|
||||
elif format == 'txt':
|
||||
@ -110,7 +111,7 @@ class DocAnalysis(object):
|
||||
Analyses the document to find the median line length.
|
||||
percentage is a decimal number, 0 - 1 which is used to determine
|
||||
how far in the list of line lengths to use. The list of line lengths is
|
||||
ordered smallest to larged and does not include duplicates. 0.5 is the
|
||||
ordered smallest to largest and does not include duplicates. 0.5 is the
|
||||
median value.
|
||||
'''
|
||||
lengths = []
|
||||
@ -124,7 +125,7 @@ class DocAnalysis(object):
|
||||
lengths = list(set(lengths))
|
||||
total = sum(lengths)
|
||||
avg = total / len(lengths)
|
||||
max_line = avg * 2
|
||||
max_line = ceil(avg * 2)
|
||||
|
||||
lengths = sorted(lengths)
|
||||
for i in range(len(lengths) - 1, -1, -1):
|
||||
@ -204,7 +205,10 @@ class Dehyphenator(object):
|
||||
# Add common suffixes to the regex below to increase the likelihood of a match -
|
||||
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
||||
# only remove if it's not already the point of hyphenation
|
||||
self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$" # noqa
|
||||
self.suffix_string = (
|
||||
"((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
|
||||
"(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
|
||||
"(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
|
||||
self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
|
||||
self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
|
||||
# remove prefixes if the prefix was not already the point of hyphenation
|
||||
@ -245,7 +249,7 @@ class Dehyphenator(object):
|
||||
else:
|
||||
if self.verbose > 2:
|
||||
self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
|
||||
return firsthalf+u'\u2014'+wraptags+secondhalf
|
||||
return firsthalf+'\u2014'+wraptags+secondhalf
|
||||
|
||||
else:
|
||||
if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
|
||||
@ -269,17 +273,27 @@ class Dehyphenator(object):
|
||||
self.html = html
|
||||
self.format = format
|
||||
if format == 'html':
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length) # noqa
|
||||
intextmatch = re.compile((
|
||||
r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?'
|
||||
r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)'
|
||||
r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length)
|
||||
elif format == 'pdf':
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
|
||||
intextmatch = re.compile((
|
||||
r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<wraptags><p>|'
|
||||
r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
|
||||
elif format == 'txt':
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length) # noqa
|
||||
intextmatch = re.compile(
|
||||
'(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
|
||||
elif format == 'individual_words':
|
||||
intextmatch = re.compile(u'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
|
||||
intextmatch = re.compile(
|
||||
r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
|
||||
elif format == 'html_cleanup':
|
||||
intextmatch = re.compile(u'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)') # noqa
|
||||
intextmatch = re.compile(
|
||||
r'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>'
|
||||
r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
|
||||
elif format == 'txt_cleanup':
|
||||
intextmatch = re.compile(u'(?P<firstpart>[^\W\-]+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
|
||||
intextmatch = re.compile(
|
||||
r'(?P<firstpart>[^\W\-]+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
|
||||
|
||||
html = intextmatch.sub(self.dehyphenate, html)
|
||||
return html
|
||||
@ -316,7 +330,7 @@ class CSSPreProcessor(object):
|
||||
# are commented lines before the first @import or @charset rule. Since
|
||||
# the conversion will remove all stylesheets anyway, we don't lose
|
||||
# anything
|
||||
data = re.sub(unicode_type(r'/\*.*?\*/'), u'', data, flags=re.DOTALL)
|
||||
data = re.sub(unicode_type(r'/\*.*?\*/'), '', data, flags=re.DOTALL)
|
||||
|
||||
ans, namespaced = [], False
|
||||
for line in data.splitlines():
|
||||
@ -327,7 +341,7 @@ class CSSPreProcessor(object):
|
||||
namespaced = True
|
||||
ans.append(line)
|
||||
|
||||
return u'\n'.join(ans)
|
||||
return '\n'.join(ans)
|
||||
|
||||
|
||||
class HTMLPreProcessor(object):
|
||||
@ -350,121 +364,121 @@ class HTMLPreProcessor(object):
|
||||
# Fix pdftohtml markup
|
||||
PDFTOHTML = [
|
||||
# Fix umlauts
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ë'),
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ï'),
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ö'),
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ö'),
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ü'),
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ü'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ä'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ä'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ë'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ë'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ï'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ï'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ö'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ö'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ü'),
|
||||
(re.compile(r'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ü'),
|
||||
|
||||
# Fix accents
|
||||
# `
|
||||
(re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'),
|
||||
(re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'),
|
||||
(re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'),
|
||||
(re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'),
|
||||
(re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'),
|
||||
(re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'),
|
||||
(re.compile(u'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ò'),
|
||||
(re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
|
||||
(re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
|
||||
(re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'à'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'À'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'è'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'È'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'ì'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Ì'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ò'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ò'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ù'),
|
||||
(re.compile(r'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ù'),
|
||||
|
||||
# ` with letter before
|
||||
(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
|
||||
(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
|
||||
(re.compile(u'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'è'),
|
||||
(re.compile(u'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'È'),
|
||||
(re.compile(u'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ì'),
|
||||
(re.compile(u'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ì'),
|
||||
(re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'),
|
||||
(re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'),
|
||||
(re.compile(u'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ù'),
|
||||
(re.compile(u'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ù'),
|
||||
(re.compile(r'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'à'),
|
||||
(re.compile(r'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'À'),
|
||||
(re.compile(r'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'è'),
|
||||
(re.compile(r'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'È'),
|
||||
(re.compile(r'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ì'),
|
||||
(re.compile(r'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ì'),
|
||||
(re.compile(r'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ò'),
|
||||
(re.compile(r'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ò'),
|
||||
(re.compile(r'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'ù'),
|
||||
(re.compile(r'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: 'Ù'),
|
||||
|
||||
# ´
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ć'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ć'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'é'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ĺ'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ĺ'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ŕ'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ŕ'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ú'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ź'),
|
||||
(re.compile(u'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ź'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'á'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Á'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ć'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ć'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'é'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'É'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'í'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Í'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ĺ'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ĺ'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ó'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ó'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ń'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ń'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ŕ'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ŕ'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'ś'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Ś'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ú'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ú'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ź'),
|
||||
(re.compile(r'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ź'),
|
||||
|
||||
# ˆ
|
||||
(re.compile(u'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'â'),
|
||||
(re.compile(u'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Â'),
|
||||
(re.compile(u'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ê'),
|
||||
(re.compile(u'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ê'),
|
||||
(re.compile(u'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'î'),
|
||||
(re.compile(u'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Î'),
|
||||
(re.compile(u'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ô'),
|
||||
(re.compile(u'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ô'),
|
||||
(re.compile(u'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'û'),
|
||||
(re.compile(u'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Û'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'â'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Â'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ê'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ê'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: 'î'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: 'Î'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: 'ô'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: 'Ô'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'û'),
|
||||
(re.compile(r'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Û'),
|
||||
|
||||
# ¸
|
||||
(re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
|
||||
(re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),
|
||||
(re.compile(r'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'ç'),
|
||||
(re.compile(r'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Ç'),
|
||||
|
||||
# ˛
|
||||
(re.compile(u'\s*˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'),
|
||||
(re.compile(u'\s*˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'),
|
||||
(re.compile(u'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ę'),
|
||||
(re.compile(u'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ę'),
|
||||
(re.compile(r'\s*˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: 'ą'),
|
||||
(re.compile(r'\s*˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: 'Ą'),
|
||||
(re.compile(r'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ę'),
|
||||
(re.compile(r'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ę'),
|
||||
|
||||
# ˙
|
||||
(re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
|
||||
(re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
|
||||
(re.compile(r'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ż'),
|
||||
(re.compile(r'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ż'),
|
||||
|
||||
# ˇ
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'č'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Č'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: u'ď'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: u'Ď'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ě'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ě'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ľ'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ľ'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ň'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ň'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ř'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ř'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'š'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Š'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: u'ť'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: u'Ť'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ž'),
|
||||
(re.compile(u'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ž'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 'č'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: 'Č'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: 'ď'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: 'Ď'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: 'ě'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: 'Ě'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: 'ľ'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: 'Ľ'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: 'ň'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: 'Ň'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: 'ř'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: 'Ř'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: 'š'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: 'Š'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: 'ť'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: 'Ť'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: 'ž'),
|
||||
(re.compile(r'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: 'Ž'),
|
||||
|
||||
# °
|
||||
(re.compile(u'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ů'),
|
||||
(re.compile(u'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ů'),
|
||||
(re.compile(r'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: 'ů'),
|
||||
(re.compile(r'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: 'Ů'),
|
||||
|
||||
# If pdf printed from a browser then the header/footer has a reliable pattern
|
||||
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
|
||||
|
||||
# Center separator lines
|
||||
(re.compile(u'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
|
||||
(re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
|
||||
|
||||
# Remove <hr> tags
|
||||
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
|
||||
@ -478,9 +492,9 @@ class HTMLPreProcessor(object):
|
||||
(re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'),
|
||||
|
||||
# Clean up spaces
|
||||
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
||||
(re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
||||
# Add space before and after italics
|
||||
(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
|
||||
(re.compile(r'(?<!“)<i>'), lambda match: ' <i>'),
|
||||
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
|
||||
]
|
||||
|
||||
@ -490,9 +504,9 @@ class HTMLPreProcessor(object):
|
||||
(re.compile('<hr>', re.IGNORECASE),
|
||||
lambda match : '<span style="page-break-after:always"> </span>'),
|
||||
# Create header tags
|
||||
(re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
|
||||
(re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
|
||||
lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
|
||||
(re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
|
||||
(re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
|
||||
lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
|
||||
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||
lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
|
||||
@ -570,9 +584,11 @@ class HTMLPreProcessor(object):
|
||||
# delete soft hyphens - moved here so it's executed after header/footer removal
|
||||
if is_pdftohtml:
|
||||
# unwrap/delete soft hyphens
|
||||
end_rules.append((re.compile(u'[](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||
end_rules.append((re.compile(
|
||||
r'[](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: ''))
|
||||
# unwrap/delete soft hyphens with formatting
|
||||
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||
end_rules.append((re.compile(
|
||||
r'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: ''))
|
||||
|
||||
length = -1
|
||||
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
||||
@ -581,13 +597,14 @@ class HTMLPreProcessor(object):
|
||||
if length:
|
||||
# print "The pdf line length returned is " + str(length)
|
||||
# unwrap em/en dashes
|
||||
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
||||
end_rules.append((re.compile(
|
||||
r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
|
||||
end_rules.append(
|
||||
# Un wrap using punctuation
|
||||
(re.compile((
|
||||
u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IA\u00DF]'
|
||||
u'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
|
||||
u'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
|
||||
r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IA\u00DF]'
|
||||
r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
|
||||
r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
|
||||
)
|
||||
|
||||
for rule in self.PREPROCESS + start_rules:
|
||||
@ -657,7 +674,7 @@ class HTMLPreProcessor(object):
|
||||
from calibre.utils.localization import get_udc
|
||||
from calibre.utils.mreplace import MReplace
|
||||
unihandecoder = get_udc()
|
||||
mr = MReplace(data={u'«':u'<'*3, u'»':u'>'*3})
|
||||
mr = MReplace(data={'«':'<'*3, '»':'>'*3})
|
||||
html = mr.mreplace(html)
|
||||
html = unihandecoder.decode(html)
|
||||
|
||||
@ -675,7 +692,7 @@ class HTMLPreProcessor(object):
|
||||
try:
|
||||
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
|
||||
except AttributeError:
|
||||
unsupported_unicode_chars = u''
|
||||
unsupported_unicode_chars = ''
|
||||
if unsupported_unicode_chars:
|
||||
from calibre.utils.localization import get_udc
|
||||
unihandecoder = get_udc()
|
||||
|
Loading…
x
Reference in New Issue
Block a user