mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

commit f6de0bef13 (parent 105d490c01)

    replaced messed up rtf file
@@ -1,390 +1,344 @@
-#!/usr/bin/env python
-# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-from __future__ import with_statement
-
-__license__   = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-import functools, re
-
-from calibre import entity_to_unicode
-
-XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
-SVG_NS     = 'http://www.w3.org/2000/svg'
-XLINK_NS   = 'http://www.w3.org/1999/xlink'
-
-convert_entities = functools.partial(entity_to_unicode,
-        result_exceptions = {
-            u'<' : '&lt;',
-            u'>' : '&gt;',
-            u"'" : '&apos;',
-            u'"' : '&quot;',
-            u'&' : '&amp;',
-        })
-_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
-
-LIGATURES = {
-#        u'\u00c6': u'AE',
-#        u'\u00e6': u'ae',
-#        u'\u0152': u'OE',
-#        u'\u0153': u'oe',
-#        u'\u0132': u'IJ',
-#        u'\u0133': u'ij',
-#        u'\u1D6B': u'ue',
-        u'\uFB00': u'ff',
-        u'\uFB01': u'fi',
-        u'\uFB02': u'fl',
-        u'\uFB03': u'ffi',
-        u'\uFB04': u'ffl',
-        u'\uFB05': u'ft',
-        u'\uFB06': u'st',
-        }
-
-_ligpat = re.compile(u'|'.join(LIGATURES))
-
-def sanitize_head(match):
-    x = match.group(1)
-    x = _span_pat.sub('', x)
-    return '<head>\n%s\n</head>' % x
-
-def chap_head(match):
-    chap = match.group('chap')
-    title = match.group('title')
-    if not title:
-        return '<h1>'+chap+'</h1><br/>\n'
-    else:
-        return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'
-
-def wrap_lines(match):
-    ital = match.group('ital')
-    if not ital:
-        return ' '
-    else:
-        return ital+' '
-
-def line_length(format, raw, percent):
-    '''
-    raw is the raw text to find the line length to use for wrapping.
-    percentage is a decimal number, 0 - 1 which is used to determine
-    how far in the list of line lengths to use. The list of line lengths is
-    ordered smallest to largest and does not include duplicates. 0.5 is the
-    median value.
-    '''
-    raw = raw.replace('&nbsp;', ' ')
-    if format == 'html':
-        linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
-    elif format == 'pdf':
-        linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
-    lines = linere.findall(raw)
-    print "percent is " + str(percent)
-
-    lengths = []
-    for line in lines:
-        if len(line) > 0:
-            lengths.append(len(line))
-
-    if not lengths:
-        return 0
-
-    lengths = list(set(lengths))
-    total = sum(lengths)
-    avg = total / len(lengths)
-    max_line = avg * 2
-
-    lengths = sorted(lengths)
-    for i in range(len(lengths) - 1, -1, -1):
-        if lengths[i] > max_line:
-            del lengths[i]
-
-    if percent > 1:
-        percent = 1
-    if percent < 0:
-        percent = 0
-
-    index = int(len(lengths) * percent) - 1
-
-    return lengths[index]
-
-
-class CSSPreProcessor(object):
-
-    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
-
-    def __call__(self, data, add_namespace=False):
-        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
-        data = self.PAGE_PAT.sub('', data)
-        if not add_namespace:
-            return data
-        ans, namespaced = [], False
-        for line in data.splitlines():
-            ll = line.lstrip()
-            if not (namespaced or ll.startswith('@import') or
-                        ll.startswith('@charset')):
-                ans.append(XHTML_CSS_NAMESPACE.strip())
-                namespaced = True
-            ans.append(line)
-        return u'\n'.join(ans)
-
-class HTMLPreProcessor(object):
-
-    PREPROCESS = [
-                  # Some idiotic HTML generators (Frontpage I'm looking at you)
-                  # Put all sorts of crap into <head>. This messes up lxml
-                  (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
-                   sanitize_head),
-                  # Convert all entities, since lxml doesn't handle them well
-                  (re.compile(r'&(\S+?);'), convert_entities),
-                  # Remove the <![if/endif tags inserted by everybody's darling, MS Word
-                  (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
-                   lambda match: ''),
-                  ]
-
-    # Fix pdftohtml markup
-    PDFTOHTML  = [
-                  # Fix umlauts
-                  # ¨
-                  (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ë'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ï'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ö'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ö'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ü'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ü'),
-
-                  # Fix accents
-                  # `
-                  (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ò'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
-
-                  # ´
-                  (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ć'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ć'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'é'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ú'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ź'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ź'),
-
-                  # ˆ
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'â'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Â'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ê'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ê'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'î'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Î'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ô'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ô'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'û'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Û'),
-
-                  # ¸
-                  (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
-                  (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),
-
-                  # ˛
-                  (re.compile(u'˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'),
-                  (re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'),
-                  (re.compile(u'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ę'),
-                  (re.compile(u'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ę'),
-
-                  # ˙
-                  (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
-                  (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
-
-                  # If pdf printed from a browser then the header/footer has a reliable pattern
-                  (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
-
-                  # Center separator lines
-                  (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
-
-                  # Remove page links
-                  (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
-                  # Remove <hr> tags
-                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
-                  # Replace <br><br> with <p>
-                  # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),
-
-                  # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
-                  (re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
-
-                  # Remove gray background
-                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
-
-                  # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
-
-                  # Have paragraphs show better
-                  (re.compile(r'<br.*?>'), lambda match : '<p>'),
-                  # Clean up spaces
-                  (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
-                  # Add space before and after italics
-                  (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
-                  (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
-                 ]
-
-    # Fix Book Designer markup
-    BOOK_DESIGNER = [
-                     # HR
-                     (re.compile('<hr>', re.IGNORECASE),
-                      lambda match : '<span style="page-break-after:always"> </span>'),
-                     # Create header tags
-                     (re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
-                      lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
-                     (re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
-                      lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
-                     (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
-                      lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
-                     (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
-                      lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
-                     ]
-
-    def __init__(self, input_plugin_preprocess, plugin_preprocess,
-            extra_opts=None):
-        self.input_plugin_preprocess = input_plugin_preprocess
-        self.plugin_preprocess = plugin_preprocess
-        self.extra_opts = extra_opts
-
-    def is_baen(self, src):
-        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
-                          re.IGNORECASE).search(src) is not None
-
-    def is_book_designer(self, raw):
-        return re.search('<H2[^><]*id=BookTitle', raw) is not None
-
-    def is_pdftohtml(self, src):
-        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
-
-    def __call__(self, html, remove_special_chars=None,
-            get_preprocess_html=False):
-        if remove_special_chars is not None:
-            html = remove_special_chars.sub('', html)
-        html = html.replace('\0', '')
-        is_pdftohtml = self.is_pdftohtml(html)
-        if self.is_baen(html):
-            rules = []
-        elif self.is_book_designer(html):
-            rules = self.BOOK_DESIGNER
-        elif is_pdftohtml:
-            rules = self.PDFTOHTML
-        else:
-            rules = []
-
-        start_rules = []
-        if is_pdftohtml:
-            # Remove non breaking spaces
-            start_rules.append((re.compile(ur'\u00a0'), lambda match : ' '))
-
-        if not getattr(self.extra_opts, 'keep_ligatures', False):
-            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
-
-        end_rules = []
-        if getattr(self.extra_opts, 'remove_header', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.header_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_header regexp'
-                traceback.print_exc()
-
-        if getattr(self.extra_opts, 'remove_footer', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.footer_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_footer regexp'
-                traceback.print_exc()
-
-        # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
-        if getattr(self.extra_opts, 'preprocess_html', None):
-            if is_pdftohtml:
-                end_rules.append(
-                    (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
-                )
-
-        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
-            length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
-            if length:
-                print "The pdf line length returned is " + str(length)
-                end_rules.append(
-                    # Un wrap using punctuation
-                    (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
-                )
-
-        for rule in self.PREPROCESS + start_rules:
-            html = rule[0].sub(rule[1], html)
-
-        if get_preprocess_html:
-            return html
-
-        def dump(raw, where):
-            import os
-            dp = getattr(self.extra_opts, 'debug_pipeline', None)
-            if dp and os.path.exists(dp):
-                odir = os.path.join(dp, 'input')
-                if os.path.exists(odir):
-                    odir = os.path.join(odir, where)
-                    if not os.path.exists(odir):
-                        os.makedirs(odir)
-                    name, i = None, 0
-                    while not name or os.path.exists(os.path.join(odir, name)):
-                        i += 1
-                        name = '%04d.html'%i
-                    with open(os.path.join(odir, name), 'wb') as f:
-                        f.write(raw.encode('utf-8'))
-
-        #dump(html, 'pre-preprocess')
-
-        for rule in rules + end_rules:
-            html = rule[0].sub(rule[1], html)
-
-        #dump(html, 'post-preprocess')
-
-        # Handle broken XHTML w/ SVG (ugh)
-        if 'svg:' in html and SVG_NS not in html:
-            html = html.replace(
-                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
-        if 'xlink:' in html and XLINK_NS not in html:
-            html = html.replace(
-                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
-
-        html = XMLDECL_RE.sub('', html)
-
-        if getattr(self.extra_opts, 'asciiize', False):
-            from calibre.ebooks.unidecode.unidecoder import Unidecoder
-            unidecoder = Unidecoder()
-            html = unidecoder.decode(html)
-
-        if self.plugin_preprocess:
-            html = self.input_plugin_preprocess(html)
-
-        return html
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Gerendi Sandor Attila'
+__docformat__ = 'restructuredtext en'
+
+"""
+RTF tokenizer and token parser. v.1.0 (1/17/2010)
+Author: Gerendi Sandor Attila
+
+At this point this will tokenize a RTF file then rebuild it from the tokens.
+In the process the UTF8 tokens are altered to be supported by the RTF2XML and also remain RTF specification compliant.
+"""
+
+class tokenDelimitatorStart():
+    def __init__(self):
+        pass
+    def toRTF(self):
+        return b'{'
+    def __repr__(self):
+        return '{'
+
+class tokenDelimitatorEnd():
+    def __init__(self):
+        pass
+    def toRTF(self):
+        return b'}'
+    def __repr__(self):
+        return '}'
+
+class tokenControlWord():
+    def __init__(self, name, separator = ''):
+        self.name = name
+        self.separator = separator
+    def toRTF(self):
+        return self.name + self.separator
+    def __repr__(self):
+        return self.name + self.separator
+
+class tokenControlWordWithNumericArgument():
+    def __init__(self, name, argument, separator = ''):
+        self.name = name
+        self.argument = argument
+        self.separator = separator
+    def toRTF(self):
+        return self.name + repr(self.argument) + self.separator
+    def __repr__(self):
+        return self.name + repr(self.argument) + self.separator
+
+class tokenControlSymbol():
+    def __init__(self, name):
+        self.name = name
+    def toRTF(self):
+        return self.name
+    def __repr__(self):
+        return self.name
+
+class tokenData():
+    def __init__(self, data):
+        self.data = data
+    def toRTF(self):
+        return self.data
+    def __repr__(self):
+        return self.data
+
+class tokenBinN():
+    def __init__(self, data, separator = ''):
+        self.data = data
+        self.separator = separator
+    def toRTF(self):
+        return "\\bin" + repr(len(self.data)) + self.separator + self.data
+    def __repr__(self):
+        return "\\bin" + repr(len(self.data)) + self.separator + self.data
+
+class token8bitChar():
+    def __init__(self, data):
+        self.data = data
+    def toRTF(self):
+        return "\\'" + self.data
+    def __repr__(self):
+        return "\\'" + self.data
+
+class tokenUnicode():
+    def __init__(self, data, separator = '', current_ucn = 1, eqList = []):
+        self.data = data
+        self.separator = separator
+        self.current_ucn = current_ucn
+        self.eqList = eqList
+    def toRTF(self):
+        result = '\\u' + repr(self.data) + ' '
+        ucn = self.current_ucn
+        if len(self.eqList) < ucn:
+            ucn = len(self.eqList)
+            result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result
+        i = 0
+        for eq in self.eqList:
+            if i >= ucn:
+                break
+            result = result + eq.toRTF()
+        return result
+    def __repr__(self):
+        return '\\u' + repr(self.data)
+
+def isAsciiLetter(value):
+    return ((value >= 'a') and (value <= 'z')) or ((value >= 'A') and (value <= 'Z'))
+
+def isDigit(value):
+    return (value >= '0') and (value <= '9')
+
+def isChar(value, char):
+    return value == char
+
+def isString(buffer, string):
+    return buffer == string
+
+class RtfTokenParser():
+    def __init__(self, tokens):
+        self.tokens = tokens
+        self.process()
+        self.processUnicode()
+
+    def process(self):
+        i = 0
+        newTokens = []
+        while i < len(self.tokens):
+            if isinstance(self.tokens[i], tokenControlSymbol):
+                if isString(self.tokens[i].name, "\\'"):
+                    i = i + 1
+                    if not isinstance(self.tokens[i], tokenData):
+                        raise Exception('Error: token8bitChar without data.')
+                    if len(self.tokens[i].data) < 2:
+                        raise Exception('Error: token8bitChar without data.')
+                    newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
+                    if len(self.tokens[i].data) > 2:
+                        newTokens.append(tokenData(self.tokens[i].data[2:]))
+                    i = i + 1
+                    continue
+
+            newTokens.append(self.tokens[i])
+            i = i + 1
+
+        self.tokens = list(newTokens)
+
+    def processUnicode(self):
+        i = 0
+        newTokens = []
+        ucNbStack = [1]
+        while i < len(self.tokens):
+            if isinstance(self.tokens[i], tokenDelimitatorStart):
+                ucNbStack.append(ucNbStack[len(ucNbStack) - 1])
+                newTokens.append(self.tokens[i])
+                i = i + 1
+                continue
+            if isinstance(self.tokens[i], tokenDelimitatorEnd):
+                ucNbStack.pop()
+                newTokens.append(self.tokens[i])
+                i = i + 1
+                continue
+            if isinstance(self.tokens[i], tokenControlWordWithNumericArgument):
+                if isString(self.tokens[i].name, '\\uc'):
+                    ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument
+                    newTokens.append(self.tokens[i])
+                    i = i + 1
+                    continue
+                if isString(self.tokens[i].name, '\\u'):
+                    x = i
+                    j = 0
+                    i = i + 1
+                    replace = []
+                    partialData = None
+                    ucn = ucNbStack[len(ucNbStack) - 1]
+                    while (i < len(self.tokens)) and (j < ucn):
+                        if isinstance(self.tokens[i], tokenDelimitatorStart):
+                            break
+                        if isinstance(self.tokens[i], tokenDelimitatorEnd):
+                            break
+                        if isinstance(self.tokens[i], tokenData):
+                            if len(self.tokens[i].data) >= ucn - j:
+                                replace.append(tokenData(self.tokens[i].data[0 : ucn - j]))
+                                if len(self.tokens[i].data) > ucn - j:
+                                    partialData = tokenData(self.tokens[i].data[ucn - j:])
+                                i = i + 1
+                                break
+                            else:
+                                replace.append(self.tokens[i])
+                                j = j + len(self.tokens[i].data)
+                                i = i + 1
+                                continue
+                        if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN):
+                            replace.append(self.tokens[i])
+                            i = i + 1
+                            j = j + 1
+                            continue
+                        raise Exception('Error: incorrect utf replacement.')
+
+                    #calibre rtf2xml does not support utfreplace
+                    replace = []
+
+                    newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace))
+                    if partialData != None:
+                        newTokens.append(partialData)
+                    continue
+
+            newTokens.append(self.tokens[i])
+            i = i + 1
+
+        self.tokens = list(newTokens)
+
+    def toRTF(self):
+        result = []
+        for token in self.tokens:
+            result.append(token.toRTF())
+        return "".join(result)
+
+class RtfTokenizer():
+    def __init__(self, rtfData):
+        self.rtfData = []
+        self.tokens = []
+        self.rtfData = rtfData
+        self.tokenize()
+
+    def tokenize(self):
+        i = 0
+        lastDataStart = -1
+        while i < len(self.rtfData):
+
+            if isChar(self.rtfData[i], '{'):
+                if lastDataStart > -1:
+                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+                    lastDataStart = -1
+                self.tokens.append(tokenDelimitatorStart())
+                i = i + 1
+                continue
+
+            if isChar(self.rtfData[i], '}'):
+                if lastDataStart > -1:
+                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+                    lastDataStart = -1
+                self.tokens.append(tokenDelimitatorEnd())
+                i = i + 1
+                continue
+
+            if isChar(self.rtfData[i], '\\'):
+                if i + 1 >= len(self.rtfData):
+                    raise Exception('Error: Control character found at the end of the document.')
+
+                if lastDataStart > -1:
+                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+                    lastDataStart = -1
+
+                tokenStart = i
+                i = i + 1
+
+                #Control Words
+                if isAsciiLetter(self.rtfData[i]):
+                    #consume <ASCII Letter Sequence>
+                    consumed = False
+                    while i < len(self.rtfData):
+                        if not isAsciiLetter(self.rtfData[i]):
+                            tokenEnd = i
+                            consumed = True
+                            break
+                        i = i + 1
+
+                    if not consumed:
+                        raise Exception('Error (at:%d): Control Word without end.'%(tokenStart))
+
+                    #we have numeric argument before delimiter
+                    if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]):
+                        #consume the numeric argument
+                        consumed = False
+                        l = 0
+                        while i < len(self.rtfData):
+                            if not isDigit(self.rtfData[i]):
+                                consumed = True
+                                break
+                            l = l + 1
+                            i = i + 1
+                            if l > 10:
+                                raise Exception('Error (at:%d): Too many digits in control word numeric argument.'%(tokenStart))
+
+                        if not consumed:
+                            raise Exception('Error (at:%d): Control Word without numeric argument end.'%(tokenStart))
+
+                    separator = ''
+                    if isChar(self.rtfData[i], ' '):
+                        separator = ' '
+
+                    controlWord = self.rtfData[tokenStart: tokenEnd]
+                    if tokenEnd < i:
+                        value = int(self.rtfData[tokenEnd: i])
+                        if isString(controlWord, "\\bin"):
+                            i = i + value
+                            self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator))
+                        else:
+                            self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator))
+                    else:
+                        self.tokens.append(tokenControlWord(controlWord, separator))
+                    #space delimiter, we should discard it
+                    if self.rtfData[i] == ' ':
+                        i = i + 1
+
+                #Control Symbol
+                else:
+                    self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1]))
+                    i = i + 1
+                continue
+
+            if lastDataStart < 0:
+                lastDataStart = i
+            i = i + 1
+
+    def toRTF(self):
+        result = []
+        for token in self.tokens:
+            result.append(token.toRTF())
+        return "".join(result)
+
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) < 2:
+        print ("Usage %prog rtfFileToConvert")
+        sys.exit()
+    f = open(sys.argv[1], 'rb')
+    data = f.read()
+    f.close()
+
+    tokenizer = RtfTokenizer(data)
+    parsedTokens = RtfTokenParser(tokenizer.tokens)
+
+    data = parsedTokens.toRTF()
+
+    f = open(sys.argv[1], 'w')
+    f.write(data)
+    f.close()
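For reference, the added module is driven the same way its __main__ block drives it: tokenize, parse, re-emit. Below is a minimal round-trip sketch; the sample RTF string and variable names are illustrative and not part of the commit, and it assumes the classes above are in scope (the file path is not shown on this page). It also shows the effect of the Unicode pass: a \uN escape is re-emitted behind an explicit \uc0, so calibre's rtf2xml never has to skip replacement characters it does not understand.

    # Minimal usage sketch (illustrative input, not from the commit)
    rtf = r"{\rtf1\ansi\uc1 Hello \u224 a world}"

    tokenizer = RtfTokenizer(rtf)               # lexical pass: {, }, control words/symbols, raw data
    parsed = RtfTokenParser(tokenizer.tokens)   # folds \'xx pairs into token8bitChar, rewrites \uN
    print(parsed.toRTF())                       # {\rtf1\ansi\uc1 Hello \uc0\u224  world}

The one-character replacement "a" after \u224 is dropped and \uc0 declares that nothing follows the escape, which keeps the output within the RTF specification while staying digestible for rtf2xml.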