mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
RTF Input: Support for unicode characters. Fixes #4501 (Unicode escaped RTF to XML problem)
This commit is contained in:
parent
c6692c859e
commit
03714a978f
@ -169,6 +169,21 @@ class RTFInput(InputFormatPlugin):
|
||||
with open('styles.css', 'ab') as f:
|
||||
f.write(css)
|
||||
|
||||
def preprocess(self, fname):
|
||||
self.log('\tPreprocessing to convert unicode characters')
|
||||
try:
|
||||
data = open(fname, 'rb').read()
|
||||
from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
|
||||
tokenizer = RtfTokenizer(data)
|
||||
tokens = RtfTokenParser(tokenizer.tokens)
|
||||
data = tokens.toRTF()
|
||||
fname = 'preprocessed.rtf'
|
||||
with open(fname, 'wb') as f:
|
||||
f.write(data)
|
||||
except:
|
||||
self.log.exception(
|
||||
'Failed to preprocess RTF to convert unicode sequences, ignoring...')
|
||||
return fname
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
@ -177,8 +192,9 @@ class RTFInput(InputFormatPlugin):
|
||||
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
|
||||
self.log = log
|
||||
self.log('Converting RTF to XML...')
|
||||
fname = self.preprocess(stream.name)
|
||||
try:
|
||||
xml = self.generate_xml(stream.name)
|
||||
xml = self.generate_xml(fname)
|
||||
except RtfInvalidCodeException:
|
||||
raise ValueError(_('This RTF file has a feature calibre does not '
|
||||
'support. Convert it to HTML first and then try it.'))
|
||||
|
344
src/calibre/ebooks/rtf/preprocess.py
Normal file
344
src/calibre/ebooks/rtf/preprocess.py
Normal file
@ -0,0 +1,344 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Gerendi Sandor Attila'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
"""
|
||||
RTF tokenizer and token parser. v.1.0 (1/17/2010)
|
||||
Author: Gerendi Sandor Attila
|
||||
|
||||
At this point this will tokenize a RTF file then rebuild it from the tokens.
|
||||
In the process the UTF8 tokens are altered to be supported by the RTF2XML and also remain RTF specification compilant.
|
||||
"""
|
||||
|
||||
class tokenDelimitatorStart():
|
||||
def __init__(self):
|
||||
pass
|
||||
def toRTF(self):
|
||||
return b'{'
|
||||
def __repr__(self):
|
||||
return '{'
|
||||
|
||||
class tokenDelimitatorEnd():
|
||||
def __init__(self):
|
||||
pass
|
||||
def toRTF(self):
|
||||
return b'}'
|
||||
def __repr__(self):
|
||||
return '}'
|
||||
|
||||
class tokenControlWord():
|
||||
def __init__(self, name, separator = ''):
|
||||
self.name = name
|
||||
self.separator = separator
|
||||
def toRTF(self):
|
||||
return self.name + self.separator
|
||||
def __repr__(self):
|
||||
return self.name + self.separator
|
||||
|
||||
class tokenControlWordWithNumericArgument():
|
||||
def __init__(self, name, argument, separator = ''):
|
||||
self.name = name
|
||||
self.argument = argument
|
||||
self.separator = separator
|
||||
def toRTF(self):
|
||||
return self.name + repr(self.argument) + self.separator
|
||||
def __repr__(self):
|
||||
return self.name + repr(self.argument) + self.separator
|
||||
|
||||
class tokenControlSymbol():
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
def toRTF(self):
|
||||
return self.name
|
||||
def __repr__(self):
|
||||
return self.name
|
||||
|
||||
class tokenData():
|
||||
def __init__(self, data):
|
||||
self.data = data
|
||||
def toRTF(self):
|
||||
return self.data
|
||||
def __repr__(self):
|
||||
return self.data
|
||||
|
||||
class tokenBinN():
|
||||
def __init__(self, data, separator = ''):
|
||||
self.data = data
|
||||
self.separator = separator
|
||||
def toRTF(self):
|
||||
return "\\bin" + repr(len(self.data)) + self.separator + self.data
|
||||
def __repr__(self):
|
||||
return "\\bin" + repr(len(self.data)) + self.separator + self.data
|
||||
|
||||
class token8bitChar():
|
||||
def __init__(self, data):
|
||||
self.data = data
|
||||
def toRTF(self):
|
||||
return "\\'" + self.data
|
||||
def __repr__(self):
|
||||
return "\\'" + self.data
|
||||
|
||||
class tokenUnicode():
|
||||
def __init__(self, data, separator = '', current_ucn = 1, eqList = []):
|
||||
self.data = data
|
||||
self.separator = separator
|
||||
self.current_ucn = current_ucn
|
||||
self.eqList = eqList
|
||||
def toRTF(self):
|
||||
result = '\\u' + repr(self.data) + ' '
|
||||
ucn = self.current_ucn
|
||||
if len(self.eqList) < ucn:
|
||||
ucn = len(self.eqList)
|
||||
result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result
|
||||
i = 0
|
||||
for eq in self.eqList:
|
||||
if i >= ucn:
|
||||
break
|
||||
result = result + eq.toRTF()
|
||||
return result
|
||||
def __repr__(self):
|
||||
return '\\u' + repr(self.data)
|
||||
|
||||
|
||||
def isAsciiLetter(value):
|
||||
return ((value >= 'a') and (value <= 'z')) or ((value >= 'A') and (value <= 'Z'))
|
||||
|
||||
def isDigit(value):
|
||||
return (value >= '0') and (value <= '9')
|
||||
|
||||
def isChar(value, char):
|
||||
return value == char
|
||||
|
||||
def isString(buffer, string):
|
||||
return buffer == string
|
||||
|
||||
|
||||
class RtfTokenParser():
|
||||
def __init__(self, tokens):
|
||||
self.tokens = tokens
|
||||
self.process()
|
||||
self.processUnicode()
|
||||
|
||||
def process(self):
|
||||
i = 0
|
||||
newTokens = []
|
||||
while i < len(self.tokens):
|
||||
if isinstance(self.tokens[i], tokenControlSymbol):
|
||||
if isString(self.tokens[i].name, "\\'"):
|
||||
i = i + 1
|
||||
if not isinstance(self.tokens[i], tokenData):
|
||||
raise BaseException('Error: token8bitChar without data.')
|
||||
if len(self.tokens[i].data) < 2:
|
||||
raise BaseException('Error: token8bitChar without data.')
|
||||
newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
|
||||
if len(self.tokens[i].data) > 2:
|
||||
newTokens.append(tokenData(self.tokens[i].data[2:]))
|
||||
i = i + 1
|
||||
continue
|
||||
|
||||
newTokens.append(self.tokens[i])
|
||||
i = i + 1
|
||||
|
||||
self.tokens = list(newTokens)
|
||||
|
||||
def processUnicode(self):
|
||||
i = 0
|
||||
newTokens = []
|
||||
ucNbStack = [1]
|
||||
while i < len(self.tokens):
|
||||
if isinstance(self.tokens[i], tokenDelimitatorStart):
|
||||
ucNbStack.append(ucNbStack[len(ucNbStack) - 1])
|
||||
newTokens.append(self.tokens[i])
|
||||
i = i + 1
|
||||
continue
|
||||
if isinstance(self.tokens[i], tokenDelimitatorEnd):
|
||||
ucNbStack.pop()
|
||||
newTokens.append(self.tokens[i])
|
||||
i = i + 1
|
||||
continue
|
||||
if isinstance(self.tokens[i], tokenControlWordWithNumericArgument):
|
||||
if isString(self.tokens[i].name, '\\uc'):
|
||||
ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument
|
||||
newTokens.append(self.tokens[i])
|
||||
i = i + 1
|
||||
continue
|
||||
if isString(self.tokens[i].name, '\\u'):
|
||||
x = i
|
||||
j = 0
|
||||
i = i + 1
|
||||
replace = []
|
||||
partialData = None
|
||||
ucn = ucNbStack[len(ucNbStack) - 1]
|
||||
while (i < len(self.tokens)) and (j < ucn):
|
||||
if isinstance(self.tokens[i], tokenDelimitatorStart):
|
||||
break
|
||||
if isinstance(self.tokens[i], tokenDelimitatorEnd):
|
||||
break
|
||||
if isinstance(self.tokens[i], tokenData):
|
||||
if len(self.tokens[i].data) >= ucn - j:
|
||||
replace.append(tokenData(self.tokens[i].data[0 : ucn - j]))
|
||||
if len(self.tokens[i].data) > ucn - j:
|
||||
partialData = tokenData(self.tokens[i].data[ucn - j:])
|
||||
i = i + 1
|
||||
break
|
||||
else:
|
||||
replace.append(self.tokens[i])
|
||||
j = j + len(self.tokens[i].data)
|
||||
i = i + 1
|
||||
continue
|
||||
if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN):
|
||||
replace.append(self.tokens[i])
|
||||
i = i + 1
|
||||
j = j + 1
|
||||
continue
|
||||
raise BaseException('Error: incorect utf replacement.')
|
||||
|
||||
#calibre rtf2xml does not support utfreplace
|
||||
replace = []
|
||||
|
||||
newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace))
|
||||
if partialData != None:
|
||||
newTokens.append(partialData)
|
||||
continue
|
||||
|
||||
newTokens.append(self.tokens[i])
|
||||
i = i + 1
|
||||
|
||||
self.tokens = list(newTokens)
|
||||
|
||||
|
||||
def toRTF(self):
|
||||
result = []
|
||||
for token in self.tokens:
|
||||
result.append(token.toRTF())
|
||||
return "".join(result)
|
||||
|
||||
|
||||
class RtfTokenizer():
|
||||
def __init__(self, rtfData):
|
||||
self.rtfData = []
|
||||
self.tokens = []
|
||||
self.rtfData = rtfData
|
||||
self.tokenize()
|
||||
|
||||
def tokenize(self):
|
||||
i = 0
|
||||
lastDataStart = -1
|
||||
while i < len(self.rtfData):
|
||||
|
||||
if isChar(self.rtfData[i], '{'):
|
||||
if lastDataStart > -1:
|
||||
self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
|
||||
lastDataStart = -1
|
||||
self.tokens.append(tokenDelimitatorStart())
|
||||
i = i + 1
|
||||
continue
|
||||
|
||||
if isChar(self.rtfData[i], '}'):
|
||||
if lastDataStart > -1:
|
||||
self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
|
||||
lastDataStart = -1
|
||||
self.tokens.append(tokenDelimitatorEnd())
|
||||
i = i + 1
|
||||
continue
|
||||
|
||||
if isChar(self.rtfData[i], '\\'):
|
||||
if i + 1 >= len(self.rtfData):
|
||||
raise BaseException('Error: Control character found at the end of the document.')
|
||||
|
||||
if lastDataStart > -1:
|
||||
self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
|
||||
lastDataStart = -1
|
||||
|
||||
tokenStart = i
|
||||
i = i + 1
|
||||
|
||||
#Control Words
|
||||
if isAsciiLetter(self.rtfData[i]):
|
||||
#consume <ASCII Letter Sequence>
|
||||
consumed = False
|
||||
while i < len(self.rtfData):
|
||||
if not isAsciiLetter(self.rtfData[i]):
|
||||
tokenEnd = i
|
||||
consumed = True
|
||||
break
|
||||
i = i + 1
|
||||
|
||||
if not consumed:
|
||||
raise BaseException('Error (at:%d): Control Word without end.'%(tokenStart))
|
||||
|
||||
#we have numeric argument before delimiter
|
||||
if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]):
|
||||
#consume the numeric argument
|
||||
consumed = False
|
||||
l = 0
|
||||
while i < len(self.rtfData):
|
||||
if not isDigit(self.rtfData[i]):
|
||||
consumed = True
|
||||
break
|
||||
l = l + 1
|
||||
i = i + 1
|
||||
if l > 10 :
|
||||
raise BaseException('Error (at:%d): Too many digits in control word numeric argument.'%[tokenStart])
|
||||
|
||||
if not consumed:
|
||||
raise BaseException('Error (at:%d): Control Word without numeric argument end.'%[tokenStart])
|
||||
|
||||
separator = ''
|
||||
if isChar(self.rtfData[i], ' '):
|
||||
separator = ' '
|
||||
|
||||
controlWord = self.rtfData[tokenStart: tokenEnd]
|
||||
if tokenEnd < i:
|
||||
value = int(self.rtfData[tokenEnd: i])
|
||||
if isString(controlWord, "\\bin"):
|
||||
i = i + value
|
||||
self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator))
|
||||
else:
|
||||
self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator))
|
||||
else:
|
||||
self.tokens.append(tokenControlWord(controlWord, separator))
|
||||
#space delimiter, we should discard it
|
||||
if self.rtfData[i] == ' ':
|
||||
i = i + 1
|
||||
|
||||
#Control Symbol
|
||||
else:
|
||||
self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1]))
|
||||
i = i + 1
|
||||
continue
|
||||
|
||||
if lastDataStart < 0:
|
||||
lastDataStart = i
|
||||
i = i + 1
|
||||
|
||||
def toRTF(self):
|
||||
result = []
|
||||
for token in self.tokens:
|
||||
result.append(token.toRTF())
|
||||
return "".join(result)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
if len(sys.argv) < 2:
|
||||
print ("Usage %prog rtfFileToConvert")
|
||||
sys.exit()
|
||||
f = open(sys.argv[1], 'rb')
|
||||
data = f.read()
|
||||
f.close()
|
||||
|
||||
tokenizer = RtfTokenizer(data)
|
||||
parsedTokens = RtfTokenParser(tokenizer.tokens)
|
||||
|
||||
data = parsedTokens.toRTF()
|
||||
|
||||
f = open(sys.argv[1], 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user