mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-03 19:17:02 -05:00 
			
		
		
		
	RTF Input: Support for unicode characters. Fixes #4501 (Unicode escaped RTF to XML problem)
This commit is contained in:
		
							parent
							
								
									c6692c859e
								
							
						
					
					
						commit
						03714a978f
					
				@ -169,6 +169,21 @@ class RTFInput(InputFormatPlugin):
 | 
			
		||||
        with open('styles.css', 'ab') as f:
 | 
			
		||||
            f.write(css)
 | 
			
		||||
 | 
			
		||||
    def preprocess(self, fname):
        """Rewrite unicode escape sequences in the RTF file *fname* so that
        rtf2xml can process them.

        Returns the path of the preprocessed file on success, or the
        original *fname* unchanged if preprocessing fails (best effort).
        """
        self.log('\tPreprocessing to convert unicode characters')
        try:
            # Use a context manager so the input handle is always closed
            # (the original leaked the file object returned by open()).
            with open(fname, 'rb') as src:
                data = src.read()
            from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
            tokenizer = RtfTokenizer(data)
            tokens = RtfTokenParser(tokenizer.tokens)
            data = tokens.toRTF()
            fname = 'preprocessed.rtf'
            with open(fname, 'wb') as f:
                f.write(data)
        except Exception:
            # Deliberate best-effort: fall back to the original file rather
            # than abort the whole conversion.  Narrowed from a bare except
            # so SystemExit/KeyboardInterrupt still propagate.
            self.log.exception(
            'Failed to preprocess RTF to convert unicode sequences, ignoring...')
        return fname
 | 
			
		||||
 | 
			
		||||
    def convert(self, stream, options, file_ext, log,
 | 
			
		||||
                accelerators):
 | 
			
		||||
@ -177,8 +192,9 @@ class RTFInput(InputFormatPlugin):
 | 
			
		||||
        from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
 | 
			
		||||
        self.log = log
 | 
			
		||||
        self.log('Converting RTF to XML...')
 | 
			
		||||
        fname = self.preprocess(stream.name)
 | 
			
		||||
        try:
 | 
			
		||||
            xml = self.generate_xml(stream.name)
 | 
			
		||||
            xml = self.generate_xml(fname)
 | 
			
		||||
        except RtfInvalidCodeException:
 | 
			
		||||
            raise ValueError(_('This RTF file has a feature calibre does not '
 | 
			
		||||
            'support. Convert it to HTML first and then try it.'))
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										344
									
								
								src/calibre/ebooks/rtf/preprocess.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										344
									
								
								src/calibre/ebooks/rtf/preprocess.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,344 @@
 | 
			
		||||
#!/usr/bin/env python
 | 
			
		||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 | 
			
		||||
from __future__ import with_statement
 | 
			
		||||
 | 
			
		||||
__license__   = 'GPL v3'
 | 
			
		||||
__copyright__ = '2010, Gerendi Sandor Attila'
 | 
			
		||||
__docformat__ = 'restructuredtext en'
 | 
			
		||||
 | 
			
		||||
"""
 | 
			
		||||
RTF tokenizer and token parser. v.1.0 (1/17/2010)
 | 
			
		||||
Author: Gerendi Sandor Attila
 | 
			
		||||
 | 
			
		||||
At this point this will tokenize a RTF file then rebuild it from the tokens.
 | 
			
		||||
In the process the UTF8 tokens are altered to be supported by the RTF2XML and also remain RTF specification compilant.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
class tokenDelimitatorStart():
    """Token representing an RTF group opening brace ('{')."""

    def __init__(self):
        pass

    def toRTF(self):
        # Serialized form used when rebuilding the RTF byte stream.
        return b'{'

    def __repr__(self):
        return '{'
 | 
			
		||||
 | 
			
		||||
class tokenDelimitatorEnd():
    """Token representing an RTF group closing brace ('}')."""

    def __init__(self):
        pass

    def toRTF(self):
        # Serialized form used when rebuilding the RTF byte stream.
        return b'}'

    def __repr__(self):
        return '}'
 | 
			
		||||
 | 
			
		||||
class tokenControlWord():
    """An RTF control word (e.g. ``\\par``) plus its trailing separator."""

    def __init__(self, name, separator=''):
        self.name = name            # includes the leading backslash
        self.separator = separator  # '' or the single delimiting space

    def toRTF(self):
        return self.name + self.separator

    # The debug representation is identical to the serialized form.
    __repr__ = toRTF
 | 
			
		||||
 | 
			
		||||
class tokenControlWordWithNumericArgument():
    """An RTF control word carrying a numeric argument, e.g. ``\\uc1``."""

    def __init__(self, name, argument, separator=''):
        self.name = name            # includes the leading backslash
        self.argument = argument    # int argument
        self.separator = separator  # '' or the single delimiting space

    def toRTF(self):
        # repr() of an int is its decimal text, which is the RTF form.
        return self.name + repr(self.argument) + self.separator

    # The debug representation is identical to the serialized form.
    __repr__ = toRTF
 | 
			
		||||
 | 
			
		||||
class tokenControlSymbol():
    """An RTF control symbol: a backslash followed by one non-letter."""

    def __init__(self, name):
        self.name = name  # the full two-character symbol, e.g. "\\~"

    def toRTF(self):
        return self.name

    # The debug representation is identical to the serialized form.
    __repr__ = toRTF
 | 
			
		||||
 | 
			
		||||
class tokenData():
    """A run of plain document text between control constructs."""

    def __init__(self, data):
        self.data = data  # raw text slice taken from the source document

    def toRTF(self):
        return self.data

    # The debug representation is identical to the serialized form.
    __repr__ = toRTF
 | 
			
		||||
 | 
			
		||||
class tokenBinN():
    """A ``\\binN`` control word followed by N bytes of binary payload."""

    def __init__(self, data, separator=''):
        self.data = data            # the raw payload bytes
        self.separator = separator  # '' or the single delimiting space

    def toRTF(self):
        # Length is re-derived from the payload so it always matches.
        return "\\bin" + repr(len(self.data)) + self.separator + self.data

    # The debug representation is identical to the serialized form.
    __repr__ = toRTF
 | 
			
		||||
 | 
			
		||||
class token8bitChar():
    """A ``\\'hh`` escape for a single 8-bit character."""

    def __init__(self, data):
        self.data = data  # exactly two hex digits

    def toRTF(self):
        return "\\'" + self.data

    # The debug representation is identical to the serialized form.
    __repr__ = toRTF
 | 
			
		||||
 | 
			
		||||
class tokenUnicode():
    """A ``\\uN`` unicode escape together with its ANSI fallback tokens.

    ``current_ucn`` is the ``\\ucN`` skip-count in effect where the escape
    was found; ``eqList`` holds the fallback tokens that an old reader
    would display instead of the unicode character.
    """

    def __init__(self, data, separator='', current_ucn=1, eqList=None):
        # BUGFIX: the default for eqList was a mutable list literal, which
        # is shared between every instance constructed without the argument;
        # use the None-sentinel idiom instead.
        self.data = data
        self.separator = separator
        self.current_ucn = current_ucn
        self.eqList = [] if eqList is None else eqList

    def toRTF(self):
        result = '\\u' + repr(self.data) + ' '
        ucn = self.current_ucn
        if len(self.eqList) < ucn:
            # Fewer fallback tokens than the ambient skip-count: emit an
            # explicit \ucN so readers skip exactly the right number.
            ucn = len(self.eqList)
            result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result
        i = 0
        for eq in self.eqList:
            if i >= ucn:
                break
            result = result + eq.toRTF()
            # BUGFIX: the counter was never advanced, so the ucn limit was
            # never honored and every fallback token was emitted.
            i = i + 1
        return result

    def __repr__(self):
        return '\\u' + repr(self.data)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def isAsciiLetter(value):
    """Return True if *value* is a single ASCII letter (a-z or A-Z)."""
    return ('a' <= value <= 'z') or ('A' <= value <= 'Z')
 | 
			
		||||
 | 
			
		||||
def isDigit(value):
    """Return True if *value* is a single ASCII digit (0-9)."""
    return '0' <= value <= '9'
 | 
			
		||||
 | 
			
		||||
def isChar(value, char):
    """Return True if *value* equals the expected character *char*."""
    return char == value
 | 
			
		||||
 | 
			
		||||
def isString(buffer, string):
    """Return True if *buffer* equals the expected string *string*."""
    return string == buffer
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class RtfTokenParser():
    """Post-processes the flat token stream produced by RtfTokenizer.

    Two passes run at construction time:
      1. process()        - folds \\' control symbols into token8bitChar tokens.
      2. processUnicode() - rewrites \\uN escapes (and their fallback text)
                            into tokenUnicode tokens.
    Call toRTF() to serialize the rewritten stream back to RTF.
    """
    def __init__(self, tokens):
        # tokens: list of token objects from RtfTokenizer.tokens
        self.tokens = tokens
        self.process()
        self.processUnicode()

    def process(self):
        # Merge each \' control symbol with the data token that follows it:
        # the first two characters of that data are the hex code of the
        # escaped 8-bit character, the rest (if any) is ordinary text.
        i = 0
        newTokens = []
        while i < len(self.tokens):
            if isinstance(self.tokens[i], tokenControlSymbol):
                if isString(self.tokens[i].name, "\\'"):
                    i = i + 1
                    # A \' escape must be followed by at least two hex digits.
                    if not isinstance(self.tokens[i], tokenData):
                        raise BaseException('Error: token8bitChar without data.')
                    if len(self.tokens[i].data) < 2:
                        raise BaseException('Error: token8bitChar without data.')
                    newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
                    # Anything after the two hex digits is plain text again.
                    if len(self.tokens[i].data) > 2:
                        newTokens.append(tokenData(self.tokens[i].data[2:]))
                    i = i + 1
                    continue

            newTokens.append(self.tokens[i])
            i = i + 1

        self.tokens = list(newTokens)

    def processUnicode(self):
        # Rewrite \uN escapes.  The \ucN skip-count is scoped to RTF groups
        # and inherited by nested groups, hence the stack that is pushed on
        # '{' and popped on '}'.
        i = 0
        newTokens = []
        ucNbStack = [1]  # RTF default skip-count is 1
        while i < len(self.tokens):
            if isinstance(self.tokens[i], tokenDelimitatorStart):
                # Entering a group: inherit the current skip-count.
                ucNbStack.append(ucNbStack[len(ucNbStack) - 1])
                newTokens.append(self.tokens[i])
                i = i + 1
                continue
            if isinstance(self.tokens[i], tokenDelimitatorEnd):
                # Leaving a group: restore the enclosing skip-count.
                ucNbStack.pop()
                newTokens.append(self.tokens[i])
                i = i + 1
                continue
            if isinstance(self.tokens[i], tokenControlWordWithNumericArgument):
                if isString(self.tokens[i].name, '\\uc'):
                    # \ucN changes the skip-count for the current group.
                    ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument
                    newTokens.append(self.tokens[i])
                    i = i + 1
                    continue
                if isString(self.tokens[i].name, '\\u'):
                    # Collect up to ucn fallback "characters" following \uN.
                    x = i            # index of the \u token itself
                    j = 0            # fallback characters consumed so far
                    i = i + 1
                    replace = []     # the fallback tokens
                    partialData = None  # leftover text after the fallback
                    ucn = ucNbStack[len(ucNbStack) - 1]
                    while (i < len(self.tokens)) and (j < ucn):
                        # Group boundaries end the fallback sequence early.
                        if isinstance(self.tokens[i], tokenDelimitatorStart):
                            break
                        if isinstance(self.tokens[i], tokenDelimitatorEnd):
                            break
                        if isinstance(self.tokens[i], tokenData):
                            # Each character of plain data counts as one
                            # fallback character; split off any excess.
                            if len(self.tokens[i].data) >= ucn - j:
                                replace.append(tokenData(self.tokens[i].data[0 : ucn - j]))
                                if len(self.tokens[i].data) > ucn - j:
                                    partialData = tokenData(self.tokens[i].data[ucn - j:])
                                i = i + 1
                                break
                            else:
                                replace.append(self.tokens[i])
                                j = j + len(self.tokens[i].data)
                                i = i + 1
                                continue
                        # An escaped 8-bit char or a \binN blob counts as a
                        # single fallback character.
                        if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN):
                            replace.append(self.tokens[i])
                            i = i + 1
                            j = j + 1
                            continue
                        raise BaseException('Error: incorect utf replacement.')

                    #calibre rtf2xml does not support utfreplace
                    # (the collected fallback is deliberately discarded)
                    replace = []

                    newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace))
                    if partialData != None:
                        newTokens.append(partialData)
                    continue

            newTokens.append(self.tokens[i])
            i = i + 1

        self.tokens = list(newTokens)


    def toRTF(self):
        """Serialize the processed token stream back into RTF text."""
        result = []
        for token in self.tokens:
            result.append(token.toRTF())
        return "".join(result)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class RtfTokenizer():
    """Splits raw RTF data into a flat list of token objects.

    Recognizes group delimiters ('{'/'}'), control words with an optional
    numeric argument, ``\\binN`` binary runs, control symbols, and plain
    data runs.  The resulting list is available as ``self.tokens``.
    """

    def __init__(self, rtfData):
        # rtfData: the raw RTF document as a single string.
        self.rtfData = rtfData
        self.tokens = []
        self.tokenize()

    def tokenize(self):
        """Scan self.rtfData once, appending tokens to self.tokens."""
        i = 0
        lastDataStart = -1  # start of the current plain-data run, -1 if none
        while i < len(self.rtfData):

            if isChar(self.rtfData[i], '{'):
                # Flush any pending plain-data run before the delimiter.
                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1
                self.tokens.append(tokenDelimitatorStart())
                i = i + 1
                continue

            if isChar(self.rtfData[i], '}'):
                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1
                self.tokens.append(tokenDelimitatorEnd())
                i = i + 1
                continue

            if isChar(self.rtfData[i], '\\'):
                if i + 1 >= len(self.rtfData):
                    raise BaseException('Error: Control character found at the end of the document.')

                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1

                tokenStart = i
                i = i + 1

                # Control Words
                if isAsciiLetter(self.rtfData[i]):
                    # consume <ASCII Letter Sequence>
                    consumed = False
                    while i < len(self.rtfData):
                        if not isAsciiLetter(self.rtfData[i]):
                            tokenEnd = i
                            consumed = True
                            break
                        i = i + 1

                    if not consumed:
                        raise BaseException('Error (at:%d): Control Word without end.'%(tokenStart))

                    # we have numeric argument before delimiter
                    if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]):
                        # BUGFIX: the leading minus sign was never consumed,
                        # so negative arguments (common in \uN escapes, e.g.
                        # \u-3407) were mis-tokenized as a bare control word
                        # followed by plain data.
                        if isChar(self.rtfData[i], '-'):
                            i = i + 1
                        # consume the numeric argument
                        consumed = False
                        l = 0
                        while i < len(self.rtfData):
                            if not isDigit(self.rtfData[i]):
                                consumed = True
                                break
                            l = l + 1
                            i = i + 1
                            if l > 10:
                                # BUGFIX: these two messages used '%d' % [x]
                                # (a list operand), which raises TypeError
                                # instead of producing the message.
                                raise BaseException('Error (at:%d): Too many digits in control word numeric argument.'%(tokenStart))

                        if not consumed:
                            raise BaseException('Error (at:%d): Control Word without numeric argument end.'%(tokenStart))

                    separator = ''
                    if isChar(self.rtfData[i], ' '):
                        separator = ' '

                    controlWord = self.rtfData[tokenStart: tokenEnd]
                    if tokenEnd < i:
                        value = int(self.rtfData[tokenEnd: i])
                        if isString(controlWord, "\\bin"):
                            # \binN: the next N characters are raw binary,
                            # kept verbatim inside the token.
                            i = i + value
                            self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator))
                        else:
                            self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator))
                    else:
                        self.tokens.append(tokenControlWord(controlWord, separator))
                    # space delimiter, we should discard it
                    # (bounds check added: a \binN payload may end exactly at
                    # the end of the document)
                    if i < len(self.rtfData) and self.rtfData[i] == ' ':
                        i = i + 1

                # Control Symbol
                else:
                    self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1]))
                    i = i + 1
                continue

            # Plain data: remember where the run started; it is flushed when
            # the next delimiter or control construct is seen.
            # NOTE(review): a data run at the very end of the input is never
            # flushed; harmless for well-formed RTF, which ends with '}'.
            if lastDataStart < 0:
                lastDataStart = i
            i = i + 1

    def toRTF(self):
        """Serialize the token list back into RTF text."""
        result = []
        for token in self.tokens:
            result.append(token.toRTF())
        return "".join(result)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
    # Command-line driver: preprocess an RTF file in place.
    import sys
    if len(sys.argv) < 2:
        print ("Usage %prog rtfFileToConvert")
        sys.exit()
    # Context managers ensure the handles are closed even on error.
    with open(sys.argv[1], 'rb') as f:
        data = f.read()

    tokenizer = RtfTokenizer(data)
    parsedTokens = RtfTokenParser(tokenizer.tokens)

    data = parsedTokens.toRTF()

    # BUGFIX: the data was read in binary mode but written back in text
    # mode ('w'), which corrupts RTF via newline translation on Windows.
    with open(sys.argv[1], 'wb') as f:
        f.write(data)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user