mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
py3: partial work towards making rtf2xml actually work
This commit is contained in:
parent
a8a74b7c53
commit
c6e0698c36
@ -562,7 +562,7 @@ class ParseRtf:
|
|||||||
def __make_temp_file(self,file):
|
def __make_temp_file(self,file):
|
||||||
"""Make a temporary file to parse"""
|
"""Make a temporary file to parse"""
|
||||||
write_file="rtf_write_file"
|
write_file="rtf_write_file"
|
||||||
read_obj = file if hasattr(file, 'read') else open(file,'r')
|
read_obj = file if hasattr(file, 'read') else open(file,'rb')
|
||||||
with open(write_file, 'wb') as write_obj:
|
with open(write_file, 'wb') as write_obj:
|
||||||
for line in read_obj:
|
for line in read_obj:
|
||||||
write_obj.write(line)
|
write_obj.write(line)
|
||||||
|
@ -36,11 +36,11 @@ class FixLineEndings:
|
|||||||
|
|
||||||
def fix_endings(self):
|
def fix_endings(self):
|
||||||
# read
|
# read
|
||||||
with open(self.__file, 'r') as read_obj:
|
with open(self.__file, 'rb') as read_obj:
|
||||||
input_file = read_obj.read()
|
input_file = read_obj.read()
|
||||||
# calibre: convert Windows and Mac line endings to Unix
|
# calibre: convert Windows and Mac line endings to Unix
|
||||||
input_file = input_file.replace('\r\n', '\n')
|
input_file = input_file.replace(b'\r\n', b'\n')
|
||||||
input_file = input_file.replace('\r', '\n')
|
input_file = input_file.replace(b'\r', b'\n')
|
||||||
# remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
|
# remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
|
||||||
if self.__replace_illegals:
|
if self.__replace_illegals:
|
||||||
input_file = clean_ascii_chars(input_file)
|
input_file = clean_ascii_chars(input_file)
|
||||||
|
@ -43,8 +43,8 @@ class ProcessTokens:
|
|||||||
self.__bug_handler = bug_handler
|
self.__bug_handler = bug_handler
|
||||||
|
|
||||||
def compile_expressions(self):
|
def compile_expressions(self):
|
||||||
self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
|
self.__num_exp = re.compile(br"([a-zA-Z]+)(.*)")
|
||||||
self.__utf_exp = re.compile(r'(&.*?;)')
|
self.__utf_exp = re.compile(br'(&.*?;)')
|
||||||
|
|
||||||
def initiate_token_dict(self):
|
def initiate_token_dict(self):
|
||||||
self.__return_code = 0
|
self.__return_code = 0
|
||||||
@ -762,10 +762,10 @@ class ProcessTokens:
|
|||||||
def process_cw(self, token):
|
def process_cw(self, token):
|
||||||
"""Change the value of the control word by determining what dictionary
|
"""Change the value of the control word by determining what dictionary
|
||||||
it belongs to"""
|
it belongs to"""
|
||||||
special = ['*', ':', '}', '{', '~', '_', '-', ';']
|
special = [b'*', b':', b'}', b'{', b'~', b'_', b'-', b';']
|
||||||
# if token != "{" or token != "}":
|
# if token != "{" or token != "}":
|
||||||
token = token[1:] # strip off leading \
|
token = token[1:] # strip off leading \
|
||||||
token = token.replace(" ", "")
|
token = token.replace(b" ", b"")
|
||||||
# if not token: return
|
# if not token: return
|
||||||
only_alpha = token.isalpha()
|
only_alpha = token.isalpha()
|
||||||
num = None
|
num = None
|
||||||
@ -784,24 +784,24 @@ class ProcessTokens:
|
|||||||
def process_tokens(self):
|
def process_tokens(self):
|
||||||
"""Main method for handling other methods. """
|
"""Main method for handling other methods. """
|
||||||
line_count = 0
|
line_count = 0
|
||||||
with open(self.__file, 'r') as read_obj:
|
with open(self.__file, 'rb') as read_obj:
|
||||||
with open(self.__write_to, 'wb') as write_obj:
|
with open(self.__write_to, 'wb') as write_obj:
|
||||||
for line in read_obj:
|
for line in read_obj:
|
||||||
token = line.replace("\n","")
|
token = line.replace(b"\n",b"")
|
||||||
line_count += 1
|
line_count += 1
|
||||||
if line_count == 1 and token != '\\{':
|
if line_count == 1 and token != b'\\{':
|
||||||
msg = '\nInvalid RTF: document doesn\'t start with {\n'
|
msg = '\nInvalid RTF: document doesn\'t start with {\n'
|
||||||
raise self.__exception_handler(msg)
|
raise self.__exception_handler(msg)
|
||||||
elif line_count == 2 and token[0:4] != '\\rtf':
|
elif line_count == 2 and token[0:4] != b'\\rtf':
|
||||||
msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
|
msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
|
||||||
raise self.__exception_handler(msg)
|
raise self.__exception_handler(msg)
|
||||||
|
|
||||||
the_index = token.find('\\ ')
|
the_index = token.find(b'\\ ')
|
||||||
if token is not None and the_index > -1:
|
if token is not None and the_index > -1:
|
||||||
msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
|
msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
|
||||||
% line_count
|
% line_count
|
||||||
raise self.__exception_handler(msg)
|
raise self.__exception_handler(msg)
|
||||||
elif token[:1] == "\\":
|
elif token[:1] == b"\\":
|
||||||
try:
|
try:
|
||||||
token.decode('us-ascii')
|
token.decode('us-ascii')
|
||||||
except UnicodeError as msg:
|
except UnicodeError as msg:
|
||||||
@ -816,10 +816,10 @@ class ProcessTokens:
|
|||||||
for field in fields:
|
for field in fields:
|
||||||
if not field:
|
if not field:
|
||||||
continue
|
continue
|
||||||
if field[0:1] == '&':
|
if field[0:1] == b'&':
|
||||||
write_obj.write('tx<ut<__________<%s\n' % field)
|
write_obj.write(b'tx<ut<__________<%s\n' % field)
|
||||||
else:
|
else:
|
||||||
write_obj.write('tx<nu<__________<%s\n' % field)
|
write_obj.write(b'tx<nu<__________<%s\n' % field)
|
||||||
|
|
||||||
if not line_count:
|
if not line_count:
|
||||||
msg = '\nInvalid RTF: file appears to be empty.\n'
|
msg = '\nInvalid RTF: file appears to be empty.\n'
|
||||||
|
@ -94,7 +94,7 @@ class Tokenize:
|
|||||||
uni_len = len(match_obj.group(0))
|
uni_len = len(match_obj.group(0))
|
||||||
if uni_char < 0:
|
if uni_char < 0:
|
||||||
uni_char += 65536
|
uni_char += 65536
|
||||||
uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace')
|
uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace').decode('ascii')
|
||||||
self.__uc_char = self.__uc_value[-1]
|
self.__uc_char = self.__uc_value[-1]
|
||||||
# the token consists only of a unicode char
|
# the token consists only of a unicode char
|
||||||
if len(token)<= uni_len:
|
if len(token)<= uni_len:
|
||||||
@ -113,11 +113,11 @@ class Tokenize:
|
|||||||
def __sub_reg_split(self,input_file):
|
def __sub_reg_split(self,input_file):
|
||||||
input_file = self.__replace_spchar.mreplace(input_file)
|
input_file = self.__replace_spchar.mreplace(input_file)
|
||||||
# this is for older RTF
|
# this is for older RTF
|
||||||
input_file = self.__par_exp.sub('\n\\par \n', input_file)
|
input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
|
||||||
input_file = self.__cwdigit_exp.sub("\\g<1>\n\\g<2>", input_file)
|
input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
|
||||||
input_file = self.__cs_ast.sub(r"\g<1>", input_file)
|
input_file = self.__cs_ast.sub(r"\g<1>", input_file)
|
||||||
input_file = self.__ms_hex_exp.sub("\\mshex0\\g<1> ", input_file)
|
input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
|
||||||
input_file = self.__utf_ud.sub("\\{\\uc0 \\g<1>\\}", input_file)
|
input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
|
||||||
# remove \n in bin data
|
# remove \n in bin data
|
||||||
input_file = self.__bin_exp.sub(lambda x:
|
input_file = self.__bin_exp.sub(lambda x:
|
||||||
x.group().replace('\n', '') + '\n', input_file)
|
x.group().replace('\n', '') + '\n', input_file)
|
||||||
@ -188,7 +188,7 @@ class Tokenize:
|
|||||||
|
|
||||||
# write
|
# write
|
||||||
with open(self.__write_to, 'wb') as write_obj:
|
with open(self.__write_to, 'wb') as write_obj:
|
||||||
write_obj.write('\n'.join(tokens))
|
write_obj.write('\n'.join(tokens).encode('utf-8'))
|
||||||
# Move and copy
|
# Move and copy
|
||||||
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
|
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
|
||||||
if self.__copy:
|
if self.__copy:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user