py3: partial work towards making rtf2xml actually work

Eli Schwartz 2019-05-20 00:49:28 -04:00
parent a8a74b7c53
commit c6e0698c36
GPG Key ID: CEB167EFB5722BD6
4 changed files with 23 additions and 23 deletions

View File

@@ -562,7 +562,7 @@ class ParseRtf:
     def __make_temp_file(self,file):
         """Make a temporary file to parse"""
         write_file="rtf_write_file"
-        read_obj = file if hasattr(file, 'read') else open(file,'r')
+        read_obj = file if hasattr(file, 'read') else open(file,'rb')
         with open(write_file, 'wb') as write_obj:
             for line in read_obj:
                 write_obj.write(line)
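The switch from 'r' to 'rb' matters because the destination handle is opened in 'wb'. A minimal sketch of the Python 3 behaviour (the input file name is made up for illustration):

    # Text mode yields str lines; writing str to a 'wb' handle raises TypeError.
    # Binary mode keeps the data as bytes end to end.
    with open('sample.rtf', 'rb') as read_obj, open('rtf_write_file', 'wb') as write_obj:
        for line in read_obj:      # line is bytes
            write_obj.write(line)  # bytes into a binary handle: OK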

View File

@@ -36,11 +36,11 @@ class FixLineEndings:
     def fix_endings(self):
         # read
-        with open(self.__file, 'r') as read_obj:
+        with open(self.__file, 'rb') as read_obj:
             input_file = read_obj.read()
         # calibre go from win and mac to unix
-        input_file = input_file.replace('\r\n', '\n')
-        input_file = input_file.replace('\r', '\n')
+        input_file = input_file.replace(b'\r\n', b'\n')
+        input_file = input_file.replace(b'\r', b'\n')
         # remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
         if self.__replace_illegals:
             input_file = clean_ascii_chars(input_file)
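Since fix_endings() now reads in binary mode, input_file is bytes, and bytes.replace() requires bytes arguments in Python 3; the old str arguments would raise TypeError. A quick standalone illustration:

    data = b'one\r\ntwo\rthree\n'
    data = data.replace(b'\r\n', b'\n')
    data = data.replace(b'\r', b'\n')
    assert data == b'one\ntwo\nthree\n'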

View File

@@ -43,8 +43,8 @@ class ProcessTokens:
         self.__bug_handler = bug_handler

     def compile_expressions(self):
-        self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
-        self.__utf_exp = re.compile(r'(&.*?;)')
+        self.__num_exp = re.compile(br"([a-zA-Z]+)(.*)")
+        self.__utf_exp = re.compile(br'(&.*?;)')

     def initiate_token_dict(self):
         self.__return_code = 0
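The br'...' literals compile bytes patterns, which in Python 3 can only be matched against bytes (matching against str raises TypeError), and whose groups come back as bytes. A small sketch with a made-up control word:

    import re
    num_exp = re.compile(br"([a-zA-Z]+)(.*)")
    m = num_exp.match(b'li900')            # hypothetical RTF control word + value
    assert m.groups() == (b'li', b'900')   # groups are bytes, not str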
@@ -762,10 +762,10 @@ class ProcessTokens:
     def process_cw(self, token):
         """Change the value of the control word by determining what dictionary
         it belongs to"""
-        special = ['*', ':', '}', '{', '~', '_', '-', ';']
+        special = [b'*', b':', b'}', b'{', b'~', b'_', b'-', b';']
         # if token != "{" or token != "}":
         token = token[1:]  # strip off leading \
-        token = token.replace(" ", "")
+        token = token.replace(b" ", b"")
         # if not token: return
         only_alpha = token.isalpha()
         num = None
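One Python 3 subtlety behind the b'' conversions in process_cw(): slicing bytes returns bytes, but indexing returns an int, so comparisons have to be written against one-byte slices. A sketch with a hypothetical token:

    token = b'\\par '
    token = token[1:]                  # b'par ' -- slicing bytes gives bytes
    token = token.replace(b' ', b'')   # b'par'
    assert token[0:1] == b'p'          # a one-byte slice compares against bytes
    assert token[0] == 0x70            # plain indexing yields an int in Python 3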
@@ -784,24 +784,24 @@ class ProcessTokens:
     def process_tokens(self):
         """Main method for handling other methods. """
         line_count = 0
-        with open(self.__file, 'r') as read_obj:
+        with open(self.__file, 'rb') as read_obj:
             with open(self.__write_to, 'wb') as write_obj:
                 for line in read_obj:
-                    token = line.replace("\n","")
+                    token = line.replace(b"\n",b"")
                     line_count += 1
-                    if line_count == 1 and token != '\\{':
+                    if line_count == 1 and token != b'\\{':
                         msg = '\nInvalid RTF: document doesn\'t start with {\n'
                         raise self.__exception_handler(msg)
-                    elif line_count == 2 and token[0:4] != '\\rtf':
+                    elif line_count == 2 and token[0:4] != b'\\rtf':
                         msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
                         raise self.__exception_handler(msg)
-                    the_index = token.find('\\ ')
+                    the_index = token.find(b'\\ ')
                     if token is not None and the_index > -1:
                         msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
                             % line_count
                         raise self.__exception_handler(msg)
-                    elif token[:1] == "\\":
+                    elif token[:1] == b"\\":
                         try:
                             token.decode('us-ascii')
                         except UnicodeError as msg:
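The b'' comparisons above work because the tokens are now bytes; token.decode('us-ascii') then serves purely as a validity probe, since UnicodeDecodeError is a subclass of the UnicodeError being caught. A minimal sketch:

    token = b'\\rtf1'
    if token[:1] == b'\\':             # compare a one-byte slice, not token[0]
        try:
            token.decode('us-ascii')   # raises UnicodeDecodeError on non-ASCII bytes
        except UnicodeError:
            pass                       # placeholder for the real error handling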
@@ -816,10 +816,10 @@ class ProcessTokens:
                     for field in fields:
                         if not field:
                             continue
-                        if field[0:1] == '&':
-                            write_obj.write('tx<ut<__________<%s\n' % field)
+                        if field[0:1] == b'&':
+                            write_obj.write(b'tx<ut<__________<%s\n' % field)
                         else:
-                            write_obj.write('tx<nu<__________<%s\n' % field)
+                            write_obj.write(b'tx<nu<__________<%s\n' % field)
         if not line_count:
             msg = '\nInvalid RTF: file appears to be empty.\n'
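These write() calls rely on bytes %-formatting, which only exists on Python 3.5+ (PEP 461); on earlier 3.x it raises TypeError. For instance:

    field = b'&#8212;'                        # hypothetical field value
    line = b'tx<ut<__________<%s\n' % field   # bytes %-formatting, PEP 461
    assert line == b'tx<ut<__________<&#8212;\n'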

View File

@@ -94,7 +94,7 @@ class Tokenize:
         uni_len = len(match_obj.group(0))
         if uni_char < 0:
             uni_char += 65536
-        uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace')
+        uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace').decode('ascii')
         self.__uc_char = self.__uc_value[-1]
         # there is only an unicode char
         if len(token)<= uni_len:
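codepoint_to_chr appears to be calibre's py2/py3 compatibility alias for chr()/unichr(), so on Python 3 the added .decode('ascii') turns the bytes produced by xmlcharrefreplace back into the str the surrounding tokenizer code expects. Roughly:

    uni_char = 0x2014                                             # em dash, example codepoint
    entity = chr(uni_char).encode('ascii', 'xmlcharrefreplace')   # b'&#8212;'
    assert entity.decode('ascii') == '&#8212;'                    # str entity again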
@@ -113,11 +113,11 @@ class Tokenize:
     def __sub_reg_split(self,input_file):
         input_file = self.__replace_spchar.mreplace(input_file)
         # this is for older RTF
-        input_file = self.__par_exp.sub('\n\\par \n', input_file)
-        input_file = self.__cwdigit_exp.sub("\\g<1>\n\\g<2>", input_file)
+        input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
+        input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
         input_file = self.__cs_ast.sub(r"\g<1>", input_file)
-        input_file = self.__ms_hex_exp.sub("\\mshex0\\g<1> ", input_file)
-        input_file = self.__utf_ud.sub("\\{\\uc0 \\g<1>\\}", input_file)
+        input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
+        input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
         # remove \n in bin data
         input_file = self.__bin_exp.sub(lambda x:
                     x.group().replace('\n', '') + '\n', input_file)
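The raw strings matter because re.sub() processes escapes in the replacement template itself: \\ becomes one backslash, \n a newline, and a stray letter escape like \p has been an error since Python 3.7. A self-contained sketch with a made-up pattern (the real ones live in the class's compiled expressions):

    import re
    out = re.sub(r'\\par\b', r'\n\\par \n', 'one\\par two')
    assert out == 'one\n\\par \n two'   # \\ -> backslash, \n -> newline in the template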
@@ -188,7 +188,7 @@ class Tokenize:
         # write
         with open(self.__write_to, 'wb') as write_obj:
-            write_obj.write('\n'.join(tokens))
+            write_obj.write('\n'.join(tokens).encode('utf-8'))
         # Move and copy
         copy_obj = copy.Copy(bug_handler=self.__bug_handler)
         if self.__copy:
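At this stage the tokens are str while the output handle is binary, so the joined string has to be encoded before writing. A minimal sketch (the file name is a placeholder):

    tokens = ['\\rtf1', '\\ansi']   # tokens are str here
    with open('token_file.tmp', 'wb') as write_obj:
        write_obj.write('\n'.join(tokens).encode('utf-8'))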