From c6e0698c36ef5e848beaf076cbc3265ccd128734 Mon Sep 17 00:00:00 2001
From: Eli Schwartz
Date: Mon, 20 May 2019 00:49:28 -0400
Subject: [PATCH] py3: partial work towards making rtf2xml actually work

---
 src/calibre/ebooks/rtf2xml/ParseRtf.py       |  2 +-
 src/calibre/ebooks/rtf2xml/line_endings.py   |  6 +++---
 src/calibre/ebooks/rtf2xml/process_tokens.py | 26 +++++++++++++-------------
 src/calibre/ebooks/rtf2xml/tokenize.py       | 12 ++++++------
 4 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py
index 8321f5cccd..a3d52a854c 100755
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@@ -562,7 +562,7 @@ class ParseRtf:
     def __make_temp_file(self,file):
         """Make a temporary file to parse"""
         write_file="rtf_write_file"
-        read_obj = file if hasattr(file, 'read') else open(file,'r')
+        read_obj = file if hasattr(file, 'read') else open(file,'rb')
         with open(write_file, 'wb') as write_obj:
             for line in read_obj:
                 write_obj.write(line)
diff --git a/src/calibre/ebooks/rtf2xml/line_endings.py b/src/calibre/ebooks/rtf2xml/line_endings.py
index 3e2b8156e8..5dbc59a995 100755
--- a/src/calibre/ebooks/rtf2xml/line_endings.py
+++ b/src/calibre/ebooks/rtf2xml/line_endings.py
@@ -36,11 +36,11 @@ class FixLineEndings:
 
     def fix_endings(self):
         # read
-        with open(self.__file, 'r') as read_obj:
+        with open(self.__file, 'rb') as read_obj:
             input_file = read_obj.read()
         # calibre go from win and mac to unix
-        input_file = input_file.replace('\r\n', '\n')
-        input_file = input_file.replace('\r', '\n')
+        input_file = input_file.replace(b'\r\n', b'\n')
+        input_file = input_file.replace(b'\r', b'\n')
         # remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
         if self.__replace_illegals:
             input_file = clean_ascii_chars(input_file)
diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py
index 0f18d5ff9b..30dc0545ee 100755
--- a/src/calibre/ebooks/rtf2xml/process_tokens.py
+++ b/src/calibre/ebooks/rtf2xml/process_tokens.py
@@ -43,8 +43,8 @@ class ProcessTokens:
         self.__bug_handler = bug_handler
 
     def compile_expressions(self):
-        self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
-        self.__utf_exp = re.compile(r'(&.*?;)')
+        self.__num_exp = re.compile(br"([a-zA-Z]+)(.*)")
+        self.__utf_exp = re.compile(br'(&.*?;)')
 
     def initiate_token_dict(self):
         self.__return_code = 0
@@ -762,10 +762,10 @@ class ProcessTokens:
     def process_cw(self, token):
         """Change the value of the control word by determining what dictionary
         it belongs to"""
-        special = ['*', ':', '}', '{', '~', '_', '-', ';']
+        special = [b'*', b':', b'}', b'{', b'~', b'_', b'-', b';']
         # if token != "{" or token != "}":
         token = token[1:]  # strip off leading \
-        token = token.replace(" ", "")
+        token = token.replace(b" ", b"")
         # if not token: return
         only_alpha = token.isalpha()
         num = None
@@ -784,24 +784,24 @@ class ProcessTokens:
     def process_tokens(self):
         """Main method for handling other methods.
""" line_count = 0 - with open(self.__file, 'r') as read_obj: + with open(self.__file, 'rb') as read_obj: with open(self.__write_to, 'wb') as write_obj: for line in read_obj: - token = line.replace("\n","") + token = line.replace(b"\n",b"") line_count += 1 - if line_count == 1 and token != '\\{': + if line_count == 1 and token != b'\\{': msg = '\nInvalid RTF: document doesn\'t start with {\n' raise self.__exception_handler(msg) - elif line_count == 2 and token[0:4] != '\\rtf': + elif line_count == 2 and token[0:4] != b'\\rtf': msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n' raise self.__exception_handler(msg) - the_index = token.find('\\ ') + the_index = token.find(b'\\ ') if token is not None and the_index > -1: msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\ % line_count raise self.__exception_handler(msg) - elif token[:1] == "\\": + elif token[:1] == b"\\": try: token.decode('us-ascii') except UnicodeError as msg: @@ -816,10 +816,10 @@ class ProcessTokens: for field in fields: if not field: continue - if field[0:1] == '&': - write_obj.write('tx\n\\g<2>", input_file) + input_file = self.__par_exp.sub(r'\n\\par \n', input_file) + input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file) input_file = self.__cs_ast.sub(r"\g<1>", input_file) - input_file = self.__ms_hex_exp.sub("\\mshex0\\g<1> ", input_file) - input_file = self.__utf_ud.sub("\\{\\uc0 \\g<1>\\}", input_file) + input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file) + input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file) # remove \n in bin data input_file = self.__bin_exp.sub(lambda x: x.group().replace('\n', '') + '\n', input_file) @@ -188,7 +188,7 @@ class Tokenize: # write with open(self.__write_to, 'wb') as write_obj: - write_obj.write('\n'.join(tokens)) + write_obj.write('\n'.join(tokens).encode('utf-8')) # Move and copy copy_obj = copy.Copy(bug_handler=self.__bug_handler) if self.__copy: