py3: partial work towards making rtf2xml actually work

2025-07-09 03:04:10 -04:00 · 2019-05-20 00:49:28 -04:00 · 2019-05-20 00:49:28 -04:00 · c6e0698c36
commit c6e0698c36
parent a8a74b7c53
4 changed files with 23 additions and 23 deletions
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@@ -562,7 +562,7 @@ class ParseRtf:
    def __make_temp_file(self,file):
        """Make a temporary file to parse"""
        write_file="rtf_write_file"
-        read_obj = file if hasattr(file, 'read') else open(file,'r')
+        read_obj = file if hasattr(file, 'read') else open(file,'rb')
        with open(write_file, 'wb') as write_obj:
            for line in read_obj:
                write_obj.write(line)
--- a/src/calibre/ebooks/rtf2xml/line_endings.py
+++ b/src/calibre/ebooks/rtf2xml/line_endings.py
@@ -36,11 +36,11 @@ class FixLineEndings:

    def fix_endings(self):
        # read
-        with open(self.__file, 'r') as read_obj:
+        with open(self.__file, 'rb') as read_obj:
            input_file = read_obj.read()
        # calibre go from win and mac to unix
-        input_file = input_file.replace('\r\n', '\n')
-        input_file = input_file.replace('\r', '\n')
+        input_file = input_file.replace(b'\r\n', b'\n')
+        input_file = input_file.replace(b'\r', b'\n')
        # remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
        if self.__replace_illegals:
            input_file = clean_ascii_chars(input_file)
--- a/src/calibre/ebooks/rtf2xml/process_tokens.py
+++ b/src/calibre/ebooks/rtf2xml/process_tokens.py
@@ -43,8 +43,8 @@ class ProcessTokens:
        self.__bug_handler = bug_handler

    def compile_expressions(self):
-        self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
-        self.__utf_exp = re.compile(r'(&.*?;)')
+        self.__num_exp = re.compile(br"([a-zA-Z]+)(.*)")
+        self.__utf_exp = re.compile(br'(&.*?;)')

    def initiate_token_dict(self):
        self.__return_code = 0
@@ -762,10 +762,10 @@ class ProcessTokens:
    def process_cw(self, token):
        """Change the value of the control word by determining what dictionary
        it belongs to"""
-        special = ['*', ':', '}', '{', '~', '_', '-', ';']
+        special = [b'*', b':', b'}', b'{', b'~', b'_', b'-', b';']
        # if token != "{" or token != "}":
        token = token[1:]  # strip off leading \
-        token = token.replace(" ", "")
+        token = token.replace(b" ", b"")
        # if not token: return
        only_alpha = token.isalpha()
        num = None
@@ -784,24 +784,24 @@ class ProcessTokens:
    def process_tokens(self):
        """Main method for handling other methods. """
        line_count = 0
-        with open(self.__file, 'r') as read_obj:
+        with open(self.__file, 'rb') as read_obj:
            with open(self.__write_to, 'wb') as write_obj:
                for line in read_obj:
-                    token = line.replace("\n","")
+                    token = line.replace(b"\n",b"")
                    line_count += 1
-                    if line_count == 1 and token != '\\{':
+                    if line_count == 1 and token != b'\\{':
                        msg = '\nInvalid RTF: document doesn\'t start with {\n'
                        raise self.__exception_handler(msg)
-                    elif line_count == 2 and token[0:4] != '\\rtf':
+                    elif line_count == 2 and token[0:4] != b'\\rtf':
                        msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
                        raise self.__exception_handler(msg)

-                    the_index = token.find('\\ ')
+                    the_index = token.find(b'\\ ')
                    if token is not None and the_index > -1:
                        msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
                            % line_count
                        raise self.__exception_handler(msg)
-                    elif token[:1] == "\\":
+                    elif token[:1] == b"\\":
                        try:
                            token.decode('us-ascii')
                        except UnicodeError as msg:
@@ -816,10 +816,10 @@ class ProcessTokens:
                        for field in fields:
                            if not field:
                                continue
-                            if field[0:1] == '&':
-                                write_obj.write('tx<ut<__________<%s\n' % field)
+                            if field[0:1] == b'&':
+                                write_obj.write(b'tx<ut<__________<%s\n' % field)
                            else:
-                                write_obj.write('tx<nu<__________<%s\n' % field)
+                                write_obj.write(b'tx<nu<__________<%s\n' % field)

        if not line_count:
            msg = '\nInvalid RTF: file appears to be empty.\n'
--- a/src/calibre/ebooks/rtf2xml/tokenize.py
+++ b/src/calibre/ebooks/rtf2xml/tokenize.py
@@ -94,7 +94,7 @@ class Tokenize:
            uni_len = len(match_obj.group(0))
            if uni_char < 0:
                uni_char += 65536
-            uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace')
+            uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace').decode('ascii')
            self.__uc_char = self.__uc_value[-1]
            # there is only an unicode char
            if len(token)<= uni_len:
@@ -113,11 +113,11 @@ class Tokenize:
    def __sub_reg_split(self,input_file):
        input_file = self.__replace_spchar.mreplace(input_file)
        # this is for older RTF
-        input_file = self.__par_exp.sub('\n\\par \n', input_file)
-        input_file = self.__cwdigit_exp.sub("\\g<1>\n\\g<2>", input_file)
+        input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
+        input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
        input_file = self.__cs_ast.sub(r"\g<1>", input_file)
-        input_file = self.__ms_hex_exp.sub("\\mshex0\\g<1> ", input_file)
-        input_file = self.__utf_ud.sub("\\{\\uc0 \\g<1>\\}", input_file)
+        input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
+        input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
        # remove \n in bin data
        input_file = self.__bin_exp.sub(lambda x:
                                        x.group().replace('\n', '') + '\n', input_file)
@@ -188,7 +188,7 @@ class Tokenize:

        # write
        with open(self.__write_to, 'wb') as write_obj:
-            write_obj.write('\n'.join(tokens))
+            write_obj.write('\n'.join(tokens).encode('utf-8'))
        # Move and copy
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy: