From c6e0698c36ef5e848beaf076cbc3265ccd128734 Mon Sep 17 00:00:00 2001
From: Eli Schwartz
Date: Mon, 20 May 2019 00:49:28 -0400
Subject: [PATCH] py3: partial work towards making rtf2xml actually work

---
 src/calibre/ebooks/rtf2xml/ParseRtf.py       |  2 +-
 src/calibre/ebooks/rtf2xml/line_endings.py   |  6 +++---
 src/calibre/ebooks/rtf2xml/process_tokens.py | 26 +++++++++++++-------------
 src/calibre/ebooks/rtf2xml/tokenize.py       | 12 ++++++------
 4 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py
index 8321f5cccd..a3d52a854c 100755
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@@ -562,7 +562,7 @@ class ParseRtf:
     def __make_temp_file(self,file):
         """Make a temporary file to parse"""
         write_file="rtf_write_file"
-        read_obj = file if hasattr(file, 'read') else open(file,'r')
+        read_obj = file if hasattr(file, 'read') else open(file,'rb')
         with open(write_file, 'wb') as write_obj:
             for line in read_obj:
                 write_obj.write(line)
diff --git a/src/calibre/ebooks/rtf2xml/line_endings.py b/src/calibre/ebooks/rtf2xml/line_endings.py
index 3e2b8156e8..5dbc59a995 100755
--- a/src/calibre/ebooks/rtf2xml/line_endings.py
+++ b/src/calibre/ebooks/rtf2xml/line_endings.py
@@ -36,11 +36,11 @@ class FixLineEndings:
 
     def fix_endings(self):
         # read
-        with open(self.__file, 'r') as read_obj:
+        with open(self.__file, 'rb') as read_obj:
             input_file = read_obj.read()
         # calibre go from win and mac to unix
-        input_file = input_file.replace('\r\n', '\n')
-        input_file = input_file.replace('\r', '\n')
+        input_file = input_file.replace(b'\r\n', b'\n')
+        input_file = input_file.replace(b'\r', b'\n')
         # remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
         if self.__replace_illegals:
             input_file = clean_ascii_chars(input_file)
diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py
index 0f18d5ff9b..30dc0545ee 100755
--- a/src/calibre/ebooks/rtf2xml/process_tokens.py
+++ b/src/calibre/ebooks/rtf2xml/process_tokens.py
@@ -43,8 +43,8 @@ class ProcessTokens:
         self.__bug_handler = bug_handler
 
     def compile_expressions(self):
-        self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
-        self.__utf_exp = re.compile(r'(&.*?;)')
+        self.__num_exp = re.compile(br"([a-zA-Z]+)(.*)")
+        self.__utf_exp = re.compile(br'(&.*?;)')
 
     def initiate_token_dict(self):
         self.__return_code = 0
@@ -762,10 +762,10 @@ class ProcessTokens:
     def process_cw(self, token):
         """Change the value of the control word by determining what dictionary
         it belongs to"""
-        special = ['*', ':', '}', '{', '~', '_', '-', ';']
+        special = [b'*', b':', b'}', b'{', b'~', b'_', b'-', b';']
         # if token != "{" or token != "}":
         token = token[1:]  # strip off leading \
-        token = token.replace(" ", "")
+        token = token.replace(b" ", b"")
         # if not token: return
         only_alpha = token.isalpha()
         num = None
@@ -784,24 +784,24 @@ class ProcessTokens:
     def process_tokens(self):
         """Main method for handling other methods.
""" line_count = 0 - with open(self.__file, 'r') as read_obj: + with open(self.__file, 'rb') as read_obj: with open(self.__write_to, 'wb') as write_obj: for line in read_obj: - token = line.replace("\n","") + token = line.replace(b"\n",b"") line_count += 1 - if line_count == 1 and token != '\\{': + if line_count == 1 and token != b'\\{': msg = '\nInvalid RTF: document doesn\'t start with {\n' raise self.__exception_handler(msg) - elif line_count == 2 and token[0:4] != '\\rtf': + elif line_count == 2 and token[0:4] != b'\\rtf': msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n' raise self.__exception_handler(msg) - the_index = token.find('\\ ') + the_index = token.find(b'\\ ') if token is not None and the_index > -1: msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\ % line_count raise self.__exception_handler(msg) - elif token[:1] == "\\": + elif token[:1] == b"\\": try: token.decode('us-ascii') except UnicodeError as msg: @@ -816,10 +816,10 @@ class ProcessTokens: for field in fields: if not field: continue - if field[0:1] == '&': - write_obj.write('tx\n\\g<2>", input_file) + input_file = self.__par_exp.sub(r'\n\\par \n', input_file) + input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file) input_file = self.__cs_ast.sub(r"\g<1>", input_file) - input_file = self.__ms_hex_exp.sub("\\mshex0\\g<1> ", input_file) - input_file = self.__utf_ud.sub("\\{\\uc0 \\g<1>\\}", input_file) + input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file) + input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file) # remove \n in bin data input_file = self.__bin_exp.sub(lambda x: x.group().replace('\n', '') + '\n', input_file) @@ -188,7 +188,7 @@ class Tokenize: # write with open(self.__write_to, 'wb') as write_obj: - write_obj.write('\n'.join(tokens)) + write_obj.write('\n'.join(tokens).encode('utf-8')) # Move and copy copy_obj = copy.Copy(bug_handler=self.__bug_handler) if self.__copy: