mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
py3: partial work towards making rtf2xml actually work
This commit is contained in:
parent
a8a74b7c53
commit
c6e0698c36
@ -562,7 +562,7 @@ class ParseRtf:
|
||||
def __make_temp_file(self,file):
|
||||
"""Make a temporary file to parse"""
|
||||
write_file="rtf_write_file"
|
||||
read_obj = file if hasattr(file, 'read') else open(file,'r')
|
||||
read_obj = file if hasattr(file, 'read') else open(file,'rb')
|
||||
with open(write_file, 'wb') as write_obj:
|
||||
for line in read_obj:
|
||||
write_obj.write(line)
|
||||
|
@ -36,11 +36,11 @@ class FixLineEndings:
|
||||
|
||||
def fix_endings(self):
|
||||
# read
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__file, 'rb') as read_obj:
|
||||
input_file = read_obj.read()
|
||||
# calibre: convert Windows and Mac line endings to Unix
|
||||
input_file = input_file.replace('\r\n', '\n')
|
||||
input_file = input_file.replace('\r', '\n')
|
||||
input_file = input_file.replace(b'\r\n', b'\n')
|
||||
input_file = input_file.replace(b'\r', b'\n')
|
||||
# remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
|
||||
if self.__replace_illegals:
|
||||
input_file = clean_ascii_chars(input_file)
|
||||
|
@ -43,8 +43,8 @@ class ProcessTokens:
|
||||
self.__bug_handler = bug_handler
|
||||
|
||||
def compile_expressions(self):
|
||||
self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
|
||||
self.__utf_exp = re.compile(r'(&.*?;)')
|
||||
self.__num_exp = re.compile(br"([a-zA-Z]+)(.*)")
|
||||
self.__utf_exp = re.compile(br'(&.*?;)')
|
||||
|
||||
def initiate_token_dict(self):
|
||||
self.__return_code = 0
|
||||
@ -762,10 +762,10 @@ class ProcessTokens:
|
||||
def process_cw(self, token):
|
||||
"""Change the value of the control word by determining what dictionary
|
||||
it belongs to"""
|
||||
special = ['*', ':', '}', '{', '~', '_', '-', ';']
|
||||
special = [b'*', b':', b'}', b'{', b'~', b'_', b'-', b';']
|
||||
# if token != "{" or token != "}":
|
||||
token = token[1:] # strip off leading \
|
||||
token = token.replace(" ", "")
|
||||
token = token.replace(b" ", b"")
|
||||
# if not token: return
|
||||
only_alpha = token.isalpha()
|
||||
num = None
|
||||
@ -784,24 +784,24 @@ class ProcessTokens:
|
||||
def process_tokens(self):
|
||||
"""Main method for handling other methods. """
|
||||
line_count = 0
|
||||
with open(self.__file, 'r') as read_obj:
|
||||
with open(self.__file, 'rb') as read_obj:
|
||||
with open(self.__write_to, 'wb') as write_obj:
|
||||
for line in read_obj:
|
||||
token = line.replace("\n","")
|
||||
token = line.replace(b"\n",b"")
|
||||
line_count += 1
|
||||
if line_count == 1 and token != '\\{':
|
||||
if line_count == 1 and token != b'\\{':
|
||||
msg = '\nInvalid RTF: document doesn\'t start with {\n'
|
||||
raise self.__exception_handler(msg)
|
||||
elif line_count == 2 and token[0:4] != '\\rtf':
|
||||
elif line_count == 2 and token[0:4] != b'\\rtf':
|
||||
msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
|
||||
raise self.__exception_handler(msg)
|
||||
|
||||
the_index = token.find('\\ ')
|
||||
the_index = token.find(b'\\ ')
|
||||
if token is not None and the_index > -1:
|
||||
msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
|
||||
% line_count
|
||||
raise self.__exception_handler(msg)
|
||||
elif token[:1] == "\\":
|
||||
elif token[:1] == b"\\":
|
||||
try:
|
||||
token.decode('us-ascii')
|
||||
except UnicodeError as msg:
|
||||
@ -816,10 +816,10 @@ class ProcessTokens:
|
||||
for field in fields:
|
||||
if not field:
|
||||
continue
|
||||
if field[0:1] == '&':
|
||||
write_obj.write('tx<ut<__________<%s\n' % field)
|
||||
if field[0:1] == b'&':
|
||||
write_obj.write(b'tx<ut<__________<%s\n' % field)
|
||||
else:
|
||||
write_obj.write('tx<nu<__________<%s\n' % field)
|
||||
write_obj.write(b'tx<nu<__________<%s\n' % field)
|
||||
|
||||
if not line_count:
|
||||
msg = '\nInvalid RTF: file appears to be empty.\n'
|
||||
|
@ -94,7 +94,7 @@ class Tokenize:
|
||||
uni_len = len(match_obj.group(0))
|
||||
if uni_char < 0:
|
||||
uni_char += 65536
|
||||
uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace')
|
||||
uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace').decode('ascii')
|
||||
self.__uc_char = self.__uc_value[-1]
|
||||
# there is only a unicode char
|
||||
if len(token)<= uni_len:
|
||||
@ -113,11 +113,11 @@ class Tokenize:
|
||||
def __sub_reg_split(self,input_file):
|
||||
input_file = self.__replace_spchar.mreplace(input_file)
|
||||
# this is for older RTF
|
||||
input_file = self.__par_exp.sub('\n\\par \n', input_file)
|
||||
input_file = self.__cwdigit_exp.sub("\\g<1>\n\\g<2>", input_file)
|
||||
input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
|
||||
input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
|
||||
input_file = self.__cs_ast.sub(r"\g<1>", input_file)
|
||||
input_file = self.__ms_hex_exp.sub("\\mshex0\\g<1> ", input_file)
|
||||
input_file = self.__utf_ud.sub("\\{\\uc0 \\g<1>\\}", input_file)
|
||||
input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
|
||||
input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
|
||||
# remove \n in bin data
|
||||
input_file = self.__bin_exp.sub(lambda x:
|
||||
x.group().replace('\n', '') + '\n', input_file)
|
||||
@ -188,7 +188,7 @@ class Tokenize:
|
||||
|
||||
# write
|
||||
with open(self.__write_to, 'wb') as write_obj:
|
||||
write_obj.write('\n'.join(tokens))
|
||||
write_obj.write('\n'.join(tokens).encode('utf-8'))
|
||||
# Move and copy
|
||||
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
|
||||
if self.__copy:
|
||||
|
Loading…
x
Reference in New Issue
Block a user