From: Eli Schwartz
Date: Mon, 20 May 2019 00:39:36 -0400
Subject: [PATCH 12/14] py3: make pdf input work
---
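Reviewer note, not part of the commit: Python 3.6+'s re module rejects unknown
escapes such as \I instead of silently matching a literal character, which is
what killed PDF input here. A minimal sketch of the failure and the fix (the
pattern below is illustrative only, not the full rule from preprocess.py):

    import re

    try:
        # the source text '\\I' puts the two characters \I into the
        # pattern; py2 matched a literal I, py3.6+ raises re.error
        re.compile('[a-z\\IA]')
    except re.error as err:
        print(err)  # bad escape \I

    # quadrupling in the source yields the string \\IA, i.e. a properly
    # escaped backslash inside the character class
    re.compile('[a-z\\\\IA]')  # compiles on both py2 and py3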
src/calibre/ebooks/conversion/preprocess.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index f6230269fa..ccbddb2eaa 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -584,7 +584,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
- (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p[^>]*>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), # noqa
+ (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\\\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p[^>]*>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), # noqa
)
for rule in self.PREPROCESS + start_rules:
From a8a74b7c53bf0c900fa19d859e848752ea81be5c Mon Sep 17 00:00:00 2001
From: Eli Schwartz
Date: Mon, 20 May 2019 00:48:23 -0400
Subject: [PATCH 13/14] py3: use proper dict.keys() handling
---
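Not part of the commit: on py3, dict.keys() returns a view object with no
sort() method, so the old keys = d.keys(); keys.sort() idiom dies with an
AttributeError. Illustrative snippet:

    d = {'style-num': 5, 'bold': 'true', 'in-table': 'false'}

    keys = d.keys()
    # keys.sort()             # py3: AttributeError: 'dict_keys' object
                              # has no attribute 'sort'

    keys = sorted(d.keys())   # works on both py2 and py3
    assert keys == ['bold', 'in-table', 'style-num']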
src/calibre/ebooks/rtf2xml/paragraph_def.py | 23 +++++++--------------
1 file changed, 8 insertions(+), 15 deletions(-)
diff --git a/src/calibre/ebooks/rtf2xml/paragraph_def.py b/src/calibre/ebooks/rtf2xml/paragraph_def.py
index 82962fe9ea..0812e15776 100755
--- a/src/calibre/ebooks/rtf2xml/paragraph_def.py
+++ b/src/calibre/ebooks/rtf2xml/paragraph_def.py
@@ -608,12 +608,10 @@ if another paragraph_def is found, the state changes to collect_tokens.
# when determining uniqueness for a style, ignore these values, since
# they don't tell us if the style is unique
ignore_values = ['style-num', 'nest-level', 'in-table']
- keys = self.__att_val_dict.keys()
- keys.sort()
- for key in keys:
- if key in ignore_values:
+ for k, v in self.__att_val_dict.items():
+ if k in ignore_values:
continue
- my_string += '%s:%s' % (key, self.__att_val_dict[key])
+ my_string += '%s:%s' % (k, v)
if my_string in self.__style_num_strings:
num = self.__style_num_strings.index(my_string)
num += 1 # since indexing starts at zero, rather than 1
@@ -637,12 +635,9 @@ if another paragraph_def is found, the state changes to collect_tokens.
the_value = self.__att_val_dict['tabs']
# the_value = the_value[:-1]
style_string += ('<%s>%s' % ('tabs', the_value))
- keys = self.__att_val_dict.keys()
- keys.sort()
- for key in keys:
- if key != 'name' and key !='style-num' and key != 'in-table'\
- and key not in tabs_list:
- style_string += ('<%s>%s' % (key, self.__att_val_dict[key]))
+ for k, v in self.__att_val_dict.items():
+ if k not in ['name', 'style-num', 'in-table'] + tabs_list:
+ style_string += ('<%s>%s' % (k, v))
style_string += '\n'
self.__body_style_strings.append(style_string)
@@ -690,11 +685,9 @@ if another paragraph_def is found, the state changes to collect_tokens.
the_value = self.__att_val_dict['tabs']
# the_value = the_value[:-1]
self.__write_obj.write('<%s>%s' % ('tabs', the_value))
- keys = self.__att_val_dict.keys()
- keys.sort()
+ keys = sorted(self.__att_val_dict.keys())
for key in keys:
- if key != 'name' and key !='style-num' and key != 'in-table'\
- and key not in tabs_list:
+ if key not in ['name', 'style-num', 'in-table'] + tabs_list:
self.__write_obj.write('<%s>%s' % (key, self.__att_val_dict[key]))
self.__write_obj.write('\n')
self.__write_obj.write(self.__start2_marker)
From c6e0698c36ef5e848beaf076cbc3265ccd128734 Mon Sep 17 00:00:00 2001
From: Eli Schwartz
Date: Mon, 20 May 2019 00:49:28 -0400
Subject: [PATCH 14/14] py3: partial work towards making rtf2xml actually work
---
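Not part of the commit: the common thread in this patch is that py3 separates
text from bytes. open(path, 'r') now decodes to str, the rtf2xml pipeline
shuffles raw bytes, and the two types no longer mix implicitly. A sketch of
the breakage (the sample data is illustrative; 'rtf_write_file' is the temp
file name used below):

    data = b'line one\r\nline two\r'      # what a 'rb' read returns

    # data.replace('\r\n', '\n')          # TypeError: a bytes-like
                                          # object is required, not 'str'
    data = data.replace(b'\r\n', b'\n').replace(b'\r', b'\n')

    with open('rtf_write_file', 'wb') as f:
        f.write(data)                     # 'wb' handles accept only bytes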
src/calibre/ebooks/rtf2xml/ParseRtf.py | 2 +-
src/calibre/ebooks/rtf2xml/line_endings.py | 6 ++---
src/calibre/ebooks/rtf2xml/process_tokens.py | 26 ++++++++++----------
src/calibre/ebooks/rtf2xml/tokenize.py | 12 ++++-----
4 files changed, 23 insertions(+), 23 deletions(-)
diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py
index 8321f5cccd..a3d52a854c 100755
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@@ -562,7 +562,7 @@ class ParseRtf:
def __make_temp_file(self,file):
"""Make a temporary file to parse"""
write_file="rtf_write_file"
- read_obj = file if hasattr(file, 'read') else open(file,'r')
+ read_obj = file if hasattr(file, 'read') else open(file,'rb')
with open(write_file, 'wb') as write_obj:
for line in read_obj:
write_obj.write(line)
diff --git a/src/calibre/ebooks/rtf2xml/line_endings.py b/src/calibre/ebooks/rtf2xml/line_endings.py
index 3e2b8156e8..5dbc59a995 100755
--- a/src/calibre/ebooks/rtf2xml/line_endings.py
+++ b/src/calibre/ebooks/rtf2xml/line_endings.py
@@ -36,11 +36,11 @@ class FixLineEndings:
def fix_endings(self):
# read
- with open(self.__file, 'r') as read_obj:
+ with open(self.__file, 'rb') as read_obj:
input_file = read_obj.read()
# calibre go from win and mac to unix
- input_file = input_file.replace('\r\n', '\n')
- input_file = input_file.replace('\r', '\n')
+ input_file = input_file.replace(b'\r\n', b'\n')
+ input_file = input_file.replace(b'\r', b'\n')
# remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
if self.__replace_illegals:
input_file = clean_ascii_chars(input_file)
diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py
index 0f18d5ff9b..30dc0545ee 100755
--- a/src/calibre/ebooks/rtf2xml/process_tokens.py
+++ b/src/calibre/ebooks/rtf2xml/process_tokens.py
@@ -43,8 +43,8 @@ class ProcessTokens:
self.__bug_handler = bug_handler
def compile_expressions(self):
- self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
- self.__utf_exp = re.compile(r'(&.*?;)')
+ self.__num_exp = re.compile(br"([a-zA-Z]+)(.*)")
+ self.__utf_exp = re.compile(br'(&.*?;)')
def initiate_token_dict(self):
self.__return_code = 0
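Note, not from the patch: byte-pattern regexes are needed here because the
token stream is now bytes, and an str pattern applied to bytes raises at
once. For instance:

    import re

    num_exp = re.compile(br"([a-zA-Z]+)(.*)")
    print(num_exp.match(b'par123').groups())   # (b'par', b'123')

    # re.compile(r"([a-zA-Z]+)(.*)").match(b'par123') raises
    # TypeError: cannot use a string pattern on a bytes-like object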
@@ -762,10 +762,10 @@ class ProcessTokens:
def process_cw(self, token):
"""Change the value of the control word by determining what dictionary
it belongs to"""
- special = ['*', ':', '}', '{', '~', '_', '-', ';']
+ special = [b'*', b':', b'}', b'{', b'~', b'_', b'-', b';']
# if token != "{" or token != "}":
token = token[1:] # strip off leading \
- token = token.replace(" ", "")
+ token = token.replace(b" ", b"")
# if not token: return
only_alpha = token.isalpha()
num = None
@@ -784,24 +784,24 @@ class ProcessTokens:
def process_tokens(self):
"""Main method for handling other methods. """
line_count = 0
- with open(self.__file, 'r') as read_obj:
+ with open(self.__file, 'rb') as read_obj:
with open(self.__write_to, 'wb') as write_obj:
for line in read_obj:
- token = line.replace("\n","")
+ token = line.replace(b"\n",b"")
line_count += 1
- if line_count == 1 and token != '\\{':
+ if line_count == 1 and token != b'\\{':
msg = '\nInvalid RTF: document doesn\'t start with {\n'
raise self.__exception_handler(msg)
- elif line_count == 2 and token[0:4] != '\\rtf':
+ elif line_count == 2 and token[0:4] != b'\\rtf':
msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
raise self.__exception_handler(msg)
- the_index = token.find('\\ ')
+ the_index = token.find(b'\\ ')
if token is not None and the_index > -1:
msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
% line_count
raise self.__exception_handler(msg)
- elif token[:1] == "\\":
+ elif token[:1] == b"\\":
try:
token.decode('us-ascii')
except UnicodeError as msg:
@@ -816,10 +816,10 @@ class ProcessTokens:
for field in fields:
if not field:
continue
- if field[0:1] == '&':
- write_obj.write('tx<ut<__________<%s\n' % field)
+ if field[0:1] == b'&':
+ write_obj.write(b'tx<ut<__________<%s\n' % field)
else:
- write_obj.write('tx<nu<__________<%s\n' % field)
+ write_obj.write(b'tx<nu<__________<%s\n' % field)
diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py
--- a/src/calibre/ebooks/rtf2xml/tokenize.py
+++ b/src/calibre/ebooks/rtf2xml/tokenize.py
@@ -113,11 +113,11 @@ class Tokenize:
- input_file = self.__par_exp.sub('\n\\par \n', input_file)
- input_file = self.__cwdigit_exp.sub("\g<1>\n\\g<2>", input_file)
+ input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
+ input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
input_file = self.__cs_ast.sub(r"\g<1>", input_file)
- input_file = self.__ms_hex_exp.sub("\\mshex0\\g<1> ", input_file)
- input_file = self.__utf_ud.sub("\\{\\uc0 \\g<1>\\}", input_file)
+ input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
+ input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
# remove \n in bin data
input_file = self.__bin_exp.sub(lambda x:
x.group().replace('\n', '') + '\n', input_file)
@@ -188,7 +188,7 @@ class Tokenize:
# write
with open(self.__write_to, 'wb') as write_obj:
- write_obj.write('\n'.join(tokens))
+ write_obj.write('\n'.join(tokens).encode('utf-8'))
# Move and copy
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
if self.__copy:
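Not part of the commit: the tokens joined above are str, and the file was
opened 'wb', so py3 requires an explicit encode before writing. Sketch with
illustrative values ('write_to' stands in for self.__write_to):

    tokens = ['\\rtf1', '\\ansi']          # str on py3
    with open('write_to', 'wb') as f:
        # f.write('\n'.join(tokens))       # TypeError: a bytes-like
        #                                  # object is required, not 'str'
        f.write('\n'.join(tokens).encode('utf-8'))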