From ab7e134a40bc62d3a4d6213b01bd84863b2e204e Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Fri, 17 May 2019 17:11:36 -0400 Subject: [PATCH 01/14] py3: make pdb output work --- src/calibre/ebooks/pdb/header.py | 6 +++--- src/calibre/ebooks/pdb/palmdoc/writer.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index 86ae4d3bcc..eb2a786dfd 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -67,8 +67,8 @@ class PdbHeaderReader(object): class PdbHeaderBuilder(object): def __init__(self, identity, title): - self.identity = identity.ljust(3, '\x00')[:8] - self.title = '%s\x00' % re.sub('[^-A-Za-z0-9 ]+', '_', title).ljust(31, '\x00')[:31].encode('ascii', 'replace') + self.identity = identity.ljust(3, '\x00')[:8].encode('utf-8') + self.title = b'%s\x00' % re.sub('[^-A-Za-z0-9 ]+', '_', title).ljust(31, '\x00')[:31].encode('ascii', 'replace') def build_header(self, section_lengths, out_stream): ''' @@ -85,4 +85,4 @@ class PdbHeaderBuilder(object): for id, record in enumerate(section_lengths): out_stream.write(struct.pack('>LBBBB', long_type(offset), 0, 0, 0, 0)) offset += record - out_stream.write('\x00\x00') + out_stream.write(b'\x00\x00') diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index 390329b124..13d69b451f 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import division ''' Writer content to palmdoc pdb file. 
@@ -57,13 +58,13 @@ class Writer(FormatWriter): txt_length = len(txt) txt_records = [] - for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1): + for i in range(0, (len(txt) // MAX_RECORD_SIZE) + 1): txt_records.append(txt[i * MAX_RECORD_SIZE: (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]) return txt_records, txt_length def _header_record(self, txt_length, record_count): - record = '' + record = b'' record += struct.pack('>H', 2) # [0:2], PalmDoc compression. (1 = No compression). record += struct.pack('>H', 0) # [2:4], Always 0. @@ -73,4 +74,3 @@ class Writer(FormatWriter): record += struct.pack('>L', 0) # [12-16], Current reading position, as an offset into the uncompressed text. return record - From 00ed9305cbe53d3b933bd65b5a80e8806f511f54 Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Fri, 17 May 2019 19:12:56 -0400 Subject: [PATCH 02/14] py3: fix mobi ncx get_id_tag/get_id_tag_by_pos_fid are internal functions that always return bytes, but when using the former proxied through the latter, we did not always decode the result in the process of generating an OPF. As a result, books would end up with nav links pointing to urls resembling "foo.html#b'anchor'". Fix by moving down the decode attempt to cover both, right before writing it back into the index_entries. 
--- src/calibre/ebooks/mobi/reader/mobi8.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index 6fb58c62e7..452bdb7d63 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -391,7 +391,7 @@ class Mobi8Reader(object): fi = self.get_file_info(pos) if fi.filename is None: raise ValueError('Index entry has invalid pos: %d'%pos) - idtag = self.get_id_tag(pos).decode(self.header.codec) + idtag = self.get_id_tag(pos) href = '%s/%s'%(fi.type, fi.filename) else: try: @@ -403,7 +403,7 @@ class Mobi8Reader(object): continue entry['href'] = href - entry['idtag'] = idtag + entry['idtag'] = idtag.decode(self.header.codec) for e in remove: index_entries.remove(e) From 047d539e0a580b858d85c52171960ea93c9a94b0 Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Sun, 19 May 2019 14:22:35 -0400 Subject: [PATCH 03/14] py3: os.walk explicitly returns bytes when given a bytes argument And therefore when replacing paths in it, we need to use bytes as well. 
--- src/calibre/ebooks/oeb/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index d17c73c6d6..98b6ef5c7b 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -605,7 +605,7 @@ class DirContainer(object): for root, dirs, files in os.walk(base): for fname in files: fname = os.path.join(root, fname) - fname = fname.replace('\\', '/') + fname = fname.replace(b'\\', b'/') if not isinstance(fname, unicode_type): try: fname = fname.decode(filesystem_encoding) From 4bbc8df3abc2ce06d03633bd40e84c3292588211 Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Sun, 19 May 2019 14:26:13 -0400 Subject: [PATCH 04/14] py3: make pdb input work --- src/calibre/ebooks/pdb/header.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index eb2a786dfd..efd0a1c3aa 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -24,7 +24,7 @@ class PdbHeaderReader(object): def identity(self): self.stream.seek(60) ident = self.stream.read(8) - return ident + return ident.decode('utf-8') def section_count(self): self.stream.seek(76) From 8e368c0d465fa632dfa50dbbae6ac245b2392da9 Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Sun, 19 May 2019 14:30:12 -0400 Subject: [PATCH 05/14] py3: more fixes for snb output The string.split() function was removed in python3; use the native split() method on the original string itself. 
--- src/calibre/ebooks/conversion/plugins/snb_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/plugins/snb_output.py b/src/calibre/ebooks/conversion/plugins/snb_output.py index 00d0b0dc34..6f4de34c79 100644 --- a/src/calibre/ebooks/conversion/plugins/snb_output.py +++ b/src/calibre/ebooks/conversion/plugins/snb_output.py @@ -142,7 +142,7 @@ class SNBOutput(OutputFormatPlugin): for tocitem in oeb_book.toc: if tocitem.href.find('#') != -1: - item = string.split(tocitem.href, '#') + item = tocitem.href.split('#') if len(item) != 2: log.error('Error in TOC item: %s' % tocitem) else: From 1ed017fabd7fc25bad5219bdd919249016b5a18c Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Sun, 19 May 2019 14:33:52 -0400 Subject: [PATCH 06/14] py3: make pmlz output work In python3, the re module is more picky about what arguments are used with it, and invalid escapes do not fall back on being treated as string literals, but raise an error. Use raw strings to ensure that the escaped backslashes are preserved all the way to the regular expressions themselves. --- src/calibre/ebooks/pml/pmlml.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index f6f737275e..854bd7fa5c 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -174,8 +174,8 @@ class PMLMLizer(object): return text def prepare_text(self, text): - # Replace empty paragraphs with \c pml codes used to denote emtpy lines. - text = re.sub(unicode_type(r'(?<=

)\s*]*>[\xc2\xa0\s]*

'), '\\c\n\\c', text) + # Replace empty paragraphs with \c pml codes used to denote empty lines. + text = re.sub(unicode_type(r'(?<=

)\s*]*>[\xc2\xa0\s]*

'), r'\\c\n\\c', text) return text def clean_text(self, text): @@ -207,7 +207,7 @@ class PMLMLizer(object): text = re.sub('[ ]{2,}', ' ', text) # Condense excessive \c empty line sequences. - text = re.sub('(\\c\\s*\\c\\s*){2,}', '\\c \n\\c\n', text) + text = re.sub(r'(\\c\\s*\\c\\s*){2,}', r'\\c \n\\c\n', text) # Remove excessive newlines. text = re.sub('\n[ ]+\n', '\n\n', text) From 65c0ca944b86fe8f0cc898a1e2b0b5ad0c176cd3 Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Sun, 19 May 2019 14:56:07 -0400 Subject: [PATCH 07/14] py3: make pml input work Opening a file in binary mode in python3 returns a BufferedReader which does not have an encoding attribute. Assume in this case it is functionally equivalent to None. I'm not sure when pml_stream.encoding will ever equal anything, since even on python2 the attribute exists but is None when opening files in binary mode... which we explicitly do. So I'm not sure why this ever checks the existing encoding. Possibly when the input plugin is given a file opened in text mode, not raw mode? In that case, it may be wrong to always decode it when reading. 
--- src/calibre/ebooks/conversion/plugins/pml_input.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/plugins/pml_input.py b/src/calibre/ebooks/conversion/plugins/pml_input.py index 3388bdefd3..8bdb773358 100644 --- a/src/calibre/ebooks/conversion/plugins/pml_input.py +++ b/src/calibre/ebooks/conversion/plugins/pml_input.py @@ -41,7 +41,9 @@ class PMLInput(InputFormatPlugin): else: html_stream = html_path - ienc = pml_stream.encoding if pml_stream.encoding else 'cp1252' + ienc = getattr(pml_stream, 'encoding', None) + if ienc is None: + ienc = 'cp1252' if self.options.input_encoding: ienc = self.options.input_encoding From f75ea236e5656d016a9f369153b2386eff4c2001 Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Sun, 19 May 2019 17:04:32 -0400 Subject: [PATCH 08/14] py3: make templite templates use python3-compatible syntax --- resources/templates/html_export_default.tmpl | 24 +++++++++---------- .../templates/html_export_default_index.tmpl | 18 +++++++------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/resources/templates/html_export_default.tmpl b/resources/templates/html_export_default.tmpl index c3ed921255..7aac247e59 100644 --- a/resources/templates/html_export_default.tmpl +++ b/resources/templates/html_export_default.tmpl @@ -14,16 +14,16 @@ ${head_content}$ ${for title in meta.titles():}$ ${if pos1:}$

- ${print title}$ + ${print(title)}$

${:else:}$ -
${print title}$
+
${print(title)}$
${:endif}$ ${pos1=0}$ ${:endfor}$
- ${print ', '.join(meta.creators())}$ + ${print(', '.join(meta.creators()))}$
@@ -33,13 +33,13 @@ ${head_content}$ ${if prevLink or nextLink:}$ ${:endif}$ @@ -49,22 +49,22 @@ ${head_content}$ ${if has_toc:}$ ${:endif}$ diff --git a/resources/templates/html_export_default_index.tmpl b/resources/templates/html_export_default_index.tmpl index 4a9e8ab6f3..f0665ad275 100644 --- a/resources/templates/html_export_default_index.tmpl +++ b/resources/templates/html_export_default_index.tmpl @@ -6,10 +6,10 @@ -${print ', '.join(meta.creators()),}$ - ${print meta.titles().next(); meta.titles().close()}$ +${print(', '.join(meta.creators()))}$ - ${print(next(meta.titles())); print(meta.titles().close())}$ ${for item in meta:}$ - + ${:endfor}$ @@ -22,16 +22,16 @@ ${:endfor}$ ${for title in meta.titles():}$ ${if pos1:}$

- ${print title}$ + ${print(title)}$

${:else:}$ -
${print title}$
+
${print(title)}$
${:endif}$ ${pos1=0}$ ${:endfor}$
- ${print ', '.join(meta.creators()),}$ + ${print(', '.join(meta.creators()))}$
@@ -40,19 +40,19 @@ ${:endfor}$ ${if has_toc:}$
-

${print _('Table of contents'),}$

+

${print(_('Table of contents'))}$

${toc}$
${:else:}$ -

${print _('No table of contents present'),}$

- +

${print(_('No table of contents present'))}$

+ ${:endif}$
${if nextLink:}$ - ${print _('next page'),}$ + ${print(_('next page'))}$ ${:endif}$
From fda2ab002477f11a339abe94cd565959a30cebe9 Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Sun, 19 May 2019 17:05:01 -0400 Subject: [PATCH 09/14] py3: make html output work The templates are initialized as decoded unicode strings, and templite expects to work in unicode strings as well. However, etree.tostring returns a bytestring unless explicitly told to use the 'unicode' encoding ('utf-8' is not enough) --- src/calibre/ebooks/conversion/plugins/html_output.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/plugins/html_output.py b/src/calibre/ebooks/conversion/plugins/html_output.py index 3caa19ef2f..ba2e922267 100644 --- a/src/calibre/ebooks/conversion/plugins/html_output.py +++ b/src/calibre/ebooks/conversion/plugins/html_output.py @@ -79,7 +79,7 @@ class HTMLOutput(OutputFormatPlugin): from lxml import etree root = self.generate_toc(oeb_book, ref_url, output_dir) - return etree.tostring(root, pretty_print=True, encoding='utf-8', + return etree.tostring(root, pretty_print=True, encoding='unicode', xml_declaration=False) def convert(self, oeb_book, output_path, input_plugin, opts, log): @@ -161,14 +161,14 @@ class HTMLOutput(OutputFormatPlugin): # get & clean HTML -data head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0] - head_content = etree.tostring(head, pretty_print=True, encoding='utf-8') + head_content = etree.tostring(head, pretty_print=True, encoding='unicode') head_content = re.sub(r'\<\/?head.*\>', '', head_content) head_content = re.sub(re.compile(r'\', re.M|re.S), '', head_content) head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2>', head_content) # get & clean HTML -data body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0] - ebook_content = etree.tostring(body, pretty_print=True, encoding='utf-8') + ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode') ebook_content = re.sub(r'\<\/?body.*\>', '', 
ebook_content) ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2>', ebook_content) @@ -202,7 +202,7 @@ class HTMLOutput(OutputFormatPlugin): # write html to file with open(path, 'wb') as f: - f.write(t) + f.write(t.encode('utf-8')) item.unload_data_from_memory(memory=path) zfile = zipfile.ZipFile(output_path, "w") From aa43816d090b393e7e11d41670b7a288bef60c4a Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Sun, 19 May 2019 21:53:14 -0400 Subject: [PATCH 10/14] py3: make txtz input work --- src/calibre/ebooks/conversion/plugins/txt_input.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/plugins/txt_input.py b/src/calibre/ebooks/conversion/plugins/txt_input.py index f5def565d2..e9f6fa54fe 100644 --- a/src/calibre/ebooks/conversion/plugins/txt_input.py +++ b/src/calibre/ebooks/conversion/plugins/txt_input.py @@ -138,7 +138,7 @@ class TXTInput(InputFormatPlugin): block_to_single_line, separate_hard_scene_breaks) self.log = log - txt = '' + txt = b'' log.debug('Reading text from file...') length = 0 base_dir = getcwd() @@ -151,7 +151,7 @@ class TXTInput(InputFormatPlugin): for x in walk('.'): if os.path.splitext(x)[1].lower() in ('.txt', '.text'): with open(x, 'rb') as tf: - txt += tf.read() + '\n\n' + txt += tf.read() + b'\n\n' else: if getattr(stream, 'name', None): base_dir = os.path.dirname(stream.name) From 9f65185da35b722781cb197318129da351ef08ab Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Sun, 19 May 2019 21:53:25 -0400 Subject: [PATCH 11/14] py3: make fb2 input work --- src/calibre/ebooks/conversion/plugins/fb2_input.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/plugins/fb2_input.py b/src/calibre/ebooks/conversion/plugins/fb2_input.py index 9992797e3d..d802fe2b2a 100644 --- a/src/calibre/ebooks/conversion/plugins/fb2_input.py +++ b/src/calibre/ebooks/conversion/plugins/fb2_input.py @@ -90,7 +90,8 @@ class FB2Input(InputFormatPlugin): 
css = re.sub(r'name\s*=\s*', 'class=', css) self.extract_embedded_content(doc) log.debug('Converting XML to HTML...') - ss = open(P('templates/fb2.xsl'), 'rb').read() + with open(P('templates/fb2.xsl'), 'rb') as f: + ss = f.read().decode('utf-8') ss = ss.replace("__FB_NS__", fb_ns) if options.no_inline_fb2_toc: log('Disabling generation of inline FB2 TOC') @@ -124,8 +125,10 @@ class FB2Input(InputFormatPlugin): src = img.get('src') img.set('src', self.binary_map.get(src, src)) index = transform.tostring(result) - open(u'index.xhtml', 'wb').write(index) - open(u'inline-styles.css', 'wb').write(css) + with open(u'index.xhtml', 'wb') as f: + f.write(index.encode('utf-8')) + with open(u'inline-styles.css', 'wb') as f: + f.write(css.encode('utf-8')) stream.seek(0) mi = get_metadata(stream, 'fb2') if not mi.title: From 4b0e241555407b0baf3129eab1aae452714f99d6 Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Mon, 20 May 2019 00:39:36 -0400 Subject: [PATCH 12/14] py3: make pdf input work --- src/calibre/ebooks/conversion/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f6230269fa..ccbddb2eaa 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -584,7 +584,7 @@ class HTMLPreProcessor(object): end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*

\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation - (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?)?\s*(

\s*

\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), # noqa + (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\\\IA\u00DF]|(?)?\s*(

\s*

\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), # noqa ) for rule in self.PREPROCESS + start_rules: From a8a74b7c53bf0c900fa19d859e848752ea81be5c Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Mon, 20 May 2019 00:48:23 -0400 Subject: [PATCH 13/14] py3: use proper dict.keys() handling --- src/calibre/ebooks/rtf2xml/paragraph_def.py | 23 +++++++-------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/paragraph_def.py b/src/calibre/ebooks/rtf2xml/paragraph_def.py index 82962fe9ea..0812e15776 100755 --- a/src/calibre/ebooks/rtf2xml/paragraph_def.py +++ b/src/calibre/ebooks/rtf2xml/paragraph_def.py @@ -608,12 +608,10 @@ if another paragraph_def is found, the state changes to collect_tokens. # when determining uniqueness for a style, ingorne these values, since # they don't tell us if the style is unique ignore_values = ['style-num', 'nest-level', 'in-table'] - keys = self.__att_val_dict.keys() - keys.sort() - for key in keys: - if key in ignore_values: + for k, v in self.__att_val_dict.items(): + if k in ignore_values: continue - my_string += '%s:%s' % (key, self.__att_val_dict[key]) + my_string += '%s:%s' % (k, v) if my_string in self.__style_num_strings: num = self.__style_num_strings.index(my_string) num += 1 # since indexing starts at zero, rather than 1 @@ -637,12 +635,9 @@ if another paragraph_def is found, the state changes to collect_tokens. 
the_value = self.__att_val_dict['tabs'] # the_value = the_value[:-1] style_string += ('<%s>%s' % ('tabs', the_value)) - keys = self.__att_val_dict.keys() - keys.sort() - for key in keys: - if key != 'name' and key !='style-num' and key != 'in-table'\ - and key not in tabs_list: - style_string += ('<%s>%s' % (key, self.__att_val_dict[key])) + for k, v in self.__att_val_dict.items(): + if k not in ['name', 'style-num', 'in-table'] + tabs_list: + style_string += ('<%s>%s' % (k, v)) style_string += '\n' self.__body_style_strings.append(style_string) @@ -690,11 +685,9 @@ if another paragraph_def is found, the state changes to collect_tokens. the_value = self.__att_val_dict['tabs'] # the_value = the_value[:-1] self.__write_obj.write('<%s>%s' % ('tabs', the_value)) - keys = self.__att_val_dict.keys() - keys.sort() + keys = sorted(self.__att_val_dict.keys()) for key in keys: - if key != 'name' and key !='style-num' and key != 'in-table'\ - and key not in tabs_list: + if key not in ['name', 'style-num', 'in-table'] + tabs_list: self.__write_obj.write('<%s>%s' % (key, self.__att_val_dict[key])) self.__write_obj.write('\n') self.__write_obj.write(self.__start2_marker) From c6e0698c36ef5e848beaf076cbc3265ccd128734 Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Mon, 20 May 2019 00:49:28 -0400 Subject: [PATCH 14/14] py3: partial work towards making rtf2xml actually work --- src/calibre/ebooks/rtf2xml/ParseRtf.py | 2 +- src/calibre/ebooks/rtf2xml/line_endings.py | 6 ++--- src/calibre/ebooks/rtf2xml/process_tokens.py | 26 ++++++++++---------- src/calibre/ebooks/rtf2xml/tokenize.py | 12 ++++----- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 8321f5cccd..a3d52a854c 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -562,7 +562,7 @@ class ParseRtf: def __make_temp_file(self,file): """Make a temporary file to parse""" 
write_file="rtf_write_file" - read_obj = file if hasattr(file, 'read') else open(file,'r') + read_obj = file if hasattr(file, 'read') else open(file,'rb') with open(write_file, 'wb') as write_obj: for line in read_obj: write_obj.write(line) diff --git a/src/calibre/ebooks/rtf2xml/line_endings.py b/src/calibre/ebooks/rtf2xml/line_endings.py index 3e2b8156e8..5dbc59a995 100755 --- a/src/calibre/ebooks/rtf2xml/line_endings.py +++ b/src/calibre/ebooks/rtf2xml/line_endings.py @@ -36,11 +36,11 @@ class FixLineEndings: def fix_endings(self): # read - with open(self.__file, 'r') as read_obj: + with open(self.__file, 'rb') as read_obj: input_file = read_obj.read() # calibre go from win and mac to unix - input_file = input_file.replace('\r\n', '\n') - input_file = input_file.replace('\r', '\n') + input_file = input_file.replace(b'\r\n', b'\n') + input_file = input_file.replace(b'\r', b'\n') # remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27 if self.__replace_illegals: input_file = clean_ascii_chars(input_file) diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 0f18d5ff9b..30dc0545ee 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -43,8 +43,8 @@ class ProcessTokens: self.__bug_handler = bug_handler def compile_expressions(self): - self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)") - self.__utf_exp = re.compile(r'(&.*?;)') + self.__num_exp = re.compile(br"([a-zA-Z]+)(.*)") + self.__utf_exp = re.compile(br'(&.*?;)') def initiate_token_dict(self): self.__return_code = 0 @@ -762,10 +762,10 @@ class ProcessTokens: def process_cw(self, token): """Change the value of the control word by determining what dictionary it belongs to""" - special = ['*', ':', '}', '{', '~', '_', '-', ';'] + special = [b'*', b':', b'}', b'{', b'~', b'_', b'-', b';'] # if token != "{" or token != "}": token = token[1:] # strip off leading \ - token = token.replace(" ", "") + 
token = token.replace(b" ", b"") # if not token: return only_alpha = token.isalpha() num = None @@ -784,24 +784,24 @@ class ProcessTokens: def process_tokens(self): """Main method for handling other methods. """ line_count = 0 - with open(self.__file, 'r') as read_obj: + with open(self.__file, 'rb') as read_obj: with open(self.__write_to, 'wb') as write_obj: for line in read_obj: - token = line.replace("\n","") + token = line.replace(b"\n",b"") line_count += 1 - if line_count == 1 and token != '\\{': + if line_count == 1 and token != b'\\{': msg = '\nInvalid RTF: document doesn\'t start with {\n' raise self.__exception_handler(msg) - elif line_count == 2 and token[0:4] != '\\rtf': + elif line_count == 2 and token[0:4] != b'\\rtf': msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n' raise self.__exception_handler(msg) - the_index = token.find('\\ ') + the_index = token.find(b'\\ ') if token is not None and the_index > -1: msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\ % line_count raise self.__exception_handler(msg) - elif token[:1] == "\\": + elif token[:1] == b"\\": try: token.decode('us-ascii') except UnicodeError as msg: @@ -816,10 +816,10 @@ class ProcessTokens: for field in fields: if not field: continue - if field[0:1] == '&': - write_obj.write('tx\n\\g<2>", input_file) + input_file = self.__par_exp.sub(r'\n\\par \n', input_file) + input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file) input_file = self.__cs_ast.sub(r"\g<1>", input_file) - input_file = self.__ms_hex_exp.sub("\\mshex0\\g<1> ", input_file) - input_file = self.__utf_ud.sub("\\{\\uc0 \\g<1>\\}", input_file) + input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file) + input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file) # remove \n in bin data input_file = self.__bin_exp.sub(lambda x: x.group().replace('\n', '') + '\n', input_file) @@ -188,7 +188,7 @@ class Tokenize: # write with open(self.__write_to, 'wb') as write_obj: - 
write_obj.write('\n'.join(tokens)) + write_obj.write('\n'.join(tokens).encode('utf-8')) # Move and copy copy_obj = copy.Copy(bug_handler=self.__bug_handler) if self.__copy: