From 8311b262050e5105d1c7e35ff4509e437f9fc7ea Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Wed, 1 May 2019 10:54:40 -0400 Subject: [PATCH 1/2] wip: tcr compression regex flags do not need "u" when the search string does not make use of the feature In python3, re.U is the default for unicode (str) patterns. For bytes patterns, which is what we use, it is a fatal error: re raises "ValueError: cannot use UNICODE flag with a bytes pattern". --- src/calibre/ebooks/compression/tcr.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/compression/tcr.py b/src/calibre/ebooks/compression/tcr.py index 565399eb4d..6abd243fbb 100644 --- a/src/calibre/ebooks/compression/tcr.py +++ b/src/calibre/ebooks/compression/tcr.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- - from __future__ import absolute_import, division, print_function, unicode_literals __license__ = 'GPL 3' @@ -35,10 +34,10 @@ class TCRCompressor(object): The intent is to create more unused codes. ''' possible_codes = [] - a_code = set(re.findall(b'(?msu).', self.coded_txt)) + a_code = set(re.findall(b'(?ms).', self.coded_txt)) for code in a_code: - single_code = set(re.findall(b'(?msu)%s.' % re.escape(code), self.coded_txt)) + single_code = set(re.findall(b'(?ms)%s.' % re.escape(code), self.coded_txt)) if len(single_code) == 1: possible_codes.append(single_code.pop()) @@ -60,7 +59,7 @@ class TCRCompressor(object): ''' Create new codes from codes that occur in pairs often. 
''' - possible_new_codes = list(set(re.findall(b'(?msu)..', self.coded_txt))) + possible_new_codes = list(set(re.findall(b'(?ms)..', self.coded_txt))) new_codes_count = [] for c in possible_new_codes: @@ -77,7 +76,7 @@ class TCRCompressor(object): def compress(self, txt): self._reset() - self.codes = list(set(re.findall(b'(?msu).', txt))) + self.codes = list(set(re.findall(b'(?ms).', txt))) # Replace the text with their corresponding code for c in txt: From 28767243257769847fa82f0465f436ad4fdcf30d Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Thu, 12 Sep 2019 19:51:20 -0400 Subject: [PATCH 2/2] py3: make tcr input/output work Fix tcr compression by unifying bytearrays on python2/python3 and acting appropriately. --- src/calibre/ebooks/compression/tcr.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/compression/tcr.py b/src/calibre/ebooks/compression/tcr.py index 6abd243fbb..3080084196 100644 --- a/src/calibre/ebooks/compression/tcr.py +++ b/src/calibre/ebooks/compression/tcr.py @@ -6,7 +6,7 @@ __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' import re -from polyglot.builtins import range, int_to_byte +from polyglot.builtins import int_to_byte, is_py3, range class TCRCompressor(object): @@ -42,8 +42,10 @@ class TCRCompressor(object): possible_codes.append(single_code.pop()) for code in possible_codes: + if not is_py3: + code = bytearray(code) self.coded_txt = self.coded_txt.replace(code, code[0:1]) - self.codes[ord(code[0:1])] = b'%s%s' % (self.codes[ord(code[0:1])], self.codes[ord(code[1:2])]) + self.codes[code[0]] = b'%s%s' % (self.codes[code[0]], self.codes[code[1]]) def _free_unused_codes(self): ''' @@ -79,8 +81,9 @@ class TCRCompressor(object): self.codes = list(set(re.findall(b'(?ms).', txt))) # Replace the text with their corresponding code - for c in txt: - self.coded_txt += int_to_byte(self.codes.index(c)) + # FIXME: python3 is native bytearray, but all we want are 
bytes + for c in bytearray(txt): + self.coded_txt += int_to_byte(self.codes.index(int_to_byte(c))) # Zero the unused codes and record which are unused. for i in range(len(self.codes), 256): @@ -95,9 +98,9 @@ class TCRCompressor(object): unused_code = self.unused_codes.pop() # Take the last possible codes and split it into individual # codes. The last possible code is the most often occurring. - code1, code2 = possible_codes.pop() - self.codes[unused_code] = b'%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)]) - self.coded_txt = self.coded_txt.replace(b'%s%s' % (code1, code2), int_to_byte(unused_code)) + code = possible_codes.pop() + self.codes[unused_code] = b'%s%s' % (self.codes[ord(code[0:1])], self.codes[ord(code[1:2])]) + self.coded_txt = self.coded_txt.replace(code, int_to_byte(unused_code)) self._combine_codes() self._free_unused_codes() possible_codes = self._new_codes()