From c83cdcf08651179da7f0214b6a09059ceb83f6c2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 1 Jun 2019 15:16:57 +0530 Subject: [PATCH] py3: More unicode porting --- src/calibre/ebooks/compression/tcr.py | 38 +++++++++--------- src/calibre/ebooks/lit/writer.py | 5 +-- src/calibre/ebooks/metadata/mobi.py | 4 +- src/calibre/ebooks/metadata/topaz.py | 55 +++++++++++++++------------ 4 files changed, 54 insertions(+), 48 deletions(-) diff --git a/src/calibre/ebooks/compression/tcr.py b/src/calibre/ebooks/compression/tcr.py index 35b47aaff1..c02da7bee8 100644 --- a/src/calibre/ebooks/compression/tcr.py +++ b/src/calibre/ebooks/compression/tcr.py @@ -1,11 +1,13 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import, division, print_function, unicode_literals + __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' import re -from polyglot.builtins import range +from polyglot.builtins import range, int_to_byte class TCRCompressor(object): @@ -21,7 +23,7 @@ class TCRCompressor(object): def _reset(self): # List of indexes in the codes list that are empty and can hold new codes self.unused_codes = set() - self.coded_txt = '' + self.coded_txt = b'' # Generate initial codes from text. # The index of the list will be the code that represents the characters at that location # in the list @@ -33,16 +35,16 @@ class TCRCompressor(object): The intent is to create more unused codes. ''' possible_codes = [] - a_code = set(re.findall('(?msu).', self.coded_txt)) + a_code = set(re.findall(b'(?msu).', self.coded_txt)) for code in a_code: - single_code = set(re.findall('(?msu)%s.' % re.escape(code), self.coded_txt)) + single_code = set(re.findall(b'(?msu)%s.' % re.escape(code), self.coded_txt)) if len(single_code) == 1: possible_codes.append(single_code.pop()) for code in possible_codes: self.coded_txt = self.coded_txt.replace(code, code[0]) - self.codes[ord(code[0])] = '%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])]) + self.codes[ord(code[0])] = b'%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])]) def _free_unused_codes(self): ''' @@ -51,14 +53,14 @@ class TCRCompressor(object): ''' for i in range(256): if i not in self.unused_codes: - if chr(i) not in self.coded_txt: + if int_to_byte(i) not in self.coded_txt: self.unused_codes.add(i) def _new_codes(self): ''' Create new codes from codes that occur in pairs often. ''' - possible_new_codes = list(set(re.findall('(?msu)..', self.coded_txt))) + possible_new_codes = list(set(re.findall(b'(?msu)..', self.coded_txt))) new_codes_count = [] for c in possible_new_codes: @@ -75,15 +77,15 @@ class TCRCompressor(object): def compress(self, txt): self._reset() - self.codes = list(set(re.findall('(?msu).', txt))) + self.codes = list(set(re.findall(b'(?msu).', txt))) # Replace the text with their corresponding code for c in txt: - self.coded_txt += chr(self.codes.index(c)) + self.coded_txt += int_to_byte(self.codes.index(c)) # Zero the unused codes and record which are unused. for i in range(len(self.codes), 256): - self.codes.append('') + self.codes.append(b'') self.unused_codes.add(i) self._combine_codes() @@ -95,8 +97,8 @@ class TCRCompressor(object): # Take the last possible codes and split it into individual # codes. The last possible code is the most often occurring. code1, code2 = possible_codes.pop() - self.codes[unused_code] = '%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)]) - self.coded_txt = self.coded_txt.replace('%s%s' % (code1, code2), chr(unused_code)) + self.codes[unused_code] = b'%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)]) + self.coded_txt = self.coded_txt.replace(b'%s%s' % (code1, code2), int_to_byte(unused_code)) self._combine_codes() self._free_unused_codes() possible_codes = self._new_codes() @@ -107,18 +109,18 @@ class TCRCompressor(object): code_dict = [] for i in range(0, 256): if i in self.unused_codes: - code_dict.append(chr(0)) + code_dict.append(b'\0') else: - code_dict.append(chr(len(self.codes[i])) + self.codes[i]) + code_dict.append(int_to_byte(len(self.codes[i])) + self.codes[i]) # Join the identifier with the dictionary and coded text. - return '!!8-Bit!!'+''.join(code_dict)+self.coded_txt + return b'!!8-Bit!!'+b''.join(code_dict)+self.coded_txt def decompress(stream): txt = [] stream.seek(0) - if stream.read(9) != '!!8-Bit!!': + if stream.read(9) != b'!!8-Bit!!': raise ValueError('File %s contains an invalid TCR header.' % stream.name) # Codes that the file contents are broken down into. @@ -129,11 +131,11 @@ def decompress(stream): # Map the values in the file to locations in the string list. entry_loc = stream.read(1) - while entry_loc != '': # EOF + while entry_loc != b'': # EOF txt.append(entries[ord(entry_loc)]) entry_loc = stream.read(1) - return ''.join(txt) + return b''.join(txt) def compress(txt): diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py index 68ff33d692..f1191b781e 100644 --- a/src/calibre/ebooks/lit/writer.py +++ b/src/calibre/ebooks/lit/writer.py @@ -11,7 +11,7 @@ from itertools import count, chain from operator import attrgetter import io import time -import random +import os import re import copy import uuid @@ -134,8 +134,7 @@ def decint(value): return bytes(bytearray(reversed(ans))) -def randbytes(n): - return ''.join(chr(random.randint(0, 255)) for x in range(n)) +randbytes = os.urandom def warn(x): diff --git a/src/calibre/ebooks/metadata/mobi.py b/src/calibre/ebooks/metadata/mobi.py index 06f32ecfc3..7c80898901 100644 --- a/src/calibre/ebooks/metadata/mobi.py +++ b/src/calibre/ebooks/metadata/mobi.py @@ -20,7 +20,7 @@ from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.utils.date import now as nowf from calibre.utils.imghdr import what from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 -from polyglot.builtins import unicode_type, range +from polyglot.builtins import unicode_type, range, codepoint_to_chr def is_image(ss): @@ -281,7 +281,7 @@ class MetadataUpdater(object): def hexdump(self, src, length=16): # Diagnostic - FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)]) + FILTER=''.join([(len(repr(codepoint_to_chr(x)))==3) and codepoint_to_chr(x) or '.' for x in range(256)]) N=0 result='' while src: diff --git a/src/calibre/ebooks/metadata/topaz.py b/src/calibre/ebooks/metadata/topaz.py index 12bb199feb..364df5bba7 100644 --- a/src/calibre/ebooks/metadata/topaz.py +++ b/src/calibre/ebooks/metadata/topaz.py @@ -1,5 +1,5 @@ -from __future__ import with_statement -from __future__ import print_function +from __future__ import absolute_import, division, print_function, unicode_literals + __license__ = 'GPL 3' __copyright__ = '2010, Greg Riker ' __docformat__ = 'restructuredtext en' @@ -10,6 +10,12 @@ from struct import pack from calibre.ebooks.metadata import MetaInformation from calibre import force_unicode +from polyglot.builtins import codepoint_to_chr, int_to_byte + + +def is_dkey(x): + q = b'dkey' if isinstance(x, bytes) else 'dkey' + return x == q class StringIO(io.StringIO): @@ -118,14 +124,13 @@ class MetadataUpdater(object): self.get_original_metadata() if 'bookLength' in self.metadata: return int(self.metadata['bookLength']) - else: - return 0 + return 0 - def decode_vwi(self,bytes): + def decode_vwi(self, byts): pos, val = 0, 0 done = False - byts = bytearray(bytes) - while pos < len(bytes) and not done: + byts = bytearray(byts) + while pos < len(byts) and not done: b = byts[pos] pos += 1 if (b & 0x80) == 0: @@ -149,7 +154,7 @@ class MetadataUpdater(object): def dump_hex(self, src, length=16): ''' Diagnostic ''' - FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)]) + FILTER=''.join([(len(repr(codepoint_to_chr(x)))==3) and codepoint_to_chr(x) or '.' for x in range(256)]) N=0 result='' while src: @@ -166,36 +171,36 @@ class MetadataUpdater(object): print('%s: %s' % (tag, repr(self.metadata[tag]))) def encode_vwi(self,value): - bytes = [] + ans = [] multi_byte = (value > 0x7f) while value: b = value & 0x7f value >>= 7 if value == 0: if multi_byte: - bytes.append(b|0x80) - if bytes[-1] == 0xFF: - bytes.append(0x80) - if len(bytes) == 4: - return pack('>BBBB',bytes[3],bytes[2],bytes[1],bytes[0]).decode('iso-8859-1') - elif len(bytes) == 3: - return pack('>BBB',bytes[2],bytes[1],bytes[0]).decode('iso-8859-1') - elif len(bytes) == 2: - return pack('>BB',bytes[1],bytes[0]).decode('iso-8859-1') + ans.append(b|0x80) + if ans[-1] == 0xFF: + ans.append(0x80) + if len(ans) == 4: + return pack('>BBBB',ans[3],ans[2],ans[1],ans[0]).decode('iso-8859-1') + elif len(ans) == 3: + return pack('>BBB',ans[2],ans[1],ans[0]).decode('iso-8859-1') + elif len(ans) == 2: + return pack('>BB',ans[1],ans[0]).decode('iso-8859-1') else: return pack('>B', b).decode('iso-8859-1') else: - if len(bytes): - bytes.append(b|0x80) + if len(ans): + ans.append(b|0x80) else: - bytes.append(b) + ans.append(b) # If value == 0, return 0 return pack('>B', 0x0).decode('iso-8859-1') def generate_dkey(self): for x in self.topaz_headers: - if self.topaz_headers[x]['tag'] == 'dkey': + if is_dkey(self.topaz_headers[x]['tag']): if self.topaz_headers[x]['blocks']: offset = self.base + self.topaz_headers[x]['blocks'][0]['offset'] len_uncomp = self.topaz_headers[x]['blocks'][0]['len_uncomp'] @@ -208,7 +213,7 @@ class MetadataUpdater(object): offset += 1 dks.write(dkey['tag']) offset += len('dkey') - dks.write(u'\0') + dks.write('\0') offset += 1 dks.write(self.data[offset:offset + len_uncomp].decode('iso-8859-1')) return dks.getvalue().encode('iso-8859-1') @@ -245,8 +250,8 @@ class MetadataUpdater(object): ms = StringIO() ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1')) ms.write(self.md_header['tag']) - ms.write(chr(self.md_header['flags'])) - ms.write(chr(len(self.metadata))) + ms.write(int_to_byte(self.md_header['flags'])) + ms.write(int_to_byte(len(self.metadata))) # Add the metadata fields. # for tag in self.metadata: