py3: More unicode porting

2025-08-11 09:13:57 -04:00 · 2019-06-01 15:16:57 +05:30 · 2019-06-01 15:16:57 +05:30 · c83cdcf086
commit c83cdcf086
parent 151e736538
4 changed files with 54 additions and 48 deletions
--- a/src/calibre/ebooks/compression/tcr.py
+++ b/src/calibre/ebooks/compression/tcr.py
@@ -1,11 +1,13 @@
 # -*- coding: utf-8 -*-

+from __future__ import absolute_import, division, print_function, unicode_literals
+
 __license__ = 'GPL 3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

 import re
-from polyglot.builtins import range
+from polyglot.builtins import range, int_to_byte


 class TCRCompressor(object):
@@ -21,7 +23,7 @@ class TCRCompressor(object):
    def _reset(self):
        # List of indexes in the codes list that are empty and can hold new codes
        self.unused_codes = set()
-        self.coded_txt = ''
+        self.coded_txt = b''
        # Generate initial codes from text.
        # The index of the list will be the code that represents the characters at that location
        # in the list
@@ -33,16 +35,16 @@ class TCRCompressor(object):
        The intent is to create more unused codes.
        '''
        possible_codes = []
-        a_code = set(re.findall('(?msu).', self.coded_txt))
+        a_code = set(re.findall(b'(?msu).', self.coded_txt))

        for code in a_code:
-            single_code = set(re.findall('(?msu)%s.' % re.escape(code), self.coded_txt))
+            single_code = set(re.findall(b'(?msu)%s.' % re.escape(code), self.coded_txt))
            if len(single_code) == 1:
                possible_codes.append(single_code.pop())

        for code in possible_codes:
            self.coded_txt = self.coded_txt.replace(code, code[0])
-            self.codes[ord(code[0])] = '%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])])
+            self.codes[ord(code[0])] = b'%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])])

    def _free_unused_codes(self):
        '''
@@ -51,14 +53,14 @@ class TCRCompressor(object):
        '''
        for i in range(256):
            if i not in self.unused_codes:
-                if chr(i) not in self.coded_txt:
+                if int_to_byte(i) not in self.coded_txt:
                    self.unused_codes.add(i)

    def _new_codes(self):
        '''
        Create new codes from codes that occur in pairs often.
        '''
-        possible_new_codes = list(set(re.findall('(?msu)..', self.coded_txt)))
+        possible_new_codes = list(set(re.findall(b'(?msu)..', self.coded_txt)))
        new_codes_count = []

        for c in possible_new_codes:
@@ -75,15 +77,15 @@ class TCRCompressor(object):
    def compress(self, txt):
        self._reset()

-        self.codes = list(set(re.findall('(?msu).', txt)))
+        self.codes = list(set(re.findall(b'(?msu).', txt)))

        # Replace the text with their corresponding code
        for c in txt:
-            self.coded_txt += chr(self.codes.index(c))
+            self.coded_txt += int_to_byte(self.codes.index(c))

        # Zero the unused codes and record which are unused.
        for i in range(len(self.codes), 256):
-            self.codes.append('')
+            self.codes.append(b'')
            self.unused_codes.add(i)

        self._combine_codes()
@@ -95,8 +97,8 @@ class TCRCompressor(object):
                # Take the last possible codes and split it into individual
                # codes. The last possible code is the most often occurring.
                code1, code2 = possible_codes.pop()
-                self.codes[unused_code] = '%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
-                self.coded_txt = self.coded_txt.replace('%s%s' % (code1, code2), chr(unused_code))
+                self.codes[unused_code] = b'%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
+                self.coded_txt = self.coded_txt.replace(b'%s%s' % (code1, code2), int_to_byte(unused_code))
            self._combine_codes()
            self._free_unused_codes()
            possible_codes = self._new_codes()
@@ -107,18 +109,18 @@ class TCRCompressor(object):
        code_dict = []
        for i in range(0, 256):
            if i in self.unused_codes:
-                code_dict.append(chr(0))
+                code_dict.append(b'\0')
            else:
-                code_dict.append(chr(len(self.codes[i])) + self.codes[i])
+                code_dict.append(int_to_byte(len(self.codes[i])) + self.codes[i])

        # Join the identifier with the dictionary and coded text.
-        return '!!8-Bit!!'+''.join(code_dict)+self.coded_txt
+        return b'!!8-Bit!!'+b''.join(code_dict)+self.coded_txt


 def decompress(stream):
    txt = []
    stream.seek(0)
-    if stream.read(9) != '!!8-Bit!!':
+    if stream.read(9) != b'!!8-Bit!!':
        raise ValueError('File %s contains an invalid TCR header.' % stream.name)

    # Codes that the file contents are broken down into.
@@ -129,11 +131,11 @@ def decompress(stream):

    # Map the values in the file to locations in the string list.
    entry_loc = stream.read(1)
-    while entry_loc != '':  # EOF
+    while entry_loc != b'':  # EOF
        txt.append(entries[ord(entry_loc)])
        entry_loc = stream.read(1)

-    return ''.join(txt)
+    return b''.join(txt)


 def compress(txt):
--- a/src/calibre/ebooks/lit/writer.py
+++ b/src/calibre/ebooks/lit/writer.py
@@ -11,7 +11,7 @@ from itertools import count, chain
 from operator import attrgetter
 import io
 import time
-import random
+import os
 import re
 import copy
 import uuid
@@ -134,8 +134,7 @@ def decint(value):
    return bytes(bytearray(reversed(ans)))


-def randbytes(n):
-    return ''.join(chr(random.randint(0, 255)) for x in range(n))
+randbytes = os.urandom


 def warn(x):
--- a/src/calibre/ebooks/metadata/mobi.py
+++ b/src/calibre/ebooks/metadata/mobi.py
@@ -20,7 +20,7 @@ from calibre.ebooks.mobi.langcodes import iana2mobi
 from calibre.utils.date import now as nowf
 from calibre.utils.imghdr import what
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
-from polyglot.builtins import unicode_type, range
+from polyglot.builtins import unicode_type, range, codepoint_to_chr


 def is_image(ss):
@@ -281,7 +281,7 @@ class MetadataUpdater(object):

    def hexdump(self, src, length=16):
        # Diagnostic
-        FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
+        FILTER=''.join([(len(repr(codepoint_to_chr(x)))==3) and codepoint_to_chr(x) or '.' for x in range(256)])
        N=0
        result=''
        while src:
--- a/src/calibre/ebooks/metadata/topaz.py
+++ b/src/calibre/ebooks/metadata/topaz.py
@@ -1,5 +1,5 @@
-from __future__ import with_statement
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
+
 __license__ = 'GPL 3'
 __copyright__ = '2010, Greg Riker <griker@hotmail.com>'
 __docformat__ = 'restructuredtext en'
@@ -10,6 +10,12 @@ from struct import pack

 from calibre.ebooks.metadata import MetaInformation
 from calibre import force_unicode
+from polyglot.builtins import codepoint_to_chr, int_to_byte
+
+
+def is_dkey(x):
+    q = b'dkey' if isinstance(x, bytes) else 'dkey'
+    return x == q


 class StringIO(io.StringIO):
@@ -118,14 +124,13 @@ class MetadataUpdater(object):
        self.get_original_metadata()
        if 'bookLength' in self.metadata:
            return int(self.metadata['bookLength'])
-        else:
        return 0

-    def decode_vwi(self,bytes):
+    def decode_vwi(self, byts):
        pos, val = 0, 0
        done = False
-        byts = bytearray(bytes)
-        while pos < len(bytes) and not done:
+        byts = bytearray(byts)
+        while pos < len(byts) and not done:
            b = byts[pos]
            pos += 1
            if (b & 0x80) == 0:
@@ -149,7 +154,7 @@ class MetadataUpdater(object):

    def dump_hex(self, src, length=16):
        ''' Diagnostic '''
-        FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
+        FILTER=''.join([(len(repr(codepoint_to_chr(x)))==3) and codepoint_to_chr(x) or '.' for x in range(256)])
        N=0
        result=''
        while src:
@@ -166,36 +171,36 @@ class MetadataUpdater(object):
            print('%s: %s' % (tag, repr(self.metadata[tag])))

    def encode_vwi(self,value):
-        bytes = []
+        ans = []
        multi_byte = (value > 0x7f)
        while value:
            b = value & 0x7f
            value >>= 7
            if value == 0:
                if multi_byte:
-                    bytes.append(b|0x80)
-                    if bytes[-1] == 0xFF:
-                        bytes.append(0x80)
-                    if len(bytes) == 4:
-                        return pack('>BBBB',bytes[3],bytes[2],bytes[1],bytes[0]).decode('iso-8859-1')
-                    elif len(bytes) == 3:
-                        return pack('>BBB',bytes[2],bytes[1],bytes[0]).decode('iso-8859-1')
-                    elif len(bytes) == 2:
-                        return pack('>BB',bytes[1],bytes[0]).decode('iso-8859-1')
+                    ans.append(b|0x80)
+                    if ans[-1] == 0xFF:
+                        ans.append(0x80)
+                    if len(ans) == 4:
+                        return pack('>BBBB',ans[3],ans[2],ans[1],ans[0]).decode('iso-8859-1')
+                    elif len(ans) == 3:
+                        return pack('>BBB',ans[2],ans[1],ans[0]).decode('iso-8859-1')
+                    elif len(ans) == 2:
+                        return pack('>BB',ans[1],ans[0]).decode('iso-8859-1')
                else:
                    return pack('>B', b).decode('iso-8859-1')
            else:
-                if len(bytes):
-                    bytes.append(b|0x80)
+                if len(ans):
+                    ans.append(b|0x80)
                else:
-                    bytes.append(b)
+                    ans.append(b)

        # If value == 0, return 0
        return pack('>B', 0x0).decode('iso-8859-1')

    def generate_dkey(self):
        for x in self.topaz_headers:
-            if self.topaz_headers[x]['tag'] == 'dkey':
+            if is_dkey(self.topaz_headers[x]['tag']):
                if self.topaz_headers[x]['blocks']:
                    offset = self.base + self.topaz_headers[x]['blocks'][0]['offset']
                    len_uncomp = self.topaz_headers[x]['blocks'][0]['len_uncomp']
@@ -208,7 +213,7 @@ class MetadataUpdater(object):
        offset += 1
        dks.write(dkey['tag'])
        offset += len('dkey')
-        dks.write(u'\0')
+        dks.write('\0')
        offset += 1
        dks.write(self.data[offset:offset + len_uncomp].decode('iso-8859-1'))
        return dks.getvalue().encode('iso-8859-1')
@@ -245,8 +250,8 @@ class MetadataUpdater(object):
        ms = StringIO()
        ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1'))
        ms.write(self.md_header['tag'])
-        ms.write(chr(self.md_header['flags']))
-        ms.write(chr(len(self.metadata)))
+        ms.write(int_to_byte(self.md_header['flags']))
+        ms.write(int_to_byte(len(self.metadata)))

        # Add the metadata fields.
        # for tag in self.metadata: