py3: More unicode porting

This commit is contained in:
Kovid Goyal 2019-06-01 15:16:57 +05:30
parent 151e736538
commit c83cdcf086
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 54 additions and 48 deletions

View File

@ -1,11 +1,13 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re import re
from polyglot.builtins import range from polyglot.builtins import range, int_to_byte
class TCRCompressor(object): class TCRCompressor(object):
@ -21,7 +23,7 @@ class TCRCompressor(object):
def _reset(self): def _reset(self):
# List of indexes in the codes list that are empty and can hold new codes # List of indexes in the codes list that are empty and can hold new codes
self.unused_codes = set() self.unused_codes = set()
self.coded_txt = '' self.coded_txt = b''
# Generate initial codes from text. # Generate initial codes from text.
# The index of the list will be the code that represents the characters at that location # The index of the list will be the code that represents the characters at that location
# in the list # in the list
@ -33,16 +35,16 @@ class TCRCompressor(object):
The intent is to create more unused codes. The intent is to create more unused codes.
''' '''
possible_codes = [] possible_codes = []
a_code = set(re.findall('(?msu).', self.coded_txt)) a_code = set(re.findall(b'(?msu).', self.coded_txt))
for code in a_code: for code in a_code:
single_code = set(re.findall('(?msu)%s.' % re.escape(code), self.coded_txt)) single_code = set(re.findall(b'(?msu)%s.' % re.escape(code), self.coded_txt))
if len(single_code) == 1: if len(single_code) == 1:
possible_codes.append(single_code.pop()) possible_codes.append(single_code.pop())
for code in possible_codes: for code in possible_codes:
self.coded_txt = self.coded_txt.replace(code, code[0]) self.coded_txt = self.coded_txt.replace(code, code[0])
self.codes[ord(code[0])] = '%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])]) self.codes[ord(code[0])] = b'%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])])
def _free_unused_codes(self): def _free_unused_codes(self):
''' '''
@ -51,14 +53,14 @@ class TCRCompressor(object):
''' '''
for i in range(256): for i in range(256):
if i not in self.unused_codes: if i not in self.unused_codes:
if chr(i) not in self.coded_txt: if int_to_byte(i) not in self.coded_txt:
self.unused_codes.add(i) self.unused_codes.add(i)
def _new_codes(self): def _new_codes(self):
''' '''
Create new codes from codes that occur in pairs often. Create new codes from codes that occur in pairs often.
''' '''
possible_new_codes = list(set(re.findall('(?msu)..', self.coded_txt))) possible_new_codes = list(set(re.findall(b'(?msu)..', self.coded_txt)))
new_codes_count = [] new_codes_count = []
for c in possible_new_codes: for c in possible_new_codes:
@ -75,15 +77,15 @@ class TCRCompressor(object):
def compress(self, txt): def compress(self, txt):
self._reset() self._reset()
self.codes = list(set(re.findall('(?msu).', txt))) self.codes = list(set(re.findall(b'(?msu).', txt)))
# Replace the text with their corresponding code # Replace the text with their corresponding code
for c in txt: for c in txt:
self.coded_txt += chr(self.codes.index(c)) self.coded_txt += int_to_byte(self.codes.index(c))
# Zero the unused codes and record which are unused. # Zero the unused codes and record which are unused.
for i in range(len(self.codes), 256): for i in range(len(self.codes), 256):
self.codes.append('') self.codes.append(b'')
self.unused_codes.add(i) self.unused_codes.add(i)
self._combine_codes() self._combine_codes()
@ -95,8 +97,8 @@ class TCRCompressor(object):
# Take the last possible codes and split it into individual # Take the last possible codes and split it into individual
# codes. The last possible code is the most often occurring. # codes. The last possible code is the most often occurring.
code1, code2 = possible_codes.pop() code1, code2 = possible_codes.pop()
self.codes[unused_code] = '%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)]) self.codes[unused_code] = b'%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
self.coded_txt = self.coded_txt.replace('%s%s' % (code1, code2), chr(unused_code)) self.coded_txt = self.coded_txt.replace(b'%s%s' % (code1, code2), int_to_byte(unused_code))
self._combine_codes() self._combine_codes()
self._free_unused_codes() self._free_unused_codes()
possible_codes = self._new_codes() possible_codes = self._new_codes()
@ -107,18 +109,18 @@ class TCRCompressor(object):
code_dict = [] code_dict = []
for i in range(0, 256): for i in range(0, 256):
if i in self.unused_codes: if i in self.unused_codes:
code_dict.append(chr(0)) code_dict.append(b'\0')
else: else:
code_dict.append(chr(len(self.codes[i])) + self.codes[i]) code_dict.append(int_to_byte(len(self.codes[i])) + self.codes[i])
# Join the identifier with the dictionary and coded text. # Join the identifier with the dictionary and coded text.
return '!!8-Bit!!'+''.join(code_dict)+self.coded_txt return b'!!8-Bit!!'+b''.join(code_dict)+self.coded_txt
def decompress(stream): def decompress(stream):
txt = [] txt = []
stream.seek(0) stream.seek(0)
if stream.read(9) != '!!8-Bit!!': if stream.read(9) != b'!!8-Bit!!':
raise ValueError('File %s contains an invalid TCR header.' % stream.name) raise ValueError('File %s contains an invalid TCR header.' % stream.name)
# Codes that the file contents are broken down into. # Codes that the file contents are broken down into.
@ -129,11 +131,11 @@ def decompress(stream):
# Map the values in the file to locations in the string list. # Map the values in the file to locations in the string list.
entry_loc = stream.read(1) entry_loc = stream.read(1)
while entry_loc != '': # EOF while entry_loc != b'': # EOF
txt.append(entries[ord(entry_loc)]) txt.append(entries[ord(entry_loc)])
entry_loc = stream.read(1) entry_loc = stream.read(1)
return ''.join(txt) return b''.join(txt)
def compress(txt): def compress(txt):

View File

@ -11,7 +11,7 @@ from itertools import count, chain
from operator import attrgetter from operator import attrgetter
import io import io
import time import time
import random import os
import re import re
import copy import copy
import uuid import uuid
@ -134,8 +134,7 @@ def decint(value):
return bytes(bytearray(reversed(ans))) return bytes(bytearray(reversed(ans)))
def randbytes(n): randbytes = os.urandom
return ''.join(chr(random.randint(0, 255)) for x in range(n))
def warn(x): def warn(x):

View File

@ -20,7 +20,7 @@ from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.utils.date import now as nowf from calibre.utils.date import now as nowf
from calibre.utils.imghdr import what from calibre.utils.imghdr import what
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from polyglot.builtins import unicode_type, range from polyglot.builtins import unicode_type, range, codepoint_to_chr
def is_image(ss): def is_image(ss):
@ -281,7 +281,7 @@ class MetadataUpdater(object):
def hexdump(self, src, length=16): def hexdump(self, src, length=16):
# Diagnostic # Diagnostic
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)]) FILTER=''.join([(len(repr(codepoint_to_chr(x)))==3) and codepoint_to_chr(x) or '.' for x in range(256)])
N=0 N=0
result='' result=''
while src: while src:

View File

@ -1,5 +1,5 @@
from __future__ import with_statement from __future__ import absolute_import, division, print_function, unicode_literals
from __future__ import print_function
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2010, Greg Riker <griker@hotmail.com>' __copyright__ = '2010, Greg Riker <griker@hotmail.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
@ -10,6 +10,12 @@ from struct import pack
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
from calibre import force_unicode from calibre import force_unicode
from polyglot.builtins import codepoint_to_chr, int_to_byte
def is_dkey(x):
q = b'dkey' if isinstance(x, bytes) else 'dkey'
return x == q
class StringIO(io.StringIO): class StringIO(io.StringIO):
@ -118,14 +124,13 @@ class MetadataUpdater(object):
self.get_original_metadata() self.get_original_metadata()
if 'bookLength' in self.metadata: if 'bookLength' in self.metadata:
return int(self.metadata['bookLength']) return int(self.metadata['bookLength'])
else: return 0
return 0
def decode_vwi(self,bytes): def decode_vwi(self, byts):
pos, val = 0, 0 pos, val = 0, 0
done = False done = False
byts = bytearray(bytes) byts = bytearray(byts)
while pos < len(bytes) and not done: while pos < len(byts) and not done:
b = byts[pos] b = byts[pos]
pos += 1 pos += 1
if (b & 0x80) == 0: if (b & 0x80) == 0:
@ -149,7 +154,7 @@ class MetadataUpdater(object):
def dump_hex(self, src, length=16): def dump_hex(self, src, length=16):
''' Diagnostic ''' ''' Diagnostic '''
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)]) FILTER=''.join([(len(repr(codepoint_to_chr(x)))==3) and codepoint_to_chr(x) or '.' for x in range(256)])
N=0 N=0
result='' result=''
while src: while src:
@ -166,36 +171,36 @@ class MetadataUpdater(object):
print('%s: %s' % (tag, repr(self.metadata[tag]))) print('%s: %s' % (tag, repr(self.metadata[tag])))
def encode_vwi(self,value): def encode_vwi(self,value):
bytes = [] ans = []
multi_byte = (value > 0x7f) multi_byte = (value > 0x7f)
while value: while value:
b = value & 0x7f b = value & 0x7f
value >>= 7 value >>= 7
if value == 0: if value == 0:
if multi_byte: if multi_byte:
bytes.append(b|0x80) ans.append(b|0x80)
if bytes[-1] == 0xFF: if ans[-1] == 0xFF:
bytes.append(0x80) ans.append(0x80)
if len(bytes) == 4: if len(ans) == 4:
return pack('>BBBB',bytes[3],bytes[2],bytes[1],bytes[0]).decode('iso-8859-1') return pack('>BBBB',ans[3],ans[2],ans[1],ans[0]).decode('iso-8859-1')
elif len(bytes) == 3: elif len(ans) == 3:
return pack('>BBB',bytes[2],bytes[1],bytes[0]).decode('iso-8859-1') return pack('>BBB',ans[2],ans[1],ans[0]).decode('iso-8859-1')
elif len(bytes) == 2: elif len(ans) == 2:
return pack('>BB',bytes[1],bytes[0]).decode('iso-8859-1') return pack('>BB',ans[1],ans[0]).decode('iso-8859-1')
else: else:
return pack('>B', b).decode('iso-8859-1') return pack('>B', b).decode('iso-8859-1')
else: else:
if len(bytes): if len(ans):
bytes.append(b|0x80) ans.append(b|0x80)
else: else:
bytes.append(b) ans.append(b)
# If value == 0, return 0 # If value == 0, return 0
return pack('>B', 0x0).decode('iso-8859-1') return pack('>B', 0x0).decode('iso-8859-1')
def generate_dkey(self): def generate_dkey(self):
for x in self.topaz_headers: for x in self.topaz_headers:
if self.topaz_headers[x]['tag'] == 'dkey': if is_dkey(self.topaz_headers[x]['tag']):
if self.topaz_headers[x]['blocks']: if self.topaz_headers[x]['blocks']:
offset = self.base + self.topaz_headers[x]['blocks'][0]['offset'] offset = self.base + self.topaz_headers[x]['blocks'][0]['offset']
len_uncomp = self.topaz_headers[x]['blocks'][0]['len_uncomp'] len_uncomp = self.topaz_headers[x]['blocks'][0]['len_uncomp']
@ -208,7 +213,7 @@ class MetadataUpdater(object):
offset += 1 offset += 1
dks.write(dkey['tag']) dks.write(dkey['tag'])
offset += len('dkey') offset += len('dkey')
dks.write(u'\0') dks.write('\0')
offset += 1 offset += 1
dks.write(self.data[offset:offset + len_uncomp].decode('iso-8859-1')) dks.write(self.data[offset:offset + len_uncomp].decode('iso-8859-1'))
return dks.getvalue().encode('iso-8859-1') return dks.getvalue().encode('iso-8859-1')
@ -245,8 +250,8 @@ class MetadataUpdater(object):
ms = StringIO() ms = StringIO()
ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1')) ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1'))
ms.write(self.md_header['tag']) ms.write(self.md_header['tag'])
ms.write(chr(self.md_header['flags'])) ms.write(int_to_byte(self.md_header['flags']))
ms.write(chr(len(self.metadata))) ms.write(int_to_byte(len(self.metadata)))
# Add the metadata fields. # Add the metadata fields.
# for tag in self.metadata: # for tag in self.metadata: