py3: More unicode porting

This commit is contained in:
Kovid Goyal 2019-06-01 15:16:57 +05:30
parent 151e736538
commit c83cdcf086
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 54 additions and 48 deletions

View File

@@ -1,11 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
from polyglot.builtins import range
from polyglot.builtins import range, int_to_byte
class TCRCompressor(object):
@@ -21,7 +23,7 @@ class TCRCompressor(object):
def _reset(self):
# List of indexes in the codes list that are empty and can hold new codes
self.unused_codes = set()
self.coded_txt = ''
self.coded_txt = b''
# Generate initial codes from text.
# The index of the list will be the code that represents the characters at that location
# in the list
@@ -33,16 +35,16 @@ class TCRCompressor(object):
The intent is to create more unused codes.
'''
possible_codes = []
a_code = set(re.findall('(?msu).', self.coded_txt))
a_code = set(re.findall(b'(?msu).', self.coded_txt))
for code in a_code:
single_code = set(re.findall('(?msu)%s.' % re.escape(code), self.coded_txt))
single_code = set(re.findall(b'(?msu)%s.' % re.escape(code), self.coded_txt))
if len(single_code) == 1:
possible_codes.append(single_code.pop())
for code in possible_codes:
self.coded_txt = self.coded_txt.replace(code, code[0])
self.codes[ord(code[0])] = '%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])])
self.codes[ord(code[0])] = b'%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])])
def _free_unused_codes(self):
'''
@@ -51,14 +53,14 @@ class TCRCompressor(object):
'''
for i in range(256):
if i not in self.unused_codes:
if chr(i) not in self.coded_txt:
if int_to_byte(i) not in self.coded_txt:
self.unused_codes.add(i)
def _new_codes(self):
'''
Create new codes from codes that occur in pairs often.
'''
possible_new_codes = list(set(re.findall('(?msu)..', self.coded_txt)))
possible_new_codes = list(set(re.findall(b'(?msu)..', self.coded_txt)))
new_codes_count = []
for c in possible_new_codes:
@@ -75,15 +77,15 @@ class TCRCompressor(object):
def compress(self, txt):
self._reset()
self.codes = list(set(re.findall('(?msu).', txt)))
self.codes = list(set(re.findall(b'(?msu).', txt)))
# Replace the text with their corresponding code
for c in txt:
self.coded_txt += chr(self.codes.index(c))
self.coded_txt += int_to_byte(self.codes.index(c))
# Zero the unused codes and record which are unused.
for i in range(len(self.codes), 256):
self.codes.append('')
self.codes.append(b'')
self.unused_codes.add(i)
self._combine_codes()
@@ -95,8 +97,8 @@ class TCRCompressor(object):
# Take the last possible codes and split it into individual
# codes. The last possible code is the most often occurring.
code1, code2 = possible_codes.pop()
self.codes[unused_code] = '%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
self.coded_txt = self.coded_txt.replace('%s%s' % (code1, code2), chr(unused_code))
self.codes[unused_code] = b'%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
self.coded_txt = self.coded_txt.replace(b'%s%s' % (code1, code2), int_to_byte(unused_code))
self._combine_codes()
self._free_unused_codes()
possible_codes = self._new_codes()
@@ -107,18 +109,18 @@ class TCRCompressor(object):
code_dict = []
for i in range(0, 256):
if i in self.unused_codes:
code_dict.append(chr(0))
code_dict.append(b'\0')
else:
code_dict.append(chr(len(self.codes[i])) + self.codes[i])
code_dict.append(int_to_byte(len(self.codes[i])) + self.codes[i])
# Join the identifier with the dictionary and coded text.
return '!!8-Bit!!'+''.join(code_dict)+self.coded_txt
return b'!!8-Bit!!'+b''.join(code_dict)+self.coded_txt
def decompress(stream):
txt = []
stream.seek(0)
if stream.read(9) != '!!8-Bit!!':
if stream.read(9) != b'!!8-Bit!!':
raise ValueError('File %s contains an invalid TCR header.' % stream.name)
# Codes that the file contents are broken down into.
@@ -129,11 +131,11 @@ def decompress(stream):
# Map the values in the file to locations in the string list.
entry_loc = stream.read(1)
while entry_loc != '': # EOF
while entry_loc != b'': # EOF
txt.append(entries[ord(entry_loc)])
entry_loc = stream.read(1)
return ''.join(txt)
return b''.join(txt)
def compress(txt):

View File

@@ -11,7 +11,7 @@ from itertools import count, chain
from operator import attrgetter
import io
import time
import random
import os
import re
import copy
import uuid
@@ -134,8 +134,7 @@ def decint(value):
return bytes(bytearray(reversed(ans)))
def randbytes(n):
return ''.join(chr(random.randint(0, 255)) for x in range(n))
randbytes = os.urandom
def warn(x):

View File

@@ -20,7 +20,7 @@ from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.utils.date import now as nowf
from calibre.utils.imghdr import what
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from polyglot.builtins import unicode_type, range
from polyglot.builtins import unicode_type, range, codepoint_to_chr
def is_image(ss):
@@ -281,7 +281,7 @@ class MetadataUpdater(object):
def hexdump(self, src, length=16):
# Diagnostic
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
FILTER=''.join([(len(repr(codepoint_to_chr(x)))==3) and codepoint_to_chr(x) or '.' for x in range(256)])
N=0
result=''
while src:

View File

@@ -1,5 +1,5 @@
from __future__ import with_statement
from __future__ import print_function
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2010, Greg Riker <griker@hotmail.com>'
__docformat__ = 'restructuredtext en'
@@ -10,6 +10,12 @@ from struct import pack
from calibre.ebooks.metadata import MetaInformation
from calibre import force_unicode
from polyglot.builtins import codepoint_to_chr, int_to_byte
def is_dkey(x):
q = b'dkey' if isinstance(x, bytes) else 'dkey'
return x == q
class StringIO(io.StringIO):
@@ -118,14 +124,13 @@ class MetadataUpdater(object):
self.get_original_metadata()
if 'bookLength' in self.metadata:
return int(self.metadata['bookLength'])
else:
return 0
def decode_vwi(self,bytes):
def decode_vwi(self, byts):
pos, val = 0, 0
done = False
byts = bytearray(bytes)
while pos < len(bytes) and not done:
byts = bytearray(byts)
while pos < len(byts) and not done:
b = byts[pos]
pos += 1
if (b & 0x80) == 0:
@@ -149,7 +154,7 @@ class MetadataUpdater(object):
def dump_hex(self, src, length=16):
''' Diagnostic '''
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
FILTER=''.join([(len(repr(codepoint_to_chr(x)))==3) and codepoint_to_chr(x) or '.' for x in range(256)])
N=0
result=''
while src:
@@ -166,36 +171,36 @@ class MetadataUpdater(object):
print('%s: %s' % (tag, repr(self.metadata[tag])))
def encode_vwi(self,value):
bytes = []
ans = []
multi_byte = (value > 0x7f)
while value:
b = value & 0x7f
value >>= 7
if value == 0:
if multi_byte:
bytes.append(b|0x80)
if bytes[-1] == 0xFF:
bytes.append(0x80)
if len(bytes) == 4:
return pack('>BBBB',bytes[3],bytes[2],bytes[1],bytes[0]).decode('iso-8859-1')
elif len(bytes) == 3:
return pack('>BBB',bytes[2],bytes[1],bytes[0]).decode('iso-8859-1')
elif len(bytes) == 2:
return pack('>BB',bytes[1],bytes[0]).decode('iso-8859-1')
ans.append(b|0x80)
if ans[-1] == 0xFF:
ans.append(0x80)
if len(ans) == 4:
return pack('>BBBB',ans[3],ans[2],ans[1],ans[0]).decode('iso-8859-1')
elif len(ans) == 3:
return pack('>BBB',ans[2],ans[1],ans[0]).decode('iso-8859-1')
elif len(ans) == 2:
return pack('>BB',ans[1],ans[0]).decode('iso-8859-1')
else:
return pack('>B', b).decode('iso-8859-1')
else:
if len(bytes):
bytes.append(b|0x80)
if len(ans):
ans.append(b|0x80)
else:
bytes.append(b)
ans.append(b)
# If value == 0, return 0
return pack('>B', 0x0).decode('iso-8859-1')
def generate_dkey(self):
for x in self.topaz_headers:
if self.topaz_headers[x]['tag'] == 'dkey':
if is_dkey(self.topaz_headers[x]['tag']):
if self.topaz_headers[x]['blocks']:
offset = self.base + self.topaz_headers[x]['blocks'][0]['offset']
len_uncomp = self.topaz_headers[x]['blocks'][0]['len_uncomp']
@@ -208,7 +213,7 @@ class MetadataUpdater(object):
offset += 1
dks.write(dkey['tag'])
offset += len('dkey')
dks.write(u'\0')
dks.write('\0')
offset += 1
dks.write(self.data[offset:offset + len_uncomp].decode('iso-8859-1'))
return dks.getvalue().encode('iso-8859-1')
@@ -245,8 +250,8 @@ class MetadataUpdater(object):
ms = StringIO()
ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1'))
ms.write(self.md_header['tag'])
ms.write(chr(self.md_header['flags']))
ms.write(chr(len(self.metadata)))
ms.write(int_to_byte(self.md_header['flags']))
ms.write(int_to_byte(len(self.metadata)))
# Add the metadata fields.
# for tag in self.metadata: