mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
py3: More unicode porting
This commit is contained in:
parent
151e736538
commit
c83cdcf086
@ -1,11 +1,13 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
from polyglot.builtins import range
|
||||
from polyglot.builtins import range, int_to_byte
|
||||
|
||||
|
||||
class TCRCompressor(object):
|
||||
@ -21,7 +23,7 @@ class TCRCompressor(object):
|
||||
def _reset(self):
|
||||
# List of indexes in the codes list that are empty and can hold new codes
|
||||
self.unused_codes = set()
|
||||
self.coded_txt = ''
|
||||
self.coded_txt = b''
|
||||
# Generate initial codes from text.
|
||||
# The index of the list will be the code that represents the characters at that location
|
||||
# in the list
|
||||
@ -33,16 +35,16 @@ class TCRCompressor(object):
|
||||
The intent is to create more unused codes.
|
||||
'''
|
||||
possible_codes = []
|
||||
a_code = set(re.findall('(?msu).', self.coded_txt))
|
||||
a_code = set(re.findall(b'(?msu).', self.coded_txt))
|
||||
|
||||
for code in a_code:
|
||||
single_code = set(re.findall('(?msu)%s.' % re.escape(code), self.coded_txt))
|
||||
single_code = set(re.findall(b'(?msu)%s.' % re.escape(code), self.coded_txt))
|
||||
if len(single_code) == 1:
|
||||
possible_codes.append(single_code.pop())
|
||||
|
||||
for code in possible_codes:
|
||||
self.coded_txt = self.coded_txt.replace(code, code[0])
|
||||
self.codes[ord(code[0])] = '%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])])
|
||||
self.codes[ord(code[0])] = b'%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])])
|
||||
|
||||
def _free_unused_codes(self):
|
||||
'''
|
||||
@ -51,14 +53,14 @@ class TCRCompressor(object):
|
||||
'''
|
||||
for i in range(256):
|
||||
if i not in self.unused_codes:
|
||||
if chr(i) not in self.coded_txt:
|
||||
if int_to_byte(i) not in self.coded_txt:
|
||||
self.unused_codes.add(i)
|
||||
|
||||
def _new_codes(self):
|
||||
'''
|
||||
Create new codes from codes that occur in pairs often.
|
||||
'''
|
||||
possible_new_codes = list(set(re.findall('(?msu)..', self.coded_txt)))
|
||||
possible_new_codes = list(set(re.findall(b'(?msu)..', self.coded_txt)))
|
||||
new_codes_count = []
|
||||
|
||||
for c in possible_new_codes:
|
||||
@ -75,15 +77,15 @@ class TCRCompressor(object):
|
||||
def compress(self, txt):
|
||||
self._reset()
|
||||
|
||||
self.codes = list(set(re.findall('(?msu).', txt)))
|
||||
self.codes = list(set(re.findall(b'(?msu).', txt)))
|
||||
|
||||
# Replace the text with their corresponding code
|
||||
for c in txt:
|
||||
self.coded_txt += chr(self.codes.index(c))
|
||||
self.coded_txt += int_to_byte(self.codes.index(c))
|
||||
|
||||
# Zero the unused codes and record which are unused.
|
||||
for i in range(len(self.codes), 256):
|
||||
self.codes.append('')
|
||||
self.codes.append(b'')
|
||||
self.unused_codes.add(i)
|
||||
|
||||
self._combine_codes()
|
||||
@ -95,8 +97,8 @@ class TCRCompressor(object):
|
||||
# Take the last possible codes and split it into individual
|
||||
# codes. The last possible code is the most often occurring.
|
||||
code1, code2 = possible_codes.pop()
|
||||
self.codes[unused_code] = '%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
|
||||
self.coded_txt = self.coded_txt.replace('%s%s' % (code1, code2), chr(unused_code))
|
||||
self.codes[unused_code] = b'%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
|
||||
self.coded_txt = self.coded_txt.replace(b'%s%s' % (code1, code2), int_to_byte(unused_code))
|
||||
self._combine_codes()
|
||||
self._free_unused_codes()
|
||||
possible_codes = self._new_codes()
|
||||
@ -107,18 +109,18 @@ class TCRCompressor(object):
|
||||
code_dict = []
|
||||
for i in range(0, 256):
|
||||
if i in self.unused_codes:
|
||||
code_dict.append(chr(0))
|
||||
code_dict.append(b'\0')
|
||||
else:
|
||||
code_dict.append(chr(len(self.codes[i])) + self.codes[i])
|
||||
code_dict.append(int_to_byte(len(self.codes[i])) + self.codes[i])
|
||||
|
||||
# Join the identifier with the dictionary and coded text.
|
||||
return '!!8-Bit!!'+''.join(code_dict)+self.coded_txt
|
||||
return b'!!8-Bit!!'+b''.join(code_dict)+self.coded_txt
|
||||
|
||||
|
||||
def decompress(stream):
|
||||
txt = []
|
||||
stream.seek(0)
|
||||
if stream.read(9) != '!!8-Bit!!':
|
||||
if stream.read(9) != b'!!8-Bit!!':
|
||||
raise ValueError('File %s contains an invalid TCR header.' % stream.name)
|
||||
|
||||
# Codes that the file contents are broken down into.
|
||||
@ -129,11 +131,11 @@ def decompress(stream):
|
||||
|
||||
# Map the values in the file to locations in the string list.
|
||||
entry_loc = stream.read(1)
|
||||
while entry_loc != '': # EOF
|
||||
while entry_loc != b'': # EOF
|
||||
txt.append(entries[ord(entry_loc)])
|
||||
entry_loc = stream.read(1)
|
||||
|
||||
return ''.join(txt)
|
||||
return b''.join(txt)
|
||||
|
||||
|
||||
def compress(txt):
|
||||
|
@ -11,7 +11,7 @@ from itertools import count, chain
|
||||
from operator import attrgetter
|
||||
import io
|
||||
import time
|
||||
import random
|
||||
import os
|
||||
import re
|
||||
import copy
|
||||
import uuid
|
||||
@ -134,8 +134,7 @@ def decint(value):
|
||||
return bytes(bytearray(reversed(ans)))
|
||||
|
||||
|
||||
def randbytes(n):
|
||||
return ''.join(chr(random.randint(0, 255)) for x in range(n))
|
||||
randbytes = os.urandom
|
||||
|
||||
|
||||
def warn(x):
|
||||
|
@ -20,7 +20,7 @@ from calibre.ebooks.mobi.langcodes import iana2mobi
|
||||
from calibre.utils.date import now as nowf
|
||||
from calibre.utils.imghdr import what
|
||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||
from polyglot.builtins import unicode_type, range
|
||||
from polyglot.builtins import unicode_type, range, codepoint_to_chr
|
||||
|
||||
|
||||
def is_image(ss):
|
||||
@ -281,7 +281,7 @@ class MetadataUpdater(object):
|
||||
|
||||
def hexdump(self, src, length=16):
|
||||
# Diagnostic
|
||||
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
|
||||
FILTER=''.join([(len(repr(codepoint_to_chr(x)))==3) and codepoint_to_chr(x) or '.' for x in range(256)])
|
||||
N=0
|
||||
result=''
|
||||
while src:
|
||||
|
@ -1,5 +1,5 @@
|
||||
from __future__ import with_statement
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Greg Riker <griker@hotmail.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
@ -10,6 +10,12 @@ from struct import pack
|
||||
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre import force_unicode
|
||||
from polyglot.builtins import codepoint_to_chr, int_to_byte
|
||||
|
||||
|
||||
def is_dkey(x):
|
||||
q = b'dkey' if isinstance(x, bytes) else 'dkey'
|
||||
return x == q
|
||||
|
||||
|
||||
class StringIO(io.StringIO):
|
||||
@ -118,14 +124,13 @@ class MetadataUpdater(object):
|
||||
self.get_original_metadata()
|
||||
if 'bookLength' in self.metadata:
|
||||
return int(self.metadata['bookLength'])
|
||||
else:
|
||||
return 0
|
||||
|
||||
def decode_vwi(self,bytes):
|
||||
def decode_vwi(self, byts):
|
||||
pos, val = 0, 0
|
||||
done = False
|
||||
byts = bytearray(bytes)
|
||||
while pos < len(bytes) and not done:
|
||||
byts = bytearray(byts)
|
||||
while pos < len(byts) and not done:
|
||||
b = byts[pos]
|
||||
pos += 1
|
||||
if (b & 0x80) == 0:
|
||||
@ -149,7 +154,7 @@ class MetadataUpdater(object):
|
||||
|
||||
def dump_hex(self, src, length=16):
|
||||
''' Diagnostic '''
|
||||
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
|
||||
FILTER=''.join([(len(repr(codepoint_to_chr(x)))==3) and codepoint_to_chr(x) or '.' for x in range(256)])
|
||||
N=0
|
||||
result=''
|
||||
while src:
|
||||
@ -166,36 +171,36 @@ class MetadataUpdater(object):
|
||||
print('%s: %s' % (tag, repr(self.metadata[tag])))
|
||||
|
||||
def encode_vwi(self,value):
|
||||
bytes = []
|
||||
ans = []
|
||||
multi_byte = (value > 0x7f)
|
||||
while value:
|
||||
b = value & 0x7f
|
||||
value >>= 7
|
||||
if value == 0:
|
||||
if multi_byte:
|
||||
bytes.append(b|0x80)
|
||||
if bytes[-1] == 0xFF:
|
||||
bytes.append(0x80)
|
||||
if len(bytes) == 4:
|
||||
return pack('>BBBB',bytes[3],bytes[2],bytes[1],bytes[0]).decode('iso-8859-1')
|
||||
elif len(bytes) == 3:
|
||||
return pack('>BBB',bytes[2],bytes[1],bytes[0]).decode('iso-8859-1')
|
||||
elif len(bytes) == 2:
|
||||
return pack('>BB',bytes[1],bytes[0]).decode('iso-8859-1')
|
||||
ans.append(b|0x80)
|
||||
if ans[-1] == 0xFF:
|
||||
ans.append(0x80)
|
||||
if len(ans) == 4:
|
||||
return pack('>BBBB',ans[3],ans[2],ans[1],ans[0]).decode('iso-8859-1')
|
||||
elif len(ans) == 3:
|
||||
return pack('>BBB',ans[2],ans[1],ans[0]).decode('iso-8859-1')
|
||||
elif len(ans) == 2:
|
||||
return pack('>BB',ans[1],ans[0]).decode('iso-8859-1')
|
||||
else:
|
||||
return pack('>B', b).decode('iso-8859-1')
|
||||
else:
|
||||
if len(bytes):
|
||||
bytes.append(b|0x80)
|
||||
if len(ans):
|
||||
ans.append(b|0x80)
|
||||
else:
|
||||
bytes.append(b)
|
||||
ans.append(b)
|
||||
|
||||
# If value == 0, return 0
|
||||
return pack('>B', 0x0).decode('iso-8859-1')
|
||||
|
||||
def generate_dkey(self):
|
||||
for x in self.topaz_headers:
|
||||
if self.topaz_headers[x]['tag'] == 'dkey':
|
||||
if is_dkey(self.topaz_headers[x]['tag']):
|
||||
if self.topaz_headers[x]['blocks']:
|
||||
offset = self.base + self.topaz_headers[x]['blocks'][0]['offset']
|
||||
len_uncomp = self.topaz_headers[x]['blocks'][0]['len_uncomp']
|
||||
@ -208,7 +213,7 @@ class MetadataUpdater(object):
|
||||
offset += 1
|
||||
dks.write(dkey['tag'])
|
||||
offset += len('dkey')
|
||||
dks.write(u'\0')
|
||||
dks.write('\0')
|
||||
offset += 1
|
||||
dks.write(self.data[offset:offset + len_uncomp].decode('iso-8859-1'))
|
||||
return dks.getvalue().encode('iso-8859-1')
|
||||
@ -245,8 +250,8 @@ class MetadataUpdater(object):
|
||||
ms = StringIO()
|
||||
ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1'))
|
||||
ms.write(self.md_header['tag'])
|
||||
ms.write(chr(self.md_header['flags']))
|
||||
ms.write(chr(len(self.metadata)))
|
||||
ms.write(int_to_byte(self.md_header['flags']))
|
||||
ms.write(int_to_byte(len(self.metadata)))
|
||||
|
||||
# Add the metadata fields.
|
||||
# for tag in self.metadata:
|
||||
|
Loading…
x
Reference in New Issue
Block a user