mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
py3: More unicode porting
This commit is contained in:
parent
151e736538
commit
c83cdcf086
@ -1,11 +1,13 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
__license__ = 'GPL 3'
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from polyglot.builtins import range
|
from polyglot.builtins import range, int_to_byte
|
||||||
|
|
||||||
|
|
||||||
class TCRCompressor(object):
|
class TCRCompressor(object):
|
||||||
@ -21,7 +23,7 @@ class TCRCompressor(object):
|
|||||||
def _reset(self):
|
def _reset(self):
|
||||||
# List of indexes in the codes list that are empty and can hold new codes
|
# List of indexes in the codes list that are empty and can hold new codes
|
||||||
self.unused_codes = set()
|
self.unused_codes = set()
|
||||||
self.coded_txt = ''
|
self.coded_txt = b''
|
||||||
# Generate initial codes from text.
|
# Generate initial codes from text.
|
||||||
# The index of the list will be the code that represents the characters at that location
|
# The index of the list will be the code that represents the characters at that location
|
||||||
# in the list
|
# in the list
|
||||||
@ -33,16 +35,16 @@ class TCRCompressor(object):
|
|||||||
The intent is to create more unused codes.
|
The intent is to create more unused codes.
|
||||||
'''
|
'''
|
||||||
possible_codes = []
|
possible_codes = []
|
||||||
a_code = set(re.findall('(?msu).', self.coded_txt))
|
a_code = set(re.findall(b'(?msu).', self.coded_txt))
|
||||||
|
|
||||||
for code in a_code:
|
for code in a_code:
|
||||||
single_code = set(re.findall('(?msu)%s.' % re.escape(code), self.coded_txt))
|
single_code = set(re.findall(b'(?msu)%s.' % re.escape(code), self.coded_txt))
|
||||||
if len(single_code) == 1:
|
if len(single_code) == 1:
|
||||||
possible_codes.append(single_code.pop())
|
possible_codes.append(single_code.pop())
|
||||||
|
|
||||||
for code in possible_codes:
|
for code in possible_codes:
|
||||||
self.coded_txt = self.coded_txt.replace(code, code[0])
|
self.coded_txt = self.coded_txt.replace(code, code[0])
|
||||||
self.codes[ord(code[0])] = '%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])])
|
self.codes[ord(code[0])] = b'%s%s' % (self.codes[ord(code[0])], self.codes[ord(code[1])])
|
||||||
|
|
||||||
def _free_unused_codes(self):
|
def _free_unused_codes(self):
|
||||||
'''
|
'''
|
||||||
@ -51,14 +53,14 @@ class TCRCompressor(object):
|
|||||||
'''
|
'''
|
||||||
for i in range(256):
|
for i in range(256):
|
||||||
if i not in self.unused_codes:
|
if i not in self.unused_codes:
|
||||||
if chr(i) not in self.coded_txt:
|
if int_to_byte(i) not in self.coded_txt:
|
||||||
self.unused_codes.add(i)
|
self.unused_codes.add(i)
|
||||||
|
|
||||||
def _new_codes(self):
|
def _new_codes(self):
|
||||||
'''
|
'''
|
||||||
Create new codes from codes that occur in pairs often.
|
Create new codes from codes that occur in pairs often.
|
||||||
'''
|
'''
|
||||||
possible_new_codes = list(set(re.findall('(?msu)..', self.coded_txt)))
|
possible_new_codes = list(set(re.findall(b'(?msu)..', self.coded_txt)))
|
||||||
new_codes_count = []
|
new_codes_count = []
|
||||||
|
|
||||||
for c in possible_new_codes:
|
for c in possible_new_codes:
|
||||||
@ -75,15 +77,15 @@ class TCRCompressor(object):
|
|||||||
def compress(self, txt):
|
def compress(self, txt):
|
||||||
self._reset()
|
self._reset()
|
||||||
|
|
||||||
self.codes = list(set(re.findall('(?msu).', txt)))
|
self.codes = list(set(re.findall(b'(?msu).', txt)))
|
||||||
|
|
||||||
# Replace the text with their corresponding code
|
# Replace the text with their corresponding code
|
||||||
for c in txt:
|
for c in txt:
|
||||||
self.coded_txt += chr(self.codes.index(c))
|
self.coded_txt += int_to_byte(self.codes.index(c))
|
||||||
|
|
||||||
# Zero the unused codes and record which are unused.
|
# Zero the unused codes and record which are unused.
|
||||||
for i in range(len(self.codes), 256):
|
for i in range(len(self.codes), 256):
|
||||||
self.codes.append('')
|
self.codes.append(b'')
|
||||||
self.unused_codes.add(i)
|
self.unused_codes.add(i)
|
||||||
|
|
||||||
self._combine_codes()
|
self._combine_codes()
|
||||||
@ -95,8 +97,8 @@ class TCRCompressor(object):
|
|||||||
# Take the last possible codes and split it into individual
|
# Take the last possible codes and split it into individual
|
||||||
# codes. The last possible code is the most often occurring.
|
# codes. The last possible code is the most often occurring.
|
||||||
code1, code2 = possible_codes.pop()
|
code1, code2 = possible_codes.pop()
|
||||||
self.codes[unused_code] = '%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
|
self.codes[unused_code] = b'%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
|
||||||
self.coded_txt = self.coded_txt.replace('%s%s' % (code1, code2), chr(unused_code))
|
self.coded_txt = self.coded_txt.replace(b'%s%s' % (code1, code2), int_to_byte(unused_code))
|
||||||
self._combine_codes()
|
self._combine_codes()
|
||||||
self._free_unused_codes()
|
self._free_unused_codes()
|
||||||
possible_codes = self._new_codes()
|
possible_codes = self._new_codes()
|
||||||
@ -107,18 +109,18 @@ class TCRCompressor(object):
|
|||||||
code_dict = []
|
code_dict = []
|
||||||
for i in range(0, 256):
|
for i in range(0, 256):
|
||||||
if i in self.unused_codes:
|
if i in self.unused_codes:
|
||||||
code_dict.append(chr(0))
|
code_dict.append(b'\0')
|
||||||
else:
|
else:
|
||||||
code_dict.append(chr(len(self.codes[i])) + self.codes[i])
|
code_dict.append(int_to_byte(len(self.codes[i])) + self.codes[i])
|
||||||
|
|
||||||
# Join the identifier with the dictionary and coded text.
|
# Join the identifier with the dictionary and coded text.
|
||||||
return '!!8-Bit!!'+''.join(code_dict)+self.coded_txt
|
return b'!!8-Bit!!'+b''.join(code_dict)+self.coded_txt
|
||||||
|
|
||||||
|
|
||||||
def decompress(stream):
|
def decompress(stream):
|
||||||
txt = []
|
txt = []
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
if stream.read(9) != '!!8-Bit!!':
|
if stream.read(9) != b'!!8-Bit!!':
|
||||||
raise ValueError('File %s contains an invalid TCR header.' % stream.name)
|
raise ValueError('File %s contains an invalid TCR header.' % stream.name)
|
||||||
|
|
||||||
# Codes that the file contents are broken down into.
|
# Codes that the file contents are broken down into.
|
||||||
@ -129,11 +131,11 @@ def decompress(stream):
|
|||||||
|
|
||||||
# Map the values in the file to locations in the string list.
|
# Map the values in the file to locations in the string list.
|
||||||
entry_loc = stream.read(1)
|
entry_loc = stream.read(1)
|
||||||
while entry_loc != '': # EOF
|
while entry_loc != b'': # EOF
|
||||||
txt.append(entries[ord(entry_loc)])
|
txt.append(entries[ord(entry_loc)])
|
||||||
entry_loc = stream.read(1)
|
entry_loc = stream.read(1)
|
||||||
|
|
||||||
return ''.join(txt)
|
return b''.join(txt)
|
||||||
|
|
||||||
|
|
||||||
def compress(txt):
|
def compress(txt):
|
||||||
|
@ -11,7 +11,7 @@ from itertools import count, chain
|
|||||||
from operator import attrgetter
|
from operator import attrgetter
|
||||||
import io
|
import io
|
||||||
import time
|
import time
|
||||||
import random
|
import os
|
||||||
import re
|
import re
|
||||||
import copy
|
import copy
|
||||||
import uuid
|
import uuid
|
||||||
@ -134,8 +134,7 @@ def decint(value):
|
|||||||
return bytes(bytearray(reversed(ans)))
|
return bytes(bytearray(reversed(ans)))
|
||||||
|
|
||||||
|
|
||||||
def randbytes(n):
|
randbytes = os.urandom
|
||||||
return ''.join(chr(random.randint(0, 255)) for x in range(n))
|
|
||||||
|
|
||||||
|
|
||||||
def warn(x):
|
def warn(x):
|
||||||
|
@ -20,7 +20,7 @@ from calibre.ebooks.mobi.langcodes import iana2mobi
|
|||||||
from calibre.utils.date import now as nowf
|
from calibre.utils.date import now as nowf
|
||||||
from calibre.utils.imghdr import what
|
from calibre.utils.imghdr import what
|
||||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||||
from polyglot.builtins import unicode_type, range
|
from polyglot.builtins import unicode_type, range, codepoint_to_chr
|
||||||
|
|
||||||
|
|
||||||
def is_image(ss):
|
def is_image(ss):
|
||||||
@ -281,7 +281,7 @@ class MetadataUpdater(object):
|
|||||||
|
|
||||||
def hexdump(self, src, length=16):
|
def hexdump(self, src, length=16):
|
||||||
# Diagnostic
|
# Diagnostic
|
||||||
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
|
FILTER=''.join([(len(repr(codepoint_to_chr(x)))==3) and codepoint_to_chr(x) or '.' for x in range(256)])
|
||||||
N=0
|
N=0
|
||||||
result=''
|
result=''
|
||||||
while src:
|
while src:
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
from __future__ import with_statement
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
from __future__ import print_function
|
|
||||||
__license__ = 'GPL 3'
|
__license__ = 'GPL 3'
|
||||||
__copyright__ = '2010, Greg Riker <griker@hotmail.com>'
|
__copyright__ = '2010, Greg Riker <griker@hotmail.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
@ -10,6 +10,12 @@ from struct import pack
|
|||||||
|
|
||||||
from calibre.ebooks.metadata import MetaInformation
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
from calibre import force_unicode
|
from calibre import force_unicode
|
||||||
|
from polyglot.builtins import codepoint_to_chr, int_to_byte
|
||||||
|
|
||||||
|
|
||||||
|
def is_dkey(x):
|
||||||
|
q = b'dkey' if isinstance(x, bytes) else 'dkey'
|
||||||
|
return x == q
|
||||||
|
|
||||||
|
|
||||||
class StringIO(io.StringIO):
|
class StringIO(io.StringIO):
|
||||||
@ -118,14 +124,13 @@ class MetadataUpdater(object):
|
|||||||
self.get_original_metadata()
|
self.get_original_metadata()
|
||||||
if 'bookLength' in self.metadata:
|
if 'bookLength' in self.metadata:
|
||||||
return int(self.metadata['bookLength'])
|
return int(self.metadata['bookLength'])
|
||||||
else:
|
return 0
|
||||||
return 0
|
|
||||||
|
|
||||||
def decode_vwi(self,bytes):
|
def decode_vwi(self, byts):
|
||||||
pos, val = 0, 0
|
pos, val = 0, 0
|
||||||
done = False
|
done = False
|
||||||
byts = bytearray(bytes)
|
byts = bytearray(byts)
|
||||||
while pos < len(bytes) and not done:
|
while pos < len(byts) and not done:
|
||||||
b = byts[pos]
|
b = byts[pos]
|
||||||
pos += 1
|
pos += 1
|
||||||
if (b & 0x80) == 0:
|
if (b & 0x80) == 0:
|
||||||
@ -149,7 +154,7 @@ class MetadataUpdater(object):
|
|||||||
|
|
||||||
def dump_hex(self, src, length=16):
|
def dump_hex(self, src, length=16):
|
||||||
''' Diagnostic '''
|
''' Diagnostic '''
|
||||||
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
|
FILTER=''.join([(len(repr(codepoint_to_chr(x)))==3) and codepoint_to_chr(x) or '.' for x in range(256)])
|
||||||
N=0
|
N=0
|
||||||
result=''
|
result=''
|
||||||
while src:
|
while src:
|
||||||
@ -166,36 +171,36 @@ class MetadataUpdater(object):
|
|||||||
print('%s: %s' % (tag, repr(self.metadata[tag])))
|
print('%s: %s' % (tag, repr(self.metadata[tag])))
|
||||||
|
|
||||||
def encode_vwi(self,value):
|
def encode_vwi(self,value):
|
||||||
bytes = []
|
ans = []
|
||||||
multi_byte = (value > 0x7f)
|
multi_byte = (value > 0x7f)
|
||||||
while value:
|
while value:
|
||||||
b = value & 0x7f
|
b = value & 0x7f
|
||||||
value >>= 7
|
value >>= 7
|
||||||
if value == 0:
|
if value == 0:
|
||||||
if multi_byte:
|
if multi_byte:
|
||||||
bytes.append(b|0x80)
|
ans.append(b|0x80)
|
||||||
if bytes[-1] == 0xFF:
|
if ans[-1] == 0xFF:
|
||||||
bytes.append(0x80)
|
ans.append(0x80)
|
||||||
if len(bytes) == 4:
|
if len(ans) == 4:
|
||||||
return pack('>BBBB',bytes[3],bytes[2],bytes[1],bytes[0]).decode('iso-8859-1')
|
return pack('>BBBB',ans[3],ans[2],ans[1],ans[0]).decode('iso-8859-1')
|
||||||
elif len(bytes) == 3:
|
elif len(ans) == 3:
|
||||||
return pack('>BBB',bytes[2],bytes[1],bytes[0]).decode('iso-8859-1')
|
return pack('>BBB',ans[2],ans[1],ans[0]).decode('iso-8859-1')
|
||||||
elif len(bytes) == 2:
|
elif len(ans) == 2:
|
||||||
return pack('>BB',bytes[1],bytes[0]).decode('iso-8859-1')
|
return pack('>BB',ans[1],ans[0]).decode('iso-8859-1')
|
||||||
else:
|
else:
|
||||||
return pack('>B', b).decode('iso-8859-1')
|
return pack('>B', b).decode('iso-8859-1')
|
||||||
else:
|
else:
|
||||||
if len(bytes):
|
if len(ans):
|
||||||
bytes.append(b|0x80)
|
ans.append(b|0x80)
|
||||||
else:
|
else:
|
||||||
bytes.append(b)
|
ans.append(b)
|
||||||
|
|
||||||
# If value == 0, return 0
|
# If value == 0, return 0
|
||||||
return pack('>B', 0x0).decode('iso-8859-1')
|
return pack('>B', 0x0).decode('iso-8859-1')
|
||||||
|
|
||||||
def generate_dkey(self):
|
def generate_dkey(self):
|
||||||
for x in self.topaz_headers:
|
for x in self.topaz_headers:
|
||||||
if self.topaz_headers[x]['tag'] == 'dkey':
|
if is_dkey(self.topaz_headers[x]['tag']):
|
||||||
if self.topaz_headers[x]['blocks']:
|
if self.topaz_headers[x]['blocks']:
|
||||||
offset = self.base + self.topaz_headers[x]['blocks'][0]['offset']
|
offset = self.base + self.topaz_headers[x]['blocks'][0]['offset']
|
||||||
len_uncomp = self.topaz_headers[x]['blocks'][0]['len_uncomp']
|
len_uncomp = self.topaz_headers[x]['blocks'][0]['len_uncomp']
|
||||||
@ -208,7 +213,7 @@ class MetadataUpdater(object):
|
|||||||
offset += 1
|
offset += 1
|
||||||
dks.write(dkey['tag'])
|
dks.write(dkey['tag'])
|
||||||
offset += len('dkey')
|
offset += len('dkey')
|
||||||
dks.write(u'\0')
|
dks.write('\0')
|
||||||
offset += 1
|
offset += 1
|
||||||
dks.write(self.data[offset:offset + len_uncomp].decode('iso-8859-1'))
|
dks.write(self.data[offset:offset + len_uncomp].decode('iso-8859-1'))
|
||||||
return dks.getvalue().encode('iso-8859-1')
|
return dks.getvalue().encode('iso-8859-1')
|
||||||
@ -245,8 +250,8 @@ class MetadataUpdater(object):
|
|||||||
ms = StringIO()
|
ms = StringIO()
|
||||||
ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1'))
|
ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1'))
|
||||||
ms.write(self.md_header['tag'])
|
ms.write(self.md_header['tag'])
|
||||||
ms.write(chr(self.md_header['flags']))
|
ms.write(int_to_byte(self.md_header['flags']))
|
||||||
ms.write(chr(len(self.metadata)))
|
ms.write(int_to_byte(len(self.metadata)))
|
||||||
|
|
||||||
# Add the metadata fields.
|
# Add the metadata fields.
|
||||||
# for tag in self.metadata:
|
# for tag in self.metadata:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user