This commit is contained in:
Kovid Goyal 2019-09-13 07:16:13 +05:30
commit 1bb9f07886
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3' __license__ = 'GPL 3'
@ -7,7 +6,7 @@ __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re import re
from polyglot.builtins import range, int_to_byte from polyglot.builtins import int_to_byte, is_py3, range
class TCRCompressor(object): class TCRCompressor(object):
@ -35,16 +34,18 @@ class TCRCompressor(object):
The intent is to create more unused codes. The intent is to create more unused codes.
''' '''
possible_codes = [] possible_codes = []
a_code = set(re.findall(b'(?msu).', self.coded_txt)) a_code = set(re.findall(b'(?ms).', self.coded_txt))
for code in a_code: for code in a_code:
single_code = set(re.findall(b'(?msu)%s.' % re.escape(code), self.coded_txt)) single_code = set(re.findall(b'(?ms)%s.' % re.escape(code), self.coded_txt))
if len(single_code) == 1: if len(single_code) == 1:
possible_codes.append(single_code.pop()) possible_codes.append(single_code.pop())
for code in possible_codes: for code in possible_codes:
if not is_py3:
code = bytearray(code)
self.coded_txt = self.coded_txt.replace(code, code[0:1]) self.coded_txt = self.coded_txt.replace(code, code[0:1])
self.codes[ord(code[0:1])] = b'%s%s' % (self.codes[ord(code[0:1])], self.codes[ord(code[1:2])]) self.codes[code[0]] = b'%s%s' % (self.codes[code[0]], self.codes[code[1]])
def _free_unused_codes(self): def _free_unused_codes(self):
''' '''
@ -60,7 +61,7 @@ class TCRCompressor(object):
''' '''
Create new codes from codes that occur in pairs often. Create new codes from codes that occur in pairs often.
''' '''
possible_new_codes = list(set(re.findall(b'(?msu)..', self.coded_txt))) possible_new_codes = list(set(re.findall(b'(?ms)..', self.coded_txt)))
new_codes_count = [] new_codes_count = []
for c in possible_new_codes: for c in possible_new_codes:
@ -77,11 +78,12 @@ class TCRCompressor(object):
def compress(self, txt): def compress(self, txt):
self._reset() self._reset()
self.codes = list(set(re.findall(b'(?msu).', txt))) self.codes = list(set(re.findall(b'(?ms).', txt)))
# Replace the text with their corresponding code # Replace the text with their corresponding code
for c in txt: # FIXME: python3 is native bytearray, but all we want are bytes
self.coded_txt += int_to_byte(self.codes.index(c)) for c in bytearray(txt):
self.coded_txt += int_to_byte(self.codes.index(int_to_byte(c)))
# Zero the unused codes and record which are unused. # Zero the unused codes and record which are unused.
for i in range(len(self.codes), 256): for i in range(len(self.codes), 256):
@ -96,9 +98,9 @@ class TCRCompressor(object):
unused_code = self.unused_codes.pop() unused_code = self.unused_codes.pop()
# Take the last possible codes and split it into individual # Take the last possible codes and split it into individual
# codes. The last possible code is the most often occurring. # codes. The last possible code is the most often occurring.
code1, code2 = possible_codes.pop() code = possible_codes.pop()
self.codes[unused_code] = b'%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)]) self.codes[unused_code] = b'%s%s' % (self.codes[ord(code[0:1])], self.codes[ord(code[1:2])])
self.coded_txt = self.coded_txt.replace(b'%s%s' % (code1, code2), int_to_byte(unused_code)) self.coded_txt = self.coded_txt.replace(code, int_to_byte(unused_code))
self._combine_codes() self._combine_codes()
self._free_unused_codes() self._free_unused_codes()
possible_codes = self._new_codes() possible_codes = self._new_codes()