wip: tcr compression

Regex patterns do not need the "u" flag when they do not use any Unicode-dependent matching feature

In Python 3, re.U is already the default for unicode (str) patterns. For bytes
patterns, which is what we use, specifying it is a fatal error: compiling the
pattern raises ValueError.
This commit is contained in:
Eli Schwartz 2019-05-01 10:54:40 -04:00
parent 51d4b5a5e9
commit 8311b26205
No known key found for this signature in database
GPG Key ID: CEB167EFB5722BD6

View File

@@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
@@ -35,10 +34,10 @@ class TCRCompressor(object):
The intent is to create more unused codes.
'''
possible_codes = []
a_code = set(re.findall(b'(?msu).', self.coded_txt))
a_code = set(re.findall(b'(?ms).', self.coded_txt))
for code in a_code:
single_code = set(re.findall(b'(?msu)%s.' % re.escape(code), self.coded_txt))
single_code = set(re.findall(b'(?ms)%s.' % re.escape(code), self.coded_txt))
if len(single_code) == 1:
possible_codes.append(single_code.pop())
@@ -60,7 +59,7 @@ class TCRCompressor(object):
'''
Create new codes from codes that occur in pairs often.
'''
possible_new_codes = list(set(re.findall(b'(?msu)..', self.coded_txt)))
possible_new_codes = list(set(re.findall(b'(?ms)..', self.coded_txt)))
new_codes_count = []
for c in possible_new_codes:
@@ -77,7 +76,7 @@ class TCRCompressor(object):
def compress(self, txt):
self._reset()
self.codes = list(set(re.findall(b'(?msu).', txt)))
self.codes = list(set(re.findall(b'(?ms).', txt)))
# Replace the text with their corresponding code
for c in txt: