TCR Output. Move TCR decompression to TCR compression file.

2026-04-25 02:09:53 -04:00 · 2009-10-19 07:15:27 -04:00 · 2009-10-19 07:15:27 -04:00 · df6d759b38
commit df6d759b38
parent 1e3832a204
4 changed files with 188 additions and 20 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -352,6 +352,7 @@ from calibre.ebooks.pdf.output import PDFOutput
 from calibre.ebooks.pml.output import PMLOutput
 from calibre.ebooks.rb.output import RBOutput
 from calibre.ebooks.rtf.output import RTFOutput
+from calibre.ebooks.tcr.output import TCROutput
 from calibre.ebooks.txt.output import TXTOutput

 from calibre.customize.profiles import input_profiles, output_profiles
@ -402,6 +403,7 @@ plugins += [
    PMLOutput,
    RBOutput,
    RTFOutput,
+    TCROutput,
    TXTOutput,
 ]
 plugins += [
--- a/src/calibre/ebooks/compression/tcr.py
+++ b/src/calibre/ebooks/compression/tcr.py
@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+def decompress(stream):
+        txt = []
+        stream.seek(0)
+        if stream.read(9) != '!!8-Bit!!':
+            raise ValueError('File %s contaions an invalid TCR header.' % stream.name)
+
+        # Codes that the file contents are broken down into.
+        entries = []
+        for i in xrange(256):
+            entry_len = ord(stream.read(1))
+            entries.append(stream.read(entry_len))
+
+        # Map the values in the file to locations in the string list.
+        entry_loc = stream.read(1)
+        while entry_loc != '': # EOF
+            txt.append(entries[ord(entry_loc)])
+            entry_loc = stream.read(1)
+
+        return ''.join(txt)
+
+
+def compress(txt, level=5):
+    '''
+    TCR compression takes the form header+code_list+coded_text.
+    The header is always "!!8-Bit!!". The code list is a list of 256 strings.
+    The list takes the form 1 byte length and then a string. Each position in
+    The list corresponds to a code found in the file. The coded text is
+    string of characters vaules. for instance the character Q represents the
+    value 81 which corresponds to the string in the code list at position 81.
+    '''
+    # Turn each unique character into a coded value.
+    # The code of the string at a given position are represented by the position
+    # they occupy in the list.
+    codes = list(set(re.findall('(?msu).', txt)))
+    for i in range(len(codes), 256):
+        codes.append('')
+    # Set the compression level.
+    if level <= 1:
+        new_length = 256
+    if level >= 10:
+        new_length = 1
+    else:
+        new_length = int(256 * (10 - level) * .1)
+    new_length = 1 if new_length < 1 else new_length
+    # Replace txt with codes.
+    coded_txt = ''
+    for c in txt:
+        coded_txt += chr(codes.index(c))
+    txt = coded_txt
+    # Start compressing the text.
+    new = True
+    merged = True
+    while new or merged:
+        # Merge codes that always follow another code
+        merge = []
+        merged = False
+        for i in xrange(256):
+            if codes[i] != '':
+                # Find all codes that are next to i.
+                fall = list(set(re.findall('(?msu)%s.' % re.escape(chr(i)), txt)))
+                # 1 if only one code comes after i.
+                if len(fall) == 1:
+                    # We are searching codes and each code is always 1 character.
+                    j = ord(fall[0][1:2])
+                    # Only merge if the total length of the string represented by
+                    # code is less than 256.
+                    if len(codes[i]) + len(codes[j]) < 256:
+                        merge.append((i, j))
+        if merge:
+            merged = True
+            for i, j in merge:
+                # Merge the string for j into the string for i.
+                if i == j:
+                    # Don't use += here just in case something goes wrong. This
+                    # will prevent out of control memory consumption. This is
+                    # unecessary but when creating this routine it happened due
+                    # to an error.
+                    codes[i] = codes[i] + codes[i]
+                else:
+                    codes[i] = codes[i] + codes[j]
+                txt = txt.replace(chr(i)+chr(j), chr(i))
+                if chr(j) not in txt:
+                    codes[j] = ''
+        new = False
+        if '' in codes:
+            # Create a list of codes based on combinations of codes that are next
+            # to each other. The amount of savings for the new code is calculated.
+            new_codes = []
+            for c in list(set(re.findall('(?msu)..', txt))):
+                i = ord(c[0:1])
+                j = ord(c[1:2])
+                if codes[i]+codes[j] in codes:
+                    continue
+                savings = txt.count(chr(i)+chr(j)) - len(codes[i]) - len(codes[j])
+                if savings > 2 and len(codes[i]) + len(codes[j]) < 256:
+                    new_codes.append((savings, i, j, codes[i], codes[j]))
+            if new_codes:
+                new = True
+                # Sort the codes from highest savings to lowest.
+                new_codes.sort(lambda x, y: -1 if x[0] > y[0] else 1 if x[0] < y[0] else 0)
+                # The shorter new_length the more chances time merging will happen
+                # giving more changes for better codes to be created. However,
+                # the shorter new_lengh the longer it will take to compress.
+                new_codes = new_codes[:new_length]
+                for code in new_codes:
+                    if '' not in codes:
+                        break
+                    c = codes.index('')
+                    codes[c] = code[3]+code[4]
+                    txt = txt.replace(chr(code[1])+chr(code[2]), chr(c))
+    # Generate the code dictionary.
+    header = []
+    for code in codes:
+        header.append(chr(len(code))+code)
+    for i in xrange(len(header), 256):
+        header.append(chr(0))
+    # Join the identifier with the dictionary and coded text.
+    return '!!8-Bit!!'+''.join(header)+txt
--- a/src/calibre/ebooks/tcr/input.py
+++ b/src/calibre/ebooks/tcr/input.py
@ -9,6 +9,7 @@ import os
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted
+from calibre.ebooks.compression.tcr import decompress

 class TCRInput(InputFormatPlugin):

@ -31,28 +32,9 @@ class TCRInput(InputFormatPlugin):
    ])

    def convert(self, stream, options, file_ext, log, accelerators):
-        txt = []
-
-        log.debug('Checking TCR header...')
-        if stream.read(9) != '!!8-Bit!!':
-            raise ValueError('File %s contaions an invalid TCR header.' % stream.name)
-
-        log.debug('Building string dictionary...')
-        # Dictionary codes that the file contents are broken down into.
-        entries = []
-        for i in xrange(256):
-            entry_len = ord(stream.read(1))
-            entries.append(stream.read(entry_len))
-
        log.info('Decompressing text...')
-        # Map the values in the file to locations in the string list.
-        entry_loc = stream.read(1)
-        while entry_loc != '': # EOF
-            txt.append(entries[ord(entry_loc)])
-            entry_loc = stream.read(1)
-
        ienc = options.input_encoding if options.input_encoding else 'utf-8'
-        txt = ''.join(txt).decode(ienc, 'replace')
+        txt = decompress(stream).decode(ienc, 'replace')

        log.info('Converting text to OEB...')
        if options.single_line_paras:
--- a/src/calibre/ebooks/tcr/output.py
+++ b/src/calibre/ebooks/tcr/output.py
@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin, \
+    OptionRecommendation
+from calibre.ebooks.txt.txtml import TXTMLizer
+from calibre.ebooks.compression.tcr import compress
+
+class TCROutput(OutputFormatPlugin):
+
+    name = 'TCR Output'
+    author = 'John Schember'
+    file_type = 'tcr'
+
+    options = set([
+        OptionRecommendation(name='output_encoding', recommended_value='utf-8',
+            level=OptionRecommendation.LOW,
+            help=_('Specify the character encoding of the output document. ' \
+            'The default is utf-8.')),
+        OptionRecommendation(name='compression_level', recommended_value=5,
+            level=OptionRecommendation.LOW,
+            help=_('Speciy the compression level to use. Scale 1 - 10. 1 ' \
+            'being the lowest compression but the fastest and 10 being the ' \
+            'highest compression but the slowest.')),
+    ])
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        close = False
+        if not hasattr(output_path, 'write'):
+            close = True
+            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
+                os.makedirs(os.path.dirname(output_path))
+            out_stream = open(output_path, 'wb')
+        else:
+            out_stream = output_path
+
+        setattr(opts, 'flush_paras', False)
+        setattr(opts, 'max_line_length', 0)
+        setattr(opts, 'force_max_line_length', False)
+        setattr(opts, 'indent_paras', False)
+
+        writer = TXTMLizer(log)
+        txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace')
+
+        log.info('Compressing text...')
+        txt = compress(txt, opts.compression_level)
+
+        out_stream.seek(0)
+        out_stream.truncate()
+        out_stream.write(txt)
+
+        if close:
+            out_stream.close()