TCR Output. Move TCR decompression to TCR compression file.

This commit is contained in:
John Schember 2009-10-19 07:15:27 -04:00
parent 1e3832a204
commit df6d759b38
4 changed files with 188 additions and 20 deletions

View File

@ -352,6 +352,7 @@ from calibre.ebooks.pdf.output import PDFOutput
from calibre.ebooks.pml.output import PMLOutput from calibre.ebooks.pml.output import PMLOutput
from calibre.ebooks.rb.output import RBOutput from calibre.ebooks.rb.output import RBOutput
from calibre.ebooks.rtf.output import RTFOutput from calibre.ebooks.rtf.output import RTFOutput
from calibre.ebooks.tcr.output import TCROutput
from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.txt.output import TXTOutput
from calibre.customize.profiles import input_profiles, output_profiles from calibre.customize.profiles import input_profiles, output_profiles
@ -402,6 +403,7 @@ plugins += [
PMLOutput, PMLOutput,
RBOutput, RBOutput,
RTFOutput, RTFOutput,
TCROutput,
TXTOutput, TXTOutput,
] ]
plugins += [ plugins += [

View File

@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
def decompress(stream):
    '''
    Decompress a TCR file and return the text it holds as a byte string.

    A TCR file is the 9 byte identifier "!!8-Bit!!", followed by a dictionary
    of 256 length-prefixed strings (1 length byte, then that many bytes),
    followed by the coded text. Every byte of the coded text is an index
    into the dictionary.

    :param stream: Seekable binary file-like object. It is rewound to the
        start before anything is read.
    :return: The decompressed text as a byte string.
    :raises ValueError: If the identifier is wrong or the dictionary is cut
        short.
    '''
    # Not every stream has a name (in-memory buffers do not); the error
    # report must not fail with an AttributeError of its own.
    name = getattr(stream, 'name', '<stream>')

    stream.seek(0)
    if stream.read(9) != b'!!8-Bit!!':
        raise ValueError('File %s contaions an invalid TCR header.' % name)

    # Codes that the file contents are broken down into.
    entries = []
    for _ in range(256):
        length_byte = stream.read(1)
        if not length_byte:
            # read() returns an empty string at EOF; ord() on it would raise
            # an unrelated TypeError.
            raise ValueError('File %s contains a truncated TCR dictionary.' % name)
        entry_len = ord(length_byte)
        entry = stream.read(entry_len)
        if len(entry) != entry_len:
            raise ValueError('File %s contains a truncated TCR dictionary.' % name)
        entries.append(entry)

    # Map the values in the file to locations in the string list. The rest
    # of the file is read in one call; iterating a bytearray yields integer
    # codes on both Python 2 and Python 3.
    return b''.join([entries[code] for code in bytearray(stream.read())])
def compress(txt, level=5):
    '''
    Compress a byte string into the TCR format.

    TCR compression takes the form header+code_list+coded_text. The header
    is always "!!8-Bit!!". The code list is a list of 256 strings, each
    stored as 1 length byte followed by the string itself. Each position in
    the list corresponds to a code found in the coded text. The coded text is
    a string of character values: for instance the character Q represents the
    value 81, which corresponds to the string at position 81 of the code list.

    :param txt: The text to compress, as a byte string (already encoded).
    :param level: Compression level on a scale of 1 - 10. It limits how many
        new codes are created per pass: 1 (or lower) creates up to 256 and is
        the fastest, 10 (or higher) creates 1 and is the slowest but gives
        merging the most chances to find better codes.
    :return: The complete TCR file contents as a byte string.
    '''
    # One single-byte string per code value. Built through bytearray so the
    # result is a byte string on both Python 2 and Python 3.
    chars = [bytes(bytearray((n,))) for n in range(256)]

    # Every byte value present in the text starts out as the code for
    # itself, so the text needs no initial recoding. Codes whose string is
    # empty are free and never occur in the coded text.
    present = set(bytearray(txt))
    codes = [chars[n] if n in present else b'' for n in range(256)]

    # Set the compression level. This has to be a single if/elif/else chain,
    # otherwise the value chosen for level <= 1 is immediately overwritten.
    if level <= 1:
        new_length = 256
    elif level >= 10:
        new_length = 1
    else:
        new_length = max(1, int(256 * (10 - level) * .1))

    any_pair = re.compile(b'..', re.DOTALL)

    # Start compressing the text.
    new = True
    merged = True
    while new or merged:
        # Merge codes that are always followed by the same code. Each merge
        # is decided against the current text and applied right away, so a
        # decision can never be invalidated by a merge applied before it.
        merged = False
        for i in range(256):
            if not codes[i]:
                continue
            # Find all codes that are next to i.
            pairs = re.findall(re.escape(chars[i]) + b'.', txt, re.DOTALL)
            if len(set(pairs)) != 1:
                continue
            # We are searching codes and each code is always 1 character.
            j = ord(pairs[0][1:2])
            # Every occurrence of i must be inside one of the pairs found.
            # If i also ends the text (or an odd run of itself) that
            # occurrence has no follower, and redefining i would append the
            # string for j to it when decompressing.
            covered = len(pairs) * (2 if i == j else 1)
            if txt.count(chars[i]) != covered:
                continue
            # Only merge if the resulting string still fits the 1 byte
            # length prefix.
            if len(codes[i]) + len(codes[j]) >= 256:
                continue
            # Merge the string for j into the string for i.
            codes[i] = codes[i] + codes[j]
            txt = txt.replace(chars[i] + chars[j], chars[i])
            if chars[j] not in txt:
                codes[j] = b''
            merged = True

        new = False
        if b'' in codes:
            # Create a list of codes based on combinations of codes that are
            # next to each other. The amount of savings for the new code is
            # calculated. Candidates are visited in sorted order so the
            # output is deterministic.
            new_codes = []
            for pair in sorted(set(any_pair.findall(txt))):
                i = ord(pair[0:1])
                j = ord(pair[1:2])
                expansion = codes[i] + codes[j]
                if expansion in codes:
                    continue
                savings = txt.count(pair) - len(codes[i]) - len(codes[j])
                if savings > 2 and len(expansion) < 256:
                    new_codes.append((savings, pair, expansion))
            if new_codes:
                new = True
                # Sort the codes from highest savings to lowest.
                new_codes.sort(key=lambda candidate: candidate[0], reverse=True)
                # The shorter new_length is, the more often merging happens,
                # giving more chances for better codes to be created.
                # However, the shorter new_length is, the longer it takes to
                # compress.
                for savings, pair, expansion in new_codes[:new_length]:
                    if b'' not in codes:
                        break
                    if pair not in txt:
                        # An earlier replacement in this pass consumed every
                        # occurrence; don't waste a dictionary slot on it.
                        continue
                    c = codes.index(b'')
                    codes[c] = expansion
                    txt = txt.replace(pair, chars[c])

    # Generate the code dictionary: 256 length-prefixed strings.
    header = [chars[len(code)] + code for code in codes]
    # Join the identifier with the dictionary and coded text.
    return b'!!8-Bit!!' + b''.join(header) + txt

View File

@ -9,6 +9,7 @@ import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted separate_paragraphs_single_line, separate_paragraphs_print_formatted
from calibre.ebooks.compression.tcr import decompress
class TCRInput(InputFormatPlugin): class TCRInput(InputFormatPlugin):
@ -31,28 +32,9 @@ class TCRInput(InputFormatPlugin):
]) ])
def convert(self, stream, options, file_ext, log, accelerators): def convert(self, stream, options, file_ext, log, accelerators):
txt = []
log.debug('Checking TCR header...')
if stream.read(9) != '!!8-Bit!!':
raise ValueError('File %s contaions an invalid TCR header.' % stream.name)
log.debug('Building string dictionary...')
# Dictionary codes that the file contents are broken down into.
entries = []
for i in xrange(256):
entry_len = ord(stream.read(1))
entries.append(stream.read(entry_len))
log.info('Decompressing text...') log.info('Decompressing text...')
# Map the values in the file to locations in the string list.
entry_loc = stream.read(1)
while entry_loc != '': # EOF
txt.append(entries[ord(entry_loc)])
entry_loc = stream.read(1)
ienc = options.input_encoding if options.input_encoding else 'utf-8' ienc = options.input_encoding if options.input_encoding else 'utf-8'
txt = ''.join(txt).decode(ienc, 'replace') txt = decompress(stream).decode(ienc, 'replace')
log.info('Converting text to OEB...') log.info('Converting text to OEB...')
if options.single_line_paras: if options.single_line_paras:

View File

@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ebooks.txt.txtml import TXTMLizer
from calibre.ebooks.compression.tcr import compress
class TCROutput(OutputFormatPlugin):
    # Output plugin that renders a book to plain text with TXTMLizer and
    # writes it out TCR compressed.

    name = 'TCR Output'
    author = 'John Schember'
    file_type = 'tcr'

    options = set([
        OptionRecommendation(name='output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. ' \
            'The default is utf-8.')),
        OptionRecommendation(name='compression_level', recommended_value=5,
            level=OptionRecommendation.LOW,
            help=_('Speciy the compression level to use. Scale 1 - 10. 1 ' \
            'being the lowest compression but the fastest and 10 being the ' \
            'highest compression but the slowest.')),
    ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write oeb_book to output_path as a TCR file.

        output_path is either a filesystem path or an object with a write
        method. A path is opened here (missing parent directories are
        created first) and closed again when done, even if the conversion
        fails. A stream passed in by the caller is left open.
        '''
        close = False
        if not hasattr(output_path, 'write'):
            close = True
            out_dir = os.path.dirname(output_path)
            # An empty dirname means the current directory; nothing to create.
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            # Options read by TXTMLizer; set so the text is emitted without
            # line wrapping or paragraph reformatting.
            setattr(opts, 'flush_paras', False)
            setattr(opts, 'max_line_length', 0)
            setattr(opts, 'force_max_line_length', False)
            setattr(opts, 'indent_paras', False)

            writer = TXTMLizer(log)
            txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace')

            log.info('Compressing text...')
            txt = compress(txt, opts.compression_level)

            # Discard anything the stream already holds before writing.
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(txt)
        finally:
            # Only close what was opened here, and do so even when
            # extraction, compression or writing raised.
            if close:
                out_stream.close()