TCR Output. Move TCR decompression to TCR compression file.

This commit is contained in:
John Schember 2009-10-19 07:15:27 -04:00
parent 1e3832a204
commit df6d759b38
4 changed files with 188 additions and 20 deletions

View File

@ -352,6 +352,7 @@ from calibre.ebooks.pdf.output import PDFOutput
from calibre.ebooks.pml.output import PMLOutput
from calibre.ebooks.rb.output import RBOutput
from calibre.ebooks.rtf.output import RTFOutput
from calibre.ebooks.tcr.output import TCROutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.customize.profiles import input_profiles, output_profiles
@ -402,6 +403,7 @@ plugins += [
PMLOutput,
RBOutput,
RTFOutput,
TCROutput,
TXTOutput,
]
plugins += [

View File

@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
def decompress(stream):
txt = []
stream.seek(0)
if stream.read(9) != '!!8-Bit!!':
raise ValueError('File %s contaions an invalid TCR header.' % stream.name)
# Codes that the file contents are broken down into.
entries = []
for i in xrange(256):
entry_len = ord(stream.read(1))
entries.append(stream.read(entry_len))
# Map the values in the file to locations in the string list.
entry_loc = stream.read(1)
while entry_loc != '': # EOF
txt.append(entries[ord(entry_loc)])
entry_loc = stream.read(1)
return ''.join(txt)
def compress(txt, level=5):
'''
TCR compression takes the form header+code_list+coded_text.
The header is always "!!8-Bit!!". The code list is a list of 256 strings.
The list takes the form 1 byte length and then a string. Each position in
The list corresponds to a code found in the file. The coded text is
string of characters vaules. for instance the character Q represents the
value 81 which corresponds to the string in the code list at position 81.
'''
# Turn each unique character into a coded value.
# The code of the string at a given position are represented by the position
# they occupy in the list.
codes = list(set(re.findall('(?msu).', txt)))
for i in range(len(codes), 256):
codes.append('')
# Set the compression level.
if level <= 1:
new_length = 256
if level >= 10:
new_length = 1
else:
new_length = int(256 * (10 - level) * .1)
new_length = 1 if new_length < 1 else new_length
# Replace txt with codes.
coded_txt = ''
for c in txt:
coded_txt += chr(codes.index(c))
txt = coded_txt
# Start compressing the text.
new = True
merged = True
while new or merged:
# Merge codes that always follow another code
merge = []
merged = False
for i in xrange(256):
if codes[i] != '':
# Find all codes that are next to i.
fall = list(set(re.findall('(?msu)%s.' % re.escape(chr(i)), txt)))
# 1 if only one code comes after i.
if len(fall) == 1:
# We are searching codes and each code is always 1 character.
j = ord(fall[0][1:2])
# Only merge if the total length of the string represented by
# code is less than 256.
if len(codes[i]) + len(codes[j]) < 256:
merge.append((i, j))
if merge:
merged = True
for i, j in merge:
# Merge the string for j into the string for i.
if i == j:
# Don't use += here just in case something goes wrong. This
# will prevent out of control memory consumption. This is
# unecessary but when creating this routine it happened due
# to an error.
codes[i] = codes[i] + codes[i]
else:
codes[i] = codes[i] + codes[j]
txt = txt.replace(chr(i)+chr(j), chr(i))
if chr(j) not in txt:
codes[j] = ''
new = False
if '' in codes:
# Create a list of codes based on combinations of codes that are next
# to each other. The amount of savings for the new code is calculated.
new_codes = []
for c in list(set(re.findall('(?msu)..', txt))):
i = ord(c[0:1])
j = ord(c[1:2])
if codes[i]+codes[j] in codes:
continue
savings = txt.count(chr(i)+chr(j)) - len(codes[i]) - len(codes[j])
if savings > 2 and len(codes[i]) + len(codes[j]) < 256:
new_codes.append((savings, i, j, codes[i], codes[j]))
if new_codes:
new = True
# Sort the codes from highest savings to lowest.
new_codes.sort(lambda x, y: -1 if x[0] > y[0] else 1 if x[0] < y[0] else 0)
# The shorter new_length the more chances time merging will happen
# giving more changes for better codes to be created. However,
# the shorter new_lengh the longer it will take to compress.
new_codes = new_codes[:new_length]
for code in new_codes:
if '' not in codes:
break
c = codes.index('')
codes[c] = code[3]+code[4]
txt = txt.replace(chr(code[1])+chr(code[2]), chr(c))
# Generate the code dictionary.
header = []
for code in codes:
header.append(chr(len(code))+code)
for i in xrange(len(header), 256):
header.append(chr(0))
# Join the identifier with the dictionary and coded text.
return '!!8-Bit!!'+''.join(header)+txt

View File

@ -9,6 +9,7 @@ import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted
from calibre.ebooks.compression.tcr import decompress
class TCRInput(InputFormatPlugin):
@ -31,28 +32,9 @@ class TCRInput(InputFormatPlugin):
])
def convert(self, stream, options, file_ext, log, accelerators):
txt = []
log.debug('Checking TCR header...')
if stream.read(9) != '!!8-Bit!!':
raise ValueError('File %s contaions an invalid TCR header.' % stream.name)
log.debug('Building string dictionary...')
# Dictionary codes that the file contents are broken down into.
entries = []
for i in xrange(256):
entry_len = ord(stream.read(1))
entries.append(stream.read(entry_len))
log.info('Decompressing text...')
# Map the values in the file to locations in the string list.
entry_loc = stream.read(1)
while entry_loc != '': # EOF
txt.append(entries[ord(entry_loc)])
entry_loc = stream.read(1)
ienc = options.input_encoding if options.input_encoding else 'utf-8'
txt = ''.join(txt).decode(ienc, 'replace')
txt = decompress(stream).decode(ienc, 'replace')
log.info('Converting text to OEB...')
if options.single_line_paras:

View File

@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ebooks.txt.txtml import TXTMLizer
from calibre.ebooks.compression.tcr import compress
class TCROutput(OutputFormatPlugin):
name = 'TCR Output'
author = 'John Schember'
file_type = 'tcr'
options = set([
OptionRecommendation(name='output_encoding', recommended_value='utf-8',
level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. ' \
'The default is utf-8.')),
OptionRecommendation(name='compression_level', recommended_value=5,
level=OptionRecommendation.LOW,
help=_('Speciy the compression level to use. Scale 1 - 10. 1 ' \
'being the lowest compression but the fastest and 10 being the ' \
'highest compression but the slowest.')),
])
def convert(self, oeb_book, output_path, input_plugin, opts, log):
close = False
if not hasattr(output_path, 'write'):
close = True
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
os.makedirs(os.path.dirname(output_path))
out_stream = open(output_path, 'wb')
else:
out_stream = output_path
setattr(opts, 'flush_paras', False)
setattr(opts, 'max_line_length', 0)
setattr(opts, 'force_max_line_length', False)
setattr(opts, 'indent_paras', False)
writer = TXTMLizer(log)
txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace')
log.info('Compressing text...')
txt = compress(txt, opts.compression_level)
out_stream.seek(0)
out_stream.truncate()
out_stream.write(txt)
if close:
out_stream.close()