mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TCR Output. Move TCR decompression to TCR compression file.
This commit is contained in:
parent
1e3832a204
commit
df6d759b38
@ -352,6 +352,7 @@ from calibre.ebooks.pdf.output import PDFOutput
|
|||||||
from calibre.ebooks.pml.output import PMLOutput
|
from calibre.ebooks.pml.output import PMLOutput
|
||||||
from calibre.ebooks.rb.output import RBOutput
|
from calibre.ebooks.rb.output import RBOutput
|
||||||
from calibre.ebooks.rtf.output import RTFOutput
|
from calibre.ebooks.rtf.output import RTFOutput
|
||||||
|
from calibre.ebooks.tcr.output import TCROutput
|
||||||
from calibre.ebooks.txt.output import TXTOutput
|
from calibre.ebooks.txt.output import TXTOutput
|
||||||
|
|
||||||
from calibre.customize.profiles import input_profiles, output_profiles
|
from calibre.customize.profiles import input_profiles, output_profiles
|
||||||
@ -402,6 +403,7 @@ plugins += [
|
|||||||
PMLOutput,
|
PMLOutput,
|
||||||
RBOutput,
|
RBOutput,
|
||||||
RTFOutput,
|
RTFOutput,
|
||||||
|
TCROutput,
|
||||||
TXTOutput,
|
TXTOutput,
|
||||||
]
|
]
|
||||||
plugins += [
|
plugins += [
|
||||||
|
126
src/calibre/ebooks/compression/tcr.py
Normal file
126
src/calibre/ebooks/compression/tcr.py
Normal file
@ -0,0 +1,126 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
def decompress(stream):
    '''
    Decompress a TCR file and return its text as a byte string.

    A TCR file is the 9 byte header "!!8-Bit!!", followed by a dictionary of
    256 entries (each one is a 1 byte length followed by that many bytes),
    followed by the coded text. Every byte of the coded text is an index into
    the dictionary.

    :param stream: Seekable file like object opened in binary mode.
    :return: The decompressed text as a byte string. Decoding it to unicode
        is left to the caller.
    :raises ValueError: If the header is wrong or the dictionary is cut short.
    '''
    # Not every stream has a name (in memory buffers for instance). Don't let
    # building the error message raise an unrelated AttributeError.
    name = getattr(stream, 'name', '<stream>')

    stream.seek(0)
    if stream.read(9) != b'!!8-Bit!!':
        raise ValueError('File %s contaions an invalid TCR header.' % name)

    # Codes that the file contents are broken down into.
    entries = []
    for _ in range(256):
        length = stream.read(1)
        if not length:
            # ord() of an empty read would raise an unhelpful TypeError.
            raise ValueError('File %s contains a truncated TCR dictionary.' % name)
        entries.append(stream.read(ord(length)))

    # Map the values in the file to locations in the string list. The rest of
    # the file is read in one go instead of byte by byte. Iterating over a
    # bytearray gives integers on both Python 2 and Python 3.
    coded = bytearray(stream.read())
    return b''.join([entries[code] for code in coded])
|
||||||
|
|
||||||
|
|
||||||
|
def _byte(value):
    # Build a one byte string from an integer in range(256). Works the same
    # on Python 2 and Python 3.
    return bytes(bytearray((value,)))


def _adjacent_pairs(coded):
    # Return the set of all (code, following code) pairs found in coded.
    # Overlapping pairs are included.
    return set(zip(coded, coded[1:]))


def compress(txt, level=5):
    '''
    Compress a byte string using TCR compression.

    TCR compression takes the form header+code_list+coded_text.
    The header is always "!!8-Bit!!". The code list is a list of 256 strings.
    Each entry takes the form of a 1 byte length followed by the string. Each
    position in the list corresponds to a code found in the file. The coded
    text is a string of character values. For instance the character Q
    represents the value 81 which corresponds to the string in the code list
    at position 81.

    :param txt: The text to compress as a byte string (already encoded).
    :param level: Compression level. 1 or lower is the fastest with the least
        compression, 10 or higher is the slowest with the most compression.
    :return: The complete TCR file contents as a byte string.
    '''
    data = bytearray(txt)

    # Turn each unique character into a coded value.
    # The code of the string at a given position is represented by the
    # position it occupies in the list. Unused positions hold an empty string.
    symbols = sorted(set(data))
    codes = [_byte(symbol) for symbol in symbols]
    codes.extend([b''] * (256 - len(codes)))

    # Set the compression level. new_length is the maximum number of new
    # codes created per pass.
    if level <= 1:
        new_length = 256
    elif level >= 10:
        new_length = 1
    else:
        new_length = max(1, int(256 * (10 - level) * .1))

    # Replace txt with codes. A bytearray is used so the elements are
    # integers on both Python 2 and Python 3.
    position = dict((symbol, index) for index, symbol in enumerate(symbols))
    coded = bytearray(position[symbol] for symbol in data)

    # Start compressing the text. Every pass that changes something makes
    # coded shorter so the loop always ends.
    changed = True
    while changed:
        changed = False

        # Merge codes that are always followed by one particular code.
        followers = {}
        for first, second in _adjacent_pairs(coded):
            followers.setdefault(first, []).append(second)
        for i in sorted(followers):
            if len(followers[i]) != 1:
                continue
            j = followers[i][0]
            # A code at the very end of the text has nothing after it, so it
            # is not always followed by j. Merging it would add text to the
            # end of the file. A code can't always be followed by itself
            # either.
            if i == j or coded[-1] == i:
                continue
            pair = bytearray((i, j))
            if coded.find(pair) < 0:
                # An earlier merge in this pass removed every i.
                continue
            # The length is stored in one byte. Check it here and not when
            # collecting because earlier merges in this pass may have made
            # the strings longer.
            if len(codes[i]) + len(codes[j]) > 255:
                continue
            # Merge the string for j into the string for i.
            codes[i] = codes[i] + codes[j]
            coded = coded.replace(pair, bytearray((i,)))
            if j not in coded:
                codes[j] = b''
            changed = True

        if b'' in codes:
            # Create a list of codes based on combinations of codes that are
            # next to each other. The amount of savings for the new code is
            # calculated.
            existing = set(codes)
            new_codes = []
            for i, j in _adjacent_pairs(coded):
                combined = codes[i] + codes[j]
                if len(combined) > 255 or combined in existing:
                    continue
                savings = coded.count(bytearray((i, j))) - len(combined)
                if savings > 2:
                    new_codes.append((savings, i, j, combined))

            # Sort the codes from highest savings to lowest. The codes are
            # used to break ties so the output is always the same.
            new_codes.sort(key=lambda code: (-code[0], code[1], code[2]))
            # The shorter new_length the more times merging will happen
            # giving more chances for better codes to be created. However,
            # the shorter new_length the longer it will take to compress.
            for savings, i, j, combined in new_codes[:new_length]:
                if b'' not in codes:
                    break
                pair = bytearray((i, j))
                if coded.find(pair) < 0:
                    # Used up by a code created earlier in this pass.
                    continue
                # An empty position is never present in coded so it can be
                # reused.
                slot = codes.index(b'')
                codes[slot] = combined
                coded = coded.replace(pair, bytearray((slot,)))
                changed = True

    # Generate the code dictionary. codes always has 256 entries.
    header = b''.join([_byte(len(code)) + code for code in codes])

    # Join the identifier with the dictionary and coded text.
    return b'!!8-Bit!!' + header + bytes(coded)
|
@ -9,6 +9,7 @@ import os
|
|||||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||||
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
|
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted
|
||||||
|
from calibre.ebooks.compression.tcr import decompress
|
||||||
|
|
||||||
class TCRInput(InputFormatPlugin):
|
class TCRInput(InputFormatPlugin):
|
||||||
|
|
||||||
@ -31,28 +32,9 @@ class TCRInput(InputFormatPlugin):
|
|||||||
])
|
])
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log, accelerators):
|
def convert(self, stream, options, file_ext, log, accelerators):
|
||||||
txt = []
|
|
||||||
|
|
||||||
log.debug('Checking TCR header...')
|
|
||||||
if stream.read(9) != '!!8-Bit!!':
|
|
||||||
raise ValueError('File %s contaions an invalid TCR header.' % stream.name)
|
|
||||||
|
|
||||||
log.debug('Building string dictionary...')
|
|
||||||
# Dictionary codes that the file contents are broken down into.
|
|
||||||
entries = []
|
|
||||||
for i in xrange(256):
|
|
||||||
entry_len = ord(stream.read(1))
|
|
||||||
entries.append(stream.read(entry_len))
|
|
||||||
|
|
||||||
log.info('Decompressing text...')
|
log.info('Decompressing text...')
|
||||||
# Map the values in the file to locations in the string list.
|
|
||||||
entry_loc = stream.read(1)
|
|
||||||
while entry_loc != '': # EOF
|
|
||||||
txt.append(entries[ord(entry_loc)])
|
|
||||||
entry_loc = stream.read(1)
|
|
||||||
|
|
||||||
ienc = options.input_encoding if options.input_encoding else 'utf-8'
|
ienc = options.input_encoding if options.input_encoding else 'utf-8'
|
||||||
txt = ''.join(txt).decode(ienc, 'replace')
|
txt = decompress(stream).decode(ienc, 'replace')
|
||||||
|
|
||||||
log.info('Converting text to OEB...')
|
log.info('Converting text to OEB...')
|
||||||
if options.single_line_paras:
|
if options.single_line_paras:
|
||||||
|
58
src/calibre/ebooks/tcr/output.py
Normal file
58
src/calibre/ebooks/tcr/output.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||||
|
OptionRecommendation
|
||||||
|
from calibre.ebooks.txt.txtml import TXTMLizer
|
||||||
|
from calibre.ebooks.compression.tcr import compress
|
||||||
|
|
||||||
|
class TCROutput(OutputFormatPlugin):
    '''
    Output plugin that writes a book as a TCR compressed text file.
    '''

    name = 'TCR Output'
    author = 'John Schember'
    file_type = 'tcr'

    options = set([
        OptionRecommendation(name='output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. ' \
            'The default is utf-8.')),
        OptionRecommendation(name='compression_level', recommended_value=5,
            level=OptionRecommendation.LOW,
            help=_('Speciy the compression level to use. Scale 1 - 10. 1 ' \
            'being the lowest compression but the fastest and 10 being the ' \
            'highest compression but the slowest.')),
    ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Extract the text of oeb_book, compress it and write it out.

        output_path is either a path or an object with a write method. When
        it is a path any missing parent directories are created and the file
        is opened and closed here. An object with a write method is written
        to from its start and is left open.
        '''
        close = False
        if not hasattr(output_path, 'write'):
            close = True
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        # Make sure a file opened here is closed even when extracting,
        # compressing or writing the text fails.
        try:
            # Set on opts before it is handed to the TXTMLizer below.
            setattr(opts, 'flush_paras', False)
            setattr(opts, 'max_line_length', 0)
            setattr(opts, 'force_max_line_length', False)
            setattr(opts, 'indent_paras', False)

            writer = TXTMLizer(log)
            txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace')

            log.info('Compressing text...')
            txt = compress(txt, opts.compression_level)

            # Drop anything already in the stream before writing.
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(txt)
        finally:
            if close:
                out_stream.close()
|
Loading…
x
Reference in New Issue
Block a user