mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-26 08:12:25 -04:00 
			
		
		
		
	New output format: TCR
This commit is contained in:
		
							parent
							
								
									eddf7201af
								
							
						
					
					
						commit
						eed5931c85
					
				| @ -352,6 +352,7 @@ from calibre.ebooks.pdf.output import PDFOutput | ||||
| from calibre.ebooks.pml.output import PMLOutput | ||||
| from calibre.ebooks.rb.output import RBOutput | ||||
| from calibre.ebooks.rtf.output import RTFOutput | ||||
| from calibre.ebooks.tcr.output import TCROutput | ||||
| from calibre.ebooks.txt.output import TXTOutput | ||||
| 
 | ||||
| from calibre.customize.profiles import input_profiles, output_profiles | ||||
| @ -402,6 +403,7 @@ plugins += [ | ||||
|     PMLOutput, | ||||
|     RBOutput, | ||||
|     RTFOutput, | ||||
|     TCROutput, | ||||
|     TXTOutput, | ||||
| ] | ||||
| plugins += [ | ||||
|  | ||||
							
								
								
									
										126
									
								
								src/calibre/ebooks/compression/tcr.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										126
									
								
								src/calibre/ebooks/compression/tcr.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,126 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
| 
 | ||||
| __license__ = 'GPL 3' | ||||
| __copyright__ = '2009, John Schember <john@nachtimwald.com>' | ||||
| __docformat__ = 'restructuredtext en' | ||||
| 
 | ||||
| import re | ||||
| 
 | ||||
def decompress(stream):
    '''
    Decode a TCR compressed file and return its text as a byte string.

    A TCR file is the 9 byte header "!!8-Bit!!", followed by a table of 256
    entries (each a 1 byte length and then that many bytes), followed by the
    coded text. Every byte of the coded text is an index into the table.

    :param stream: Seekable binary file-like object positioned anywhere; it
                   is rewound to the start before reading.
    :return: The decoded, still encoded (not unicode), text.
    :raises ValueError: If the header is wrong or the code table is cut short.
    '''
    # In-memory streams have no name; do not let error reporting itself fail.
    name = getattr(stream, 'name', '<stream>')

    stream.seek(0)
    # Byte literals keep the comparison valid for binary streams on both
    # Python 2 and Python 3.
    if stream.read(9) != b'!!8-Bit!!':
        raise ValueError('File %s contains an invalid TCR header.' % name)

    # Codes that the file contents are broken down into.
    entries = []
    for _ in range(256):
        # bytearray() yields integers on every Python version and, unlike
        # ord(), lets a premature end of file be detected explicitly.
        length = bytearray(stream.read(1))
        if not length:
            raise ValueError('File %s contains a truncated TCR code table.' % name)
        entry = stream.read(length[0])
        if len(entry) != length[0]:
            raise ValueError('File %s contains a truncated TCR code table.' % name)
        entries.append(entry)

    # Everything left is coded text: read it in one go and map each value to
    # its entry in the code table.
    return b''.join(entries[code] for code in bytearray(stream.read()))
| 
 | ||||
| 
 | ||||
def _symbol(code):
    # Return the one byte string holding ``code`` (works on Python 2 and 3).
    return bytes(bytearray((code,)))


def _batch_size(level):
    # Translate the 1 - 10 compression level into the number of new codes
    # that may be created per pass. Fewer codes per pass means more passes.
    if level <= 1:
        return 256
    if level >= 10:
        return 1
    return max(1, int(256 * (10 - level) * .1))


def _merge_forced_pairs(codes, data):
    # Fold code j into code i whenever every occurrence of i in the coded
    # text is immediately followed by j. Returns (data, changed).
    followers = {}
    for first, second in set(zip(data, data[1:])):
        followers.setdefault(first, set()).add(second)

    changed = False
    for first in sorted(followers):
        if len(followers[first]) != 1:
            continue
        (second,) = followers[first]
        # Only merge if the string represented by the code stays short
        # enough for its length to fit in one byte.
        if first == second or len(codes[first]) + len(codes[second]) >= 256:
            continue
        lead = _symbol(first)
        pair = lead + _symbol(second)
        # The follower map was built before this pass changed anything and
        # it cannot see a code sitting at the very end of the text. Check
        # against the current text that each occurrence really is followed
        # by the second code, otherwise the merge would invent text.
        occurrences = data.count(lead)
        if occurrences == 0 or occurrences != data.count(pair):
            continue
        codes[first] = codes[first] + codes[second]
        data = data.replace(pair, lead)
        if second not in data:
            codes[second] = b''
        changed = True
    return data, changed


def _add_pair_codes(codes, data, limit):
    # Give unused codes to the adjacent code pairs that save the most space,
    # creating at most ``limit`` codes. Returns (data, changed).
    if b'' not in codes:
        return data, False

    known = set(codes)
    candidates = []
    for first, second in set(zip(data, data[1:])):
        combined = codes[first] + codes[second]
        if combined in known or len(combined) >= 256:
            continue
        savings = data.count(_symbol(first) + _symbol(second)) - len(combined)
        if savings > 2:
            candidates.append((savings, first, second, combined))
    # Highest savings first; ties are broken by code value so the output is
    # reproducible.
    candidates.sort(key=lambda item: (-item[0], item[1], item[2]))

    changed = False
    for _, first, second, combined in candidates[:limit]:
        if b'' not in codes:
            break
        pair = _symbol(first) + _symbol(second)
        # An earlier replacement in this pass may have consumed the pair or
        # produced the same string already; do not waste a code on it.
        if combined in known or data.find(pair) < 0:
            continue
        slot = codes.index(b'')
        codes[slot] = combined
        known.add(combined)
        data = data.replace(pair, _symbol(slot))
        # Codes that vanished from the text can be handed out again.
        for used in set((first, second)):
            if used not in data:
                codes[used] = b''
        changed = True
    return data, changed


def compress(txt, level=5):
    '''
    TCR compression takes the form header+code_list+coded_text.
    The header is always "!!8-Bit!!". The code list is a list of 256 strings.
    Each entry takes the form of a 1 byte length followed by the string. Each
    position in the list corresponds to a code found in the file. The coded
    text is a string of character values. For instance the character Q
    represents the value 81 which corresponds to the string in the code list
    at position 81.

    :param txt: The already encoded text to compress, as a byte string.
    :param level: Compression level on a scale of 1 - 10. 1 is the fastest
                  with the lowest compression, 10 the slowest with the
                  highest. Values outside the range are clamped.
    :return: The complete TCR file contents as a byte string.
    :raises TypeError: If txt is unicode text instead of a byte string.
    '''
    data = bytearray(txt)

    # Turn each unique byte into a coded value. The code of a string is the
    # position it occupies in the list; unused positions hold empty strings.
    alphabet = sorted(set(data))
    codes = [_symbol(byte) for byte in alphabet]
    codes.extend([b''] * (256 - len(codes)))

    # Replace txt with codes in a single pass.
    table = bytearray(256)
    for code, byte in enumerate(alphabet):
        table[byte] = code
    data = data.translate(bytes(table))

    # The smaller the batch the more often merging gets a chance to run,
    # which gives better codes but takes longer.
    limit = _batch_size(level)

    # Every change strictly shortens the coded text, so this terminates.
    changed = True
    while changed:
        data, merged = _merge_forced_pairs(codes, data)
        data, added = _add_pair_codes(codes, data, limit)
        changed = merged or added

    # Generate the code dictionary and join the identifier with the
    # dictionary and the coded text.
    header = b''.join(_symbol(len(code)) + code for code in codes)
    return b'!!8-Bit!!' + header + bytes(data)
| @ -9,6 +9,7 @@ import os | ||||
| from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation | ||||
| from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ | ||||
|     separate_paragraphs_single_line, separate_paragraphs_print_formatted | ||||
| from calibre.ebooks.compression.tcr import decompress | ||||
| 
 | ||||
| class TCRInput(InputFormatPlugin): | ||||
| 
 | ||||
| @ -31,28 +32,9 @@ class TCRInput(InputFormatPlugin): | ||||
|     ]) | ||||
| 
 | ||||
|     def convert(self, stream, options, file_ext, log, accelerators): | ||||
|         txt = [] | ||||
| 
 | ||||
|         log.debug('Checking TCR header...') | ||||
|         if stream.read(9) != '!!8-Bit!!': | ||||
|             raise ValueError('File %s contaions an invalid TCR header.' % stream.name) | ||||
| 
 | ||||
|         log.debug('Building string dictionary...') | ||||
|         # Dictionary codes that the file contents are broken down into. | ||||
|         entries = [] | ||||
|         for i in xrange(256): | ||||
|             entry_len = ord(stream.read(1)) | ||||
|             entries.append(stream.read(entry_len)) | ||||
| 
 | ||||
|         log.info('Decompressing text...') | ||||
|         # Map the values in the file to locations in the string list. | ||||
|         entry_loc = stream.read(1) | ||||
|         while entry_loc != '': # EOF | ||||
|             txt.append(entries[ord(entry_loc)]) | ||||
|             entry_loc = stream.read(1) | ||||
| 
 | ||||
|         ienc = options.input_encoding if options.input_encoding else 'utf-8' | ||||
|         txt = ''.join(txt).decode(ienc, 'replace') | ||||
|         txt = decompress(stream).decode(ienc, 'replace') | ||||
| 
 | ||||
|         log.info('Converting text to OEB...') | ||||
|         if options.single_line_paras: | ||||
|  | ||||
							
								
								
									
										58
									
								
								src/calibre/ebooks/tcr/output.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										58
									
								
								src/calibre/ebooks/tcr/output.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,58 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
| 
 | ||||
| __license__ = 'GPL 3' | ||||
| __copyright__ = '2009, John Schember <john@nachtimwald.com>' | ||||
| __docformat__ = 'restructuredtext en' | ||||
| 
 | ||||
| import os | ||||
| 
 | ||||
| from calibre.customize.conversion import OutputFormatPlugin, \ | ||||
|     OptionRecommendation | ||||
| from calibre.ebooks.txt.txtml import TXTMLizer | ||||
| from calibre.ebooks.compression.tcr import compress | ||||
| 
 | ||||
class TCROutput(OutputFormatPlugin):
    '''
    Conversion output plugin that writes a book as a TCR compressed text file.
    '''

    name = 'TCR Output'
    author = 'John Schember'
    file_type = 'tcr'

    options = set([
        OptionRecommendation(name='output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is utf-8.')),
        OptionRecommendation(name='compression_level', recommended_value=5,
            level=OptionRecommendation.LOW,
            help=_('Specify the compression level to use. Scale 1 - 10. 1 '
            'being the lowest compression but the fastest and 10 being the '
            'highest compression but the slowest.')),
    ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Render oeb_book as text, TCR compress it and write it out.

        :param oeb_book: The book to convert; handed to TXTMLizer.
        :param output_path: Either a filesystem path or an object with a
                            write method. A path is opened (creating missing
                            parent directories) and closed again here; a
                            stream is left open for the caller.
        :param input_plugin: Unused.
        :param opts: Conversion options. output_encoding and
                     compression_level are read; the text layout options
                     are overridden below.
        :param log: Logger used for progress messages.
        '''
        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            # Layout options read by the text writer; this plugin does not
            # expose them, so pin them to plain, unwrapped output.
            opts.flush_paras = False
            opts.max_line_length = 0
            opts.force_max_line_length = False
            opts.indent_paras = False

            writer = TXTMLizer(log)
            txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace')

            log.info('Compressing text...')
            txt = compress(txt, opts.compression_level)

            # Discard anything a caller supplied stream may already contain.
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(txt)
        finally:
            # Only close what was opened here, and do so even when the
            # conversion fails so the file handle is not leaked.
            if close:
                out_stream.close()
| @ -21,7 +21,7 @@ What formats does |app| support conversion to/from? | ||||
| It can convert every input format in the following list, to every output format. | ||||
| 
 | ||||
| *Input Formats:* CBZ, CBR, CBC, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT | ||||
| *Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PML, RB, PDF, TXT | ||||
| *Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PML, RB, PDF, TCR, TXT | ||||
| 
 | ||||
| ** PRC is a generic format, |app| supports PRC files with TextRead and MOBIBook headers | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user