diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index dad77ea3aa..9bfe6d4255 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -95,6 +95,20 @@ class HTMLPreProcessor(object):
 
     # Fix pdftohtml markup
     PDFTOHTML = [
+        # Fix umlauts
+        # NOTE(review): the empty group "()*" cannot consume any text as
+        # written; presumably markup was lost from these patterns -- confirm.
+        (re.compile(u'¨\s*()*\s*o', re.UNICODE), lambda match: u'ö'),
+        (re.compile(u'¨\s*()*\s*O', re.UNICODE), lambda match: u'Ö'),
+        (re.compile(u'¨\s*()*\s*u', re.UNICODE), lambda match: u'ü'),
+        (re.compile(u'¨\s*()*\s*U', re.UNICODE), lambda match: u'Ü'),
+        (re.compile(u'¨\s*()*\s*e', re.UNICODE), lambda match: u'ë'),
+        (re.compile(u'¨\s*()*\s*E', re.UNICODE), lambda match: u'Ë'),
+        (re.compile(u'¨\s*()*\s*i', re.UNICODE), lambda match: u'ï'),
+        (re.compile(u'¨\s*()*\s*I', re.UNICODE), lambda match: u'Ï'),
+        (re.compile(u'¨\s*()*\s*a', re.UNICODE), lambda match: u'ä'),
+        (re.compile(u'¨\s*()*\s*A', re.UNICODE), lambda match: u'Ä'),
+
         # Remove page links
         (re.compile(r'', re.IGNORECASE), lambda match: ''),
         # Remove tags
diff --git a/src/calibre/ebooks/pdb/ereader/inspector.py b/src/calibre/ebooks/pdb/ereader/inspector.py
new file mode 100644
index 0000000000..a3875daad4
--- /dev/null
+++ b/src/calibre/ebooks/pdb/ereader/inspector.py
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+'''
+Inspect the header of ereader files. This is primarily used for debugging.
+'''
+
+__license__ = 'GPL v3'
+__copyright__ = '2009, John Schember '
+__docformat__ = 'restructuredtext en'
+
+import struct, sys
+
+from calibre.ebooks.pdb.header import PdbHeaderReader
+from calibre.ebooks.pdb.ereader.reader import HeaderRecord
+
+# Labels for the record 0 words whose meaning is known, keyed by the byte
+# offset at which the word starts. Unlisted words are printed unlabelled.
+HEADER_FIELDS = {
+    0: 'Version',
+    12: 'Non-Text',
+    28: 'footnote_rec',
+    30: 'sidebar_rec',
+    32: 'bookmark_offset',
+    40: 'image_data_offset',
+    44: 'metadata_offset',
+    48: 'footnote_offset',
+    50: 'sidebar_offset',
+    52: 'last_data_offset',
+}
+
+# Number of leading bytes of record 0 that are dumped, two bytes per word.
+HEADER_LENGTH = 54
+
+# Sections whose size exceeds this many bytes are flagged in the listing.
+MAX_SECTION_SIZE = 65505
+
+def pdb_header_info(header):
+    '''
+    Print the identity, section count and title stored on ``header``.
+    '''
+    print 'PDB Header Info:'
+    print ''
+    print 'Identity: %s' % header.ident
+    print 'Total Sections: %s' % header.num_sections
+    print 'Title: %s' % header.title
+    print ''
+
+def ereader_header_info(header):
+    '''
+    Print section 0 as big-endian unsigned 16-bit words, one per line.
+
+    Each line shows the byte range, a label when one is known, and the value.
+    '''
+    h0 = header.section_data(0)
+
+    print 'Ereader Record 0 (Header) Info:'
+    print ''
+    for start in range(0, HEADER_LENGTH, 2):
+        end = start + 2
+        if len(h0) < end:
+            # A short record would make struct.unpack raise; say so instead.
+            print 'Record 0 ends at byte %i; remaining fields missing.' % len(h0)
+            break
+        label = HEADER_FIELDS.get(start)
+        if label is None:
+            name = '%i-%i' % (start, end)
+        else:
+            name = '%i-%i %s' % (start, end, label)
+        print '%s: %i' % (name, struct.unpack('>H', h0[start:end])[0])
+
+    print ''
+
+def section_lengths(header):
+    '''
+    Print the size of every section, marking those over MAX_SECTION_SIZE.
+    '''
+    print 'Section Sizes'
+    print ''
+
+    for i in range(0, header.section_count()):
+        size = len(header.section_data(i))
+        if size > MAX_SECTION_SIZE:
+            message = '<--- Over!'
+        else:
+            message = ''
+
+        print 'Section %i: %i %s' % (i, size, message)
+
+def main(args=sys.argv):
+    '''
+    Print header details for the file named by ``args[1]``.
+
+    Returns 1 when no input file is given, otherwise 0.
+    '''
+    if len(args) < 2:
+        print 'Error: requires input file.'
+        return 1
+
+    # Read the path from args rather than sys.argv so that callers passing
+    # their own argument list get the file they asked for.
+    f = open(args[1], 'rb')
+    try:
+        pheader = PdbHeaderReader(f)
+
+        pdb_header_info(pheader)
+        ereader_header_info(pheader)
+        section_lengths(pheader)
+    finally:
+        # Close the handle even if parsing or printing fails.
+        f.close()
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py
index 65eb35157e..b831849488 100644
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@@ -27,7 +27,7 @@ class Writer(object):
 
         hr = [self._header_record(len(text), len(images))]
 
-        sections = hr+text+images+metadata
+        sections = hr+text+images+metadata+['MeTaInFo\x00']
 
         lengths = [len(i) for i in sections]
 
@@ -82,7 +82,9 @@ class Writer(object):
         if image_items > 0:
             image_data_offset = text_items + 1
             meta_data_offset = image_data_offset + image_items
-            last_data_offset = meta_data_offset + 1
+            # NOTE(review): the else branch below still adds 1; confirm that
+            # is intended now that a trailing MeTaInFo section is appended.
+            last_data_offset = meta_data_offset + 2
         else:
             meta_data_offset = text_items + 1
             last_data_offset = meta_data_offset + 1
@@ -90,6 +92,36 @@ class Writer(object):
 
-        record = u''
+        # Byte string: struct.pack() output must not be coerced to unicode.
+        record = ''
 
+        record += struct.pack('>H', version) # [0:2]
+        record += struct.pack('>H', 0) # [2:4]
+        record += struct.pack('>H', 0) # [4:6]
+        record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC
+        record += struct.pack('>H', last_data_offset) # [8:10]
+        record += struct.pack('>H', last_data_offset) # [10:12]
+        record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset
+        record += struct.pack('>H', non_text_offset) # [14:16]
+        record += struct.pack('>H', 1) # [16:18]
+        record += struct.pack('>H', 1) # [18:20]
+        record += struct.pack('>H', 0) # [20:22]
+        record += struct.pack('>H', 1) # [22:24]
+        record += struct.pack('>H', 1) # [24:26]
+        record += struct.pack('>H', 0) # [26:28]
+        record += struct.pack('>H', 0) # [28:30] # footnote_rec
+        record += struct.pack('>H', 0) # [30:32] # sidebar_rec
+        record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset
+        record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC
+        record += struct.pack('>H', non_text_offset) # [36:38]
+        record += struct.pack('>H', non_text_offset + 1) # [38:40]
+        record += struct.pack('>H', image_data_offset) # [40:42]
+        record += struct.pack('>H', image_data_offset) # [42:44]
+        record += struct.pack('>H', meta_data_offset) # [44:46]
+        record += struct.pack('>H', meta_data_offset) # [46:48]
+        record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset
+        record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset
+        record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset
+
+        '''
         # Version
         record += struct.pack('>H', version)
         record = record.ljust(12, '\x00')
@@ -112,6 +144,6 @@ class Writer(object):
         record += struct.pack('>H', last_data_offset)
         record = record.ljust(52, '\x00')
         record += struct.pack('>H', last_data_offset)
-
+        '''
 
         return record