Fix bug 2342. ereader inspector script to aid in implementing writer. ereader writer tweaks.

This commit is contained in:
John Schember 2009-04-26 17:09:23 -04:00
parent 1daf7bd86a
commit ccdb992992
3 changed files with 131 additions and 3 deletions

View File

@ -95,6 +95,18 @@ class HTMLPreProcessor(object):
# Fix pdftohtml markup
PDFTOHTML = [
# Fix umlauts
(re.compile(u'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ö'),
(re.compile(u'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ö'),
(re.compile(u'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ü'),
(re.compile(u'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ü'),
(re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
(re.compile(u'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ë'),
(re.compile(u'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ï'),
(re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
# Remove page links
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
# Remove <hr> tags

View File

@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
'''
Inspect the header of ereader files. This is primarily used for debugging.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import struct, sys
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb.ereader.reader import HeaderRecord
def pdb_header_info(header):
    '''
    Print the generic PDB header fields of an already-parsed file.

    :param header: Object exposing ``ident``, ``num_sections`` and ``title``
        attributes (``main`` passes a ``PdbHeaderReader``).
    '''
    # A single parenthesised argument prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print('PDB Header Info:')
    print('')
    print('Identity: %s' % header.ident)
    # This label used to be misspelled "Total Sectons".
    print('Total Sections: %s' % header.num_sections)
    print('Title: %s' % header.title)
    print('')
def ereader_header_info(header):
    '''
    Dump the first 54 bytes of record 0 as 27 big-endian unsigned 16-bit
    values, one per line.

    Every line is labelled with the byte range the value was read from,
    followed by a field name for the offsets listed in ``known_fields``.

    :param header: Object whose ``section_data(0)`` returns the raw bytes of
        record 0.
    '''
    # Offsets that carry a name in the output; every other offset is printed
    # with its byte range only.
    known_fields = {
        0: 'Version',
        12: 'Non-Text',
        28: 'footnote_rec',
        30: 'sidebar_rec',
        32: 'bookmark_offset',
        40: 'image_data_offset',
        44: 'metadata_offset',
        48: 'footnote_offset',
        50: 'sidebar_offset',
        52: 'last_data_offset',
    }
    record0 = header.section_data(0)

    print('Ereader Record 0 (Header) Info:')
    print('')
    for start in range(0, 54, 2):
        stop = start + 2
        label = '%i-%i' % (start, stop)
        if start in known_fields:
            label = '%s %s' % (label, known_fields[start])
        # Unpack field by field so that a short record still prints every
        # complete field before struct.error is raised.
        value, = struct.unpack('>H', record0[start:stop])
        print('%s: %i' % (label, value))
    print('')
def section_lengths(header):
    '''
    Print the byte length of every section, marking each one that is larger
    than 65505 bytes with ``<--- Over!``.

    :param header: Object providing ``section_count()`` and
        ``section_data(index)``.
    '''
    print('Section Sizes')
    print('')
    for index in range(header.section_count()):
        length = len(header.section_data(index))
        # NOTE(review): 65505 is presumably the largest record size the
        # format allows -- confirm against the writer.
        flag = '<--- Over!' if length > 65505 else ''
        # The space before the flag is emitted even when the flag is empty.
        print('Section %i: %i %s' % (index, length, flag))
def main(args=sys.argv):
    '''
    Print header information for the ereader file named in ``args``.

    :param args: Argument list laid out like ``sys.argv``; ``args[1]`` is the
        path of the file to inspect.
    :return: 0 on success, 1 when no input file was given.
    '''
    if len(args) < 2:
        print('Error: requires input file.')
        return 1

    # Read the path from ``args``; it used to come from sys.argv even when a
    # caller supplied its own argument list.
    f = open(args[1], 'rb')
    try:
        pheader = PdbHeaderReader(f)
        pdb_header_info(pheader)
        ereader_header_info(pheader)
        section_lengths(pheader)
    finally:
        # Close only after every report has run: pheader was built on this
        # file object and may still read from it.
        f.close()
    return 0
# Run the inspector when this file is executed as a script; the value
# returned by main() becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())

View File

@ -27,7 +27,7 @@ class Writer(object):
hr = [self._header_record(len(text), len(images))]
sections = hr+text+images+metadata
sections = hr+text+images+metadata+['MeTaInFo\x00']
lengths = [len(i) for i in sections]
@ -82,7 +82,7 @@ class Writer(object):
if image_items > 0:
image_data_offset = text_items + 1
meta_data_offset = image_data_offset + image_items
last_data_offset = meta_data_offset + 1
last_data_offset = meta_data_offset + 2
else:
meta_data_offset = text_items + 1
last_data_offset = meta_data_offset + 1
@ -90,6 +90,35 @@ class Writer(object):
record = u''
record += struct.pack('>H', version) # [0:2]
record += struct.pack('>H', 0) # [2:4]
record += struct.pack('>H', 0) # [4:6]
record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC
record += struct.pack('>H', last_data_offset) # [8:10]
record += struct.pack('>H', last_data_offset) # [10:12]
record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset
record += struct.pack('>H', non_text_offset) # [14:16]
record += struct.pack('>H', 1) # [16:18]
record += struct.pack('>H', 1) # [18:20]
record += struct.pack('>H', 0) # [20:22]
record += struct.pack('>H', 1) # [22:24]
record += struct.pack('>H', 1) # [24:26]
record += struct.pack('>H', 0) # [26:28]
record += struct.pack('>H', 0) # [28:30] # footnote_rec
record += struct.pack('>H', 0) # [30:32] # sidebar_rec
record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset
record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC
record += struct.pack('>H', non_text_offset) # [36:38]
record += struct.pack('>H', non_text_offset + 1) # [38:40]
record += struct.pack('>H', image_data_offset) # [40:42]
record += struct.pack('>H', image_data_offset) # [42:44]
record += struct.pack('>H', meta_data_offset) # [44:46]
record += struct.pack('>H', meta_data_offset) # [46:48]
record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset
record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset
record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset
'''
# Version
record += struct.pack('>H', version)
record = record.ljust(12, '\x00')
@ -112,6 +141,6 @@ class Writer(object):
record += struct.pack('>H', last_data_offset)
record = record.ljust(52, '\x00')
record += struct.pack('>H', last_data_offset)
'''
return record