From 4c49aa5c60cf711c2e404b787b0d5ec40a765d35 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 10 May 2009 12:23:26 -0400 Subject: [PATCH] eReader output. HTML_TO_PML cleanup. --- src/calibre/ebooks/pdb/__init__.py | 6 +- src/calibre/ebooks/pdb/ereader/reader.py | 2 +- src/calibre/ebooks/pdb/ereader/writer.py | 135 ++++++++++++----------- src/calibre/ebooks/pdb/formatwriter.py | 2 +- src/calibre/ebooks/pdb/output.py | 2 +- src/calibre/ebooks/pdb/palmdoc/writer.py | 2 +- src/calibre/ebooks/pdb/ztxt/writer.py | 2 +- src/calibre/ebooks/pml/input.py | 6 +- src/calibre/ebooks/pml/pmlconverter.py | 17 +-- 9 files changed, 86 insertions(+), 88 deletions(-) diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index 70a12ceb96..d8850cfb16 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -22,10 +22,12 @@ FORMAT_READERS = { from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer from calibre.ebooks.pdb.ztxt.writer import Writer as ztxt_writer +from calibre.ebooks.pdb.ereader.writer import Writer as ereader_writer FORMAT_WRITERS = { - 'doc' : palmdoc_writer, - 'ztxt' : ztxt_writer, + 'doc' : palmdoc_writer, + 'ztxt' : ztxt_writer, + 'ereader' : ereader_writer, } IDENTITY_TO_NAME = { diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 13e204fd5e..90138180d2 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -126,7 +126,7 @@ class Reader(FormatReader): with CurrentDir(output_dir): with open('index.html', 'wb') as index: self.log.debug('Writing text to index.html') - index.write(html) + index.write(html.encode('utf-8')) if not os.path.exists(os.path.join(output_dir, 'images/')): os.makedirs(os.path.join(output_dir, 'images/')) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index ea9144579c..3f2e0d9225 100644 --- 
a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -12,19 +12,29 @@ import struct, zlib import Image, cStringIO +from calibre.ebooks.pdb.formatwriter import FormatWriter from calibre.ebooks.oeb.base import OEB_IMAGES from calibre.ebooks.pdb.header import PdbHeaderBuilder from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.pml.pmlconverter import html_to_pml +# We are using the older identity because we do not use newer features +# (sidebar, footnotes). This will ensure compatibility with older readers. +# If newer features are used (anything supported by dropbook but not by makebook), +# change the identity to the newer PNRdPPrs. IDENTITY = 'PNPdPPrs' -class Writer(object): +# This is an arbitrary number that is small enough to work. The actual maximum +# record size is unknown. +MAX_RECORD_SIZE = 3560 + +class Writer(FormatWriter): - def __init__(self, log): + def __init__(self, opts, log): + self.opts = opts self.log = log - def dump(self, oeb_book, out_stream, metadata=None): + def write_content(self, oeb_book, out_stream, metadata=None): text = self._text(oeb_book.spine) images = self._images(oeb_book.manifest) metadata = [self._metadata(metadata)] @@ -42,26 +52,28 @@ class Writer(object): out_stream.write(item) def _text(self, pages): - # Todo: Split pages over 65505 Bytes - pml_pages = [] - + pml = '' for page in pages: - pml_pages.append(zlib.compress(html_to_pml(unicode(page)).encode('utf-8'))) + pml += html_to_pml(unicode(page)).encode('cp1252') + + pml_pages = [] + for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1): + pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE])) return pml_pages def _images(self, manifest): - # Todo: resize images over 65505 Bytes images = [] for item in manifest: if item.media_type in OEB_IMAGES: - image = '\x00\x00\x00\x00' + image = 'PNG ' image += image_name(item.href) image = image.ljust(62, '\x00') - im = 
Image.open(cStringIO.StringIO(item.data)) + im = Image.open(cStringIO.StringIO(item.data)).convert('P') + im.thumbnail((300,300), Image.ANTIALIAS) data = cStringIO.StringIO() im.save(data, 'PNG') @@ -83,22 +95,38 @@ class Writer(object): publisher\x00 isbn\x00 ''' - return '\x00\x00\x00\x00\x00' + + title = _('Unknown') + author = _('Unknown') + copyright = '' + publisher = '' + isbn = '' + + if metadata != None: + if len(metadata.title) >= 1: + title = metadata.title[0].value + if len(metadata.creator) >= 1: + from calibre.ebooks.metadata import authors_to_string + author = authors_to_string([x.value for x in metadata.creator]) + if len(metadata.rights) >= 1: + copyright = metadata.rights[0].value + if len(metadata.publisher) >= 1: + publisher = metadata.publisher[0].value + + return '%s\x00%s\x00%s\x00%s\x00%s\x00' % (title, author, copyright, publisher, isbn) def _header_record(self, text_items, image_items): - # Todo: Find out more about header and add correct values to the file - # can be read by eReader reader software. ''' text_items = the number of text pages image_items = the number of images ''' - version = 10 + version = 10 # Zlib compression non_text_offset = text_items + 1 if image_items > 0: image_data_offset = text_items + 1 meta_data_offset = image_data_offset + image_items - last_data_offset = meta_data_offset + 2 + last_data_offset = meta_data_offset + 1 else: meta_data_offset = text_items + 1 last_data_offset = meta_data_offset + 1 @@ -106,61 +134,36 @@ class Writer(object): record = '' - record += struct.pack('>H', version) # [0:2] + record += struct.pack('>H', version) # [0:2] # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 
260 and 272 = DRM record += struct.pack('>H', 0) # [2:4] record += struct.pack('>H', 0) # [4:6] - record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC - record += struct.pack('>H', last_data_offset) # [8:10] - record += struct.pack('>H', last_data_offset) # [10:12] - record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset - record += struct.pack('>H', non_text_offset) # [14:16] - record += struct.pack('>H', 1) # [16:18] - record += struct.pack('>H', 1) # [18:20] - record += struct.pack('>H', 0) # [20:22] - record += struct.pack('>H', 1) # [22:24] - record += struct.pack('>H', 1) # [24:26] + record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC. Somehow represents the cp1252 encoding of the text + record += struct.pack('>H', 0) # [8:10] + record += struct.pack('>H', 0) # [10:12] + record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset + record += struct.pack('>H', 0) # [14:16] + record += struct.pack('>H', 0) # [16:18] + record += struct.pack('>H', 0) # [18:20] + record += struct.pack('>H', image_items) # [20:22] # Number of images + record += struct.pack('>H', 0) # [22:24] + record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not record += struct.pack('>H', 0) # [26:28] - record += struct.pack('>H', 0) # [28:30] # footnote_rec - record += struct.pack('>H', 0) # [30:32] # sidebar_rec - record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset - record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC - record += struct.pack('>H', non_text_offset) # [36:38] - record += struct.pack('>H', non_text_offset + 1) # [38:40] - record += struct.pack('>H', image_data_offset) # [40:42] - record += struct.pack('>H', image_data_offset) # [42:44] - record += struct.pack('>H', meta_data_offset) # [44:46] - record += struct.pack('>H', meta_data_offset) # [46:48] - record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset - record += struct.pack('>H', last_data_offset) # [50:52] # 
sidebar_offset - record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset + record += struct.pack('>H', 0) # [28:30] # footnote_rec + record += struct.pack('>H', 0) # [30:32] # sidebar_rec + record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset + record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC + record += struct.pack('>H', 0) # [36:38] + record += struct.pack('>H', 0) # [38:40] + record += struct.pack('>H', image_data_offset) # [40:42] # image_data_offset. This will be the last data offset if there are no images + record += struct.pack('>H', 0) # [42:44] + record += struct.pack('>H', meta_data_offset) # [44:46] # meta_data_offset. This will be the last data offset if there are no images + record += struct.pack('>H', 0) # [46:48] + record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset. This will be the last data offset if there are no images + record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset. This will be the last data offset if there are no images + record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset - record += struct.pack('>H', 1) # [54:56] - for i in range(56, 132, 2): - record += struct.pack('>H', 0) + for i in range(54, 132, 2): + record += struct.pack('>H', 0) # [54:132] - ''' - # Version - record += struct.pack('>H', version) - record = record.ljust(12, '\x00') - # Non-text offset, everything between record 0 and non_text_offset is text pages - record += struct.pack('>H', non_text_offset) - record = record.ljust(28, '\x00') - # Footnote and Sidebar rec - record += struct.pack('>H', 0) - record += struct.pack('>H', 0) - record += struct.pack('>H', last_data_offset) - record = record.ljust(40, '\x00') - # image pages - record += struct.pack('>H', image_data_offset) - record = record.ljust(44, '\x00') - # metadata string - record += struct.pack('>H', meta_data_offset) - record = record.ljust(48, '\x00') - # footnote and sidebar offsets - record 
+= struct.pack('>H', last_data_offset) - record += struct.pack('>H', last_data_offset) - record = record.ljust(52, '\x00') - record += struct.pack('>H', last_data_offset) - ''' return record diff --git a/src/calibre/ebooks/pdb/formatwriter.py b/src/calibre/ebooks/pdb/formatwriter.py index 18b5f56219..0e124bd861 100644 --- a/src/calibre/ebooks/pdb/formatwriter.py +++ b/src/calibre/ebooks/pdb/formatwriter.py @@ -14,5 +14,5 @@ class FormatWriter(object): def __init__(self, opts, log): raise NotImplementedError() - def write_content(self, oeb_book, output_stream, ): + def write_content(self, oeb_book, output_stream, metadata=None): raise NotImplementedError() diff --git a/src/calibre/ebooks/pdb/output.py b/src/calibre/ebooks/pdb/output.py index edf047442a..29de9bd99c 100644 --- a/src/calibre/ebooks/pdb/output.py +++ b/src/calibre/ebooks/pdb/output.py @@ -44,7 +44,7 @@ class PDBOutput(OutputFormatPlugin): out_stream.seek(0) out_stream.truncate() - writer.write_content(oeb_book, out_stream) + writer.write_content(oeb_book, out_stream, oeb_book.metadata) if close: out_stream.close() diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index 835b2c6cb3..e841e69054 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -23,7 +23,7 @@ class Writer(FormatWriter): self.opts = opts self.log = log - def write_content(self, oeb_book, out_stream): + def write_content(self, oeb_book, out_stream, metadata=None): title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') txt_records, txt_length = self._generate_text(oeb_book.spine) diff --git a/src/calibre/ebooks/pdb/ztxt/writer.py b/src/calibre/ebooks/pdb/ztxt/writer.py index fd7a07d7f9..5d8aa97a69 100644 --- a/src/calibre/ebooks/pdb/ztxt/writer.py +++ b/src/calibre/ebooks/pdb/ztxt/writer.py @@ -22,7 +22,7 @@ class Writer(FormatWriter): self.opts = opts self.log = log - 
def write_content(self, oeb_book, out_stream): + def write_content(self, oeb_book, out_stream, metadata=None): title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') txt_records, txt_length = self._generate_text(oeb_book.spine) diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py index a3dbc98568..d755890ca8 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/pml/input.py @@ -41,7 +41,7 @@ class PMLInput(InputFormatPlugin): ienc = self.options.input_encoding html = pml_to_html(pml_stream.read().decode(ienc)) - html_stream.write('</head><body>' + html + '</body></html>') + html_stream.write('<html><head><title /></head><body>' + html.encode('utf-8') + '</body></html>') if pclose: pml_stream.close() @@ -63,10 +63,12 @@ class PMLInput(InputFormatPlugin): html_name = os.path.splitext(os.path.basename(pml))[0]+'.html' html_path = os.path.join(os.getcwd(), html_name) - pages.append(html_name) + pages.append(html_name) self.process_pml(pml, html_path) imgs = glob.glob(os.path.join(tdir, '*.png')) + if len(imgs) > 0: + os.makedirs(os.path.join(os.getcwd(), 'images')) for img in imgs: pimg_name = os.path.basename(img) pimg_path = os.path.join(os.getcwd(), 'images', pimg_name) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 76769e35d7..dded21c38c 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en' import re +from calibre import entity_to_unicode from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.htmlsymbols import HTML_SYMBOLS @@ -86,10 +87,9 @@ HTML_PML_RULES = [ (re.compile('<a.*?href="#footnote-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))), (re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: 
'\\\\Q="%s"' % match.group('target')), (re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), - #(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), - (re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name').strip('\x00'))), - (re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % hex(int(match.group('num')))[2:].rjust(4, '0')), - (re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')), + (re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name')).strip('\x00')), + (re.compile('&(?P<num>#\d+);'), lambda match: entity_to_unicode(match)), + (re.compile('&(?P<num>.+);'), lambda match: entity_to_unicode(match)), (re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), (re.compile('<small>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), (re.compile('<sub .*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), @@ -161,14 +161,5 @@ def html_to_pml(html): body = rule[0].sub(rule[1], body) pml += body - - # Replace symbols outside of cp1512 wtih \Uxxxx - chars = set(pml) - unichars = [] - for c in chars: - if ord(c) > 128: - unichars.append(c) - for u in unichars: - pml = pml.replace(u, '\U%s' % hex(ord(u))[2:].rjust(4, '0')) return pml