eReader output. HTML_TO_PML cleanup.

This commit is contained in:
John Schember 2009-05-10 12:23:26 -04:00
parent 49c7999064
commit 4c49aa5c60
9 changed files with 86 additions and 88 deletions

View File

@ -22,10 +22,12 @@ FORMAT_READERS = {
from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer
from calibre.ebooks.pdb.ztxt.writer import Writer as ztxt_writer
from calibre.ebooks.pdb.ereader.writer import Writer as ereader_writer
FORMAT_WRITERS = {
'doc' : palmdoc_writer,
'ztxt' : ztxt_writer,
'doc' : palmdoc_writer,
'ztxt' : ztxt_writer,
'ereader' : ereader_writer,
}
IDENTITY_TO_NAME = {

View File

@ -126,7 +126,7 @@ class Reader(FormatReader):
with CurrentDir(output_dir):
with open('index.html', 'wb') as index:
self.log.debug('Writing text to index.html')
index.write(html)
index.write(html.encode('utf-8'))
if not os.path.exists(os.path.join(output_dir, 'images/')):
os.makedirs(os.path.join(output_dir, 'images/'))

View File

@ -12,19 +12,29 @@ import struct, zlib
import Image, cStringIO
from calibre.ebooks.pdb.formatwriter import FormatWriter
from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.ebooks.pdb.header import PdbHeaderBuilder
from calibre.ebooks.pdb.ereader import image_name
from calibre.ebooks.pml.pmlconverter import html_to_pml
# We are using the older identity because we do not use newer features
# (sidebar, footnotes). This will ensure compatibility with older readers.
# If newer features are used (anything supported by dropbook but not by makebook),
# change the identity to the newer PNRdPPrs.
IDENTITY = 'PNPdPPrs'
class Writer(object):
# This is an arbitrary number that is small enough to work. The actual maximum
# record size is unknown.
MAX_RECORD_SIZE = 3560
def __init__(self, log):
class Writer(FormatWriter):
def __init__(self, opts, log):
self.opts = opts
self.log = log
def dump(self, oeb_book, out_stream, metadata=None):
def write_content(self, oeb_book, out_stream, metadata=None):
text = self._text(oeb_book.spine)
images = self._images(oeb_book.manifest)
metadata = [self._metadata(metadata)]
@ -42,26 +52,28 @@ class Writer(object):
out_stream.write(item)
def _text(self, pages):
# Todo: Split pages over 65505 Bytes
pml_pages = []
pml = ''
for page in pages:
pml_pages.append(zlib.compress(html_to_pml(unicode(page)).encode('utf-8')))
pml += html_to_pml(unicode(page)).encode('cp1252')
pml_pages = []
for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]))
return pml_pages
def _images(self, manifest):
# Todo: resize images over 65505 Bytes
images = []
for item in manifest:
if item.media_type in OEB_IMAGES:
image = '\x00\x00\x00\x00'
image = 'PNG '
image += image_name(item.href)
image = image.ljust(62, '\x00')
im = Image.open(cStringIO.StringIO(item.data))
im = Image.open(cStringIO.StringIO(item.data)).convert('P')
im.thumbnail((300,300), Image.ANTIALIAS)
data = cStringIO.StringIO()
im.save(data, 'PNG')
@ -83,22 +95,38 @@ class Writer(object):
publisher\x00
isbn\x00
'''
return '\x00\x00\x00\x00\x00'
title = _('Unknown')
author = _('Unknown')
copyright = ''
publisher = ''
isbn = ''
if metadata != None:
if len(metadata.title) >= 1:
title = metadata.title[0].value
if len(metadata.creator) >= 1:
from calibre.ebooks.metadata import authors_to_string
author = authors_to_string([x.value for x in metadata.creator])
if len(metadata.rights) >= 1:
copyright = metadata.rights[0].value
if len(metadata.publisher) >= 1:
publisher = metadata.publisher[0].value
return '%s\x00%s\x00%s\x00%s\x00%s\x00' % (title, author, copyright, publisher, isbn)
def _header_record(self, text_items, image_items):
# Todo: Find out more about the header and add correct values so the file
# can be read by the eReader reader software.
'''
text_items = the number of text pages
image_items = the number of images
'''
version = 10
version = 10 # Zlib compression
non_text_offset = text_items + 1
if image_items > 0:
image_data_offset = text_items + 1
meta_data_offset = image_data_offset + image_items
last_data_offset = meta_data_offset + 2
last_data_offset = meta_data_offset + 1
else:
meta_data_offset = text_items + 1
last_data_offset = meta_data_offset + 1
@ -106,61 +134,36 @@ class Writer(object):
record = ''
record += struct.pack('>H', version) # [0:2]
record += struct.pack('>H', version) # [0:2] # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
record += struct.pack('>H', 0) # [2:4]
record += struct.pack('>H', 0) # [4:6]
record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC
record += struct.pack('>H', last_data_offset) # [8:10]
record += struct.pack('>H', last_data_offset) # [10:12]
record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset
record += struct.pack('>H', non_text_offset) # [14:16]
record += struct.pack('>H', 1) # [16:18]
record += struct.pack('>H', 1) # [18:20]
record += struct.pack('>H', 0) # [20:22]
record += struct.pack('>H', 1) # [22:24]
record += struct.pack('>H', 1) # [24:26]
record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC. Somehow represents the cp1252 encoding of the text
record += struct.pack('>H', 0) # [8:10]
record += struct.pack('>H', 0) # [10:12]
record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset
record += struct.pack('>H', 0) # [14:16]
record += struct.pack('>H', 0) # [16:18]
record += struct.pack('>H', 0) # [18:20]
record += struct.pack('>H', image_items) # [20:22] # Number of images
record += struct.pack('>H', 0) # [22:24]
record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not
record += struct.pack('>H', 0) # [26:28]
record += struct.pack('>H', 0) # [28:30] # footnote_rec
record += struct.pack('>H', 0) # [30:32] # sidebar_rec
record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset
record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC
record += struct.pack('>H', non_text_offset) # [36:38]
record += struct.pack('>H', non_text_offset + 1) # [38:40]
record += struct.pack('>H', image_data_offset) # [40:42]
record += struct.pack('>H', image_data_offset) # [42:44]
record += struct.pack('>H', meta_data_offset) # [44:46]
record += struct.pack('>H', meta_data_offset) # [46:48]
record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset
record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset
record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset
record += struct.pack('>H', 0) # [28:30] # footnote_rec
record += struct.pack('>H', 0) # [30:32] # sidebar_rec
record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset
record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC
record += struct.pack('>H', 0) # [36:38]
record += struct.pack('>H', 0) # [38:40]
record += struct.pack('>H', image_data_offset) # [40:42] # image_data_offset. This will be the last data offset if there are no images
record += struct.pack('>H', 0) # [42:44]
record += struct.pack('>H', meta_data_offset) # [44:46] # meta_data_offset. This will be the last data offset if there are no images
record += struct.pack('>H', 0) # [46:48]
record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset. This will be the last data offset if there are no images
record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset. This will be the last data offset if there are no images
record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset
record += struct.pack('>H', 1) # [54:56]
for i in range(56, 132, 2):
record += struct.pack('>H', 0)
for i in range(54, 132, 2):
record += struct.pack('>H', 0) # [54:132]
'''
# Version
record += struct.pack('>H', version)
record = record.ljust(12, '\x00')
# Non-text offset, everything between record 0 and non_text_offset is text pages
record += struct.pack('>H', non_text_offset)
record = record.ljust(28, '\x00')
# Footnote and Sidebar rec
record += struct.pack('>H', 0)
record += struct.pack('>H', 0)
record += struct.pack('>H', last_data_offset)
record = record.ljust(40, '\x00')
# image pages
record += struct.pack('>H', image_data_offset)
record = record.ljust(44, '\x00')
# metadata string
record += struct.pack('>H', meta_data_offset)
record = record.ljust(48, '\x00')
# footnote and sidebar offsets
record += struct.pack('>H', last_data_offset)
record += struct.pack('>H', last_data_offset)
record = record.ljust(52, '\x00')
record += struct.pack('>H', last_data_offset)
'''
return record

View File

@ -14,5 +14,5 @@ class FormatWriter(object):
def __init__(self, opts, log):
raise NotImplementedError()
def write_content(self, oeb_book, output_stream, ):
def write_content(self, oeb_book, output_stream, metadata=None):
raise NotImplementedError()

View File

@ -44,7 +44,7 @@ class PDBOutput(OutputFormatPlugin):
out_stream.seek(0)
out_stream.truncate()
writer.write_content(oeb_book, out_stream)
writer.write_content(oeb_book, out_stream, oeb_book.metadata)
if close:
out_stream.close()

View File

@ -23,7 +23,7 @@ class Writer(FormatWriter):
self.opts = opts
self.log = log
def write_content(self, oeb_book, out_stream):
def write_content(self, oeb_book, out_stream, metadata=None):
title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
txt_records, txt_length = self._generate_text(oeb_book.spine)

View File

@ -22,7 +22,7 @@ class Writer(FormatWriter):
self.opts = opts
self.log = log
def write_content(self, oeb_book, out_stream):
def write_content(self, oeb_book, out_stream, metadata=None):
title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
txt_records, txt_length = self._generate_text(oeb_book.spine)

View File

@ -41,7 +41,7 @@ class PMLInput(InputFormatPlugin):
ienc = self.options.input_encoding
html = pml_to_html(pml_stream.read().decode(ienc))
html_stream.write('<html><head><title /></head><body>' + html + '</body></html>')
html_stream.write('<html><head><title /></head><body>' + html.encode('utf-8') + '</body></html>')
if pclose:
pml_stream.close()
@ -67,6 +67,8 @@ class PMLInput(InputFormatPlugin):
self.process_pml(pml, html_path)
imgs = glob.glob(os.path.join(tdir, '*.png'))
if len(imgs) > 0:
os.makedirs(os.path.join(os.getcwd(), 'images'))
for img in imgs:
pimg_name = os.path.basename(img)
pimg_path = os.path.join(os.getcwd(), 'images', pimg_name)

View File

@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
import re
from calibre import entity_to_unicode
from calibre.ebooks.pdb.ereader import image_name
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
@ -86,10 +87,9 @@ HTML_PML_RULES = [
(re.compile('<a.*?href="#footnote-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))),
(re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
(re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
#(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
(re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name').strip('\x00'))),
(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % hex(int(match.group('num')))[2:].rjust(4, '0')),
(re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
(re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name')).strip('\x00')),
(re.compile('&(?P<num>#\d+);'), lambda match: entity_to_unicode(match)),
(re.compile('&(?P<num>.+);'), lambda match: entity_to_unicode(match)),
(re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
(re.compile('<small>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
(re.compile('<sub .*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
@ -162,13 +162,4 @@ def html_to_pml(html):
pml += body
# Replace symbols outside of cp1252 with \Uxxxx
chars = set(pml)
unichars = []
for c in chars:
if ord(c) > 128:
unichars.append(c)
for u in unichars:
pml = pml.replace(u, '\U%s' % hex(ord(u))[2:].rjust(4, '0'))
return pml