mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
eReader output. HTML_TO_PML cleanup.
This commit is contained in:
parent
49c7999064
commit
4c49aa5c60
@ -22,10 +22,12 @@ FORMAT_READERS = {
|
|||||||
|
|
||||||
from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer
|
from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer
|
||||||
from calibre.ebooks.pdb.ztxt.writer import Writer as ztxt_writer
|
from calibre.ebooks.pdb.ztxt.writer import Writer as ztxt_writer
|
||||||
|
from calibre.ebooks.pdb.ereader.writer import Writer as ereader_writer
|
||||||
|
|
||||||
FORMAT_WRITERS = {
|
FORMAT_WRITERS = {
|
||||||
'doc' : palmdoc_writer,
|
'doc' : palmdoc_writer,
|
||||||
'ztxt' : ztxt_writer,
|
'ztxt' : ztxt_writer,
|
||||||
|
'ereader' : ereader_writer,
|
||||||
}
|
}
|
||||||
|
|
||||||
IDENTITY_TO_NAME = {
|
IDENTITY_TO_NAME = {
|
||||||
|
@ -126,7 +126,7 @@ class Reader(FormatReader):
|
|||||||
with CurrentDir(output_dir):
|
with CurrentDir(output_dir):
|
||||||
with open('index.html', 'wb') as index:
|
with open('index.html', 'wb') as index:
|
||||||
self.log.debug('Writing text to index.html')
|
self.log.debug('Writing text to index.html')
|
||||||
index.write(html)
|
index.write(html.encode('utf-8'))
|
||||||
|
|
||||||
if not os.path.exists(os.path.join(output_dir, 'images/')):
|
if not os.path.exists(os.path.join(output_dir, 'images/')):
|
||||||
os.makedirs(os.path.join(output_dir, 'images/'))
|
os.makedirs(os.path.join(output_dir, 'images/'))
|
||||||
|
@ -12,19 +12,29 @@ import struct, zlib
|
|||||||
|
|
||||||
import Image, cStringIO
|
import Image, cStringIO
|
||||||
|
|
||||||
|
from calibre.ebooks.pdb.formatwriter import FormatWriter
|
||||||
from calibre.ebooks.oeb.base import OEB_IMAGES
|
from calibre.ebooks.oeb.base import OEB_IMAGES
|
||||||
from calibre.ebooks.pdb.header import PdbHeaderBuilder
|
from calibre.ebooks.pdb.header import PdbHeaderBuilder
|
||||||
from calibre.ebooks.pdb.ereader import image_name
|
from calibre.ebooks.pdb.ereader import image_name
|
||||||
from calibre.ebooks.pml.pmlconverter import html_to_pml
|
from calibre.ebooks.pml.pmlconverter import html_to_pml
|
||||||
|
|
||||||
|
# We are using the older identity because we do not use newer features
|
||||||
|
# (sidebar, footnotes). This will ensure compatibility with older readers.
|
||||||
|
# If newer features are used (anything supported by dropbook but not by makebook),
|
||||||
|
# change the identity to the newer PNRdPPrs.
|
||||||
IDENTITY = 'PNPdPPrs'
|
IDENTITY = 'PNPdPPrs'
|
||||||
|
|
||||||
class Writer(object):
|
# This is an arbitrary number that is small enough to work. The actual maximum
|
||||||
|
# record size is unknown.
|
||||||
|
MAX_RECORD_SIZE = 3560
|
||||||
|
|
||||||
def __init__(self, log):
|
class Writer(FormatWriter):
|
||||||
|
|
||||||
|
def __init__(self, opts, log):
|
||||||
|
self.opts = opts
|
||||||
self.log = log
|
self.log = log
|
||||||
|
|
||||||
def dump(self, oeb_book, out_stream, metadata=None):
|
def write_content(self, oeb_book, out_stream, metadata=None):
|
||||||
text = self._text(oeb_book.spine)
|
text = self._text(oeb_book.spine)
|
||||||
images = self._images(oeb_book.manifest)
|
images = self._images(oeb_book.manifest)
|
||||||
metadata = [self._metadata(metadata)]
|
metadata = [self._metadata(metadata)]
|
||||||
@ -42,26 +52,28 @@ class Writer(object):
|
|||||||
out_stream.write(item)
|
out_stream.write(item)
|
||||||
|
|
||||||
def _text(self, pages):
|
def _text(self, pages):
|
||||||
# Todo: Split pages over 65505 Bytes
|
pml = ''
|
||||||
pml_pages = []
|
|
||||||
|
|
||||||
for page in pages:
|
for page in pages:
|
||||||
pml_pages.append(zlib.compress(html_to_pml(unicode(page)).encode('utf-8')))
|
pml += html_to_pml(unicode(page)).encode('cp1252')
|
||||||
|
|
||||||
|
pml_pages = []
|
||||||
|
for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
|
||||||
|
pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]))
|
||||||
|
|
||||||
return pml_pages
|
return pml_pages
|
||||||
|
|
||||||
def _images(self, manifest):
|
def _images(self, manifest):
|
||||||
# Todo: resize images over 65505 Bytes
|
|
||||||
images = []
|
images = []
|
||||||
|
|
||||||
for item in manifest:
|
for item in manifest:
|
||||||
if item.media_type in OEB_IMAGES:
|
if item.media_type in OEB_IMAGES:
|
||||||
image = '\x00\x00\x00\x00'
|
image = 'PNG '
|
||||||
|
|
||||||
image += image_name(item.href)
|
image += image_name(item.href)
|
||||||
image = image.ljust(62, '\x00')
|
image = image.ljust(62, '\x00')
|
||||||
|
|
||||||
im = Image.open(cStringIO.StringIO(item.data))
|
im = Image.open(cStringIO.StringIO(item.data)).convert('P')
|
||||||
|
im.thumbnail((300,300), Image.ANTIALIAS)
|
||||||
|
|
||||||
data = cStringIO.StringIO()
|
data = cStringIO.StringIO()
|
||||||
im.save(data, 'PNG')
|
im.save(data, 'PNG')
|
||||||
@ -83,22 +95,38 @@ class Writer(object):
|
|||||||
publisher\x00
|
publisher\x00
|
||||||
isbn\x00
|
isbn\x00
|
||||||
'''
|
'''
|
||||||
return '\x00\x00\x00\x00\x00'
|
|
||||||
|
title = _('Unknown')
|
||||||
|
author = _('Unknown')
|
||||||
|
copyright = ''
|
||||||
|
publisher = ''
|
||||||
|
isbn = ''
|
||||||
|
|
||||||
|
if metadata != None:
|
||||||
|
if len(metadata.title) >= 1:
|
||||||
|
title = metadata.title[0].value
|
||||||
|
if len(metadata.creator) >= 1:
|
||||||
|
from calibre.ebooks.metadata import authors_to_string
|
||||||
|
author = authors_to_string([x.value for x in metadata.creator])
|
||||||
|
if len(metadata.rights) >= 1:
|
||||||
|
copyright = metadata.rights[0].value
|
||||||
|
if len(metadata.publisher) >= 1:
|
||||||
|
publisher = metadata.publisher[0].value
|
||||||
|
|
||||||
|
return '%s\x00%s\x00%s\x00%s\x00%s\x00' % (title, author, copyright, publisher, isbn)
|
||||||
|
|
||||||
def _header_record(self, text_items, image_items):
|
def _header_record(self, text_items, image_items):
|
||||||
# Todo: Find out more about header and add correct values to the file
|
|
||||||
# can be read by eReader reader software.
|
|
||||||
'''
|
'''
|
||||||
text_items = the number of text pages
|
text_items = the number of text pages
|
||||||
image_items = the number of images
|
image_items = the number of images
|
||||||
'''
|
'''
|
||||||
version = 10
|
version = 10 # Zlib compression
|
||||||
non_text_offset = text_items + 1
|
non_text_offset = text_items + 1
|
||||||
|
|
||||||
if image_items > 0:
|
if image_items > 0:
|
||||||
image_data_offset = text_items + 1
|
image_data_offset = text_items + 1
|
||||||
meta_data_offset = image_data_offset + image_items
|
meta_data_offset = image_data_offset + image_items
|
||||||
last_data_offset = meta_data_offset + 2
|
last_data_offset = meta_data_offset + 1
|
||||||
else:
|
else:
|
||||||
meta_data_offset = text_items + 1
|
meta_data_offset = text_items + 1
|
||||||
last_data_offset = meta_data_offset + 1
|
last_data_offset = meta_data_offset + 1
|
||||||
@ -106,61 +134,36 @@ class Writer(object):
|
|||||||
|
|
||||||
record = ''
|
record = ''
|
||||||
|
|
||||||
record += struct.pack('>H', version) # [0:2]
|
record += struct.pack('>H', version) # [0:2] # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
|
||||||
record += struct.pack('>H', 0) # [2:4]
|
record += struct.pack('>H', 0) # [2:4]
|
||||||
record += struct.pack('>H', 0) # [4:6]
|
record += struct.pack('>H', 0) # [4:6]
|
||||||
record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC
|
record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC. Somehow represents the cp1252 encoding of the text
|
||||||
record += struct.pack('>H', last_data_offset) # [8:10]
|
record += struct.pack('>H', 0) # [8:10]
|
||||||
record += struct.pack('>H', last_data_offset) # [10:12]
|
record += struct.pack('>H', 0) # [10:12]
|
||||||
record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset
|
record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset
|
||||||
record += struct.pack('>H', non_text_offset) # [14:16]
|
record += struct.pack('>H', 0) # [14:16]
|
||||||
record += struct.pack('>H', 1) # [16:18]
|
record += struct.pack('>H', 0) # [16:18]
|
||||||
record += struct.pack('>H', 1) # [18:20]
|
record += struct.pack('>H', 0) # [18:20]
|
||||||
record += struct.pack('>H', 0) # [20:22]
|
record += struct.pack('>H', image_items) # [20:22] # Number of images
|
||||||
record += struct.pack('>H', 1) # [22:24]
|
record += struct.pack('>H', 0) # [22:24]
|
||||||
record += struct.pack('>H', 1) # [24:26]
|
record += struct.pack('>H', 1) # [24:26] # 1 if has metadata, 0 if not
|
||||||
record += struct.pack('>H', 0) # [26:28]
|
record += struct.pack('>H', 0) # [26:28]
|
||||||
record += struct.pack('>H', 0) # [28:30] # footnote_rec
|
record += struct.pack('>H', 0) # [28:30] # footnote_rec
|
||||||
record += struct.pack('>H', 0) # [30:32] # sidebar_rec
|
record += struct.pack('>H', 0) # [30:32] # sidebar_rec
|
||||||
record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset
|
record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset
|
||||||
record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC
|
record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC
|
||||||
record += struct.pack('>H', non_text_offset) # [36:38]
|
record += struct.pack('>H', 0) # [36:38]
|
||||||
record += struct.pack('>H', non_text_offset + 1) # [38:40]
|
record += struct.pack('>H', 0) # [38:40]
|
||||||
record += struct.pack('>H', image_data_offset) # [40:42]
|
record += struct.pack('>H', image_data_offset) # [40:42] # image_data_offset. This will be the last data offset if there are no images
|
||||||
record += struct.pack('>H', image_data_offset) # [42:44]
|
record += struct.pack('>H', 0) # [42:44]
|
||||||
record += struct.pack('>H', meta_data_offset) # [44:46]
|
record += struct.pack('>H', meta_data_offset) # [44:46] # meta_data_offset. This will be the last data offset if there are no images
|
||||||
record += struct.pack('>H', meta_data_offset) # [46:48]
|
record += struct.pack('>H', 0) # [46:48]
|
||||||
record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset
|
record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset. This will be the last data offset if there are no images
|
||||||
record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset
|
record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset. This will be the last data offset if there are no images
|
||||||
record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset
|
record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset
|
||||||
|
|
||||||
record += struct.pack('>H', 1) # [54:56]
|
for i in range(54, 132, 2):
|
||||||
for i in range(56, 132, 2):
|
record += struct.pack('>H', 0) # [54:132]
|
||||||
record += struct.pack('>H', 0)
|
|
||||||
|
|
||||||
'''
|
|
||||||
# Version
|
|
||||||
record += struct.pack('>H', version)
|
|
||||||
record = record.ljust(12, '\x00')
|
|
||||||
# Non-text offset, everything between record 0 and non_text_offset is text pages
|
|
||||||
record += struct.pack('>H', non_text_offset)
|
|
||||||
record = record.ljust(28, '\x00')
|
|
||||||
# Footnote and Sidebar rec
|
|
||||||
record += struct.pack('>H', 0)
|
|
||||||
record += struct.pack('>H', 0)
|
|
||||||
record += struct.pack('>H', last_data_offset)
|
|
||||||
record = record.ljust(40, '\x00')
|
|
||||||
# image pages
|
|
||||||
record += struct.pack('>H', image_data_offset)
|
|
||||||
record = record.ljust(44, '\x00')
|
|
||||||
# metadata string
|
|
||||||
record += struct.pack('>H', meta_data_offset)
|
|
||||||
record = record.ljust(48, '\x00')
|
|
||||||
# footnote and sidebar offsets
|
|
||||||
record += struct.pack('>H', last_data_offset)
|
|
||||||
record += struct.pack('>H', last_data_offset)
|
|
||||||
record = record.ljust(52, '\x00')
|
|
||||||
record += struct.pack('>H', last_data_offset)
|
|
||||||
'''
|
|
||||||
return record
|
return record
|
||||||
|
|
||||||
|
@ -14,5 +14,5 @@ class FormatWriter(object):
|
|||||||
def __init__(self, opts, log):
|
def __init__(self, opts, log):
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
def write_content(self, oeb_book, output_stream, ):
|
def write_content(self, oeb_book, output_stream, metadata=None):
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
@ -44,7 +44,7 @@ class PDBOutput(OutputFormatPlugin):
|
|||||||
out_stream.seek(0)
|
out_stream.seek(0)
|
||||||
out_stream.truncate()
|
out_stream.truncate()
|
||||||
|
|
||||||
writer.write_content(oeb_book, out_stream)
|
writer.write_content(oeb_book, out_stream, oeb_book.metadata)
|
||||||
|
|
||||||
if close:
|
if close:
|
||||||
out_stream.close()
|
out_stream.close()
|
||||||
|
@ -23,7 +23,7 @@ class Writer(FormatWriter):
|
|||||||
self.opts = opts
|
self.opts = opts
|
||||||
self.log = log
|
self.log = log
|
||||||
|
|
||||||
def write_content(self, oeb_book, out_stream):
|
def write_content(self, oeb_book, out_stream, metadata=None):
|
||||||
title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
|
title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
|
||||||
|
|
||||||
txt_records, txt_length = self._generate_text(oeb_book.spine)
|
txt_records, txt_length = self._generate_text(oeb_book.spine)
|
||||||
|
@ -22,7 +22,7 @@ class Writer(FormatWriter):
|
|||||||
self.opts = opts
|
self.opts = opts
|
||||||
self.log = log
|
self.log = log
|
||||||
|
|
||||||
def write_content(self, oeb_book, out_stream):
|
def write_content(self, oeb_book, out_stream, metadata=None):
|
||||||
title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
|
title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
|
||||||
|
|
||||||
txt_records, txt_length = self._generate_text(oeb_book.spine)
|
txt_records, txt_length = self._generate_text(oeb_book.spine)
|
||||||
|
@ -41,7 +41,7 @@ class PMLInput(InputFormatPlugin):
|
|||||||
ienc = self.options.input_encoding
|
ienc = self.options.input_encoding
|
||||||
|
|
||||||
html = pml_to_html(pml_stream.read().decode(ienc))
|
html = pml_to_html(pml_stream.read().decode(ienc))
|
||||||
html_stream.write('<html><head><title /></head><body>' + html + '</body></html>')
|
html_stream.write('<html><head><title /></head><body>' + html.encode('utf-8') + '</body></html>')
|
||||||
|
|
||||||
if pclose:
|
if pclose:
|
||||||
pml_stream.close()
|
pml_stream.close()
|
||||||
@ -67,6 +67,8 @@ class PMLInput(InputFormatPlugin):
|
|||||||
self.process_pml(pml, html_path)
|
self.process_pml(pml, html_path)
|
||||||
|
|
||||||
imgs = glob.glob(os.path.join(tdir, '*.png'))
|
imgs = glob.glob(os.path.join(tdir, '*.png'))
|
||||||
|
if len(imgs) > 0:
|
||||||
|
os.makedirs(os.path.join(os.getcwd(), 'images'))
|
||||||
for img in imgs:
|
for img in imgs:
|
||||||
pimg_name = os.path.basename(img)
|
pimg_name = os.path.basename(img)
|
||||||
pimg_path = os.path.join(os.getcwd(), 'images', pimg_name)
|
pimg_path = os.path.join(os.getcwd(), 'images', pimg_name)
|
||||||
|
@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from calibre import entity_to_unicode
|
||||||
from calibre.ebooks.pdb.ereader import image_name
|
from calibre.ebooks.pdb.ereader import image_name
|
||||||
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
|
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
|
||||||
|
|
||||||
@ -86,10 +87,9 @@ HTML_PML_RULES = [
|
|||||||
(re.compile('<a.*?href="#footnote-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))),
|
(re.compile('<a.*?href="#footnote-(?P<target>.+?).*?">(?P<text>.+?)</a>'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))),
|
||||||
(re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
|
(re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
|
||||||
(re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
|
(re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
|
||||||
#(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
|
(re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name')).strip('\x00')),
|
||||||
(re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name').strip('\x00'))),
|
(re.compile('&(?P<num>#\d+);'), lambda match: entity_to_unicode(match)),
|
||||||
(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % hex(int(match.group('num')))[2:].rjust(4, '0')),
|
(re.compile('&(?P<num>.+);'), lambda match: entity_to_unicode(match)),
|
||||||
(re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
|
|
||||||
(re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
|
(re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
|
||||||
(re.compile('<small>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
|
(re.compile('<small>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
|
||||||
(re.compile('<sub .*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
|
(re.compile('<sub .*?>(?P<text>.+?)</sub>', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')),
|
||||||
@ -162,13 +162,4 @@ def html_to_pml(html):
|
|||||||
|
|
||||||
pml += body
|
pml += body
|
||||||
|
|
||||||
# Replace symbols outside of cp1252 with \Uxxxx
|
|
||||||
chars = set(pml)
|
|
||||||
unichars = []
|
|
||||||
for c in chars:
|
|
||||||
if ord(c) > 128:
|
|
||||||
unichars.append(c)
|
|
||||||
for u in unichars:
|
|
||||||
pml = pml.replace(u, '\U%s' % hex(ord(u))[2:].rjust(4, '0'))
|
|
||||||
|
|
||||||
return pml
|
return pml
|
||||||
|
Loading…
x
Reference in New Issue
Block a user