mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
eReader input support Makebook produced books (202 byte header documents).
This commit is contained in:
parent
24ca1a1134
commit
e4ee664bb3
@ -8,11 +8,13 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
import struct
|
||||||
|
|
||||||
from calibre.ebooks.metadata import MetaInformation, authors_to_string
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
from calibre.ebooks.pdb.header import PdbHeaderReader, PdbHeaderBuilder
|
from calibre.ebooks.metadata import authors_to_string
|
||||||
from calibre.ebooks.pdb.ereader.reader import HeaderRecord
|
from calibre.ebooks.pdb.ereader.reader132 import HeaderRecord
|
||||||
|
from calibre.ebooks.pdb.header import PdbHeaderBuilder
|
||||||
|
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||||
|
|
||||||
def get_metadata(stream, extract_cover=True):
|
def get_metadata(stream, extract_cover=True):
|
||||||
"""
|
"""
|
||||||
@ -20,14 +22,14 @@ def get_metadata(stream, extract_cover=True):
|
|||||||
"""
|
"""
|
||||||
mi = MetaInformation(None, [_('Unknown')])
|
mi = MetaInformation(None, [_('Unknown')])
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
|
|
||||||
pheader = PdbHeaderReader(stream)
|
pheader = PdbHeaderReader(stream)
|
||||||
hr = HeaderRecord(pheader.section_data(0))
|
hr = HeaderRecord(pheader.section_data(0))
|
||||||
|
|
||||||
if hr.version in (2, 10) and hr.has_metadata == 1:
|
if hr.version in (2, 10) and hr.has_metadata == 1:
|
||||||
try:
|
try:
|
||||||
mdata = pheader.section_data(hr.metadata_offset)
|
mdata = pheader.section_data(hr.metadata_offset)
|
||||||
|
|
||||||
mdata = mdata.split('\x00')
|
mdata = mdata.split('\x00')
|
||||||
mi.title = mdata[0]
|
mi.title = mdata[0]
|
||||||
mi.authors = [mdata[1]]
|
mi.authors = [mdata[1]]
|
||||||
@ -35,7 +37,7 @@ def get_metadata(stream, extract_cover=True):
|
|||||||
mi.isbn = mdata[4]
|
mi.isbn = mdata[4]
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if not mi.title:
|
if not mi.title:
|
||||||
mi.title = pheader.title if pheader.title else _('Unknown')
|
mi.title = pheader.title if pheader.title else _('Unknown')
|
||||||
|
|
||||||
@ -43,26 +45,31 @@ def get_metadata(stream, extract_cover=True):
|
|||||||
|
|
||||||
def set_metadata(stream, mi):
|
def set_metadata(stream, mi):
|
||||||
pheader = PdbHeaderReader(stream)
|
pheader = PdbHeaderReader(stream)
|
||||||
|
|
||||||
|
# Only Dropbook produced 132 byte record0 files are supported
|
||||||
|
if pheader.section_data(0) != 132:
|
||||||
|
return
|
||||||
|
|
||||||
sections = [pheader.section_data(x) for x in range(0, pheader.section_count())]
|
sections = [pheader.section_data(x) for x in range(0, pheader.section_count())]
|
||||||
hr = HeaderRecord(sections[0])
|
hr = HeaderRecord(sections[0])
|
||||||
|
|
||||||
if hr.version not in (2, 10):
|
if hr.version not in (2, 10):
|
||||||
return
|
return
|
||||||
|
|
||||||
# Create a metadata record for the file if one does not already exist
|
# Create a metadata record for the file if one does not already exist
|
||||||
if not hr.has_metadata:
|
if not hr.has_metadata:
|
||||||
sections += ['', 'MeTaInFo\x00']
|
sections += ['', 'MeTaInFo\x00']
|
||||||
last_data = len(sections) - 1
|
last_data = len(sections) - 1
|
||||||
|
|
||||||
for i in range(0, 132, 2):
|
for i in range(0, 132, 2):
|
||||||
val, = struct.unpack('>H', sections[0][i:i+2])
|
val, = struct.unpack('>H', sections[0][i:i + 2])
|
||||||
if val >= hr.last_data_offset:
|
if val >= hr.last_data_offset:
|
||||||
sections[0][i:i+2] = struct.pack('>H', last_data)
|
sections[0][i:i + 2] = struct.pack('>H', last_data)
|
||||||
|
|
||||||
sections[0][24:26] = struct.pack('>H', 1) # Set has metadata
|
sections[0][24:26] = struct.pack('>H', 1) # Set has metadata
|
||||||
sections[0][44:46] = struct.pack('>H', last_data - 1) # Set location of metadata
|
sections[0][44:46] = struct.pack('>H', last_data - 1) # Set location of metadata
|
||||||
sections[0][52:54] = struct.pack('>H', last_data) # Ensure last data offset is updated
|
sections[0][52:54] = struct.pack('>H', last_data) # Ensure last data offset is updated
|
||||||
|
|
||||||
# Merge the metadata into the file
|
# Merge the metadata into the file
|
||||||
file_mi = get_metadata(stream, False)
|
file_mi = get_metadata(stream, False)
|
||||||
file_mi.smart_update(mi)
|
file_mi.smart_update(mi)
|
||||||
@ -79,4 +86,3 @@ def set_metadata(stream, mi):
|
|||||||
# Write the data back to the file
|
# Write the data back to the file
|
||||||
for item in sections:
|
for item in sections:
|
||||||
stream.write(item)
|
stream.write(item)
|
||||||
|
|
||||||
|
@ -8,186 +8,27 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import struct
|
|
||||||
import zlib
|
|
||||||
|
|
||||||
from calibre import CurrentDir
|
|
||||||
from calibre.ebooks import DRMError
|
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
|
||||||
from calibre.ebooks.compression.palmdoc import decompress_doc
|
|
||||||
from calibre.ebooks.pdb.ereader import EreaderError
|
|
||||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||||
from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html
|
from calibre.ebooks.pdb.ereader.reader132 import Reader132
|
||||||
from calibre.ebooks.pml.pmlconverter import pml_to_html
|
from calibre.ebooks.pdb.ereader.reader202 import Reader202
|
||||||
|
|
||||||
class HeaderRecord(object):
|
|
||||||
'''
|
|
||||||
The first record in the file is always the header record. It holds
|
|
||||||
information related to the location of text, images, and so on
|
|
||||||
in the file. This is used in conjunction with the sections
|
|
||||||
defined in the file header.
|
|
||||||
'''
|
|
||||||
|
|
||||||
def __init__(self, raw):
|
|
||||||
self.version, = struct.unpack('>H', raw[0:2])
|
|
||||||
self.non_text_offset, = struct.unpack('>H', raw[12:14])
|
|
||||||
self.has_metadata, = struct.unpack('>H', raw[24:26])
|
|
||||||
self.footnote_rec, = struct.unpack('>H', raw[28:30])
|
|
||||||
self.sidebar_rec, = struct.unpack('>H', raw[30:32])
|
|
||||||
self.bookmark_offset, = struct.unpack('>H', raw[32:34])
|
|
||||||
self.image_data_offset, = struct.unpack('>H', raw[40:42])
|
|
||||||
self.metadata_offset, = struct.unpack('>H', raw[44:46])
|
|
||||||
self.footnote_offset, = struct.unpack('>H', raw[48:50])
|
|
||||||
self.sidebar_offset, = struct.unpack('>H', raw[50:52])
|
|
||||||
self.last_data_offset, = struct.unpack('>H', raw[52:54])
|
|
||||||
|
|
||||||
self.num_text_pages = self.non_text_offset - 1
|
|
||||||
self.num_image_pages = self.metadata_offset - self.image_data_offset
|
|
||||||
|
|
||||||
|
|
||||||
class Reader(FormatReader):
|
class Reader(FormatReader):
|
||||||
|
|
||||||
def __init__(self, header, stream, log, encoding=None):
|
def __init__(self, header, stream, log, encoding=None):
|
||||||
self.log = log
|
record0_size = len(header.section_data(0))
|
||||||
self.encoding = encoding
|
|
||||||
|
|
||||||
self.sections = []
|
if record0_size == 132:
|
||||||
for i in range(header.num_sections):
|
self.reader = Reader132(header, stream, log, encoding)
|
||||||
self.sections.append(header.section_data(i))
|
elif record0_size == 202:
|
||||||
|
self.reader = Reader202(header, stream, log, encoding)
|
||||||
self.header_record = HeaderRecord(self.section_data(0))
|
else:
|
||||||
|
raise ValueError('Unknown eReader Header')
|
||||||
if self.header_record.version not in (2, 10):
|
|
||||||
if self.header_record.version in (260, 272):
|
|
||||||
raise DRMError('eReader DRM is not supported.')
|
|
||||||
else:
|
|
||||||
raise EreaderError('Unknown book version %i.' % self.header_record.version)
|
|
||||||
|
|
||||||
from calibre.ebooks.metadata.pdb import get_metadata
|
|
||||||
self.mi = get_metadata(stream, False)
|
|
||||||
|
|
||||||
def section_data(self, number):
|
|
||||||
return self.sections[number]
|
|
||||||
|
|
||||||
def decompress_text(self, number):
|
|
||||||
if self.header_record.version == 2:
|
|
||||||
return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
|
|
||||||
if self.header_record.version == 10:
|
|
||||||
return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
|
|
||||||
|
|
||||||
|
|
||||||
def get_image(self, number):
|
|
||||||
if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
|
|
||||||
return 'empty', ''
|
|
||||||
data = self.section_data(number)
|
|
||||||
name = data[4:4 + 32].strip('\x00')
|
|
||||||
img = data[62:]
|
|
||||||
return name, img
|
|
||||||
|
|
||||||
def get_text_page(self, number):
|
|
||||||
'''
|
|
||||||
Only palmdoc and zlib compressed are supported. The text is
|
|
||||||
assumed to be encoded as Windows-1252. The encoding is part of
|
|
||||||
the eReader file spec and should always be this encoding.
|
|
||||||
'''
|
|
||||||
if number not in range(1, self.header_record.num_text_pages + 1):
|
|
||||||
return ''
|
|
||||||
|
|
||||||
return self.decompress_text(number)
|
|
||||||
|
|
||||||
def extract_content(self, output_dir):
|
def extract_content(self, output_dir):
|
||||||
output_dir = os.path.abspath(output_dir)
|
return self.reader.extract_content(output_dir)
|
||||||
|
|
||||||
if not os.path.exists(output_dir):
|
|
||||||
os.makedirs(output_dir)
|
|
||||||
|
|
||||||
html = u'<html><head><title></title></head><body>'
|
|
||||||
|
|
||||||
for i in range(1, self.header_record.num_text_pages + 1):
|
|
||||||
self.log.debug('Extracting text page %i' % i)
|
|
||||||
html += pml_to_html(self.get_text_page(i))
|
|
||||||
|
|
||||||
if self.header_record.footnote_rec > 0:
|
|
||||||
html += '<br /><h1>%s</h1>' % _('Footnotes')
|
|
||||||
footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
|
||||||
for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)):
|
|
||||||
self.log.debug('Extracting footnote page %i' % i)
|
|
||||||
html += '<dl>'
|
|
||||||
html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
|
|
||||||
html += '</dl>'
|
|
||||||
|
|
||||||
if self.header_record.sidebar_rec > 0:
|
|
||||||
html += '<br /><h1>%s</h1>' % _('Sidebar')
|
|
||||||
sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
|
|
||||||
for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)):
|
|
||||||
self.log.debug('Extracting sidebar page %i' % i)
|
|
||||||
html += '<dl>'
|
|
||||||
html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i))
|
|
||||||
html += '</dl>'
|
|
||||||
|
|
||||||
html += '</body></html>'
|
|
||||||
|
|
||||||
with CurrentDir(output_dir):
|
|
||||||
with open('index.html', 'wb') as index:
|
|
||||||
self.log.debug('Writing text to index.html')
|
|
||||||
index.write(html.encode('utf-8'))
|
|
||||||
|
|
||||||
if not os.path.exists(os.path.join(output_dir, 'images/')):
|
|
||||||
os.makedirs(os.path.join(output_dir, 'images/'))
|
|
||||||
images = []
|
|
||||||
with CurrentDir(os.path.join(output_dir, 'images/')):
|
|
||||||
for i in range(0, self.header_record.num_image_pages):
|
|
||||||
name, img = self.get_image(self.header_record.image_data_offset + i)
|
|
||||||
images.append(name)
|
|
||||||
with open(name, 'wb') as imgf:
|
|
||||||
self.log.debug('Writing image %s to images/' % name)
|
|
||||||
imgf.write(img)
|
|
||||||
|
|
||||||
opf_path = self.create_opf(output_dir, images)
|
|
||||||
|
|
||||||
return opf_path
|
|
||||||
|
|
||||||
def create_opf(self, output_dir, images):
|
|
||||||
with CurrentDir(output_dir):
|
|
||||||
opf = OPFCreator(output_dir, self.mi)
|
|
||||||
|
|
||||||
manifest = [('index.html', None)]
|
|
||||||
|
|
||||||
for i in images:
|
|
||||||
manifest.append((os.path.join('images/', i), None))
|
|
||||||
|
|
||||||
opf.create_manifest(manifest)
|
|
||||||
opf.create_spine(['index.html'])
|
|
||||||
with open('metadata.opf', 'wb') as opffile:
|
|
||||||
opf.render(opffile)
|
|
||||||
|
|
||||||
return os.path.join(output_dir, 'metadata.opf')
|
|
||||||
|
|
||||||
def dump_pml(self):
|
def dump_pml(self):
|
||||||
'''
|
return self.reader.dump_pml()
|
||||||
This is primarily used for debugging and 3rd party tools to
|
|
||||||
get the pml markup that comprises the text in the file.
|
|
||||||
'''
|
|
||||||
pml = ''
|
|
||||||
|
|
||||||
for i in range(1, self.header_record.num_text_pages + 1):
|
|
||||||
pml += self.get_text_page(i)
|
|
||||||
|
|
||||||
return pml
|
|
||||||
|
|
||||||
def dump_images(self, output_dir):
|
|
||||||
'''
|
|
||||||
This is primarily used for debugging and 3rd party tools to
|
|
||||||
get the images in the file.
|
|
||||||
'''
|
|
||||||
if not os.path.exists(output_dir):
|
|
||||||
os.makedirs(output_dir)
|
|
||||||
|
|
||||||
with CurrentDir(output_dir):
|
|
||||||
for i in range(0, self.header_record.num_image_pages):
|
|
||||||
name, img = self.get_image(self.header_record.image_data_offset + i)
|
|
||||||
with open(name, 'wb') as imgf:
|
|
||||||
imgf.write(img)
|
|
||||||
|
|
||||||
|
def dump_images(self, output_dir):
    '''
    Write the book's images into output_dir.

    This is a thin wrapper: the work is done by the header specific
    reader stored in self.reader. The wrapped readers' dump_images
    methods require the output directory, so it has to be accepted here
    and passed through (calling them without it raises TypeError).

    :param output_dir: Directory the images are written to.
    :return: Whatever the wrapped reader's dump_images returns.
    '''
    return self.reader.dump_images(output_dir)
|
||||||
|
192
src/calibre/ebooks/pdb/ereader/reader132.py
Normal file
192
src/calibre/ebooks/pdb/ereader/reader132.py
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
'''
|
||||||
|
Read content from ereader pdb file with a 132 byte header created by Dropbook.
|
||||||
|
'''
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import struct
|
||||||
|
import zlib
|
||||||
|
|
||||||
|
from calibre import CurrentDir
|
||||||
|
from calibre.ebooks import DRMError
|
||||||
|
from calibre.ebooks.compression.palmdoc import decompress_doc
|
||||||
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
|
from calibre.ebooks.pdb.ereader import EreaderError
|
||||||
|
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||||
|
from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html
|
||||||
|
from calibre.ebooks.pml.pmlconverter import pml_to_html
|
||||||
|
|
||||||
|
class HeaderRecord(object):
    '''
    Parsed form of record 0 of a 132 byte header eReader book.

    Record 0 is the header record. Each value read here is a big-endian
    unsigned 16 bit integer stored at a fixed byte offset; together they
    say where the text, images, metadata, footnotes and sidebars live
    among the sections listed in the PDB file header.
    '''

    # (attribute name, byte offset of its 16 bit value inside record 0)
    _FIELDS = (
        ('version', 0),
        ('non_text_offset', 12),
        ('has_metadata', 24),
        ('footnote_rec', 28),
        ('sidebar_rec', 30),
        ('bookmark_offset', 32),
        ('image_data_offset', 40),
        ('metadata_offset', 44),
        ('footnote_offset', 48),
        ('sidebar_offset', 50),
        ('last_data_offset', 52),
    )

    def __init__(self, raw):
        '''
        :param raw: The bytes of record 0.
        :raises struct.error: if raw is too short to hold a field.
        '''
        for attr, offset in self._FIELDS:
            value, = struct.unpack_from('>H', raw, offset)
            setattr(self, attr, value)

        # Derived counts: one less than the first non text record, and
        # the number of records between the first image and the metadata.
        self.num_text_pages = self.non_text_offset - 1
        self.num_image_pages = self.metadata_offset - self.image_data_offset
|
||||||
|
|
||||||
|
|
||||||
|
class Reader132(FormatReader):
    '''
    Reader for eReader PDB books that use the 132 byte header record
    written by Dropbook.

    Every section of the file is copied into memory on construction.
    Text records are decompressed with PalmDoc (header version 2) or zlib
    (header version 10) and decoded as cp1252 unless an encoding override
    was supplied.
    '''

    def __init__(self, header, stream, log, encoding=None):
        '''
        :param header: PDB header object; its ``num_sections`` attribute
            and ``section_data(i)`` method are used to load the sections.
        :param stream: The open book file, passed on to ``get_metadata``.
        :param log: Logger; only its ``debug`` method is called.
        :param encoding: Text encoding override, ``None`` for cp1252.
        :raises DRMError: if the header version is 260 or 272.
        :raises EreaderError: if the header version is anything else
            other than 2 or 10.
        '''
        self.log = log
        self.encoding = encoding

        self.sections = [header.section_data(i) for i in range(header.num_sections)]
        self.header_record = HeaderRecord(self.section_data(0))

        version = self.header_record.version
        if version not in (2, 10):
            if version in (260, 272):
                raise DRMError('eReader DRM is not supported.')
            raise EreaderError('Unknown book version %i.' % version)

        # Kept as a function level import as in the original code
        # (presumably to avoid an import cycle -- TODO confirm).
        from calibre.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)

    def _codec(self):
        # Single place for the "override or cp1252" decision.
        return 'cp1252' if self.encoding is None else self.encoding

    def section_data(self, number):
        '''Return the raw data of section ``number``.'''
        return self.sections[number]

    def decompress_text(self, number):
        '''
        Decompress and decode section ``number``.

        Version 2 books use PalmDoc compression, version 10 books use
        zlib. ``None`` is returned for any other version; ``__init__``
        rejects those, so that is not expected to happen.
        '''
        raw = self.section_data(number)
        if self.header_record.version == 2:
            return decompress_doc(raw).decode(self._codec())
        if self.header_record.version == 10:
            return zlib.decompress(raw).decode(self._codec())

    def get_image(self, number):
        '''
        Return ``(name, data)`` for the image stored in section ``number``.

        ``('empty', '')`` is returned when ``number`` lies outside the
        image records described by the header.
        '''
        first = self.header_record.image_data_offset
        last = first + self.header_record.num_image_pages - 1
        if number < first or number > last:
            return 'empty', ''

        data = self.section_data(number)
        # The name occupies bytes 4-35, NUL padded. It comes from the book
        # file and callers use it as a file name, so any directory part is
        # dropped to keep writes inside the chosen output directory.
        name = os.path.basename(data[4:4 + 32].strip('\x00'))
        img = data[62:]
        return name, img

    def get_text_page(self, number):
        '''
        Return the decoded PML text of text page ``number``, or ``''``
        when ``number`` is not a text page.

        Only palmdoc and zlib compressed are supported. The text is
        assumed to be encoded as Windows-1252. The encoding is part of
        the eReader file spec and should always be this encoding.
        '''
        if not 1 <= number <= self.header_record.num_text_pages:
            return ''

        return self.decompress_text(number)

    def _notes_html(self, heading, kind, offset, count):
        '''
        Build the HTML pieces for the footnote or sidebar records.

        The record at ``offset`` holds the NUL terminated ids; the records
        after it, up to ``offset + count - 1``, hold the note texts.

        :param heading: Already translated section heading.
        :param kind: Word used in the debug log ('footnote' or 'sidebar').
        :return: List of HTML fragments, empty when ``count`` is not positive.
        '''
        if count <= 0:
            return []

        pieces = ['<br /><h1>%s</h1>' % heading]
        ids = re.findall(r'\w+(?=\x00)', self.section_data(offset).decode(self._codec()))
        for idx, rec in enumerate(range(offset + 1, offset + count)):
            self.log.debug('Extracting %s page %i' % (kind, rec))
            pieces.append('<dl>')
            pieces.append(footnote_sidebar_to_html(ids[idx], self.decompress_text(rec)))
            pieces.append('</dl>')
        return pieces

    def extract_content(self, output_dir):
        '''
        Convert the book into ``index.html``, an ``images/`` directory and
        ``metadata.opf`` inside ``output_dir`` (created if missing).

        :return: Path of the written ``metadata.opf``.
        '''
        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        hr = self.header_record

        # Collect the fragments and join once instead of growing one
        # string piece by piece.
        pieces = [u'<html><head><title></title></head><body>']

        for i in range(1, hr.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            pieces.append(pml_to_html(self.get_text_page(i)))

        pieces.extend(self._notes_html(_('Footnotes'), 'footnote', hr.footnote_offset, hr.footnote_rec))
        pieces.extend(self._notes_html(_('Sidebar'), 'sidebar', hr.sidebar_offset, hr.sidebar_rec))

        pieces.append('</body></html>')
        html = u''.join(pieces)

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))

        images_dir = os.path.join(output_dir, 'images/')
        if not os.path.exists(images_dir):
            os.makedirs(images_dir)

        images = []
        with CurrentDir(images_dir):
            for i in range(0, hr.num_image_pages):
                name, img = self.get_image(hr.image_data_offset + i)
                images.append(name)
                with open(name, 'wb') as imgf:
                    self.log.debug('Writing image %s to images/' % name)
                    imgf.write(img)

        opf_path = self.create_opf(output_dir, images)

        return opf_path

    def create_opf(self, output_dir, images):
        '''
        Write ``metadata.opf`` into ``output_dir``. The manifest lists
        ``index.html`` plus every name in ``images`` (under ``images/``);
        the spine contains ``index.html`` only.

        :return: Path of the written ``metadata.opf``.
        '''
        with CurrentDir(output_dir):
            opf = OPFCreator(output_dir, self.mi)

            manifest = [('index.html', None)]
            manifest.extend((os.path.join('images/', i), None) for i in images)

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)

        return os.path.join(output_dir, 'metadata.opf')

    def dump_pml(self):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the pml markup that comprises the text in the file.
        '''
        return ''.join(self.get_text_page(i) for i in range(1, self.header_record.num_text_pages + 1))

    def dump_images(self, output_dir):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the images in the file. They are written into ``output_dir``,
        which is created if missing.
        '''
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with CurrentDir(output_dir):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                with open(name, 'wb') as imgf:
                    imgf.write(img)
|
||||||
|
|
155
src/calibre/ebooks/pdb/ereader/reader202.py
Normal file
155
src/calibre/ebooks/pdb/ereader/reader202.py
Normal file
@ -0,0 +1,155 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
'''
|
||||||
|
Read content from ereader pdb file with a 202 byte header created by Makebook.
|
||||||
|
'''
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import os
|
||||||
|
import struct
|
||||||
|
|
||||||
|
from calibre import CurrentDir
|
||||||
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
|
from calibre.ebooks.pml.pmlconverter import pml_to_html
|
||||||
|
from calibre.ebooks.compression.palmdoc import decompress_doc
|
||||||
|
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||||
|
from calibre.ebooks.pdb.ereader import EreaderError
|
||||||
|
|
||||||
|
class HeaderRecord(object):
    '''
    Parsed form of record 0 of a 202 byte header eReader book.

    Record 0 is the header record. Only two big-endian unsigned 16 bit
    values are read from it: the format version at byte 0 and the number
    of the first non text record at byte 8.
    '''

    def __init__(self, raw):
        '''
        :param raw: The bytes of record 0.
        :raises struct.error: if raw is too short to hold a field.
        '''
        version, = struct.unpack_from('>H', raw, 0)
        first_non_text, = struct.unpack_from('>H', raw, 8)

        self.version = version
        self.non_text_offset = first_non_text

        # One less than the first non text record number.
        self.num_text_pages = first_non_text - 1
|
||||||
|
|
||||||
|
|
||||||
|
class Reader202(FormatReader):
    '''
    Reader for eReader PDB books that use the 202 byte header record
    written by Makebook.

    Every section of the file is copied into memory on construction.
    Text records are XORed with 0xA5, PalmDoc decompressed and decoded as
    cp1252 unless an encoding override was supplied. Images are found by
    scanning the records after the text for data starting with 'PNG'.
    '''

    def __init__(self, header, stream, log, encoding=None):
        '''
        :param header: PDB header object; its ``num_sections`` attribute
            and ``section_data(i)`` method are used to load the sections.
        :param stream: Accepted for interface compatibility; not used here.
        :param log: Logger; only its ``debug`` method is called.
        :param encoding: Text encoding override, ``None`` for cp1252.
        :raises EreaderError: if the header version is not 4.
        '''
        self.log = log
        self.encoding = encoding

        self.sections = [header.section_data(i) for i in range(header.num_sections)]
        self.header_record = HeaderRecord(self.section_data(0))

        if self.header_record.version != 4:
            raise EreaderError('Unknown book version %i.' % self.header_record.version)

    def section_data(self, number):
        '''Return the raw data of section ``number``.'''
        return self.sections[number]

    def decompress_text(self, number):
        '''Un-XOR (0xA5), PalmDoc decompress and decode section ``number``.'''
        unmasked = ''.join(chr(ord(x) ^ 0xA5) for x in self.section_data(number))
        codec = 'cp1252' if self.encoding is None else self.encoding
        return decompress_doc(unmasked).decode(codec)

    def get_image(self, number):
        '''
        Return ``(name, data)`` for the image stored in section ``number``,
        or ``(None, None)`` when the section does not start with 'PNG'.
        '''
        name = None
        img = None

        data = self.section_data(number)
        if data.startswith('PNG'):
            # The name occupies bytes 4-35, NUL padded. It comes from the
            # book file and callers use it as a file name, so any directory
            # part is dropped to keep writes inside the output directory.
            name = os.path.basename(data[4:4 + 32].strip('\x00'))
            img = data[62:]

        return name, img

    def _iter_images(self):
        # Yield (name, data) for every record after the text pages that
        # holds an image with a usable name. The 202 byte header record
        # parsed here carries no image count or offset, so the records
        # have to be scanned.
        for i in range(self.header_record.non_text_offset, len(self.sections)):
            name, img = self.get_image(i)
            if name:
                yield name, img

    def get_text_page(self, number):
        '''
        Return the decoded PML text of text page ``number``, or ``''``
        when ``number`` is not a text page.

        Only palmdoc compression is supported. The text is xored with 0xA5 and
        assumed to be encoded as Windows-1252. The encoding is part of
        the eReader file spec and should always be this encoding.
        '''
        if not 1 <= number <= self.header_record.num_text_pages:
            return ''

        return self.decompress_text(number)

    def extract_content(self, output_dir):
        '''
        Convert the book into ``index.html``, an ``images/`` directory and
        ``metadata.opf`` inside ``output_dir`` (created if missing).

        :return: Path of the written ``metadata.opf``.
        '''
        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Collect the fragments and join once instead of growing one
        # string piece by piece.
        pieces = [u'<html><head><title></title></head><body>']

        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            pieces.append(pml_to_html(self.get_text_page(i)))

        pieces.append('</body></html>')
        html = u''.join(pieces)

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))

        images_dir = os.path.join(output_dir, 'images/')
        if not os.path.exists(images_dir):
            os.makedirs(images_dir)

        images = []
        with CurrentDir(images_dir):
            for name, img in self._iter_images():
                images.append(name)
                with open(name, 'wb') as imgf:
                    self.log.debug('Writing image %s to images/' % name)
                    imgf.write(img)

        opf_path = self.create_opf(output_dir, images)

        return opf_path

    def create_opf(self, output_dir, images):
        '''
        Write ``metadata.opf`` into ``output_dir`` with placeholder
        metadata (no metadata is read from the book by this reader). The
        manifest lists ``index.html`` plus every name in ``images`` (under
        ``images/``); the spine contains ``index.html`` only.

        :return: Path of the written ``metadata.opf``.
        '''
        with CurrentDir(output_dir):
            # Authors are passed as a list, the same way get_metadata
            # builds its MetaInformation, not as a bare string.
            opf = OPFCreator(output_dir, MetaInformation(_('Unknown'), [_('Unknown')]))

            manifest = [('index.html', None)]
            manifest.extend((os.path.join('images/', i), None) for i in images)

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)

        return os.path.join(output_dir, 'metadata.opf')

    def dump_pml(self):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the pml markup that comprises the text in the file.
        '''
        return ''.join(self.get_text_page(i) for i in range(1, self.header_record.num_text_pages + 1))

    def dump_images(self, output_dir):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the images in the file. They are written into ``output_dir``,
        which is created if missing.

        The image records are located the same way extract_content does
        it, because this header record has no image count or offset.
        '''
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with CurrentDir(output_dir):
            for name, img in self._iter_images():
                with open(name, 'wb') as imgf:
                    imgf.write(img)
|
Loading…
x
Reference in New Issue
Block a user