diff --git a/src/calibre/ebooks/metadata/ereader.py b/src/calibre/ebooks/metadata/ereader.py index b1edee10b0..6e6624ce2a 100644 --- a/src/calibre/ebooks/metadata/ereader.py +++ b/src/calibre/ebooks/metadata/ereader.py @@ -8,11 +8,13 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import re +import struct -from calibre.ebooks.metadata import MetaInformation, authors_to_string -from calibre.ebooks.pdb.header import PdbHeaderReader, PdbHeaderBuilder -from calibre.ebooks.pdb.ereader.reader import HeaderRecord +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.metadata import authors_to_string +from calibre.ebooks.pdb.ereader.reader132 import HeaderRecord +from calibre.ebooks.pdb.header import PdbHeaderBuilder +from calibre.ebooks.pdb.header import PdbHeaderReader def get_metadata(stream, extract_cover=True): """ @@ -20,14 +22,14 @@ def get_metadata(stream, extract_cover=True): """ mi = MetaInformation(None, [_('Unknown')]) stream.seek(0) - + pheader = PdbHeaderReader(stream) hr = HeaderRecord(pheader.section_data(0)) - + if hr.version in (2, 10) and hr.has_metadata == 1: try: mdata = pheader.section_data(hr.metadata_offset) - + mdata = mdata.split('\x00') mi.title = mdata[0] mi.authors = [mdata[1]] @@ -35,7 +37,7 @@ def get_metadata(stream, extract_cover=True): mi.isbn = mdata[4] except: pass - + if not mi.title: mi.title = pheader.title if pheader.title else _('Unknown') @@ -43,26 +45,31 @@ def get_metadata(stream, extract_cover=True): def set_metadata(stream, mi): pheader = PdbHeaderReader(stream) + + # Only Dropbook produced 132 byte record0 files are supported + if pheader.section_data(0) != 132: + return + sections = [pheader.section_data(x) for x in range(0, pheader.section_count())] hr = HeaderRecord(sections[0]) - + if hr.version not in (2, 10): return - + # Create a metadata record for the file if one does not alreay exist if not hr.has_metadata: sections += ['', 'MeTaInFo\x00'] last_data = len(sections) - 1 - + for i in range(0, 132, 2): - val, = struct.unpack('>H', sections[0][i:i+2]) + val, = struct.unpack('>H', sections[0][i:i + 2]) if val >= hr.last_data_offset: - sections[0][i:i+2] = struct.pack('>H', last_data) - + sections[0][i:i + 2] = struct.pack('>H', last_data) + sections[0][24:26] = struct.pack('>H', 1) # Set has metadata sections[0][44:46] = struct.pack('>H', last_data - 1) # Set location of metadata sections[0][52:54] = struct.pack('>H', last_data) # Ensure last data offset is updated - + # Merge the metadata into the file file_mi = get_metadata(stream, False) file_mi.smart_update(mi) @@ -79,4 +86,3 @@ def set_metadata(stream, mi): # Write the data back to the file for item in sections: stream.write(item) - diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 7d29ef243c..3afb13f035 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -8,186 +8,27 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os -import re -import struct -import zlib - -from calibre import CurrentDir -from calibre.ebooks import DRMError -from calibre.ebooks.metadata.opf2 import OPFCreator -from calibre.ebooks.compression.palmdoc import decompress_doc -from calibre.ebooks.pdb.ereader import EreaderError from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html -from calibre.ebooks.pml.pmlconverter import pml_to_html - -class HeaderRecord(object): - ''' - The first record in the file is always the header record. It holds - information related to the location of text, images, and so on - in the file. This is used in conjunction with the sections - defined in the file header. - ''' - - def __init__(self, raw): - self.version, = struct.unpack('>H', raw[0:2]) - self.non_text_offset, = struct.unpack('>H', raw[12:14]) - self.has_metadata, = struct.unpack('>H', raw[24:26]) - self.footnote_rec, = struct.unpack('>H', raw[28:30]) - self.sidebar_rec, = struct.unpack('>H', raw[30:32]) - self.bookmark_offset, = struct.unpack('>H', raw[32:34]) - self.image_data_offset, = struct.unpack('>H', raw[40:42]) - self.metadata_offset, = struct.unpack('>H', raw[44:46]) - self.footnote_offset, = struct.unpack('>H', raw[48:50]) - self.sidebar_offset, = struct.unpack('>H', raw[50:52]) - self.last_data_offset, = struct.unpack('>H', raw[52:54]) - - self.num_text_pages = self.non_text_offset - 1 - self.num_image_pages = self.metadata_offset - self.image_data_offset - +from calibre.ebooks.pdb.ereader.reader132 import Reader132 +from calibre.ebooks.pdb.ereader.reader202 import Reader202 class Reader(FormatReader): def __init__(self, header, stream, log, encoding=None): - self.log = log - self.encoding = encoding + record0_size = len(header.section_data(0)) - self.sections = [] - for i in range(header.num_sections): - self.sections.append(header.section_data(i)) - - self.header_record = HeaderRecord(self.section_data(0)) - - if self.header_record.version not in (2, 10): - if self.header_record.version in (260, 272): - raise DRMError('eReader DRM is not supported.') - else: - raise EreaderError('Unknown book version %i.' % self.header_record.version) - - from calibre.ebooks.metadata.pdb import get_metadata - self.mi = get_metadata(stream, False) - - def section_data(self, number): - return self.sections[number] - - def decompress_text(self, number): - if self.header_record.version == 2: - return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) - if self.header_record.version == 10: - return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) - - - def get_image(self, number): - if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: - return 'empty', '' - data = self.section_data(number) - name = data[4:4 + 32].strip('\x00') - img = data[62:] - return name, img - - def get_text_page(self, number): - ''' - Only palmdoc and zlib compressed are supported. The text is - assumed to be encoded as Windows-1252. The encoding is part of - the eReader file spec and should always be this encoding. - ''' - if number not in range(1, self.header_record.num_text_pages + 1): - return '' - - return self.decompress_text(number) + if record0_size == 132: + self.reader = Reader132(header, stream, log, encoding) + elif record0_size == 202: + self.reader = Reader202(header, stream, log, encoding) + else: + raise ValueError('Unknown eReader Header') def extract_content(self, output_dir): - output_dir = os.path.abspath(output_dir) - - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - html = u'' - - for i in range(1, self.header_record.num_text_pages + 1): - self.log.debug('Extracting text page %i' % i) - html += pml_to_html(self.get_text_page(i)) - - if self.header_record.footnote_rec > 0: - html += '

%s

' % _('Footnotes') - footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) - for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)): - self.log.debug('Extracting footnote page %i' % i) - html += '
' - html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) - html += '
' - - if self.header_record.sidebar_rec > 0: - html += '

%s

' % _('Sidebar') - sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) - for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)): - self.log.debug('Extracting sidebar page %i' % i) - html += '
' - html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i)) - html += '
' - - html += '' - - with CurrentDir(output_dir): - with open('index.html', 'wb') as index: - self.log.debug('Writing text to index.html') - index.write(html.encode('utf-8')) - - if not os.path.exists(os.path.join(output_dir, 'images/')): - os.makedirs(os.path.join(output_dir, 'images/')) - images = [] - with CurrentDir(os.path.join(output_dir, 'images/')): - for i in range(0, self.header_record.num_image_pages): - name, img = self.get_image(self.header_record.image_data_offset + i) - images.append(name) - with open(name, 'wb') as imgf: - self.log.debug('Writing image %s to images/' % name) - imgf.write(img) - - opf_path = self.create_opf(output_dir, images) - - return opf_path - - def create_opf(self, output_dir, images): - with CurrentDir(output_dir): - opf = OPFCreator(output_dir, self.mi) - - manifest = [('index.html', None)] - - for i in images: - manifest.append((os.path.join('images/', i), None)) - - opf.create_manifest(manifest) - opf.create_spine(['index.html']) - with open('metadata.opf', 'wb') as opffile: - opf.render(opffile) - - return os.path.join(output_dir, 'metadata.opf') + return self.reader.extract_content(output_dir) def dump_pml(self): - ''' - This is primarily used for debugging and 3rd party tools to - get the plm markup that comprises the text in the file. - ''' - pml = '' - - for i in range(1, self.header_record.num_text_pages + 1): - pml += self.get_text_page(i) - - return pml - - def dump_images(self, output_dir): - ''' - This is primarily used for debugging and 3rd party tools to - get the images in the file. - ''' - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - with CurrentDir(output_dir): - for i in range(0, self.header_record.num_image_pages): - name, img = self.get_image(self.header_record.image_data_offset + i) - with open(name, 'wb') as imgf: - imgf.write(img) + return self.reader.dump_pml() + def dump_images(self): + return self.reader.dump_images() diff --git a/src/calibre/ebooks/pdb/ereader/reader132.py b/src/calibre/ebooks/pdb/ereader/reader132.py new file mode 100644 index 0000000000..91edfaf48b --- /dev/null +++ b/src/calibre/ebooks/pdb/ereader/reader132.py @@ -0,0 +1,192 @@ +# -*- coding: utf-8 -*- + +''' +Read content from ereader pdb file with a 132 byte header created by Dropbook. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os +import re +import struct +import zlib + +from calibre import CurrentDir +from calibre.ebooks import DRMError +from calibre.ebooks.compression.palmdoc import decompress_doc +from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.pdb.ereader import EreaderError +from calibre.ebooks.pdb.formatreader import FormatReader +from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html +from calibre.ebooks.pml.pmlconverter import pml_to_html + +class HeaderRecord(object): + ''' + The first record in the file is always the header record. It holds + information related to the location of text, images, and so on + in the file. This is used in conjunction with the sections + defined in the file header. + ''' + + def __init__(self, raw): + self.version, = struct.unpack('>H', raw[0:2]) + self.non_text_offset, = struct.unpack('>H', raw[12:14]) + self.has_metadata, = struct.unpack('>H', raw[24:26]) + self.footnote_rec, = struct.unpack('>H', raw[28:30]) + self.sidebar_rec, = struct.unpack('>H', raw[30:32]) + self.bookmark_offset, = struct.unpack('>H', raw[32:34]) + self.image_data_offset, = struct.unpack('>H', raw[40:42]) + self.metadata_offset, = struct.unpack('>H', raw[44:46]) + self.footnote_offset, = struct.unpack('>H', raw[48:50]) + self.sidebar_offset, = struct.unpack('>H', raw[50:52]) + self.last_data_offset, = struct.unpack('>H', raw[52:54]) + + self.num_text_pages = self.non_text_offset - 1 + self.num_image_pages = self.metadata_offset - self.image_data_offset + + +class Reader132(FormatReader): + + def __init__(self, header, stream, log, encoding=None): + self.log = log + self.encoding = encoding + + self.sections = [] + for i in range(header.num_sections): + self.sections.append(header.section_data(i)) + + self.header_record = HeaderRecord(self.section_data(0)) + + if self.header_record.version not in (2, 10): + if self.header_record.version in (260, 272): + raise DRMError('eReader DRM is not supported.') + else: + raise EreaderError('Unknown book version %i.' % self.header_record.version) + + from calibre.ebooks.metadata.pdb import get_metadata + self.mi = get_metadata(stream, False) + + def section_data(self, number): + return self.sections[number] + + def decompress_text(self, number): + if self.header_record.version == 2: + return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) + if self.header_record.version == 10: + return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) + + def get_image(self, number): + if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: + return 'empty', '' + data = self.section_data(number) + name = data[4:4 + 32].strip('\x00') + img = data[62:] + return name, img + + def get_text_page(self, number): + ''' + Only palmdoc and zlib compressed are supported. The text is + assumed to be encoded as Windows-1252. The encoding is part of + the eReader file spec and should always be this encoding. + ''' + if number not in range(1, self.header_record.num_text_pages + 1): + return '' + + return self.decompress_text(number) + + def extract_content(self, output_dir): + output_dir = os.path.abspath(output_dir) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + html = u'' + + for i in range(1, self.header_record.num_text_pages + 1): + self.log.debug('Extracting text page %i' % i) + html += pml_to_html(self.get_text_page(i)) + + if self.header_record.footnote_rec > 0: + html += '

%s

' % _('Footnotes') + footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) + for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)): + self.log.debug('Extracting footnote page %i' % i) + html += '
' + html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) + html += '
' + + if self.header_record.sidebar_rec > 0: + html += '

%s

' % _('Sidebar') + sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) + for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)): + self.log.debug('Extracting sidebar page %i' % i) + html += '
' + html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i)) + html += '
' + + html += '' + + with CurrentDir(output_dir): + with open('index.html', 'wb') as index: + self.log.debug('Writing text to index.html') + index.write(html.encode('utf-8')) + + if not os.path.exists(os.path.join(output_dir, 'images/')): + os.makedirs(os.path.join(output_dir, 'images/')) + images = [] + with CurrentDir(os.path.join(output_dir, 'images/')): + for i in range(0, self.header_record.num_image_pages): + name, img = self.get_image(self.header_record.image_data_offset + i) + images.append(name) + with open(name, 'wb') as imgf: + self.log.debug('Writing image %s to images/' % name) + imgf.write(img) + + opf_path = self.create_opf(output_dir, images) + + return opf_path + + def create_opf(self, output_dir, images): + with CurrentDir(output_dir): + opf = OPFCreator(output_dir, self.mi) + + manifest = [('index.html', None)] + + for i in images: + manifest.append((os.path.join('images/', i), None)) + + opf.create_manifest(manifest) + opf.create_spine(['index.html']) + with open('metadata.opf', 'wb') as opffile: + opf.render(opffile) + + return os.path.join(output_dir, 'metadata.opf') + + def dump_pml(self): + ''' + This is primarily used for debugging and 3rd party tools to + get the plm markup that comprises the text in the file. + ''' + pml = '' + + for i in range(1, self.header_record.num_text_pages + 1): + pml += self.get_text_page(i) + + return pml + + def dump_images(self, output_dir): + ''' + This is primarily used for debugging and 3rd party tools to + get the images in the file. + ''' + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with CurrentDir(output_dir): + for i in range(0, self.header_record.num_image_pages): + name, img = self.get_image(self.header_record.image_data_offset + i) + with open(name, 'wb') as imgf: + imgf.write(img) + diff --git a/src/calibre/ebooks/pdb/ereader/reader202.py b/src/calibre/ebooks/pdb/ereader/reader202.py new file mode 100644 index 0000000000..120cb5f1d2 --- /dev/null +++ b/src/calibre/ebooks/pdb/ereader/reader202.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +''' +Read content from ereader pdb file with a 202 byte header created by Makebook. +''' +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os +import struct + +from calibre import CurrentDir +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.pml.pmlconverter import pml_to_html +from calibre.ebooks.compression.palmdoc import decompress_doc +from calibre.ebooks.pdb.formatreader import FormatReader +from calibre.ebooks.pdb.ereader import EreaderError + +class HeaderRecord(object): + ''' + The first record in the file is always the header record. It holds + information related to the location of text, images, and so on + in the file. This is used in conjunction with the sections + defined in the file header. + ''' + + def __init__(self, raw): + self.version, = struct.unpack('>H', raw[0:2]) + self.non_text_offset, = struct.unpack('>H', raw[8:10]) + + self.num_text_pages = self.non_text_offset - 1 + + +class Reader202(FormatReader): + + def __init__(self, header, stream, log, encoding=None): + self.log = log + self.encoding = encoding + + self.sections = [] + for i in range(header.num_sections): + self.sections.append(header.section_data(i)) + + self.header_record = HeaderRecord(self.section_data(0)) + + if self.header_record.version != 4: + raise EreaderError('Unknown book version %i.' % self.header_record.version) + + def section_data(self, number): + return self.sections[number] + + def decompress_text(self, number): + return decompress_doc(''.join([chr(ord(x) ^ 0xA5) for x in self.section_data(number)])).decode('cp1252' if self.encoding is None else self.encoding) + + def get_image(self, number): + name = None + img = None + + data = self.section_data(number) + if data.startswith('PNG'): + name = data[4:4 + 32].strip('\x00') + img = data[62:] + + return name, img + + def get_text_page(self, number): + ''' + Only palmdoc compression is supported. The text is xored with 0xA5 and + assumed to be encoded as Windows-1252. The encoding is part of + the eReader file spec and should always be this encoding. + ''' + if number not in range(1, self.header_record.num_text_pages + 1): + return '' + + return self.decompress_text(number) + + def extract_content(self, output_dir): + output_dir = os.path.abspath(output_dir) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + html = u'' + + for i in range(1, self.header_record.num_text_pages + 1): + self.log.debug('Extracting text page %i' % i) + html += pml_to_html(self.get_text_page(i)) + + + html += '' + + with CurrentDir(output_dir): + with open('index.html', 'wb') as index: + self.log.debug('Writing text to index.html') + index.write(html.encode('utf-8')) + + if not os.path.exists(os.path.join(output_dir, 'images/')): + os.makedirs(os.path.join(output_dir, 'images/')) + images = [] + with CurrentDir(os.path.join(output_dir, 'images/')): + for i in range(self.header_record.non_text_offset, len(self.sections)): + name, img = self.get_image(i) + if name: + images.append(name) + with open(name, 'wb') as imgf: + self.log.debug('Writing image %s to images/' % name) + imgf.write(img) + + opf_path = self.create_opf(output_dir, images) + + return opf_path + + def create_opf(self, output_dir, images): + with CurrentDir(output_dir): + opf = OPFCreator(output_dir, MetaInformation(_('Unknown'), _('Unknown'))) + + manifest = [('index.html', None)] + + for i in images: + manifest.append((os.path.join('images/', i), None)) + + opf.create_manifest(manifest) + opf.create_spine(['index.html']) + with open('metadata.opf', 'wb') as opffile: + opf.render(opffile) + + return os.path.join(output_dir, 'metadata.opf') + + def dump_pml(self): + ''' + This is primarily used for debugging and 3rd party tools to + get the plm markup that comprises the text in the file. + ''' + pml = '' + + for i in range(1, self.header_record.num_text_pages + 1): + pml += self.get_text_page(i) + + return pml + + def dump_images(self, output_dir): + ''' + This is primarily used for debugging and 3rd party tools to + get the images in the file. + ''' + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with CurrentDir(output_dir): + for i in range(0, self.header_record.num_image_pages): + name, img = self.get_image(self.header_record.image_data_offset + i) + with open(name, 'wb') as imgf: + imgf.write(img)