From d1087fa909dfccc7c87f813bb14a6e2512479387 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 20 May 2009 20:21:29 -0400 Subject: [PATCH 01/15] ignore netbeans project dir. Fix error when auto converting news. --- .bzrignore | 1 + src/calibre/gui2/device.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.bzrignore b/.bzrignore index 47f754ef3c..0a44159b1e 100644 --- a/.bzrignore +++ b/.bzrignore @@ -13,6 +13,7 @@ src/calibre/manual/cli/ build dist docs +nbproject/ src/calibre/gui2/pictureflow/Makefile.Debug src/calibre/gui2/pictureflow/Makefile.Release src/calibre/gui2/pictureflow/debug/ diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index caed0358cc..9a8595a9c7 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -640,15 +640,15 @@ class DeviceGUI(object): ', '.join(sent_mails), 3000) - def sync_news(self, send_ids=None, do_auto=True): + def sync_news(self, send_ids=None, do_auto_convert=True): if self.device_connected: ids = list(dynamic.get('news_to_be_synced', set([]))) if send_ids is None else send_ids ids = [id for id in ids if self.library_view.model().db.has_id(id)] files, _auto_ids = self.library_view.model().get_preferred_formats_from_ids( ids, self.device_manager.device_class.settings().format_map, - exclude_auto=do_auto) + exclude_auto=do_auto_convert) auto = [] - if _auto_ids: + if do_auto_convert and _auto_ids: for id in _auto_ids: formats = [f.lower() for f in self.library_view.model().db.formats(id, index_is_id=True).split(',')] formats = formats if formats != None else [] From b827dea7f165ab6a6fb3fa8111f1a91e397a4cfe Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 20 May 2009 20:33:34 -0400 Subject: [PATCH 02/15] Prevent traceback when no news is scheduled and clicking the download all scheduled news button. 
--- src/calibre/gui2/dialogs/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/dialogs/scheduler.py b/src/calibre/gui2/dialogs/scheduler.py index d757ec75c8..47f5a63db0 100644 --- a/src/calibre/gui2/dialogs/scheduler.py +++ b/src/calibre/gui2/dialogs/scheduler.py @@ -133,7 +133,7 @@ class RecipeModel(QAbstractItemModel, SearchQueryParser): self._map = dict(self.category_map) def scheduled_recipes(self): - for recipe in self.category_map[_('Scheduled')]: + for recipe in self.category_map.get(_('Scheduled'), []): yield recipe def sort_categories(self, x, y): From 24ca1a113493abb26d6dfefd284bbd6d62ed9b07 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 21 May 2009 16:22:24 -0400 Subject: [PATCH 03/15] Refactoring. --- setup.py | 2 +- src/calibre/ebooks/compression/__init__.py | 5 + .../ebooks/{mobi => compression}/palmdoc.c | 0 .../ebooks/{mobi => compression}/palmdoc.py | 0 src/calibre/ebooks/fb2/fb2ml.py | 13 +- src/calibre/ebooks/mobi/reader.py | 142 +++++++++--------- src/calibre/ebooks/mobi/writer.py | 47 +++--- src/calibre/ebooks/pdb/__init__.py | 81 +++++----- src/calibre/ebooks/pdb/ereader/reader.py | 19 ++- src/calibre/ebooks/pdb/ereader/writer.py | 6 +- src/calibre/ebooks/pdb/header.py | 11 +- src/calibre/ebooks/pdb/palmdoc/reader.py | 22 +-- src/calibre/ebooks/pdb/palmdoc/writer.py | 33 ++-- src/calibre/ebooks/pml/pmlml.py | 34 ++++- 14 files changed, 230 insertions(+), 185 deletions(-) create mode 100644 src/calibre/ebooks/compression/__init__.py rename src/calibre/ebooks/{mobi => compression}/palmdoc.c (100%) rename src/calibre/ebooks/{mobi => compression}/palmdoc.py (100%) diff --git a/setup.py b/setup.py index 4f11395fda..407b852a57 100644 --- a/setup.py +++ b/setup.py @@ -89,7 +89,7 @@ if __name__ == '__main__': include_dirs=['src/calibre/utils/msdes']), Extension('calibre.plugins.cPalmdoc', - sources=['src/calibre/ebooks/mobi/palmdoc.c']), + sources=['src/calibre/ebooks/compression/palmdoc.c']), 
PyQtExtension('calibre.plugins.pictureflow', ['src/calibre/gui2/pictureflow/pictureflow.cpp', diff --git a/src/calibre/ebooks/compression/__init__.py b/src/calibre/ebooks/compression/__init__.py new file mode 100644 index 0000000000..9e2aad729c --- /dev/null +++ b/src/calibre/ebooks/compression/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' diff --git a/src/calibre/ebooks/mobi/palmdoc.c b/src/calibre/ebooks/compression/palmdoc.c similarity index 100% rename from src/calibre/ebooks/mobi/palmdoc.c rename to src/calibre/ebooks/compression/palmdoc.c diff --git a/src/calibre/ebooks/mobi/palmdoc.py b/src/calibre/ebooks/compression/palmdoc.py similarity index 100% rename from src/calibre/ebooks/mobi/palmdoc.py rename to src/calibre/ebooks/compression/palmdoc.py diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 96c9e43676..b6893e395d 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -25,15 +25,9 @@ TAG_MAP = { 'div' : 'p', } -STYLE_MAP = { - 'bold' : 'strong', - 'bolder' : 'strong', - 'italic' : 'emphasis', -} - STYLES = [ - 'font-weight', - 'font-style', + ('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}), + ('font-style', {'italic' : 'emphasis'}), ] class FB2MLizer(object): @@ -107,8 +101,9 @@ class FB2MLizer(object): fb2_text += '<%s>' % fb2_tag tag_stack.append(fb2_tag) + # Processes style information for s in STYLES: - style_tag = STYLE_MAP.get(style[s], None) + style_tag = s[1].get(style[s[0]], None) if style_tag: tag_count += 1 fb2_text += '<%s>' % style_tag diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index d6390f2643..edad5fe1f9 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -1,11 +1,17 @@ -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' ''' Read 
data from .mobi files ''' -import struct, os, cStringIO, re, functools, datetime, textwrap +import datetime +import functools +import os +import re +import struct +import textwrap + +import cStringIO try: from PIL import Image as PILImage @@ -21,8 +27,8 @@ from calibre.ebooks import DRMError from calibre.ebooks.chardet import ENCODING_PATS from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi.huffcdic import HuffReader -from calibre.ebooks.mobi.palmdoc import decompress_doc from calibre.ebooks.mobi.langcodes import main_language, sub_language +from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf2 import OPFCreator, OPF from calibre.ebooks.metadata.toc import TOC @@ -40,8 +46,8 @@ class EXTHHeader(object): while left > 0: left -= 1 - id, size = struct.unpack('>LL', raw[pos:pos+8]) - content = raw[pos+8:pos+size] + id, size = struct.unpack('>LL', raw[pos:pos + 8]) + content = raw[pos + 8:pos + size] pos += size if id >= 100 and id < 200: self.process_metadata(id, content, codec) @@ -87,7 +93,7 @@ class EXTHHeader(object): elif id == 106: try: self.mi.publish_date = datetime.datetime.strptime( - content, '%Y-%m-%d',).date() + content, '%Y-%m-%d', ).date() except: pass elif id == 108: @@ -123,13 +129,13 @@ class BookHeader(object): try: self.codec = { - 1252 : 'cp1252', - 65001 : 'utf-8', - }[self.codepage] + 1252: 'cp1252', + 65001: 'utf-8', + }[self.codepage] except (IndexError, KeyError): self.codec = 'cp1252' if user_encoding is None else user_encoding - log.warn('Unknown codepage %d. Assuming %s'%(self.codepage, - self.codec)) + log.warn('Unknown codepage %d. 
Assuming %s' % (self.codepage, + self.codec)) if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length: self.extra_flags = 0 else: @@ -147,14 +153,14 @@ class BookHeader(object): self.language = main_language.get(langid, 'ENGLISH') self.sublanguage = sub_language.get(sublangid, 'NEUTRAL') self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0] - self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c+4])[0] + self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0] self.exth_flag, = struct.unpack('>L', raw[0x80:0x84]) self.exth = None if not isinstance(self.title, unicode): self.title = self.title.decode(self.codec, 'replace') if self.exth_flag & 0x40: - self.exth = EXTHHeader(raw[16+self.length:], self.codec, self.title) + self.exth = EXTHHeader(raw[16 + self.length:], self.codec, self.title) self.exth.mi.uid = self.unique_id self.exth.mi.language = self.language @@ -182,7 +188,7 @@ class MetadataHeader(BookHeader): return struct.unpack('>H', self.stream.read(2))[0] def section_offset(self, number): - self.stream.seek(78+number*8) + self.stream.seek(78 + number * 8) return struct.unpack('>LBBBB', self.stream.read(8))[0] def header(self): @@ -242,15 +248,15 @@ class MobiReader(object): self.name = self.header[:32].replace('\x00', '') self.num_sections, = struct.unpack('>H', raw[76:78]) - self.ident = self.header[0x3C:0x3C+8].upper() + self.ident = self.header[0x3C:0x3C + 8].upper() if self.ident not in ['BOOKMOBI', 'TEXTREAD']: - raise MobiError('Unknown book type: %s'%self.ident) + raise MobiError('Unknown book type: %s' % self.ident) self.sections = [] self.section_headers = [] for i in range(self.num_sections): - offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78+i*8:78+i*8+8]) - flags, val = a1, a2<<16 | a3<<8 | a4 + offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8]) + flags, val = a1, a2 << 16 | a3 << 8 | a4 self.section_headers.append((offset, flags, val)) def section(section_number): @@ -266,7 
+272,7 @@ class MobiReader(object): self.book_header = BookHeader(self.sections[0][0], self.ident, - user_encoding, self.log) + user_encoding, self.log) self.name = self.name.decode(self.book_header.codec, 'replace') def extract_content(self, output_dir, parse_cache): @@ -279,13 +285,13 @@ class MobiReader(object): parse_cache['calibre_raw_mobi_markup'] = self.mobi_html self.add_anchors() self.processed_html = self.processed_html.decode(self.book_header.codec, - 'ignore') + 'ignore') for pat in ENCODING_PATS: self.processed_html = pat.sub('', self.processed_html) e2u = functools.partial(entity_to_unicode, - exceptions=['lt', 'gt', 'amp', 'apos', 'quot']) + exceptions=['lt', 'gt', 'amp', 'apos', 'quot']) self.processed_html = re.sub(r'&(\S+?);', e2u, - self.processed_html) + self.processed_html) self.extract_images(processed_records, output_dir) self.replace_page_breaks() self.cleanup_html() @@ -295,7 +301,7 @@ class MobiReader(object): if root.xpath('descendant::p/descendant::p'): from lxml.html import soupparser self.log.warning('Markup contains unclosed

tags, parsing using', - 'BeatifulSoup') + 'BeatifulSoup') root = soupparser.fromstring(self.processed_html) if root.tag != 'html': self.log.warn('File does not have opening tag') @@ -346,45 +352,45 @@ class MobiReader(object): fname = self.name.encode('ascii', 'replace') fname = re.sub(r'[\x08\x15\0]+', '', fname) htmlfile = os.path.join(output_dir, - sanitize_file_name(fname)+'.html') + sanitize_file_name(fname) + '.html') try: for ref in guide.xpath('descendant::reference'): if ref.attrib.has_key('href'): - ref.attrib['href'] = os.path.basename(htmlfile)+ref.attrib['href'] + ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href'] except AttributeError: pass parse_cache[htmlfile] = root self.htmlfile = htmlfile ncx = cStringIO.StringIO() opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) - self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf' + self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf' opf.render(open(self.created_opf_path, 'wb'), ncx, - ncx_manifest_entry=ncx_manifest_entry) + ncx_manifest_entry=ncx_manifest_entry) ncx = ncx.getvalue() if ncx: ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx') open(ncx_path, 'wb').write(ncx) with open('styles.css', 'wb') as s: - s.write(self.base_css_rules+'\n\n') + s.write(self.base_css_rules + '\n\n') for cls, rule in self.tag_css_rules.items(): if isinstance(rule, unicode): rule = rule.encode('utf-8') - s.write('.%s { %s }\n\n'%(cls, rule)) + s.write('.%s { %s }\n\n' % (cls, rule)) if self.book_header.exth is not None or self.embedded_mi is not None: self.log.debug('Creating OPF...') ncx = cStringIO.StringIO() opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) - opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx, - ncx_manifest_entry ) + opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx, + ncx_manifest_entry) ncx = ncx.getvalue() if ncx: - open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) + 
open(os.path.splitext(htmlfile)[0] + '.ncx', 'wb').write(ncx) def read_embedded_metadata(self, root, elem, guide): - raw = ''+html.tostring(elem, encoding='utf-8')+'' + raw = '' + html.tostring(elem, encoding='utf-8') + '' stream = cStringIO.StringIO(raw) opf = OPF(stream) self.embedded_mi = MetaInformation(opf) @@ -394,7 +400,7 @@ class MobiReader(object): href = ref.get('href', '') if href.startswith('#'): href = href[1:] - anchors = root.xpath('//*[@id="%s"]'%href) + anchors = root.xpath('//*[@id="%s"]' % href) if anchors: cpos = anchors[0] reached = False @@ -412,27 +418,27 @@ class MobiReader(object): self.log.debug('Cleaning up HTML...') self.processed_html = re.sub(r'

', '', self.processed_html) if self.book_header.ancient and '')+'' + self.processed_html = '

' + self.processed_html.replace('\n\n', '

') + '' self.processed_html = self.processed_html.replace('\r\n', '\n') self.processed_html = self.processed_html.replace('> <', '>\n<') def upshift_markup(self, root): self.log.debug('Converting style information to CSS...') size_map = { - 'xx-small' : '0.5', - 'x-small' : '1', - 'small' : '2', - 'medium' : '3', - 'large' : '4', - 'x-large' : '5', - 'xx-large' : '6', - } + 'xx-small': '0.5', + 'x-small': '1', + 'small': '2', + 'medium': '3', + 'large': '4', + 'x-large': '5', + 'xx-large': '6', + } mobi_version = self.book_header.mobi_version for i, tag in enumerate(root.iter(etree.Element)): tag.attrib.pop('xmlns', '') if tag.tag in ('country-region', 'place', 'placetype', 'placename', - 'state', 'city', 'street', 'address', 'content'): - tag.tag = 'div' if tag.tag == 'content' else 'span' + 'state', 'city', 'street', 'address', 'content'): + tag.tag = 'div' if tag.tag == 'content' else 'span' for key in tag.attrib.keys(): tag.attrib.pop(key) continue @@ -450,7 +456,7 @@ class MobiReader(object): if width: styles.append('text-indent: %s' % width) if width.startswith('-'): - styles.append('margin-left: %s'%(width[1:])) + styles.append('margin-left: %s' % (width[1:])) if attrib.has_key('align'): align = attrib.pop('align').strip() if align: @@ -502,7 +508,7 @@ class MobiReader(object): cls = sel break if cls is None: - ncls = 'calibre_%d'%i + ncls = 'calibre_%d' % i self.tag_css_rules[ncls] = rule cls = attrib.get('class', '') cls = cls + (' ' if cls else '') + ncls @@ -514,17 +520,17 @@ class MobiReader(object): mi = MetaInformation(self.book_header.title, [_('Unknown')]) opf = OPFCreator(os.path.dirname(htmlfile), mi) if hasattr(self.book_header.exth, 'cover_offset'): - opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1) + opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1) elif mi.cover is not None: opf.cover = mi.cover else: - opf.cover = 'images/%05d.jpg'%1 + opf.cover = 'images/%05d.jpg' % 1 if not 
os.path.exists(os.path.join(os.path.dirname(htmlfile), - *opf.cover.split('/'))): - opf.cover = None + * opf.cover.split('/'))): + opf.cover = None manifest = [(htmlfile, 'text/x-oeb1-document'), - (os.path.abspath('styles.css'), 'text/css')] + (os.path.abspath('styles.css'), 'text/css')] bp = os.path.dirname(htmlfile) for i in getattr(self, 'image_names', []): manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg')) @@ -541,7 +547,7 @@ class MobiReader(object): ncx_manifest_entry = None if toc: ncx_manifest_entry = 'toc.ncx' - elems = root.xpath('//*[@id="%s"]'%toc.partition('#')[-1]) + elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1]) tocobj = None ent_pat = re.compile(r'&(\S+?);') if elems: @@ -556,12 +562,12 @@ class MobiReader(object): if href and re.match('\w+://', href) is None: try: text = u' '.join([t.strip() for t in \ - x.xpath('descendant::text()')]) + x.xpath('descendant::text()')]) except: text = '' text = ent_pat.sub(entity_to_unicode, text) tocobj.add_item(toc.partition('#')[0], href[1:], - text) + text) if reached and x.get('class', None) == 'mbp_pagebreak': break if tocobj is not None: @@ -599,17 +605,17 @@ class MobiReader(object): def extract_text(self): self.log.debug('Extracting text...') - text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)] - processed_records = list(range(0, self.book_header.records+1)) + text_sections = [self.text_section(i) for i in range(1, self.book_header.records + 1)] + processed_records = list(range(0, self.book_header.records + 1)) self.mobi_html = '' if self.book_header.compression_type == 'DH': huffs = [self.sections[i][0] for i in - range(self.book_header.huff_offset, - self.book_header.huff_offset+self.book_header.huff_number)] + range(self.book_header.huff_offset, + self.book_header.huff_offset + self.book_header.huff_number)] processed_records += list(range(self.book_header.huff_offset, - self.book_header.huff_offset+self.book_header.huff_number)) + 
self.book_header.huff_offset + self.book_header.huff_number)) huff = HuffReader(huffs) self.mobi_html = huff.decompress(text_sections) @@ -620,7 +626,7 @@ class MobiReader(object): elif self.book_header.compression_type == '\x00\x01': self.mobi_html = ''.join(text_sections) else: - raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type)) + raise MobiError('Unknown compression algorithm: %s' % repr(self.book_header.compression_type)) if self.book_header.ancient and ']+filepos=['"]{0,1}(\d+)[^<>]*>''', - re.IGNORECASE) + re.IGNORECASE) for match in link_pattern.finditer(self.mobi_html): positions.add(int(match.group(1))) pos = 0 @@ -652,10 +658,10 @@ class MobiReader(object): if r > -1 and (r < l or l == end or l == -1): p = self.mobi_html.rfind('<', 0, end + 1) if pos < end and p > -1 and \ - not end_tag_re.match(self.mobi_html[p:r]) and \ - not self.mobi_html[p:r+1].endswith('/>'): - anchor = ' filepos-id="filepos%d"' - end = r + not end_tag_re.match(self.mobi_html[p:r]) and \ + not self.mobi_html[p:r + 1].endswith('/>'): + anchor = ' filepos-id="filepos%d"' + end = r else: end = r + 1 self.processed_html += self.mobi_html[pos:end] + (anchor % oend) @@ -673,7 +679,7 @@ class MobiReader(object): start = getattr(self.book_header, 'first_image_index', -1) if start > self.num_sections or start < 0: # BAEN PRC files have bad headers - start=0 + start = 0 for i in range(start, self.num_sections): if i in processed_records: continue @@ -687,7 +693,7 @@ class MobiReader(object): except IOError: continue - path = os.path.join(output_dir, '%05d.jpg'%image_index) + path = os.path.join(output_dir, '%05d.jpg' % image_index) self.image_names.append(os.path.basename(path)) im.save(open(path, 'wb'), format='JPEG') diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index e16deeccda..1a5a729a6f 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -1,27 +1,32 @@ ''' Write content 
to Mobipocket books. ''' -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' +from collections import defaultdict +from itertools import count +from itertools import izip +import random +import re from struct import pack import time -import random -from cStringIO import StringIO -import re -from itertools import izip, count -from collections import defaultdict from urlparse import urldefrag + from PIL import Image -from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \ - OEB_RASTER_IMAGES -from calibre.ebooks.oeb.base import namespace, prefixname -from calibre.ebooks.oeb.base import urlnormalize -from calibre.ebooks.mobi.palmdoc import compress_doc +from cStringIO import StringIO from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.ebooks.mobi.mobiml import MBP_NS +from calibre.ebooks.oeb.base import OEB_DOCS +from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES +from calibre.ebooks.oeb.base import XHTML +from calibre.ebooks.oeb.base import XHTML_NS +from calibre.ebooks.oeb.base import XML_NS +from calibre.ebooks.oeb.base import namespace +from calibre.ebooks.oeb.base import prefixname +from calibre.ebooks.oeb.base import urlnormalize +from calibre.ebooks.compression.palmdoc import compress_doc # TODO: # - Allow override CSS (?) 
@@ -174,7 +179,7 @@ class Serializer(object): item = hrefs[path] if path else None if item and item.spine_position is None: return False - path = item.href if item else base.href + path = item.href if item else base.href href = '#'.join((path, frag)) if frag else path buffer.write('filepos=') self.href_offsets[href].append(buffer.tell()) @@ -211,8 +216,8 @@ class Serializer(object): def serialize_elem(self, elem, item, nsrmap=NSRMAP): buffer = self.buffer if not isinstance(elem.tag, basestring) \ - or namespace(elem.tag) not in nsrmap: - return + or namespace(elem.tag) not in nsrmap: + return tag = prefixname(elem.tag, nsrmap) # Previous layers take care of @name id = elem.attrib.pop('id', None) @@ -221,9 +226,9 @@ class Serializer(object): offset = self.anchor_offset or buffer.tell() self.id_offsets[href] = offset if self.anchor_offset is not None and \ - tag == 'a' and not elem.attrib and \ - not len(elem) and not elem.text: - return + tag == 'a' and not elem.attrib and \ + not len(elem) and not elem.text: + return self.anchor_offset = buffer.tell() buffer.write('<') buffer.write(tag) @@ -286,8 +291,8 @@ class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') def __init__(self, compression=PALMDOC, imagemax=None, - prefer_author_sort=False): - self._compression = compression or UNCOMPRESSED + prefer_author_sort=False): + self._compression = compression or UNCOMPRESSED self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE self._prefer_author_sort = prefer_author_sort @@ -297,7 +302,7 @@ class MobiWriter(object): imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None prefer_author_sort = opts.prefer_author_sort return cls(compression=PALMDOC, imagemax=imagemax, - prefer_author_sort=prefer_author_sort) + prefer_author_sort=prefer_author_sort) def __call__(self, oeb, path): if hasattr(path, 'write'): @@ -305,7 +310,7 @@ class MobiWriter(object): with open(path, 'w+b') as stream: return self._dump_stream(oeb, stream) - def _write(self, *data): + def 
_write(self, * data): for datum in data: self._stream.write(datum) diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index d8850cfb16..54f3826470 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' @@ -7,17 +6,17 @@ __docformat__ = 'restructuredtext en' class PDBError(Exception): pass - + from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader -from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader +from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader FORMAT_READERS = { - 'PNPdPPrs' : ereader_reader, - 'PNRdPPrs' : ereader_reader, - 'zTXTGPlm' : ztxt_reader, - 'TEXtREAd' : palmdoc_reader, + 'PNPdPPrs': ereader_reader, + 'PNRdPPrs': ereader_reader, + 'zTXTGPlm': ztxt_reader, + 'TEXtREAd': palmdoc_reader, } from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer @@ -25,41 +24,41 @@ from calibre.ebooks.pdb.ztxt.writer import Writer as ztxt_writer from calibre.ebooks.pdb.ereader.writer import Writer as ereader_writer FORMAT_WRITERS = { - 'doc' : palmdoc_writer, - 'ztxt' : ztxt_writer, - 'ereader' : ereader_writer, + 'doc': palmdoc_writer, + 'ztxt': ztxt_writer, + 'ereader': ereader_writer, } IDENTITY_TO_NAME = { - 'PNPdPPrs' : 'eReader', - 'PNRdPPrs' : 'eReader', - 'zTXTGPlm' : 'zTXT', - 'TEXtREAd' : 'PalmDOC', - - '.pdfADBE' : 'Adobe Reader', - 'BVokBDIC' : 'BDicty', - 'DB99DBOS' : 'DB (Database program)', - 'vIMGView' : 'FireViewer (ImageViewer)', - 'PmDBPmDB' : 'HanDBase', - 'InfoINDB' : 'InfoView', - 'ToGoToGo' : 'iSilo', - 'SDocSilX' : 'iSilo 3', - 'JbDbJBas' : 'JFile', - 'JfDbJFil' : 'JFile Pro', - 'DATALSdb' : 'LIST', - 'Mdb1Mdb1' : 'MobileDB', - 'BOOKMOBI' : 'MobiPocket', - 'DataPlkr' : 'Plucker', - 'DataSprd' : 'QuickSheet', - 
'SM01SMem' : 'SuperMemo', - 'TEXtTlDc' : 'TealDoc', - 'InfoTlIf' : 'TealInfo', - 'DataTlMl' : 'TealMeal', - 'DataTlPt' : 'TealPaint', - 'dataTDBP' : 'ThinkDB', - 'TdatTide' : 'Tides', - 'ToRaTRPW' : 'TomeRaider', - 'BDOCWrdS' : 'WordSmith', + 'PNPdPPrs': 'eReader', + 'PNRdPPrs': 'eReader', + 'zTXTGPlm': 'zTXT', + 'TEXtREAd': 'PalmDOC', + + '.pdfADBE': 'Adobe Reader', + 'BVokBDIC': 'BDicty', + 'DB99DBOS': 'DB (Database program)', + 'vIMGView': 'FireViewer (ImageViewer)', + 'PmDBPmDB': 'HanDBase', + 'InfoINDB': 'InfoView', + 'ToGoToGo': 'iSilo', + 'SDocSilX': 'iSilo 3', + 'JbDbJBas': 'JFile', + 'JfDbJFil': 'JFile Pro', + 'DATALSdb': 'LIST', + 'Mdb1Mdb1': 'MobileDB', + 'BOOKMOBI': 'MobiPocket', + 'DataPlkr': 'Plucker', + 'DataSprd': 'QuickSheet', + 'SM01SMem': 'SuperMemo', + 'TEXtTlDc': 'TealDoc', + 'InfoTlIf': 'TealInfo', + 'DataTlMl': 'TealMeal', + 'DataTlPt': 'TealPaint', + 'dataTDBP': 'ThinkDB', + 'TdatTide': 'Tides', + 'ToRaTRPW': 'TomeRaider', + 'BDOCWrdS': 'WordSmith', } def get_reader(identity): @@ -67,10 +66,10 @@ def get_reader(identity): Returns None if no reader is found for the identity. ''' return FORMAT_READERS.get(identity, None) - + def get_writer(extension): ''' Returns None if no writer is found for extension. 
''' return FORMAT_WRITERS.get(extension, None) - + diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 13429c5a98..7d29ef243c 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -8,16 +8,19 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, re, struct, zlib +import os +import re +import struct +import zlib from calibre import CurrentDir from calibre.ebooks import DRMError -from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.pdb.ereader import EreaderError -from calibre.ebooks.pml.pmlconverter import pml_to_html, \ - footnote_sidebar_to_html -from calibre.ebooks.mobi.palmdoc import decompress_doc from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.compression.palmdoc import decompress_doc +from calibre.ebooks.pdb.ereader import EreaderError +from calibre.ebooks.pdb.formatreader import FormatReader +from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html +from calibre.ebooks.pml.pmlconverter import pml_to_html class HeaderRecord(object): ''' @@ -32,7 +35,7 @@ class HeaderRecord(object): self.non_text_offset, = struct.unpack('>H', raw[12:14]) self.has_metadata, = struct.unpack('>H', raw[24:26]) self.footnote_rec, = struct.unpack('>H', raw[28:30]) - self.sidebar_rec, = struct.unpack('>H', raw[30:32]) + self.sidebar_rec, = struct.unpack('>H', raw[30:32]) self.bookmark_offset, = struct.unpack('>H', raw[32:34]) self.image_data_offset, = struct.unpack('>H', raw[40:42]) self.metadata_offset, = struct.unpack('>H', raw[44:46]) @@ -79,7 +82,7 @@ class Reader(FormatReader): if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: return 'empty', '' data = self.section_data(number) - name = data[4:4+32].strip('\x00') + name = data[4:4 + 32].strip('\x00') img = data[62:] 
return name, img diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 875aae764a..c8567c93b6 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -8,9 +8,11 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import struct, zlib +import struct +import zlib -import Image, cStringIO +import Image +import cStringIO from calibre.ebooks.pdb.formatwriter import FormatWriter from calibre.ebooks.oeb.base import OEB_IMAGES diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index 48c39fc0ad..0d626b98f6 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement ''' Read the header data from a pdb file. ''' @@ -8,7 +7,9 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import re, struct, time +import re +import struct +import time class PdbHeaderReader(object): @@ -35,16 +36,16 @@ class PdbHeaderReader(object): if number not in range(0, self.num_sections): raise ValueError('Not a valid section number %i' % number) - self.stream.seek(78+number*8) + self.stream.seek(78 + number * 8) offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', self.stream.read(8))[0] - flags, val = a1, a2<<16 | a3<<8 | a4 + flags, val = a1, a2 << 16 | a3 << 8 | a4 return (offset, flags, val) def section_offset(self, number): if number not in range(0, self.num_sections): raise ValueError('Not a valid section number %i' % number) - self.stream.seek(78+number*8) + self.stream.seek(78 + number * 8) return struct.unpack('>LBBBB', self.stream.read(8))[0] def section_data(self, number): diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index ba35a2317e..915ed7d739 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ 
b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -8,11 +8,13 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, struct, zlib +import os +import struct +from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.mobi.palmdoc import decompress_doc -from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer +from calibre.ebooks.txt.processor import opf_writer +from calibre.ebooks.txt.processor import txt_to_markdown class HeaderRecord(object): ''' @@ -25,15 +27,15 @@ class HeaderRecord(object): def __init__(self, raw): self.compression, = struct.unpack('>H', raw[0:2]) self.num_records, = struct.unpack('>H', raw[8:10]) - - + + class Reader(FormatReader): - + def __init__(self, header, stream, log, encoding=None): self.stream = stream self.log = log self.encoding = encoding - + self.sections = [] for i in range(header.num_sections): self.sections.append(header.section_data(i)) @@ -52,7 +54,7 @@ class Reader(FormatReader): def extract_content(self, output_dir): txt = '' - + self.log.info('Decompressing text...') for i in range(1, self.header_record.num_records + 1): self.log.debug('\tDecompressing text section %i' % i) @@ -62,12 +64,12 @@ class Reader(FormatReader): html = txt_to_markdown(txt) with open(os.path.join(output_dir, 'index.html'), 'wb') as index: index.write(html.encode('utf-8')) - + from calibre.ebooks.metadata.meta import get_metadata mi = get_metadata(self.stream, 'pdb') manifest = [('index.html', None)] spine = ['index.html'] opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) - + return os.path.join(output_dir, 'metadata.opf') diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index e841e69054..6a7d54a586 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -10,10 +10,11 @@ __docformat__ = 
'restructuredtext en' import struct +from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.pdb.formatwriter import FormatWriter -from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines -from calibre.ebooks.mobi.palmdoc import compress_doc from calibre.ebooks.pdb.header import PdbHeaderBuilder +from calibre.ebooks.txt.writer import TxtNewlines +from calibre.ebooks.txt.writer import TxtWriter MAX_RECORD_SIZE = 4096 @@ -22,48 +23,48 @@ class Writer(FormatWriter): def __init__(self, opts, log): self.opts = opts self.log = log - + def write_content(self, oeb_book, out_stream, metadata=None): title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') txt_records, txt_length = self._generate_text(oeb_book.spine) header_record = self._header_record(txt_length, len(txt_records)) - + section_lengths = [len(header_record)] self.log.info('Compessing data...') for i in range(0, len(txt_records)): self.log.debug('\tCompressing record %i' % i) txt_records[i] = compress_doc(txt_records[i].encode('utf-8')) section_lengths.append(len(txt_records[i])) - + out_stream.seek(0) hb = PdbHeaderBuilder('TEXtREAd', title) hb.build_header(section_lengths, out_stream) - - for record in [header_record]+txt_records: + + for record in [header_record] + txt_records: out_stream.write(record) - + def _generate_text(self, spine): txt_writer = TxtWriter(TxtNewlines('system').newline, self.log) txt = txt_writer.dump(spine) - + txt_length = len(txt) - + txt_records = [] for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1): - txt_records.append(txt[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]) - + txt_records.append(txt[i * MAX_RECORD_SIZE: (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]) + return txt_records, txt_length - + def _header_record(self, txt_length, record_count): record = '' - + record += struct.pack('>H', 2) # [0:2], PalmDoc compression. (1 = No compression). 
record += struct.pack('>H', 0) # [2:4], Always 0. record += struct.pack('>L', txt_length) # [4:8], Uncompressed length of the entire text of the book. record += struct.pack('>H', record_count) # [8:10], Number of PDB records used for the text of the book. record += struct.pack('>H', MAX_RECORD_SIZE) # [10-12], Maximum size of each record containing text, always 4096. record += struct.pack('>L', 0) # [12-16], Current reading position, as an offset into the uncompressed text. - + return record - + diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index cdf3bf69e8..c34ada3317 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -8,7 +8,8 @@ __docformat__ = 'restructuredtext en' Transform OEB content into PML markup ''' -import os, re +import os +import re from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer @@ -40,6 +41,31 @@ STYLES = [ ('text-align', {'right' : 'r', 'center' : 'c'}), ] +BLOCK_TAGS = [ + 'p', +] + +BLOCK_STYLES = [ + 'block', +] + +LINK_TAGS = [ + 'a', +] + +SEPARATE_TAGS = [ + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'p', + 'div', + 'li', + 'tr', +] + class PMLMLizer(object): def __init__(self, ignore_tables=False): self.ignore_tables = ignore_tables @@ -104,7 +130,7 @@ class PMLMLizer(object): tag_count = 0 # Are we in a paragraph block? - if tag == 'p' or style['display'] in ('block'): + if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: if 'block' not in tag_stack: tag_count += 1 tag_stack.append('block') @@ -136,7 +162,7 @@ class PMLMLizer(object): # Special processing of tags that require an argument. 
# Anchors links - if tag == 'a' and 'q' not in tag_stack: + if tag in LINK_TAGS and 'q' not in tag_stack: href = elem.get('href') if href and '://' not in href: if '#' in href: @@ -168,7 +194,7 @@ class PMLMLizer(object): for i in range(0, tag_count): close_tag_list.insert(0, tag_stack.pop()) text += self.close_tags(close_tag_list) - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li', 'tr'): + if tag in SEPARATE_TAGS: text += os.linesep + os.linesep if 'block' not in tag_stack: From e4ee664bb353190b283296dc9c71d1bf5615da42 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 21 May 2009 17:22:32 -0400 Subject: [PATCH 04/15] eReader input support Makebook produced books (202 byte header documents). --- src/calibre/ebooks/metadata/ereader.py | 38 ++-- src/calibre/ebooks/pdb/ereader/reader.py | 185 ++----------------- src/calibre/ebooks/pdb/ereader/reader132.py | 192 ++++++++++++++++++++ src/calibre/ebooks/pdb/ereader/reader202.py | 155 ++++++++++++++++ 4 files changed, 382 insertions(+), 188 deletions(-) create mode 100644 src/calibre/ebooks/pdb/ereader/reader132.py create mode 100644 src/calibre/ebooks/pdb/ereader/reader202.py diff --git a/src/calibre/ebooks/metadata/ereader.py b/src/calibre/ebooks/metadata/ereader.py index b1edee10b0..6e6624ce2a 100644 --- a/src/calibre/ebooks/metadata/ereader.py +++ b/src/calibre/ebooks/metadata/ereader.py @@ -8,11 +8,13 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import re +import struct -from calibre.ebooks.metadata import MetaInformation, authors_to_string -from calibre.ebooks.pdb.header import PdbHeaderReader, PdbHeaderBuilder -from calibre.ebooks.pdb.ereader.reader import HeaderRecord +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.metadata import authors_to_string +from calibre.ebooks.pdb.ereader.reader132 import HeaderRecord +from calibre.ebooks.pdb.header import PdbHeaderBuilder +from calibre.ebooks.pdb.header import 
PdbHeaderReader def get_metadata(stream, extract_cover=True): """ @@ -20,14 +22,14 @@ def get_metadata(stream, extract_cover=True): """ mi = MetaInformation(None, [_('Unknown')]) stream.seek(0) - + pheader = PdbHeaderReader(stream) hr = HeaderRecord(pheader.section_data(0)) - + if hr.version in (2, 10) and hr.has_metadata == 1: try: mdata = pheader.section_data(hr.metadata_offset) - + mdata = mdata.split('\x00') mi.title = mdata[0] mi.authors = [mdata[1]] @@ -35,7 +37,7 @@ def get_metadata(stream, extract_cover=True): mi.isbn = mdata[4] except: pass - + if not mi.title: mi.title = pheader.title if pheader.title else _('Unknown') @@ -43,26 +45,31 @@ def get_metadata(stream, extract_cover=True): def set_metadata(stream, mi): pheader = PdbHeaderReader(stream) + + # Only Dropbook produced 132 byte record0 files are supported + if pheader.section_data(0) != 132: + return + sections = [pheader.section_data(x) for x in range(0, pheader.section_count())] hr = HeaderRecord(sections[0]) - + if hr.version not in (2, 10): return - + # Create a metadata record for the file if one does not alreay exist if not hr.has_metadata: sections += ['', 'MeTaInFo\x00'] last_data = len(sections) - 1 - + for i in range(0, 132, 2): - val, = struct.unpack('>H', sections[0][i:i+2]) + val, = struct.unpack('>H', sections[0][i:i + 2]) if val >= hr.last_data_offset: - sections[0][i:i+2] = struct.pack('>H', last_data) - + sections[0][i:i + 2] = struct.pack('>H', last_data) + sections[0][24:26] = struct.pack('>H', 1) # Set has metadata sections[0][44:46] = struct.pack('>H', last_data - 1) # Set location of metadata sections[0][52:54] = struct.pack('>H', last_data) # Ensure last data offset is updated - + # Merge the metadata into the file file_mi = get_metadata(stream, False) file_mi.smart_update(mi) @@ -79,4 +86,3 @@ def set_metadata(stream, mi): # Write the data back to the file for item in sections: stream.write(item) - diff --git a/src/calibre/ebooks/pdb/ereader/reader.py 
b/src/calibre/ebooks/pdb/ereader/reader.py index 7d29ef243c..3afb13f035 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -8,186 +8,27 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os -import re -import struct -import zlib - -from calibre import CurrentDir -from calibre.ebooks import DRMError -from calibre.ebooks.metadata.opf2 import OPFCreator -from calibre.ebooks.compression.palmdoc import decompress_doc -from calibre.ebooks.pdb.ereader import EreaderError from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html -from calibre.ebooks.pml.pmlconverter import pml_to_html - -class HeaderRecord(object): - ''' - The first record in the file is always the header record. It holds - information related to the location of text, images, and so on - in the file. This is used in conjunction with the sections - defined in the file header. 
- ''' - - def __init__(self, raw): - self.version, = struct.unpack('>H', raw[0:2]) - self.non_text_offset, = struct.unpack('>H', raw[12:14]) - self.has_metadata, = struct.unpack('>H', raw[24:26]) - self.footnote_rec, = struct.unpack('>H', raw[28:30]) - self.sidebar_rec, = struct.unpack('>H', raw[30:32]) - self.bookmark_offset, = struct.unpack('>H', raw[32:34]) - self.image_data_offset, = struct.unpack('>H', raw[40:42]) - self.metadata_offset, = struct.unpack('>H', raw[44:46]) - self.footnote_offset, = struct.unpack('>H', raw[48:50]) - self.sidebar_offset, = struct.unpack('>H', raw[50:52]) - self.last_data_offset, = struct.unpack('>H', raw[52:54]) - - self.num_text_pages = self.non_text_offset - 1 - self.num_image_pages = self.metadata_offset - self.image_data_offset - +from calibre.ebooks.pdb.ereader.reader132 import Reader132 +from calibre.ebooks.pdb.ereader.reader202 import Reader202 class Reader(FormatReader): def __init__(self, header, stream, log, encoding=None): - self.log = log - self.encoding = encoding + record0_size = len(header.section_data(0)) - self.sections = [] - for i in range(header.num_sections): - self.sections.append(header.section_data(i)) - - self.header_record = HeaderRecord(self.section_data(0)) - - if self.header_record.version not in (2, 10): - if self.header_record.version in (260, 272): - raise DRMError('eReader DRM is not supported.') - else: - raise EreaderError('Unknown book version %i.' 
% self.header_record.version) - - from calibre.ebooks.metadata.pdb import get_metadata - self.mi = get_metadata(stream, False) - - def section_data(self, number): - return self.sections[number] - - def decompress_text(self, number): - if self.header_record.version == 2: - return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) - if self.header_record.version == 10: - return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) - - - def get_image(self, number): - if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: - return 'empty', '' - data = self.section_data(number) - name = data[4:4 + 32].strip('\x00') - img = data[62:] - return name, img - - def get_text_page(self, number): - ''' - Only palmdoc and zlib compressed are supported. The text is - assumed to be encoded as Windows-1252. The encoding is part of - the eReader file spec and should always be this encoding. - ''' - if number not in range(1, self.header_record.num_text_pages + 1): - return '' - - return self.decompress_text(number) + if record0_size == 132: + self.reader = Reader132(header, stream, log, encoding) + elif record0_size == 202: + self.reader = Reader202(header, stream, log, encoding) + else: + raise ValueError('Unknown eReader Header') def extract_content(self, output_dir): - output_dir = os.path.abspath(output_dir) - - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - html = u'' - - for i in range(1, self.header_record.num_text_pages + 1): - self.log.debug('Extracting text page %i' % i) - html += pml_to_html(self.get_text_page(i)) - - if self.header_record.footnote_rec > 0: - html += '

%s

' % _('Footnotes') - footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) - for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)): - self.log.debug('Extracting footnote page %i' % i) - html += '
' - html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) - html += '
' - - if self.header_record.sidebar_rec > 0: - html += '

%s

' % _('Sidebar') - sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) - for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)): - self.log.debug('Extracting sidebar page %i' % i) - html += '
' - html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i)) - html += '
' - - html += '' - - with CurrentDir(output_dir): - with open('index.html', 'wb') as index: - self.log.debug('Writing text to index.html') - index.write(html.encode('utf-8')) - - if not os.path.exists(os.path.join(output_dir, 'images/')): - os.makedirs(os.path.join(output_dir, 'images/')) - images = [] - with CurrentDir(os.path.join(output_dir, 'images/')): - for i in range(0, self.header_record.num_image_pages): - name, img = self.get_image(self.header_record.image_data_offset + i) - images.append(name) - with open(name, 'wb') as imgf: - self.log.debug('Writing image %s to images/' % name) - imgf.write(img) - - opf_path = self.create_opf(output_dir, images) - - return opf_path - - def create_opf(self, output_dir, images): - with CurrentDir(output_dir): - opf = OPFCreator(output_dir, self.mi) - - manifest = [('index.html', None)] - - for i in images: - manifest.append((os.path.join('images/', i), None)) - - opf.create_manifest(manifest) - opf.create_spine(['index.html']) - with open('metadata.opf', 'wb') as opffile: - opf.render(opffile) - - return os.path.join(output_dir, 'metadata.opf') + return self.reader.extract_content(output_dir) def dump_pml(self): - ''' - This is primarily used for debugging and 3rd party tools to - get the plm markup that comprises the text in the file. - ''' - pml = '' - - for i in range(1, self.header_record.num_text_pages + 1): - pml += self.get_text_page(i) - - return pml - - def dump_images(self, output_dir): - ''' - This is primarily used for debugging and 3rd party tools to - get the images in the file. 
- ''' - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - with CurrentDir(output_dir): - for i in range(0, self.header_record.num_image_pages): - name, img = self.get_image(self.header_record.image_data_offset + i) - with open(name, 'wb') as imgf: - imgf.write(img) + return self.reader.dump_pml() + def dump_images(self): + return self.reader.dump_images() diff --git a/src/calibre/ebooks/pdb/ereader/reader132.py b/src/calibre/ebooks/pdb/ereader/reader132.py new file mode 100644 index 0000000000..91edfaf48b --- /dev/null +++ b/src/calibre/ebooks/pdb/ereader/reader132.py @@ -0,0 +1,192 @@ +# -*- coding: utf-8 -*- + +''' +Read content from ereader pdb file with a 132 byte header created by Dropbook. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os +import re +import struct +import zlib + +from calibre import CurrentDir +from calibre.ebooks import DRMError +from calibre.ebooks.compression.palmdoc import decompress_doc +from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.pdb.ereader import EreaderError +from calibre.ebooks.pdb.formatreader import FormatReader +from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html +from calibre.ebooks.pml.pmlconverter import pml_to_html + +class HeaderRecord(object): + ''' + The first record in the file is always the header record. It holds + information related to the location of text, images, and so on + in the file. This is used in conjunction with the sections + defined in the file header. 
+ ''' + + def __init__(self, raw): + self.version, = struct.unpack('>H', raw[0:2]) + self.non_text_offset, = struct.unpack('>H', raw[12:14]) + self.has_metadata, = struct.unpack('>H', raw[24:26]) + self.footnote_rec, = struct.unpack('>H', raw[28:30]) + self.sidebar_rec, = struct.unpack('>H', raw[30:32]) + self.bookmark_offset, = struct.unpack('>H', raw[32:34]) + self.image_data_offset, = struct.unpack('>H', raw[40:42]) + self.metadata_offset, = struct.unpack('>H', raw[44:46]) + self.footnote_offset, = struct.unpack('>H', raw[48:50]) + self.sidebar_offset, = struct.unpack('>H', raw[50:52]) + self.last_data_offset, = struct.unpack('>H', raw[52:54]) + + self.num_text_pages = self.non_text_offset - 1 + self.num_image_pages = self.metadata_offset - self.image_data_offset + + +class Reader132(FormatReader): + + def __init__(self, header, stream, log, encoding=None): + self.log = log + self.encoding = encoding + + self.sections = [] + for i in range(header.num_sections): + self.sections.append(header.section_data(i)) + + self.header_record = HeaderRecord(self.section_data(0)) + + if self.header_record.version not in (2, 10): + if self.header_record.version in (260, 272): + raise DRMError('eReader DRM is not supported.') + else: + raise EreaderError('Unknown book version %i.' 
% self.header_record.version) + + from calibre.ebooks.metadata.pdb import get_metadata + self.mi = get_metadata(stream, False) + + def section_data(self, number): + return self.sections[number] + + def decompress_text(self, number): + if self.header_record.version == 2: + return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) + if self.header_record.version == 10: + return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) + + def get_image(self, number): + if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: + return 'empty', '' + data = self.section_data(number) + name = data[4:4 + 32].strip('\x00') + img = data[62:] + return name, img + + def get_text_page(self, number): + ''' + Only palmdoc and zlib compressed are supported. The text is + assumed to be encoded as Windows-1252. The encoding is part of + the eReader file spec and should always be this encoding. + ''' + if number not in range(1, self.header_record.num_text_pages + 1): + return '' + + return self.decompress_text(number) + + def extract_content(self, output_dir): + output_dir = os.path.abspath(output_dir) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + html = u'' + + for i in range(1, self.header_record.num_text_pages + 1): + self.log.debug('Extracting text page %i' % i) + html += pml_to_html(self.get_text_page(i)) + + if self.header_record.footnote_rec > 0: + html += '

%s

' % _('Footnotes') + footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) + for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)): + self.log.debug('Extracting footnote page %i' % i) + html += '
' + html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) + html += '
' + + if self.header_record.sidebar_rec > 0: + html += '

%s

' % _('Sidebar') + sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) + for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)): + self.log.debug('Extracting sidebar page %i' % i) + html += '
' + html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i)) + html += '
' + + html += '' + + with CurrentDir(output_dir): + with open('index.html', 'wb') as index: + self.log.debug('Writing text to index.html') + index.write(html.encode('utf-8')) + + if not os.path.exists(os.path.join(output_dir, 'images/')): + os.makedirs(os.path.join(output_dir, 'images/')) + images = [] + with CurrentDir(os.path.join(output_dir, 'images/')): + for i in range(0, self.header_record.num_image_pages): + name, img = self.get_image(self.header_record.image_data_offset + i) + images.append(name) + with open(name, 'wb') as imgf: + self.log.debug('Writing image %s to images/' % name) + imgf.write(img) + + opf_path = self.create_opf(output_dir, images) + + return opf_path + + def create_opf(self, output_dir, images): + with CurrentDir(output_dir): + opf = OPFCreator(output_dir, self.mi) + + manifest = [('index.html', None)] + + for i in images: + manifest.append((os.path.join('images/', i), None)) + + opf.create_manifest(manifest) + opf.create_spine(['index.html']) + with open('metadata.opf', 'wb') as opffile: + opf.render(opffile) + + return os.path.join(output_dir, 'metadata.opf') + + def dump_pml(self): + ''' + This is primarily used for debugging and 3rd party tools to + get the plm markup that comprises the text in the file. + ''' + pml = '' + + for i in range(1, self.header_record.num_text_pages + 1): + pml += self.get_text_page(i) + + return pml + + def dump_images(self, output_dir): + ''' + This is primarily used for debugging and 3rd party tools to + get the images in the file. 
+ ''' + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with CurrentDir(output_dir): + for i in range(0, self.header_record.num_image_pages): + name, img = self.get_image(self.header_record.image_data_offset + i) + with open(name, 'wb') as imgf: + imgf.write(img) + diff --git a/src/calibre/ebooks/pdb/ereader/reader202.py b/src/calibre/ebooks/pdb/ereader/reader202.py new file mode 100644 index 0000000000..120cb5f1d2 --- /dev/null +++ b/src/calibre/ebooks/pdb/ereader/reader202.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +''' +Read content from ereader pdb file with a 202 byte header created by Makebook. +''' +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os +import struct + +from calibre import CurrentDir +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.pml.pmlconverter import pml_to_html +from calibre.ebooks.compression.palmdoc import decompress_doc +from calibre.ebooks.pdb.formatreader import FormatReader +from calibre.ebooks.pdb.ereader import EreaderError + +class HeaderRecord(object): + ''' + The first record in the file is always the header record. It holds + information related to the location of text, images, and so on + in the file. This is used in conjunction with the sections + defined in the file header. + ''' + + def __init__(self, raw): + self.version, = struct.unpack('>H', raw[0:2]) + self.non_text_offset, = struct.unpack('>H', raw[8:10]) + + self.num_text_pages = self.non_text_offset - 1 + + +class Reader202(FormatReader): + + def __init__(self, header, stream, log, encoding=None): + self.log = log + self.encoding = encoding + + self.sections = [] + for i in range(header.num_sections): + self.sections.append(header.section_data(i)) + + self.header_record = HeaderRecord(self.section_data(0)) + + if self.header_record.version != 4: + raise EreaderError('Unknown book version %i.' 
% self.header_record.version) + + def section_data(self, number): + return self.sections[number] + + def decompress_text(self, number): + return decompress_doc(''.join([chr(ord(x) ^ 0xA5) for x in self.section_data(number)])).decode('cp1252' if self.encoding is None else self.encoding) + + def get_image(self, number): + name = None + img = None + + data = self.section_data(number) + if data.startswith('PNG'): + name = data[4:4 + 32].strip('\x00') + img = data[62:] + + return name, img + + def get_text_page(self, number): + ''' + Only palmdoc compression is supported. The text is xored with 0xA5 and + assumed to be encoded as Windows-1252. The encoding is part of + the eReader file spec and should always be this encoding. + ''' + if number not in range(1, self.header_record.num_text_pages + 1): + return '' + + return self.decompress_text(number) + + def extract_content(self, output_dir): + output_dir = os.path.abspath(output_dir) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + html = u'' + + for i in range(1, self.header_record.num_text_pages + 1): + self.log.debug('Extracting text page %i' % i) + html += pml_to_html(self.get_text_page(i)) + + + html += '' + + with CurrentDir(output_dir): + with open('index.html', 'wb') as index: + self.log.debug('Writing text to index.html') + index.write(html.encode('utf-8')) + + if not os.path.exists(os.path.join(output_dir, 'images/')): + os.makedirs(os.path.join(output_dir, 'images/')) + images = [] + with CurrentDir(os.path.join(output_dir, 'images/')): + for i in range(self.header_record.non_text_offset, len(self.sections)): + name, img = self.get_image(i) + if name: + images.append(name) + with open(name, 'wb') as imgf: + self.log.debug('Writing image %s to images/' % name) + imgf.write(img) + + opf_path = self.create_opf(output_dir, images) + + return opf_path + + def create_opf(self, output_dir, images): + with CurrentDir(output_dir): + opf = OPFCreator(output_dir, MetaInformation(_('Unknown'), 
_('Unknown'))) + + manifest = [('index.html', None)] + + for i in images: + manifest.append((os.path.join('images/', i), None)) + + opf.create_manifest(manifest) + opf.create_spine(['index.html']) + with open('metadata.opf', 'wb') as opffile: + opf.render(opffile) + + return os.path.join(output_dir, 'metadata.opf') + + def dump_pml(self): + ''' + This is primarily used for debugging and 3rd party tools to + get the plm markup that comprises the text in the file. + ''' + pml = '' + + for i in range(1, self.header_record.num_text_pages + 1): + pml += self.get_text_page(i) + + return pml + + def dump_images(self, output_dir): + ''' + This is primarily used for debugging and 3rd party tools to + get the images in the file. + ''' + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with CurrentDir(output_dir): + for i in range(0, self.header_record.num_image_pages): + name, img = self.get_image(self.header_record.image_data_offset + i) + with open(name, 'wb') as imgf: + imgf.write(img) From 3f92a079181096df80fc9b230bad87f8e25dc147 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 22 May 2009 06:26:57 -0400 Subject: [PATCH 05/15] Fix eReader metadata reading. 
--- src/calibre/ebooks/metadata/ereader.py | 25 ++++++++++++--------- src/calibre/ebooks/metadata/pdb.py | 1 - src/calibre/ebooks/pdb/ereader/reader202.py | 8 ++++--- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/metadata/ereader.py b/src/calibre/ebooks/metadata/ereader.py index 6e6624ce2a..42f575188c 100644 --- a/src/calibre/ebooks/metadata/ereader.py +++ b/src/calibre/ebooks/metadata/ereader.py @@ -24,19 +24,22 @@ def get_metadata(stream, extract_cover=True): stream.seek(0) pheader = PdbHeaderReader(stream) - hr = HeaderRecord(pheader.section_data(0)) - if hr.version in (2, 10) and hr.has_metadata == 1: - try: - mdata = pheader.section_data(hr.metadata_offset) + # Only Dropbook produced 132 byte record0 files are supported + if len(pheader.section_data(0)) == 132: + hr = HeaderRecord(pheader.section_data(0)) - mdata = mdata.split('\x00') - mi.title = mdata[0] - mi.authors = [mdata[1]] - mi.publisher = mdata[3] - mi.isbn = mdata[4] - except: - pass + if hr.version in (2, 10) and hr.has_metadata == 1: + try: + mdata = pheader.section_data(hr.metadata_offset) + + mdata = mdata.split('\x00') + mi.title = mdata[0] + mi.authors = [mdata[1]] + mi.publisher = mdata[3] + mi.isbn = mdata[4] + except: + pass if not mi.title: mi.title = pheader.title if pheader.title else _('Unknown') diff --git a/src/calibre/ebooks/metadata/pdb.py b/src/calibre/ebooks/metadata/pdb.py index f3d2782d16..ccc64d70aa 100644 --- a/src/calibre/ebooks/metadata/pdb.py +++ b/src/calibre/ebooks/metadata/pdb.py @@ -38,7 +38,6 @@ def get_metadata(stream, extract_cover=True): if MetadataReader is None: return MetaInformation(pheader.title, [_('Unknown')]) - return MetadataReader(stream, extract_cover) def set_metadata(stream, mi): diff --git a/src/calibre/ebooks/pdb/ereader/reader202.py b/src/calibre/ebooks/pdb/ereader/reader202.py index 120cb5f1d2..3ef409c9ce 100644 --- a/src/calibre/ebooks/pdb/ereader/reader202.py +++ b/src/calibre/ebooks/pdb/ereader/reader202.py @@ 
-11,7 +11,6 @@ import os import struct from calibre import CurrentDir -from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.pml.pmlconverter import pml_to_html from calibre.ebooks.compression.palmdoc import decompress_doc @@ -48,6 +47,9 @@ class Reader202(FormatReader): if self.header_record.version != 4: raise EreaderError('Unknown book version %i.' % self.header_record.version) + from calibre.ebooks.metadata.pdb import get_metadata + self.mi = get_metadata(stream, False) + def section_data(self, number): return self.sections[number] @@ -62,7 +64,7 @@ class Reader202(FormatReader): if data.startswith('PNG'): name = data[4:4 + 32].strip('\x00') img = data[62:] - + return name, img def get_text_page(self, number): @@ -114,7 +116,7 @@ class Reader202(FormatReader): def create_opf(self, output_dir, images): with CurrentDir(output_dir): - opf = OPFCreator(output_dir, MetaInformation(_('Unknown'), _('Unknown'))) + opf = OPFCreator(output_dir, self.mi) manifest = [('index.html', None)] From d05c344b6e0d2bc137d49b6cd923dc1320639ced Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 22 May 2009 06:50:10 -0400 Subject: [PATCH 06/15] Fix indent error in Mobi writer --- src/calibre/ebooks/mobi/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index 1a5a729a6f..99f50b4439 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -292,7 +292,7 @@ class MobiWriter(object): def __init__(self, compression=PALMDOC, imagemax=None, prefer_author_sort=False): - self._compression = compression or UNCOMPRESSED + self._compression = compression or UNCOMPRESSED self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE self._prefer_author_sort = prefer_author_sort From 9b890e279d7e922e62602df61d87ce7814414654 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 22 May 2009 06:56:51 -0400 
Subject: [PATCH 07/15] Fix more indent errors in Mobi reader from auto formating --- src/calibre/ebooks/mobi/reader.py | 4 ++-- src/calibre/ebooks/pdf/input.py | 9 ++++----- src/calibre/ebooks/pdf/pdftohtml.py | 6 ++++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index edad5fe1f9..6a399ab145 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -438,7 +438,7 @@ class MobiReader(object): tag.attrib.pop('xmlns', '') if tag.tag in ('country-region', 'place', 'placetype', 'placename', 'state', 'city', 'street', 'address', 'content'): - tag.tag = 'div' if tag.tag == 'content' else 'span' + tag.tag = 'div' if tag.tag == 'content' else 'span' for key in tag.attrib.keys(): tag.attrib.pop(key) continue @@ -527,7 +527,7 @@ class MobiReader(object): opf.cover = 'images/%05d.jpg' % 1 if not os.path.exists(os.path.join(os.path.dirname(htmlfile), * opf.cover.split('/'))): - opf.cover = None + opf.cover = None manifest = [(htmlfile, 'text/x-oeb1-document'), (os.path.abspath('styles.css'), 'text/css')] diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index e8c3889e41..6aa695c912 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' @@ -12,7 +11,7 @@ from calibre.ebooks.pdf.pdftohtml import pdftohtml from calibre.ebooks.metadata.opf2 import OPFCreator class PDFInput(InputFormatPlugin): - + name = 'PDF Input' author = 'John Schember' description = 'Convert PDF files to HTML' @@ -21,10 +20,10 @@ class PDFInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): html = pdftohtml(stream.name) - + with open('index.html', 'wb') as index: index.write(html) - + from calibre.ebooks.metadata.meta import get_metadata mi = 
get_metadata(stream, 'pdf') opf = OPFCreator(os.getcwd(), mi) @@ -32,5 +31,5 @@ class PDFInput(InputFormatPlugin): opf.create_spine(['index.html']) with open('metadata.opf', 'wb') as opffile: opf.render(opffile) - + return os.path.join(os.getcwd(), 'metadata.opf') diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 2c0daf05ca..c88b50e82e 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -1,12 +1,14 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement __license__ = 'GPL 3' __copyright__ = '2008, Kovid Goyal , ' \ '2009, John Schember ' __docformat__ = 'restructuredtext en' -import errno, os, sys, subprocess +import errno +import os +import sys +import subprocess from functools import partial from calibre.ebooks import ConversionError, DRMError From 503b6976532ad4b3235995dc3890f830675c712c Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 23 May 2009 11:55:53 -0400 Subject: [PATCH 08/15] RocketBook (rb) input. 
--- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/rb/__init__.py | 11 +++ src/calibre/ebooks/rb/input.py | 24 ++++++ src/calibre/ebooks/rb/reader.py | 131 ++++++++++++++++++++++++++++++ src/calibre/ebooks/txt/input.py | 1 - 5 files changed, 168 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/rb/__init__.py create mode 100644 src/calibre/ebooks/rb/input.py create mode 100644 src/calibre/ebooks/rb/reader.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 8e0f243056..4524f1f04c 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -321,6 +321,7 @@ from calibre.ebooks.lit.input import LITInput from calibre.ebooks.fb2.input import FB2Input from calibre.ebooks.fb2.output import FB2Output from calibre.ebooks.odt.input import ODTInput +from calibre.ebooks.rb.input import RBInput from calibre.ebooks.rtf.input import RTFInput from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.comic.input import ComicInput @@ -351,7 +352,7 @@ from calibre.devices.bebook.driver import BEBOOK, BEBOOK_MINI plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput, FB2Input, FB2Output, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput, - PMLOutput, MOBIOutput, PDBOutput, LRFOutput, LITOutput] + PMLOutput, MOBIOutput, PDBOutput, LRFOutput, LITOutput, RBInput] plugins += [PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, EB600, JETBOOK, BEBOOK, BEBOOK_MINI] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/rb/__init__.py b/src/calibre/ebooks/rb/__init__.py new file mode 100644 index 0000000000..7c048a95c8 --- /dev/null +++ b/src/calibre/ebooks/rb/__init__.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +HEADER = 
'\xb0\x0c\xb0\x0c\x02\x00NUVO\x00\x00\x00\x00' + +class RocketBookError(Exception): + pass + diff --git a/src/calibre/ebooks/rb/input.py b/src/calibre/ebooks/rb/input.py new file mode 100644 index 0000000000..8b05c1d42e --- /dev/null +++ b/src/calibre/ebooks/rb/input.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.ebooks.rb.reader import Reader +from calibre.customize.conversion import InputFormatPlugin + +class RBInput(InputFormatPlugin): + + name = 'RB Input' + author = 'John Schember' + description = 'Convert RB files to HTML' + file_types = set(['rb']) + + def convert(self, stream, options, file_ext, log, + accelerators): + reader = Reader(stream, log, options.input_encoding) + opf = reader.extract_content(os.getcwd()) + + return opf diff --git a/src/calibre/ebooks/rb/reader.py b/src/calibre/ebooks/rb/reader.py new file mode 100644 index 0000000000..bc70905539 --- /dev/null +++ b/src/calibre/ebooks/rb/reader.py @@ -0,0 +1,131 @@ +import os.path +import zlib +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os +import struct +from urllib import unquote as urlunquote + +from calibre import CurrentDir +from calibre.ebooks.rb import HEADER +from calibre.ebooks.rb import RocketBookError +from calibre.ebooks.metadata.rb import get_metadata +from calibre.ebooks.metadata.opf2 import OPFCreator + +class RBToc(list): + + class Item(object): + + def __init__(self, name='', size=0, offset=0, flags=0): + self.name = name + self.size = size + self.offset = offset + self.flags = flags + + +class Reader(object): + + def __init__(self, stream, log, encoding=None): + self.stream = stream + self.log = log + self.encoding = encoding + + self.verify_file() + + self.mi = get_metadata(self.stream) + self.toc = self.get_toc() + + def read_i32(self): + return 
struct.unpack(' Date: Sat, 23 May 2009 13:36:24 -0400 Subject: [PATCH 09/15] RB input: Support uncompressed text. --- src/calibre/ebooks/rb/reader.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/rb/reader.py b/src/calibre/ebooks/rb/reader.py index bc70905539..7e0bb68e4e 100644 --- a/src/calibre/ebooks/rb/reader.py +++ b/src/calibre/ebooks/rb/reader.py @@ -70,21 +70,24 @@ class Reader(object): return toc def get_text(self, toc_item, output_dir): - if toc_item.flags != 8: + if toc_item.flags in (1, 2): return output = u'' - self.stream.seek(toc_item.offset) - count = self.read_i32() - self.read_i32() # Uncompressed size. - chunck_sizes = [] - for i in range(count): - chunck_sizes.append(self.read_i32()) - for size in chunck_sizes: - cm_chunck = self.stream.read(size) - output += zlib.decompress(cm_chunck).decode('cp1252' if self.encoding is None else self.encoding) + if toc_item.flags == 8: + count = self.read_i32() + self.read_i32() # Uncompressed size. + chunck_sizes = [] + for i in range(count): + chunck_sizes.append(self.read_i32()) + + for size in chunck_sizes: + cm_chunck = self.stream.read(size) + output += zlib.decompress(cm_chunck).decode('cp1252' if self.encoding is None else self.encoding) + else: + output += self.stream.read(toc_item.size).decode('cp1252' if self.encoding is None else self.encoding) with open(os.path.join(output_dir, toc_item.name), 'wb') as html: html.write(output.encode('utf-8')) From 1b100b5c5c32e5292784e1a13fce73a983700213 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 23 May 2009 13:37:48 -0400 Subject: [PATCH 10/15] RB input: Fix imports. 
--- src/calibre/ebooks/rb/reader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/calibre/ebooks/rb/reader.py b/src/calibre/ebooks/rb/reader.py index 7e0bb68e4e..ffc7d6e799 100644 --- a/src/calibre/ebooks/rb/reader.py +++ b/src/calibre/ebooks/rb/reader.py @@ -1,5 +1,3 @@ -import os.path -import zlib # -*- coding: utf-8 -*- __license__ = 'GPL 3' @@ -8,6 +6,7 @@ __docformat__ = 'restructuredtext en' import os import struct +import zlib from urllib import unquote as urlunquote from calibre import CurrentDir From afe9c08304d6da3cde3d2ddbe9de448fcd60d86d Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 23 May 2009 14:03:22 -0400 Subject: [PATCH 11/15] Alphabetize imports and classes. --- src/calibre/customize/builtins.py | 492 ++++++++++++++++-------------- 1 file changed, 267 insertions(+), 225 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 4524f1f04c..ab9460d3be 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -1,8 +1,9 @@ -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import textwrap, os, glob +import textwrap +import os +import glob from calibre.customize import FileTypePlugin, MetadataReaderPlugin, MetadataWriterPlugin from calibre.constants import __version__ @@ -39,172 +40,6 @@ every time you add an HTML file to the library.\ return of.name -class OPFMetadataReader(MetadataReaderPlugin): - - name = 'Read OPF metadata' - file_types = set(['opf']) - description = _('Read metadata from %s files')%'OPF' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.opf2 import OPF - from calibre.ebooks.metadata import MetaInformation - return MetaInformation(OPF(stream, os.getcwd())) - -class RTFMetadataReader(MetadataReaderPlugin): - - name = 'Read RTF metadata' - file_types = set(['rtf']) - description = _('Read metadata from %s files')%'RTF' - - def get_metadata(self, 
stream, ftype): - from calibre.ebooks.metadata.rtf import get_metadata - return get_metadata(stream) - -class FB2MetadataReader(MetadataReaderPlugin): - - name = 'Read FB2 metadata' - file_types = set(['fb2']) - description = _('Read metadata from %s files')%'FB2' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.fb2 import get_metadata - return get_metadata(stream) - - -class LRFMetadataReader(MetadataReaderPlugin): - - name = 'Read LRF metadata' - file_types = set(['lrf']) - description = _('Read metadata from %s files')%'LRF' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.lrf.meta import get_metadata - return get_metadata(stream) - -class PDFMetadataReader(MetadataReaderPlugin): - - name = 'Read PDF metadata' - file_types = set(['pdf']) - description = _('Read metadata from %s files')%'PDF' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.pdf import get_metadata - return get_metadata(stream) - -class LITMetadataReader(MetadataReaderPlugin): - - name = 'Read LIT metadata' - file_types = set(['lit']) - description = _('Read metadata from %s files')%'LIT' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.lit import get_metadata - return get_metadata(stream) - -class IMPMetadataReader(MetadataReaderPlugin): - - name = 'Read IMP metadata' - file_types = set(['imp']) - description = _('Read metadata from %s files')%'IMP' - author = 'Ashish Kulkarni' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.imp import get_metadata - return get_metadata(stream) - -class RBMetadataReader(MetadataReaderPlugin): - - name = 'Read RB metadata' - file_types = set(['rb']) - description = _('Read metadata from %s files')%'RB' - author = 'Ashish Kulkarni' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.rb import get_metadata - return get_metadata(stream) - -class EPUBMetadataReader(MetadataReaderPlugin): - - name = 'Read EPUB metadata' - file_types 
= set(['epub']) - description = _('Read metadata from %s files')%'EPUB' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.epub import get_metadata - return get_metadata(stream) - -class HTMLMetadataReader(MetadataReaderPlugin): - - name = 'Read HTML metadata' - file_types = set(['html']) - description = _('Read metadata from %s files')%'HTML' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.html import get_metadata - return get_metadata(stream) - -class MOBIMetadataReader(MetadataReaderPlugin): - - name = 'Read MOBI metadata' - file_types = set(['mobi', 'prc', 'azw']) - description = _('Read metadata from %s files')%'MOBI' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.mobi.reader import get_metadata - return get_metadata(stream) - - -class TOPAZMetadataReader(MetadataReaderPlugin): - - name = 'Read Topaz metadata' - file_types = set(['tpz', 'azw1']) - description = _('Read metadata from %s files')%'MOBI' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.topaz import get_metadata - return get_metadata(stream) - -class ODTMetadataReader(MetadataReaderPlugin): - - name = 'Read ODT metadata' - file_types = set(['odt']) - description = _('Read metadata from %s files')%'ODT' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.odt import get_metadata - return get_metadata(stream) - -class TXTMetadataReader(MetadataReaderPlugin): - - name = 'Read TXT metadata' - file_types = set(['txt']) - description = _('Read metadata from %s files') % 'TXT' - author = 'John Schember' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.txt import get_metadata - return get_metadata(stream) - -class PDBMetadataReader(MetadataReaderPlugin): - - name = 'Read PDB metadata' - file_types = set(['pdb']) - description = _('Read metadata from %s files') % 'PDB' - author = 'John Schember' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.pdb 
import get_metadata - return get_metadata(stream) - -class LRXMetadataReader(MetadataReaderPlugin): - - name = 'Read LRX metadata' - file_types = set(['lrx']) - description = _('Read metadata from %s files')%'LRX' - - def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.lrx import get_metadata - return get_metadata(stream) class ComicMetadataReader(MetadataReaderPlugin): @@ -227,14 +62,127 @@ class ComicMetadataReader(MetadataReaderPlugin): mi.cover_data = (ext.lower(), data) return mi -class ZipMetadataReader(MetadataReaderPlugin): +class EPUBMetadataReader(MetadataReaderPlugin): - name = 'Read ZIP metadata' - file_types = set(['zip', 'oebzip']) - description = _('Read metadata from ebooks in ZIP archives') + name = 'Read EPUB metadata' + file_types = set(['epub']) + description = _('Read metadata from %s files')%'EPUB' def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.zip import get_metadata + from calibre.ebooks.metadata.epub import get_metadata + return get_metadata(stream) + +class FB2MetadataReader(MetadataReaderPlugin): + + name = 'Read FB2 metadata' + file_types = set(['fb2']) + description = _('Read metadata from %s files')%'FB2' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.fb2 import get_metadata + return get_metadata(stream) + +class HTMLMetadataReader(MetadataReaderPlugin): + + name = 'Read HTML metadata' + file_types = set(['html']) + description = _('Read metadata from %s files')%'HTML' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.html import get_metadata + return get_metadata(stream) + +class IMPMetadataReader(MetadataReaderPlugin): + + name = 'Read IMP metadata' + file_types = set(['imp']) + description = _('Read metadata from %s files')%'IMP' + author = 'Ashish Kulkarni' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.imp import get_metadata + return get_metadata(stream) + +class LITMetadataReader(MetadataReaderPlugin): + + name 
= 'Read LIT metadata' + file_types = set(['lit']) + description = _('Read metadata from %s files')%'LIT' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.lit import get_metadata + return get_metadata(stream) + +class LRFMetadataReader(MetadataReaderPlugin): + + name = 'Read LRF metadata' + file_types = set(['lrf']) + description = _('Read metadata from %s files')%'LRF' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.lrf.meta import get_metadata + return get_metadata(stream) + +class LRXMetadataReader(MetadataReaderPlugin): + + name = 'Read LRX metadata' + file_types = set(['lrx']) + description = _('Read metadata from %s files')%'LRX' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.lrx import get_metadata + return get_metadata(stream) + +class MOBIMetadataReader(MetadataReaderPlugin): + + name = 'Read MOBI metadata' + file_types = set(['mobi', 'prc', 'azw']) + description = _('Read metadata from %s files')%'MOBI' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.mobi.reader import get_metadata + return get_metadata(stream) + +class ODTMetadataReader(MetadataReaderPlugin): + + name = 'Read ODT metadata' + file_types = set(['odt']) + description = _('Read metadata from %s files')%'ODT' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.odt import get_metadata + return get_metadata(stream) + +class OPFMetadataReader(MetadataReaderPlugin): + + name = 'Read OPF metadata' + file_types = set(['opf']) + description = _('Read metadata from %s files')%'OPF' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.opf2 import OPF + from calibre.ebooks.metadata import MetaInformation + return MetaInformation(OPF(stream, os.getcwd())) + +class PDBMetadataReader(MetadataReaderPlugin): + + name = 'Read PDB metadata' + file_types = set(['pdb']) + description = _('Read metadata from %s files') % 'PDB' + author = 'John Schember' + + def get_metadata(self, 
stream, ftype): + from calibre.ebooks.metadata.pdb import get_metadata + return get_metadata(stream) + +class PDFMetadataReader(MetadataReaderPlugin): + + name = 'Read PDF metadata' + file_types = set(['pdf']) + description = _('Read metadata from %s files')%'PDF' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.pdf import get_metadata return get_metadata(stream) class RARMetadataReader(MetadataReaderPlugin): @@ -247,6 +195,58 @@ class RARMetadataReader(MetadataReaderPlugin): from calibre.ebooks.metadata.rar import get_metadata return get_metadata(stream) +class RBMetadataReader(MetadataReaderPlugin): + + name = 'Read RB metadata' + file_types = set(['rb']) + description = _('Read metadata from %s files')%'RB' + author = 'Ashish Kulkarni' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.rb import get_metadata + return get_metadata(stream) + +class RTFMetadataReader(MetadataReaderPlugin): + + name = 'Read RTF metadata' + file_types = set(['rtf']) + description = _('Read metadata from %s files')%'RTF' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.rtf import get_metadata + return get_metadata(stream) + +class TOPAZMetadataReader(MetadataReaderPlugin): + + name = 'Read Topaz metadata' + file_types = set(['tpz', 'azw1']) + description = _('Read metadata from %s files')%'MOBI' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.topaz import get_metadata + return get_metadata(stream) + +class TXTMetadataReader(MetadataReaderPlugin): + + name = 'Read TXT metadata' + file_types = set(['txt']) + description = _('Read metadata from %s files') % 'TXT' + author = 'John Schember' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.txt import get_metadata + return get_metadata(stream) + +class ZipMetadataReader(MetadataReaderPlugin): + + name = 'Read ZIP metadata' + file_types = set(['zip', 'oebzip']) + description = _('Read metadata from ebooks in ZIP 
archives') + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.zip import get_metadata + return get_metadata(stream) + class EPUBMetadataWriter(MetadataWriterPlugin): @@ -268,16 +268,6 @@ class LRFMetadataWriter(MetadataWriterPlugin): from calibre.ebooks.lrf.meta import set_metadata set_metadata(stream, mi) -class RTFMetadataWriter(MetadataWriterPlugin): - - name = 'Set RTF metadata' - file_types = set(['rtf']) - description = _('Set metadata in %s files')%'RTF' - - def set_metadata(self, stream, mi, type): - from calibre.ebooks.metadata.rtf import set_metadata - set_metadata(stream, mi) - class MOBIMetadataWriter(MetadataWriterPlugin): name = 'Set MOBI metadata' @@ -289,17 +279,6 @@ class MOBIMetadataWriter(MetadataWriterPlugin): from calibre.ebooks.metadata.mobi import set_metadata set_metadata(stream, mi) -class PDFMetadataWriter(MetadataWriterPlugin): - - name = 'Set PDF metadata' - file_types = set(['pdf']) - description = _('Set metadata in %s files') % 'PDF' - author = 'Kovid Goyal' - - def set_metadata(self, stream, mi, type): - from calibre.ebooks.metadata.pdf import set_metadata - set_metadata(stream, mi) - class PDBMetadataWriter(MetadataWriterPlugin): name = 'Set PDB metadata' @@ -311,50 +290,113 @@ class PDBMetadataWriter(MetadataWriterPlugin): from calibre.ebooks.metadata.pdb import set_metadata set_metadata(stream, mi) +class PDFMetadataWriter(MetadataWriterPlugin): + name = 'Set PDF metadata' + file_types = set(['pdf']) + description = _('Set metadata in %s files') % 'PDF' + author = 'Kovid Goyal' + + def set_metadata(self, stream, mi, type): + from calibre.ebooks.metadata.pdf import set_metadata + set_metadata(stream, mi) + +class RTFMetadataWriter(MetadataWriterPlugin): + + name = 'Set RTF metadata' + file_types = set(['rtf']) + description = _('Set metadata in %s files')%'RTF' + + def set_metadata(self, stream, mi, type): + from calibre.ebooks.metadata.rtf import set_metadata + set_metadata(stream, mi) + + +from 
calibre.ebooks.comic.input import ComicInput from calibre.ebooks.epub.input import EPUBInput +from calibre.ebooks.fb2.input import FB2Input +from calibre.ebooks.html.input import HTMLInput +from calibre.ebooks.lit.input import LITInput from calibre.ebooks.mobi.input import MOBIInput +from calibre.ebooks.odt.input import ODTInput from calibre.ebooks.pdb.input import PDBInput from calibre.ebooks.pdf.input import PDFInput -from calibre.ebooks.txt.input import TXTInput -from calibre.ebooks.lit.input import LITInput -from calibre.ebooks.fb2.input import FB2Input -from calibre.ebooks.fb2.output import FB2Output -from calibre.ebooks.odt.input import ODTInput -from calibre.ebooks.rb.input import RBInput -from calibre.ebooks.rtf.input import RTFInput -from calibre.ebooks.html.input import HTMLInput -from calibre.ebooks.comic.input import ComicInput -from calibre.web.feeds.input import RecipeInput -from calibre.ebooks.oeb.output import OEBOutput -from calibre.ebooks.epub.output import EPUBOutput -from calibre.ebooks.mobi.output import MOBIOutput -from calibre.ebooks.pdb.output import PDBOutput -from calibre.ebooks.lrf.output import LRFOutput -from calibre.ebooks.lit.output import LITOutput -from calibre.ebooks.txt.output import TXTOutput -from calibre.ebooks.pdf.output import PDFOutput from calibre.ebooks.pml.input import PMLInput +from calibre.ebooks.rb.input import RBInput +from calibre.web.feeds.input import RecipeInput +from calibre.ebooks.rtf.input import RTFInput +from calibre.ebooks.txt.input import TXTInput + +from calibre.ebooks.epub.output import EPUBOutput +from calibre.ebooks.fb2.output import FB2Output +from calibre.ebooks.lit.output import LITOutput +from calibre.ebooks.lrf.output import LRFOutput +from calibre.ebooks.mobi.output import MOBIOutput +from calibre.ebooks.oeb.output import OEBOutput +from calibre.ebooks.pdb.output import PDBOutput +from calibre.ebooks.pdf.output import PDFOutput from calibre.ebooks.pml.output import PMLOutput +from 
calibre.ebooks.rb.output import RBOutput +from calibre.ebooks.txt.output import TXTOutput + from calibre.customize.profiles import input_profiles, output_profiles + +from calibre.devices.bebook.driver import BEBOOK, BEBOOK_MINI +from calibre.devices.blackberry.driver import BLACKBERRY +from calibre.devices.cybookg3.driver import CYBOOKG3 +from calibre.devices.eb600.driver import EB600 +from calibre.devices.jetbook.driver import JETBOOK +from calibre.devices.kindle.driver import KINDLE +from calibre.devices.kindle.driver import KINDLE2 from calibre.devices.prs500.driver import PRS500 from calibre.devices.prs505.driver import PRS505 from calibre.devices.prs700.driver import PRS700 -from calibre.devices.cybookg3.driver import CYBOOKG3 -from calibre.devices.kindle.driver import KINDLE -from calibre.devices.kindle.driver import KINDLE2 -from calibre.devices.blackberry.driver import BLACKBERRY -from calibre.devices.eb600.driver import EB600 -from calibre.devices.jetbook.driver import JETBOOK -from calibre.devices.bebook.driver import BEBOOK, BEBOOK_MINI -plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, - TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput, - FB2Input, FB2Output, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput, - PMLOutput, MOBIOutput, PDBOutput, LRFOutput, LITOutput, RBInput] -plugins += [PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, - EB600, JETBOOK, BEBOOK, BEBOOK_MINI] + +plugins = [] +plugins += [ + ComicInput, + EPUBInput, + FB2Input, + HTMLInput, + LITInput, + MOBIInput, + ODTInput, + PDBInput, + PDFInput, + PMLInput, + RBInput, + RecipeInput, + RTFInput, + TXTInput, +] +plugins += [ + EPUBOutput, + FB2Output, + LITOutput, + LRFOutput, + MOBIOutput, + OEBOutput, + PDBOutput, + PDFOutput, + PMLOutput, + RBOutput, + TXTOutput, +] +plugins += [ + BEBOOK, + BEBOOK_MINI, + BLACKBERRY, + CYBOOKG3, + EB600, + JETBOOK, + KINDLE, + KINDLE2, + PRS500, + PRS505, + PRS700, +] plugins += [x for x in 
list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ From 3659eb1b7a43de50945124e81bdd172efa887043 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 23 May 2009 19:26:21 -0400 Subject: [PATCH 12/15] RB: output. --- src/calibre/ebooks/pdb/ereader/writer.py | 48 +++---- src/calibre/ebooks/rb/__init__.py | 15 ++ src/calibre/ebooks/rb/output.py | 36 +++++ src/calibre/ebooks/rb/rbml.py | 166 +++++++++++++++++++++++ src/calibre/ebooks/rb/writer.py | 143 +++++++++++++++++++ 5 files changed, 384 insertions(+), 24 deletions(-) create mode 100644 src/calibre/ebooks/rb/output.py create mode 100644 src/calibre/ebooks/rb/rbml.py create mode 100644 src/calibre/ebooks/rb/writer.py diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index c8567c93b6..79cb11fdb9 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -27,62 +27,62 @@ IDENTITY = 'PNRdPPrs' MAX_RECORD_SIZE = 3560 class Writer(FormatWriter): - + def __init__(self, opts, log): self.opts = opts self.log = log - + def write_content(self, oeb_book, out_stream, metadata=None): text = self._text(oeb_book) images = self._images(oeb_book.manifest) metadata = [self._metadata(metadata)] - + hr = [self._header_record(len(text), len(images))] - + sections = hr+text+images+metadata+['MeTaInFo\x00'] - + lengths = [len(i) for i in sections] - + pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].partition('\x00')[0]) pdbHeaderBuilder.build_header(lengths, out_stream) - + for item in sections: out_stream.write(item) def _text(self, oeb_book): pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables) pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') - + pml_pages = [] for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1): pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE 
: (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE])) - return pml_pages - + return pml_pages + def _images(self, manifest): images = [] - + for item in manifest: if item.media_type in OEB_IMAGES: image = 'PNG ' image += image_name(item.href) image = image.ljust(62, '\x00') - + im = Image.open(cStringIO.StringIO(item.data)).convert('P') im.thumbnail((300,300), Image.ANTIALIAS) - + data = cStringIO.StringIO() im.save(data, 'PNG') data = data.getvalue() - + image += data - + if len(image) < 65505: images.append(image) - + return images - + def _metadata(self, metadata): ''' Metadata takes the form: @@ -92,14 +92,14 @@ class Writer(FormatWriter): publisher\x00 isbn\x00 ''' - + title = _('Unknown') author = _('Unknown') copyright = '' publisher = '' isbn = '' - - if metadata != None: + + if metadata: if len(metadata.title) >= 1: title = metadata.title[0].value if len(metadata.creator) >= 1: @@ -119,7 +119,7 @@ class Writer(FormatWriter): ''' version = 10 # Zlib compression non_text_offset = text_items + 1 - + if image_items > 0: image_data_offset = text_items + 1 meta_data_offset = image_data_offset + image_items @@ -128,9 +128,9 @@ class Writer(FormatWriter): meta_data_offset = text_items + 1 last_data_offset = meta_data_offset + 1 image_data_offset = last_data_offset - + record = '' - + record += struct.pack('>H', version) # [0:2] # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 
260 and 272 = DRM record += struct.pack('>H', 0) # [2:4] record += struct.pack('>H', 0) # [4:6] @@ -161,6 +161,6 @@ class Writer(FormatWriter): for i in range(54, 132, 2): record += struct.pack('>H', 0) # [54:132] - + return record diff --git a/src/calibre/ebooks/rb/__init__.py b/src/calibre/ebooks/rb/__init__.py index 7c048a95c8..33e9882d9a 100644 --- a/src/calibre/ebooks/rb/__init__.py +++ b/src/calibre/ebooks/rb/__init__.py @@ -4,8 +4,23 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' +import os + HEADER = '\xb0\x0c\xb0\x0c\x02\x00NUVO\x00\x00\x00\x00' class RocketBookError(Exception): pass + +def unique_name(name, used_names): + name = os.path.basename(name) + if len(name) < 32 and name not in used_names: + return name + else: + ext = os.path.splitext(name)[1][:3] + base_name = name[:22] + for i in range(0, 9999): + name = '%s-%s.%s' % (str(i).rjust('0', 4)[:4], base_name, ext) + if name not in used_names: + break + return name diff --git a/src/calibre/ebooks/rb/output.py b/src/calibre/ebooks/rb/output.py new file mode 100644 index 0000000000..04c7d41790 --- /dev/null +++ b/src/calibre/ebooks/rb/output.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import OutputFormatPlugin +from calibre.ebooks.rb.writer import RBWriter + +class RBOutput(OutputFormatPlugin): + + name = 'RB Output' + author = 'John Schember' + file_type = 'rb' + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + close = False + if not hasattr(output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + out_stream = output_path + + writer = RBWriter(opts, log) + + out_stream.seek(0) + out_stream.truncate() + + 
writer.write_content(oeb_book, out_stream, oeb_book.metadata) + + if close: + out_stream.close() diff --git a/src/calibre/ebooks/rb/rbml.py b/src/calibre/ebooks/rb/rbml.py new file mode 100644 index 0000000000..3b88f3bc09 --- /dev/null +++ b/src/calibre/ebooks/rb/rbml.py @@ -0,0 +1,166 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into RB compatible markup. +''' + +import os +import re + +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace +from calibre.ebooks.oeb.stylizer import Stylizer + +TAGS = [ + 'b', + 'big', + 'blockquote', + 'br', + 'center', + 'code', + 'div', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'hr', + 'i', + 'li', + 'ol', + 'p', + 'pre', + 'small', + 'sub', + 'sup', + 'ul', +] + +LINK_TAGS = [ + 'a', +] + +STYLES = [ + ('font-weight', {'bold' : 'b', 'bolder' : 'b'}), + ('font-style', {'italic' : 'i'}), + ('text-align', {'center' : 'center'}), +] + +class RBMLizer(object): + + def __init__(self, name_map={}, ignore_tables=False): + self.name_map = name_map + self.ignore_tables = ignore_tables + + def extract_content(self, oeb_book, opts): + oeb_book.logger.info('Converting XHTML to RB markup...') + self.oeb_book = oeb_book + self.opts = opts + return self.mlize_spine() + + + def mlize_spine(self): + output = u'' + for item in self.oeb_book.spine: + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + output += self.add_page_anchor(item.href) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output += u'' + output = self.clean_text(output) + return output + + def add_page_anchor(self, href): + href = os.path.splitext(os.path.basename(href))[0] + return u'' % href + + def clean_text(self, text): + # Remove anchors that do not have links + anchors = set(re.findall(r'(?<=)', text)) + links = set(re.findall(r'(?<=)', text)) + for unused in 
anchors.difference(links): + text = text.replace('' % unused, '') + + return text + + def dump_text(self, elem, stylizer, tag_stack=[]): + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + return u'' + + text = u'' + style = stylizer.style(elem) + + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return u'' + + tag = barename(elem.tag) + tag_count = 0 + + # Process tags that need special processing and that do not have inner + # text. Usually these require an argument + if tag == 'img': + src = os.path.basename(elem.get('src')) + name = self.name_map.get(src, src) + text += '' % name + + rb_tag = tag.upper() if tag in TAGS else None + if rb_tag: + tag_count += 1 + text += '<%s>' % rb_tag + tag_stack.append(rb_tag) + + if tag in LINK_TAGS: + href = elem.get('href') + if href: + if '://' not in href: + if '#' in href: + href = href.partition('#')[2] + href = os.path.splitext(os.path.basename(href))[0] + tag_count += 1 + text += '' % href + tag_stack.append('A') + + # Anchor ids + id_name = elem.get('id') + if id_name: + text += '' % os.path.splitext(id_name)[0] + + # Processes style information + for s in STYLES: + style_tag = s[1].get(style[s[0]], None) + if style_tag: + style_tag = style_tag.upper() + tag_count += 1 + text += '<%s>' % style_tag + tag_stack.append(style_tag) + + # Proccess tags that contain text. 
+ if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + text += elem.text + + for item in elem: + text += self.dump_text(item, stylizer, tag_stack) + + close_tag_list = [] + for i in range(0, tag_count): + close_tag_list.insert(0, tag_stack.pop()) + + text += self.close_tags(close_tag_list) + + if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': + text += elem.tail + + return text + + def close_tags(self, tags): + text = u'' + for i in range(0, len(tags)): + tag = tags.pop() + text += '' % tag + + return text diff --git a/src/calibre/ebooks/rb/writer.py b/src/calibre/ebooks/rb/writer.py new file mode 100644 index 0000000000..f9057d5c61 --- /dev/null +++ b/src/calibre/ebooks/rb/writer.py @@ -0,0 +1,143 @@ +import os.path +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os +import struct +import zlib + +import Image +import cStringIO + +from calibre.ebooks.rb.rbml import RBMLizer +from calibre.ebooks.rb import HEADER +from calibre.ebooks.rb import unique_name +from calibre.ebooks.oeb.base import OEB_IMAGES +from calibre.constants import __appname__, __version__ + +TEXT_RECORD_SIZE = 4096 + +class TocItem(object): + + def __init__(self, name, size, flags): + self.name = name + self.size = size + self.flags = flags + + +class RBWriter(object): + + def __init__(self, opts, log): + self.opts = opts + self.log = log + self.name_map = {} + + def write_content(self, oeb_book, out_stream, metadata=None): + info = [('info.info', self._info_section(metadata))] + images = self._images(oeb_book.manifest) + text_size, chuncks = self._text(oeb_book) + chunck_sizes = [len(x) for x in chuncks] + text = [('index.html', chuncks)] + hidx = [('index.hidx', ' ')] + + toc_items = [] + page_count = 0 + for name, data in info+text+hidx+images: + page_count += 1 + size = len(data) + if (name, data) in text: + flags = 8 + size = 0 + for c in chunck_sizes: + 
size += c + size += 8 + (len(chunck_sizes) * 4) + elif (name, data) in info: + flags = 2 + else: + flags = 0 + toc_items.append(TocItem(name.ljust(32, '\x00')[:32], size, flags)) + + out_stream.write(HEADER) + out_stream.write(struct.pack('= 1: + text += 'TITLE=%s\n' % metadata.title[0].value + if len(metadata.creator) >= 1: + from calibre.ebooks.metadata import authors_to_string + text += 'AUTHOR=%s\n' % authors_to_string([x.value for x in metadata.creator]) + text += 'GENERATOR=%s - %s\n' % (__appname__, __version__) + text += 'PARSE=1\n' + text += 'OUTPUT=1\n' + text += 'BODY=index.html\n' + + return text + \ No newline at end of file From 4ac041caceeb30226c4409250a1047b6d312e796 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 23 May 2009 20:03:02 -0400 Subject: [PATCH 13/15] RB output: Fix internal links. --- src/calibre/ebooks/rb/rbml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/rb/rbml.py b/src/calibre/ebooks/rb/rbml.py index 3b88f3bc09..f18803e8d0 100644 --- a/src/calibre/ebooks/rb/rbml.py +++ b/src/calibre/ebooks/rb/rbml.py @@ -122,7 +122,7 @@ class RBMLizer(object): href = href.partition('#')[2] href = os.path.splitext(os.path.basename(href))[0] tag_count += 1 - text += '' % href + text += '' % href tag_stack.append('A') # Anchor ids From bebf90564812a3fc213c474fed8e2e79ab170ef4 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 24 May 2009 11:43:53 -0400 Subject: [PATCH 14/15] use entity_to_unicode properly --- src/calibre/ebooks/fb2/fb2ml.py | 10 +++++++++- src/calibre/ebooks/pml/pmlml.py | 7 ++++--- src/calibre/ebooks/txt/writer.py | 17 ++++++++++++----- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index b6893e395d..f10cf95e87 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -9,8 +9,10 @@ Transform OEB content into FB2 markup ''' import os +import re from base64 import b64encode 
+from calibre import entity_to_unicode from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.base import OEB_IMAGES @@ -75,7 +77,13 @@ class FB2MLizer(object): return images def clean_text(self, text): - return text.replace('&', '') + for entity in set(re.findall('&.+?;', text)): + mo = re.search('(%s)' % entity[1:-1], text) + text = text.replace(entity, entity_to_unicode(mo)) + + text = text.replace('&', '') + + return text def dump_text(self, elem, stylizer, tag_stack=[]): if not isinstance(elem.tag, basestring) \ diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index c34ada3317..01f777caae 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -88,7 +88,7 @@ class PMLMLizer(object): def add_page_anchor(self, href): href = os.path.splitext(os.path.basename(href))[0] - return '\\Q="%s"' % href + return u'\\Q="%s"' % href def clean_text(self, text): # Remove excess spaces at beginning and end of lines @@ -108,9 +108,10 @@ class PMLMLizer(object): links = set(re.findall(r'(?<=\\q="#).+?(?=")', text)) for unused in anchors.difference(links): text = text.replace('\\Q="%s"' % unused, '') - + for entity in set(re.findall('&.+?;', text)): - text = text.replace(entity, entity_to_unicode(entity[1:-1])) + mo = re.search('(%s)' % entity[1:-1], text) + text = text.replace(entity, entity_to_unicode(mo)) return text diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index 09a79d322d..313250bcf2 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement -''' -Write content to TXT. -''' __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, re, sys +''' +Write content to TXT. 
+''' +import os +import re + +from calibre import entity_to_unicode from calibre.ebooks.htmlsymbols import HTML_SYMBOLS from BeautifulSoup import BeautifulSoup @@ -83,6 +85,11 @@ class TxtWriter(object): for symbol in HTML_SYMBOLS: for code in HTML_SYMBOLS[symbol]: content = content.replace(code, symbol) + + for entity in set(re.findall('&.+?;', content)): + mo = re.search('(%s)' % entity[1:-1], content) + content = content.replace(entity, entity_to_unicode(mo)) + return content def cleanup_text(self, text): From 6b9ea1d0aee5de8425e830eb445b1034f908b402 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 25 May 2009 07:34:42 -0400 Subject: [PATCH 15/15] Update inspector with new known values and makebook files. Better error message. --- src/calibre/ebooks/pdb/ereader/inspector.py | 100 +++++++++++++++----- src/calibre/ebooks/pdb/ereader/reader.py | 3 +- 2 files changed, 76 insertions(+), 27 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/inspector.py b/src/calibre/ebooks/pdb/ereader/inspector.py index b5f2341cb5..2ddb5d93f8 100644 --- a/src/calibre/ebooks/pdb/ereader/inspector.py +++ b/src/calibre/ebooks/pdb/ereader/inspector.py @@ -7,10 +7,27 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import struct, sys +import struct +import sys +from calibre.ebooks.pdb.ereader import EreaderError from calibre.ebooks.pdb.header import PdbHeaderReader -from calibre.ebooks.pdb.ereader.reader import HeaderRecord + +def ereader_header_info(header): + h0 = header.section_data(0) + + print 'Header Size: %s' % len(h0) + + if len(h0) == 132: + print 'Header Type: Dropbook compatible' + print '' + ereader_header_info132(h0) + elif len(h0) == 202: + print 'Header Type: Makebook compatible' + print '' + ereader_header_info202(h0) + else: + raise EreaderError('Size mismatch. eReader header record size %i KB is not supported.' 
% len(h0)) def pdb_header_info(header): print 'PDB Header Info:' @@ -20,70 +37,101 @@ def pdb_header_info(header): print 'Title: %s' % header.title print '' -def ereader_header_info(header): - h0 = header.section_data(0) - +def ereader_header_info132(h0): print 'Ereader Record 0 (Header) Info:' print '' print '0-2 Version: %i' % struct.unpack('>H', h0[0:2])[0] print '2-4: %i' % struct.unpack('>H', h0[2:4])[0] print '4-6: %i' % struct.unpack('>H', h0[4:6])[0] - print '6-8: %i' % struct.unpack('>H', h0[6:8])[0] + print '6-8 Codepage: %i' % struct.unpack('>H', h0[6:8])[0] print '8-10: %i' % struct.unpack('>H', h0[8:10])[0] print '10-12: %i' % struct.unpack('>H', h0[10:12])[0] - print '12-14 Non-Text: %i' % struct.unpack('>H', h0[12:14])[0] + print '12-14 Non-Text offset: %i' % struct.unpack('>H', h0[12:14])[0] print '14-16: %i' % struct.unpack('>H', h0[14:16])[0] print '16-18: %i' % struct.unpack('>H', h0[16:18])[0] print '18-20: %i' % struct.unpack('>H', h0[18:20])[0] - print '20-22: %i' % struct.unpack('>H', h0[20:22])[0] + print '20-22 Image Count: %i' % struct.unpack('>H', h0[20:22])[0] print '22-24: %i' % struct.unpack('>H', h0[22:24])[0] - print '24-26: %i' % struct.unpack('>H', h0[24:26])[0] + print '24-26 Has Metadata?: %i' % struct.unpack('>H', h0[24:26])[0] print '26-28: %i' % struct.unpack('>H', h0[26:28])[0] - print '28-30 footnote_rec: %i' % struct.unpack('>H', h0[28:30])[0] - print '30-32 sidebar_rec: %i' % struct.unpack('>H', h0[30:32])[0] - print '32-34 bookmark_offset: %i' % struct.unpack('>H', h0[32:34])[0] - print '34-36: %i' % struct.unpack('>H', h0[34:36])[0] + print '28-30 Footnote Count: %i' % struct.unpack('>H', h0[28:30])[0] + print '30-32 Sidebar Count: %i' % struct.unpack('>H', h0[30:32])[0] + print '32-34 Bookmark Offset: %i' % struct.unpack('>H', h0[32:34])[0] + print '34-36 MAGIC: %i' % struct.unpack('>H', h0[34:36])[0] print '36-38: %i' % struct.unpack('>H', h0[36:38])[0] print '38-40: %i' % struct.unpack('>H', h0[38:40])[0] - print 
'40-42 image_data_offset: %i' % struct.unpack('>H', h0[40:42])[0] + print '40-42 Image Data Offset: %i' % struct.unpack('>H', h0[40:42])[0] print '42-44: %i' % struct.unpack('>H', h0[42:44])[0] - print '44-46 metadata_offset: %i' % struct.unpack('>H', h0[44:46])[0] + print '44-46 Metadata Offset: %i' % struct.unpack('>H', h0[44:46])[0] print '46-48: %i' % struct.unpack('>H', h0[46:48])[0] - print '48-50 footnote_offset: %i' % struct.unpack('>H', h0[48:50])[0] - print '50-52 sidebar_offset: %i' % struct.unpack('>H', h0[50:52])[0] - print '52-54 last_data_offset: %i' % struct.unpack('>H', h0[52:54])[0] - + print '48-50 Footnote Offset: %i' % struct.unpack('>H', h0[48:50])[0] + print '50-52 Sidebar Offset: %i' % struct.unpack('>H', h0[50:52])[0] + print '52-54 Last Data Offset: %i' % struct.unpack('>H', h0[52:54])[0] + for i in range(54, 131, 2): print '%i-%i: %i' % (i, i+2, struct.unpack('>H', h0[i:i+2])[0]) - + + print '' + +def ereader_header_info202(h0): + print 'Ereader Record 0 (Header) Info:' + print '' + print '0-2 Version: %i' % struct.unpack('>H', h0[0:2])[0] + print '2-4 Garbage: %i' % struct.unpack('>H', h0[2:4])[0] + print '4-6 Garbage: %i' % struct.unpack('>H', h0[4:6])[0] + print '6-8 Garbage: %i' % struct.unpack('>H', h0[6:8])[0] + print '8-10 Non-Text Offset: %i' % struct.unpack('>H', h0[8:10])[0] + print '10-12: %i' % struct.unpack('>H', h0[10:12])[0] + print '12-14: %i' % struct.unpack('>H', h0[12:14])[0] + print '14-16 Garbage: %i' % struct.unpack('>H', h0[14:16])[0] + print '16-18 Garbage: %i' % struct.unpack('>H', h0[16:18])[0] + print '18-20 Garbage: %i' % struct.unpack('>H', h0[18:20])[0] + print '20-22 Garbage: %i' % struct.unpack('>H', h0[20:22])[0] + print '22-24 Garbage: %i' % struct.unpack('>H', h0[22:24])[0] + print '24-26: %i' % struct.unpack('>H', h0[24:26])[0] + print '26-28: %i' % struct.unpack('>H', h0[26:28])[0] + for i in range(28, 98, 2): + print '%i-%i Garbage: %i' % (i, i+2, struct.unpack('>H', h0[i:i+2])[0]) + print '98-100: 
%i' % struct.unpack('>H', h0[98:100])[0] + for i in range(100, 110, 2): + print '%i-%i Garbage: %i' % (i, i+2, struct.unpack('>H', h0[i:i+2])[0]) + print '110-112: %i' % struct.unpack('>H', h0[110:112])[0] + print '112-114: %i' % struct.unpack('>H', h0[112:114])[0] + print '114-116 Garbage: %i' % struct.unpack('>H', h0[114:116])[0] + for i in range(116, 202, 2): + print '%i-%i: %i' % (i, i+2, struct.unpack('>H', h0[i:i+2])[0]) + + print '' + print '* Garbage: Random values.' print '' + def section_lengths(header): print 'Section Sizes' print '' - + for i in range(0, header.section_count()): size = len(header.section_data(i)) if size > 65505: message = '<--- Over!' else: message = '' - + print 'Section %i: %i %s' % (i, size, message) def main(args=sys.argv): if len(args) < 2: print 'Error: requires input file.' return 1 - + f = open(sys.argv[1], 'rb') - + pheader = PdbHeaderReader(f) - + pdb_header_info(pheader) ereader_header_info(pheader) section_lengths(pheader) - + return 0 if __name__ == '__main__': diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 3afb13f035..7a3298122f 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -8,6 +8,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' +from calibre.ebooks.pdb.ereader import EreaderError from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ereader.reader132 import Reader132 from calibre.ebooks.pdb.ereader.reader202 import Reader202 @@ -22,7 +23,7 @@ class Reader(FormatReader): elif record0_size == 202: self.reader = Reader202(header, stream, log, encoding) else: - raise ValueError('Unknown eReader Header') + raise EreaderError('Size mismatch. eReader header record size %s KB is not supported.' % record0_size) def extract_content(self, output_dir): return self.reader.extract_content(output_dir)