diff --git a/setup.py b/setup.py index 4f11395fda..407b852a57 100644 --- a/setup.py +++ b/setup.py @@ -89,7 +89,7 @@ if __name__ == '__main__': include_dirs=['src/calibre/utils/msdes']), Extension('calibre.plugins.cPalmdoc', - sources=['src/calibre/ebooks/mobi/palmdoc.c']), + sources=['src/calibre/ebooks/compression/palmdoc.c']), PyQtExtension('calibre.plugins.pictureflow', ['src/calibre/gui2/pictureflow/pictureflow.cpp', diff --git a/src/calibre/ebooks/compression/__init__.py b/src/calibre/ebooks/compression/__init__.py new file mode 100644 index 0000000000..9e2aad729c --- /dev/null +++ b/src/calibre/ebooks/compression/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' diff --git a/src/calibre/ebooks/mobi/palmdoc.c b/src/calibre/ebooks/compression/palmdoc.c similarity index 100% rename from src/calibre/ebooks/mobi/palmdoc.c rename to src/calibre/ebooks/compression/palmdoc.c diff --git a/src/calibre/ebooks/mobi/palmdoc.py b/src/calibre/ebooks/compression/palmdoc.py similarity index 100% rename from src/calibre/ebooks/mobi/palmdoc.py rename to src/calibre/ebooks/compression/palmdoc.py diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 96c9e43676..b6893e395d 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -25,15 +25,9 @@ TAG_MAP = { 'div' : 'p', } -STYLE_MAP = { - 'bold' : 'strong', - 'bolder' : 'strong', - 'italic' : 'emphasis', -} - STYLES = [ - 'font-weight', - 'font-style', + ('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}), + ('font-style', {'italic' : 'emphasis'}), ] class FB2MLizer(object): @@ -107,8 +101,9 @@ class FB2MLizer(object): fb2_text += '<%s>' % fb2_tag tag_stack.append(fb2_tag) + # Processes style information for s in STYLES: - style_tag = STYLE_MAP.get(style[s], None) + style_tag = s[1].get(style[s[0]], None) if style_tag: tag_count += 1 fb2_text += '<%s>' % style_tag diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index d6390f2643..edad5fe1f9 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -1,11 +1,17 @@ -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' ''' Read data from .mobi files ''' -import struct, os, cStringIO, re, functools, datetime, textwrap +import datetime +import functools +import os +import re +import struct +import textwrap + +import cStringIO try: from PIL import Image as PILImage @@ -21,8 +27,8 @@ from calibre.ebooks import DRMError from calibre.ebooks.chardet import ENCODING_PATS from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi.huffcdic import HuffReader -from calibre.ebooks.mobi.palmdoc import decompress_doc from calibre.ebooks.mobi.langcodes import main_language, sub_language +from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf2 import OPFCreator, OPF from calibre.ebooks.metadata.toc import TOC @@ -40,8 +46,8 @@ class EXTHHeader(object): while left > 0: left -= 1 - id, size = struct.unpack('>LL', raw[pos:pos+8]) - content = raw[pos+8:pos+size] + id, size = struct.unpack('>LL', raw[pos:pos + 8]) + content = raw[pos + 8:pos + size] pos += size if id >= 100 and id < 200: self.process_metadata(id, content, codec) @@ -87,7 +93,7 @@ class EXTHHeader(object): elif id == 106: try: self.mi.publish_date = datetime.datetime.strptime( - content, '%Y-%m-%d',).date() + content, '%Y-%m-%d', ).date() except: pass elif id == 108: @@ -123,13 +129,13 @@ class BookHeader(object): try: self.codec = { - 1252 : 'cp1252', - 65001 : 'utf-8', - }[self.codepage] + 1252: 'cp1252', + 65001: 'utf-8', + }[self.codepage] except (IndexError, KeyError): self.codec = 'cp1252' if user_encoding is None else user_encoding - log.warn('Unknown codepage %d. Assuming %s'%(self.codepage, - self.codec)) + log.warn('Unknown codepage %d. Assuming %s' % (self.codepage, + self.codec)) if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length: self.extra_flags = 0 else: @@ -147,14 +153,14 @@ class BookHeader(object): self.language = main_language.get(langid, 'ENGLISH') self.sublanguage = sub_language.get(sublangid, 'NEUTRAL') self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0] - self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c+4])[0] + self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0] self.exth_flag, = struct.unpack('>L', raw[0x80:0x84]) self.exth = None if not isinstance(self.title, unicode): self.title = self.title.decode(self.codec, 'replace') if self.exth_flag & 0x40: - self.exth = EXTHHeader(raw[16+self.length:], self.codec, self.title) + self.exth = EXTHHeader(raw[16 + self.length:], self.codec, self.title) self.exth.mi.uid = self.unique_id self.exth.mi.language = self.language @@ -182,7 +188,7 @@ class MetadataHeader(BookHeader): return struct.unpack('>H', self.stream.read(2))[0] def section_offset(self, number): - self.stream.seek(78+number*8) + self.stream.seek(78 + number * 8) return struct.unpack('>LBBBB', self.stream.read(8))[0] def header(self): @@ -242,15 +248,15 @@ class MobiReader(object): self.name = self.header[:32].replace('\x00', '') self.num_sections, = struct.unpack('>H', raw[76:78]) - self.ident = self.header[0x3C:0x3C+8].upper() + self.ident = self.header[0x3C:0x3C + 8].upper() if self.ident not in ['BOOKMOBI', 'TEXTREAD']: - raise MobiError('Unknown book type: %s'%self.ident) + raise MobiError('Unknown book type: %s' % self.ident) self.sections = [] self.section_headers = [] for i in range(self.num_sections): - offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78+i*8:78+i*8+8]) - flags, val = a1, a2<<16 | a3<<8 | a4 + offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8]) + flags, val = a1, a2 << 16 | a3 << 8 | a4 self.section_headers.append((offset, flags, val)) def section(section_number): @@ -266,7 +272,7 @@ class MobiReader(object): self.book_header = BookHeader(self.sections[0][0], self.ident, - user_encoding, self.log) + user_encoding, self.log) self.name = self.name.decode(self.book_header.codec, 'replace') def extract_content(self, output_dir, parse_cache): @@ -279,13 +285,13 @@ class MobiReader(object): parse_cache['calibre_raw_mobi_markup'] = self.mobi_html self.add_anchors() self.processed_html = self.processed_html.decode(self.book_header.codec, - 'ignore') + 'ignore') for pat in ENCODING_PATS: self.processed_html = pat.sub('', self.processed_html) e2u = functools.partial(entity_to_unicode, - exceptions=['lt', 'gt', 'amp', 'apos', 'quot']) + exceptions=['lt', 'gt', 'amp', 'apos', 'quot']) self.processed_html = re.sub(r'&(\S+?);', e2u, - self.processed_html) + self.processed_html) self.extract_images(processed_records, output_dir) self.replace_page_breaks() self.cleanup_html() @@ -295,7 +301,7 @@ class MobiReader(object): if root.xpath('descendant::p/descendant::p'): from lxml.html import soupparser self.log.warning('Markup contains unclosed

tags, parsing using', - 'BeatifulSoup') + 'BeatifulSoup') root = soupparser.fromstring(self.processed_html) if root.tag != 'html': self.log.warn('File does not have opening tag') @@ -346,45 +352,45 @@ class MobiReader(object): fname = self.name.encode('ascii', 'replace') fname = re.sub(r'[\x08\x15\0]+', '', fname) htmlfile = os.path.join(output_dir, - sanitize_file_name(fname)+'.html') + sanitize_file_name(fname) + '.html') try: for ref in guide.xpath('descendant::reference'): if ref.attrib.has_key('href'): - ref.attrib['href'] = os.path.basename(htmlfile)+ref.attrib['href'] + ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href'] except AttributeError: pass parse_cache[htmlfile] = root self.htmlfile = htmlfile ncx = cStringIO.StringIO() opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) - self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf' + self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf' opf.render(open(self.created_opf_path, 'wb'), ncx, - ncx_manifest_entry=ncx_manifest_entry) + ncx_manifest_entry=ncx_manifest_entry) ncx = ncx.getvalue() if ncx: ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx') open(ncx_path, 'wb').write(ncx) with open('styles.css', 'wb') as s: - s.write(self.base_css_rules+'\n\n') + s.write(self.base_css_rules + '\n\n') for cls, rule in self.tag_css_rules.items(): if isinstance(rule, unicode): rule = rule.encode('utf-8') - s.write('.%s { %s }\n\n'%(cls, rule)) + s.write('.%s { %s }\n\n' % (cls, rule)) if self.book_header.exth is not None or self.embedded_mi is not None: self.log.debug('Creating OPF...') ncx = cStringIO.StringIO() opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) - opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx, - ncx_manifest_entry ) + opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx, + ncx_manifest_entry) ncx = ncx.getvalue() if ncx: - open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) + open(os.path.splitext(htmlfile)[0] + '.ncx', 'wb').write(ncx) def read_embedded_metadata(self, root, elem, guide): - raw = ''+html.tostring(elem, encoding='utf-8')+'' + raw = '' + html.tostring(elem, encoding='utf-8') + '' stream = cStringIO.StringIO(raw) opf = OPF(stream) self.embedded_mi = MetaInformation(opf) @@ -394,7 +400,7 @@ class MobiReader(object): href = ref.get('href', '') if href.startswith('#'): href = href[1:] - anchors = root.xpath('//*[@id="%s"]'%href) + anchors = root.xpath('//*[@id="%s"]' % href) if anchors: cpos = anchors[0] reached = False @@ -412,27 +418,27 @@ class MobiReader(object): self.log.debug('Cleaning up HTML...') self.processed_html = re.sub(r'

', '', self.processed_html) if self.book_header.ancient and '')+'' + self.processed_html = '

' + self.processed_html.replace('\n\n', '

') + '' self.processed_html = self.processed_html.replace('\r\n', '\n') self.processed_html = self.processed_html.replace('> <', '>\n<') def upshift_markup(self, root): self.log.debug('Converting style information to CSS...') size_map = { - 'xx-small' : '0.5', - 'x-small' : '1', - 'small' : '2', - 'medium' : '3', - 'large' : '4', - 'x-large' : '5', - 'xx-large' : '6', - } + 'xx-small': '0.5', + 'x-small': '1', + 'small': '2', + 'medium': '3', + 'large': '4', + 'x-large': '5', + 'xx-large': '6', + } mobi_version = self.book_header.mobi_version for i, tag in enumerate(root.iter(etree.Element)): tag.attrib.pop('xmlns', '') if tag.tag in ('country-region', 'place', 'placetype', 'placename', - 'state', 'city', 'street', 'address', 'content'): - tag.tag = 'div' if tag.tag == 'content' else 'span' + 'state', 'city', 'street', 'address', 'content'): + tag.tag = 'div' if tag.tag == 'content' else 'span' for key in tag.attrib.keys(): tag.attrib.pop(key) continue @@ -450,7 +456,7 @@ class MobiReader(object): if width: styles.append('text-indent: %s' % width) if width.startswith('-'): - styles.append('margin-left: %s'%(width[1:])) + styles.append('margin-left: %s' % (width[1:])) if attrib.has_key('align'): align = attrib.pop('align').strip() if align: @@ -502,7 +508,7 @@ class MobiReader(object): cls = sel break if cls is None: - ncls = 'calibre_%d'%i + ncls = 'calibre_%d' % i self.tag_css_rules[ncls] = rule cls = attrib.get('class', '') cls = cls + (' ' if cls else '') + ncls @@ -514,17 +520,17 @@ class MobiReader(object): mi = MetaInformation(self.book_header.title, [_('Unknown')]) opf = OPFCreator(os.path.dirname(htmlfile), mi) if hasattr(self.book_header.exth, 'cover_offset'): - opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1) + opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1) elif mi.cover is not None: opf.cover = mi.cover else: - opf.cover = 'images/%05d.jpg'%1 + opf.cover = 'images/%05d.jpg' % 1 if not os.path.exists(os.path.join(os.path.dirname(htmlfile), - *opf.cover.split('/'))): - opf.cover = None + * opf.cover.split('/'))): + opf.cover = None manifest = [(htmlfile, 'text/x-oeb1-document'), - (os.path.abspath('styles.css'), 'text/css')] + (os.path.abspath('styles.css'), 'text/css')] bp = os.path.dirname(htmlfile) for i in getattr(self, 'image_names', []): manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg')) @@ -541,7 +547,7 @@ class MobiReader(object): ncx_manifest_entry = None if toc: ncx_manifest_entry = 'toc.ncx' - elems = root.xpath('//*[@id="%s"]'%toc.partition('#')[-1]) + elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1]) tocobj = None ent_pat = re.compile(r'&(\S+?);') if elems: @@ -556,12 +562,12 @@ class MobiReader(object): if href and re.match('\w+://', href) is None: try: text = u' '.join([t.strip() for t in \ - x.xpath('descendant::text()')]) + x.xpath('descendant::text()')]) except: text = '' text = ent_pat.sub(entity_to_unicode, text) tocobj.add_item(toc.partition('#')[0], href[1:], - text) + text) if reached and x.get('class', None) == 'mbp_pagebreak': break if tocobj is not None: @@ -599,17 +605,17 @@ class MobiReader(object): def extract_text(self): self.log.debug('Extracting text...') - text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)] - processed_records = list(range(0, self.book_header.records+1)) + text_sections = [self.text_section(i) for i in range(1, self.book_header.records + 1)] + processed_records = list(range(0, self.book_header.records + 1)) self.mobi_html = '' if self.book_header.compression_type == 'DH': huffs = [self.sections[i][0] for i in - range(self.book_header.huff_offset, - self.book_header.huff_offset+self.book_header.huff_number)] + range(self.book_header.huff_offset, + self.book_header.huff_offset + self.book_header.huff_number)] processed_records += list(range(self.book_header.huff_offset, - self.book_header.huff_offset+self.book_header.huff_number)) + self.book_header.huff_offset + self.book_header.huff_number)) huff = HuffReader(huffs) self.mobi_html = huff.decompress(text_sections) @@ -620,7 +626,7 @@ class MobiReader(object): elif self.book_header.compression_type == '\x00\x01': self.mobi_html = ''.join(text_sections) else: - raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type)) + raise MobiError('Unknown compression algorithm: %s' % repr(self.book_header.compression_type)) if self.book_header.ancient and ']+filepos=['"]{0,1}(\d+)[^<>]*>''', - re.IGNORECASE) + re.IGNORECASE) for match in link_pattern.finditer(self.mobi_html): positions.add(int(match.group(1))) pos = 0 @@ -652,10 +658,10 @@ class MobiReader(object): if r > -1 and (r < l or l == end or l == -1): p = self.mobi_html.rfind('<', 0, end + 1) if pos < end and p > -1 and \ - not end_tag_re.match(self.mobi_html[p:r]) and \ - not self.mobi_html[p:r+1].endswith('/>'): - anchor = ' filepos-id="filepos%d"' - end = r + not end_tag_re.match(self.mobi_html[p:r]) and \ + not self.mobi_html[p:r + 1].endswith('/>'): + anchor = ' filepos-id="filepos%d"' + end = r else: end = r + 1 self.processed_html += self.mobi_html[pos:end] + (anchor % oend) @@ -673,7 +679,7 @@ class MobiReader(object): start = getattr(self.book_header, 'first_image_index', -1) if start > self.num_sections or start < 0: # BAEN PRC files have bad headers - start=0 + start = 0 for i in range(start, self.num_sections): if i in processed_records: continue @@ -687,7 +693,7 @@ class MobiReader(object): except IOError: continue - path = os.path.join(output_dir, '%05d.jpg'%image_index) + path = os.path.join(output_dir, '%05d.jpg' % image_index) self.image_names.append(os.path.basename(path)) im.save(open(path, 'wb'), format='JPEG') diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index e16deeccda..1a5a729a6f 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -1,27 +1,32 @@ ''' Write content to Mobipocket books. ''' -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' +from collections import defaultdict +from itertools import count +from itertools import izip +import random +import re from struct import pack import time -import random -from cStringIO import StringIO -import re -from itertools import izip, count -from collections import defaultdict from urlparse import urldefrag + from PIL import Image -from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \ - OEB_RASTER_IMAGES -from calibre.ebooks.oeb.base import namespace, prefixname -from calibre.ebooks.oeb.base import urlnormalize -from calibre.ebooks.mobi.palmdoc import compress_doc +from cStringIO import StringIO from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.ebooks.mobi.mobiml import MBP_NS +from calibre.ebooks.oeb.base import OEB_DOCS +from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES +from calibre.ebooks.oeb.base import XHTML +from calibre.ebooks.oeb.base import XHTML_NS +from calibre.ebooks.oeb.base import XML_NS +from calibre.ebooks.oeb.base import namespace +from calibre.ebooks.oeb.base import prefixname +from calibre.ebooks.oeb.base import urlnormalize +from calibre.ebooks.compression.palmdoc import compress_doc # TODO: # - Allow override CSS (?) @@ -174,7 +179,7 @@ class Serializer(object): item = hrefs[path] if path else None if item and item.spine_position is None: return False - path = item.href if item else base.href + path = item.href if item else base.href href = '#'.join((path, frag)) if frag else path buffer.write('filepos=') self.href_offsets[href].append(buffer.tell()) @@ -211,8 +216,8 @@ class Serializer(object): def serialize_elem(self, elem, item, nsrmap=NSRMAP): buffer = self.buffer if not isinstance(elem.tag, basestring) \ - or namespace(elem.tag) not in nsrmap: - return + or namespace(elem.tag) not in nsrmap: + return tag = prefixname(elem.tag, nsrmap) # Previous layers take care of @name id = elem.attrib.pop('id', None) @@ -221,9 +226,9 @@ class Serializer(object): offset = self.anchor_offset or buffer.tell() self.id_offsets[href] = offset if self.anchor_offset is not None and \ - tag == 'a' and not elem.attrib and \ - not len(elem) and not elem.text: - return + tag == 'a' and not elem.attrib and \ + not len(elem) and not elem.text: + return self.anchor_offset = buffer.tell() buffer.write('<') buffer.write(tag) @@ -286,8 +291,8 @@ class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') def __init__(self, compression=PALMDOC, imagemax=None, - prefer_author_sort=False): - self._compression = compression or UNCOMPRESSED + prefer_author_sort=False): + self._compression = compression or UNCOMPRESSED self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE self._prefer_author_sort = prefer_author_sort @@ -297,7 +302,7 @@ class MobiWriter(object): imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None prefer_author_sort = opts.prefer_author_sort return cls(compression=PALMDOC, imagemax=imagemax, - prefer_author_sort=prefer_author_sort) + prefer_author_sort=prefer_author_sort) def __call__(self, oeb, path): if hasattr(path, 'write'): @@ -305,7 +310,7 @@ class MobiWriter(object): with open(path, 'w+b') as stream: return self._dump_stream(oeb, stream) - def _write(self, *data): + def _write(self, * data): for datum in data: self._stream.write(datum) diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index d8850cfb16..54f3826470 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' @@ -7,17 +6,17 @@ __docformat__ = 'restructuredtext en' class PDBError(Exception): pass - + from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader -from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader +from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader FORMAT_READERS = { - 'PNPdPPrs' : ereader_reader, - 'PNRdPPrs' : ereader_reader, - 'zTXTGPlm' : ztxt_reader, - 'TEXtREAd' : palmdoc_reader, + 'PNPdPPrs': ereader_reader, + 'PNRdPPrs': ereader_reader, + 'zTXTGPlm': ztxt_reader, + 'TEXtREAd': palmdoc_reader, } from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer @@ -25,41 +24,41 @@ from calibre.ebooks.pdb.ztxt.writer import Writer as ztxt_writer from calibre.ebooks.pdb.ereader.writer import Writer as ereader_writer FORMAT_WRITERS = { - 'doc' : palmdoc_writer, - 'ztxt' : ztxt_writer, - 'ereader' : ereader_writer, + 'doc': palmdoc_writer, + 'ztxt': ztxt_writer, + 'ereader': ereader_writer, } IDENTITY_TO_NAME = { - 'PNPdPPrs' : 'eReader', - 'PNRdPPrs' : 'eReader', - 'zTXTGPlm' : 'zTXT', - 'TEXtREAd' : 'PalmDOC', - - '.pdfADBE' : 'Adobe Reader', - 'BVokBDIC' : 'BDicty', - 'DB99DBOS' : 'DB (Database program)', - 'vIMGView' : 'FireViewer (ImageViewer)', - 'PmDBPmDB' : 'HanDBase', - 'InfoINDB' : 'InfoView', - 'ToGoToGo' : 'iSilo', - 'SDocSilX' : 'iSilo 3', - 'JbDbJBas' : 'JFile', - 'JfDbJFil' : 'JFile Pro', - 'DATALSdb' : 'LIST', - 'Mdb1Mdb1' : 'MobileDB', - 'BOOKMOBI' : 'MobiPocket', - 'DataPlkr' : 'Plucker', - 'DataSprd' : 'QuickSheet', - 'SM01SMem' : 'SuperMemo', - 'TEXtTlDc' : 'TealDoc', - 'InfoTlIf' : 'TealInfo', - 'DataTlMl' : 'TealMeal', - 'DataTlPt' : 'TealPaint', - 'dataTDBP' : 'ThinkDB', - 'TdatTide' : 'Tides', - 'ToRaTRPW' : 'TomeRaider', - 'BDOCWrdS' : 'WordSmith', + 'PNPdPPrs': 'eReader', + 'PNRdPPrs': 'eReader', + 'zTXTGPlm': 'zTXT', + 'TEXtREAd': 'PalmDOC', + + '.pdfADBE': 'Adobe Reader', + 'BVokBDIC': 'BDicty', + 'DB99DBOS': 'DB (Database program)', + 'vIMGView': 'FireViewer (ImageViewer)', + 'PmDBPmDB': 'HanDBase', + 'InfoINDB': 'InfoView', + 'ToGoToGo': 'iSilo', + 'SDocSilX': 'iSilo 3', + 'JbDbJBas': 'JFile', + 'JfDbJFil': 'JFile Pro', + 'DATALSdb': 'LIST', + 'Mdb1Mdb1': 'MobileDB', + 'BOOKMOBI': 'MobiPocket', + 'DataPlkr': 'Plucker', + 'DataSprd': 'QuickSheet', + 'SM01SMem': 'SuperMemo', + 'TEXtTlDc': 'TealDoc', + 'InfoTlIf': 'TealInfo', + 'DataTlMl': 'TealMeal', + 'DataTlPt': 'TealPaint', + 'dataTDBP': 'ThinkDB', + 'TdatTide': 'Tides', + 'ToRaTRPW': 'TomeRaider', + 'BDOCWrdS': 'WordSmith', } def get_reader(identity): @@ -67,10 +66,10 @@ def get_reader(identity): Returns None if no reader is found for the identity. ''' return FORMAT_READERS.get(identity, None) - + def get_writer(extension): ''' Returns None if no writer is found for extension. ''' return FORMAT_WRITERS.get(extension, None) - + diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 13429c5a98..7d29ef243c 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -8,16 +8,19 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, re, struct, zlib +import os +import re +import struct +import zlib from calibre import CurrentDir from calibre.ebooks import DRMError -from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.pdb.ereader import EreaderError -from calibre.ebooks.pml.pmlconverter import pml_to_html, \ - footnote_sidebar_to_html -from calibre.ebooks.mobi.palmdoc import decompress_doc from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.compression.palmdoc import decompress_doc +from calibre.ebooks.pdb.ereader import EreaderError +from calibre.ebooks.pdb.formatreader import FormatReader +from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html +from calibre.ebooks.pml.pmlconverter import pml_to_html class HeaderRecord(object): ''' @@ -32,7 +35,7 @@ class HeaderRecord(object): self.non_text_offset, = struct.unpack('>H', raw[12:14]) self.has_metadata, = struct.unpack('>H', raw[24:26]) self.footnote_rec, = struct.unpack('>H', raw[28:30]) - self.sidebar_rec, = struct.unpack('>H', raw[30:32]) + self.sidebar_rec, = struct.unpack('>H', raw[30:32]) self.bookmark_offset, = struct.unpack('>H', raw[32:34]) self.image_data_offset, = struct.unpack('>H', raw[40:42]) self.metadata_offset, = struct.unpack('>H', raw[44:46]) @@ -79,7 +82,7 @@ class Reader(FormatReader): if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: return 'empty', '' data = self.section_data(number) - name = data[4:4+32].strip('\x00') + name = data[4:4 + 32].strip('\x00') img = data[62:] return name, img diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 875aae764a..c8567c93b6 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -8,9 +8,11 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import struct, zlib +import struct +import zlib -import Image, cStringIO +import Image +import cStringIO from calibre.ebooks.pdb.formatwriter import FormatWriter from calibre.ebooks.oeb.base import OEB_IMAGES diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index 48c39fc0ad..0d626b98f6 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement ''' Read the header data from a pdb file. ''' @@ -8,7 +7,9 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import re, struct, time +import re +import struct +import time class PdbHeaderReader(object): @@ -35,16 +36,16 @@ class PdbHeaderReader(object): if number not in range(0, self.num_sections): raise ValueError('Not a valid section number %i' % number) - self.stream.seek(78+number*8) + self.stream.seek(78 + number * 8) offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', self.stream.read(8))[0] - flags, val = a1, a2<<16 | a3<<8 | a4 + flags, val = a1, a2 << 16 | a3 << 8 | a4 return (offset, flags, val) def section_offset(self, number): if number not in range(0, self.num_sections): raise ValueError('Not a valid section number %i' % number) - self.stream.seek(78+number*8) + self.stream.seek(78 + number * 8) return struct.unpack('>LBBBB', self.stream.read(8))[0] def section_data(self, number): diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index ba35a2317e..915ed7d739 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -8,11 +8,13 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, struct, zlib +import os +import struct +from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.mobi.palmdoc import decompress_doc -from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer +from calibre.ebooks.txt.processor import opf_writer +from calibre.ebooks.txt.processor import txt_to_markdown class HeaderRecord(object): ''' @@ -25,15 +27,15 @@ class HeaderRecord(object): def __init__(self, raw): self.compression, = struct.unpack('>H', raw[0:2]) self.num_records, = struct.unpack('>H', raw[8:10]) - - + + class Reader(FormatReader): - + def __init__(self, header, stream, log, encoding=None): self.stream = stream self.log = log self.encoding = encoding - + self.sections = [] for i in range(header.num_sections): self.sections.append(header.section_data(i)) @@ -52,7 +54,7 @@ class Reader(FormatReader): def extract_content(self, output_dir): txt = '' - + self.log.info('Decompressing text...') for i in range(1, self.header_record.num_records + 1): self.log.debug('\tDecompressing text section %i' % i) @@ -62,12 +64,12 @@ class Reader(FormatReader): html = txt_to_markdown(txt) with open(os.path.join(output_dir, 'index.html'), 'wb') as index: index.write(html.encode('utf-8')) - + from calibre.ebooks.metadata.meta import get_metadata mi = get_metadata(self.stream, 'pdb') manifest = [('index.html', None)] spine = ['index.html'] opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) - + return os.path.join(output_dir, 'metadata.opf') diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index e841e69054..6a7d54a586 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -10,10 +10,11 @@ __docformat__ = 'restructuredtext en' import struct +from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.pdb.formatwriter import FormatWriter -from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines -from calibre.ebooks.mobi.palmdoc import compress_doc from calibre.ebooks.pdb.header import PdbHeaderBuilder +from calibre.ebooks.txt.writer import TxtNewlines +from calibre.ebooks.txt.writer import TxtWriter MAX_RECORD_SIZE = 4096 @@ -22,48 +23,48 @@ class Writer(FormatWriter): def __init__(self, opts, log): self.opts = opts self.log = log - + def write_content(self, oeb_book, out_stream, metadata=None): title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') txt_records, txt_length = self._generate_text(oeb_book.spine) header_record = self._header_record(txt_length, len(txt_records)) - + section_lengths = [len(header_record)] self.log.info('Compessing data...') for i in range(0, len(txt_records)): self.log.debug('\tCompressing record %i' % i) txt_records[i] = compress_doc(txt_records[i].encode('utf-8')) section_lengths.append(len(txt_records[i])) - + out_stream.seek(0) hb = PdbHeaderBuilder('TEXtREAd', title) hb.build_header(section_lengths, out_stream) - - for record in [header_record]+txt_records: + + for record in [header_record] + txt_records: out_stream.write(record) - + def _generate_text(self, spine): txt_writer = TxtWriter(TxtNewlines('system').newline, self.log) txt = txt_writer.dump(spine) - + txt_length = len(txt) - + txt_records = [] for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1): - txt_records.append(txt[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]) - + txt_records.append(txt[i * MAX_RECORD_SIZE: (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]) + return txt_records, txt_length - + def _header_record(self, txt_length, record_count): record = '' - + record += struct.pack('>H', 2) # [0:2], PalmDoc compression. (1 = No compression). record += struct.pack('>H', 0) # [2:4], Always 0. record += struct.pack('>L', txt_length) # [4:8], Uncompressed length of the entire text of the book. record += struct.pack('>H', record_count) # [8:10], Number of PDB records used for the text of the book. record += struct.pack('>H', MAX_RECORD_SIZE) # [10-12], Maximum size of each record containing text, always 4096. record += struct.pack('>L', 0) # [12-16], Current reading position, as an offset into the uncompressed text. - + return record - + diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index cdf3bf69e8..c34ada3317 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -8,7 +8,8 @@ __docformat__ = 'restructuredtext en' Transform OEB content into PML markup ''' -import os, re +import os +import re from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer @@ -40,6 +41,31 @@ STYLES = [ ('text-align', {'right' : 'r', 'center' : 'c'}), ] +BLOCK_TAGS = [ + 'p', +] + +BLOCK_STYLES = [ + 'block', +] + +LINK_TAGS = [ + 'a', +] + +SEPARATE_TAGS = [ + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'p', + 'div', + 'li', + 'tr', +] + class PMLMLizer(object): def __init__(self, ignore_tables=False): self.ignore_tables = ignore_tables @@ -104,7 +130,7 @@ class PMLMLizer(object): tag_count = 0 # Are we in a paragraph block? - if tag == 'p' or style['display'] in ('block'): + if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: if 'block' not in tag_stack: tag_count += 1 tag_stack.append('block') @@ -136,7 +162,7 @@ class PMLMLizer(object): # Special processing of tags that require an argument. # Anchors links - if tag == 'a' and 'q' not in tag_stack: + if tag in LINK_TAGS and 'q' not in tag_stack: href = elem.get('href') if href and '://' not in href: if '#' in href: @@ -168,7 +194,7 @@ class PMLMLizer(object): for i in range(0, tag_count): close_tag_list.insert(0, tag_stack.pop()) text += self.close_tags(close_tag_list) - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li', 'tr'): + if tag in SEPARATE_TAGS: text += os.linesep + os.linesep if 'block' not in tag_stack: