Pull from driver-dev

This commit is contained in:
Kovid Goyal 2009-05-25 07:37:49 -07:00
commit c90c086117
33 changed files with 1544 additions and 662 deletions

View File

@ -13,6 +13,7 @@ src/calibre/manual/cli/
build
dist
docs
nbproject/
src/calibre/gui2/pictureflow/Makefile.Debug
src/calibre/gui2/pictureflow/Makefile.Release
src/calibre/gui2/pictureflow/debug/

View File

@ -89,7 +89,7 @@ if __name__ == '__main__':
include_dirs=['src/calibre/utils/msdes']),
Extension('calibre.plugins.cPalmdoc',
sources=['src/calibre/ebooks/mobi/palmdoc.c']),
sources=['src/calibre/ebooks/compression/palmdoc.c']),
PyQtExtension('calibre.plugins.pictureflow',
['src/calibre/gui2/pictureflow/pictureflow.cpp',

View File

@ -1,8 +1,9 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import textwrap, os, glob
import textwrap
import os
import glob
from calibre.customize import FileTypePlugin, MetadataReaderPlugin, MetadataWriterPlugin
from calibre.constants import __version__
@ -39,172 +40,6 @@ every time you add an HTML file to the library.\
return of.name
class OPFMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from OPF files."""

    name = 'Read OPF metadata'
    file_types = set(['opf'])
    description = _('Read metadata from %s files') % 'OPF'

    def get_metadata(self, stream, ftype):
        """Parse ``stream`` as OPF and wrap the result in a MetaInformation.

        The OPF is constructed with the current working directory as its
        second argument. ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.ebooks.metadata import MetaInformation
        parsed_opf = OPF(stream, os.getcwd())
        return MetaInformation(parsed_opf)
class RTFMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from RTF files."""

    name = 'Read RTF metadata'
    file_types = set(['rtf'])
    description = _('Read metadata from %s files') % 'RTF'

    def get_metadata(self, stream, ftype):
        """Return what the RTF metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.rtf import get_metadata as read_metadata
        return read_metadata(stream)
class FB2MetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from FB2 files."""

    name = 'Read FB2 metadata'
    file_types = set(['fb2'])
    description = _('Read metadata from %s files') % 'FB2'

    def get_metadata(self, stream, ftype):
        """Return what the FB2 metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.fb2 import get_metadata as read_metadata
        return read_metadata(stream)
class LRFMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from LRF files."""

    name = 'Read LRF metadata'
    file_types = set(['lrf'])
    description = _('Read metadata from %s files') % 'LRF'

    def get_metadata(self, stream, ftype):
        """Return what ``calibre.ebooks.lrf.meta`` extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.lrf.meta import get_metadata as read_metadata
        return read_metadata(stream)
class PDFMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from PDF files."""

    name = 'Read PDF metadata'
    file_types = set(['pdf'])
    description = _('Read metadata from %s files') % 'PDF'

    def get_metadata(self, stream, ftype):
        """Return what the PDF metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.pdf import get_metadata as read_metadata
        return read_metadata(stream)
class LITMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from LIT files."""

    name = 'Read LIT metadata'
    file_types = set(['lit'])
    description = _('Read metadata from %s files') % 'LIT'

    def get_metadata(self, stream, ftype):
        """Return what the LIT metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.lit import get_metadata as read_metadata
        return read_metadata(stream)
class IMPMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from IMP files."""

    name = 'Read IMP metadata'
    file_types = set(['imp'])
    description = _('Read metadata from %s files') % 'IMP'
    author = 'Ashish Kulkarni'

    def get_metadata(self, stream, ftype):
        """Return what the IMP metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.imp import get_metadata as read_metadata
        return read_metadata(stream)
class RBMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from RB files."""

    name = 'Read RB metadata'
    file_types = set(['rb'])
    description = _('Read metadata from %s files') % 'RB'
    author = 'Ashish Kulkarni'

    def get_metadata(self, stream, ftype):
        """Return what the RB metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.rb import get_metadata as read_metadata
        return read_metadata(stream)
class EPUBMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from EPUB files."""

    name = 'Read EPUB metadata'
    file_types = set(['epub'])
    description = _('Read metadata from %s files') % 'EPUB'

    def get_metadata(self, stream, ftype):
        """Return what the EPUB metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.epub import get_metadata as read_metadata
        return read_metadata(stream)
class HTMLMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from HTML files."""

    name = 'Read HTML metadata'
    file_types = set(['html'])
    description = _('Read metadata from %s files') % 'HTML'

    def get_metadata(self, stream, ftype):
        """Return what the HTML metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.html import get_metadata as read_metadata
        return read_metadata(stream)
class MOBIMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from mobi, prc and azw files."""

    name = 'Read MOBI metadata'
    file_types = set(['mobi', 'prc', 'azw'])
    description = _('Read metadata from %s files') % 'MOBI'

    def get_metadata(self, stream, ftype):
        """Return what ``calibre.ebooks.mobi.reader`` extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.mobi.reader import get_metadata as read_metadata
        return read_metadata(stream)
class TOPAZMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from tpz and azw1 files."""

    name = 'Read Topaz metadata'
    file_types = set(['tpz', 'azw1'])
    # NOTE(review): the description interpolates 'MOBI' although this reader
    # is named Topaz -- confirm whether that is intentional.
    description = _('Read metadata from %s files') % 'MOBI'

    def get_metadata(self, stream, ftype):
        """Return what the topaz metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.topaz import get_metadata as read_metadata
        return read_metadata(stream)
class ODTMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from ODT files."""

    name = 'Read ODT metadata'
    file_types = set(['odt'])
    description = _('Read metadata from %s files') % 'ODT'

    def get_metadata(self, stream, ftype):
        """Return what the ODT metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.odt import get_metadata as read_metadata
        return read_metadata(stream)
class TXTMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from TXT files."""

    name = 'Read TXT metadata'
    file_types = set(['txt'])
    description = _('Read metadata from %s files') % 'TXT'
    author = 'John Schember'

    def get_metadata(self, stream, ftype):
        """Return what the TXT metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.txt import get_metadata as read_metadata
        return read_metadata(stream)
class PDBMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from PDB files."""

    name = 'Read PDB metadata'
    file_types = set(['pdb'])
    description = _('Read metadata from %s files') % 'PDB'
    author = 'John Schember'

    def get_metadata(self, stream, ftype):
        """Return what the PDB metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.pdb import get_metadata as read_metadata
        return read_metadata(stream)
class LRXMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from LRX files."""

    name = 'Read LRX metadata'
    file_types = set(['lrx'])
    description = _('Read metadata from %s files') % 'LRX'

    def get_metadata(self, stream, ftype):
        """Return what the LRX metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.lrx import get_metadata as read_metadata
        return read_metadata(stream)
class ComicMetadataReader(MetadataReaderPlugin):
@ -227,14 +62,127 @@ class ComicMetadataReader(MetadataReaderPlugin):
mi.cover_data = (ext.lower(), data)
return mi
class ZipMetadataReader(MetadataReaderPlugin):
class EPUBMetadataReader(MetadataReaderPlugin):
name = 'Read ZIP metadata'
file_types = set(['zip', 'oebzip'])
description = _('Read metadata from ebooks in ZIP archives')
name = 'Read EPUB metadata'
file_types = set(['epub'])
description = _('Read metadata from %s files')%'EPUB'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.zip import get_metadata
from calibre.ebooks.metadata.epub import get_metadata
return get_metadata(stream)
class FB2MetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from FB2 files."""

    name = 'Read FB2 metadata'
    file_types = set(['fb2'])
    description = _('Read metadata from %s files') % 'FB2'

    def get_metadata(self, stream, ftype):
        """Return what the FB2 metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.fb2 import get_metadata as read_metadata
        return read_metadata(stream)
class HTMLMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from HTML files."""

    name = 'Read HTML metadata'
    file_types = set(['html'])
    description = _('Read metadata from %s files') % 'HTML'

    def get_metadata(self, stream, ftype):
        """Return what the HTML metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.html import get_metadata as read_metadata
        return read_metadata(stream)
class IMPMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from IMP files."""

    name = 'Read IMP metadata'
    file_types = set(['imp'])
    description = _('Read metadata from %s files') % 'IMP'
    author = 'Ashish Kulkarni'

    def get_metadata(self, stream, ftype):
        """Return what the IMP metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.imp import get_metadata as read_metadata
        return read_metadata(stream)
class LITMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from LIT files."""

    name = 'Read LIT metadata'
    file_types = set(['lit'])
    description = _('Read metadata from %s files') % 'LIT'

    def get_metadata(self, stream, ftype):
        """Return what the LIT metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.lit import get_metadata as read_metadata
        return read_metadata(stream)
class LRFMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from LRF files."""

    name = 'Read LRF metadata'
    file_types = set(['lrf'])
    description = _('Read metadata from %s files') % 'LRF'

    def get_metadata(self, stream, ftype):
        """Return what ``calibre.ebooks.lrf.meta`` extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.lrf.meta import get_metadata as read_metadata
        return read_metadata(stream)
class LRXMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from LRX files."""

    name = 'Read LRX metadata'
    file_types = set(['lrx'])
    description = _('Read metadata from %s files') % 'LRX'

    def get_metadata(self, stream, ftype):
        """Return what the LRX metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.lrx import get_metadata as read_metadata
        return read_metadata(stream)
class MOBIMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from mobi, prc and azw files."""

    name = 'Read MOBI metadata'
    file_types = set(['mobi', 'prc', 'azw'])
    description = _('Read metadata from %s files') % 'MOBI'

    def get_metadata(self, stream, ftype):
        """Return what ``calibre.ebooks.mobi.reader`` extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.mobi.reader import get_metadata as read_metadata
        return read_metadata(stream)
class ODTMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from ODT files."""

    name = 'Read ODT metadata'
    file_types = set(['odt'])
    description = _('Read metadata from %s files') % 'ODT'

    def get_metadata(self, stream, ftype):
        """Return what the ODT metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.odt import get_metadata as read_metadata
        return read_metadata(stream)
class OPFMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from OPF files."""

    name = 'Read OPF metadata'
    file_types = set(['opf'])
    description = _('Read metadata from %s files') % 'OPF'

    def get_metadata(self, stream, ftype):
        """Parse ``stream`` as OPF and wrap the result in a MetaInformation.

        The OPF is constructed with the current working directory as its
        second argument. ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.ebooks.metadata import MetaInformation
        parsed_opf = OPF(stream, os.getcwd())
        return MetaInformation(parsed_opf)
class PDBMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from PDB files."""

    name = 'Read PDB metadata'
    file_types = set(['pdb'])
    description = _('Read metadata from %s files') % 'PDB'
    author = 'John Schember'

    def get_metadata(self, stream, ftype):
        """Return what the PDB metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.pdb import get_metadata as read_metadata
        return read_metadata(stream)
class PDFMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from PDF files."""

    name = 'Read PDF metadata'
    file_types = set(['pdf'])
    description = _('Read metadata from %s files') % 'PDF'

    def get_metadata(self, stream, ftype):
        """Return what the PDF metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.pdf import get_metadata as read_metadata
        return read_metadata(stream)
class RARMetadataReader(MetadataReaderPlugin):
@ -247,6 +195,58 @@ class RARMetadataReader(MetadataReaderPlugin):
from calibre.ebooks.metadata.rar import get_metadata
return get_metadata(stream)
class RBMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from RB files."""

    name = 'Read RB metadata'
    file_types = set(['rb'])
    description = _('Read metadata from %s files') % 'RB'
    author = 'Ashish Kulkarni'

    def get_metadata(self, stream, ftype):
        """Return what the RB metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.rb import get_metadata as read_metadata
        return read_metadata(stream)
class RTFMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from RTF files."""

    name = 'Read RTF metadata'
    file_types = set(['rtf'])
    description = _('Read metadata from %s files') % 'RTF'

    def get_metadata(self, stream, ftype):
        """Return what the RTF metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.rtf import get_metadata as read_metadata
        return read_metadata(stream)
class TOPAZMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from tpz and azw1 files."""

    name = 'Read Topaz metadata'
    file_types = set(['tpz', 'azw1'])
    # NOTE(review): the description interpolates 'MOBI' although this reader
    # is named Topaz -- confirm whether that is intentional.
    description = _('Read metadata from %s files') % 'MOBI'

    def get_metadata(self, stream, ftype):
        """Return what the topaz metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.topaz import get_metadata as read_metadata
        return read_metadata(stream)
class TXTMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from TXT files."""

    name = 'Read TXT metadata'
    file_types = set(['txt'])
    description = _('Read metadata from %s files') % 'TXT'
    author = 'John Schember'

    def get_metadata(self, stream, ftype):
        """Return what the TXT metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.txt import get_metadata as read_metadata
        return read_metadata(stream)
class ZipMetadataReader(MetadataReaderPlugin):
    """Plugin that reads metadata from zip and oebzip files."""

    name = 'Read ZIP metadata'
    file_types = set(['zip', 'oebzip'])
    description = _('Read metadata from ebooks in ZIP archives')

    def get_metadata(self, stream, ftype):
        """Return what the zip metadata module extracts from ``stream``.

        ``ftype`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.zip import get_metadata as read_metadata
        return read_metadata(stream)
class EPUBMetadataWriter(MetadataWriterPlugin):
@ -268,16 +268,6 @@ class LRFMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.lrf.meta import set_metadata
set_metadata(stream, mi)
class RTFMetadataWriter(MetadataWriterPlugin):
    """Plugin that writes metadata into RTF files."""

    name = 'Set RTF metadata'
    file_types = set(['rtf'])
    description = _('Set metadata in %s files') % 'RTF'

    def set_metadata(self, stream, mi, type):
        """Pass ``stream`` and ``mi`` to the RTF module's ``set_metadata``.

        ``type`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.rtf import set_metadata as write_metadata
        write_metadata(stream, mi)
class MOBIMetadataWriter(MetadataWriterPlugin):
name = 'Set MOBI metadata'
@ -289,17 +279,6 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.metadata.mobi import set_metadata
set_metadata(stream, mi)
class PDFMetadataWriter(MetadataWriterPlugin):
    """Plugin that writes metadata into PDF files."""

    name = 'Set PDF metadata'
    file_types = set(['pdf'])
    description = _('Set metadata in %s files') % 'PDF'
    author = 'Kovid Goyal'

    def set_metadata(self, stream, mi, type):
        """Pass ``stream`` and ``mi`` to the PDF module's ``set_metadata``.

        ``type`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.pdf import set_metadata as write_metadata
        write_metadata(stream, mi)
class PDBMetadataWriter(MetadataWriterPlugin):
name = 'Set PDB metadata'
@ -311,49 +290,113 @@ class PDBMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.metadata.pdb import set_metadata
set_metadata(stream, mi)
class PDFMetadataWriter(MetadataWriterPlugin):
    """Plugin that writes metadata into PDF files."""

    name = 'Set PDF metadata'
    file_types = set(['pdf'])
    description = _('Set metadata in %s files') % 'PDF'
    author = 'Kovid Goyal'

    def set_metadata(self, stream, mi, type):
        """Pass ``stream`` and ``mi`` to the PDF module's ``set_metadata``.

        ``type`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.pdf import set_metadata as write_metadata
        write_metadata(stream, mi)
class RTFMetadataWriter(MetadataWriterPlugin):
    """Plugin that writes metadata into RTF files."""

    name = 'Set RTF metadata'
    file_types = set(['rtf'])
    description = _('Set metadata in %s files') % 'RTF'

    def set_metadata(self, stream, mi, type):
        """Pass ``stream`` and ``mi`` to the RTF module's ``set_metadata``.

        ``type`` is not used by this implementation.
        """
        from calibre.ebooks.metadata.rtf import set_metadata as write_metadata
        write_metadata(stream, mi)
from calibre.ebooks.comic.input import ComicInput
from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.fb2.input import FB2Input
from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.lit.input import LITInput
from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.odt.input import ODTInput
from calibre.ebooks.pdb.input import PDBInput
from calibre.ebooks.pdf.input import PDFInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.lit.input import LITInput
from calibre.ebooks.fb2.input import FB2Input
from calibre.ebooks.fb2.output import FB2Output
from calibre.ebooks.odt.input import ODTInput
from calibre.ebooks.rtf.input import RTFInput
from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.comic.input import ComicInput
from calibre.web.feeds.input import RecipeInput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.epub.output import EPUBOutput
from calibre.ebooks.mobi.output import MOBIOutput
from calibre.ebooks.pdb.output import PDBOutput
from calibre.ebooks.lrf.output import LRFOutput
from calibre.ebooks.lit.output import LITOutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.pdf.output import PDFOutput
from calibre.ebooks.pml.input import PMLInput
from calibre.ebooks.rb.input import RBInput
from calibre.web.feeds.input import RecipeInput
from calibre.ebooks.rtf.input import RTFInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.epub.output import EPUBOutput
from calibre.ebooks.fb2.output import FB2Output
from calibre.ebooks.lit.output import LITOutput
from calibre.ebooks.lrf.output import LRFOutput
from calibre.ebooks.mobi.output import MOBIOutput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.pdb.output import PDBOutput
from calibre.ebooks.pdf.output import PDFOutput
from calibre.ebooks.pml.output import PMLOutput
from calibre.ebooks.rb.output import RBOutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.customize.profiles import input_profiles, output_profiles
from calibre.devices.bebook.driver import BEBOOK, BEBOOK_MINI
from calibre.devices.blackberry.driver import BLACKBERRY
from calibre.devices.cybookg3.driver import CYBOOKG3
from calibre.devices.eb600.driver import EB600
from calibre.devices.jetbook.driver import JETBOOK
from calibre.devices.kindle.driver import KINDLE
from calibre.devices.kindle.driver import KINDLE2
from calibre.devices.prs500.driver import PRS500
from calibre.devices.prs505.driver import PRS505
from calibre.devices.prs700.driver import PRS700
from calibre.devices.cybookg3.driver import CYBOOKG3
from calibre.devices.kindle.driver import KINDLE
from calibre.devices.kindle.driver import KINDLE2
from calibre.devices.blackberry.driver import BLACKBERRY
from calibre.devices.eb600.driver import EB600
from calibre.devices.jetbook.driver import JETBOOK
from calibre.devices.bebook.driver import BEBOOK, BEBOOK_MINI
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput,
TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput,
FB2Input, FB2Output, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput,
PMLOutput, MOBIOutput, PDBOutput, LRFOutput, LITOutput]
plugins += [PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY,
EB600, JETBOOK, BEBOOK, BEBOOK_MINI]
# Build the plugin registry from scratch, one family at a time; the order
# of the entries within each family is kept as listed.
plugins = []

# Classes imported from the ``*.input`` modules.
plugins.extend([
    ComicInput,
    EPUBInput,
    FB2Input,
    HTMLInput,
    LITInput,
    MOBIInput,
    ODTInput,
    PDBInput,
    PDFInput,
    PMLInput,
    RBInput,
    RecipeInput,
    RTFInput,
    TXTInput,
])

# Classes imported from the ``*.output`` modules.
plugins.extend([
    EPUBOutput,
    FB2Output,
    LITOutput,
    LRFOutput,
    MOBIOutput,
    OEBOutput,
    PDBOutput,
    PDFOutput,
    PMLOutput,
    RBOutput,
    TXTOutput,
])

# Classes imported from the ``calibre.devices.*.driver`` modules.
plugins.extend([
    BEBOOK,
    BEBOOK_MINI,
    BLACKBERRY,
    CYBOOKG3,
    EB600,
    JETBOOK,
    KINDLE,
    KINDLE2,
    PRS500,
    PRS505,
    PRS700,
])
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \

View File

@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

View File

@ -9,8 +9,10 @@ Transform OEB content into FB2 markup
'''
import os
import re
from base64 import b64encode
from calibre import entity_to_unicode
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.base import OEB_IMAGES
@ -25,15 +27,9 @@ TAG_MAP = {
'div' : 'p',
}
STYLE_MAP = {
'bold' : 'strong',
'bolder' : 'strong',
'italic' : 'emphasis',
}
STYLES = [
'font-weight',
'font-style',
('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}),
('font-style', {'italic' : 'emphasis'}),
]
class FB2MLizer(object):
@ -81,7 +77,13 @@ class FB2MLizer(object):
return images
def clean_text(self, text):
return text.replace('&', '')
for entity in set(re.findall('&.+?;', text)):
mo = re.search('(%s)' % entity[1:-1], text)
text = text.replace(entity, entity_to_unicode(mo))
text = text.replace('&', '')
return text
def dump_text(self, elem, stylizer, tag_stack=[]):
if not isinstance(elem.tag, basestring) \
@ -107,8 +109,9 @@ class FB2MLizer(object):
fb2_text += '<%s>' % fb2_tag
tag_stack.append(fb2_tag)
# Processes style information
for s in STYLES:
style_tag = STYLE_MAP.get(style[s], None)
style_tag = s[1].get(style[s[0]], None)
if style_tag:
tag_count += 1
fb2_text += '<%s>' % style_tag

View File

@ -8,11 +8,13 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
import struct
from calibre.ebooks.metadata import MetaInformation, authors_to_string
from calibre.ebooks.pdb.header import PdbHeaderReader, PdbHeaderBuilder
from calibre.ebooks.pdb.ereader.reader import HeaderRecord
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata import authors_to_string
from calibre.ebooks.pdb.ereader.reader132 import HeaderRecord
from calibre.ebooks.pdb.header import PdbHeaderBuilder
from calibre.ebooks.pdb.header import PdbHeaderReader
def get_metadata(stream, extract_cover=True):
"""
@ -20,22 +22,25 @@ def get_metadata(stream, extract_cover=True):
"""
mi = MetaInformation(None, [_('Unknown')])
stream.seek(0)
pheader = PdbHeaderReader(stream)
hr = HeaderRecord(pheader.section_data(0))
if hr.version in (2, 10) and hr.has_metadata == 1:
try:
mdata = pheader.section_data(hr.metadata_offset)
mdata = mdata.split('\x00')
mi.title = mdata[0]
mi.authors = [mdata[1]]
mi.publisher = mdata[3]
mi.isbn = mdata[4]
except:
pass
# Only Dropbook produced 132 byte record0 files are supported
if len(pheader.section_data(0)) == 132:
hr = HeaderRecord(pheader.section_data(0))
if hr.version in (2, 10) and hr.has_metadata == 1:
try:
mdata = pheader.section_data(hr.metadata_offset)
mdata = mdata.split('\x00')
mi.title = mdata[0]
mi.authors = [mdata[1]]
mi.publisher = mdata[3]
mi.isbn = mdata[4]
except:
pass
if not mi.title:
mi.title = pheader.title if pheader.title else _('Unknown')
@ -43,26 +48,31 @@ def get_metadata(stream, extract_cover=True):
def set_metadata(stream, mi):
pheader = PdbHeaderReader(stream)
# Only Dropbook produced 132 byte record0 files are supported
if pheader.section_data(0) != 132:
return
sections = [pheader.section_data(x) for x in range(0, pheader.section_count())]
hr = HeaderRecord(sections[0])
if hr.version not in (2, 10):
return
# Create a metadata record for the file if one does not already exist
if not hr.has_metadata:
sections += ['', 'MeTaInFo\x00']
last_data = len(sections) - 1
for i in range(0, 132, 2):
val, = struct.unpack('>H', sections[0][i:i+2])
val, = struct.unpack('>H', sections[0][i:i + 2])
if val >= hr.last_data_offset:
sections[0][i:i+2] = struct.pack('>H', last_data)
sections[0][i:i + 2] = struct.pack('>H', last_data)
sections[0][24:26] = struct.pack('>H', 1) # Set has metadata
sections[0][44:46] = struct.pack('>H', last_data - 1) # Set location of metadata
sections[0][52:54] = struct.pack('>H', last_data) # Ensure last data offset is updated
# Merge the metadata into the file
file_mi = get_metadata(stream, False)
file_mi.smart_update(mi)
@ -79,4 +89,3 @@ def set_metadata(stream, mi):
# Write the data back to the file
for item in sections:
stream.write(item)

View File

@ -38,7 +38,6 @@ def get_metadata(stream, extract_cover=True):
if MetadataReader is None:
return MetaInformation(pheader.title, [_('Unknown')])
return MetadataReader(stream, extract_cover)
def set_metadata(stream, mi):

View File

@ -1,11 +1,17 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Read data from .mobi files
'''
import struct, os, cStringIO, re, functools, datetime, textwrap
import datetime
import functools
import os
import re
import struct
import textwrap
import cStringIO
try:
from PIL import Image as PILImage
@ -21,8 +27,8 @@ from calibre.ebooks import DRMError
from calibre.ebooks.chardet import ENCODING_PATS
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader
from calibre.ebooks.mobi.palmdoc import decompress_doc
from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
from calibre.ebooks.metadata.toc import TOC
@ -40,8 +46,8 @@ class EXTHHeader(object):
while left > 0:
left -= 1
id, size = struct.unpack('>LL', raw[pos:pos+8])
content = raw[pos+8:pos+size]
id, size = struct.unpack('>LL', raw[pos:pos + 8])
content = raw[pos + 8:pos + size]
pos += size
if id >= 100 and id < 200:
self.process_metadata(id, content, codec)
@ -87,7 +93,7 @@ class EXTHHeader(object):
elif id == 106:
try:
self.mi.publish_date = datetime.datetime.strptime(
content, '%Y-%m-%d',).date()
content, '%Y-%m-%d', ).date()
except:
pass
elif id == 108:
@ -123,13 +129,13 @@ class BookHeader(object):
try:
self.codec = {
1252 : 'cp1252',
65001 : 'utf-8',
}[self.codepage]
1252: 'cp1252',
65001: 'utf-8',
}[self.codepage]
except (IndexError, KeyError):
self.codec = 'cp1252' if user_encoding is None else user_encoding
log.warn('Unknown codepage %d. Assuming %s'%(self.codepage,
self.codec))
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec))
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length:
self.extra_flags = 0
else:
@ -147,14 +153,14 @@ class BookHeader(object):
self.language = main_language.get(langid, 'ENGLISH')
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c+4])[0]
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None
if not isinstance(self.title, unicode):
self.title = self.title.decode(self.codec, 'replace')
if self.exth_flag & 0x40:
self.exth = EXTHHeader(raw[16+self.length:], self.codec, self.title)
self.exth = EXTHHeader(raw[16 + self.length:], self.codec, self.title)
self.exth.mi.uid = self.unique_id
self.exth.mi.language = self.language
@ -182,7 +188,7 @@ class MetadataHeader(BookHeader):
return struct.unpack('>H', self.stream.read(2))[0]
def section_offset(self, number):
self.stream.seek(78+number*8)
self.stream.seek(78 + number * 8)
return struct.unpack('>LBBBB', self.stream.read(8))[0]
def header(self):
@ -242,15 +248,15 @@ class MobiReader(object):
self.name = self.header[:32].replace('\x00', '')
self.num_sections, = struct.unpack('>H', raw[76:78])
self.ident = self.header[0x3C:0x3C+8].upper()
self.ident = self.header[0x3C:0x3C + 8].upper()
if self.ident not in ['BOOKMOBI', 'TEXTREAD']:
raise MobiError('Unknown book type: %s'%self.ident)
raise MobiError('Unknown book type: %s' % self.ident)
self.sections = []
self.section_headers = []
for i in range(self.num_sections):
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78+i*8:78+i*8+8])
flags, val = a1, a2<<16 | a3<<8 | a4
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
flags, val = a1, a2 << 16 | a3 << 8 | a4
self.section_headers.append((offset, flags, val))
def section(section_number):
@ -266,7 +272,7 @@ class MobiReader(object):
self.book_header = BookHeader(self.sections[0][0], self.ident,
user_encoding, self.log)
user_encoding, self.log)
self.name = self.name.decode(self.book_header.codec, 'replace')
def extract_content(self, output_dir, parse_cache):
@ -279,13 +285,13 @@ class MobiReader(object):
parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
self.add_anchors()
self.processed_html = self.processed_html.decode(self.book_header.codec,
'ignore')
'ignore')
for pat in ENCODING_PATS:
self.processed_html = pat.sub('', self.processed_html)
e2u = functools.partial(entity_to_unicode,
exceptions=['lt', 'gt', 'amp', 'apos', 'quot'])
exceptions=['lt', 'gt', 'amp', 'apos', 'quot'])
self.processed_html = re.sub(r'&(\S+?);', e2u,
self.processed_html)
self.processed_html)
self.extract_images(processed_records, output_dir)
self.replace_page_breaks()
self.cleanup_html()
@ -295,7 +301,7 @@ class MobiReader(object):
if root.xpath('descendant::p/descendant::p'):
from lxml.html import soupparser
self.log.warning('Markup contains unclosed <p> tags, parsing using',
'BeatifulSoup')
'BeatifulSoup')
root = soupparser.fromstring(self.processed_html)
if root.tag != 'html':
self.log.warn('File does not have opening <html> tag')
@ -346,45 +352,45 @@ class MobiReader(object):
fname = self.name.encode('ascii', 'replace')
fname = re.sub(r'[\x08\x15\0]+', '', fname)
htmlfile = os.path.join(output_dir,
sanitize_file_name(fname)+'.html')
sanitize_file_name(fname) + '.html')
try:
for ref in guide.xpath('descendant::reference'):
if ref.attrib.has_key('href'):
ref.attrib['href'] = os.path.basename(htmlfile)+ref.attrib['href']
ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
except AttributeError:
pass
parse_cache[htmlfile] = root
self.htmlfile = htmlfile
ncx = cStringIO.StringIO()
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf'
self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
opf.render(open(self.created_opf_path, 'wb'), ncx,
ncx_manifest_entry=ncx_manifest_entry)
ncx_manifest_entry=ncx_manifest_entry)
ncx = ncx.getvalue()
if ncx:
ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
open(ncx_path, 'wb').write(ncx)
with open('styles.css', 'wb') as s:
s.write(self.base_css_rules+'\n\n')
s.write(self.base_css_rules + '\n\n')
for cls, rule in self.tag_css_rules.items():
if isinstance(rule, unicode):
rule = rule.encode('utf-8')
s.write('.%s { %s }\n\n'%(cls, rule))
s.write('.%s { %s }\n\n' % (cls, rule))
if self.book_header.exth is not None or self.embedded_mi is not None:
self.log.debug('Creating OPF...')
ncx = cStringIO.StringIO()
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx,
ncx_manifest_entry )
opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
ncx_manifest_entry)
ncx = ncx.getvalue()
if ncx:
open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
open(os.path.splitext(htmlfile)[0] + '.ncx', 'wb').write(ncx)
def read_embedded_metadata(self, root, elem, guide):
raw = '<package>'+html.tostring(elem, encoding='utf-8')+'</package>'
raw = '<package>' + html.tostring(elem, encoding='utf-8') + '</package>'
stream = cStringIO.StringIO(raw)
opf = OPF(stream)
self.embedded_mi = MetaInformation(opf)
@ -394,7 +400,7 @@ class MobiReader(object):
href = ref.get('href', '')
if href.startswith('#'):
href = href[1:]
anchors = root.xpath('//*[@id="%s"]'%href)
anchors = root.xpath('//*[@id="%s"]' % href)
if anchors:
cpos = anchors[0]
reached = False
@ -412,26 +418,26 @@ class MobiReader(object):
self.log.debug('Cleaning up HTML...')
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>'
self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>'
self.processed_html = self.processed_html.replace('\r\n', '\n')
self.processed_html = self.processed_html.replace('> <', '>\n<')
def upshift_markup(self, root):
self.log.debug('Converting style information to CSS...')
size_map = {
'xx-small' : '0.5',
'x-small' : '1',
'small' : '2',
'medium' : '3',
'large' : '4',
'x-large' : '5',
'xx-large' : '6',
}
'xx-small': '0.5',
'x-small': '1',
'small': '2',
'medium': '3',
'large': '4',
'x-large': '5',
'xx-large': '6',
}
mobi_version = self.book_header.mobi_version
for i, tag in enumerate(root.iter(etree.Element)):
tag.attrib.pop('xmlns', '')
if tag.tag in ('country-region', 'place', 'placetype', 'placename',
'state', 'city', 'street', 'address', 'content'):
'state', 'city', 'street', 'address', 'content'):
tag.tag = 'div' if tag.tag == 'content' else 'span'
for key in tag.attrib.keys():
tag.attrib.pop(key)
@ -450,7 +456,7 @@ class MobiReader(object):
if width:
styles.append('text-indent: %s' % width)
if width.startswith('-'):
styles.append('margin-left: %s'%(width[1:]))
styles.append('margin-left: %s' % (width[1:]))
if attrib.has_key('align'):
align = attrib.pop('align').strip()
if align:
@ -502,7 +508,7 @@ class MobiReader(object):
cls = sel
break
if cls is None:
ncls = 'calibre_%d'%i
ncls = 'calibre_%d' % i
self.tag_css_rules[ncls] = rule
cls = attrib.get('class', '')
cls = cls + (' ' if cls else '') + ncls
@ -514,17 +520,17 @@ class MobiReader(object):
mi = MetaInformation(self.book_header.title, [_('Unknown')])
opf = OPFCreator(os.path.dirname(htmlfile), mi)
if hasattr(self.book_header.exth, 'cover_offset'):
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1)
elif mi.cover is not None:
opf.cover = mi.cover
else:
opf.cover = 'images/%05d.jpg'%1
opf.cover = 'images/%05d.jpg' % 1
if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
*opf.cover.split('/'))):
* opf.cover.split('/'))):
opf.cover = None
manifest = [(htmlfile, 'text/x-oeb1-document'),
(os.path.abspath('styles.css'), 'text/css')]
(os.path.abspath('styles.css'), 'text/css')]
bp = os.path.dirname(htmlfile)
for i in getattr(self, 'image_names', []):
manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg'))
@ -541,7 +547,7 @@ class MobiReader(object):
ncx_manifest_entry = None
if toc:
ncx_manifest_entry = 'toc.ncx'
elems = root.xpath('//*[@id="%s"]'%toc.partition('#')[-1])
elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
tocobj = None
ent_pat = re.compile(r'&(\S+?);')
if elems:
@ -556,12 +562,12 @@ class MobiReader(object):
if href and re.match('\w+://', href) is None:
try:
text = u' '.join([t.strip() for t in \
x.xpath('descendant::text()')])
x.xpath('descendant::text()')])
except:
text = ''
text = ent_pat.sub(entity_to_unicode, text)
tocobj.add_item(toc.partition('#')[0], href[1:],
text)
text)
if reached and x.get('class', None) == 'mbp_pagebreak':
break
if tocobj is not None:
@ -599,17 +605,17 @@ class MobiReader(object):
def extract_text(self):
self.log.debug('Extracting text...')
text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)]
processed_records = list(range(0, self.book_header.records+1))
text_sections = [self.text_section(i) for i in range(1, self.book_header.records + 1)]
processed_records = list(range(0, self.book_header.records + 1))
self.mobi_html = ''
if self.book_header.compression_type == 'DH':
huffs = [self.sections[i][0] for i in
range(self.book_header.huff_offset,
self.book_header.huff_offset+self.book_header.huff_number)]
range(self.book_header.huff_offset,
self.book_header.huff_offset + self.book_header.huff_number)]
processed_records += list(range(self.book_header.huff_offset,
self.book_header.huff_offset+self.book_header.huff_number))
self.book_header.huff_offset + self.book_header.huff_number))
huff = HuffReader(huffs)
self.mobi_html = huff.decompress(text_sections)
@ -620,7 +626,7 @@ class MobiReader(object):
elif self.book_header.compression_type == '\x00\x01':
self.mobi_html = ''.join(text_sections)
else:
raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))
raise MobiError('Unknown compression algorithm: %s' % repr(self.book_header.compression_type))
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
self.mobi_html = self.mobi_html.replace('\0', '')
@ -636,7 +642,7 @@ class MobiReader(object):
self.log.debug('Adding anchors...')
positions = set([])
link_pattern = re.compile(r'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
re.IGNORECASE)
re.IGNORECASE)
for match in link_pattern.finditer(self.mobi_html):
positions.add(int(match.group(1)))
pos = 0
@ -652,10 +658,10 @@ class MobiReader(object):
if r > -1 and (r < l or l == end or l == -1):
p = self.mobi_html.rfind('<', 0, end + 1)
if pos < end and p > -1 and \
not end_tag_re.match(self.mobi_html[p:r]) and \
not self.mobi_html[p:r+1].endswith('/>'):
anchor = ' filepos-id="filepos%d"'
end = r
not end_tag_re.match(self.mobi_html[p:r]) and \
not self.mobi_html[p:r + 1].endswith('/>'):
anchor = ' filepos-id="filepos%d"'
end = r
else:
end = r + 1
self.processed_html += self.mobi_html[pos:end] + (anchor % oend)
@ -673,7 +679,7 @@ class MobiReader(object):
start = getattr(self.book_header, 'first_image_index', -1)
if start > self.num_sections or start < 0:
# BAEN PRC files have bad headers
start=0
start = 0
for i in range(start, self.num_sections):
if i in processed_records:
continue
@ -687,7 +693,7 @@ class MobiReader(object):
except IOError:
continue
path = os.path.join(output_dir, '%05d.jpg'%image_index)
path = os.path.join(output_dir, '%05d.jpg' % image_index)
self.image_names.append(os.path.basename(path))
im.save(open(path, 'wb'), format='JPEG')

View File

@ -1,27 +1,32 @@
'''
Write content to Mobipocket books.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam>'
from collections import defaultdict
from itertools import count
from itertools import izip
import random
import re
from struct import pack
import time
import random
from cStringIO import StringIO
import re
from itertools import izip, count
from collections import defaultdict
from urlparse import urldefrag
from PIL import Image
from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \
OEB_RASTER_IMAGES
from calibre.ebooks.oeb.base import namespace, prefixname
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.mobi.palmdoc import compress_doc
from cStringIO import StringIO
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.ebooks.mobi.mobiml import MBP_NS
from calibre.ebooks.oeb.base import OEB_DOCS
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
from calibre.ebooks.oeb.base import XHTML
from calibre.ebooks.oeb.base import XHTML_NS
from calibre.ebooks.oeb.base import XML_NS
from calibre.ebooks.oeb.base import namespace
from calibre.ebooks.oeb.base import prefixname
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.compression.palmdoc import compress_doc
# TODO:
# - Allow override CSS (?)
@ -174,7 +179,7 @@ class Serializer(object):
item = hrefs[path] if path else None
if item and item.spine_position is None:
return False
path = item.href if item else base.href
path = item.href if item else base.href
href = '#'.join((path, frag)) if frag else path
buffer.write('filepos=')
self.href_offsets[href].append(buffer.tell())
@ -211,8 +216,8 @@ class Serializer(object):
def serialize_elem(self, elem, item, nsrmap=NSRMAP):
buffer = self.buffer
if not isinstance(elem.tag, basestring) \
or namespace(elem.tag) not in nsrmap:
return
or namespace(elem.tag) not in nsrmap:
return
tag = prefixname(elem.tag, nsrmap)
# Previous layers take care of @name
id = elem.attrib.pop('id', None)
@ -221,9 +226,9 @@ class Serializer(object):
offset = self.anchor_offset or buffer.tell()
self.id_offsets[href] = offset
if self.anchor_offset is not None and \
tag == 'a' and not elem.attrib and \
not len(elem) and not elem.text:
return
tag == 'a' and not elem.attrib and \
not len(elem) and not elem.text:
return
self.anchor_offset = buffer.tell()
buffer.write('<')
buffer.write(tag)
@ -286,7 +291,7 @@ class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def __init__(self, compression=PALMDOC, imagemax=None,
prefer_author_sort=False):
prefer_author_sort=False):
self._compression = compression or UNCOMPRESSED
self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE
self._prefer_author_sort = prefer_author_sort
@ -297,7 +302,7 @@ class MobiWriter(object):
imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None
prefer_author_sort = opts.prefer_author_sort
return cls(compression=PALMDOC, imagemax=imagemax,
prefer_author_sort=prefer_author_sort)
prefer_author_sort=prefer_author_sort)
def __call__(self, oeb, path):
if hasattr(path, 'write'):
@ -305,7 +310,7 @@ class MobiWriter(object):
with open(path, 'w+b') as stream:
return self._dump_stream(oeb, stream)
def _write(self, *data):
def _write(self, * data):
for datum in data:
self._stream.write(datum)

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
@ -7,17 +6,17 @@ __docformat__ = 'restructuredtext en'
class PDBError(Exception):
pass
from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader
from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader
from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader
from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader
FORMAT_READERS = {
'PNPdPPrs' : ereader_reader,
'PNRdPPrs' : ereader_reader,
'zTXTGPlm' : ztxt_reader,
'TEXtREAd' : palmdoc_reader,
'PNPdPPrs': ereader_reader,
'PNRdPPrs': ereader_reader,
'zTXTGPlm': ztxt_reader,
'TEXtREAd': palmdoc_reader,
}
from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer
@ -25,41 +24,41 @@ from calibre.ebooks.pdb.ztxt.writer import Writer as ztxt_writer
from calibre.ebooks.pdb.ereader.writer import Writer as ereader_writer
FORMAT_WRITERS = {
'doc' : palmdoc_writer,
'ztxt' : ztxt_writer,
'ereader' : ereader_writer,
'doc': palmdoc_writer,
'ztxt': ztxt_writer,
'ereader': ereader_writer,
}
IDENTITY_TO_NAME = {
'PNPdPPrs' : 'eReader',
'PNRdPPrs' : 'eReader',
'zTXTGPlm' : 'zTXT',
'TEXtREAd' : 'PalmDOC',
'.pdfADBE' : 'Adobe Reader',
'BVokBDIC' : 'BDicty',
'DB99DBOS' : 'DB (Database program)',
'vIMGView' : 'FireViewer (ImageViewer)',
'PmDBPmDB' : 'HanDBase',
'InfoINDB' : 'InfoView',
'ToGoToGo' : 'iSilo',
'SDocSilX' : 'iSilo 3',
'JbDbJBas' : 'JFile',
'JfDbJFil' : 'JFile Pro',
'DATALSdb' : 'LIST',
'Mdb1Mdb1' : 'MobileDB',
'BOOKMOBI' : 'MobiPocket',
'DataPlkr' : 'Plucker',
'DataSprd' : 'QuickSheet',
'SM01SMem' : 'SuperMemo',
'TEXtTlDc' : 'TealDoc',
'InfoTlIf' : 'TealInfo',
'DataTlMl' : 'TealMeal',
'DataTlPt' : 'TealPaint',
'dataTDBP' : 'ThinkDB',
'TdatTide' : 'Tides',
'ToRaTRPW' : 'TomeRaider',
'BDOCWrdS' : 'WordSmith',
'PNPdPPrs': 'eReader',
'PNRdPPrs': 'eReader',
'zTXTGPlm': 'zTXT',
'TEXtREAd': 'PalmDOC',
'.pdfADBE': 'Adobe Reader',
'BVokBDIC': 'BDicty',
'DB99DBOS': 'DB (Database program)',
'vIMGView': 'FireViewer (ImageViewer)',
'PmDBPmDB': 'HanDBase',
'InfoINDB': 'InfoView',
'ToGoToGo': 'iSilo',
'SDocSilX': 'iSilo 3',
'JbDbJBas': 'JFile',
'JfDbJFil': 'JFile Pro',
'DATALSdb': 'LIST',
'Mdb1Mdb1': 'MobileDB',
'BOOKMOBI': 'MobiPocket',
'DataPlkr': 'Plucker',
'DataSprd': 'QuickSheet',
'SM01SMem': 'SuperMemo',
'TEXtTlDc': 'TealDoc',
'InfoTlIf': 'TealInfo',
'DataTlMl': 'TealMeal',
'DataTlPt': 'TealPaint',
'dataTDBP': 'ThinkDB',
'TdatTide': 'Tides',
'ToRaTRPW': 'TomeRaider',
'BDOCWrdS': 'WordSmith',
}
def get_reader(identity):
@ -67,10 +66,10 @@ def get_reader(identity):
Returns None if no reader is found for the identity.
'''
return FORMAT_READERS.get(identity, None)
def get_writer(extension):
    '''
    Look up the writer registered in FORMAT_WRITERS for ``extension``.

    Returns None if no writer is found for extension.
    '''
    if extension in FORMAT_WRITERS:
        return FORMAT_WRITERS[extension]
    return None

View File

@ -7,10 +7,27 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import struct, sys
import struct
import sys
from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb.ereader.reader import HeaderRecord
def ereader_header_info(header):
h0 = header.section_data(0)
print 'Header Size: %s' % len(h0)
if len(h0) == 132:
print 'Header Type: Dropbook compatible'
print ''
ereader_header_info132(h0)
elif len(h0) == 202:
print 'Header Type: Makebook compatible'
print ''
ereader_header_info202(h0)
else:
raise EreaderError('Size mismatch. eReader header record size %i KB is not supported.' % len(h0))
def pdb_header_info(header):
print 'PDB Header Info:'
@ -20,70 +37,101 @@ def pdb_header_info(header):
print 'Title: %s' % header.title
print ''
def ereader_header_info(header):
h0 = header.section_data(0)
def ereader_header_info132(h0):
print 'Ereader Record 0 (Header) Info:'
print ''
print '0-2 Version: %i' % struct.unpack('>H', h0[0:2])[0]
print '2-4: %i' % struct.unpack('>H', h0[2:4])[0]
print '4-6: %i' % struct.unpack('>H', h0[4:6])[0]
print '6-8: %i' % struct.unpack('>H', h0[6:8])[0]
print '6-8 Codepage: %i' % struct.unpack('>H', h0[6:8])[0]
print '8-10: %i' % struct.unpack('>H', h0[8:10])[0]
print '10-12: %i' % struct.unpack('>H', h0[10:12])[0]
print '12-14 Non-Text: %i' % struct.unpack('>H', h0[12:14])[0]
print '12-14 Non-Text offset: %i' % struct.unpack('>H', h0[12:14])[0]
print '14-16: %i' % struct.unpack('>H', h0[14:16])[0]
print '16-18: %i' % struct.unpack('>H', h0[16:18])[0]
print '18-20: %i' % struct.unpack('>H', h0[18:20])[0]
print '20-22: %i' % struct.unpack('>H', h0[20:22])[0]
print '20-22 Image Count: %i' % struct.unpack('>H', h0[20:22])[0]
print '22-24: %i' % struct.unpack('>H', h0[22:24])[0]
print '24-26: %i' % struct.unpack('>H', h0[24:26])[0]
print '24-26 Has Metadata?: %i' % struct.unpack('>H', h0[24:26])[0]
print '26-28: %i' % struct.unpack('>H', h0[26:28])[0]
print '28-30 footnote_rec: %i' % struct.unpack('>H', h0[28:30])[0]
print '30-32 sidebar_rec: %i' % struct.unpack('>H', h0[30:32])[0]
print '32-34 bookmark_offset: %i' % struct.unpack('>H', h0[32:34])[0]
print '34-36: %i' % struct.unpack('>H', h0[34:36])[0]
print '28-30 Footnote Count: %i' % struct.unpack('>H', h0[28:30])[0]
print '30-32 Sidebar Count: %i' % struct.unpack('>H', h0[30:32])[0]
print '32-34 Bookmark Offset: %i' % struct.unpack('>H', h0[32:34])[0]
print '34-36 MAGIC: %i' % struct.unpack('>H', h0[34:36])[0]
print '36-38: %i' % struct.unpack('>H', h0[36:38])[0]
print '38-40: %i' % struct.unpack('>H', h0[38:40])[0]
print '40-42 image_data_offset: %i' % struct.unpack('>H', h0[40:42])[0]
print '40-42 Image Data Offset: %i' % struct.unpack('>H', h0[40:42])[0]
print '42-44: %i' % struct.unpack('>H', h0[42:44])[0]
print '44-46 metadata_offset: %i' % struct.unpack('>H', h0[44:46])[0]
print '44-46 Metadata Offset: %i' % struct.unpack('>H', h0[44:46])[0]
print '46-48: %i' % struct.unpack('>H', h0[46:48])[0]
print '48-50 footnote_offset: %i' % struct.unpack('>H', h0[48:50])[0]
print '50-52 sidebar_offset: %i' % struct.unpack('>H', h0[50:52])[0]
print '52-54 last_data_offset: %i' % struct.unpack('>H', h0[52:54])[0]
print '48-50 Footnote Offset: %i' % struct.unpack('>H', h0[48:50])[0]
print '50-52 Sidebar Offset: %i' % struct.unpack('>H', h0[50:52])[0]
print '52-54 Last Data Offset: %i' % struct.unpack('>H', h0[52:54])[0]
for i in range(54, 131, 2):
print '%i-%i: %i' % (i, i+2, struct.unpack('>H', h0[i:i+2])[0])
print ''
def ereader_header_info202(h0):
print 'Ereader Record 0 (Header) Info:'
print ''
print '0-2 Version: %i' % struct.unpack('>H', h0[0:2])[0]
print '2-4 Garbage: %i' % struct.unpack('>H', h0[2:4])[0]
print '4-6 Garbage: %i' % struct.unpack('>H', h0[4:6])[0]
print '6-8 Garbage: %i' % struct.unpack('>H', h0[6:8])[0]
print '8-10 Non-Text Offset: %i' % struct.unpack('>H', h0[8:10])[0]
print '10-12: %i' % struct.unpack('>H', h0[10:12])[0]
print '12-14: %i' % struct.unpack('>H', h0[12:14])[0]
print '14-16 Garbage: %i' % struct.unpack('>H', h0[14:16])[0]
print '16-18 Garbage: %i' % struct.unpack('>H', h0[16:18])[0]
print '18-20 Garbage: %i' % struct.unpack('>H', h0[18:20])[0]
print '20-22 Garbage: %i' % struct.unpack('>H', h0[20:22])[0]
print '22-24 Garbage: %i' % struct.unpack('>H', h0[22:24])[0]
print '24-26: %i' % struct.unpack('>H', h0[24:26])[0]
print '26-28: %i' % struct.unpack('>H', h0[26:28])[0]
for i in range(28, 98, 2):
print '%i-%i Garbage: %i' % (i, i+2, struct.unpack('>H', h0[i:i+2])[0])
print '98-100: %i' % struct.unpack('>H', h0[98:100])[0]
for i in range(100, 110, 2):
print '%i-%i Garbage: %i' % (i, i+2, struct.unpack('>H', h0[i:i+2])[0])
print '110-112: %i' % struct.unpack('>H', h0[110:112])[0]
print '112-114: %i' % struct.unpack('>H', h0[112:114])[0]
print '114-116 Garbage: %i' % struct.unpack('>H', h0[114:116])[0]
for i in range(116, 202, 2):
print '%i-%i: %i' % (i, i+2, struct.unpack('>H', h0[i:i+2])[0])
print ''
print '* Garbage: Random values.'
print ''
def section_lengths(header):
print 'Section Sizes'
print ''
for i in range(0, header.section_count()):
size = len(header.section_data(i))
if size > 65505:
message = '<--- Over!'
else:
message = ''
print 'Section %i: %i %s' % (i, size, message)
def main(args=sys.argv):
if len(args) < 2:
print 'Error: requires input file.'
return 1
f = open(sys.argv[1], 'rb')
pheader = PdbHeaderReader(f)
pdb_header_info(pheader)
ereader_header_info(pheader)
section_lengths(pheader)
return 0
if __name__ == '__main__':

View File

@ -8,183 +8,28 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, re, struct, zlib
from calibre import CurrentDir
from calibre.ebooks import DRMError
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pml.pmlconverter import pml_to_html, \
footnote_sidebar_to_html
from calibre.ebooks.mobi.palmdoc import decompress_doc
from calibre.ebooks.metadata.opf2 import OPFCreator
class HeaderRecord(object):
'''
The first record in the file is always the header record. It holds
information related to the location of text, images, and so on
in the file. This is used in conjunction with the sections
defined in the file header.
'''
def __init__(self, raw):
self.version, = struct.unpack('>H', raw[0:2])
self.non_text_offset, = struct.unpack('>H', raw[12:14])
self.has_metadata, = struct.unpack('>H', raw[24:26])
self.footnote_rec, = struct.unpack('>H', raw[28:30])
self.sidebar_rec, = struct.unpack('>H', raw[30:32])
self.bookmark_offset, = struct.unpack('>H', raw[32:34])
self.image_data_offset, = struct.unpack('>H', raw[40:42])
self.metadata_offset, = struct.unpack('>H', raw[44:46])
self.footnote_offset, = struct.unpack('>H', raw[48:50])
self.sidebar_offset, = struct.unpack('>H', raw[50:52])
self.last_data_offset, = struct.unpack('>H', raw[52:54])
self.num_text_pages = self.non_text_offset - 1
self.num_image_pages = self.metadata_offset - self.image_data_offset
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ereader.reader132 import Reader132
from calibre.ebooks.pdb.ereader.reader202 import Reader202
class Reader(FormatReader):
def __init__(self, header, stream, log, encoding=None):
self.log = log
self.encoding = encoding
record0_size = len(header.section_data(0))
self.sections = []
for i in range(header.num_sections):
self.sections.append(header.section_data(i))
self.header_record = HeaderRecord(self.section_data(0))
if self.header_record.version not in (2, 10):
if self.header_record.version in (260, 272):
raise DRMError('eReader DRM is not supported.')
else:
raise EreaderError('Unknown book version %i.' % self.header_record.version)
from calibre.ebooks.metadata.pdb import get_metadata
self.mi = get_metadata(stream, False)
def section_data(self, number):
return self.sections[number]
def decompress_text(self, number):
if self.header_record.version == 2:
return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
if self.header_record.version == 10:
return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
def get_image(self, number):
if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
return 'empty', ''
data = self.section_data(number)
name = data[4:4+32].strip('\x00')
img = data[62:]
return name, img
def get_text_page(self, number):
'''
Only palmdoc and zlib compressed are supported. The text is
assumed to be encoded as Windows-1252. The encoding is part of
the eReader file spec and should always be this encoding.
'''
if number not in range(1, self.header_record.num_text_pages + 1):
return ''
return self.decompress_text(number)
if record0_size == 132:
self.reader = Reader132(header, stream, log, encoding)
elif record0_size == 202:
self.reader = Reader202(header, stream, log, encoding)
else:
raise EreaderError('Size mismatch. eReader header record size %s KB is not supported.' % record0_size)
def extract_content(self, output_dir):
output_dir = os.path.abspath(output_dir)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
html = u'<html><head><title></title></head><body>'
for i in range(1, self.header_record.num_text_pages + 1):
self.log.debug('Extracting text page %i' % i)
html += pml_to_html(self.get_text_page(i))
if self.header_record.footnote_rec > 0:
html += '<br /><h1>%s</h1>' % _('Footnotes')
footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)):
self.log.debug('Extracting footnote page %i' % i)
html += '<dl>'
html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
html += '</dl>'
if self.header_record.sidebar_rec > 0:
html += '<br /><h1>%s</h1>' % _('Sidebar')
sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)):
self.log.debug('Extracting sidebar page %i' % i)
html += '<dl>'
html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i))
html += '</dl>'
html += '</body></html>'
with CurrentDir(output_dir):
with open('index.html', 'wb') as index:
self.log.debug('Writing text to index.html')
index.write(html.encode('utf-8'))
if not os.path.exists(os.path.join(output_dir, 'images/')):
os.makedirs(os.path.join(output_dir, 'images/'))
images = []
with CurrentDir(os.path.join(output_dir, 'images/')):
for i in range(0, self.header_record.num_image_pages):
name, img = self.get_image(self.header_record.image_data_offset + i)
images.append(name)
with open(name, 'wb') as imgf:
self.log.debug('Writing image %s to images/' % name)
imgf.write(img)
opf_path = self.create_opf(output_dir, images)
return opf_path
def create_opf(self, output_dir, images):
with CurrentDir(output_dir):
opf = OPFCreator(output_dir, self.mi)
manifest = [('index.html', None)]
for i in images:
manifest.append((os.path.join('images/', i), None))
opf.create_manifest(manifest)
opf.create_spine(['index.html'])
with open('metadata.opf', 'wb') as opffile:
opf.render(opffile)
return os.path.join(output_dir, 'metadata.opf')
return self.reader.extract_content(output_dir)
def dump_pml(self):
'''
This is primarily used for debugging and 3rd party tools to
get the plm markup that comprises the text in the file.
'''
pml = ''
for i in range(1, self.header_record.num_text_pages + 1):
pml += self.get_text_page(i)
return pml
def dump_images(self, output_dir):
'''
This is primarily used for debugging and 3rd party tools to
get the images in the file.
'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
with CurrentDir(output_dir):
for i in range(0, self.header_record.num_image_pages):
name, img = self.get_image(self.header_record.image_data_offset + i)
with open(name, 'wb') as imgf:
imgf.write(img)
return self.reader.dump_pml()
def dump_images(self, output_dir):
    '''
    Write the images of the book into ``output_dir`` by delegating to the
    wrapped reader.

    The wrapped readers' dump_images() take the target directory as a
    required argument, so it is accepted here and passed through.
    '''
    return self.reader.dump_images(output_dir)

View File

@ -0,0 +1,192 @@
# -*- coding: utf-8 -*-
'''
Read content from ereader pdb file with a 132 byte header created by Dropbook.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
import re
import struct
import zlib
from calibre import CurrentDir
from calibre.ebooks import DRMError
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html
from calibre.ebooks.pml.pmlconverter import pml_to_html
class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.

    Every field is a big-endian unsigned 16-bit value read from a fixed
    byte offset of ``raw``.
    '''

    def __init__(self, raw):
        # (attribute name, byte offset) for each field that is parsed.
        layout = (
            ('version', 0),
            ('non_text_offset', 12),
            ('has_metadata', 24),
            ('footnote_rec', 28),
            ('sidebar_rec', 30),
            ('bookmark_offset', 32),
            ('image_data_offset', 40),
            ('metadata_offset', 44),
            ('footnote_offset', 48),
            ('sidebar_offset', 50),
            ('last_data_offset', 52),
        )
        for attr, offset in layout:
            setattr(self, attr, struct.unpack('>H', raw[offset:offset + 2])[0])

        # Derived counts.
        self.num_text_pages = self.non_text_offset - 1
        self.num_image_pages = self.metadata_offset - self.image_data_offset
class Reader132(FormatReader):
    '''
    Reader for eReader PDB books whose header record is parsed by the
    HeaderRecord class of this module.
    '''

    def __init__(self, header, stream, log, encoding=None):
        self.log = log
        self.encoding = encoding

        # Keep the raw data of every section so it can be fetched by index.
        self.sections = [header.section_data(n) for n in range(header.num_sections)]

        self.header_record = HeaderRecord(self.section_data(0))

        version = self.header_record.version
        if version in (260, 272):
            raise DRMError('eReader DRM is not supported.')
        if version not in (2, 10):
            raise EreaderError('Unknown book version %i.' % version)

        from calibre.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)

    def section_data(self, number):
        '''Return the raw data of section ``number``.'''
        return self.sections[number]

    def decompress_text(self, number):
        '''
        Decompress and decode section ``number``.

        Version 2 books are decompressed with decompress_doc, version 10
        books with zlib. The result is decoded with self.encoding, or
        cp1252 when no encoding was given. Returns None for any other
        version.
        '''
        version = self.header_record.version
        if version == 2:
            inflate = decompress_doc
        elif version == 10:
            inflate = zlib.decompress
        else:
            return None
        codec = 'cp1252' if self.encoding is None else self.encoding
        return inflate(self.section_data(number)).decode(codec)

    def get_image(self, number):
        '''
        Return (name, data) for the image stored in section ``number``, or
        ('empty', '') when the section lies outside the image sections.
        '''
        first = self.header_record.image_data_offset
        last = first + self.header_record.num_image_pages - 1
        if first <= number <= last:
            data = self.section_data(number)
            # NUL padded name at bytes 4-36; image payload from byte 62 on.
            return data[4:4 + 32].strip('\x00'), data[62:]
        return 'empty', ''

    def get_text_page(self, number):
        '''
        Only palmdoc and zlib compressed are supported. The text is
        assumed to be encoded as Windows-1252. The encoding is part of
        the eReader file spec and should always be this encoding.

        Returns '' when ``number`` is not a text page.
        '''
        if number in range(1, self.header_record.num_text_pages + 1):
            return self.decompress_text(number)
        return ''

    def _append_notes(self, html, heading, index_rec, count, log_template):
        '''
        Append a headed block of footnote/sidebar entries to ``html`` and
        return the extended markup.

        Section ``index_rec`` holds the NUL terminated ids; the entries
        themselves are the sections after it, up to index_rec + count.
        '''
        html += '<br /><h1>%s</h1>' % heading
        codec = 'cp1252' if self.encoding is None else self.encoding
        ids = re.findall('\w+(?=\x00)', self.section_data(index_rec).decode(codec))
        first = index_rec + 1
        for rec in range(first, index_rec + count):
            self.log.debug(log_template % rec)
            html += '<dl>'
            html += footnote_sidebar_to_html(ids[rec - first], self.decompress_text(rec))
            html += '</dl>'
        return html

    def extract_content(self, output_dir):
        '''
        Write index.html, the images (under images/) and metadata.opf into
        ``output_dir`` and return the path of the OPF file.
        '''
        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        record = self.header_record

        html = u'<html><head><title></title></head><body>'

        for page in range(1, record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % page)
            html += pml_to_html(self.get_text_page(page))

        if record.footnote_rec > 0:
            html = self._append_notes(html, _('Footnotes'),
                record.footnote_offset, record.footnote_rec,
                'Extracting footnote page %i')

        if record.sidebar_rec > 0:
            html = self._append_notes(html, _('Sidebar'),
                record.sidebar_offset, record.sidebar_rec,
                'Extracting sidebar page %i')

        html += '</body></html>'

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))

        images_dir = os.path.join(output_dir, 'images/')
        if not os.path.exists(images_dir):
            os.makedirs(images_dir)

        images = []
        with CurrentDir(images_dir):
            first = record.image_data_offset
            for number in range(first, first + record.num_image_pages):
                name, img = self.get_image(number)
                images.append(name)
                with open(name, 'wb') as imgf:
                    self.log.debug('Writing image %s to images/' % name)
                    imgf.write(img)

        return self.create_opf(output_dir, images)

    def create_opf(self, output_dir, images):
        '''
        Write metadata.opf into ``output_dir`` with index.html and the
        given image names in the manifest, and return its path.
        '''
        with CurrentDir(output_dir):
            opf = OPFCreator(output_dir, self.mi)

            manifest = [('index.html', None)]
            manifest.extend((os.path.join('images/', name), None) for name in images)

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)

        return os.path.join(output_dir, 'metadata.opf')

    def dump_pml(self):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the pml markup that comprises the text in the file.
        '''
        pages = range(1, self.header_record.num_text_pages + 1)
        return ''.join(self.get_text_page(page) for page in pages)

    def dump_images(self, output_dir):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the images in the file.
        '''
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with CurrentDir(output_dir):
            first = self.header_record.image_data_offset
            for number in range(first, first + self.header_record.num_image_pages):
                name, img = self.get_image(number)
                with open(name, 'wb') as imgf:
                    imgf.write(img)

View File

@ -0,0 +1,157 @@
# -*- coding: utf-8 -*-
'''
Read content from ereader pdb file with a 202 byte header created by Makebook.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
import struct
from calibre import CurrentDir
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.pml.pmlconverter import pml_to_html
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ereader import EreaderError
class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.

    Only the version (bytes 0-2) and the non-text offset (bytes 8-10) are
    read, both as big-endian unsigned 16-bit values.
    '''

    def __init__(self, raw):
        def word(offset):
            # Big-endian unsigned 16-bit value starting at ``offset``.
            return struct.unpack('>H', raw[offset:offset + 2])[0]

        self.version = word(0)
        self.non_text_offset = word(8)

        self.num_text_pages = self.non_text_offset - 1
class Reader202(FormatReader):
def __init__(self, header, stream, log, encoding=None):
self.log = log
self.encoding = encoding
self.sections = []
for i in range(header.num_sections):
self.sections.append(header.section_data(i))
self.header_record = HeaderRecord(self.section_data(0))
if self.header_record.version != 4:
raise EreaderError('Unknown book version %i.' % self.header_record.version)
from calibre.ebooks.metadata.pdb import get_metadata
self.mi = get_metadata(stream, False)
def section_data(self, number):
return self.sections[number]
def decompress_text(self, number):
return decompress_doc(''.join([chr(ord(x) ^ 0xA5) for x in self.section_data(number)])).decode('cp1252' if self.encoding is None else self.encoding)
def get_image(self, number):
name = None
img = None
data = self.section_data(number)
if data.startswith('PNG'):
name = data[4:4 + 32].strip('\x00')
img = data[62:]
return name, img
def get_text_page(self, number):
'''
Only palmdoc compression is supported. The text is xored with 0xA5 and
assumed to be encoded as Windows-1252. The encoding is part of
the eReader file spec and should always be this encoding.
'''
if number not in range(1, self.header_record.num_text_pages + 1):
return ''
return self.decompress_text(number)
def extract_content(self, output_dir):
output_dir = os.path.abspath(output_dir)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
html = u'<html><head><title></title></head><body>'
for i in range(1, self.header_record.num_text_pages + 1):
self.log.debug('Extracting text page %i' % i)
html += pml_to_html(self.get_text_page(i))
html += '</body></html>'
with CurrentDir(output_dir):
with open('index.html', 'wb') as index:
self.log.debug('Writing text to index.html')
index.write(html.encode('utf-8'))
if not os.path.exists(os.path.join(output_dir, 'images/')):
os.makedirs(os.path.join(output_dir, 'images/'))
images = []
with CurrentDir(os.path.join(output_dir, 'images/')):
for i in range(self.header_record.non_text_offset, len(self.sections)):
name, img = self.get_image(i)
if name:
images.append(name)
with open(name, 'wb') as imgf:
self.log.debug('Writing image %s to images/' % name)
imgf.write(img)
opf_path = self.create_opf(output_dir, images)
return opf_path
def create_opf(self, output_dir, images):
with CurrentDir(output_dir):
opf = OPFCreator(output_dir, self.mi)
manifest = [('index.html', None)]
for i in images:
manifest.append((os.path.join('images/', i), None))
opf.create_manifest(manifest)
opf.create_spine(['index.html'])
with open('metadata.opf', 'wb') as opffile:
opf.render(opffile)
return os.path.join(output_dir, 'metadata.opf')
def dump_pml(self):
'''
This is primarily used for debugging and 3rd party tools to
get the plm markup that comprises the text in the file.
'''
pml = ''
for i in range(1, self.header_record.num_text_pages + 1):
pml += self.get_text_page(i)
return pml
def dump_images(self, output_dir):
    '''
    Write every image stored in the file into ``output_dir``.

    Intended mainly for debugging and for third party tools. The
    directory is created when missing. The sections examined are the
    ``header_record.num_image_pages`` sections starting at
    ``header_record.image_data_offset``.
    '''
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    first_image = self.header_record.image_data_offset
    with CurrentDir(output_dir):
        for i in range(0, self.header_record.num_image_pages):
            name, img = self.get_image(first_image + i)
            # A section without a usable name cannot be written to disk;
            # skip it the same way extract_content does instead of
            # failing inside open().
            if not name:
                continue
            with open(name, 'wb') as imgf:
                imgf.write(img)

View File

@ -8,9 +8,11 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import struct, zlib
import struct
import zlib
import Image, cStringIO
import Image
import cStringIO
from calibre.ebooks.pdb.formatwriter import FormatWriter
from calibre.ebooks.oeb.base import OEB_IMAGES
@ -25,62 +27,62 @@ IDENTITY = 'PNRdPPrs'
MAX_RECORD_SIZE = 3560
class Writer(FormatWriter):
def __init__(self, opts, log):
    # Keep the conversion options and the logger for the other methods.
    self.opts, self.log = opts, log
def write_content(self, oeb_book, out_stream, metadata=None):
    '''
    Serialise ``oeb_book`` to ``out_stream``.

    The sections are written in this order: header record, text
    records, image records, metadata record and a final
    ``'MeTaInFo\\x00'`` marker. The PDB header is produced by
    ``PdbHeaderBuilder`` from the section lengths; the part of the
    metadata record before its first NUL byte is passed to it as the
    second argument.
    '''
    text_records = self._text(oeb_book)
    image_records = self._images(oeb_book.manifest)
    metadata_record = self._metadata(metadata)
    header_record = self._header_record(len(text_records), len(image_records))

    sections = [header_record]
    sections.extend(text_records)
    sections.extend(image_records)
    sections.append(metadata_record)
    sections.append('MeTaInFo\x00')

    builder = PdbHeaderBuilder(IDENTITY, metadata_record.partition('\x00')[0])
    builder.build_header([len(section) for section in sections], out_stream)

    for section in sections:
        out_stream.write(section)
def _text(self, oeb_book):
    '''
    Return the book text as a list of zlib compressed PML pages.

    The markup produced by ``PMLMLizer`` is encoded as cp1252
    (unencodable characters are replaced) and cut into slices of
    ``MAX_RECORD_SIZE`` bytes, each compressed on its own.
    '''
    pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables)
    markup = unicode(pmlmlizer.extract_content(oeb_book, self.opts))
    pml = markup.encode('cp1252', 'replace')

    # Floor division, exactly what the integer "/" did here. The "+ 1"
    # means an exact multiple of MAX_RECORD_SIZE ends with an empty page.
    page_count = (len(pml) // MAX_RECORD_SIZE) + 1
    starts = [page * MAX_RECORD_SIZE for page in range(0, page_count)]
    return [zlib.compress(pml[start:start + MAX_RECORD_SIZE]) for start in starts]
def _images(self, manifest):
    '''
    Return one image record per image item in ``manifest``.

    A record is a 62 byte NUL padded header (``'PNG '`` followed by the
    result of ``image_name``) and the image itself, converted to
    palette mode, shrunk to fit 300x300 and saved as PNG. Records of
    65505 bytes or more are left out.
    '''
    records = []
    for item in manifest:
        if item.media_type not in OEB_IMAGES:
            continue

        header = ('PNG ' + image_name(item.href)).ljust(62, '\x00')

        im = Image.open(cStringIO.StringIO(item.data)).convert('P')
        im.thumbnail((300,300), Image.ANTIALIAS)
        png = cStringIO.StringIO()
        im.save(png, 'PNG')

        record = header + png.getvalue()
        if len(record) < 65505:
            records.append(record)

    return records
def _metadata(self, metadata):
'''
Metadata takes the form:
@ -90,14 +92,14 @@ class Writer(FormatWriter):
publisher\x00
isbn\x00
'''
title = _('Unknown')
author = _('Unknown')
copyright = ''
publisher = ''
isbn = ''
if metadata != None:
if metadata:
if len(metadata.title) >= 1:
title = metadata.title[0].value
if len(metadata.creator) >= 1:
@ -117,7 +119,7 @@ class Writer(FormatWriter):
'''
version = 10 # Zlib compression
non_text_offset = text_items + 1
if image_items > 0:
image_data_offset = text_items + 1
meta_data_offset = image_data_offset + image_items
@ -126,9 +128,9 @@ class Writer(FormatWriter):
meta_data_offset = text_items + 1
last_data_offset = meta_data_offset + 1
image_data_offset = last_data_offset
record = ''
record += struct.pack('>H', version) # [0:2] # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
record += struct.pack('>H', 0) # [2:4]
record += struct.pack('>H', 0) # [4:6]
@ -159,6 +161,6 @@ class Writer(FormatWriter):
for i in range(54, 132, 2):
record += struct.pack('>H', 0) # [54:132]
return record

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
'''
Read the header data from a pdb file.
'''
@ -8,7 +7,9 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re, struct, time
import re
import struct
import time
class PdbHeaderReader(object):
@ -35,16 +36,16 @@ class PdbHeaderReader(object):
if number not in range(0, self.num_sections):
raise ValueError('Not a valid section number %i' % number)
self.stream.seek(78+number*8)
self.stream.seek(78 + number * 8)
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', self.stream.read(8))[0]
flags, val = a1, a2<<16 | a3<<8 | a4
flags, val = a1, a2 << 16 | a3 << 8 | a4
return (offset, flags, val)
def section_offset(self, number):
    '''
    Return the offset stored for section ``number``.

    The 8 byte entry at stream position ``78 + number * 8`` is read and
    its first field, a big-endian unsigned 32 bit integer, is returned.
    Raises ``ValueError`` when ``number`` is not in
    ``range(0, self.num_sections)``.
    '''
    if number not in range(0, self.num_sections):
        raise ValueError('Not a valid section number %i' % number)

    entry_position = 78 + number * 8
    self.stream.seek(entry_position)
    entry = self.stream.read(8)
    return struct.unpack('>LBBBB', entry)[0]
def section_data(self, number):

View File

@ -8,11 +8,13 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, struct, zlib
import os
import struct
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.mobi.palmdoc import decompress_doc
from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer
from calibre.ebooks.txt.processor import opf_writer
from calibre.ebooks.txt.processor import txt_to_markdown
class HeaderRecord(object):
'''
@ -25,15 +27,15 @@ class HeaderRecord(object):
def __init__(self, raw):
    # compression: big-endian unsigned short held in bytes 0-1 of raw.
    # num_records: big-endian unsigned short held in bytes 8-9 of raw.
    self.compression = struct.unpack('>H', raw[0:2])[0]
    self.num_records = struct.unpack('>H', raw[8:10])[0]
class Reader(FormatReader):
def __init__(self, header, stream, log, encoding=None):
self.stream = stream
self.log = log
self.encoding = encoding
self.sections = []
for i in range(header.num_sections):
self.sections.append(header.section_data(i))
@ -52,7 +54,7 @@ class Reader(FormatReader):
def extract_content(self, output_dir):
txt = ''
self.log.info('Decompressing text...')
for i in range(1, self.header_record.num_records + 1):
self.log.debug('\tDecompressing text section %i' % i)
@ -62,12 +64,12 @@ class Reader(FormatReader):
html = txt_to_markdown(txt)
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8'))
from calibre.ebooks.metadata.meta import get_metadata
mi = get_metadata(self.stream, 'pdb')
manifest = [('index.html', None)]
spine = ['index.html']
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
return os.path.join(output_dir, 'metadata.opf')

View File

@ -10,10 +10,11 @@ __docformat__ = 'restructuredtext en'
import struct
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.pdb.formatwriter import FormatWriter
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines
from calibre.ebooks.mobi.palmdoc import compress_doc
from calibre.ebooks.pdb.header import PdbHeaderBuilder
from calibre.ebooks.txt.writer import TxtNewlines
from calibre.ebooks.txt.writer import TxtWriter
MAX_RECORD_SIZE = 4096
@ -22,48 +23,48 @@ class Writer(FormatWriter):
def __init__(self, opts, log):
    # Keep the conversion options and the logger for the other methods.
    self.opts, self.log = opts, log
def write_content(self, oeb_book, out_stream, metadata=None):
    '''
    Write ``oeb_book`` to ``out_stream`` as a PDB file with the
    ``'TEXtREAd'`` identity.

    The title is ``self.opts.title`` when set, otherwise the first
    title in the book metadata, otherwise the translated string
    'Unknown'. Each text record is UTF-8 encoded and passed through
    ``compress_doc`` before the PDB header is built from the section
    lengths. ``metadata`` is accepted but not used by this method.
    '''
    if self.opts.title:
        title = self.opts.title
    elif oeb_book.metadata.title != []:
        title = oeb_book.metadata.title[0].value
    else:
        title = _('Unknown')

    txt_records, txt_length = self._generate_text(oeb_book.spine)
    header_record = self._header_record(txt_length, len(txt_records))

    section_lengths = [len(header_record)]
    self.log.info('Compressing data...')
    for i, record in enumerate(txt_records):
        self.log.debug('\tCompressing record %i' % i)
        # Replace the record in place; the list length does not change.
        txt_records[i] = compress_doc(record.encode('utf-8'))
        section_lengths.append(len(txt_records[i]))

    out_stream.seek(0)
    hb = PdbHeaderBuilder('TEXtREAd', title)
    hb.build_header(section_lengths, out_stream)

    for record in [header_record] + txt_records:
        out_stream.write(record)
def _generate_text(self, spine):
    '''
    Return ``(txt_records, txt_length)`` for the given spine.

    The text comes from ``TxtWriter.dump`` using the 'system' newline
    setting and is cut into slices of ``MAX_RECORD_SIZE`` items;
    ``txt_length`` is the length of the uncut text.
    '''
    txt_writer = TxtWriter(TxtNewlines('system').newline, self.log)
    txt = txt_writer.dump(spine)

    # NOTE(review): slicing and len() work on whatever dump() returns; if
    # that is unicode the sizes are characters, not bytes - confirm.
    # Floor division, exactly what the integer "/" did here.
    record_count = (len(txt) // MAX_RECORD_SIZE) + 1
    txt_records = [txt[n * MAX_RECORD_SIZE:(n * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]
                   for n in range(0, record_count)]

    return txt_records, len(txt)
def _header_record(self, txt_length, record_count):
    '''
    Build the 16 byte header record placed before the text records.

    All fields are big-endian; see the per-field notes below.
    '''
    fields = [
        ('>H', 2),                # [0:2], PalmDoc compression. (1 = No compression).
        ('>H', 0),                # [2:4], Always 0.
        ('>L', txt_length),       # [4:8], Uncompressed length of the entire text of the book.
        ('>H', record_count),     # [8:10], Number of PDB records used for the text of the book.
        ('>H', MAX_RECORD_SIZE),  # [10:12], Maximum size of each record containing text, always 4096.
        ('>L', 0),                # [12:16], Current reading position, as an offset into the uncompressed text.
    ]
    return ''.join([struct.pack(fmt, value) for fmt, value in fields])

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
@ -23,11 +22,13 @@ class PDFInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
html = pdftohtml(stream.name)
if self._preprocess_html_for_viewer:
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
prepro = HTMLPreProcessor(lambda x:x, False)
html = prepro(html.decode('utf-8')).encode('utf-8')
with open('index.html', 'wb') as index:
index.write(html)

View File

@ -1,12 +1,14 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, ' \
'2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import errno, os, sys, subprocess
import errno
import os
import sys
import subprocess
from functools import partial
from calibre.ebooks import ConversionError, DRMError

View File

@ -8,7 +8,8 @@ __docformat__ = 'restructuredtext en'
Transform OEB content into PML markup
'''
import os, re
import os
import re
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
@ -40,6 +41,31 @@ STYLES = [
('text-align', {'right' : 'r', 'center' : 'c'}),
]
BLOCK_TAGS = [
'p',
]
BLOCK_STYLES = [
'block',
]
LINK_TAGS = [
'a',
]
SEPARATE_TAGS = [
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'p',
'div',
'li',
'tr',
]
class PMLMLizer(object):
def __init__(self, ignore_tables=False):
self.ignore_tables = ignore_tables
@ -62,7 +88,7 @@ class PMLMLizer(object):
def add_page_anchor(self, href):
href = os.path.splitext(os.path.basename(href))[0]
return '\\Q="%s"' % href
return u'\\Q="%s"' % href
def clean_text(self, text):
# Remove excess spaces at beginning and end of lines
@ -82,9 +108,10 @@ class PMLMLizer(object):
links = set(re.findall(r'(?<=\\q="#).+?(?=")', text))
for unused in anchors.difference(links):
text = text.replace('\\Q="%s"' % unused, '')
for entity in set(re.findall('&.+?;', text)):
text = text.replace(entity, entity_to_unicode(entity[1:-1]))
mo = re.search('(%s)' % entity[1:-1], text)
text = text.replace(entity, entity_to_unicode(mo))
return text
@ -104,7 +131,7 @@ class PMLMLizer(object):
tag_count = 0
# Are we in a paragraph block?
if tag == 'p' or style['display'] in ('block'):
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
if 'block' not in tag_stack:
tag_count += 1
tag_stack.append('block')
@ -136,7 +163,7 @@ class PMLMLizer(object):
# Special processing of tags that require an argument.
# Anchors links
if tag == 'a' and 'q' not in tag_stack:
if tag in LINK_TAGS and 'q' not in tag_stack:
href = elem.get('href')
if href and '://' not in href:
if '#' in href:
@ -168,7 +195,7 @@ class PMLMLizer(object):
for i in range(0, tag_count):
close_tag_list.insert(0, tag_stack.pop())
text += self.close_tags(close_tag_list)
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li', 'tr'):
if tag in SEPARATE_TAGS:
text += os.linesep + os.linesep
if 'block' not in tag_stack:

View File

@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
HEADER = '\xb0\x0c\xb0\x0c\x02\x00NUVO\x00\x00\x00\x00'
class RocketBookError(Exception):
    '''Error raised for problems specific to RocketBook (RB) files.'''
def unique_name(name, used_names):
    '''
    Return a file name based on ``name`` that is not in ``used_names``.

    Only the base name of ``name`` is considered. It is returned
    unchanged when it is shorter than 32 characters and not yet used.
    Otherwise the result is ``NNNN-<stem><.ext>``: a zero padded four
    digit counter, at most 22 characters of the name without its
    extension, and at most three characters of extension, which keeps
    the result below 32 characters. The first free counter value wins;
    if all 9999 candidates are taken the last one tried is returned.
    '''
    name = os.path.basename(name)
    if len(name) < 32 and name not in used_names:
        return name

    stem, ext = os.path.splitext(name)
    stem = stem[:22]
    # splitext keeps the leading dot; drop it before trimming to three
    # characters and only re-add a dot when there is an extension.
    ext = ext[1:4]
    suffix = '.%s' % ext if ext else ''

    for i in range(0, 9999):
        # rjust takes the width first and the fill character second.
        name = '%s-%s%s' % (str(i).rjust(4, '0'), stem, suffix)
        if name not in used_names:
            break
    return name

View File

@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.ebooks.rb.reader import Reader
from calibre.customize.conversion import InputFormatPlugin
class RBInput(InputFormatPlugin):
    '''Input plugin that hands RB files to the RB ``Reader``.'''

    name        = 'RB Input'
    author      = 'John Schember'
    description = 'Convert RB files to HTML'
    file_types  = set(['rb'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Extract ``stream`` into the current working directory and
        return the value of ``Reader.extract_content`` (the OPF path).
        ``file_ext`` and ``accelerators`` are not used here.
        '''
        return Reader(stream, log, options.input_encoding).extract_content(os.getcwd())

View File

@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin
from calibre.ebooks.rb.writer import RBWriter
class RBOutput(OutputFormatPlugin):
    '''Output plugin that writes an OEB book as an RB file via ``RBWriter``.'''

    name = 'RB Output'
    author = 'John Schember'
    file_type = 'rb'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write ``oeb_book`` to ``output_path``.

        ``output_path`` is either a file name or an object with a
        ``write`` method. For a file name, missing parent directories
        are created and the file is opened (and later closed) here; a
        stream passed in by the caller is left open. The target is
        truncated before writing. ``input_plugin`` is not used.
        '''
        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            writer = RBWriter(opts, log)

            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Release the file we opened even when writing fails.
            if close:
                out_stream.close()

View File

@ -0,0 +1,166 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
'''
Transform OEB content into RB compatible markup.
'''
import os
import re
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
TAGS = [
'b',
'big',
'blockquote',
'br',
'center',
'code',
'div',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'hr',
'i',
'li',
'ol',
'p',
'pre',
'small',
'sub',
'sup',
'ul',
]
LINK_TAGS = [
'a',
]
STYLES = [
('font-weight', {'bold' : 'b', 'bolder' : 'b'}),
('font-style', {'italic' : 'i'}),
('text-align', {'center' : 'center'}),
]
class RBMLizer(object):
    '''
    Turn the XHTML of an OEB book into the upper-case HTML subset
    written into RB files.
    '''

    def __init__(self, name_map=None, ignore_tables=False):
        '''
        ``name_map`` maps the base name of an image ``src`` to the name
        to emit in ``<IMG SRC=...>``. A mapping passed in is used as is
        (not copied). ``ignore_tables`` is only stored; the methods of
        this class do not consult it.
        '''
        # None sentinel: a literal {} default would be one dict shared by
        # every instance created without an explicit name_map.
        self.name_map = {} if name_map is None else name_map
        self.ignore_tables = ignore_tables

    def extract_content(self, oeb_book, opts):
        '''Convert the whole spine of ``oeb_book`` and return the markup.'''
        oeb_book.logger.info('Converting XHTML to RB markup...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.mlize_spine()

    def mlize_spine(self):
        '''
        Return one document holding every spine item, each preceded by
        a page anchor, with anchors that nothing links to removed.
        '''
        output = u'<HTML><HEAD><TITLE></TITLE></HEAD><BODY>'
        for item in self.oeb_book.spine:
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
            output += self.add_page_anchor(item.href)
            output += self.dump_text(item.data.find(XHTML('body')), stylizer)
        output += u'</BODY></HTML>'

        output = self.clean_text(output)

        return output

    def add_page_anchor(self, href):
        '''Return a named anchor built from the extension-less base name of ``href``.'''
        href = os.path.splitext(os.path.basename(href))[0]
        return u'<A NAME="%s"></A>' % href

    def clean_text(self, text):
        '''Remove every ``<A NAME=...>`` anchor that no ``<A HREF="#...">`` refers to.'''
        # Remove anchors that do not have links
        anchors = set(re.findall(r'(?<=<A NAME=").+?(?="></A>)', text))
        links = set(re.findall(r'(?<=<A HREF="#).+?(?=">)', text))
        for unused in anchors.difference(links):
            text = text.replace('<A NAME="%s"></A>' % unused, '')

        return text

    def dump_text(self, elem, stylizer, tag_stack=None):
        '''
        Return the RB markup for ``elem``, its children and its tail.

        Non-XHTML nodes and elements whose style hides them produce an
        empty string. ``tag_stack`` is the stack of currently open tags
        shared with the recursive calls; callers normally omit it.
        '''
        # A fresh list per top-level call instead of a shared default.
        if tag_stack is None:
            tag_stack = []

        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return u''

        text = u''
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return u''

        tag = barename(elem.tag)
        # Number of tags this element pushed, so exactly those are closed.
        tag_count = 0

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag == 'img':
            src = elem.get('src')
            # An <img> without a src has nothing to reference; skip it
            # instead of failing inside os.path.basename.
            if src:
                src = os.path.basename(src)
                name = self.name_map.get(src, src)
                text += '<IMG SRC="%s">' % name

        rb_tag = tag.upper() if tag in TAGS else None
        if rb_tag:
            tag_count += 1
            text += '<%s>' % rb_tag
            tag_stack.append(rb_tag)

        # Links are only kept when they are internal (no "://"); they are
        # rewritten to point at an anchor name.
        if tag in LINK_TAGS:
            href = elem.get('href')
            if href:
                if '://' not in href:
                    if '#' in href:
                        href = href.partition('#')[2]
                    href = os.path.splitext(os.path.basename(href))[0]
                    tag_count += 1
                    text += '<A HREF="#%s">' % href
                    tag_stack.append('A')

        # Anchor ids
        id_name = elem.get('id')
        if id_name:
            text += '<A NAME="%s"></A>' % os.path.splitext(id_name)[0]

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag:
                style_tag = style_tag.upper()
                tag_count += 1
                text += '<%s>' % style_tag
                tag_stack.append(style_tag)

        # Process tags that contain text. Whitespace-only text is dropped.
        if hasattr(elem, 'text') and elem.text is not None and elem.text.strip() != '':
            text += elem.text

        for item in elem:
            text += self.dump_text(item, stylizer, tag_stack)

        # Collect the tags opened by this element in opening order.
        close_tag_list = []
        for i in range(0, tag_count):
            close_tag_list.insert(0, tag_stack.pop())

        text += self.close_tags(close_tag_list)

        if hasattr(elem, 'tail') and elem.tail is not None and elem.tail.strip() != '':
            text += elem.tail

        return text

    def close_tags(self, tags):
        '''
        Return closing tags for ``tags``, last entry first. ``tags`` is
        emptied in the process.
        '''
        text = u''
        while tags:
            text += '</%s>' % tags.pop()
        return text

View File

@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
import struct
import zlib
from urllib import unquote as urlunquote
from calibre import CurrentDir
from calibre.ebooks.rb import HEADER
from calibre.ebooks.rb import RocketBookError
from calibre.ebooks.metadata.rb import get_metadata
from calibre.ebooks.metadata.opf2 import OPFCreator
class RBToc(list):
    '''A list of ``RBToc.Item`` table of contents entries.'''

    class Item(object):
        '''One entry: a name plus its size, offset and flags values.'''

        def __init__(self, name='', size=0, offset=0, flags=0):
            self.name, self.size = name, size
            self.offset, self.flags = offset, flags
class Reader(object):
    '''
    Read a RocketBook (RB) file from a seekable stream and extract its
    HTML pages and PNG images.
    '''

    def __init__(self, stream, log, encoding=None):
        '''
        Verify ``stream``, then read its metadata and table of
        contents. ``encoding`` is the text encoding of the pages;
        ``None`` means cp1252. Raises ``RocketBookError`` when
        verification fails.
        '''
        self.stream = stream
        self.log = log
        self.encoding = encoding

        self.verify_file()

        self.mi = get_metadata(self.stream)
        self.toc = self.get_toc()

    def read_i32(self):
        '''Read a little-endian unsigned 32 bit integer at the current position.'''
        return struct.unpack('<I', self.stream.read(4))[0]

    def verify_file(self):
        '''
        Raise ``RocketBookError`` unless the stream starts with
        ``HEADER`` and the size stored at offset 28 equals the actual
        stream size.
        '''
        self.stream.seek(0)
        if self.stream.read(14) != HEADER:
            # In-memory streams have no name; do not let that hide the
            # real error behind an AttributeError.
            stream_name = getattr(self.stream, 'name', '<stream>')
            raise RocketBookError('Could not read file: %s. Does not contain a valid RocketBook Header.' % stream_name)

        self.stream.seek(28)
        size = self.read_i32()
        self.stream.seek(0, os.SEEK_END)
        real_size = self.stream.tell()
        if size != real_size:
            raise RocketBookError('File is corrupt. The file size recorded in the header does not match the actual file size.')

    def get_toc(self):
        '''
        Return the table of contents as an ``RBToc``.

        The TOC position is read from offset 24. The TOC starts with an
        entry count; each entry is a 32 byte NUL padded, URL quoted
        name followed by size, offset and flags.
        '''
        self.stream.seek(24)
        toc_offset = self.read_i32()

        self.stream.seek(toc_offset)
        pages = self.read_i32()

        toc = RBToc()
        for i in range(pages):
            name = urlunquote(self.stream.read(32).strip('\x00'))
            size, offset, flags = self.read_i32(), self.read_i32(), self.read_i32()
            toc.append(RBToc.Item(name=name, size=size, offset=offset, flags=flags))

        return toc

    def get_text(self, toc_item, output_dir):
        '''
        Write the page described by ``toc_item`` into ``output_dir``,
        re-encoded as UTF-8. Entries with flags 1 or 2 are skipped.

        Flags 8 means the data is a chunk count, an uncompressed size,
        the compressed size of every chunk and then the zlib compressed
        chunks; any other flags value is read as ``toc_item.size``
        bytes of plain data.
        '''
        if toc_item.flags in (1, 2):
            return

        encoding = 'cp1252' if self.encoding is None else self.encoding
        self.stream.seek(toc_item.offset)

        if toc_item.flags == 8:
            count = self.read_i32()
            self.read_i32() # Uncompressed size.
            chunck_sizes = [self.read_i32() for i in range(count)]
            # Join the raw chunks before decoding: a multi-byte character
            # may be split across a chunk boundary, so decoding chunk by
            # chunk can fail for encodings other than single-byte ones.
            raw = b''.join([zlib.decompress(self.stream.read(size)) for size in chunck_sizes])
        else:
            raw = self.stream.read(toc_item.size)

        output = raw.decode(encoding)

        # NOTE(review): toc_item.name comes from the file and is joined to
        # output_dir unchecked - confirm it cannot contain path separators.
        with open(os.path.join(output_dir, toc_item.name), 'wb') as html:
            html.write(output.encode('utf-8'))

    def get_image(self, toc_item, output_dir):
        '''
        Copy the image described by ``toc_item`` into ``output_dir``
        unchanged. Entries whose flags are not 0 are skipped.
        '''
        if toc_item.flags != 0:
            return

        self.stream.seek(toc_item.offset)
        data = self.stream.read(toc_item.size)

        # NOTE(review): same unchecked use of toc_item.name as in get_text.
        with open(os.path.join(output_dir, toc_item.name), 'wb') as img:
            img.write(data)

    def extract_content(self, output_dir):
        '''
        Extract every TOC entry whose name ends in 'html' or 'png' into
        ``output_dir`` and return the path of the OPF written by
        ``create_opf``.
        '''
        html = []
        images = []

        for item in self.toc:
            if item.name.lower().endswith('html'):
                html.append(item.name)
                self.get_text(item, output_dir)
            if item.name.lower().endswith('png'):
                images.append(item.name)
                self.get_image(item, output_dir)

        opf_path = self.create_opf(output_dir, html, images)

        return opf_path

    def create_opf(self, output_dir, pages, images):
        '''
        Write ``metadata.opf`` into ``output_dir`` and return its path.
        The manifest lists ``pages`` followed by ``images``; the spine
        lists ``pages`` only.
        '''
        with CurrentDir(output_dir):
            opf = OPFCreator(output_dir, self.mi)

            manifest = [(entry, None) for entry in pages+images]

            opf.create_manifest(manifest)
            opf.create_spine(pages)
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)

        return os.path.join(output_dir, 'metadata.opf')

View File

@ -0,0 +1,143 @@
import os.path
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
import struct
import zlib
import Image
import cStringIO
from calibre.ebooks.rb.rbml import RBMLizer
from calibre.ebooks.rb import HEADER
from calibre.ebooks.rb import unique_name
from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.constants import __appname__, __version__
TEXT_RECORD_SIZE = 4096
class TocItem(object):
    '''A table of contents entry to be written: name, size and flags.'''

    def __init__(self, name, size, flags):
        self.name, self.size, self.flags = name, size, flags
class RBWriter(object):
    '''Serialise an OEB book into the RB container format.'''

    def __init__(self, opts, log):
        self.opts = opts
        self.log = log
        # Maps the base name of an image href to the name stored in the file.
        self.name_map = {}

    def write_content(self, oeb_book, out_stream, metadata=None):
        '''
        Write ``oeb_book`` to ``out_stream``.

        Layout: ``HEADER``, zero fields with the TOC position (0x128)
        at offset 0x18 and the total file size at offset 0x1c, zero
        padding up to 0x128, the page count, one 44 byte TOC entry per
        page, then the page data in TOC order (info section, compressed
        text, ``index.hidx``, images). The total size is patched in at
        the end, so ``out_stream`` must be seekable.
        '''
        info_data = self._info_section(metadata)
        # _images fills self.name_map, which _text hands to RBMLizer, so
        # the images have to be processed before the text.
        images = self._images(oeb_book.manifest)
        text_size, chuncks = self._text(oeb_book)
        chunck_sizes = [len(chunck) for chunck in chuncks]
        hidx_data = ' '

        # The stored text is: chunk count, uncompressed size, one size
        # field per chunk, then the chunks themselves.
        text_entry_size = sum(chunck_sizes) + 8 + (len(chunck_sizes) * 4)

        # (name, stored size, flags) for every page, in file order.
        entries = [
            ('info.info', len(info_data), 2),
            ('index.html', text_entry_size, 8),
            ('index.hidx', len(hidx_data), 0),
        ]
        entries.extend([(name, len(data), 0) for name, data in images])

        toc_items = [TocItem(name.ljust(32, '\x00')[:32], size, flags)
                     for name, size, flags in entries]
        page_count = len(toc_items)

        out_stream.write(HEADER)
        out_stream.write(struct.pack('<I', 0))
        out_stream.write(struct.pack('<IH', 0, 0))
        out_stream.write(struct.pack('<I', 0x128))
        out_stream.write(struct.pack('<I', 0))
        # Zero fill from 0x20 up to the start of the TOC at 0x128.
        out_stream.write(struct.pack('<I', 0) * len(range(0x20, 0x128, 4)))

        out_stream.write(struct.pack('<I', page_count))
        offset = out_stream.tell() + (len(toc_items) * 44)
        for item in toc_items:
            out_stream.write(item.name)
            out_stream.write(struct.pack('<III', item.size, offset, item.flags))
            offset += item.size

        out_stream.write(info_data)

        # Compressed text with proper heading
        out_stream.write(struct.pack('<I', len(chuncks)))
        out_stream.write(struct.pack('<I', text_size))
        for size in chunck_sizes:
            out_stream.write(struct.pack('<I', size))
        for chunck in chuncks:
            out_stream.write(chunck)

        out_stream.write(hidx_data)
        for name, data in images:
            out_stream.write(data)

        total_size = out_stream.tell()
        out_stream.seek(0x1c)
        out_stream.write(struct.pack('<I', total_size))

    def _text(self, oeb_book):
        '''
        Return ``(size, pages)``: the length of the cp1252 encoded
        markup and the list of its zlib compressed (level 9) slices of
        ``TEXT_RECORD_SIZE`` bytes. Unencodable characters become XML
        character references.
        '''
        rbmlizer = RBMLizer(name_map=self.name_map, ignore_tables=self.opts.linearize_tables)
        text = rbmlizer.extract_content(oeb_book, self.opts).encode('cp1252', 'xmlcharrefreplace')

        # Floor division, exactly what the integer "/" did here.
        page_count = (len(text) // TEXT_RECORD_SIZE) + 1
        pages = [zlib.compress(text[n * TEXT_RECORD_SIZE:(n * TEXT_RECORD_SIZE) + TEXT_RECORD_SIZE], 9)
                 for n in range(0, page_count)]

        return (len(text), pages)

    def _images(self, manifest):
        '''
        Return ``(name, data)`` pairs for every image in ``manifest``,
        converted to greyscale PNG. Names are made unique with
        ``unique_name`` and recorded in ``self.name_map``.
        '''
        images = []
        used_names = []

        for item in manifest:
            if item.media_type not in OEB_IMAGES:
                continue

            im = Image.open(cStringIO.StringIO(item.data)).convert('L')
            png = cStringIO.StringIO()
            im.save(png, 'PNG')
            data = png.getvalue()

            source_name = os.path.basename(item.href)
            name = unique_name('%s.png' % os.path.splitext(source_name)[0], used_names)
            used_names.append(name)
            self.name_map[source_name] = name

            images.append((name, data))

        return images

    def _info_section(self, metadata):
        '''
        Return the text of the ``info.info`` page: ``KEY=value`` lines
        with title and author taken from ``metadata`` when available.
        '''
        lines = ['TYPE=2\n']
        if metadata:
            if len(metadata.title) >= 1:
                lines.append('TITLE=%s\n' % metadata.title[0].value)
            if len(metadata.creator) >= 1:
                from calibre.ebooks.metadata import authors_to_string
                lines.append('AUTHOR=%s\n' % authors_to_string([x.value for x in metadata.creator]))
        lines.append('GENERATOR=%s - %s\n' % (__appname__, __version__))
        lines.append('PARSE=1\n')
        lines.append('OUTPUT=1\n')
        lines.append('BODY=index.html\n')

        return ''.join(lines)

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'

View File

@ -1,15 +1,17 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
'''
Write content to TXT.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, re, sys
'''
Write content to TXT.
'''
import os
import re
from calibre import entity_to_unicode
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
from BeautifulSoup import BeautifulSoup
@ -83,6 +85,11 @@ class TxtWriter(object):
for symbol in HTML_SYMBOLS:
for code in HTML_SYMBOLS[symbol]:
content = content.replace(code, symbol)
for entity in set(re.findall('&.+?;', content)):
mo = re.search('(%s)' % entity[1:-1], content)
content = content.replace(entity, entity_to_unicode(mo))
return content
def cleanup_text(self, text):

View File

@ -640,15 +640,15 @@ class DeviceGUI(object):
', '.join(sent_mails), 3000)
def sync_news(self, send_ids=None, do_auto=True):
def sync_news(self, send_ids=None, do_auto_convert=True):
if self.device_connected:
ids = list(dynamic.get('news_to_be_synced', set([]))) if send_ids is None else send_ids
ids = [id for id in ids if self.library_view.model().db.has_id(id)]
files, _auto_ids = self.library_view.model().get_preferred_formats_from_ids(
ids, self.device_manager.device_class.settings().format_map,
exclude_auto=do_auto)
exclude_auto=do_auto_convert)
auto = []
if _auto_ids:
if do_auto_convert and _auto_ids:
for id in _auto_ids:
formats = [f.lower() for f in self.library_view.model().db.formats(id, index_is_id=True).split(',')]
formats = formats if formats != None else []

View File

@ -133,7 +133,7 @@ class RecipeModel(QAbstractItemModel, SearchQueryParser):
self._map = dict(self.category_map)
def scheduled_recipes(self):
for recipe in self.category_map[_('Scheduled')]:
for recipe in self.category_map.get(_('Scheduled'), []):
yield recipe
def sort_categories(self, x, y):