eReader format support from driver-dev

This commit is contained in:
Kovid Goyal 2009-04-22 14:38:40 -07:00
commit b93029a4fe
8 changed files with 467 additions and 1 deletions

View File

@ -278,6 +278,7 @@ class PDFMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.pdb.input import PDBInput
from calibre.ebooks.pdf.input import PDFInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.lit.input import LITInput
@ -290,7 +291,7 @@ from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.pdf.output import PDFOutput
from calibre.customize.profiles import input_profiles, output_profiles
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput,
TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput,
FB2Input, ODTInput, RTFInput]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \

View File

@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.pdb.ereader.reader import Reader as eReader
FORMATS = {
'PNPdPPrs' : eReader,
'PNRdPPrs' : eReader,
}
IDENTITY_TO_NAME = {
'PNPdPPrs' : 'eReader',
'PNRdPPrs' : 'eReader',
}
class PDBError(Exception):
pass
def get_reader(identity):
'''
Returns None if no reader is found for the identity.
'''
if identity in FORMATS.keys():
return FORMATS[identity]
else:
return None

View File

@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
class EreaderError(Exception):
pass

View File

@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
'''
Convert pml markup to and from html
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
PML_HTML_RULES = [
(re.compile('\\\\p'), lambda match: '<br /><br style="page-break-after: always;" />'),
(re.compile('\\\\x(?P<text>.+?)\\\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text')),
(re.compile('\\\\X(?P<val>[0-4])(?P<text>.+?)\\\\X[0-4]', re.DOTALL), lambda match: '<h%i style="page-break-before: always;">%i</h%i>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
(re.compile('\\\\C\d=".+"'), lambda match: ''), # This should be made to create a TOC entry
(re.compile('\\\\c(?P<text>.+?)\\\\c', re.DOTALL), lambda match: '<div style="text-align: center; display: block; margin: auto;">%s</div>' % match.group('text')),
(re.compile('\\\\r(?P<text>.+?)\\\\r', re.DOTALL), lambda match: '<div style="text-align: right; display: block;">%s</div>' % match.group('text')),
(re.compile('\\\\i(?P<text>.+?)\\\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text')),
(re.compile('\\\\u(?P<text>.+?)\\\\u', re.DOTALL), lambda match: '<div style="text-decoration: underline;">%s</div>' % match.group('text')),
(re.compile('\\\\o(?P<text>.+?)\\\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text')),
(re.compile('\\\\v(?P<text>.+?)\\\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text')),
(re.compile('\\\\t(?P<text>.+?)\\\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%">%s</div>' % match.group('text')),
(re.compile('\\\\T="(?P<val>\d+%*)"(?P<text>.+?)$', re.MULTILINE), lambda match: '<div style="margin-left: %i%">%s</div>' % (match.group('val'), match.group('text'))),
(re.compile('\\\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')),
(re.compile('\\\\n'), lambda match: ''),
(re.compile('\\\\s'), lambda match: ''),
(re.compile('\\\\b(?P<text>.+?)\\\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')), # \b is deprecated; \B should be used instead.
(re.compile('\\\\l(?P<text>.+?)\\\\l', re.DOTALL), lambda match: '<big>%s</big>' % match.group('text')),
(re.compile('\\\\B(?P<text>.+?)\\\\B', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')),
(re.compile('\\\\Sp(?P<text>.+?)\\\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text')),
(re.compile('\\\\Sb(?P<text>.+?)\\\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text')),
(re.compile('\\\\k(?P<text>.+?)\\\\k', re.DOTALL), lambda match: '<small>%s</small>' % match.group('text')),
(re.compile('\\\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
(re.compile('\\\\U(?P<num>\d\d\d\d)'), lambda match: '&#%i;' % int(match.group('num'))),
(re.compile('\\\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % match.group('name')),
(re.compile('\\\\q="(?P<target>#.+?)"(?P<text>)\\\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
(re.compile('\\\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')),
(re.compile('\\\\-'), lambda match: ''),
(re.compile('\\\\Fn="(?P<target>.+?)"(?P<text>.+?)\\\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text'))),
(re.compile('\\\\Sd="(?P<target>.+?)"(?P<text>.+?)\\\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text'))),
(re.compile('\\\\I'), lambda match: ''),
# eReader files are one paragraph per line.
# This forces the lines to wrap properly.
(re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')),
# Remove unmatched plm codes.
(re.compile('(?<=[^\\\\])\\\\[pxcriouvtblBk]'), lambda match: ''),
(re.compile('(?<=[^\\\\])\\\\X[0-4]'), lambda match: ''),
(re.compile('(?<=[^\\\\])\\\\Sp'), lambda match: ''),
(re.compile('(?<=[^\\\\])\\\\Sb'), lambda match: ''),
# Replace \\ with \.
(re.compile('\\\\\\\\'), lambda match: '\\'),
]
FOOTNOTE_HTML_RULES = [
(re.compile('<footnote id="(?P<id>.+?)">(?P<text>.+?)</footnote>', re.DOTALL), lambda match: '<div id="footnote-%s">%s</div>')
]
SIDEBAR_HTML_RULES = [
(re.compile('<sidebar id="(?P<id>.+?)">(?P<text>.+?)</sidebar>', re.DOTALL), lambda match: '<div id="sidebar-%s">%s</div>')
]
def pml_to_html(pml):
html = pml
for rule in PML_HTML_RULES:
html = rule[0].sub(rule[1], html)
for symbol in HTML_SYMBOLS.keys():
if ord(symbol) > 128:
html = html.replace(symbol, HTML_SYMBOLS[symbol][len(HTML_SYMBOLS[symbol]) - 1])
return html
def footnote_to_html(footnotes):
html = footnotes
for rule in FOOTNOTE_HTML_RULES:
html = rule[0].sub(rule[1], html)
html = pml_to_html(html)
return html
def sidebar_to_html(sidebars):
html = sidebars
for rule in FOOTNOTE_HTML_RULES:
html = rule[0].sub(rule[1], html)
html = pml_to_html(html)
return html

View File

@ -0,0 +1,216 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
'''
Read content from ereader pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, sys, struct, zlib
from calibre import CurrentDir
from calibre.ebooks import DRMError
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \
footnote_to_html, sidebar_to_html
from calibre.ebooks.mobi.palmdoc import decompress_doc
from calibre.ebooks.metadata.opf2 import OPFCreator
class HeaderRecord(object):
'''
The first record in the file is always the header record. It holds
information related to the location of text, images, and so on
in the file. This is used in conjunction with the sections
defined in the file header.
'''
def __init__(self, raw):
self.version, = struct.unpack('>H', raw[0:2])
self.non_text_offset, = struct.unpack('>H', raw[12:14])
self.footnote_rec, = struct.unpack('>H', raw[28:30])
self.sidebar_rec, = struct.unpack('>H', raw[30:32])
self.bookmark_offset, = struct.unpack('>H', raw[32:34])
self.image_data_offset, = struct.unpack('>H', raw[40:42])
self.metadata_offset, = struct.unpack('>H', raw[44:46])
self.footnote_offset, = struct.unpack('>H', raw[48:50])
self.sidebar_offset, = struct.unpack('>H', raw[50:52])
self.last_data_offset, = struct.unpack('>H', raw[52:54])
self.num_text_pages = self.non_text_offset -1
self.num_image_pages = self.metadata_offset - self.image_data_offset
# Can't tell which is sidebar and footnote if they have same offset.
# They don't exist if offset is larget than last_record.
# Todo: Determine if the subtraction is necessary and find out
# what _rec means.
end_footnote_offset = self.sidebar_offset if self.sidebar_offset != self.footnote_offset else self.last_data_offset
self.num_footnote_pages = end_footnote_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0
self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0
class Reader(FormatReader):
def __init__(self, header, stream, log, encoding=None):
self.log = log
self.encoding = encoding
self.sections = []
for i in range(header.num_sections):
self.sections.append(header.section_data(i))
self.header_record = HeaderRecord(self.section_data(0))
if self.header_record.version not in (2, 10):
if self.header_record.version in (260, 272):
raise DRMError('eReader DRM is not supported.')
else:
raise EreaderError('Unknown book version %i.' % self.header_record.version)
def section_data(self, number):
return self.sections[number]
def decompress_text(self, number):
if self.header_record.version == 2:
return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
if self.header_record.version == 10:
return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding)
def get_image(self, number):
if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
return 'empty', ''
data = self.section_data(number)
name = data[4:4+32].strip('\0')
img = data[62:]
return name, img
def get_text_page(self, number):
'''
Only palmdoc and zlib compressed are supported. The text is
assumed to be encoded as Windows-1252. The encoding is part of
the eReader file spec and should always be this encoding.
'''
if number not in range(1, self.header_record.num_text_pages):
return ''
return self.decompress_text(number)
def get_footnote_page(self, number):
if number not in range(self.header_record.footnote_offset, self.header_record.footnote_offset + self.header_record.num_footnote_pages):
return ''
return self.decompress_text(number)
def get_sidebar_page(self, number):
if number not in range(self.header_record.sidebar_offset, self.header_record.sidebar_offset + self.header_record.num_sidebar_pages - 1):
return ''
return self.decompress_text(number)
def has_footnotes(self):
if self.header_record.num_footnote_pages > 1:
try:
content = self.decompress_text(self.header_record.footnote_offset)
if content.contains('</footnote>'):
return True
except:
pass
return False
def has_sidebar(self):
if self.header_record.num_sidebar_pages > 1:
try:
content = self.decompress_text(self.header_record.sidebar_offset)
if content.contains('</sidebar>'):
return True
except:
pass
return False
def extract_content(self, output_dir):
output_dir = os.path.abspath(output_dir)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
html = '<html><head><title></title></head><body>'
for i in range(1, self.header_record.num_text_pages + 1):
self.log.debug('Extracting text page %i' % i)
html += pml_to_html(self.get_text_page(i))
# Untested: The num_.._pages variable may not be correct!
# Possibly use .._rec instead?
'''
if has_footnotes():
html += '<br /><h1>%s</h1>' % _('Footnotes')
for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages):
self.log.debug('Extracting footnote page %i' % i)
html += footnote_to_html(self.get_footnote_page(i))
if has_sidebar():
html += '<br /><h1>%s</h1>' % _('Sidebar')
for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages):
self.log.debug('Extracting sidebar page %i' % i)
html += sidebar_to_html(self.get_sidebar_page(i))
'''
html += '</body></html>'
with CurrentDir(output_dir):
with open('index.html', 'wb') as index:
self.log.debug('Writing text to index.html')
index.write(html.encode('utf-8'))
if not os.path.exists(os.path.join(output_dir, 'images/')):
os.makedirs(os.path.join(output_dir, 'images/'))
images = []
with CurrentDir(os.path.join(output_dir, 'images/')):
for i in range(0, self.header_record.num_image_pages):
name, img = self.get_image(self.header_record.image_data_offset + i)
images.append(name)
with open(name, 'wb') as imgf:
self.log.debug('Writing image %s to images/' % name)
imgf.write(img)
opf_path = self.create_opf(output_dir, images)
return opf_path
def create_opf(self, output_dir, images):
mi = MetaInformation(None, None)
with CurrentDir(output_dir):
opf = OPFCreator(output_dir, mi)
manifest = [('index.html', None)]
for i in images:
manifest.append((os.path.join('images/', i), None))
opf.create_manifest(manifest)
opf.create_spine(['index.html'])
with open('metadata.opf', 'wb') as opffile:
opf.render(opffile)
return os.path.join(output_dir, 'metadata.opf')
def dump_pml(self):
pml = ''
for i in range(1, self.header_record.num_text_pages + 1):
pml += self.get_text_page(i)
return pml
class EreaderMetadata(object):
def __init__(self, record):
pass

View File

@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
'''
Interface defining the necessary public functions for a pdb format reader.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
class FormatReader(object):
def __init__(self, header, stream, log, encoding=None):
raise NotImplementedError()
def extract_content(self, output_dir):
raise NotImplementedError()

View File

@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
'''
Read the header data from a pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, struct
class PdbHeader(object):
def __init__(self, stream):
self.stream = stream
self.ident = self.identity()
self.num_sections = self.section_count()
self.title = self.name()
def identity(self):
self.stream.seek(60)
ident = self.stream.read(8)
return ident
def section_count(self):
self.stream.seek(76)
return struct.unpack('>H', self.stream.read(2))[0]
def name(self):
self.stream.seek(0)
return self.stream.read(32).replace('\x00', '')
def full_section_info(self, number):
if number not in range(0, self.num_sections):
raise ValueError('Not a valid section number %i' % number)
self.stream.seek(78+number*8)
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', self.stream.read(8))[0]
flags, val = a1, a2<<16 | a3<<8 | a4
return (offset, flags, val)
def section_offset(self, number):
if number not in range(0, self.num_sections):
raise ValueError('Not a valid section number %i' % number)
self.stream.seek(78+number*8)
return struct.unpack('>LBBBB', self.stream.read(8))[0]
def section_data(self, number):
if number not in range(0, self.num_sections):
raise ValueError('Not a valid section number %i' % number)
start = self.section_offset(number)
if number == self.num_sections -1:
end = os.stat(self.stream.name).st_size
else:
end = self.section_offset(number + 1)
self.stream.seek(start)
return self.stream.read(end - start)

View File

@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
class PDBInput(InputFormatPlugin):
name = 'PDB Input'
author = 'John Schember'
description = 'Convert PDB to HTML'
file_types = set(['pdb'])
def convert(self, stream, options, file_ext, log,
accelerators):
header = PdbHeader(stream)
Reader = get_reader(header.ident)
if Reader is None:
raise PDBError('Unknown format in pdb file. Identity is %s' % header.identity)
log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident))
reader = Reader(header, stream, log, options.input_encoding)
opf = reader.extract_content(os.getcwd())
return opf