initial ereader input

This commit is contained in:
John Schember 2009-04-21 19:09:03 -04:00
parent ac9f766a8d
commit 68e7e1b112
7 changed files with 429 additions and 1 deletions

View File

@ -278,6 +278,7 @@ class PDFMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.pdb.input import PDBInput
from calibre.ebooks.pdf.input import PDFInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.lit.input import LITInput
@ -287,7 +288,7 @@ from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.pdf.output import PDFOutput
from calibre.customize.profiles import input_profiles, output_profiles
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput,
TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]

View File

@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.pdb.ereader.reader import Reader as eReader
# Lookup table used by get_reader(): maps a PDB identity string to the reader
# class that handles that format. Both identities listed here are served by
# the eReader reader.
FORMATS = {
'PNPdPPrs' : eReader,
'PNRdPPrs' : eReader,
}
class PDBError(Exception):
    '''Error raised for problems while handling a PDB file.'''
def get_reader(identity):
    '''
    Look up the reader class registered for a PDB identity string.

    Returns the matching FORMATS entry, or None if no reader is found
    for the identity.
    '''
    return FORMATS.get(identity, None)

View File

@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
'''
Exceptions for the eReader PDB format.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
class EreaderError(Exception):
    '''Error raised for problems specific to eReader books.'''

View File

@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
'''
Convert pml markup to html
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
# Ordered list of (compiled pattern, replacement callable) pairs that turn PML
# markup into HTML. The rules are applied one after another, so the order is
# significant: paired codes are converted first, then every line is wrapped in
# a paragraph, then leftover unmatched codes are dropped and finally the
# escaped backslash is unescaped.
#
# All patterns are raw strings so that regex escapes such as \d are not
# interpreted (and warned about) as Python string escapes.
PML_HTML_RULES = [
    (re.compile(r'\\p'), lambda match: '<br /><br style="page-break-after: always;" />'),
    (re.compile(r'\\x(?P<text>.+?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text')),
    # The heading text is a string, so it is formatted with %s; only the
    # heading level is numeric.
    (re.compile(r'\\X(?P<val>[0-4])(?P<text>.+?)\\X[0-4]', re.DOTALL), lambda match: '<h%i style="page-break-before: always;">%s</h%i>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
    # Non-greedy so only the quoted title is removed, not everything up to
    # the last quote on the line.
    (re.compile(r'\\C\d=".+?"'), lambda match: ''),  # This should be made to create a TOC entry
    (re.compile(r'\\c(?P<text>.+?)\\c', re.DOTALL), lambda match: '<div style="text-align: center; display: block; margin: auto;">%s</div>' % match.group('text')),
    (re.compile(r'\\r(?P<text>.+?)\\r', re.DOTALL), lambda match: '<div style="text-align: right; display: block;">%s</div>' % match.group('text')),
    (re.compile(r'\\i(?P<text>.+?)\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text')),
    (re.compile(r'\\u(?P<text>.+?)\\u', re.DOTALL), lambda match: '<div style="text-decoration: underline;">%s</div>' % match.group('text')),
    (re.compile(r'\\o(?P<text>.+?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text')),
    (re.compile(r'\\v(?P<text>.+?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text')),
    (re.compile(r'\\t(?P<text>.+?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%">%s</div>' % match.group('text')),
    # The percent sign(s) are matched outside the captured value and a
    # literal one is emitted with %%.
    (re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.+?)$', re.MULTILINE), lambda match: '<div style="margin-left: %s%%">%s</div>' % (match.group('val'), match.group('text'))),
    (re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')),
    (re.compile(r'\\n'), lambda match: ''),
    (re.compile(r'\\s'), lambda match: ''),
    (re.compile(r'\\b(?P<text>.+?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')),  # \b is deprecated; \B should be used instead.
    (re.compile(r'\\l(?P<text>.+?)\\l', re.DOTALL), lambda match: '<big>%s</big>' % match.group('text')),
    (re.compile(r'\\B(?P<text>.+?)\\B', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')),
    (re.compile(r'\\Sp(?P<text>.+?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text')),
    (re.compile(r'\\Sb(?P<text>.+?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text')),
    (re.compile(r'\\k(?P<text>.+?)\\k', re.DOTALL), lambda match: '<small>%s</small>' % match.group('text')),
    (re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
    # \Uxxxx carries four hexadecimal digits in PML, so it is emitted as a
    # hexadecimal character reference.
    (re.compile(r'\\U(?P<num>[0-9a-fA-F]{4})'), lambda match: '&#x%s;' % match.group('num')),
    (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % match.group('name')),
    # The link text between the opening and closing \q is captured.
    (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
    (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')),
    (re.compile(r'\\-'), lambda match: ''),
    # Todo: Footnotes need link.
    (re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.+?)\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text'))),
    (re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.+?)\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text'))),
    (re.compile(r'\\I'), lambda match: ''),

    # eReader files are one paragraph per line.
    # This forces the lines to wrap properly.
    (re.compile(r'^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')),

    # Remove unmatched pml codes.
    (re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''),
    (re.compile(r'(?<=[^\\])\\X[0-4]'), lambda match: ''),
    (re.compile(r'(?<=[^\\])\\Sp'), lambda match: ''),
    (re.compile(r'(?<=[^\\])\\Sb'), lambda match: ''),

    # Replace \\ with \.
    (re.compile(r'\\\\'), lambda match: '\\'),
]
# (pattern, replacement) pairs that turn <footnote id="...">...</footnote>
# blocks into <div> elements. The generated id uses the "footnote-" prefix so
# it lines up with the "#footnote-..." links produced for \Fn codes.
FOOTNOTE_HTML_RULES = [
    (re.compile('<footnote id="(?P<id>.+?)">(?P<text>.+?)</footnote>', re.DOTALL),
     # Fill the template with the captured id and text; returning the bare
     # template would leave literal "%s" in the output.
     lambda match: '<div id="footnote-%s">%s</div>' % (match.group('id'), match.group('text'))),
]
# (pattern, replacement) pairs that turn <sidebar id="...">...</sidebar>
# blocks into <div> elements. The generated id uses the "sidebar-" prefix so
# it lines up with the "#sidebar-..." links produced for \Sd codes.
SIDEBAR_HTML_RULES = [
    (re.compile('<sidebar id="(?P<id>.+?)">(?P<text>.+?)</sidebar>', re.DOTALL),
     # Fill the template with the captured id and text; returning the bare
     # template would leave literal "%s" in the output.
     lambda match: '<div id="sidebar-%s">%s</div>' % (match.group('id'), match.group('text'))),
]
def pml_to_html(pml):
    '''
    Convert a string of PML markup to HTML.

    Every rule in PML_HTML_RULES is applied in order. Afterwards each key
    of HTML_SYMBOLS whose ordinal is above 128 is replaced by the last
    element of its HTML_SYMBOLS entry.
    '''
    html = pml
    for pattern, replacement in PML_HTML_RULES:
        html = pattern.sub(replacement, html)

    for symbol, entities in HTML_SYMBOLS.items():
        if ord(symbol) > 128:
            html = html.replace(symbol, entities[-1])

    return html
def footnote_to_html(footnotes):
    '''
    Convert footnote markup to HTML.

    The FOOTNOTE_HTML_RULES substitutions are applied first, then the
    result is passed through pml_to_html.
    '''
    converted = footnotes
    for pattern, replacement in FOOTNOTE_HTML_RULES:
        converted = pattern.sub(replacement, converted)

    return pml_to_html(converted)
def sidebar_to_html(sidebars):
    '''
    Convert sidebar markup to HTML.

    The SIDEBAR_HTML_RULES substitutions are applied first (so that
    <sidebar> blocks become "sidebar-" divs), then the result is passed
    through pml_to_html.
    '''
    html = sidebars
    # Use the sidebar rules here; the footnote rules never match
    # <sidebar> elements.
    for pattern, replacement in SIDEBAR_HTML_RULES:
        html = pattern.sub(replacement, html)

    return pml_to_html(html)

View File

@ -0,0 +1,199 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
'''
Read content from ereader pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, sys, struct, zlib
from calibre import CurrentDir
from calibre.ebooks import DRMError
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \
footnote_to_html, sidebar_to_html
from calibre.ebooks.mobi.palmdoc import decompress_doc
from calibre.ebooks.metadata.opf2 import OPFCreator
class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.

    ``raw`` is the raw data of the header record. Every field is read as a
    big-endian unsigned 16-bit integer from a fixed position; struct.error
    is raised if ``raw`` is too short for a field.
    '''

    def __init__(self, raw):
        self.version, = struct.unpack('>H', raw[0:2])
        self.non_text_offset, = struct.unpack('>H', raw[12:14])
        self.footnote_rec, = struct.unpack('>H', raw[28:30])
        self.sidebar_rec, = struct.unpack('>H', raw[30:32])
        self.bookmark_offset, = struct.unpack('>H', raw[32:34])
        self.image_data_offset, = struct.unpack('>H', raw[40:42])
        self.metadata_offset, = struct.unpack('>H', raw[44:46])
        self.footnote_offset, = struct.unpack('>H', raw[48:50])
        self.sidebar_offset, = struct.unpack('>H', raw[50:52])
        self.last_data_offset, = struct.unpack('>H', raw[52:54])

        # Record 0 is this header, so text occupies records
        # 1 .. non_text_offset - 1.
        self.num_text_pages = self.non_text_offset - 1
        self.num_image_pages = self.metadata_offset - self.image_data_offset

        # Can't tell which is sidebar and footnote if they have same offset.
        # They don't exist if offset is larger than last_record.
        if self.footnote_offset < self.last_data_offset:
            self.num_footnote_pages = self.sidebar_offset - self.footnote_offset
        else:
            self.num_footnote_pages = 0

        # Sidebar records run from sidebar_offset up to last_data_offset, so
        # the count is measured from the sidebar offset and guarded by it.
        if self.sidebar_offset < self.last_data_offset:
            self.num_sidebar_pages = self.last_data_offset - self.sidebar_offset
        else:
            self.num_sidebar_pages = 0
class Reader(object):
    '''
    Reader for eReader books stored in a PDB container.

    All sections of the file are loaded through the supplied header
    object, the first section is parsed as the HeaderRecord, and the text
    and image records can then be extracted to HTML plus an OPF file.
    '''

    def __init__(self, header, stream):
        '''
        :param header: object exposing ``num_sections`` and
            ``section_data(number)``; used to load every section.
        :param stream: the book stream. Section data is fetched through
            ``header``, so the stream itself is not read here.
        :raises DRMError: if the book version is 260 or 272.
        :raises EreaderError: for any other version that is not 2 or 10.
        '''
        self.sections = [header.section_data(i)
                         for i in range(header.num_sections)]

        self.header_record = HeaderRecord(self.section_data(0))

        if self.header_record.version not in (2, 10):
            if self.header_record.version in (260, 272):
                raise DRMError('eReader DRM is not supported.')
            else:
                raise EreaderError('Unknown book version %i.' % self.header_record.version)

    def section_data(self, number):
        '''Return the raw data of section ``number``.'''
        return self.sections[number]

    def decompress_text(self, number):
        '''
        Decompress section ``number`` and decode it as cp1252.

        Version 2 books use palmdoc compression, version 10 books use
        zlib. For any other version nothing is returned (None).
        '''
        if self.header_record.version == 2:
            return decompress_doc(self.section_data(number)).decode('cp1252')
        if self.header_record.version == 10:
            return zlib.decompress(self.section_data(number)).decode('cp1252')

    def get_image(self, number):
        '''
        Return ``(name, data)`` for the image stored in section ``number``.

        ``('empty', '')`` is returned when ``number`` is outside the image
        record range given by the header record.
        '''
        first = self.header_record.image_data_offset
        last = first + self.header_record.num_image_pages - 1
        if number < first or number > last:
            return 'empty', ''

        data = self.section_data(number)
        # The name is a NUL padded 32 byte field; the image data starts at
        # byte 62 of the record.
        name = data[4:4+32].strip('\0')
        img = data[62:]

        return name, img

    def get_text_page(self, number):
        '''
        Only palmdoc and zlib compressed are supported. The text is
        assumed to be encoded as Windows-1252. The encoding is part of
        the eReader file spec and should always be this encoding.

        Returns an empty string when ``number`` is not a text record.
        '''
        if number < 1 or number > self.header_record.num_text_pages:
            return ''

        return self.decompress_text(number)

    def get_footnote_page(self, number):
        '''
        Return the decoded footnote record ``number``, or an empty string
        when ``number`` is outside the footnote record range.
        '''
        first = self.header_record.footnote_offset
        last = first + self.header_record.num_footnote_pages - 1
        if number < first or number > last:
            return ''

        return self.decompress_text(number)

    def get_sidebar_page(self, number):
        '''
        Return the decoded sidebar record ``number``, or an empty string
        when ``number`` is outside the sidebar record range.
        '''
        first = self.header_record.sidebar_offset
        last = first + self.header_record.num_sidebar_pages - 1
        if number < first or number > last:
            return ''

        return self.decompress_text(number)

    def has_footnotes(self):
        '''
        Return True if the first footnote record contains a closing
        ``</footnote>`` tag.
        '''
        if self.header_record.num_footnote_pages > 1:
            try:
                content = self.decompress_text(self.header_record.footnote_offset)
            except Exception:
                # Best effort: a record that can't be read or decompressed
                # is treated as "no footnotes".
                return False
            # Strings have no contains() method; use the ``in`` operator.
            if content and '</footnote>' in content:
                return True
        return False

    def has_sidebar(self):
        '''
        Return True if the first sidebar record contains a closing
        ``</sidebar>`` tag.
        '''
        if self.header_record.num_sidebar_pages > 1:
            try:
                content = self.decompress_text(self.header_record.sidebar_offset)
            except Exception:
                # Best effort: a record that can't be read or decompressed
                # is treated as "no sidebar".
                return False
            # Strings have no contains() method; use the ``in`` operator.
            if content and '</sidebar>' in content:
                return True
        return False

    def extract_content(self, output_dir):
        '''
        Write the book to ``output_dir`` as ``index.html``, an ``images/``
        directory and ``metadata.opf``.

        Returns the path of the written ``metadata.opf``.
        '''
        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        parts = ['<html><head><title></title></head><body>']
        for i in range(1, self.header_record.num_text_pages + 1):
            parts.append(pml_to_html(self.get_text_page(i)))

        # TODO: Append footnotes and sidebars (via footnote_to_html and
        # sidebar_to_html) under 'Footnotes' / 'Sidebar' headings. This is
        # untested: the num_.._pages values may not be correct, possibly
        # the .._rec header fields should be used instead.

        parts.append('</body></html>')
        html = ''.join(parts)

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                index.write(html.encode('utf-8'))

        images_dir = os.path.join(output_dir, 'images/')
        if not os.path.exists(images_dir):
            os.makedirs(images_dir)

        images = []
        with CurrentDir(images_dir):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                # The name comes from the book file, so keep only its last
                # path component to make sure nothing is written outside
                # the images directory.
                name = os.path.basename(name)
                if not name:
                    # An image without a usable name can't be written or
                    # referenced from the HTML.
                    continue
                images.append(name)
                with open(name, 'wb') as imgf:
                    imgf.write(img)

        self.create_opf(output_dir, images)

        return os.path.join(output_dir, 'metadata.opf')

    def create_opf(self, output_dir, images):
        '''
        Write ``metadata.opf`` into ``output_dir``. The manifest lists
        ``index.html`` and every name in ``images`` (under ``images/``);
        the spine contains only ``index.html``.
        '''
        mi = MetaInformation(None, None)

        with CurrentDir(output_dir):
            opf = OPFCreator(output_dir, mi)

            manifest = [('index.html', None)]
            for i in images:
                manifest.append((os.path.join('images/', i), None))

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)

    def dump_pml(self):
        '''Return the text of all text records joined together.'''
        return ''.join(self.get_text_page(i)
                       for i in range(1, self.header_record.num_text_pages + 1))

View File

@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
'''
Read the header data from a pdb file.
'''
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, struct
class PdbHeader(object):
    '''
    Reads header information from a PDB file held in a seekable stream.

    On construction the identity, section count and title are read and
    stored as ``ident``, ``num_sections`` and ``title``. Section numbers
    are zero based.
    '''

    def __init__(self, stream):
        self.stream = stream
        self.ident = self.identity()
        self.num_sections = self.section_count()
        self.title = self.name()

    def identity(self):
        '''Return the 8 bytes stored at offset 60.'''
        self.stream.seek(60)
        return self.stream.read(8)

    def section_count(self):
        '''Return the big-endian 16-bit section count stored at offset 76.'''
        self.stream.seek(76)
        return struct.unpack('>H', self.stream.read(2))[0]

    def name(self):
        '''Return the first 32 bytes of the file with NUL bytes removed.'''
        self.stream.seek(0)
        return self.stream.read(32).replace('\x00', '')

    def _check_section(self, number):
        # Valid section numbers are 0 .. num_sections - 1.
        if not 0 <= number < self.num_sections:
            raise ValueError('Not a valid section number %i' % number)

    def full_section_info(self, number):
        '''
        Return ``(offset, flags, val)`` for section ``number``.

        Each section entry is 8 bytes starting at offset 78: a 32-bit
        offset, one flags byte and a 24-bit value.

        :raises ValueError: if ``number`` is not a valid section number.
        '''
        self._check_section(number)

        self.stream.seek(78 + number * 8)
        # Unpack the whole 5-tuple; indexing it first would leave only
        # the offset.
        offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', self.stream.read(8))
        flags, val = a1, a2 << 16 | a3 << 8 | a4
        return (offset, flags, val)

    def section_offset(self, number):
        '''
        Return the offset in the stream at which section ``number`` starts.

        :raises ValueError: if ``number`` is not a valid section number.
        '''
        self._check_section(number)

        self.stream.seek(78 + number * 8)
        return struct.unpack('>LBBBB', self.stream.read(8))[0]

    def section_data(self, number):
        '''
        Return the raw data of section ``number``.

        A section ends where the next one starts; the last section runs
        to the end of the stream.

        :raises ValueError: if ``number`` is not a valid section number.
        '''
        self._check_section(number)

        start = self.section_offset(number)
        if number == self.num_sections - 1:
            # Find the end by seeking rather than os.stat() so that
            # streams without a file name work as well.
            self.stream.seek(0, os.SEEK_END)
            end = self.stream.tell()
        else:
            end = self.section_offset(number + 1)
        self.stream.seek(start)
        return self.stream.read(end - start)

View File

@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeader
from calibre.ebooks.pdb import PDBError, get_reader
class PDBInput(InputFormatPlugin):
    '''
    Input plugin for PDB files. The reader is selected from the identity
    found in the PDB header and its content is extracted into the current
    working directory.
    '''

    name        = 'PDB Input'
    author      = 'John Schember'
    description = 'Convert PDB to HTML'
    file_types  = set(['pdb'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Extract the PDB book in ``stream`` and return the value of the
        reader's ``extract_content`` call for the current directory.

        :raises PDBError: if no reader is registered for the identity
            found in the header.
        '''
        header = PdbHeader(stream)
        Reader = get_reader(header.ident)

        if Reader is None:
            # Report the identity value that was read; ``header.identity``
            # is the method that reads it, not the value.
            raise PDBError('Unknown format identity is %s' % header.ident)

        reader = Reader(header, stream)
        opf = reader.extract_content(os.getcwd())

        return opf
return opf