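Refactoring: move the PalmDoc compression module from calibre.ebooks.mobi to calibre.ebooks.compression and update its importers, fold the FB2 style map into STYLES, and normalize import ordering and operator spacing.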

This commit is contained in:
John Schember 2009-05-21 16:22:24 -04:00
parent 4be2cbb770
commit 24ca1a1134
14 changed files with 230 additions and 185 deletions

View File

@ -89,7 +89,7 @@ if __name__ == '__main__':
include_dirs=['src/calibre/utils/msdes']), include_dirs=['src/calibre/utils/msdes']),
Extension('calibre.plugins.cPalmdoc', Extension('calibre.plugins.cPalmdoc',
sources=['src/calibre/ebooks/mobi/palmdoc.c']), sources=['src/calibre/ebooks/compression/palmdoc.c']),
PyQtExtension('calibre.plugins.pictureflow', PyQtExtension('calibre.plugins.pictureflow',
['src/calibre/gui2/pictureflow/pictureflow.cpp', ['src/calibre/gui2/pictureflow/pictureflow.cpp',

View File

@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

View File

@ -25,15 +25,9 @@ TAG_MAP = {
'div' : 'p', 'div' : 'p',
} }
STYLE_MAP = {
'bold' : 'strong',
'bolder' : 'strong',
'italic' : 'emphasis',
}
STYLES = [ STYLES = [
'font-weight', ('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}),
'font-style', ('font-style', {'italic' : 'emphasis'}),
] ]
class FB2MLizer(object): class FB2MLizer(object):
@ -107,8 +101,9 @@ class FB2MLizer(object):
fb2_text += '<%s>' % fb2_tag fb2_text += '<%s>' % fb2_tag
tag_stack.append(fb2_tag) tag_stack.append(fb2_tag)
# Processes style information
for s in STYLES: for s in STYLES:
style_tag = STYLE_MAP.get(style[s], None) style_tag = s[1].get(style[s[0]], None)
if style_tag: if style_tag:
tag_count += 1 tag_count += 1
fb2_text += '<%s>' % style_tag fb2_text += '<%s>' % style_tag

View File

@ -1,11 +1,17 @@
from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
''' '''
Read data from .mobi files Read data from .mobi files
''' '''
import struct, os, cStringIO, re, functools, datetime, textwrap import datetime
import functools
import os
import re
import struct
import textwrap
import cStringIO
try: try:
from PIL import Image as PILImage from PIL import Image as PILImage
@ -21,8 +27,8 @@ from calibre.ebooks import DRMError
from calibre.ebooks.chardet import ENCODING_PATS from calibre.ebooks.chardet import ENCODING_PATS
from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader from calibre.ebooks.mobi.huffcdic import HuffReader
from calibre.ebooks.mobi.palmdoc import decompress_doc
from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.toc import TOC
@ -40,8 +46,8 @@ class EXTHHeader(object):
while left > 0: while left > 0:
left -= 1 left -= 1
id, size = struct.unpack('>LL', raw[pos:pos+8]) id, size = struct.unpack('>LL', raw[pos:pos + 8])
content = raw[pos+8:pos+size] content = raw[pos + 8:pos + size]
pos += size pos += size
if id >= 100 and id < 200: if id >= 100 and id < 200:
self.process_metadata(id, content, codec) self.process_metadata(id, content, codec)
@ -87,7 +93,7 @@ class EXTHHeader(object):
elif id == 106: elif id == 106:
try: try:
self.mi.publish_date = datetime.datetime.strptime( self.mi.publish_date = datetime.datetime.strptime(
content, '%Y-%m-%d',).date() content, '%Y-%m-%d', ).date()
except: except:
pass pass
elif id == 108: elif id == 108:
@ -123,13 +129,13 @@ class BookHeader(object):
try: try:
self.codec = { self.codec = {
1252 : 'cp1252', 1252: 'cp1252',
65001 : 'utf-8', 65001: 'utf-8',
}[self.codepage] }[self.codepage]
except (IndexError, KeyError): except (IndexError, KeyError):
self.codec = 'cp1252' if user_encoding is None else user_encoding self.codec = 'cp1252' if user_encoding is None else user_encoding
log.warn('Unknown codepage %d. Assuming %s'%(self.codepage, log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec)) self.codec))
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length: if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length:
self.extra_flags = 0 self.extra_flags = 0
else: else:
@ -147,14 +153,14 @@ class BookHeader(object):
self.language = main_language.get(langid, 'ENGLISH') self.language = main_language.get(langid, 'ENGLISH')
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL') self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0] self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c+4])[0] self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84]) self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None self.exth = None
if not isinstance(self.title, unicode): if not isinstance(self.title, unicode):
self.title = self.title.decode(self.codec, 'replace') self.title = self.title.decode(self.codec, 'replace')
if self.exth_flag & 0x40: if self.exth_flag & 0x40:
self.exth = EXTHHeader(raw[16+self.length:], self.codec, self.title) self.exth = EXTHHeader(raw[16 + self.length:], self.codec, self.title)
self.exth.mi.uid = self.unique_id self.exth.mi.uid = self.unique_id
self.exth.mi.language = self.language self.exth.mi.language = self.language
@ -182,7 +188,7 @@ class MetadataHeader(BookHeader):
return struct.unpack('>H', self.stream.read(2))[0] return struct.unpack('>H', self.stream.read(2))[0]
def section_offset(self, number): def section_offset(self, number):
self.stream.seek(78+number*8) self.stream.seek(78 + number * 8)
return struct.unpack('>LBBBB', self.stream.read(8))[0] return struct.unpack('>LBBBB', self.stream.read(8))[0]
def header(self): def header(self):
@ -242,15 +248,15 @@ class MobiReader(object):
self.name = self.header[:32].replace('\x00', '') self.name = self.header[:32].replace('\x00', '')
self.num_sections, = struct.unpack('>H', raw[76:78]) self.num_sections, = struct.unpack('>H', raw[76:78])
self.ident = self.header[0x3C:0x3C+8].upper() self.ident = self.header[0x3C:0x3C + 8].upper()
if self.ident not in ['BOOKMOBI', 'TEXTREAD']: if self.ident not in ['BOOKMOBI', 'TEXTREAD']:
raise MobiError('Unknown book type: %s'%self.ident) raise MobiError('Unknown book type: %s' % self.ident)
self.sections = [] self.sections = []
self.section_headers = [] self.section_headers = []
for i in range(self.num_sections): for i in range(self.num_sections):
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78+i*8:78+i*8+8]) offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
flags, val = a1, a2<<16 | a3<<8 | a4 flags, val = a1, a2 << 16 | a3 << 8 | a4
self.section_headers.append((offset, flags, val)) self.section_headers.append((offset, flags, val))
def section(section_number): def section(section_number):
@ -266,7 +272,7 @@ class MobiReader(object):
self.book_header = BookHeader(self.sections[0][0], self.ident, self.book_header = BookHeader(self.sections[0][0], self.ident,
user_encoding, self.log) user_encoding, self.log)
self.name = self.name.decode(self.book_header.codec, 'replace') self.name = self.name.decode(self.book_header.codec, 'replace')
def extract_content(self, output_dir, parse_cache): def extract_content(self, output_dir, parse_cache):
@ -279,13 +285,13 @@ class MobiReader(object):
parse_cache['calibre_raw_mobi_markup'] = self.mobi_html parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
self.add_anchors() self.add_anchors()
self.processed_html = self.processed_html.decode(self.book_header.codec, self.processed_html = self.processed_html.decode(self.book_header.codec,
'ignore') 'ignore')
for pat in ENCODING_PATS: for pat in ENCODING_PATS:
self.processed_html = pat.sub('', self.processed_html) self.processed_html = pat.sub('', self.processed_html)
e2u = functools.partial(entity_to_unicode, e2u = functools.partial(entity_to_unicode,
exceptions=['lt', 'gt', 'amp', 'apos', 'quot']) exceptions=['lt', 'gt', 'amp', 'apos', 'quot'])
self.processed_html = re.sub(r'&(\S+?);', e2u, self.processed_html = re.sub(r'&(\S+?);', e2u,
self.processed_html) self.processed_html)
self.extract_images(processed_records, output_dir) self.extract_images(processed_records, output_dir)
self.replace_page_breaks() self.replace_page_breaks()
self.cleanup_html() self.cleanup_html()
@ -295,7 +301,7 @@ class MobiReader(object):
if root.xpath('descendant::p/descendant::p'): if root.xpath('descendant::p/descendant::p'):
from lxml.html import soupparser from lxml.html import soupparser
self.log.warning('Markup contains unclosed <p> tags, parsing using', self.log.warning('Markup contains unclosed <p> tags, parsing using',
'BeatifulSoup') 'BeatifulSoup')
root = soupparser.fromstring(self.processed_html) root = soupparser.fromstring(self.processed_html)
if root.tag != 'html': if root.tag != 'html':
self.log.warn('File does not have opening <html> tag') self.log.warn('File does not have opening <html> tag')
@ -346,45 +352,45 @@ class MobiReader(object):
fname = self.name.encode('ascii', 'replace') fname = self.name.encode('ascii', 'replace')
fname = re.sub(r'[\x08\x15\0]+', '', fname) fname = re.sub(r'[\x08\x15\0]+', '', fname)
htmlfile = os.path.join(output_dir, htmlfile = os.path.join(output_dir,
sanitize_file_name(fname)+'.html') sanitize_file_name(fname) + '.html')
try: try:
for ref in guide.xpath('descendant::reference'): for ref in guide.xpath('descendant::reference'):
if ref.attrib.has_key('href'): if ref.attrib.has_key('href'):
ref.attrib['href'] = os.path.basename(htmlfile)+ref.attrib['href'] ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
except AttributeError: except AttributeError:
pass pass
parse_cache[htmlfile] = root parse_cache[htmlfile] = root
self.htmlfile = htmlfile self.htmlfile = htmlfile
ncx = cStringIO.StringIO() ncx = cStringIO.StringIO()
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf' self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
opf.render(open(self.created_opf_path, 'wb'), ncx, opf.render(open(self.created_opf_path, 'wb'), ncx,
ncx_manifest_entry=ncx_manifest_entry) ncx_manifest_entry=ncx_manifest_entry)
ncx = ncx.getvalue() ncx = ncx.getvalue()
if ncx: if ncx:
ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx') ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
open(ncx_path, 'wb').write(ncx) open(ncx_path, 'wb').write(ncx)
with open('styles.css', 'wb') as s: with open('styles.css', 'wb') as s:
s.write(self.base_css_rules+'\n\n') s.write(self.base_css_rules + '\n\n')
for cls, rule in self.tag_css_rules.items(): for cls, rule in self.tag_css_rules.items():
if isinstance(rule, unicode): if isinstance(rule, unicode):
rule = rule.encode('utf-8') rule = rule.encode('utf-8')
s.write('.%s { %s }\n\n'%(cls, rule)) s.write('.%s { %s }\n\n' % (cls, rule))
if self.book_header.exth is not None or self.embedded_mi is not None: if self.book_header.exth is not None or self.embedded_mi is not None:
self.log.debug('Creating OPF...') self.log.debug('Creating OPF...')
ncx = cStringIO.StringIO() ncx = cStringIO.StringIO()
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx, opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
ncx_manifest_entry ) ncx_manifest_entry)
ncx = ncx.getvalue() ncx = ncx.getvalue()
if ncx: if ncx:
open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) open(os.path.splitext(htmlfile)[0] + '.ncx', 'wb').write(ncx)
def read_embedded_metadata(self, root, elem, guide): def read_embedded_metadata(self, root, elem, guide):
raw = '<package>'+html.tostring(elem, encoding='utf-8')+'</package>' raw = '<package>' + html.tostring(elem, encoding='utf-8') + '</package>'
stream = cStringIO.StringIO(raw) stream = cStringIO.StringIO(raw)
opf = OPF(stream) opf = OPF(stream)
self.embedded_mi = MetaInformation(opf) self.embedded_mi = MetaInformation(opf)
@ -394,7 +400,7 @@ class MobiReader(object):
href = ref.get('href', '') href = ref.get('href', '')
if href.startswith('#'): if href.startswith('#'):
href = href[1:] href = href[1:]
anchors = root.xpath('//*[@id="%s"]'%href) anchors = root.xpath('//*[@id="%s"]' % href)
if anchors: if anchors:
cpos = anchors[0] cpos = anchors[0]
reached = False reached = False
@ -412,27 +418,27 @@ class MobiReader(object):
self.log.debug('Cleaning up HTML...') self.log.debug('Cleaning up HTML...')
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html) self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower(): if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>' self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>'
self.processed_html = self.processed_html.replace('\r\n', '\n') self.processed_html = self.processed_html.replace('\r\n', '\n')
self.processed_html = self.processed_html.replace('> <', '>\n<') self.processed_html = self.processed_html.replace('> <', '>\n<')
def upshift_markup(self, root): def upshift_markup(self, root):
self.log.debug('Converting style information to CSS...') self.log.debug('Converting style information to CSS...')
size_map = { size_map = {
'xx-small' : '0.5', 'xx-small': '0.5',
'x-small' : '1', 'x-small': '1',
'small' : '2', 'small': '2',
'medium' : '3', 'medium': '3',
'large' : '4', 'large': '4',
'x-large' : '5', 'x-large': '5',
'xx-large' : '6', 'xx-large': '6',
} }
mobi_version = self.book_header.mobi_version mobi_version = self.book_header.mobi_version
for i, tag in enumerate(root.iter(etree.Element)): for i, tag in enumerate(root.iter(etree.Element)):
tag.attrib.pop('xmlns', '') tag.attrib.pop('xmlns', '')
if tag.tag in ('country-region', 'place', 'placetype', 'placename', if tag.tag in ('country-region', 'place', 'placetype', 'placename',
'state', 'city', 'street', 'address', 'content'): 'state', 'city', 'street', 'address', 'content'):
tag.tag = 'div' if tag.tag == 'content' else 'span' tag.tag = 'div' if tag.tag == 'content' else 'span'
for key in tag.attrib.keys(): for key in tag.attrib.keys():
tag.attrib.pop(key) tag.attrib.pop(key)
continue continue
@ -450,7 +456,7 @@ class MobiReader(object):
if width: if width:
styles.append('text-indent: %s' % width) styles.append('text-indent: %s' % width)
if width.startswith('-'): if width.startswith('-'):
styles.append('margin-left: %s'%(width[1:])) styles.append('margin-left: %s' % (width[1:]))
if attrib.has_key('align'): if attrib.has_key('align'):
align = attrib.pop('align').strip() align = attrib.pop('align').strip()
if align: if align:
@ -502,7 +508,7 @@ class MobiReader(object):
cls = sel cls = sel
break break
if cls is None: if cls is None:
ncls = 'calibre_%d'%i ncls = 'calibre_%d' % i
self.tag_css_rules[ncls] = rule self.tag_css_rules[ncls] = rule
cls = attrib.get('class', '') cls = attrib.get('class', '')
cls = cls + (' ' if cls else '') + ncls cls = cls + (' ' if cls else '') + ncls
@ -514,17 +520,17 @@ class MobiReader(object):
mi = MetaInformation(self.book_header.title, [_('Unknown')]) mi = MetaInformation(self.book_header.title, [_('Unknown')])
opf = OPFCreator(os.path.dirname(htmlfile), mi) opf = OPFCreator(os.path.dirname(htmlfile), mi)
if hasattr(self.book_header.exth, 'cover_offset'): if hasattr(self.book_header.exth, 'cover_offset'):
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1) opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1)
elif mi.cover is not None: elif mi.cover is not None:
opf.cover = mi.cover opf.cover = mi.cover
else: else:
opf.cover = 'images/%05d.jpg'%1 opf.cover = 'images/%05d.jpg' % 1
if not os.path.exists(os.path.join(os.path.dirname(htmlfile), if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
*opf.cover.split('/'))): * opf.cover.split('/'))):
opf.cover = None opf.cover = None
manifest = [(htmlfile, 'text/x-oeb1-document'), manifest = [(htmlfile, 'text/x-oeb1-document'),
(os.path.abspath('styles.css'), 'text/css')] (os.path.abspath('styles.css'), 'text/css')]
bp = os.path.dirname(htmlfile) bp = os.path.dirname(htmlfile)
for i in getattr(self, 'image_names', []): for i in getattr(self, 'image_names', []):
manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg')) manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg'))
@ -541,7 +547,7 @@ class MobiReader(object):
ncx_manifest_entry = None ncx_manifest_entry = None
if toc: if toc:
ncx_manifest_entry = 'toc.ncx' ncx_manifest_entry = 'toc.ncx'
elems = root.xpath('//*[@id="%s"]'%toc.partition('#')[-1]) elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
tocobj = None tocobj = None
ent_pat = re.compile(r'&(\S+?);') ent_pat = re.compile(r'&(\S+?);')
if elems: if elems:
@ -556,12 +562,12 @@ class MobiReader(object):
if href and re.match('\w+://', href) is None: if href and re.match('\w+://', href) is None:
try: try:
text = u' '.join([t.strip() for t in \ text = u' '.join([t.strip() for t in \
x.xpath('descendant::text()')]) x.xpath('descendant::text()')])
except: except:
text = '' text = ''
text = ent_pat.sub(entity_to_unicode, text) text = ent_pat.sub(entity_to_unicode, text)
tocobj.add_item(toc.partition('#')[0], href[1:], tocobj.add_item(toc.partition('#')[0], href[1:],
text) text)
if reached and x.get('class', None) == 'mbp_pagebreak': if reached and x.get('class', None) == 'mbp_pagebreak':
break break
if tocobj is not None: if tocobj is not None:
@ -599,17 +605,17 @@ class MobiReader(object):
def extract_text(self): def extract_text(self):
self.log.debug('Extracting text...') self.log.debug('Extracting text...')
text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)] text_sections = [self.text_section(i) for i in range(1, self.book_header.records + 1)]
processed_records = list(range(0, self.book_header.records+1)) processed_records = list(range(0, self.book_header.records + 1))
self.mobi_html = '' self.mobi_html = ''
if self.book_header.compression_type == 'DH': if self.book_header.compression_type == 'DH':
huffs = [self.sections[i][0] for i in huffs = [self.sections[i][0] for i in
range(self.book_header.huff_offset, range(self.book_header.huff_offset,
self.book_header.huff_offset+self.book_header.huff_number)] self.book_header.huff_offset + self.book_header.huff_number)]
processed_records += list(range(self.book_header.huff_offset, processed_records += list(range(self.book_header.huff_offset,
self.book_header.huff_offset+self.book_header.huff_number)) self.book_header.huff_offset + self.book_header.huff_number))
huff = HuffReader(huffs) huff = HuffReader(huffs)
self.mobi_html = huff.decompress(text_sections) self.mobi_html = huff.decompress(text_sections)
@ -620,7 +626,7 @@ class MobiReader(object):
elif self.book_header.compression_type == '\x00\x01': elif self.book_header.compression_type == '\x00\x01':
self.mobi_html = ''.join(text_sections) self.mobi_html = ''.join(text_sections)
else: else:
raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type)) raise MobiError('Unknown compression algorithm: %s' % repr(self.book_header.compression_type))
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower(): if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
self.mobi_html = self.mobi_html.replace('\r ', '\n\n ') self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
self.mobi_html = self.mobi_html.replace('\0', '') self.mobi_html = self.mobi_html.replace('\0', '')
@ -636,7 +642,7 @@ class MobiReader(object):
self.log.debug('Adding anchors...') self.log.debug('Adding anchors...')
positions = set([]) positions = set([])
link_pattern = re.compile(r'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', link_pattern = re.compile(r'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
re.IGNORECASE) re.IGNORECASE)
for match in link_pattern.finditer(self.mobi_html): for match in link_pattern.finditer(self.mobi_html):
positions.add(int(match.group(1))) positions.add(int(match.group(1)))
pos = 0 pos = 0
@ -652,10 +658,10 @@ class MobiReader(object):
if r > -1 and (r < l or l == end or l == -1): if r > -1 and (r < l or l == end or l == -1):
p = self.mobi_html.rfind('<', 0, end + 1) p = self.mobi_html.rfind('<', 0, end + 1)
if pos < end and p > -1 and \ if pos < end and p > -1 and \
not end_tag_re.match(self.mobi_html[p:r]) and \ not end_tag_re.match(self.mobi_html[p:r]) and \
not self.mobi_html[p:r+1].endswith('/>'): not self.mobi_html[p:r + 1].endswith('/>'):
anchor = ' filepos-id="filepos%d"' anchor = ' filepos-id="filepos%d"'
end = r end = r
else: else:
end = r + 1 end = r + 1
self.processed_html += self.mobi_html[pos:end] + (anchor % oend) self.processed_html += self.mobi_html[pos:end] + (anchor % oend)
@ -673,7 +679,7 @@ class MobiReader(object):
start = getattr(self.book_header, 'first_image_index', -1) start = getattr(self.book_header, 'first_image_index', -1)
if start > self.num_sections or start < 0: if start > self.num_sections or start < 0:
# BAEN PRC files have bad headers # BAEN PRC files have bad headers
start=0 start = 0
for i in range(start, self.num_sections): for i in range(start, self.num_sections):
if i in processed_records: if i in processed_records:
continue continue
@ -687,7 +693,7 @@ class MobiReader(object):
except IOError: except IOError:
continue continue
path = os.path.join(output_dir, '%05d.jpg'%image_index) path = os.path.join(output_dir, '%05d.jpg' % image_index)
self.image_names.append(os.path.basename(path)) self.image_names.append(os.path.basename(path))
im.save(open(path, 'wb'), format='JPEG') im.save(open(path, 'wb'), format='JPEG')

View File

@ -1,27 +1,32 @@
''' '''
Write content to Mobipocket books. Write content to Mobipocket books.
''' '''
from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam>' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam>'
from collections import defaultdict
from itertools import count
from itertools import izip
import random
import re
from struct import pack from struct import pack
import time import time
import random
from cStringIO import StringIO
import re
from itertools import izip, count
from collections import defaultdict
from urlparse import urldefrag from urlparse import urldefrag
from PIL import Image from PIL import Image
from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \ from cStringIO import StringIO
OEB_RASTER_IMAGES
from calibre.ebooks.oeb.base import namespace, prefixname
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.mobi.palmdoc import compress_doc
from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.ebooks.mobi.mobiml import MBP_NS from calibre.ebooks.mobi.mobiml import MBP_NS
from calibre.ebooks.oeb.base import OEB_DOCS
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
from calibre.ebooks.oeb.base import XHTML
from calibre.ebooks.oeb.base import XHTML_NS
from calibre.ebooks.oeb.base import XML_NS
from calibre.ebooks.oeb.base import namespace
from calibre.ebooks.oeb.base import prefixname
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.compression.palmdoc import compress_doc
# TODO: # TODO:
# - Allow override CSS (?) # - Allow override CSS (?)
@ -174,7 +179,7 @@ class Serializer(object):
item = hrefs[path] if path else None item = hrefs[path] if path else None
if item and item.spine_position is None: if item and item.spine_position is None:
return False return False
path = item.href if item else base.href path = item.href if item else base.href
href = '#'.join((path, frag)) if frag else path href = '#'.join((path, frag)) if frag else path
buffer.write('filepos=') buffer.write('filepos=')
self.href_offsets[href].append(buffer.tell()) self.href_offsets[href].append(buffer.tell())
@ -211,8 +216,8 @@ class Serializer(object):
def serialize_elem(self, elem, item, nsrmap=NSRMAP): def serialize_elem(self, elem, item, nsrmap=NSRMAP):
buffer = self.buffer buffer = self.buffer
if not isinstance(elem.tag, basestring) \ if not isinstance(elem.tag, basestring) \
or namespace(elem.tag) not in nsrmap: or namespace(elem.tag) not in nsrmap:
return return
tag = prefixname(elem.tag, nsrmap) tag = prefixname(elem.tag, nsrmap)
# Previous layers take care of @name # Previous layers take care of @name
id = elem.attrib.pop('id', None) id = elem.attrib.pop('id', None)
@ -221,9 +226,9 @@ class Serializer(object):
offset = self.anchor_offset or buffer.tell() offset = self.anchor_offset or buffer.tell()
self.id_offsets[href] = offset self.id_offsets[href] = offset
if self.anchor_offset is not None and \ if self.anchor_offset is not None and \
tag == 'a' and not elem.attrib and \ tag == 'a' and not elem.attrib and \
not len(elem) and not elem.text: not len(elem) and not elem.text:
return return
self.anchor_offset = buffer.tell() self.anchor_offset = buffer.tell()
buffer.write('<') buffer.write('<')
buffer.write(tag) buffer.write(tag)
@ -286,8 +291,8 @@ class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def __init__(self, compression=PALMDOC, imagemax=None, def __init__(self, compression=PALMDOC, imagemax=None,
prefer_author_sort=False): prefer_author_sort=False):
self._compression = compression or UNCOMPRESSED self._compression = compression or UNCOMPRESSED
self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE
self._prefer_author_sort = prefer_author_sort self._prefer_author_sort = prefer_author_sort
@ -297,7 +302,7 @@ class MobiWriter(object):
imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None
prefer_author_sort = opts.prefer_author_sort prefer_author_sort = opts.prefer_author_sort
return cls(compression=PALMDOC, imagemax=imagemax, return cls(compression=PALMDOC, imagemax=imagemax,
prefer_author_sort=prefer_author_sort) prefer_author_sort=prefer_author_sort)
def __call__(self, oeb, path): def __call__(self, oeb, path):
if hasattr(path, 'write'): if hasattr(path, 'write'):
@ -305,7 +310,7 @@ class MobiWriter(object):
with open(path, 'w+b') as stream: with open(path, 'w+b') as stream:
return self._dump_stream(oeb, stream) return self._dump_stream(oeb, stream)
def _write(self, *data): def _write(self, * data):
for datum in data: for datum in data:
self._stream.write(datum) self._stream.write(datum)

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
@ -7,17 +6,17 @@ __docformat__ = 'restructuredtext en'
class PDBError(Exception): class PDBError(Exception):
pass pass
from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader
from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader
from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader
from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader
FORMAT_READERS = { FORMAT_READERS = {
'PNPdPPrs' : ereader_reader, 'PNPdPPrs': ereader_reader,
'PNRdPPrs' : ereader_reader, 'PNRdPPrs': ereader_reader,
'zTXTGPlm' : ztxt_reader, 'zTXTGPlm': ztxt_reader,
'TEXtREAd' : palmdoc_reader, 'TEXtREAd': palmdoc_reader,
} }
from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer
@ -25,41 +24,41 @@ from calibre.ebooks.pdb.ztxt.writer import Writer as ztxt_writer
from calibre.ebooks.pdb.ereader.writer import Writer as ereader_writer from calibre.ebooks.pdb.ereader.writer import Writer as ereader_writer
FORMAT_WRITERS = { FORMAT_WRITERS = {
'doc' : palmdoc_writer, 'doc': palmdoc_writer,
'ztxt' : ztxt_writer, 'ztxt': ztxt_writer,
'ereader' : ereader_writer, 'ereader': ereader_writer,
} }
IDENTITY_TO_NAME = { IDENTITY_TO_NAME = {
'PNPdPPrs' : 'eReader', 'PNPdPPrs': 'eReader',
'PNRdPPrs' : 'eReader', 'PNRdPPrs': 'eReader',
'zTXTGPlm' : 'zTXT', 'zTXTGPlm': 'zTXT',
'TEXtREAd' : 'PalmDOC', 'TEXtREAd': 'PalmDOC',
'.pdfADBE' : 'Adobe Reader', '.pdfADBE': 'Adobe Reader',
'BVokBDIC' : 'BDicty', 'BVokBDIC': 'BDicty',
'DB99DBOS' : 'DB (Database program)', 'DB99DBOS': 'DB (Database program)',
'vIMGView' : 'FireViewer (ImageViewer)', 'vIMGView': 'FireViewer (ImageViewer)',
'PmDBPmDB' : 'HanDBase', 'PmDBPmDB': 'HanDBase',
'InfoINDB' : 'InfoView', 'InfoINDB': 'InfoView',
'ToGoToGo' : 'iSilo', 'ToGoToGo': 'iSilo',
'SDocSilX' : 'iSilo 3', 'SDocSilX': 'iSilo 3',
'JbDbJBas' : 'JFile', 'JbDbJBas': 'JFile',
'JfDbJFil' : 'JFile Pro', 'JfDbJFil': 'JFile Pro',
'DATALSdb' : 'LIST', 'DATALSdb': 'LIST',
'Mdb1Mdb1' : 'MobileDB', 'Mdb1Mdb1': 'MobileDB',
'BOOKMOBI' : 'MobiPocket', 'BOOKMOBI': 'MobiPocket',
'DataPlkr' : 'Plucker', 'DataPlkr': 'Plucker',
'DataSprd' : 'QuickSheet', 'DataSprd': 'QuickSheet',
'SM01SMem' : 'SuperMemo', 'SM01SMem': 'SuperMemo',
'TEXtTlDc' : 'TealDoc', 'TEXtTlDc': 'TealDoc',
'InfoTlIf' : 'TealInfo', 'InfoTlIf': 'TealInfo',
'DataTlMl' : 'TealMeal', 'DataTlMl': 'TealMeal',
'DataTlPt' : 'TealPaint', 'DataTlPt': 'TealPaint',
'dataTDBP' : 'ThinkDB', 'dataTDBP': 'ThinkDB',
'TdatTide' : 'Tides', 'TdatTide': 'Tides',
'ToRaTRPW' : 'TomeRaider', 'ToRaTRPW': 'TomeRaider',
'BDOCWrdS' : 'WordSmith', 'BDOCWrdS': 'WordSmith',
} }
def get_reader(identity): def get_reader(identity):
@ -67,10 +66,10 @@ def get_reader(identity):
Returns None if no reader is found for the identity. Returns None if no reader is found for the identity.
''' '''
return FORMAT_READERS.get(identity, None) return FORMAT_READERS.get(identity, None)
def get_writer(extension): def get_writer(extension):
''' '''
Returns None if no writer is found for extension. Returns None if no writer is found for extension.
''' '''
return FORMAT_WRITERS.get(extension, None) return FORMAT_WRITERS.get(extension, None)

View File

@ -8,16 +8,19 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, re, struct, zlib import os
import re
import struct
import zlib
from calibre import CurrentDir from calibre import CurrentDir
from calibre.ebooks import DRMError from calibre.ebooks import DRMError
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pml.pmlconverter import pml_to_html, \
footnote_sidebar_to_html
from calibre.ebooks.mobi.palmdoc import decompress_doc
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html
from calibre.ebooks.pml.pmlconverter import pml_to_html
class HeaderRecord(object): class HeaderRecord(object):
''' '''
@ -32,7 +35,7 @@ class HeaderRecord(object):
self.non_text_offset, = struct.unpack('>H', raw[12:14]) self.non_text_offset, = struct.unpack('>H', raw[12:14])
self.has_metadata, = struct.unpack('>H', raw[24:26]) self.has_metadata, = struct.unpack('>H', raw[24:26])
self.footnote_rec, = struct.unpack('>H', raw[28:30]) self.footnote_rec, = struct.unpack('>H', raw[28:30])
self.sidebar_rec, = struct.unpack('>H', raw[30:32]) self.sidebar_rec, = struct.unpack('>H', raw[30:32])
self.bookmark_offset, = struct.unpack('>H', raw[32:34]) self.bookmark_offset, = struct.unpack('>H', raw[32:34])
self.image_data_offset, = struct.unpack('>H', raw[40:42]) self.image_data_offset, = struct.unpack('>H', raw[40:42])
self.metadata_offset, = struct.unpack('>H', raw[44:46]) self.metadata_offset, = struct.unpack('>H', raw[44:46])
@ -79,7 +82,7 @@ class Reader(FormatReader):
if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
return 'empty', '' return 'empty', ''
data = self.section_data(number) data = self.section_data(number)
name = data[4:4+32].strip('\x00') name = data[4:4 + 32].strip('\x00')
img = data[62:] img = data[62:]
return name, img return name, img

View File

@ -8,9 +8,11 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import struct, zlib import struct
import zlib
import Image, cStringIO import Image
import cStringIO
from calibre.ebooks.pdb.formatwriter import FormatWriter from calibre.ebooks.pdb.formatwriter import FormatWriter
from calibre.ebooks.oeb.base import OEB_IMAGES from calibre.ebooks.oeb.base import OEB_IMAGES

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import with_statement
''' '''
Read the header data from a pdb file. Read the header data from a pdb file.
''' '''
@ -8,7 +7,9 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, struct, time import re
import struct
import time
class PdbHeaderReader(object): class PdbHeaderReader(object):
@ -35,16 +36,16 @@ class PdbHeaderReader(object):
if number not in range(0, self.num_sections): if number not in range(0, self.num_sections):
raise ValueError('Not a valid section number %i' % number) raise ValueError('Not a valid section number %i' % number)
self.stream.seek(78+number*8) self.stream.seek(78 + number * 8)
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', self.stream.read(8))[0] offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', self.stream.read(8))[0]
flags, val = a1, a2<<16 | a3<<8 | a4 flags, val = a1, a2 << 16 | a3 << 8 | a4
return (offset, flags, val) return (offset, flags, val)
def section_offset(self, number): def section_offset(self, number):
if number not in range(0, self.num_sections): if number not in range(0, self.num_sections):
raise ValueError('Not a valid section number %i' % number) raise ValueError('Not a valid section number %i' % number)
self.stream.seek(78+number*8) self.stream.seek(78 + number * 8)
return struct.unpack('>LBBBB', self.stream.read(8))[0] return struct.unpack('>LBBBB', self.stream.read(8))[0]
def section_data(self, number): def section_data(self, number):

View File

@ -8,11 +8,13 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, struct, zlib import os
import struct
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.mobi.palmdoc import decompress_doc from calibre.ebooks.txt.processor import opf_writer
from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer from calibre.ebooks.txt.processor import txt_to_markdown
class HeaderRecord(object): class HeaderRecord(object):
''' '''
@ -25,15 +27,15 @@ class HeaderRecord(object):
def __init__(self, raw): def __init__(self, raw):
self.compression, = struct.unpack('>H', raw[0:2]) self.compression, = struct.unpack('>H', raw[0:2])
self.num_records, = struct.unpack('>H', raw[8:10]) self.num_records, = struct.unpack('>H', raw[8:10])
class Reader(FormatReader): class Reader(FormatReader):
def __init__(self, header, stream, log, encoding=None): def __init__(self, header, stream, log, encoding=None):
self.stream = stream self.stream = stream
self.log = log self.log = log
self.encoding = encoding self.encoding = encoding
self.sections = [] self.sections = []
for i in range(header.num_sections): for i in range(header.num_sections):
self.sections.append(header.section_data(i)) self.sections.append(header.section_data(i))
@ -52,7 +54,7 @@ class Reader(FormatReader):
def extract_content(self, output_dir): def extract_content(self, output_dir):
txt = '' txt = ''
self.log.info('Decompressing text...') self.log.info('Decompressing text...')
for i in range(1, self.header_record.num_records + 1): for i in range(1, self.header_record.num_records + 1):
self.log.debug('\tDecompressing text section %i' % i) self.log.debug('\tDecompressing text section %i' % i)
@ -62,12 +64,12 @@ class Reader(FormatReader):
html = txt_to_markdown(txt) html = txt_to_markdown(txt)
with open(os.path.join(output_dir, 'index.html'), 'wb') as index: with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8')) index.write(html.encode('utf-8'))
from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.meta import get_metadata
mi = get_metadata(self.stream, 'pdb') mi = get_metadata(self.stream, 'pdb')
manifest = [('index.html', None)] manifest = [('index.html', None)]
spine = ['index.html'] spine = ['index.html']
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
return os.path.join(output_dir, 'metadata.opf') return os.path.join(output_dir, 'metadata.opf')

View File

@ -10,10 +10,11 @@ __docformat__ = 'restructuredtext en'
import struct import struct
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.pdb.formatwriter import FormatWriter from calibre.ebooks.pdb.formatwriter import FormatWriter
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines
from calibre.ebooks.mobi.palmdoc import compress_doc
from calibre.ebooks.pdb.header import PdbHeaderBuilder from calibre.ebooks.pdb.header import PdbHeaderBuilder
from calibre.ebooks.txt.writer import TxtNewlines
from calibre.ebooks.txt.writer import TxtWriter
MAX_RECORD_SIZE = 4096 MAX_RECORD_SIZE = 4096
@ -22,48 +23,48 @@ class Writer(FormatWriter):
def __init__(self, opts, log): def __init__(self, opts, log):
self.opts = opts self.opts = opts
self.log = log self.log = log
def write_content(self, oeb_book, out_stream, metadata=None): def write_content(self, oeb_book, out_stream, metadata=None):
title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
txt_records, txt_length = self._generate_text(oeb_book.spine) txt_records, txt_length = self._generate_text(oeb_book.spine)
header_record = self._header_record(txt_length, len(txt_records)) header_record = self._header_record(txt_length, len(txt_records))
section_lengths = [len(header_record)] section_lengths = [len(header_record)]
self.log.info('Compessing data...') self.log.info('Compessing data...')
for i in range(0, len(txt_records)): for i in range(0, len(txt_records)):
self.log.debug('\tCompressing record %i' % i) self.log.debug('\tCompressing record %i' % i)
txt_records[i] = compress_doc(txt_records[i].encode('utf-8')) txt_records[i] = compress_doc(txt_records[i].encode('utf-8'))
section_lengths.append(len(txt_records[i])) section_lengths.append(len(txt_records[i]))
out_stream.seek(0) out_stream.seek(0)
hb = PdbHeaderBuilder('TEXtREAd', title) hb = PdbHeaderBuilder('TEXtREAd', title)
hb.build_header(section_lengths, out_stream) hb.build_header(section_lengths, out_stream)
for record in [header_record]+txt_records: for record in [header_record] + txt_records:
out_stream.write(record) out_stream.write(record)
def _generate_text(self, spine): def _generate_text(self, spine):
txt_writer = TxtWriter(TxtNewlines('system').newline, self.log) txt_writer = TxtWriter(TxtNewlines('system').newline, self.log)
txt = txt_writer.dump(spine) txt = txt_writer.dump(spine)
txt_length = len(txt) txt_length = len(txt)
txt_records = [] txt_records = []
for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1): for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1):
txt_records.append(txt[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]) txt_records.append(txt[i * MAX_RECORD_SIZE: (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE])
return txt_records, txt_length return txt_records, txt_length
def _header_record(self, txt_length, record_count): def _header_record(self, txt_length, record_count):
record = '' record = ''
record += struct.pack('>H', 2) # [0:2], PalmDoc compression. (1 = No compression). record += struct.pack('>H', 2) # [0:2], PalmDoc compression. (1 = No compression).
record += struct.pack('>H', 0) # [2:4], Always 0. record += struct.pack('>H', 0) # [2:4], Always 0.
record += struct.pack('>L', txt_length) # [4:8], Uncompressed length of the entire text of the book. record += struct.pack('>L', txt_length) # [4:8], Uncompressed length of the entire text of the book.
record += struct.pack('>H', record_count) # [8:10], Number of PDB records used for the text of the book. record += struct.pack('>H', record_count) # [8:10], Number of PDB records used for the text of the book.
record += struct.pack('>H', MAX_RECORD_SIZE) # [10-12], Maximum size of each record containing text, always 4096. record += struct.pack('>H', MAX_RECORD_SIZE) # [10-12], Maximum size of each record containing text, always 4096.
record += struct.pack('>L', 0) # [12-16], Current reading position, as an offset into the uncompressed text. record += struct.pack('>L', 0) # [12-16], Current reading position, as an offset into the uncompressed text.
return record return record

View File

@ -8,7 +8,8 @@ __docformat__ = 'restructuredtext en'
Transform OEB content into PML markup Transform OEB content into PML markup
''' '''
import os, re import os
import re
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
@ -40,6 +41,31 @@ STYLES = [
('text-align', {'right' : 'r', 'center' : 'c'}), ('text-align', {'right' : 'r', 'center' : 'c'}),
] ]
BLOCK_TAGS = [
'p',
]
BLOCK_STYLES = [
'block',
]
LINK_TAGS = [
'a',
]
SEPARATE_TAGS = [
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'p',
'div',
'li',
'tr',
]
class PMLMLizer(object): class PMLMLizer(object):
def __init__(self, ignore_tables=False): def __init__(self, ignore_tables=False):
self.ignore_tables = ignore_tables self.ignore_tables = ignore_tables
@ -104,7 +130,7 @@ class PMLMLizer(object):
tag_count = 0 tag_count = 0
# Are we in a paragraph block? # Are we in a paragraph block?
if tag == 'p' or style['display'] in ('block'): if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
if 'block' not in tag_stack: if 'block' not in tag_stack:
tag_count += 1 tag_count += 1
tag_stack.append('block') tag_stack.append('block')
@ -136,7 +162,7 @@ class PMLMLizer(object):
# Special processing of tags that require an argument. # Special processing of tags that require an argument.
# Anchors links # Anchors links
if tag == 'a' and 'q' not in tag_stack: if tag in LINK_TAGS and 'q' not in tag_stack:
href = elem.get('href') href = elem.get('href')
if href and '://' not in href: if href and '://' not in href:
if '#' in href: if '#' in href:
@ -168,7 +194,7 @@ class PMLMLizer(object):
for i in range(0, tag_count): for i in range(0, tag_count):
close_tag_list.insert(0, tag_stack.pop()) close_tag_list.insert(0, tag_stack.pop())
text += self.close_tags(close_tag_list) text += self.close_tags(close_tag_list)
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li', 'tr'): if tag in SEPARATE_TAGS:
text += os.linesep + os.linesep text += os.linesep + os.linesep
if 'block' not in tag_stack: if 'block' not in tag_stack: