RB: output.

This commit is contained in:
John Schember 2009-05-23 19:26:21 -04:00
parent afe9c08304
commit 3659eb1b7a
5 changed files with 384 additions and 24 deletions

View File

@ -99,7 +99,7 @@ class Writer(FormatWriter):
publisher = '' publisher = ''
isbn = '' isbn = ''
if metadata != None: if metadata:
if len(metadata.title) >= 1: if len(metadata.title) >= 1:
title = metadata.title[0].value title = metadata.title[0].value
if len(metadata.creator) >= 1: if len(metadata.creator) >= 1:

View File

@ -4,8 +4,23 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os
HEADER = '\xb0\x0c\xb0\x0c\x02\x00NUVO\x00\x00\x00\x00' HEADER = '\xb0\x0c\xb0\x0c\x02\x00NUVO\x00\x00\x00\x00'
class RocketBookError(Exception):
    '''Exception type for RocketBook (RB) related errors.'''
def unique_name(name, used_names):
    '''
    Return a file name shorter than 32 characters that is not in `used_names`.

    Only the base name of `name` is considered. When it is already short
    enough and unused it is returned unchanged. Otherwise a replacement of
    the form ``NNNN-<stem>.<ext>`` is generated, where ``NNNN`` is a zero
    padded counter, ``<stem>`` is at most 22 characters and ``<ext>`` at
    most 3 characters, so the result never exceeds 31 characters.

    :param name: Candidate file name or path.
    :param used_names: Container of names already taken. It is only tested
        with ``in`` and is never modified.
    :return: The chosen name. If every counter value from 0000 to 9998 is
        already taken, the last candidate is returned even though it
        collides.
    '''
    name = os.path.basename(name)
    if len(name) < 32 and name not in used_names:
        return name

    stem, ext = os.path.splitext(name)
    # splitext keeps the leading dot; drop it so the generated name does not
    # contain a doubled dot, and omit the dot entirely when there is no
    # extension.
    ext = ext[1:4]
    suffix = '.%s' % ext if ext else ''
    stem = stem[:22]
    for i in range(0, 9999):
        # str.rjust takes (width, fillchar), in that order.
        name = '%s-%s%s' % (str(i).rjust(4, '0'), stem, suffix)
        if name not in used_names:
            break
    return name

View File

@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin
from calibre.ebooks.rb.writer import RBWriter
class RBOutput(OutputFormatPlugin):
    '''Output format plugin that writes an OEB book as an RB file using RBWriter.'''

    name = 'RB Output'
    author = 'John Schember'
    file_type = 'rb'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write `oeb_book` to `output_path` in RB format.

        :param oeb_book: Book to convert; its ``metadata`` attribute is
            handed to the writer as well.
        :param output_path: Either a file system path or an object with a
            ``write`` method. A path is opened in binary mode, creating
            missing parent directories, and closed again before returning.
            A stream supplied by the caller is left open.
        :param input_plugin: Not used by this method.
        :param opts: Conversion options passed on to RBWriter.
        :param log: Logger passed on to RBWriter.
        '''
        close = False
        if not hasattr(output_path, 'write'):
            close = True
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            writer = RBWriter(opts, log)

            # Start from an empty stream so stale bytes never follow the
            # newly written content.
            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Only close what was opened here, and do so even when the
            # writer raises, so the file handle is not leaked.
            if close:
                out_stream.close()

View File

@ -0,0 +1,166 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
'''
Transform OEB content into RB compatible markup.
'''
import os
import re
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
# XHTML tag names that are copied to the output as the upper-cased tag of
# the same name.
TAGS = [
    'b', 'big', 'blockquote', 'br', 'center', 'code', 'div',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
    'i', 'li', 'ol', 'p', 'pre', 'small', 'sub', 'sup', 'ul',
]

# Tag names that are handled as hyperlinks.
LINK_TAGS = ['a']

# (CSS property, {property value: tag name to wrap the element in}) pairs.
STYLES = [
    ('font-weight', {'bold': 'b', 'bolder': 'b'}),
    ('font-style', {'italic': 'i'}),
    ('text-align', {'center': 'center'}),
]
class RBMLizer(object):
    '''Convert the XHTML documents in an OEB book's spine into one RB markup string.'''

    def __init__(self, name_map=None, ignore_tables=False):
        '''
        :param name_map: Mapping from an image's base file name to the name
            that should appear in ``<IMG SRC>``. The mapping passed in is
            used as is (not copied). Defaults to a new empty dict.
        :param ignore_tables: Stored on the instance; none of the methods
            of this class read it.
        '''
        # A new dict per instance: a mutable default argument would be
        # shared by every RBMLizer created without an explicit mapping.
        self.name_map = {} if name_map is None else name_map
        self.ignore_tables = ignore_tables

    def extract_content(self, oeb_book, opts):
        '''
        Convert `oeb_book` and return the resulting markup.

        :param oeb_book: Book whose spine is converted; also provides the
            logger used for the progress message.
        :param opts: Options object; ``opts.output_profile`` is passed to
            the Stylizer of every spine item.
        :return: The complete markup as returned by mlize_spine.
        '''
        oeb_book.logger.info('Converting XHTML to RB markup...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.mlize_spine()

    def mlize_spine(self):
        '''
        Build the markup for every spine item, wrapped in a minimal HTML
        skeleton, and strip anchors nothing links to.

        Requires extract_content to have stored ``oeb_book`` and ``opts``.
        '''
        parts = [u'<HTML><HEAD><TITLE></TITLE></HEAD><BODY>']
        for item in self.oeb_book.spine:
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
            # Every document gets an anchor named after its file so that
            # links to the whole file have a target.
            parts.append(self.add_page_anchor(item.href))
            body = item.data.find(XHTML('body'))
            # A document without a body contributes only its page anchor.
            if body is not None:
                parts.append(self.dump_text(body, stylizer))
        parts.append(u'</BODY></HTML>')
        return self.clean_text(u''.join(parts))

    def add_page_anchor(self, href):
        '''Return an empty named anchor whose name is the base name of `href` without extension.'''
        href = os.path.splitext(os.path.basename(href))[0]
        return u'<A NAME="%s"></A>' % href

    def clean_text(self, text):
        '''Remove every empty ``<A NAME="...">`` anchor that no ``<A HREF="#...">`` refers to.'''
        anchors = set(re.findall(r'(?<=<A NAME=").+?(?="></A>)', text))
        links = set(re.findall(r'(?<=<A HREF="#).+?(?=">)', text))
        for unused in anchors.difference(links):
            text = text.replace('<A NAME="%s"></A>' % unused, '')
        return text

    def dump_text(self, elem, stylizer, tag_stack=None):
        '''
        Recursively convert `elem` and its children into RB markup.

        :param elem: Element to convert. Elements whose tag is not a string
            or is outside the XHTML namespace, and elements styled as not
            displayed or hidden, produce an empty string.
        :param stylizer: Object whose ``style(elem)`` result is indexed by
            CSS property name.
        :param tag_stack: Stack of tags opened so far, shared with the
            recursive calls. Callers normally leave it unset.
        :return: Markup for the element, its descendants and its tail text.
        '''
        # None rather than a list literal: a mutable default would persist
        # between independent top-level calls.
        if tag_stack is None:
            tag_stack = []

        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return u''

        style = stylizer.style(elem)
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return u''

        parts = []
        tag = barename(elem.tag)
        # Number of tags this call pushed on tag_stack and must close again.
        tag_count = 0

        # Process tags that need special processing and that do not have
        # inner text. Usually these require an argument.
        if tag == 'img':
            src = elem.get('src')
            # An image without a src attribute has nothing to reference.
            if src:
                src = os.path.basename(src)
                parts.append('<IMG SRC="%s">' % self.name_map.get(src, src))

        rb_tag = tag.upper() if tag in TAGS else None
        if rb_tag:
            tag_count += 1
            parts.append('<%s>' % rb_tag)
            tag_stack.append(rb_tag)

        if tag in LINK_TAGS:
            href = elem.get('href')
            if href:
                if '://' not in href:
                    # Internal link: reduce it to the anchor name (fragment
                    # if present, otherwise the file's base name) and
                    # prefix '#', which is the form clean_text looks for
                    # when deciding which anchors are in use.
                    if '#' in href:
                        href = href.partition('#')[2]
                    href = '#%s' % os.path.splitext(os.path.basename(href))[0]
                tag_count += 1
                parts.append('<A HREF="%s">' % href)
                tag_stack.append('A')

        # Anchor ids
        id_name = elem.get('id')
        if id_name:
            parts.append('<A NAME="%s"></A>' % os.path.splitext(id_name)[0])

        # Process style information
        for prop, tag_for_value in STYLES:
            style_tag = tag_for_value.get(style[prop], None)
            if style_tag:
                style_tag = style_tag.upper()
                tag_count += 1
                parts.append('<%s>' % style_tag)
                tag_stack.append(style_tag)

        # Process tags that contain text. Whitespace-only text is dropped.
        elem_text = getattr(elem, 'text', None)
        if elem_text is not None and elem_text.strip() != '':
            parts.append(elem_text)

        for item in elem:
            parts.append(self.dump_text(item, stylizer, tag_stack))

        # Take the tags opened by this call off the shared stack (in opening
        # order) and close them, most recently opened first.
        first_opened = len(tag_stack) - tag_count
        opened = tag_stack[first_opened:]
        del tag_stack[first_opened:]
        parts.append(self.close_tags(opened))

        elem_tail = getattr(elem, 'tail', None)
        if elem_tail is not None and elem_tail.strip() != '':
            parts.append(elem_tail)

        return u''.join(parts)

    def close_tags(self, tags):
        '''
        Return closing tags for `tags`, last element first.

        :param tags: List of tag names in the order they were opened. The
            list is emptied by this call.
        '''
        text = u''.join('</%s>' % tag for tag in reversed(tags))
        del tags[:]
        return text

View File

@ -0,0 +1,143 @@
import os.path
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
import struct
import zlib
import Image
import cStringIO
from calibre.ebooks.rb.rbml import RBMLizer
from calibre.ebooks.rb import HEADER
from calibre.ebooks.rb import unique_name
from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.constants import __appname__, __version__
# Length of each slice of the encoded book text that is zlib-compressed on its own.
TEXT_RECORD_SIZE = 4096
class TocItem(object):
    '''Plain record holding the name, size and flags of one table-of-contents entry.'''

    def __init__(self, name, size, flags):
        self.name, self.size, self.flags = name, size, flags
class RBWriter(object):
    '''Write an OEB book to a stream as an RB file: header, table of contents, then section data.'''

    # Absolute stream position of the page count, directly followed by the
    # table of contents; this value is also stored in the header.
    _TOC_OFFSET = 0x128
    # Absolute stream position of the 32-bit total file length.
    _LENGTH_OFFSET = 0x1c
    # Bytes per table-of-contents entry: 32 byte name + three 32-bit integers.
    _TOC_ENTRY_SIZE = 44

    def __init__(self, opts, log):
        '''
        :param opts: Conversion options; ``linearize_tables`` is read here
            and the object is passed on to RBMLizer.
        :param log: Logger; stored but not used by this class.
        '''
        self.opts = opts
        self.log = log
        # Maps an image's original base name to the name stored in the file.
        self.name_map = {}

    def write_content(self, oeb_book, out_stream, metadata=None):
        '''
        Write the complete RB file for `oeb_book` to `out_stream`.

        :param oeb_book: Book providing the manifest (images) and spine (text).
        :param out_stream: Writable, seekable binary stream. Absolute
            offsets are used, so the file must start at stream position 0.
            On return the stream is positioned just after the patched
            length field, not at the end of the data.
        :param metadata: Optional metadata used for the info section.
        '''
        info = self._info_section(metadata)
        # Images are processed before the text because converting the text
        # looks image names up in self.name_map.
        images = self._images(oeb_book.manifest)
        text_size, chunks = self._text(oeb_book)

        # Text section: chunk count, uncompressed length, the compressed
        # size of every chunk, then the chunks themselves.
        text_payload = b''.join(
            [struct.pack('<II', len(chunks), text_size)]
            + [struct.pack('<I', len(chunk)) for chunk in chunks]
            + list(chunks))

        # (name, payload, flags) in file order. The flag values are the
        # ones this writer has always emitted: 2 for the info section, 8 for
        # the compressed text and 0 for everything else.
        sections = [
            ('info.info', info, 2),
            ('index.html', text_payload, 8),
            ('index.hidx', ' ', 0),
        ]
        sections.extend((name, data, 0) for name, data in images)

        # Names are NUL padded / truncated to exactly 32 characters.
        toc_items = [TocItem(name.ljust(32, '\x00')[:32], len(data), flags)
                     for name, data, flags in sections]

        out_stream.write(HEADER)
        out_stream.write(struct.pack('<I', 0))
        out_stream.write(struct.pack('<IH', 0, 0))
        out_stream.write(struct.pack('<I', self._TOC_OFFSET))
        # Placeholder for the total length, patched once everything is written.
        out_stream.write(struct.pack('<I', 0))
        # Zero fill up to the table of contents.
        out_stream.write(b'\x00' * (self._TOC_OFFSET - 0x20))

        out_stream.write(struct.pack('<I', len(toc_items)))
        # Section data starts right after the last table-of-contents entry.
        offset = out_stream.tell() + (len(toc_items) * self._TOC_ENTRY_SIZE)
        for item in toc_items:
            out_stream.write(item.name)
            out_stream.write(struct.pack('<I', item.size))
            out_stream.write(struct.pack('<I', offset))
            out_stream.write(struct.pack('<I', item.flags))
            offset += item.size

        for _name, data, _flags in sections:
            out_stream.write(data)

        total_size = out_stream.tell()
        out_stream.seek(self._LENGTH_OFFSET)
        out_stream.write(struct.pack('<I', total_size))

    @staticmethod
    def _chunk_text(text, record_size):
        '''
        Split `text` into consecutive slices of at most `record_size` bytes
        and return each slice zlib-compressed at level 9.

        No empty trailing chunk is produced when the length of `text` is an
        exact multiple of `record_size`; empty input gives an empty list.
        '''
        return [zlib.compress(text[start:start + record_size], 9)
                for start in range(0, len(text), record_size)]

    def _text(self, oeb_book):
        '''
        Convert the book to RB markup encoded as cp1252.

        :return: ``(size, pages)`` where ``size`` is the length of the
            encoded text and ``pages`` the list of compressed chunks.
        '''
        rbmlizer = RBMLizer(name_map=self.name_map, ignore_tables=self.opts.linearize_tables)
        # Characters outside cp1252 become numeric character references.
        text = rbmlizer.extract_content(oeb_book, self.opts).encode('cp1252', 'xmlcharrefreplace')
        return (len(text), self._chunk_text(text, TEXT_RECORD_SIZE))

    def _images(self, manifest):
        '''
        Convert every image in `manifest` to a greyscale PNG.

        Records the original base name -> stored name mapping in
        ``self.name_map``.

        :return: List of ``(name, png_data)`` tuples in manifest order.
        '''
        images = []
        used_names = set()
        for item in manifest:
            if item.media_type not in OEB_IMAGES:
                continue
            im = Image.open(cStringIO.StringIO(item.data)).convert('L')
            png = cStringIO.StringIO()
            im.save(png, 'PNG')

            base = os.path.basename(item.href)
            name = unique_name('%s.png' % os.path.splitext(base)[0], used_names)
            used_names.add(name)
            self.name_map[base] = name

            images.append((name, png.getvalue()))
        return images

    def _info_section(self, metadata):
        '''
        Build the ``KEY=value`` lines of the info section.

        :param metadata: Optional metadata object; the first title and all
            creators are used when present.
        :return: The section as a byte string.
        '''
        text = 'TYPE=2\n'
        if metadata:
            if len(metadata.title) >= 1:
                text += 'TITLE=%s\n' % metadata.title[0].value
            if len(metadata.creator) >= 1:
                from calibre.ebooks.metadata import authors_to_string
                text += 'AUTHOR=%s\n' % authors_to_string([x.value for x in metadata.creator])
        text += 'GENERATOR=%s - %s\n' % (__appname__, __version__)
        text += 'PARSE=1\n'
        text += 'OUTPUT=1\n'
        text += 'BODY=index.html\n'

        # Title/author values can turn the result into a text string; the
        # section is written to a binary stream, so encode it with the same
        # codec as the book text. Unencodable characters are replaced.
        if not isinstance(text, bytes):
            text = text.encode('cp1252', 'replace')
        return text