From 3659eb1b7a43de50945124e81bdd172efa887043 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 23 May 2009 19:26:21 -0400 Subject: [PATCH] RB: output. --- src/calibre/ebooks/pdb/ereader/writer.py | 48 +++---- src/calibre/ebooks/rb/__init__.py | 15 ++ src/calibre/ebooks/rb/output.py | 36 +++++ src/calibre/ebooks/rb/rbml.py | 166 +++++++++++++++++++++++ src/calibre/ebooks/rb/writer.py | 143 +++++++++++++++++++ 5 files changed, 384 insertions(+), 24 deletions(-) create mode 100644 src/calibre/ebooks/rb/output.py create mode 100644 src/calibre/ebooks/rb/rbml.py create mode 100644 src/calibre/ebooks/rb/writer.py diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index c8567c93b6..79cb11fdb9 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -27,62 +27,62 @@ IDENTITY = 'PNRdPPrs' MAX_RECORD_SIZE = 3560 class Writer(FormatWriter): - + def __init__(self, opts, log): self.opts = opts self.log = log - + def write_content(self, oeb_book, out_stream, metadata=None): text = self._text(oeb_book) images = self._images(oeb_book.manifest) metadata = [self._metadata(metadata)] - + hr = [self._header_record(len(text), len(images))] - + sections = hr+text+images+metadata+['MeTaInFo\x00'] - + lengths = [len(i) for i in sections] - + pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].partition('\x00')[0]) pdbHeaderBuilder.build_header(lengths, out_stream) - + for item in sections: out_stream.write(item) def _text(self, oeb_book): pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables) pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') - + pml_pages = [] for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1): pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE])) - return pml_pages - + return pml_pages + def _images(self, manifest): images = [] - + for item in manifest: if item.media_type in OEB_IMAGES: image = 'PNG ' image += image_name(item.href) image = image.ljust(62, '\x00') - + im = Image.open(cStringIO.StringIO(item.data)).convert('P') im.thumbnail((300,300), Image.ANTIALIAS) - + data = cStringIO.StringIO() im.save(data, 'PNG') data = data.getvalue() - + image += data - + if len(image) < 65505: images.append(image) - + return images - + def _metadata(self, metadata): ''' Metadata takes the form: @@ -92,14 +92,14 @@ class Writer(FormatWriter): publisher\x00 isbn\x00 ''' - + title = _('Unknown') author = _('Unknown') copyright = '' publisher = '' isbn = '' - - if metadata != None: + + if metadata: if len(metadata.title) >= 1: title = metadata.title[0].value if len(metadata.creator) >= 1: @@ -119,7 +119,7 @@ class Writer(FormatWriter): ''' version = 10 # Zlib compression non_text_offset = text_items + 1 - + if image_items > 0: image_data_offset = text_items + 1 meta_data_offset = image_data_offset + image_items @@ -128,9 +128,9 @@ class Writer(FormatWriter): meta_data_offset = text_items + 1 last_data_offset = meta_data_offset + 1 image_data_offset = last_data_offset - + record = '' - + record += struct.pack('>H', version) # [0:2] # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM record += struct.pack('>H', 0) # [2:4] record += struct.pack('>H', 0) # [4:6] @@ -161,6 +161,6 @@ class Writer(FormatWriter): for i in range(54, 132, 2): record += struct.pack('>H', 0) # [54:132] - + return record diff --git a/src/calibre/ebooks/rb/__init__.py b/src/calibre/ebooks/rb/__init__.py index 7c048a95c8..33e9882d9a 100644 --- a/src/calibre/ebooks/rb/__init__.py +++ b/src/calibre/ebooks/rb/__init__.py @@ -4,8 +4,23 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' +import os + HEADER = '\xb0\x0c\xb0\x0c\x02\x00NUVO\x00\x00\x00\x00' class RocketBookError(Exception): pass + +def unique_name(name, used_names): + name = os.path.basename(name) + if len(name) < 32 and name not in used_names: + return name + else: + ext = os.path.splitext(name)[1][:3] + base_name = name[:22] + for i in range(0, 9999): + name = '%s-%s.%s' % (str(i).rjust('0', 4)[:4], base_name, ext) + if name not in used_names: + break + return name diff --git a/src/calibre/ebooks/rb/output.py b/src/calibre/ebooks/rb/output.py new file mode 100644 index 0000000000..04c7d41790 --- /dev/null +++ b/src/calibre/ebooks/rb/output.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import OutputFormatPlugin +from calibre.ebooks.rb.writer import RBWriter + +class RBOutput(OutputFormatPlugin): + + name = 'RB Output' + author = 'John Schember' + file_type = 'rb' + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + close = False + if not hasattr(output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + out_stream = output_path + + writer = RBWriter(opts, log) + + out_stream.seek(0) + out_stream.truncate() + + writer.write_content(oeb_book, out_stream, oeb_book.metadata) + + if close: + out_stream.close() diff --git a/src/calibre/ebooks/rb/rbml.py b/src/calibre/ebooks/rb/rbml.py new file mode 100644 index 0000000000..3b88f3bc09 --- /dev/null +++ b/src/calibre/ebooks/rb/rbml.py @@ -0,0 +1,166 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into RB compatible markup. +''' + +import os +import re + +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace +from calibre.ebooks.oeb.stylizer import Stylizer + +TAGS = [ + 'b', + 'big', + 'blockquote', + 'br', + 'center', + 'code', + 'div', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'hr', + 'i', + 'li', + 'ol', + 'p', + 'pre', + 'small', + 'sub', + 'sup', + 'ul', +] + +LINK_TAGS = [ + 'a', +] + +STYLES = [ + ('font-weight', {'bold' : 'b', 'bolder' : 'b'}), + ('font-style', {'italic' : 'i'}), + ('text-align', {'center' : 'center'}), +] + +class RBMLizer(object): + + def __init__(self, name_map={}, ignore_tables=False): + self.name_map = name_map + self.ignore_tables = ignore_tables + + def extract_content(self, oeb_book, opts): + oeb_book.logger.info('Converting XHTML to RB markup...') + self.oeb_book = oeb_book + self.opts = opts + return self.mlize_spine() + + + def mlize_spine(self): + output = u'' + for item in self.oeb_book.spine: + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + output += self.add_page_anchor(item.href) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output += u'' + output = self.clean_text(output) + return output + + def add_page_anchor(self, href): + href = os.path.splitext(os.path.basename(href))[0] + return u'' % href + + def clean_text(self, text): + # Remove anchors that do not have links + anchors = set(re.findall(r'(?<=)', text)) + links = set(re.findall(r'(?<=)', text)) + for unused in anchors.difference(links): + text = text.replace('' % unused, '') + + return text + + def dump_text(self, elem, stylizer, tag_stack=[]): + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + return u'' + + text = u'' + style = stylizer.style(elem) + + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return u'' + + tag = barename(elem.tag) + tag_count = 0 + + # Process tags that need special processing and that do not have inner + # text. Usually these require an argument + if tag == 'img': + src = os.path.basename(elem.get('src')) + name = self.name_map.get(src, src) + text += '' % name + + rb_tag = tag.upper() if tag in TAGS else None + if rb_tag: + tag_count += 1 + text += '<%s>' % rb_tag + tag_stack.append(rb_tag) + + if tag in LINK_TAGS: + href = elem.get('href') + if href: + if '://' not in href: + if '#' in href: + href = href.partition('#')[2] + href = os.path.splitext(os.path.basename(href))[0] + tag_count += 1 + text += '' % href + tag_stack.append('A') + + # Anchor ids + id_name = elem.get('id') + if id_name: + text += '' % os.path.splitext(id_name)[0] + + # Processes style information + for s in STYLES: + style_tag = s[1].get(style[s[0]], None) + if style_tag: + style_tag = style_tag.upper() + tag_count += 1 + text += '<%s>' % style_tag + tag_stack.append(style_tag) + + # Proccess tags that contain text. + if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + text += elem.text + + for item in elem: + text += self.dump_text(item, stylizer, tag_stack) + + close_tag_list = [] + for i in range(0, tag_count): + close_tag_list.insert(0, tag_stack.pop()) + + text += self.close_tags(close_tag_list) + + if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': + text += elem.tail + + return text + + def close_tags(self, tags): + text = u'' + for i in range(0, len(tags)): + tag = tags.pop() + text += '' % tag + + return text diff --git a/src/calibre/ebooks/rb/writer.py b/src/calibre/ebooks/rb/writer.py new file mode 100644 index 0000000000..f9057d5c61 --- /dev/null +++ b/src/calibre/ebooks/rb/writer.py @@ -0,0 +1,143 @@ +import os.path +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os +import struct +import zlib + +import Image +import cStringIO + +from calibre.ebooks.rb.rbml import RBMLizer +from calibre.ebooks.rb import HEADER +from calibre.ebooks.rb import unique_name +from calibre.ebooks.oeb.base import OEB_IMAGES +from calibre.constants import __appname__, __version__ + +TEXT_RECORD_SIZE = 4096 + +class TocItem(object): + + def __init__(self, name, size, flags): + self.name = name + self.size = size + self.flags = flags + + +class RBWriter(object): + + def __init__(self, opts, log): + self.opts = opts + self.log = log + self.name_map = {} + + def write_content(self, oeb_book, out_stream, metadata=None): + info = [('info.info', self._info_section(metadata))] + images = self._images(oeb_book.manifest) + text_size, chuncks = self._text(oeb_book) + chunck_sizes = [len(x) for x in chuncks] + text = [('index.html', chuncks)] + hidx = [('index.hidx', ' ')] + + toc_items = [] + page_count = 0 + for name, data in info+text+hidx+images: + page_count += 1 + size = len(data) + if (name, data) in text: + flags = 8 + size = 0 + for c in chunck_sizes: + size += c + size += 8 + (len(chunck_sizes) * 4) + elif (name, data) in info: + flags = 2 + else: + flags = 0 + toc_items.append(TocItem(name.ljust(32, '\x00')[:32], size, flags)) + + out_stream.write(HEADER) + out_stream.write(struct.pack('= 1: + text += 'TITLE=%s\n' % metadata.title[0].value + if len(metadata.creator) >= 1: + from calibre.ebooks.metadata import authors_to_string + text += 'AUTHOR=%s\n' % authors_to_string([x.value for x in metadata.creator]) + text += 'GENERATOR=%s - %s\n' % (__appname__, __version__) + text += 'PARSE=1\n' + text += 'OUTPUT=1\n' + text += 'BODY=index.html\n' + + return text + \ No newline at end of file