From 3659eb1b7a43de50945124e81bdd172efa887043 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 23 May 2009 19:26:21 -0400
Subject: [PATCH] RB: output.

---
 src/calibre/ebooks/pdb/ereader/writer.py |  48 +++----
 src/calibre/ebooks/rb/__init__.py        |  15 ++
 src/calibre/ebooks/rb/output.py          |  36 +++++
 src/calibre/ebooks/rb/rbml.py            | 166 +++++++++++++++++++++++
 src/calibre/ebooks/rb/writer.py          | 143 +++++++++++++++++++
 5 files changed, 384 insertions(+), 24 deletions(-)
 create mode 100644 src/calibre/ebooks/rb/output.py
 create mode 100644 src/calibre/ebooks/rb/rbml.py
 create mode 100644 src/calibre/ebooks/rb/writer.py

diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py
index c8567c93b6..79cb11fdb9 100644
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@@ -27,62 +27,62 @@ IDENTITY = 'PNRdPPrs'
 MAX_RECORD_SIZE = 3560
 
 class Writer(FormatWriter):
-    
+
     def __init__(self, opts, log):
         self.opts = opts
         self.log = log
-        
+
     def write_content(self, oeb_book, out_stream, metadata=None):
         text = self._text(oeb_book)
         images = self._images(oeb_book.manifest)
         metadata = [self._metadata(metadata)]
-        
+
         hr = [self._header_record(len(text), len(images))]
-        
+
         sections = hr+text+images+metadata+['MeTaInFo\x00']
-        
+
         lengths = [len(i) for i in sections]
-        
+
         pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].partition('\x00')[0])
         pdbHeaderBuilder.build_header(lengths, out_stream)
-        
+
         for item in sections:
             out_stream.write(item)
 
     def _text(self, oeb_book):
         pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables)
         pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
-    
+
         pml_pages = []
         for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1):
             pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]))
 
-        return pml_pages            
-        
+        return pml_pages
+
     def _images(self, manifest):
         images = []
-        
+
         for item in manifest:
             if item.media_type in OEB_IMAGES:
                 image = 'PNG '
 
                 image += image_name(item.href)
                 image = image.ljust(62, '\x00')
-                
+
                 im = Image.open(cStringIO.StringIO(item.data)).convert('P')
                 im.thumbnail((300,300), Image.ANTIALIAS)
-                
+
                 data = cStringIO.StringIO()
                 im.save(data, 'PNG')
                 data = data.getvalue()
-                
+
                 image += data
-                
+
                 if len(image) < 65505:
                     images.append(image)
-                
+
         return images
-        
+
     def _metadata(self, metadata):
         '''
         Metadata takes the form:
@@ -92,14 +92,14 @@ class Writer(FormatWriter):
         publisher\x00
         isbn\x00
         '''
-        
+
         title = _('Unknown')
         author = _('Unknown')
         copyright = ''
         publisher = ''
         isbn = ''
-        
-        if metadata != None:
+
+        if metadata:
             if len(metadata.title) >= 1:
                 title = metadata.title[0].value
             if len(metadata.creator) >= 1:
@@ -119,7 +119,7 @@ class Writer(FormatWriter):
         '''
         version = 10 # Zlib compression
         non_text_offset = text_items + 1
-        
+
         if image_items > 0:
             image_data_offset = text_items + 1
             meta_data_offset = image_data_offset + image_items
@@ -128,9 +128,9 @@ class Writer(FormatWriter):
             meta_data_offset = text_items + 1
             last_data_offset = meta_data_offset + 1
             image_data_offset = last_data_offset
-    
+
         record = ''
-        
+
         record += struct.pack('>H', version)                # [0:2]    # Version. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
         record += struct.pack('>H', 0)                      # [2:4]
         record += struct.pack('>H', 0)                      # [4:6]
@@ -161,6 +161,6 @@ class Writer(FormatWriter):
 
         for i in range(54, 132, 2):
             record += struct.pack('>H', 0)                  # [54:132]
-        
+
         return record
 
diff --git a/src/calibre/ebooks/rb/__init__.py b/src/calibre/ebooks/rb/__init__.py
index 7c048a95c8..33e9882d9a 100644
--- a/src/calibre/ebooks/rb/__init__.py
+++ b/src/calibre/ebooks/rb/__init__.py
@@ -4,8 +4,23 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
+import os
+
 HEADER = '\xb0\x0c\xb0\x0c\x02\x00NUVO\x00\x00\x00\x00'
 
 class RocketBookError(Exception):
     pass
 
+
+def unique_name(name, used_names):
+    '''Return the basename of name, renumbered when too long or already used.'''
+    name = os.path.basename(name)
+    if len(name) < 32 and name not in used_names:
+        return name
+    base_name, ext = os.path.splitext(name)
+    # splitext keeps the leading dot; drop it because the template adds one.
+    for i in range(0, 9999):
+        name = '%04d-%s.%s' % (i, base_name[:22], ext[1:4])
+        if name not in used_names:
+            break
+    return name
diff --git a/src/calibre/ebooks/rb/output.py b/src/calibre/ebooks/rb/output.py
new file mode 100644
index 0000000000..04c7d41790
--- /dev/null
+++ b/src/calibre/ebooks/rb/output.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+from calibre.customize.conversion import OutputFormatPlugin
+from calibre.ebooks.rb.writer import RBWriter
+
+class RBOutput(OutputFormatPlugin):
+    '''Output plugin that writes an OEB book as an RB file.'''
+
+    name = 'RB Output'
+    author = 'John Schember'
+    file_type = 'rb'
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        '''Write oeb_book to output_path, a file path or a writable stream.'''
+        close = not hasattr(output_path, 'write')
+        if close:
+            out_dir = os.path.dirname(output_path)
+            if out_dir != '' and not os.path.exists(out_dir):
+                os.makedirs(out_dir)
+            out_stream = open(output_path, 'wb')
+        else:
+            out_stream = output_path
+        try:
+            out_stream.seek(0)
+            out_stream.truncate()
+            RBWriter(opts, log).write_content(oeb_book, out_stream, oeb_book.metadata)
+        finally:
+            # Only streams opened here are closed, even when writing fails.
+            if close:
+                out_stream.close()
diff --git a/src/calibre/ebooks/rb/rbml.py b/src/calibre/ebooks/rb/rbml.py
new file mode 100644
index 0000000000..3b88f3bc09
--- /dev/null
+++ b/src/calibre/ebooks/rb/rbml.py
@@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Transform OEB content into RB compatible markup.
+'''
+
+import os
+import re
+
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
+from calibre.ebooks.oeb.stylizer import Stylizer
+
+TAGS = [  # XHTML tags that dump_text re-emits upper-cased as RB tags
+    'b',
+    'big',
+    'blockquote',
+    'br',
+    'center',
+    'code',
+    'div',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'hr',
+    'i',
+    'li',
+    'ol',
+    'p',
+    'pre',
+    'small',
+    'sub',
+    'sup',
+    'ul',
+]
+
+LINK_TAGS = [  # tags whose href attribute is turned into an <A HREF="..."> tag
+    'a',
+]
+
+STYLES = [  # (CSS property, {property value: tag opened for that value})
+    ('font-weight', {'bold'   : 'b', 'bolder' : 'b'}),
+    ('font-style', {'italic' : 'i'}),
+    ('text-align', {'center' : 'center'}),
+]
+
+class RBMLizer(object):
+    '''Transform the spine of an OEB book into RB compatible markup.'''
+
+    def __init__(self, name_map=None, ignore_tables=False):
+        # None, not {}, is the default so instances never share one dict.
+        self.name_map = {} if name_map is None else name_map
+        self.ignore_tables = ignore_tables
+
+    def extract_content(self, oeb_book, opts):
+        '''Return the RB markup for all spine items of oeb_book.'''
+        oeb_book.logger.info('Converting XHTML to RB markup...')
+        self.oeb_book = oeb_book
+        self.opts = opts
+        return self.mlize_spine()
+
+    def mlize_spine(self):
+        '''Convert every spine item in order and wrap the result in <HTML>.'''
+        output = [u'<HTML><HEAD><TITLE></TITLE></HEAD><BODY>']
+        for item in self.oeb_book.spine:
+            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
+            output.append(self.add_page_anchor(item.href))
+            output.append(self.dump_text(item.data.find(XHTML('body')), stylizer))
+        output.append(u'</BODY></HTML>')
+        return self.clean_text(u''.join(output))
+
+    def add_page_anchor(self, href):
+        '''Return an anchor named after the extension-less basename of href.'''
+        href = os.path.splitext(os.path.basename(href))[0]
+        return u'<A NAME="%s"></A>' % href
+
+    def clean_text(self, text):
+        '''Remove the anchors that no internal link in text points at.'''
+        anchors = set(re.findall(r'(?<=<A NAME=").+?(?="></A>)', text))
+        links = set(re.findall(r'(?<=<A HREF="#).+?(?=">)', text))
+        for unused in anchors.difference(links):
+            text = text.replace('<A NAME="%s"></A>' % unused, '')
+        return text
+
+    def dump_text(self, elem, stylizer, tag_stack=None):
+        '''Return the RB markup for elem, its children and its tail text.'''
+        if tag_stack is None:
+            tag_stack = []
+        if not isinstance(elem.tag, basestring) \
+           or namespace(elem.tag) != XHTML_NS:
+            return u''
+
+        text = u''
+        style = stylizer.style(elem)
+
+        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
+           or style['visibility'] == 'hidden':
+            return u''
+
+        tag = barename(elem.tag)
+        tag_count = 0
+
+        # Images carry no inner text; an img without a src is skipped.
+        if tag == 'img' and elem.get('src'):
+            src = os.path.basename(elem.get('src'))
+            text += '<IMG SRC="%s">' % self.name_map.get(src, src)
+
+        if tag in TAGS:
+            tag_count += 1
+            text += '<%s>' % tag.upper()
+            tag_stack.append(tag.upper())
+
+        if tag in LINK_TAGS:
+            href = elem.get('href')
+            if href:
+                if '://' not in href:
+                    if '#' in href:
+                        href = href.partition('#')[2]
+                    # Internal targets need the leading '#' that clean_text
+                    # looks for; without it every anchor would be removed.
+                    href = '#' + os.path.splitext(os.path.basename(href))[0]
+                tag_count += 1
+                text += '<A HREF="%s">' % href
+                tag_stack.append('A')
+
+        # Anchor ids
+        id_name = elem.get('id')
+        if id_name:
+            text += '<A NAME="%s"></A>' % os.path.splitext(id_name)[0]
+
+        # Open a tag for every style value that maps to RB markup.
+        for prop, value_map in STYLES:
+            style_tag = value_map.get(style[prop], None)
+            if style_tag:
+                tag_count += 1
+                text += '<%s>' % style_tag.upper()
+                tag_stack.append(style_tag.upper())
+
+        if getattr(elem, 'text', None) is not None and elem.text.strip() != '':
+            text += elem.text
+
+        for item in elem:
+            text += self.dump_text(item, stylizer, tag_stack)
+
+        # Close only the tags this element opened, innermost first.
+        first = len(tag_stack) - tag_count
+        text += self.close_tags(tag_stack[first:])
+        del tag_stack[first:]
+
+        if getattr(elem, 'tail', None) is not None and elem.tail.strip() != '':
+            text += elem.tail
+
+        return text
+
+    def close_tags(self, tags):
+        '''Return closing tags for tags, last first; tags is left empty.'''
+        text = u''
+        while tags:
+            text += '</%s>' % tags.pop()
+        return text
diff --git a/src/calibre/ebooks/rb/writer.py b/src/calibre/ebooks/rb/writer.py
new file mode 100644
index 0000000000..f9057d5c61
--- /dev/null
+++ b/src/calibre/ebooks/rb/writer.py
@@ -0,0 +1,143 @@
+import os.path
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+import struct
+import zlib
+
+import Image
+import cStringIO
+
+from calibre.ebooks.rb.rbml import RBMLizer
+from calibre.ebooks.rb import HEADER
+from calibre.ebooks.rb import unique_name
+from calibre.ebooks.oeb.base import OEB_IMAGES
+from calibre.constants import __appname__, __version__
+
+TEXT_RECORD_SIZE = 4096
+
+class TocItem(object):
+    '''Holds the name, size and flags of one table of contents entry.'''
+    def __init__(self, name, size, flags):
+        self.name = name
+        self.size = size
+        self.flags = flags
+
+
+class RBWriter(object):
+    '''Writes an OEB book to a stream as a RocketBook (RB) file.'''
+
+    def __init__(self, opts, log):
+        self.opts = opts
+        self.log = log
+        self.name_map = {}
+
+    def write_content(self, oeb_book, out_stream, metadata=None):
+        '''Write the header, table of contents and every page to out_stream.'''
+        # _images runs before _text because it fills the name_map _text uses.
+        info = [('info.info', self._info_section(metadata))]
+        images = self._images(oeb_book.manifest)
+        text_size, chuncks = self._text(oeb_book)
+        chunck_sizes = [len(x) for x in chuncks]
+        text = [('index.html', chuncks)]
+        hidx = [('index.hidx', ' ')]
+
+        toc_items = []
+        page_count = 0
+        for name, data in info+text+hidx+images:
+            page_count += 1
+            size = len(data)
+            if (name, data) in text:
+                flags = 8
+                # The chunks follow two counts and one size field per chunk.
+                size = 8 + (len(chunck_sizes) * 4) + sum(chunck_sizes)
+            elif (name, data) in info:
+                flags = 2
+            else:
+                flags = 0
+            toc_items.append(TocItem(name.ljust(32, '\x00')[:32], size, flags))
+
+        out_stream.write(HEADER)
+        out_stream.write(struct.pack('<I', 0))
+        out_stream.write(struct.pack('<IH', 0, 0))
+        out_stream.write(struct.pack('<I', 0x128))
+        out_stream.write(struct.pack('<I', 0))  # patched with total_size below
+        for i in range(0x20, 0x128, 4):
+            out_stream.write(struct.pack('<I', 0))
+        out_stream.write(struct.pack('<I', page_count))
+        # Every table of contents entry is 44 bytes: 32 byte name + 3 ints.
+        offset = out_stream.tell() + (len(toc_items) * 44)
+        for item in toc_items:
+            out_stream.write(item.name)
+            out_stream.write(struct.pack('<I', item.size))
+            out_stream.write(struct.pack('<I', offset))
+            out_stream.write(struct.pack('<I', item.flags))
+            offset += item.size
+
+        out_stream.write(info[0][1])
+        # Compressed text with proper heading
+        out_stream.write(struct.pack('<I', len(text[0][1])))
+        out_stream.write(struct.pack('<I', text_size))
+        for size in chunck_sizes:
+            out_stream.write(struct.pack('<I', size))
+        for chunck in text[0][1]:
+            out_stream.write(chunck)
+
+        for item in hidx+images:
+            out_stream.write(item[1])
+
+        total_size = out_stream.tell()
+        out_stream.seek(0x1c)
+        out_stream.write(struct.pack('<I', total_size))
+
+    def _text(self, oeb_book):
+        '''Return (size of the cp1252 markup, list of its zlib compressed chunks).'''
+        rbmlizer = RBMLizer(name_map=self.name_map, ignore_tables=self.opts.linearize_tables)
+        text = rbmlizer.extract_content(oeb_book, self.opts).encode('cp1252', 'xmlcharrefreplace')
+        size = len(text)
+
+        pages = []
+        for i in range(0, (len(text) / TEXT_RECORD_SIZE) + 1):
+            pages.append(zlib.compress(text[i * TEXT_RECORD_SIZE : (i * TEXT_RECORD_SIZE) + TEXT_RECORD_SIZE], 9))
+
+        return (size, pages)
+
+    def _images(self, manifest):
+        '''Return (name, PNG data) pairs and record each name in name_map.'''
+        images = []
+        used_names = []
+
+        for item in manifest:
+            if item.media_type in OEB_IMAGES:
+                im = Image.open(cStringIO.StringIO(item.data)).convert('L')
+                data = cStringIO.StringIO()
+                im.save(data, 'PNG')
+                data = data.getvalue()
+                name = '%s.png' % os.path.splitext(os.path.basename(item.href))[0]
+                name = unique_name(name, used_names)
+                used_names.append(name)
+                self.name_map[os.path.basename(item.href)] = name
+                images.append((name, data))
+
+        return images
+
+    def _info_section(self, metadata):
+        '''Return the info.info page as an encoded byte string.'''
+        text = 'TYPE=2\n'
+        if metadata:
+            if len(metadata.title) >= 1:
+                text += 'TITLE=%s\n' % metadata.title[0].value
+            if len(metadata.creator) >= 1:
+                from calibre.ebooks.metadata import authors_to_string
+                text += 'AUTHOR=%s\n' % authors_to_string([x.value for x in metadata.creator])
+        text += 'GENERATOR=%s - %s\n' % (__appname__, __version__)
+        text += 'PARSE=1\n'
+        text += 'OUTPUT=1\n'
+        text += 'BODY=index.html\n'
+        # unicode metadata makes text unicode; the binary stream needs bytes.
+        return text.encode('cp1252', 'replace') if isinstance(text, unicode) else text
+    
\ No newline at end of file