FB2 Output: Generate output 100% compliant with the FB2 spec

2025-07-08 10:44:09 -04:00 · 2010-12-04 18:10:58 -07:00 · 2010-12-04 18:10:58 -07:00 · bad82f3daa
commit bad82f3daa
parent 23e24a9b2a bc669a1f98
1 changed files with 154 additions and 144 deletions
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@ -8,16 +8,10 @@ __docformat__ = 'restructuredtext en'
 Transform OEB content into FB2 markup
 '''
 import cStringIO
 from base64 import b64encode
 from datetime import datetime
 import re
 try:
    from PIL import Image
    Image
 except ImportError:
    import Image
 from lxml import etree
 from calibre import prepare_string_for_xml
@ -25,32 +19,7 @@ from calibre.constants import __appname__, __version__
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
 from calibre.ebooks.oeb.stylizer import Stylizer
 from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
-
+from calibre.utils.magick.draw import save_cover_data_to
 TAG_MAP = {
    'b' : 'strong',
    'i' : 'emphasis',
    'p' : 'p',
    'li' : 'p',
    'div': 'p',
 }
 TAG_SPACE = []
 TAG_IMAGES = [
    'img',
 ]
 TAG_LINKS = [
 ]
 BLOCK = [
    'p',
 ]
 STYLES = [
    ('font-weight', {'bold'   : 'strong', 'bolder' : 'strong'}),
    ('font-style', {'italic' : 'emphasis'}),
 ]
 class FB2MLizer(object):
    '''
@ -63,24 +32,32 @@ class FB2MLizer(object):
    def __init__(self, log):
        self.log = log
        self.image_hrefs = {}
        self.reset_state()
    def reset_state(self):
        # Used to ensure text and tags are always within <p> and </p>
        self.in_p = False
        # Mapping of image names. OEB allows for images to have the same name but be stored
        # in different directories. FB2 images are all in a flat layout so we rename all images
        # into a sequential numbering system to ensure there are no collisions between image names.
        self.image_hrefs = {}
    def extract_content(self, oeb_book, opts):
        self.log.info('Converting XHTML to FB2 markup...')
        self.oeb_book = oeb_book
        self.opts = opts
        return self.fb2mlize_spine()
    def fb2mlize_spine(self):
-        self.image_hrefs = {}
+        self.reset_state()
-        self.link_hrefs = {}
+
        output = [self.fb2_header()]
        output.append(self.get_text())
        output.append(self.fb2_body_footer())
        output.append(self.fb2mlize_images())
        output.append(self.fb2_footer())
        output = self.clean_text(u''.join(output))
        if self.opts.pretty_print:
            return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
        else:
@ -97,65 +74,75 @@ class FB2MLizer(object):
        return text
    def fb2_header(self):
-        author_first = u''
+        metadata = {}
-        author_middle = u''
+        metadata['author_first'] = u''
-        author_last = u''
+        metadata['author_middle'] = u''
-        author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
+        metadata['author_last'] = u''
        metadata['title'] = self.oeb_book.metadata.title[0].value
        metadata['appname'] = __appname__
        metadata['version'] = __version__
        metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
        metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en'
        author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
        if len(author_parts) == 1:
-            author_last = author_parts[0]
+            metadata['author_last'] = author_parts[0]
        elif len(author_parts) == 2:
-            author_first = author_parts[0]
+            metadata['author_first'] = author_parts[0]
-            author_last = author_parts[1]
+            metadata['author_last'] = author_parts[1]
        else:
-            author_first = author_parts[0]
+            metadata['author_first'] = author_parts[0]
-            author_middle = ' '.join(author_parts[1:-2])
+            metadata['author_middle'] = ' '.join(author_parts[1:-2])
-            author_last = author_parts[-1]
+            metadata['author_last'] = author_parts[-1]
        for key, value in metadata.items():
            metadata[key] = prepare_string_for_xml(value)
        return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \
                '<description>' \
                    '<title-info>' \
-                        '<genre></genre>' \
+                        '<genre>antique</genre>' \
                        '<author>' \
-                            '<first-name>%s</first-name>' \
+                            '<first-name>%(author_first)s</first-name>' \
-                            '<middle-name>%s</middle-name>' \
+                            '<middle-name>%(author_middle)s</middle-name>' \
-                            '<last-name>%s</last-name>' \
+                            '<last-name>%(author_last)s</last-name>' \
                        '</author>' \
-                        '<book-title>%s</book-title>' \
+                        '<book-title>%(title)s</book-title>' \
-                        '<annotation><p/></annotation>' \
+                        '<lang>%(lang)s</lang>' \
                    '</title-info>' \
                    '<document-info>' \
-                        '<program-used>%s %s</program-used>' \
+                        '<author>' \
                            '<first-name></first-name>' \
                            '<middle-name></middle-name>' \
                            '<last-name></last-name>' \
                        '</author>' \
                        '<program-used>%(appname)s %(version)s</program-used>' \
                        '<date>%(date)s</date>' \
                        '<id>1</id>' \
                        '<version>1.0</version>' \
                    '</document-info>' \
-                '</description><body>' % tuple(map(prepare_string_for_xml, (author_first, author_middle, author_last,
+                '</description>' % metadata
-                        self.oeb_book.metadata.title[0].value, __appname__, __version__)))
+
    def fb2_footer(self):
        return u'</FictionBook>'
    def get_text(self):
-        text = []
+        text = ['<body>']
-        for item in self.oeb_book.spine:            
+        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to FictionBook2 XML' % item.href)
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
            text.append('<section>')
            text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
            text.append('</section>')
-        return ''.join(text)
+        return ''.join(text) + '</body>'
    def fb2_body_footer(self):
        return u'</body>'
    def fb2_footer(self):
        return u'</FictionBook>'
    def fb2mlize_images(self):
        images = []
        for item in self.oeb_book.manifest:
            if item.media_type in OEB_RASTER_IMAGES:
                try:
-                    im = Image.open(cStringIO.StringIO(item.data)).convert('RGB')
+                    data = save_cover_data_to(item.data, None,
-                    data = cStringIO.StringIO()
+                            return_data=True)
                    im.save(data, 'JPEG')
                    data = data.getvalue()
                    raw_data = b64encode(data)
                    # Don't put the encoded image on a single line.
                    data = ''
@ -178,29 +165,11 @@ class FB2MLizer(object):
        else:
            self.in_p = True
            return ['<p>'], ['p']
    def insert_empty_line(self, tags):
        if self.in_p:
            text = ['']
            closed_tags = []
            tags.reverse()
            for t in tags:
                text.append('</%s>' % t)
                closed_tags.append(t)
                if t == 'p':
                    break
            text.append('<empty-line />')
            closed_tags.reverse()
            for t in closed_tags:
                text.append('<%s>' % t)
            return text
        else:
            return ['<empty-line />']
    def close_open_p(self, tags):
        text = ['']
        added_p = False
-        
+
        if self.in_p:
            # Close all up to p. Close p. Reopen all closed tags including p.
            closed_tags = []
@ -217,86 +186,127 @@ class FB2MLizer(object):
            text.append('<p>')
            added_p = True
            self.in_p = True
-        
+
        return text, added_p
-    def dump_text(self, elem, stylizer, page, tag_stack=[]):
+    def handle_simple_tag(self, tag, tags):
-        if not isinstance(elem.tag, basestring) \
+        s_out = []
-           or namespace(elem.tag) != XHTML_NS:
+        s_tags = []
        if tag not in tags:
            p_out, p_tags = self.ensure_p()
            s_out += p_out
            s_tags += p_tags
            s_out.append('<%s>' % tag)
            s_tags.append(tag)
        return s_out, s_tags
    def dump_text(self, elem_tree, stylizer, page, tag_stack=[]):
        '''
        This function is intended to be used in a recursive manner. dump_text will
        run through all elements in the elem_tree and call itself on each element.
        self.image_hrefs will be populated by calling this function.
        @param elem_tree: etree representation of XHTML content to be transformed.
        @param stylizer: Used to track the style of elements within the tree.
        @param page: OEB page used to determine absolute urls.
        @param tag_stack: List of open FB2 tags to take into account.
        @return: List of strings representing the XHTML converted to FB2 markup.
        '''
        # Only convert elements whose tag is a string and belongs to the XHTML namespace; anything else produces no output.
        if not isinstance(elem_tree.tag, basestring) or namespace(elem_tree.tag) != XHTML_NS:
            return []
-        style = stylizer.style(elem)
+        style = stylizer.style(elem_tree)
-        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
+        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden':
           or style['visibility'] == 'hidden':
            return []
-        fb2_text = []
+        # FB2 generated output.
        fb2_out = []
        # FB2 tags in the order they are opened. This will be used to close the tags.
        tags = []
        # First tag in tree
        tag = barename(elem_tree.tag)
-        tag = barename(elem.tag)
+        # Process the XHTML tag if it needs to be converted to an FB2 tag.
        if tag in TAG_IMAGES:
            if elem.attrib.get('src', None):
                if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys():
                    self.image_hrefs[page.abshref(elem.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys())
                p_txt, p_tag = self.ensure_p()
                fb2_text += p_txt
                tags += p_tag
                fb2_text.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem.attrib['src'])])
        if tag == 'h1' and self.opts.h1_to_title or tag == 'h2' and self.opts.h2_to_title or tag == 'h3' and self.opts.h3_to_title:
-            fb2_text.append('<title>')
+            fb2_out.append('<title>')
            tags.append('title')
-        if tag == 'br':
+        if tag == 'img':
-            fb2_text += self.insert_empty_line(tag_stack+tags)
+            # TODO: Check that the image is in the manifest and only write the tag if it is.
-
+            if elem_tree.attrib.get('src', None):
-        fb2_tag = TAG_MAP.get(tag, None)
+                if page.abshref(elem_tree.attrib['src']) not in self.image_hrefs.keys():
-        if fb2_tag == 'p':
+                    self.image_hrefs[page.abshref(elem_tree.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys())
                p_txt, p_tag = self.ensure_p()
                fb2_out += p_txt
                tags += p_tag
                fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])])
        elif tag == 'br':
            if self.in_p:
                closed_tags = []
                open_tags = tag_stack+tags
                open_tags.reverse()
                for t in open_tags:
                    fb2_out.append('</%s>' % t)
                    closed_tags.append(t)
                    if t == 'p':
                        break
                fb2_out.append('<empty-line />')
                closed_tags.reverse()
                for t in closed_tags:
                    fb2_out.append('<%s>' % t)
            else:
                fb2_out.append('<empty-line />')
        elif tag in ('div', 'li', 'p'):
            p_text, added_p = self.close_open_p(tag_stack+tags)
-            fb2_text += p_text
+            fb2_out += p_text
            if added_p:
                tags.append('p')
-        elif fb2_tag and fb2_tag not in tag_stack+tags:
+        elif tag == 'b':
-            p_text, p_tags = self.ensure_p()
+            s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
-            fb2_text += p_text
+            fb2_out += s_out
-            tags += p_tags
+            tags += s_tags
-            fb2_text.append('<%s>' % fb2_tag)
+        elif tag == 'i':
-            tags.append(fb2_tag)
+            s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
            fb2_out += s_out
            tags += s_tags
-        # Processes style information
+        # Processes style information.
-        for s in STYLES:
+        if style['font-style'] == 'italic':
-            style_tag = s[1].get(style[s[0]], None)
+            s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
-            if style_tag and style_tag not in tag_stack+tags:
+            fb2_out += s_out
-                p_text, p_tags = self.ensure_p()
+            tags += s_tags
-                fb2_text += p_text
+        elif style['font-weight'] in ('bold', 'bolder'):
-                tags += p_tags
+            s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
-                fb2_text.append('<%s>' % style_tag)
+            fb2_out += s_out
-                tags.append(style_tag)
+            tags += s_tags
-        if tag in TAG_SPACE:
+        # Process element text.
-            fb2_text.append(' ')
+        if hasattr(elem_tree, 'text') and elem_tree.text:
        if hasattr(elem, 'text') and elem.text:
            if not self.in_p:
-                fb2_text.append('<p>')
+                fb2_out.append('<p>')
-            fb2_text.append(prepare_string_for_xml(elem.text))
+            fb2_out.append(prepare_string_for_xml(elem_tree.text))
            if not self.in_p:
-                fb2_text.append('</p>')
+                fb2_out.append('</p>')
-        for item in elem:
+        # Process sub-elements.
-            fb2_text += self.dump_text(item, stylizer, page, tag_stack+tags)
+        for item in elem_tree:
            fb2_out += self.dump_text(item, stylizer, page, tag_stack+tags)
        # Close open FB2 tags.
        tags.reverse()
-        fb2_text += self.close_tags(tags)
+        fb2_out += self.close_tags(tags)
-        if hasattr(elem, 'tail') and elem.tail:
+        # Process element text that comes after the close of the XHTML tag but before the next XHTML tag.
        if hasattr(elem_tree, 'tail') and elem_tree.tail:
            if not self.in_p:
-                fb2_text.append('<p>')
+                fb2_out.append('<p>')
-            fb2_text.append(prepare_string_for_xml(elem.tail))
+            fb2_out.append(prepare_string_for_xml(elem_tree.tail))
            if not self.in_p:
-                fb2_text.append('</p>')
+                fb2_out.append('</p>')
-        return fb2_text
+        return fb2_out
    def close_tags(self, tags):
        text = []