FB2 Output: Generate output 100% compliant with the FB2 spec

This commit is contained in:
Kovid Goyal 2010-12-04 18:10:58 -07:00
commit bad82f3daa

View File

@ -8,16 +8,10 @@ __docformat__ = 'restructuredtext en'
Transform OEB content into FB2 markup
'''
import cStringIO
from base64 import b64encode
from datetime import datetime
import re
try:
from PIL import Image
Image
except ImportError:
import Image
from lxml import etree
from calibre import prepare_string_for_xml
@ -25,32 +19,7 @@ from calibre.constants import __appname__, __version__
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
TAG_MAP = {
'b' : 'strong',
'i' : 'emphasis',
'p' : 'p',
'li' : 'p',
'div': 'p',
}
TAG_SPACE = []
TAG_IMAGES = [
'img',
]
TAG_LINKS = [
]
BLOCK = [
'p',
]
STYLES = [
('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}),
('font-style', {'italic' : 'emphasis'}),
]
from calibre.utils.magick.draw import save_cover_data_to
class FB2MLizer(object):
'''
@ -63,24 +32,32 @@ class FB2MLizer(object):
def __init__(self, log):
self.log = log
self.image_hrefs = {}
self.reset_state()
def reset_state(self):
    '''
    Put the per-conversion bookkeeping back to its initial values.
    '''
    # Image name mapping. OEB lets images share a name as long as they live
    # in different directories, but FB2 stores every image in one flat
    # layout, so images are renamed using sequential numbering to rule out
    # collisions between image names.
    self.image_hrefs = {}
    # Flag used to make sure text and tags always end up between <p> and </p>.
    self.in_p = False
def extract_content(self, oeb_book, opts):
    '''
    Entry point of the conversion.

    Logs that the conversion is starting, stores oeb_book and opts on the
    instance and hands over to fb2mlize_spine().

    @param oeb_book: Book to convert; kept as self.oeb_book.
    @param opts: Conversion options; kept as self.opts.
    @return: Whatever fb2mlize_spine() returns.
    '''
    self.log.info('Converting XHTML to FB2 markup...')
    self.oeb_book, self.opts = oeb_book, opts
    fb2_markup = self.fb2mlize_spine()
    return fb2_markup
def fb2mlize_spine(self):
self.image_hrefs = {}
self.link_hrefs = {}
self.reset_state()
output = [self.fb2_header()]
output.append(self.get_text())
output.append(self.fb2_body_footer())
output.append(self.fb2mlize_images())
output.append(self.fb2_footer())
output = self.clean_text(u''.join(output))
if self.opts.pretty_print:
return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
else:
@ -97,65 +74,75 @@ class FB2MLizer(object):
return text
def fb2_header(self):
    '''
    Build the opening of the FB2 document: the <FictionBook> root tag
    followed by the complete <description> block.

    The title, the first creator and the language are read from
    self.oeb_book.metadata. The creator is split on single spaces into
    first, middle and last name. Every value is passed through
    prepare_string_for_xml before it is substituted into the template.

    @return: Unicode string holding the FB2 header markup.
    '''
    # Read the clock once so day, month and year always describe the same date.
    now = datetime.now()

    metadata = {}
    metadata['author_first'] = u''
    metadata['author_middle'] = u''
    metadata['author_last'] = u''
    metadata['title'] = self.oeb_book.metadata.title[0].value
    metadata['appname'] = __appname__
    metadata['version'] = __version__
    metadata['date'] = '%i.%i.%i' % (now.day, now.month, now.year)
    metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en'

    author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
    if len(author_parts) == 1:
        metadata['author_last'] = author_parts[0]
    elif len(author_parts) == 2:
        metadata['author_first'] = author_parts[0]
        metadata['author_last'] = author_parts[1]
    else:
        metadata['author_first'] = author_parts[0]
        # Everything between the first and the last part is the middle name.
        # The slice must stop at -1 (the last part, exclusive); stopping at
        # -2 would silently drop the final middle-name part, e.g. the 'B'
        # in 'A B C'.
        metadata['author_middle'] = ' '.join(author_parts[1:-1])
        metadata['author_last'] = author_parts[-1]

    # Escape every value so it is safe to embed in the XML template below.
    for key, value in metadata.items():
        metadata[key] = prepare_string_for_xml(value)

    return (u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">'
            '<description>'
                '<title-info>'
                    '<genre>antique</genre>'
                    '<author>'
                        '<first-name>%(author_first)s</first-name>'
                        '<middle-name>%(author_middle)s</middle-name>'
                        '<last-name>%(author_last)s</last-name>'
                    '</author>'
                    '<book-title>%(title)s</book-title>'
                    '<lang>%(lang)s</lang>'
                '</title-info>'
                '<document-info>'
                    '<author>'
                        '<first-name></first-name>'
                        '<middle-name></middle-name>'
                        '<last-name></last-name>'
                    '</author>'
                    '<program-used>%(appname)s %(version)s</program-used>'
                    '<date>%(date)s</date>'
                    '<id>1</id>'
                    '<version>1.0</version>'
                '</document-info>'
            '</description>') % metadata
def fb2_footer(self):
    '''
    Return the closing tag of the FB2 document root.
    '''
    closing_tag = u'</FictionBook>'
    return closing_tag
def get_text(self):
text = []
text = ['<body>']
for item in self.oeb_book.spine:
self.log.debug('Converting %s to FictionBook2 XML' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
text.append('<section>')
text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
text.append('</section>')
return ''.join(text)
def fb2_body_footer(self):
return u'</body>'
def fb2_footer(self):
return u'</FictionBook>'
return ''.join(text) + '</body>'
def fb2mlize_images(self):
images = []
for item in self.oeb_book.manifest:
if item.media_type in OEB_RASTER_IMAGES:
try:
im = Image.open(cStringIO.StringIO(item.data)).convert('RGB')
data = cStringIO.StringIO()
im.save(data, 'JPEG')
data = data.getvalue()
data = save_cover_data_to(item.data, None,
return_data=True)
raw_data = b64encode(data)
# Don't put the encoded image on a single line.
data = ''
@ -179,24 +166,6 @@ class FB2MLizer(object):
self.in_p = True
return ['<p>'], ['p']
def insert_empty_line(self, tags):
    '''
    Build the markup for an empty line.

    When a paragraph is open (self.in_p is true) the open tags are closed
    from the innermost one outwards, up to and including the nearest 'p',
    then <empty-line /> is emitted and the closed tags are reopened in
    their original order. When no paragraph is open only <empty-line />
    is emitted.

    @param tags: List of currently open FB2 tags, outermost first. The
    list is not modified.
    @return: List of strings holding the generated markup.
    '''
    if not self.in_p:
        return ['<empty-line />']

    text = ['']
    closed_tags = []
    # Iterate over a reversed view instead of calling tags.reverse(), which
    # would flip the caller's list in place as a side effect.
    for t in reversed(tags):
        text.append('</%s>' % t)
        closed_tags.append(t)
        if t == 'p':
            break
    text.append('<empty-line />')
    # Reopen what was closed, outermost tag first.
    for t in reversed(closed_tags):
        text.append('<%s>' % t)
    return text
def close_open_p(self, tags):
text = ['']
added_p = False
@ -220,83 +189,124 @@ class FB2MLizer(object):
return text, added_p
def dump_text(self, elem, stylizer, page, tag_stack=[]):
if not isinstance(elem.tag, basestring) \
or namespace(elem.tag) != XHTML_NS:
def handle_simple_tag(self, tag, tags):
    '''
    Open an FB2 tag unless it is already open.

    @param tag: Name of the FB2 tag to open.
    @param tags: List of FB2 tags that are currently open.
    @return: Tuple of two lists: the markup strings to emit and the names
    of the tags that were opened. Both lists are empty when tag is already
    in tags; otherwise they start with whatever ensure_p() returned.
    '''
    if tag in tags:
        return [], []
    p_out, p_tags = self.ensure_p()
    markup = list(p_out) + ['<%s>' % tag]
    opened = list(p_tags) + [tag]
    return markup, opened
def dump_text(self, elem_tree, stylizer, page, tag_stack=[]):
'''
This function is intended to be used in a recursive manner. dump_text will
run through all elements in the elem_tree and call itself on each element.
self.image_hrefs will be populated by calling this function.
@param elem_tree: etree representation of XHTML content to be transformed.
@param stylizer: Used to track the style of elements within the tree.
@param page: OEB page used to determine absolute urls.
@param tag_stack: List of open FB2 tags to take into account.
@return: List of strings representing the XHTML converted to FB2 markup.
'''
# Ensure what we are converting is not a string and that the first tag is part of the XHTML namespace.
if not isinstance(elem_tree.tag, basestring) or namespace(elem_tree.tag) != XHTML_NS:
return []
style = stylizer.style(elem)
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
or style['visibility'] == 'hidden':
style = stylizer.style(elem_tree)
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden':
return []
fb2_text = []
# FB2 generated output.
fb2_out = []
# FB2 tags in the order they are opened. This will be used to close the tags.
tags = []
# First tag in tree
tag = barename(elem_tree.tag)
tag = barename(elem.tag)
if tag in TAG_IMAGES:
if elem.attrib.get('src', None):
if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys():
self.image_hrefs[page.abshref(elem.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys())
p_txt, p_tag = self.ensure_p()
fb2_text += p_txt
tags += p_tag
fb2_text.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem.attrib['src'])])
# Process the XHTML tag if it needs to be converted to an FB2 tag.
if tag == 'h1' and self.opts.h1_to_title or tag == 'h2' and self.opts.h2_to_title or tag == 'h3' and self.opts.h3_to_title:
fb2_text.append('<title>')
fb2_out.append('<title>')
tags.append('title')
if tag == 'br':
fb2_text += self.insert_empty_line(tag_stack+tags)
fb2_tag = TAG_MAP.get(tag, None)
if fb2_tag == 'p':
if tag == 'img':
# TODO: Check that the image is in the manifest and only write the tag if it is.
if elem_tree.attrib.get('src', None):
if page.abshref(elem_tree.attrib['src']) not in self.image_hrefs.keys():
self.image_hrefs[page.abshref(elem_tree.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys())
p_txt, p_tag = self.ensure_p()
fb2_out += p_txt
tags += p_tag
fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])])
elif tag == 'br':
if self.in_p:
closed_tags = []
open_tags = tag_stack+tags
open_tags.reverse()
for t in open_tags:
fb2_out.append('</%s>' % t)
closed_tags.append(t)
if t == 'p':
break
fb2_out.append('<empty-line />')
closed_tags.reverse()
for t in closed_tags:
fb2_out.append('<%s>' % t)
else:
fb2_out.append('<empty-line />')
elif tag in ('div', 'li', 'p'):
p_text, added_p = self.close_open_p(tag_stack+tags)
fb2_text += p_text
fb2_out += p_text
if added_p:
tags.append('p')
elif fb2_tag and fb2_tag not in tag_stack+tags:
p_text, p_tags = self.ensure_p()
fb2_text += p_text
tags += p_tags
fb2_text.append('<%s>' % fb2_tag)
tags.append(fb2_tag)
elif tag == 'b':
s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
fb2_out += s_out
tags += s_tags
elif tag == 'i':
s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
fb2_out += s_out
tags += s_tags
# Processes style information
for s in STYLES:
style_tag = s[1].get(style[s[0]], None)
if style_tag and style_tag not in tag_stack+tags:
p_text, p_tags = self.ensure_p()
fb2_text += p_text
tags += p_tags
fb2_text.append('<%s>' % style_tag)
tags.append(style_tag)
# Processes style information.
if style['font-style'] == 'italic':
s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
fb2_out += s_out
tags += s_tags
elif style['font-weight'] in ('bold', 'bolder'):
s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
fb2_out += s_out
tags += s_tags
if tag in TAG_SPACE:
fb2_text.append(' ')
if hasattr(elem, 'text') and elem.text:
# Process element text.
if hasattr(elem_tree, 'text') and elem_tree.text:
if not self.in_p:
fb2_text.append('<p>')
fb2_text.append(prepare_string_for_xml(elem.text))
fb2_out.append('<p>')
fb2_out.append(prepare_string_for_xml(elem_tree.text))
if not self.in_p:
fb2_text.append('</p>')
fb2_out.append('</p>')
for item in elem:
fb2_text += self.dump_text(item, stylizer, page, tag_stack+tags)
# Process sub-elements.
for item in elem_tree:
fb2_out += self.dump_text(item, stylizer, page, tag_stack+tags)
# Close open FB2 tags.
tags.reverse()
fb2_text += self.close_tags(tags)
fb2_out += self.close_tags(tags)
if hasattr(elem, 'tail') and elem.tail:
# Process element text that comes after the close of the XHTML tag but before the next XHTML tag.
if hasattr(elem_tree, 'tail') and elem_tree.tail:
if not self.in_p:
fb2_text.append('<p>')
fb2_text.append(prepare_string_for_xml(elem.tail))
fb2_out.append('<p>')
fb2_out.append(prepare_string_for_xml(elem_tree.tail))
if not self.in_p:
fb2_text.append('</p>')
fb2_out.append('</p>')
return fb2_text
return fb2_out
def close_tags(self, tags):
text = []