FB2 Output: Generate output 100% compliant with the FB2 spec

This commit is contained in:
Kovid Goyal 2010-12-04 18:10:58 -07:00
commit bad82f3daa

View File

@ -8,16 +8,10 @@ __docformat__ = 'restructuredtext en'
Transform OEB content into FB2 markup Transform OEB content into FB2 markup
''' '''
import cStringIO
from base64 import b64encode from base64 import b64encode
from datetime import datetime
import re import re
try:
from PIL import Image
Image
except ImportError:
import Image
from lxml import etree from lxml import etree
from calibre import prepare_string_for_xml from calibre import prepare_string_for_xml
@ -25,32 +19,7 @@ from calibre.constants import __appname__, __version__
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
from calibre.utils.magick.draw import save_cover_data_to
TAG_MAP = {
'b' : 'strong',
'i' : 'emphasis',
'p' : 'p',
'li' : 'p',
'div': 'p',
}
TAG_SPACE = []
TAG_IMAGES = [
'img',
]
TAG_LINKS = [
]
BLOCK = [
'p',
]
STYLES = [
('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}),
('font-style', {'italic' : 'emphasis'}),
]
class FB2MLizer(object): class FB2MLizer(object):
''' '''
@ -63,24 +32,32 @@ class FB2MLizer(object):
def __init__(self, log): def __init__(self, log):
self.log = log self.log = log
self.image_hrefs = {} self.image_hrefs = {}
self.reset_state()
def reset_state(self):
# Used to ensure text and tags are always within <p> and </p> # Used to ensure text and tags are always within <p> and </p>
self.in_p = False self.in_p = False
# Mapping of image names. OEB allows for images to have the same name but be stored
# in different directories. FB2 images are all in a flat layout so we rename all images
# into a sequential numbering system to ensure there are no collisions between image names.
self.image_hrefs = {}
def extract_content(self, oeb_book, opts): def extract_content(self, oeb_book, opts):
self.log.info('Converting XHTML to FB2 markup...') self.log.info('Converting XHTML to FB2 markup...')
self.oeb_book = oeb_book self.oeb_book = oeb_book
self.opts = opts self.opts = opts
return self.fb2mlize_spine() return self.fb2mlize_spine()
def fb2mlize_spine(self): def fb2mlize_spine(self):
self.image_hrefs = {} self.reset_state()
self.link_hrefs = {}
output = [self.fb2_header()] output = [self.fb2_header()]
output.append(self.get_text()) output.append(self.get_text())
output.append(self.fb2_body_footer())
output.append(self.fb2mlize_images()) output.append(self.fb2mlize_images())
output.append(self.fb2_footer()) output.append(self.fb2_footer())
output = self.clean_text(u''.join(output)) output = self.clean_text(u''.join(output))
if self.opts.pretty_print: if self.opts.pretty_print:
return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True) return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
else: else:
@ -97,65 +74,75 @@ class FB2MLizer(object):
return text return text
def fb2_header(self): def fb2_header(self):
author_first = u'' metadata = {}
author_middle = u'' metadata['author_first'] = u''
author_last = u'' metadata['author_middle'] = u''
author_parts = self.oeb_book.metadata.creator[0].value.split(' ') metadata['author_last'] = u''
metadata['title'] = self.oeb_book.metadata.title[0].value
metadata['appname'] = __appname__
metadata['version'] = __version__
metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en'
author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
if len(author_parts) == 1: if len(author_parts) == 1:
author_last = author_parts[0] metadata['author_last'] = author_parts[0]
elif len(author_parts) == 2: elif len(author_parts) == 2:
author_first = author_parts[0] metadata['author_first'] = author_parts[0]
author_last = author_parts[1] metadata['author_last'] = author_parts[1]
else: else:
author_first = author_parts[0] metadata['author_first'] = author_parts[0]
author_middle = ' '.join(author_parts[1:-2]) metadata['author_middle'] = ' '.join(author_parts[1:-2])
author_last = author_parts[-1] metadata['author_last'] = author_parts[-1]
for key, value in metadata.items():
metadata[key] = prepare_string_for_xml(value)
return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \ return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \
'<description>' \ '<description>' \
'<title-info>' \ '<title-info>' \
'<genre></genre>' \ '<genre>antique</genre>' \
'<author>' \ '<author>' \
'<first-name>%s</first-name>' \ '<first-name>%(author_first)s</first-name>' \
'<middle-name>%s</middle-name>' \ '<middle-name>%(author_middle)s</middle-name>' \
'<last-name>%s</last-name>' \ '<last-name>%(author_last)s</last-name>' \
'</author>' \ '</author>' \
'<book-title>%s</book-title>' \ '<book-title>%(title)s</book-title>' \
'<annotation><p/></annotation>' \ '<lang>%(lang)s</lang>' \
'</title-info>' \ '</title-info>' \
'<document-info>' \ '<document-info>' \
'<program-used>%s %s</program-used>' \ '<author>' \
'<first-name></first-name>' \
'<middle-name></middle-name>' \
'<last-name></last-name>' \
'</author>' \
'<program-used>%(appname)s %(version)s</program-used>' \
'<date>%(date)s</date>' \
'<id>1</id>' \
'<version>1.0</version>' \
'</document-info>' \ '</document-info>' \
'</description><body>' % tuple(map(prepare_string_for_xml, (author_first, author_middle, author_last, '</description>' % metadata
self.oeb_book.metadata.title[0].value, __appname__, __version__)))
def fb2_footer(self):
return u'</FictionBook>'
def get_text(self): def get_text(self):
text = [] text = ['<body>']
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
self.log.debug('Converting %s to FictionBook2 XML' % item.href) self.log.debug('Converting %s to FictionBook2 XML' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
text.append('<section>') text.append('<section>')
text += self.dump_text(item.data.find(XHTML('body')), stylizer, item) text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
text.append('</section>') text.append('</section>')
return ''.join(text) return ''.join(text) + '</body>'
def fb2_body_footer(self):
return u'</body>'
def fb2_footer(self):
return u'</FictionBook>'
def fb2mlize_images(self): def fb2mlize_images(self):
images = [] images = []
for item in self.oeb_book.manifest: for item in self.oeb_book.manifest:
if item.media_type in OEB_RASTER_IMAGES: if item.media_type in OEB_RASTER_IMAGES:
try: try:
im = Image.open(cStringIO.StringIO(item.data)).convert('RGB') data = save_cover_data_to(item.data, None,
data = cStringIO.StringIO() return_data=True)
im.save(data, 'JPEG')
data = data.getvalue()
raw_data = b64encode(data) raw_data = b64encode(data)
# Don't put the encoded image on a single line. # Don't put the encoded image on a single line.
data = '' data = ''
@ -178,29 +165,11 @@ class FB2MLizer(object):
else: else:
self.in_p = True self.in_p = True
return ['<p>'], ['p'] return ['<p>'], ['p']
def insert_empty_line(self, tags):
if self.in_p:
text = ['']
closed_tags = []
tags.reverse()
for t in tags:
text.append('</%s>' % t)
closed_tags.append(t)
if t == 'p':
break
text.append('<empty-line />')
closed_tags.reverse()
for t in closed_tags:
text.append('<%s>' % t)
return text
else:
return ['<empty-line />']
def close_open_p(self, tags): def close_open_p(self, tags):
text = [''] text = ['']
added_p = False added_p = False
if self.in_p: if self.in_p:
# Close all up to p. Close p. Reopen all closed tags including p. # Close all up to p. Close p. Reopen all closed tags including p.
closed_tags = [] closed_tags = []
@ -217,86 +186,127 @@ class FB2MLizer(object):
text.append('<p>') text.append('<p>')
added_p = True added_p = True
self.in_p = True self.in_p = True
return text, added_p return text, added_p
def dump_text(self, elem, stylizer, page, tag_stack=[]): def handle_simple_tag(self, tag, tags):
if not isinstance(elem.tag, basestring) \ s_out = []
or namespace(elem.tag) != XHTML_NS: s_tags = []
if tag not in tags:
p_out, p_tags = self.ensure_p()
s_out += p_out
s_tags += p_tags
s_out.append('<%s>' % tag)
s_tags.append(tag)
return s_out, s_tags
def dump_text(self, elem_tree, stylizer, page, tag_stack=[]):
'''
This function is intended to be used in a recursive manner. dump_text will
run through all elements in the elem_tree and call itself on each element.
self.image_hrefs will be populated by calling this function.
@param elem_tree: etree representation of XHTML content to be transformed.
@param stylizer: Used to track the style of elements within the tree.
@param page: OEB page used to determine absolute urls.
@param tag_stack: List of open FB2 tags to take into account.
@return: List of strings representing the XHTML converted to FB2 markup.
'''
# Ensure the tag of what we are converting is a string and that the first tag is part of the XHTML namespace.
if not isinstance(elem_tree.tag, basestring) or namespace(elem_tree.tag) != XHTML_NS:
return [] return []
style = stylizer.style(elem) style = stylizer.style(elem_tree)
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden':
or style['visibility'] == 'hidden':
return [] return []
fb2_text = [] # FB2 generated output.
fb2_out = []
# FB2 tags in the order they are opened. This will be used to close the tags.
tags = [] tags = []
# First tag in tree
tag = barename(elem_tree.tag)
tag = barename(elem.tag) # Process the XHTML tag if it needs to be converted to an FB2 tag.
if tag in TAG_IMAGES:
if elem.attrib.get('src', None):
if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys():
self.image_hrefs[page.abshref(elem.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys())
p_txt, p_tag = self.ensure_p()
fb2_text += p_txt
tags += p_tag
fb2_text.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem.attrib['src'])])
if tag == 'h1' and self.opts.h1_to_title or tag == 'h2' and self.opts.h2_to_title or tag == 'h3' and self.opts.h3_to_title: if tag == 'h1' and self.opts.h1_to_title or tag == 'h2' and self.opts.h2_to_title or tag == 'h3' and self.opts.h3_to_title:
fb2_text.append('<title>') fb2_out.append('<title>')
tags.append('title') tags.append('title')
if tag == 'br': if tag == 'img':
fb2_text += self.insert_empty_line(tag_stack+tags) # TODO: Check that the image is in the manifest and only write the tag if it is.
if elem_tree.attrib.get('src', None):
fb2_tag = TAG_MAP.get(tag, None) if page.abshref(elem_tree.attrib['src']) not in self.image_hrefs.keys():
if fb2_tag == 'p': self.image_hrefs[page.abshref(elem_tree.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys())
p_txt, p_tag = self.ensure_p()
fb2_out += p_txt
tags += p_tag
fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])])
elif tag == 'br':
if self.in_p:
closed_tags = []
open_tags = tag_stack+tags
open_tags.reverse()
for t in open_tags:
fb2_out.append('</%s>' % t)
closed_tags.append(t)
if t == 'p':
break
fb2_out.append('<empty-line />')
closed_tags.reverse()
for t in closed_tags:
fb2_out.append('<%s>' % t)
else:
fb2_out.append('<empty-line />')
elif tag in ('div', 'li', 'p'):
p_text, added_p = self.close_open_p(tag_stack+tags) p_text, added_p = self.close_open_p(tag_stack+tags)
fb2_text += p_text fb2_out += p_text
if added_p: if added_p:
tags.append('p') tags.append('p')
elif fb2_tag and fb2_tag not in tag_stack+tags: elif tag == 'b':
p_text, p_tags = self.ensure_p() s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
fb2_text += p_text fb2_out += s_out
tags += p_tags tags += s_tags
fb2_text.append('<%s>' % fb2_tag) elif tag == 'i':
tags.append(fb2_tag) s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
fb2_out += s_out
tags += s_tags
# Processes style information # Processes style information.
for s in STYLES: if style['font-style'] == 'italic':
style_tag = s[1].get(style[s[0]], None) s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
if style_tag and style_tag not in tag_stack+tags: fb2_out += s_out
p_text, p_tags = self.ensure_p() tags += s_tags
fb2_text += p_text elif style['font-weight'] in ('bold', 'bolder'):
tags += p_tags s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
fb2_text.append('<%s>' % style_tag) fb2_out += s_out
tags.append(style_tag) tags += s_tags
if tag in TAG_SPACE: # Process element text.
fb2_text.append(' ') if hasattr(elem_tree, 'text') and elem_tree.text:
if hasattr(elem, 'text') and elem.text:
if not self.in_p: if not self.in_p:
fb2_text.append('<p>') fb2_out.append('<p>')
fb2_text.append(prepare_string_for_xml(elem.text)) fb2_out.append(prepare_string_for_xml(elem_tree.text))
if not self.in_p: if not self.in_p:
fb2_text.append('</p>') fb2_out.append('</p>')
for item in elem: # Process sub-elements.
fb2_text += self.dump_text(item, stylizer, page, tag_stack+tags) for item in elem_tree:
fb2_out += self.dump_text(item, stylizer, page, tag_stack+tags)
# Close open FB2 tags.
tags.reverse() tags.reverse()
fb2_text += self.close_tags(tags) fb2_out += self.close_tags(tags)
if hasattr(elem, 'tail') and elem.tail: # Process element text that comes after the close of the XHTML tag but before the next XHTML tag.
if hasattr(elem_tree, 'tail') and elem_tree.tail:
if not self.in_p: if not self.in_p:
fb2_text.append('<p>') fb2_out.append('<p>')
fb2_text.append(prepare_string_for_xml(elem.tail)) fb2_out.append(prepare_string_for_xml(elem_tree.tail))
if not self.in_p: if not self.in_p:
fb2_text.append('</p>') fb2_out.append('</p>')
return fb2_text return fb2_out
def close_tags(self, tags): def close_tags(self, tags):
text = [] text = []