diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index 252453d25e..f27729ae8c 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -26,32 +26,6 @@ from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
-TAG_MAP = {
- 'b' : 'strong',
- 'i' : 'emphasis',
- 'p' : 'p',
- 'li' : 'p',
- 'div': 'p',
-}
-
-TAG_SPACE = []
-
-TAG_IMAGES = [
- 'img',
-]
-
-TAG_LINKS = [
-]
-
-BLOCK = [
- 'p',
-]
-
-STYLES = [
- ('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}),
- ('font-style', {'italic' : 'emphasis'}),
-]
-
class FB2MLizer(object):
'''
Todo: * Include more FB2 specific tags in the conversion.
@@ -63,24 +37,32 @@ class FB2MLizer(object):
def __init__(self, log):
self.log = log
self.image_hrefs = {}
+ self.reset_state()
+
+ def reset_state(self):
# Used to ensure text and tags are always within <p> and </p>
self.in_p = False
+ # Mapping of image names. OEB allows for images to have the same name but be stored
+ # in different directories. FB2 images are all in a flat layout so we rename all images
+ # into a sequential numbering system to ensure there are no collisions between image names.
+ self.image_hrefs = {}
def extract_content(self, oeb_book, opts):
self.log.info('Converting XHTML to FB2 markup...')
self.oeb_book = oeb_book
self.opts = opts
+
return self.fb2mlize_spine()
def fb2mlize_spine(self):
- self.image_hrefs = {}
- self.link_hrefs = {}
+ self.reset_state()
+
output = [self.fb2_header()]
output.append(self.get_text())
- output.append(self.fb2_body_footer())
output.append(self.fb2mlize_images())
output.append(self.fb2_footer())
output = self.clean_text(u''.join(output))
+
if self.opts.pretty_print:
return u'\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
else:
@@ -127,24 +109,21 @@ class FB2MLizer(object):
'' \
'%s %s' \
'' \
- '' % tuple(map(prepare_string_for_xml, (author_first, author_middle, author_last,
+ '' % tuple(map(prepare_string_for_xml, (author_first, author_middle, author_last,
self.oeb_book.metadata.title[0].value, __appname__, __version__)))
+ def fb2_footer(self):
+ return u''
+
def get_text(self):
- text = []
+ text = ['']
for item in self.oeb_book.spine:
self.log.debug('Converting %s to FictionBook2 XML' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
text.append('')
text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
text.append('')
- return ''.join(text)
-
- def fb2_body_footer(self):
- return u''
-
- def fb2_footer(self):
- return u''
+ return ''.join(text) + ''
def fb2mlize_images(self):
images = []
@@ -178,24 +157,6 @@ class FB2MLizer(object):
else:
self.in_p = True
return [''], ['p']
-
- def insert_empty_line(self, tags):
- if self.in_p:
- text = ['']
- closed_tags = []
- tags.reverse()
- for t in tags:
- text.append('%s>' % t)
- closed_tags.append(t)
- if t == 'p':
- break
- text.append('')
- closed_tags.reverse()
- for t in closed_tags:
- text.append('<%s>' % t)
- return text
- else:
- return ['']
def close_open_p(self, tags):
text = ['']
@@ -219,84 +180,125 @@ class FB2MLizer(object):
self.in_p = True
return text, added_p
+
+ def handle_simple_tag(self, tag, tags):
+ s_out = []
+ s_tags = []
+ if tag not in tags:
+ p_out, p_tags = self.ensure_p()
+ s_out += p_out
+ s_tags += p_tags
+ s_out.append('<%s>' % tag)
+ s_tags.append(tag)
+ return s_out, s_tags
- def dump_text(self, elem, stylizer, page, tag_stack=[]):
- if not isinstance(elem.tag, basestring) \
- or namespace(elem.tag) != XHTML_NS:
+ def dump_text(self, elem_tree, stylizer, page, tag_stack=[]):
+ '''
+ This function is intended to be used in a recursive manner. dump_text will
+ run through all elements in the elem_tree and call itself on each element.
+
+ self.image_hrefs will be populated by calling this function.
+
+ @param elem_tree: etree representation of XHTML content to be transformed.
+ @param stylizer: Used to track the style of elements within the tree.
+ @param page: OEB page used to determine absolute urls.
+ @param tag_stack: List of open FB2 tags to take into account.
+
+ @return: List of strings representing the XHTML converted to FB2 markup.
+ '''
+ # Only convert nodes whose tag is a string and belongs to the XHTML namespace; anything else is skipped.
+ if not isinstance(elem_tree.tag, basestring) or namespace(elem_tree.tag) != XHTML_NS:
return []
- style = stylizer.style(elem)
- if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
- or style['visibility'] == 'hidden':
+ style = stylizer.style(elem_tree)
+ if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden':
return []
- fb2_text = []
+ # FB2 generated output.
+ fb2_out = []
+ # FB2 tags in the order they are opened. This will be used to close the tags.
tags = []
+ # First tag in tree
+ tag = barename(elem_tree.tag)
- tag = barename(elem.tag)
-
- if tag in TAG_IMAGES:
- if elem.attrib.get('src', None):
- if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys():
- self.image_hrefs[page.abshref(elem.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys())
- p_txt, p_tag = self.ensure_p()
- fb2_text += p_txt
- tags += p_tag
- fb2_text.append('' % self.image_hrefs[page.abshref(elem.attrib['src'])])
-
+ # Process the XHTML tag if it needs to be converted to an FB2 tag.
if tag == 'h1' and self.opts.h1_to_title or tag == 'h2' and self.opts.h2_to_title or tag == 'h3' and self.opts.h3_to_title:
- fb2_text.append('
')
+ fb2_out.append('')
tags.append('title')
- if tag == 'br':
- fb2_text += self.insert_empty_line(tag_stack+tags)
-
- fb2_tag = TAG_MAP.get(tag, None)
- if fb2_tag == 'p':
+ if tag == 'img':
+ # TODO: Check that the image is in the manifest and only write the tag if it is.
+ if elem_tree.attrib.get('src', None):
+ if page.abshref(elem_tree.attrib['src']) not in self.image_hrefs.keys():
+ self.image_hrefs[page.abshref(elem_tree.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys())
+ p_txt, p_tag = self.ensure_p()
+ fb2_out += p_txt
+ tags += p_tag
+ fb2_out.append('' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])])
+ elif tag == 'br':
+ if self.in_p:
+ closed_tags = []
+ open_tags = tag_stack+tags
+ open_tags.reverse()
+ for t in open_tags:
+ fb2_out.append('%s>' % t)
+ closed_tags.append(t)
+ if t == 'p':
+ break
+ fb2_out.append('')
+ closed_tags.reverse()
+ for t in closed_tags:
+ fb2_out.append('<%s>' % t)
+ else:
+ fb2_out.append('')
+ elif tag in ('div', 'li', 'p'):
p_text, added_p = self.close_open_p(tag_stack+tags)
- fb2_text += p_text
+ fb2_out += p_text
if added_p:
tags.append('p')
- elif fb2_tag and fb2_tag not in tag_stack+tags:
- p_text, p_tags = self.ensure_p()
- fb2_text += p_text
- tags += p_tags
- fb2_text.append('<%s>' % fb2_tag)
- tags.append(fb2_tag)
+ elif tag == 'b':
+ s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
+ fb2_out += s_out
+ tags += s_tags
+ elif tag == 'i':
+ s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
+ fb2_out += s_out
+ tags += s_tags
- # Processes style information
- for s in STYLES:
- style_tag = s[1].get(style[s[0]], None)
- if style_tag and style_tag not in tag_stack+tags:
- p_text, p_tags = self.ensure_p()
- fb2_text += p_text
- tags += p_tags
- fb2_text.append('<%s>' % style_tag)
- tags.append(style_tag)
-
- if tag in TAG_SPACE:
- fb2_text.append(' ')
-
- if hasattr(elem, 'text') and elem.text:
+ # Processes style information.
+ if style['font-style'] == 'italic':
+ s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
+ fb2_out += s_out
+ tags += s_tags
+ elif style['font-weight'] in ('bold', 'bolder'):
+ s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
+ fb2_out += s_out
+ tags += s_tags
+
+ # Process element text.
+ if hasattr(elem_tree, 'text') and elem_tree.text:
if not self.in_p:
- fb2_text.append('')
- fb2_text.append(prepare_string_for_xml(elem.text))
+ fb2_out.append('
')
+ fb2_out.append(prepare_string_for_xml(elem_tree.text))
if not self.in_p:
- fb2_text.append('
')
+ fb2_out.append('')
- for item in elem:
- fb2_text += self.dump_text(item, stylizer, page, tag_stack+tags)
+ # Process sub-elements.
+ for item in elem_tree:
+ fb2_out += self.dump_text(item, stylizer, page, tag_stack+tags)
+ # Close open FB2 tags.
tags.reverse()
- fb2_text += self.close_tags(tags)
+ fb2_out += self.close_tags(tags)
- if hasattr(elem, 'tail') and elem.tail:
+ # Process element text that comes after the close of the XHTML tag but before the next XHTML tag.
+ if hasattr(elem_tree, 'tail') and elem_tree.tail:
if not self.in_p:
- fb2_text.append('')
- fb2_text.append(prepare_string_for_xml(elem.tail))
+ fb2_out.append('
')
+ fb2_out.append(prepare_string_for_xml(elem_tree.tail))
if not self.in_p:
- fb2_text.append('
')
+ fb2_out.append('')
- return fb2_text
+ return fb2_out
def close_tags(self, tags):
text = []