diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 252453d25e..77332b826e 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -8,16 +8,10 @@ __docformat__ = 'restructuredtext en' Transform OEB content into FB2 markup ''' -import cStringIO from base64 import b64encode +from datetime import datetime import re -try: - from PIL import Image - Image -except ImportError: - import Image - from lxml import etree from calibre import prepare_string_for_xml @@ -25,32 +19,7 @@ from calibre.constants import __appname__, __version__ from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES - -TAG_MAP = { - 'b' : 'strong', - 'i' : 'emphasis', - 'p' : 'p', - 'li' : 'p', - 'div': 'p', -} - -TAG_SPACE = [] - -TAG_IMAGES = [ - 'img', -] - -TAG_LINKS = [ -] - -BLOCK = [ - 'p', -] - -STYLES = [ - ('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}), - ('font-style', {'italic' : 'emphasis'}), -] +from calibre.utils.magick.draw import save_cover_data_to class FB2MLizer(object): ''' @@ -63,24 +32,32 @@ class FB2MLizer(object): def __init__(self, log): self.log = log self.image_hrefs = {} + self.reset_state() + + def reset_state(self): # Used to ensure text and tags are always within
<p> and </p>
self.in_p = False + # Mapping of image names. OEB allows for images to have the same name but be stored + # in different directories. FB2 images are all in a flat layout so we rename all images + # into a sequential numbering system to ensure there are no collisions between image names. + self.image_hrefs = {} def extract_content(self, oeb_book, opts): self.log.info('Converting XHTML to FB2 markup...') self.oeb_book = oeb_book self.opts = opts + return self.fb2mlize_spine() def fb2mlize_spine(self): - self.image_hrefs = {} - self.link_hrefs = {} + self.reset_state() + output = [self.fb2_header()] output.append(self.get_text()) - output.append(self.fb2_body_footer()) output.append(self.fb2mlize_images()) output.append(self.fb2_footer()) output = self.clean_text(u''.join(output)) + if self.opts.pretty_print: return u'\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True) else: @@ -97,65 +74,75 @@ class FB2MLizer(object): return text def fb2_header(self): - author_first = u'' - author_middle = u'' - author_last = u'' - author_parts = self.oeb_book.metadata.creator[0].value.split(' ') + metadata = {} + metadata['author_first'] = u'' + metadata['author_middle'] = u'' + metadata['author_last'] = u'' + metadata['title'] = self.oeb_book.metadata.title[0].value + metadata['appname'] = __appname__ + metadata['version'] = __version__ + metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year) + metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en' + author_parts = self.oeb_book.metadata.creator[0].value.split(' ') if len(author_parts) == 1: - author_last = author_parts[0] + metadata['author_last'] = author_parts[0] elif len(author_parts) == 2: - author_first = author_parts[0] - author_last = author_parts[1] + metadata['author_first'] = author_parts[0] + metadata['author_last'] = author_parts[1] else: - author_first = author_parts[0] - author_middle = ' 
'.join(author_parts[1:-2]) - author_last = author_parts[-1] + metadata['author_first'] = author_parts[0] + metadata['author_middle'] = ' '.join(author_parts[1:-2]) + metadata['author_last'] = author_parts[-1] + + for key, value in metadata.items(): + metadata[key] = prepare_string_for_xml(value) return u''], ['p']
-
- def insert_empty_line(self, tags):
- if self.in_p:
- text = ['']
- closed_tags = []
- tags.reverse()
- for t in tags:
- text.append('%s>' % t)
- closed_tags.append(t)
- if t == 'p':
- break
- text.append('
')
added_p = True
self.in_p = True
-
+
return text, added_p
- def dump_text(self, elem, stylizer, page, tag_stack=[]):
- if not isinstance(elem.tag, basestring) \
- or namespace(elem.tag) != XHTML_NS:
+ def handle_simple_tag(self, tag, tags):
+ s_out = []
+ s_tags = []
+ if tag not in tags:
+ p_out, p_tags = self.ensure_p()
+ s_out += p_out
+ s_tags += p_tags
+ s_out.append('<%s>' % tag)
+ s_tags.append(tag)
+ return s_out, s_tags
+
+ def dump_text(self, elem_tree, stylizer, page, tag_stack=[]):
+ '''
+ This function is intended to be used in a recursive manner. dump_text will
+ run through all elements in the elem_tree and call itself on each element.
+
+ self.image_hrefs will be populated by calling this function.
+
+ @param elem_tree: etree representation of XHTML content to be transformed.
+ @param stylizer: Used to track the style of elements within the tree.
+ @param page: OEB page used to determine absolute urls.
+ @param tag_stack: List of open FB2 tags to take into account.
+
+ @return: List of strings representing the XHTML converted to FB2 markup.
+ '''
+ # Ensure what we are converting is not a string and that the first tag is part of the XHTML namespace.
+ if not isinstance(elem_tree.tag, basestring) or namespace(elem_tree.tag) != XHTML_NS:
return []
- style = stylizer.style(elem)
- if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
- or style['visibility'] == 'hidden':
+ style = stylizer.style(elem_tree)
+ if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden':
return []
- fb2_text = []
+ # FB2 generated output.
+ fb2_out = []
+ # FB2 tags in the order they are opened. This will be used to close the tags.
tags = []
+ # First tag in tree
+ tag = barename(elem_tree.tag)
- tag = barename(elem.tag)
-
- if tag in TAG_IMAGES:
- if elem.attrib.get('src', None):
- if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys():
- self.image_hrefs[page.abshref(elem.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys())
- p_txt, p_tag = self.ensure_p()
- fb2_text += p_txt
- tags += p_tag
- fb2_text.append('
') - fb2_text.append(prepare_string_for_xml(elem.text)) + fb2_out.append('
') + fb2_out.append(prepare_string_for_xml(elem_tree.text)) if not self.in_p: - fb2_text.append('
') + fb2_out.append('') - for item in elem: - fb2_text += self.dump_text(item, stylizer, page, tag_stack+tags) + # Process sub-elements. + for item in elem_tree: + fb2_out += self.dump_text(item, stylizer, page, tag_stack+tags) + # Close open FB2 tags. tags.reverse() - fb2_text += self.close_tags(tags) + fb2_out += self.close_tags(tags) - if hasattr(elem, 'tail') and elem.tail: + # Process element text that comes after the close of the XHTML tag but before the next XHTML tag. + if hasattr(elem_tree, 'tail') and elem_tree.tail: if not self.in_p: - fb2_text.append('') - fb2_text.append(prepare_string_for_xml(elem.tail)) + fb2_out.append('
') + fb2_out.append(prepare_string_for_xml(elem_tree.tail)) if not self.in_p: - fb2_text.append('
') + fb2_out.append('') - return fb2_text + return fb2_out def close_tags(self, tags): text = []