From fc39c1ad11979b721830a6845a5f625229fc7f8f Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 4 Dec 2010 16:57:13 -0500 Subject: [PATCH 01/10] FB2 Output: Rewrite and restructure. --- src/calibre/ebooks/fb2/fb2ml.py | 226 ++++++++++++++++---------------- 1 file changed, 114 insertions(+), 112 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 252453d25e..f27729ae8c 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -26,32 +26,6 @@ from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES -TAG_MAP = { - 'b' : 'strong', - 'i' : 'emphasis', - 'p' : 'p', - 'li' : 'p', - 'div': 'p', -} - -TAG_SPACE = [] - -TAG_IMAGES = [ - 'img', -] - -TAG_LINKS = [ -] - -BLOCK = [ - 'p', -] - -STYLES = [ - ('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}), - ('font-style', {'italic' : 'emphasis'}), -] - class FB2MLizer(object): ''' Todo: * Include more FB2 specific tags in the conversion. @@ -63,24 +37,32 @@ class FB2MLizer(object): def __init__(self, log): self.log = log self.image_hrefs = {} + self.reset_state() + + def reset_state(self): # Used to ensure text and tags are always within

and

self.in_p = False + # Mapping of image names. OEB allows for images to have the same name but be stored + # in different directories. FB2 images are all in a flat layout so we rename all images + # into a sequential numbering system to ensure there are no collisions between image names. + self.image_hrefs = {} def extract_content(self, oeb_book, opts): self.log.info('Converting XHTML to FB2 markup...') self.oeb_book = oeb_book self.opts = opts + return self.fb2mlize_spine() def fb2mlize_spine(self): - self.image_hrefs = {} - self.link_hrefs = {} + self.reset_state() + output = [self.fb2_header()] output.append(self.get_text()) - output.append(self.fb2_body_footer()) output.append(self.fb2mlize_images()) output.append(self.fb2_footer()) output = self.clean_text(u''.join(output)) + if self.opts.pretty_print: return u'\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True) else: @@ -127,24 +109,21 @@ class FB2MLizer(object): '' \ '%s %s' \ '' \ - '' % tuple(map(prepare_string_for_xml, (author_first, author_middle, author_last, + '' % tuple(map(prepare_string_for_xml, (author_first, author_middle, author_last, self.oeb_book.metadata.title[0].value, __appname__, __version__))) + def fb2_footer(self): + return u'' + def get_text(self): - text = [] + text = [''] for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) text.append('
') text += self.dump_text(item.data.find(XHTML('body')), stylizer, item) text.append('
') - return ''.join(text) - - def fb2_body_footer(self): - return u'' - - def fb2_footer(self): - return u'' + return ''.join(text) + '' def fb2mlize_images(self): images = [] @@ -178,24 +157,6 @@ class FB2MLizer(object): else: self.in_p = True return ['

'], ['p'] - - def insert_empty_line(self, tags): - if self.in_p: - text = [''] - closed_tags = [] - tags.reverse() - for t in tags: - text.append('' % t) - closed_tags.append(t) - if t == 'p': - break - text.append('') - closed_tags.reverse() - for t in closed_tags: - text.append('<%s>' % t) - return text - else: - return [''] def close_open_p(self, tags): text = [''] @@ -219,84 +180,125 @@ class FB2MLizer(object): self.in_p = True return text, added_p + + def handle_simple_tag(self, tag, tags): + s_out = [] + s_tags = [] + if tag not in tags: + p_out, p_tags = self.ensure_p() + s_out += p_out + s_tags += p_tags + s_out.append('<%s>' % tag) + s_tags.append(tag) + return s_out, s_tags - def dump_text(self, elem, stylizer, page, tag_stack=[]): - if not isinstance(elem.tag, basestring) \ - or namespace(elem.tag) != XHTML_NS: + def dump_text(self, elem_tree, stylizer, page, tag_stack=[]): + ''' + This function is intended to be used in a recursive manner. dump_text will + run though all elements in the elem_tree and call itself on each element. + + self.image_hrefs will be populated by calling this function. + + @param elem_tree: etree representation of XHTML content to be transformed. + @param stylizer: Used to track the style of elements within the tree. + @param page: OEB page used to determine absolute urls. + @param tag_stack: List of open FB2 tags to take into account. + + @return: List of string representing the XHTML converted to FB2 markup. + ''' + # Ensure what we are converting is not a string and that the fist tag is part of the XHTML namespace. + if not isinstance(elem_tree.tag, basestring) or namespace(elem_tree.tag) != XHTML_NS: return [] - style = stylizer.style(elem) - if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ - or style['visibility'] == 'hidden': + style = stylizer.style(elem_tree) + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden': return [] - fb2_text = [] + # FB2 generated output. + fb2_out = [] + # FB2 tags in the order they are opened. This will be used to close the tags. tags = [] + # First tag in tree + tag = barename(elem_tree.tag) - tag = barename(elem.tag) - - if tag in TAG_IMAGES: - if elem.attrib.get('src', None): - if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys(): - self.image_hrefs[page.abshref(elem.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys()) - p_txt, p_tag = self.ensure_p() - fb2_text += p_txt - tags += p_tag - fb2_text.append('' % self.image_hrefs[page.abshref(elem.attrib['src'])]) - + # Process the XHTML tag if it needs to be converted to an FB2 tag. if tag == 'h1' and self.opts.h1_to_title or tag == 'h2' and self.opts.h2_to_title or tag == 'h3' and self.opts.h3_to_title: - fb2_text.append('') + fb2_out.append('<title>') tags.append('title') - if tag == 'br': - fb2_text += self.insert_empty_line(tag_stack+tags) - - fb2_tag = TAG_MAP.get(tag, None) - if fb2_tag == 'p': + if tag == 'img': + # TODO: Check that the image is in the manifest and only write the tag if it is. + if elem_tree.attrib.get('src', None): + if page.abshref(elem_tree.attrib['src']) not in self.image_hrefs.keys(): + self.image_hrefs[page.abshref(elem_tree.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys()) + p_txt, p_tag = self.ensure_p() + fb2_out += p_txt + tags += p_tag + fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])]) + elif tag == 'br': + if self.in_p: + closed_tags = [] + open_tags = tag_stack+tags + open_tags.reverse() + for t in open_tags: + fb2_out.append('</%s>' % t) + closed_tags.append(t) + if t == 'p': + break + fb2_out.append('<empty-line />') + closed_tags.reverse() + for t in closed_tags: + fb2_out.append('<%s>' % t) + else: + fb2_out.append('<empty-line />') + elif tag in ('div', 'li', 'p'): p_text, added_p = self.close_open_p(tag_stack+tags) - fb2_text += p_text + fb2_out += p_text if added_p: tags.append('p') - elif fb2_tag and fb2_tag not in tag_stack+tags: - p_text, p_tags = self.ensure_p() - fb2_text += p_text - tags += p_tags - fb2_text.append('<%s>' % fb2_tag) - tags.append(fb2_tag) + elif tag == 'b': + s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags) + fb2_out += s_out + tags += s_tags + elif tag == 'i': + s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags) + fb2_out += s_out + tags += s_tags - # Processes style information - for s in STYLES: - style_tag = s[1].get(style[s[0]], None) - if style_tag and style_tag not in tag_stack+tags: - p_text, p_tags = self.ensure_p() - fb2_text += p_text - tags += p_tags - fb2_text.append('<%s>' % style_tag) - tags.append(style_tag) - - if tag in TAG_SPACE: - fb2_text.append(' ') - - if hasattr(elem, 'text') and elem.text: + # Processes style information. + if style['font-style'] == 'italic': + s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags) + fb2_out += s_out + tags += s_tags + elif style['font-weight'] in ('bold', 'bolder'): + s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags) + fb2_out += s_out + tags += s_tags + + # Process element text. + if hasattr(elem_tree, 'text') and elem_tree.text: if not self.in_p: - fb2_text.append('<p>') - fb2_text.append(prepare_string_for_xml(elem.text)) + fb2_out.append('<p>') + fb2_out.append(prepare_string_for_xml(elem_tree.text)) if not self.in_p: - fb2_text.append('</p>') + fb2_out.append('</p>') - for item in elem: - fb2_text += self.dump_text(item, stylizer, page, tag_stack+tags) + # Process sub-elements. + for item in elem_tree: + fb2_out += self.dump_text(item, stylizer, page, tag_stack+tags) + # Close open FB2 tags. tags.reverse() - fb2_text += self.close_tags(tags) + fb2_out += self.close_tags(tags) - if hasattr(elem, 'tail') and elem.tail: + # Process element text that comes after the close of the XHTML tag but before the next XHTML tag. + if hasattr(elem_tree, 'tail') and elem_tree.tail: if not self.in_p: - fb2_text.append('<p>') - fb2_text.append(prepare_string_for_xml(elem.tail)) + fb2_out.append('<p>') + fb2_out.append(prepare_string_for_xml(elem_tree.tail)) if not self.in_p: - fb2_text.append('</p>') + fb2_out.append('</p>') - return fb2_text + return fb2_out def close_tags(self, tags): text = [] From bc669a1f987d80523df4a4ac61b0c35039104ad7 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 4 Dec 2010 19:47:16 -0500 Subject: [PATCH 02/10] FB2 Output: Produce correct metadata header. --- src/calibre/ebooks/fb2/fb2ml.py | 55 +++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index f27729ae8c..d83dc45a0f 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -10,6 +10,7 @@ Transform OEB content into FB2 markup import cStringIO from base64 import b64encode +from datetime import datetime import re try: @@ -79,38 +80,54 @@ class FB2MLizer(object): return text def fb2_header(self): - author_first = u'' - author_middle = u'' - author_last = u'' + metadata = {} + metadata['author_first'] = u'' + metadata['author_middle'] = u'' + metadata['author_last'] = u'' + metadata['title'] = self.oeb_book.metadata.title[0].value + metadata['appname'] = __appname__ + metadata['version'] = __version__ + metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year) + metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en' + author_parts = self.oeb_book.metadata.creator[0].value.split(' ') - if len(author_parts) == 1: - author_last = author_parts[0] + metadata['author_last'] = author_parts[0] elif len(author_parts) == 2: - author_first = author_parts[0] - author_last = author_parts[1] + metadata['author_first'] = author_parts[0] + metadata['author_last'] = author_parts[1] else: - author_first = author_parts[0] - author_middle = ' '.join(author_parts[1:-2]) - author_last = author_parts[-1] + metadata['author_first'] = author_parts[0] + metadata['author_middle'] = ' '.join(author_parts[1:-2]) + metadata['author_last'] = author_parts[-1] + + for key, value in metadata.items(): + metadata[key] = prepare_string_for_xml(value) return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \ '<description>' \ '<title-info>' \ - '<genre></genre>' \ + '<genre>antique</genre>' \ '<author>' \ - '<first-name>%s</first-name>' \ - '<middle-name>%s</middle-name>' \ - '<last-name>%s</last-name>' \ + '<first-name>%(author_first)s</first-name>' \ + '<middle-name>%(author_middle)s</middle-name>' \ + '<last-name>%(author_last)s</last-name>' \ '</author>' \ - '<book-title>%s</book-title>' \ - '<annotation><p/></annotation>' \ + '<book-title>%(title)s</book-title>' \ + '<lang>%(lang)s</lang>' \ '</title-info>' \ '<document-info>' \ - '<program-used>%s %s</program-used>' \ + '<author>' \ + '<first-name></first-name>' \ + '<middle-name></middle-name>' \ + '<last-name></last-name>' \ + '</author>' \ + '<program-used>%(appname)s %(version)s</program-used>' \ + '<date>%(date)s</date>' \ + '<id>1</id>' \ + '<version>1.0</version>' \ '</document-info>' \ - '</description>' % tuple(map(prepare_string_for_xml, (author_first, author_middle, author_last, - self.oeb_book.metadata.title[0].value, __appname__, __version__))) + '</description>' % metadata def fb2_footer(self): return u'</FictionBook>' From 1f7c291aeb3bae171e5bff44d0ea47ba3fdd5524 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 4 Dec 2010 20:37:16 -0500 Subject: [PATCH 03/10] FB2 Output: Use random uuid for book id. --- src/calibre/ebooks/fb2/fb2ml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index d83dc45a0f..d89570a44e 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -12,6 +12,7 @@ import cStringIO from base64 import b64encode from datetime import datetime import re +import uuid try: from PIL import Image @@ -88,7 +89,8 @@ class FB2MLizer(object): metadata['appname'] = __appname__ metadata['version'] = __version__ metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year) - metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en' + metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en' + metadata['id'] = '%s' % uuid.uuid4() author_parts = self.oeb_book.metadata.creator[0].value.split(' ') if len(author_parts) == 1: @@ -124,7 +126,7 @@ class FB2MLizer(object): '</author>' \ '<program-used>%(appname)s %(version)s</program-used>' \ '<date>%(date)s</date>' \ - '<id>1</id>' \ + '<id>%(id)s</id>' \ '<version>1.0</version>' \ '</document-info>' \ '</description>' % metadata From 63727bc608ddc2683ceb4ceca84a0e0776fb0ceb Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sat, 4 Dec 2010 19:06:47 -0700 Subject: [PATCH 04/10] ... --- src/calibre/ebooks/oeb/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 2e480a9941..a077fb0225 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -775,6 +775,7 @@ class Manifest(object): return u'Item(id=%r, href=%r, media_type=%r)' \ % (self.id, self.href, self.media_type) + # Parsing {{{ def _parse_xml(self, data): data = xml_to_unicode(data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)[0] @@ -1035,6 +1036,8 @@ class Manifest(object): data = item.data.cssText return ('utf-8', data) + # }}} + @dynamic_property def data(self): doc = """Provides MIME type sensitive access to the manifest From 0ea35abaf189ded4ba2e39a6201c545c21e8290e Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 4 Dec 2010 22:04:37 -0500 Subject: [PATCH 05/10] FB2 Output: Check image is in document and manifest before referencing and writing. --- src/calibre/ebooks/fb2/fb2ml.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index d89570a44e..3020c002a5 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -145,8 +145,14 @@ class FB2MLizer(object): return ''.join(text) + '</body>' def fb2mlize_images(self): + ''' + This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. + ''' images = [] for item in self.oeb_book.manifest: + # Don't write the image if it's not referenced in the document's text. + if item.href not in self.image_hrefs: + continue if item.media_type in OEB_RASTER_IMAGES: try: im = Image.open(cStringIO.StringIO(item.data)).convert('RGB') @@ -164,7 +170,7 @@ class FB2MLizer(object): col = 1 col += 1 data += char - images.append('<binary id="%s" content-type="%s">%s\n</binary>' % (self.image_hrefs.get(item.href, '_0000.JPEG'), item.media_type, data)) + images.append('<binary id="%s" content-type="%s">%s\n</binary>' % (self.image_hrefs[item.href], item.media_type, data)) except Exception as e: self.log.error('Error: Could not include file %s because ' \ '%s.' % (item.href, e)) @@ -245,14 +251,15 @@ class FB2MLizer(object): fb2_out.append('<title>') tags.append('title') if tag == 'img': - # TODO: Check that the image is in the manifest and only write the tag if it is. if elem_tree.attrib.get('src', None): - if page.abshref(elem_tree.attrib['src']) not in self.image_hrefs.keys(): - self.image_hrefs[page.abshref(elem_tree.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys()) - p_txt, p_tag = self.ensure_p() - fb2_out += p_txt - tags += p_tag - fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])]) + # Only write the image tag if it is in the manifest. + if page.abshref(elem_tree.attrib['src']) in self.oeb_book.manifest.hrefs.keys(): + if page.abshref(elem_tree.attrib['src']) not in self.image_hrefs.keys(): + self.image_hrefs[page.abshref(elem_tree.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys()) + p_txt, p_tag = self.ensure_p() + fb2_out += p_txt + tags += p_tag + fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])]) elif tag == 'br': if self.in_p: closed_tags = [] From 596c8b905bd7280093cc84eb57bef674326a63a0 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 4 Dec 2010 22:14:30 -0500 Subject: [PATCH 06/10] FB2 Output: SVG rasterization. --- src/calibre/ebooks/fb2/output.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/calibre/ebooks/fb2/output.py b/src/calibre/ebooks/fb2/output.py index 88508b83e0..33714c6e6e 100644 --- a/src/calibre/ebooks/fb2/output.py +++ b/src/calibre/ebooks/fb2/output.py @@ -29,6 +29,14 @@ class FB2Output(OutputFormatPlugin): def convert(self, oeb_book, output_path, input_plugin, opts, log): from calibre.ebooks.oeb.transforms.jacket import linearize_jacket + from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable + + try: + rasterizer = SVGRasterizer() + rasterizer(oeb_book, opts) + except Unavailable: + self.log.warn('SVG rasterizer unavailable, SVG will not be converted') + linearize_jacket(oeb_book) fb2mlizer = FB2MLizer(log) From cafe81f2e4528b596517108d501cd7a934864d15 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sat, 4 Dec 2010 20:20:20 -0700 Subject: [PATCH 07/10] EPUB Output: Mangle filenames to ensure they are unique, to support broken EPUB Readers like Aldiko and Stanza. Currently disabled pending further testing. --- src/calibre/ebooks/epub/output.py | 3 + .../ebooks/oeb/transforms/filenames.py | 130 ++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 src/calibre/ebooks/oeb/transforms/filenames.py diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index 17f6eb9f46..c5d11edc2b 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -142,6 +142,9 @@ class EPUBOutput(OutputFormatPlugin): def convert(self, oeb, output_path, input_plugin, opts, log): self.log, self.opts, self.oeb = log, opts, oeb + #from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames + #UniqueFilenames()(oeb, opts) + self.workaround_ade_quirks() self.workaround_webkit_quirks() self.upshift_markup() diff --git a/src/calibre/ebooks/oeb/transforms/filenames.py b/src/calibre/ebooks/oeb/transforms/filenames.py new file mode 100644 index 0000000000..2b22474d30 --- /dev/null +++ b/src/calibre/ebooks/oeb/transforms/filenames.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' + +import posixpath +from urlparse import urldefrag + +from lxml import etree +import cssutils + +from calibre.ebooks.oeb.base import rewrite_links, urlnormalize + +class RenameFiles(object): + + ''' + Rename files and adjust all links pointing to them. Note that the spine + and manifest are not touched by this transform. + ''' + + def __init__(self, rename_map): + self.rename_map = rename_map + + def __call__(self, oeb, opts): + self.log = oeb.logger + self.opts = opts + self.oeb = oeb + + for item in oeb.manifest.items: + self.current_item = item + if etree.iselement(item.data): + rewrite_links(self.current_item.data, self.url_replacer) + elif hasattr(item.data, 'cssText'): + cssutils.replaceUrls(item.data, self.url_replacer) + + if self.oeb.guide: + for ref in self.oeb.guide.values(): + href = urlnormalize(ref.href) + href, frag = urldefrag(href) + replacement = self.rename_map.get(href, None) + if replacement is not None: + nhref = replacement + if frag: + nhref += '#' + frag + ref.href = nhref + + if self.oeb.toc: + self.fix_toc_entry(self.oeb.toc) + + + def fix_toc_entry(self, toc): + if toc.href: + href = urlnormalize(toc.href) + href, frag = urldefrag(href) + replacement = self.rename_map.get(href, None) + + if replacement is not None: + nhref = replacement + if frag: + nhref = '#'.join((nhref, frag)) + toc.href = nhref + + for x in toc: + self.fix_toc_entry(x) + + def url_replacer(self, orig_url): + url = urlnormalize(orig_url) + path, frag = urldefrag(url) + href = self.current_item.abshref(path) + replacement = self.rename_map.get(href, None) + if replacement is None: + return orig_url + replacement = self.current_item.relhref(replacement) + if frag: + replacement += '#' + frag + return replacement + +class UniqueFilenames(object): + + 'Ensure that every item in the manifest has a unique filename' + + def __call__(self, oeb, opts): + self.log = oeb.logger + self.opts = opts + self.oeb = oeb + + self.seen_filenames = set([]) + self.rename_map = {} + + for item in list(oeb.manifest.items): + fname = posixpath.basename(item.href) + if fname in self.seen_filenames: + suffix = self.unique_suffix(fname) + data = item.data + base, ext = posixpath.splitext(item.href) + nhref = base + suffix + ext + nhref = oeb.manifest.generate(href=nhref)[1] + nitem = oeb.manifest.add(item.id, nhref, item.media_type, data=data, + fallback=item.fallback) + self.seen_filenames.add(posixpath.basename(nhref)) + self.rename_map[item.href] = nhref + if item.spine_position is not None: + oeb.spine.insert(item.spine_position, nitem, item.linear) + oeb.spine.remove(item) + oeb.manifest.remove(item) + else: + self.seen_filenames.add(fname) + + if self.rename_map: + self.log('Found non-unique filenames, renaming to support broken' + ' EPUB readers like FBReader, Aldiko and Stanza...') + from pprint import pformat + self.log.debug(pformat(self.rename_map)) + + renamer = RenameFiles(self.rename_map) + renamer(oeb, opts) + + + def unique_suffix(self, fname): + base, ext = posixpath.splitext(fname) + c = 0 + while True: + c += 1 + suffix = '_u%d'%c + candidate = base + suffix + ext + if candidate not in self.seen_filenames: + return suffix + From f7d9571c4c51f8640841f3dfd45af7a71d3fff12 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 4 Dec 2010 22:37:34 -0500 Subject: [PATCH 08/10] FB2 Output: Replace PIL with ImageMagick. Don't convert JPG images to JPG because it's unnecessary. --- src/calibre/ebooks/fb2/fb2ml.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 3020c002a5..b04cb50d46 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -8,18 +8,12 @@ __docformat__ = 'restructuredtext en' Transform OEB content into FB2 markup ''' -import cStringIO from base64 import b64encode from datetime import datetime +from mimetypes import types_map import re import uuid -try: - from PIL import Image - Image -except ImportError: - import Image - from lxml import etree from calibre import prepare_string_for_xml @@ -27,6 +21,7 @@ from calibre.constants import __appname__, __version__ from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES +from calibre.utils.magick import Image class FB2MLizer(object): ''' @@ -155,11 +150,11 @@ class FB2MLizer(object): continue if item.media_type in OEB_RASTER_IMAGES: try: - im = Image.open(cStringIO.StringIO(item.data)).convert('RGB') - data = cStringIO.StringIO() - im.save(data, 'JPEG') - data = data.getvalue() - + if not item.media_type == types_map['.jpeg'] or not item.media_type == types_map['.jpg']: + im = Image() + im.load(item.data) + im.set_compression_quality(70) + data = im.export('jpg') raw_data = b64encode(data) # Don't put the encoded image on a single line. data = '' From 4d3e99af6733a7ed7bba89297971b8847d5483d9 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 4 Dec 2010 22:59:37 -0500 Subject: [PATCH 09/10] FB2 Output: Fix writing incorrect mimetype. --- src/calibre/ebooks/fb2/fb2ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 479cd4d789..0748970c60 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -165,7 +165,7 @@ class FB2MLizer(object): col = 1 col += 1 data += char - images.append('<binary id="%s" content-type="%s">%s\n</binary>' % (self.image_hrefs[item.href], item.media_type, data)) + images.append('<binary id="%s">%s\n</binary>' % (self.image_hrefs[item.href], data)) except Exception as e: self.log.error('Error: Could not include file %s because ' \ '%s.' % (item.href, e)) From fcd87f216c6172443f3245b1ad117435aca516c9 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 4 Dec 2010 23:02:55 -0500 Subject: [PATCH 10/10] FB2 Output: Add image mimetype back and set to jpg because that is what is written. --- src/calibre/ebooks/fb2/fb2ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 0748970c60..46861357e6 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -165,7 +165,7 @@ class FB2MLizer(object): col = 1 col += 1 data += char - images.append('<binary id="%s">%s\n</binary>' % (self.image_hrefs[item.href], data)) + images.append('<binary id="%s" content-type="image/jpeg">%s\n</binary>' % (self.image_hrefs[item.href], data)) except Exception as e: self.log.error('Error: Could not include file %s because ' \ '%s.' % (item.href, e))