Merge from trunk

This commit is contained in:
Charles Haley 2010-12-05 10:55:25 +00:00
commit fbdef6b460
6 changed files with 348 additions and 181 deletions

View File

@ -142,6 +142,9 @@ class EPUBOutput(OutputFormatPlugin):
def convert(self, oeb, output_path, input_plugin, opts, log): def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb self.log, self.opts, self.oeb = log, opts, oeb
#from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
#UniqueFilenames()(oeb, opts)
self.workaround_ade_quirks() self.workaround_ade_quirks()
self.workaround_webkit_quirks() self.workaround_webkit_quirks()
self.upshift_markup() self.upshift_markup()

View File

@ -8,15 +8,11 @@ __docformat__ = 'restructuredtext en'
Transform OEB content into FB2 markup Transform OEB content into FB2 markup
''' '''
import cStringIO
from base64 import b64encode from base64 import b64encode
from datetime import datetime
from mimetypes import types_map
import re import re
import uuid
try:
from PIL import Image
Image
except ImportError:
import Image
from lxml import etree from lxml import etree
@ -25,32 +21,7 @@ from calibre.constants import __appname__, __version__
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
from calibre.utils.magick import Image
TAG_MAP = {
'b' : 'strong',
'i' : 'emphasis',
'p' : 'p',
'li' : 'p',
'div': 'p',
}
TAG_SPACE = []
TAG_IMAGES = [
'img',
]
TAG_LINKS = [
]
BLOCK = [
'p',
]
STYLES = [
('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}),
('font-style', {'italic' : 'emphasis'}),
]
class FB2MLizer(object): class FB2MLizer(object):
''' '''
@ -63,24 +34,32 @@ class FB2MLizer(object):
def __init__(self, log): def __init__(self, log):
self.log = log self.log = log
self.image_hrefs = {} self.image_hrefs = {}
self.reset_state()
def reset_state(self):
# Used to ensure text and tags are always within <p> and </p> # Used to ensure text and tags are always within <p> and </p>
self.in_p = False self.in_p = False
# Mapping of image names. OEB allows for images to have the same name but be stored
# in different directories. FB2 images are all in a flat layout so we rename all images
# into a sequential numbering system to ensure there are no collisions between image names.
self.image_hrefs = {}
def extract_content(self, oeb_book, opts): def extract_content(self, oeb_book, opts):
self.log.info('Converting XHTML to FB2 markup...') self.log.info('Converting XHTML to FB2 markup...')
self.oeb_book = oeb_book self.oeb_book = oeb_book
self.opts = opts self.opts = opts
return self.fb2mlize_spine() return self.fb2mlize_spine()
def fb2mlize_spine(self): def fb2mlize_spine(self):
self.image_hrefs = {} self.reset_state()
self.link_hrefs = {}
output = [self.fb2_header()] output = [self.fb2_header()]
output.append(self.get_text()) output.append(self.get_text())
output.append(self.fb2_body_footer())
output.append(self.fb2mlize_images()) output.append(self.fb2mlize_images())
output.append(self.fb2_footer()) output.append(self.fb2_footer())
output = self.clean_text(u''.join(output)) output = self.clean_text(u''.join(output))
if self.opts.pretty_print: if self.opts.pretty_print:
return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True) return u'<?xml version="1.0" encoding="UTF-8"?>\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
else: else:
@ -97,65 +76,85 @@ class FB2MLizer(object):
return text return text
def fb2_header(self):
    '''
    Build the opening part of the FB2 document.

    Returns the <FictionBook> root start tag followed by the complete
    <description> block (title-info and document-info). The <body> element
    is not opened here.

    The first creator's name is split on single spaces: one part is used as
    the last name, two parts as first and last name, and with three or more
    parts everything between the first and the last part becomes the middle
    name. Every metadata value is passed through prepare_string_for_xml
    before being interpolated into the markup.
    '''
    # NOTE(review): assumes metadata.title and metadata.creator each hold at
    # least one entry -- confirm against callers.
    metadata = {}
    metadata['author_first'] = u''
    metadata['author_middle'] = u''
    metadata['author_last'] = u''
    metadata['title'] = self.oeb_book.metadata.title[0].value
    metadata['appname'] = __appname__
    metadata['version'] = __version__
    # Take a single timestamp so day, month and year always describe the
    # same instant, even if the clock rolls over while this runs.
    now = datetime.now()
    metadata['date'] = '%i.%i.%i' % (now.day, now.month, now.year)
    metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en'
    metadata['id'] = '%s' % uuid.uuid4()

    author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
    if len(author_parts) == 1:
        metadata['author_last'] = author_parts[0]
    elif len(author_parts) == 2:
        metadata['author_first'] = author_parts[0]
        metadata['author_last'] = author_parts[1]
    else:
        metadata['author_first'] = author_parts[0]
        # Everything strictly between the first and last part is the middle
        # name. The slice must end at -1: ending at -2 silently dropped the
        # part just before the last name and produced an empty middle name
        # for three-part names.
        metadata['author_middle'] = ' '.join(author_parts[1:-1])
        metadata['author_last'] = author_parts[-1]

    # Escape every value; only values are reassigned, the key set is unchanged.
    for key, value in metadata.items():
        metadata[key] = prepare_string_for_xml(value)

    return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \
        '<description>' \
            '<title-info>' \
                '<genre>antique</genre>' \
                '<author>' \
                    '<first-name>%(author_first)s</first-name>' \
                    '<middle-name>%(author_middle)s</middle-name>' \
                    '<last-name>%(author_last)s</last-name>' \
                '</author>' \
                '<book-title>%(title)s</book-title>' \
                '<lang>%(lang)s</lang>' \
            '</title-info>' \
            '<document-info>' \
                '<author>' \
                    '<first-name></first-name>' \
                    '<middle-name></middle-name>' \
                    '<last-name></last-name>' \
                '</author>' \
                '<program-used>%(appname)s %(version)s</program-used>' \
                '<date>%(date)s</date>' \
                '<id>%(id)s</id>' \
                '<version>1.0</version>' \
            '</document-info>' \
        '</description>' % metadata
def fb2_footer(self):
return u'</FictionBook>'
def get_text(self): def get_text(self):
text = [] text = ['<body>']
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
self.log.debug('Converting %s to FictionBook2 XML' % item.href) self.log.debug('Converting %s to FictionBook2 XML' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
text.append('<section>') text.append('<section>')
text += self.dump_text(item.data.find(XHTML('body')), stylizer, item) text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
text.append('</section>') text.append('</section>')
return ''.join(text) return ''.join(text) + '</body>'
def fb2_body_footer(self):
return u'</body>'
def fb2_footer(self):
return u'</FictionBook>'
def fb2mlize_images(self): def fb2mlize_images(self):
'''
This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function.
'''
images = [] images = []
for item in self.oeb_book.manifest: for item in self.oeb_book.manifest:
# Don't write the image if it's not referenced in the document's text.
if item.href not in self.image_hrefs:
continue
if item.media_type in OEB_RASTER_IMAGES: if item.media_type in OEB_RASTER_IMAGES:
try: try:
im = Image.open(cStringIO.StringIO(item.data)).convert('RGB') if not item.media_type == types_map['.jpeg'] or not item.media_type == types_map['.jpg']:
data = cStringIO.StringIO() im = Image()
im.save(data, 'JPEG') im.load(item.data)
data = data.getvalue() im.set_compression_quality(70)
data = im.export('jpg')
raw_data = b64encode(data) raw_data = b64encode(data)
# Don't put the encoded image on a single line. # Don't put the encoded image on a single line.
data = '' data = ''
@ -166,7 +165,7 @@ class FB2MLizer(object):
col = 1 col = 1
col += 1 col += 1
data += char data += char
images.append('<binary id="%s" content-type="%s">%s\n</binary>' % (self.image_hrefs.get(item.href, '_0000.JPEG'), item.media_type, data)) images.append('<binary id="%s" content-type="image/jpeg">%s\n</binary>' % (self.image_hrefs[item.href], data))
except Exception as e: except Exception as e:
self.log.error('Error: Could not include file %s because ' \ self.log.error('Error: Could not include file %s because ' \
'%s.' % (item.href, e)) '%s.' % (item.href, e))
@ -179,24 +178,6 @@ class FB2MLizer(object):
self.in_p = True self.in_p = True
return ['<p>'], ['p'] return ['<p>'], ['p']
def insert_empty_line(self, tags):
if self.in_p:
text = ['']
closed_tags = []
tags.reverse()
for t in tags:
text.append('</%s>' % t)
closed_tags.append(t)
if t == 'p':
break
text.append('<empty-line />')
closed_tags.reverse()
for t in closed_tags:
text.append('<%s>' % t)
return text
else:
return ['<empty-line />']
def close_open_p(self, tags): def close_open_p(self, tags):
text = [''] text = ['']
added_p = False added_p = False
@ -220,83 +201,125 @@ class FB2MLizer(object):
return text, added_p return text, added_p
def dump_text(self, elem, stylizer, page, tag_stack=[]): def handle_simple_tag(self, tag, tags):
if not isinstance(elem.tag, basestring) \ s_out = []
or namespace(elem.tag) != XHTML_NS: s_tags = []
if tag not in tags:
p_out, p_tags = self.ensure_p()
s_out += p_out
s_tags += p_tags
s_out.append('<%s>' % tag)
s_tags.append(tag)
return s_out, s_tags
def dump_text(self, elem_tree, stylizer, page, tag_stack=[]):
'''
This function is intended to be used in a recursive manner. dump_text will
run through all elements in the elem_tree and call itself on each element.
self.image_hrefs will be populated by calling this function.
@param elem_tree: etree representation of XHTML content to be transformed.
@param stylizer: Used to track the style of elements within the tree.
@param page: OEB page used to determine absolute urls.
@param tag_stack: List of open FB2 tags to take into account.
@return: List of string representing the XHTML converted to FB2 markup.
'''
# Ensure what we are converting is not a string and that the first tag is part of the XHTML namespace.
if not isinstance(elem_tree.tag, basestring) or namespace(elem_tree.tag) != XHTML_NS:
return [] return []
style = stylizer.style(elem) style = stylizer.style(elem_tree)
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden':
or style['visibility'] == 'hidden':
return [] return []
fb2_text = [] # FB2 generated output.
fb2_out = []
# FB2 tags in the order they are opened. This will be used to close the tags.
tags = [] tags = []
# First tag in tree
tag = barename(elem_tree.tag)
tag = barename(elem.tag) # Process the XHTML tag if it needs to be converted to an FB2 tag.
if tag in TAG_IMAGES:
if elem.attrib.get('src', None):
if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys():
self.image_hrefs[page.abshref(elem.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys())
p_txt, p_tag = self.ensure_p()
fb2_text += p_txt
tags += p_tag
fb2_text.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem.attrib['src'])])
if tag == 'h1' and self.opts.h1_to_title or tag == 'h2' and self.opts.h2_to_title or tag == 'h3' and self.opts.h3_to_title: if tag == 'h1' and self.opts.h1_to_title or tag == 'h2' and self.opts.h2_to_title or tag == 'h3' and self.opts.h3_to_title:
fb2_text.append('<title>') fb2_out.append('<title>')
tags.append('title') tags.append('title')
if tag == 'br': if tag == 'img':
fb2_text += self.insert_empty_line(tag_stack+tags) if elem_tree.attrib.get('src', None):
# Only write the image tag if it is in the manifest.
fb2_tag = TAG_MAP.get(tag, None) if page.abshref(elem_tree.attrib['src']) in self.oeb_book.manifest.hrefs.keys():
if fb2_tag == 'p': if page.abshref(elem_tree.attrib['src']) not in self.image_hrefs.keys():
self.image_hrefs[page.abshref(elem_tree.attrib['src'])] = '_%s.jpg' % len(self.image_hrefs.keys())
p_txt, p_tag = self.ensure_p()
fb2_out += p_txt
tags += p_tag
fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[page.abshref(elem_tree.attrib['src'])])
elif tag == 'br':
if self.in_p:
closed_tags = []
open_tags = tag_stack+tags
open_tags.reverse()
for t in open_tags:
fb2_out.append('</%s>' % t)
closed_tags.append(t)
if t == 'p':
break
fb2_out.append('<empty-line />')
closed_tags.reverse()
for t in closed_tags:
fb2_out.append('<%s>' % t)
else:
fb2_out.append('<empty-line />')
elif tag in ('div', 'li', 'p'):
p_text, added_p = self.close_open_p(tag_stack+tags) p_text, added_p = self.close_open_p(tag_stack+tags)
fb2_text += p_text fb2_out += p_text
if added_p: if added_p:
tags.append('p') tags.append('p')
elif fb2_tag and fb2_tag not in tag_stack+tags: elif tag == 'b':
p_text, p_tags = self.ensure_p() s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
fb2_text += p_text fb2_out += s_out
tags += p_tags tags += s_tags
fb2_text.append('<%s>' % fb2_tag) elif tag == 'i':
tags.append(fb2_tag) s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
fb2_out += s_out
tags += s_tags
# Processes style information # Processes style information.
for s in STYLES: if style['font-style'] == 'italic':
style_tag = s[1].get(style[s[0]], None) s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
if style_tag and style_tag not in tag_stack+tags: fb2_out += s_out
p_text, p_tags = self.ensure_p() tags += s_tags
fb2_text += p_text elif style['font-weight'] in ('bold', 'bolder'):
tags += p_tags s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
fb2_text.append('<%s>' % style_tag) fb2_out += s_out
tags.append(style_tag) tags += s_tags
if tag in TAG_SPACE: # Process element text.
fb2_text.append(' ') if hasattr(elem_tree, 'text') and elem_tree.text:
if hasattr(elem, 'text') and elem.text:
if not self.in_p: if not self.in_p:
fb2_text.append('<p>') fb2_out.append('<p>')
fb2_text.append(prepare_string_for_xml(elem.text)) fb2_out.append(prepare_string_for_xml(elem_tree.text))
if not self.in_p: if not self.in_p:
fb2_text.append('</p>') fb2_out.append('</p>')
for item in elem: # Process sub-elements.
fb2_text += self.dump_text(item, stylizer, page, tag_stack+tags) for item in elem_tree:
fb2_out += self.dump_text(item, stylizer, page, tag_stack+tags)
# Close open FB2 tags.
tags.reverse() tags.reverse()
fb2_text += self.close_tags(tags) fb2_out += self.close_tags(tags)
if hasattr(elem, 'tail') and elem.tail: # Process element text that comes after the close of the XHTML tag but before the next XHTML tag.
if hasattr(elem_tree, 'tail') and elem_tree.tail:
if not self.in_p: if not self.in_p:
fb2_text.append('<p>') fb2_out.append('<p>')
fb2_text.append(prepare_string_for_xml(elem.tail)) fb2_out.append(prepare_string_for_xml(elem_tree.tail))
if not self.in_p: if not self.in_p:
fb2_text.append('</p>') fb2_out.append('</p>')
return fb2_text return fb2_out
def close_tags(self, tags): def close_tags(self, tags):
text = [] text = []

View File

@ -29,6 +29,14 @@ class FB2Output(OutputFormatPlugin):
def convert(self, oeb_book, output_path, input_plugin, opts, log): def convert(self, oeb_book, output_path, input_plugin, opts, log):
from calibre.ebooks.oeb.transforms.jacket import linearize_jacket from calibre.ebooks.oeb.transforms.jacket import linearize_jacket
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
try:
rasterizer = SVGRasterizer()
rasterizer(oeb_book, opts)
except Unavailable:
self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
linearize_jacket(oeb_book) linearize_jacket(oeb_book)
fb2mlizer = FB2MLizer(log) fb2mlizer = FB2MLizer(log)

View File

@ -775,6 +775,7 @@ class Manifest(object):
return u'Item(id=%r, href=%r, media_type=%r)' \ return u'Item(id=%r, href=%r, media_type=%r)' \
% (self.id, self.href, self.media_type) % (self.id, self.href, self.media_type)
# Parsing {{{
def _parse_xml(self, data): def _parse_xml(self, data):
data = xml_to_unicode(data, strip_encoding_pats=True, data = xml_to_unicode(data, strip_encoding_pats=True,
assume_utf8=True, resolve_entities=True)[0] assume_utf8=True, resolve_entities=True)[0]
@ -1035,6 +1036,8 @@ class Manifest(object):
data = item.data.cssText data = item.data.cssText
return ('utf-8', data) return ('utf-8', data)
# }}}
@dynamic_property @dynamic_property
def data(self): def data(self):
doc = """Provides MIME type sensitive access to the manifest doc = """Provides MIME type sensitive access to the manifest

View File

@ -0,0 +1,130 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import posixpath
from urlparse import urldefrag
from lxml import etree
import cssutils
from calibre.ebooks.oeb.base import rewrite_links, urlnormalize
class RenameFiles(object):
    '''
    Apply a mapping of old hrefs to new hrefs to every link in the book.

    Links inside parsed markup and stylesheets, guide references and TOC
    entries are rewritten. The spine and the manifest themselves are left
    alone by this transform.
    '''

    def __init__(self, rename_map):
        # Maps an original href to the href that replaces it.
        self.rename_map = rename_map

    def __call__(self, oeb, opts):
        '''Rewrite the links of all manifest items, the guide and the TOC.'''
        self.log = oeb.logger
        self.opts = opts
        self.oeb = oeb

        for item in oeb.manifest.items:
            # url_replacer resolves relative links against this item.
            self.current_item = item
            if etree.iselement(item.data):
                rewrite_links(self.current_item.data, self.url_replacer)
            elif hasattr(item.data, 'cssText'):
                cssutils.replaceUrls(item.data, self.url_replacer)

        if self.oeb.guide:
            for ref in self.oeb.guide.values():
                renamed = self._renamed_href(ref.href)
                if renamed is not None:
                    ref.href = renamed

        if self.oeb.toc:
            self.fix_toc_entry(self.oeb.toc)

    def _renamed_href(self, href):
        '''
        Return the replacement for href with its fragment re-attached, or
        None when the fragment-less, normalized href is not in rename_map.
        '''
        path, frag = urldefrag(urlnormalize(href))
        target = self.rename_map.get(path, None)
        if target is None:
            return None
        if frag:
            return '#'.join((target, frag))
        return target

    def fix_toc_entry(self, toc):
        '''Update the href of this TOC node, then recurse into its children.'''
        if toc.href:
            renamed = self._renamed_href(toc.href)
            if renamed is not None:
                toc.href = renamed

        for child in toc:
            self.fix_toc_entry(child)

    def url_replacer(self, orig_url):
        '''
        Link-rewriting callback: return the new URL for orig_url, expressed
        relative to the current item, or orig_url unchanged when the file it
        points to is not in rename_map.
        '''
        path, frag = urldefrag(urlnormalize(orig_url))
        target = self.rename_map.get(self.current_item.abshref(path), None)
        if target is None:
            return orig_url
        relative = self.current_item.relhref(target)
        if not frag:
            return relative
        return relative + ('#' + frag)
class UniqueFilenames(object):
    'Ensure that every item in the manifest has a unique filename'

    def __call__(self, oeb, opts):
        '''
        Give every manifest item whose basename was already seen a new href
        with a unique suffix, then rewrite all links via RenameFiles.
        '''
        self.log = oeb.logger
        self.opts = opts
        self.oeb = oeb

        self.seen_filenames = set([])
        self.rename_map = {}

        # Iterate over a copy: the manifest is modified inside the loop.
        for item in list(oeb.manifest.items):
            fname = posixpath.basename(item.href)
            if fname not in self.seen_filenames:
                self.seen_filenames.add(fname)
                continue

            suffix = self.unique_suffix(fname)
            data = item.data
            stem, ext = posixpath.splitext(item.href)
            new_href = oeb.manifest.generate(href=stem + suffix + ext)[1]
            # NOTE(review): the replacement is registered under the same id
            # before the old item is removed -- confirm that Manifest.add and
            # Manifest.remove cope with that transient duplicate id.
            new_item = oeb.manifest.add(item.id, new_href, item.media_type,
                    data=data, fallback=item.fallback)
            self.seen_filenames.add(posixpath.basename(new_href))
            self.rename_map[item.href] = new_href
            if item.spine_position is not None:
                # Keep the renamed item at the same place in the reading order.
                oeb.spine.insert(item.spine_position, new_item, item.linear)
                oeb.spine.remove(item)
            oeb.manifest.remove(item)

        if self.rename_map:
            self.log('Found non-unique filenames, renaming to support broken'
                    ' EPUB readers like FBReader, Aldiko and Stanza...')
            from pprint import pformat
            self.log.debug(pformat(self.rename_map))

            renamer = RenameFiles(self.rename_map)
            renamer(oeb, opts)

    def unique_suffix(self, fname):
        '''
        Return the first suffix '_uN' (N = 1, 2, ...) for which
        base + suffix + ext is not yet in self.seen_filenames.
        '''
        stem, ext = posixpath.splitext(fname)
        counter = 1
        while (stem + '_u%d' % counter + ext) in self.seen_filenames:
            counter += 1
        return '_u%d' % counter

View File

@ -114,27 +114,27 @@ Montag
Dienstag Dienstag
Januar Januar
Februar Februar
März März
Fuße Fuße
Fluße Fluße
Flusse Flusse
flusse flusse
fluße fluße
flüße flüße
flüsse flüsse
''' '''
german_good = ''' german_good = '''
Dienstag Dienstag
Februar Februar
flusse flusse
Flusse Flusse
fluße fluße
Fluße Fluße
flüsse flüsse
flüße flüße
Fuße Fuße
Januar Januar
März März
Montag Montag
Sonntag''' Sonntag'''
french = ''' french = '''
@ -142,49 +142,49 @@ dimanche
lundi lundi
mardi mardi
janvier janvier
février février
mars mars
dé  déjà
Meme Meme
deja deja
même même
dejà dejà
bpef bpef
bÅg bœg
Boef Boef
Mé© Mémé
bÅf bœf
boef boef
bnef bnef
pêche pêche
pèché pèché
pêché pêché
pêche pêche
pêché''' pêché'''
french_good = ''' french_good = '''
bnef bnef
boef boef
Boef Boef
bÅf bœf
bÅg bœg
bpef bpef
deja deja
dejà dejà
dé  déjà
dimanche dimanche
février février
janvier janvier
lundi lundi
mardi mardi
mars mars
Meme Meme
Mé© Mémé
même même
pèché pèché
pêche pêche
pêche pêche
pêché pêché
pêché''' pêché'''
# }}} # }}}
def create(l): def create(l):