class HTMLZMetadataReader(MetadataReaderPlugin):
    '''
    Metadata reader plugin for HTMLZ files. The actual work is done by
    get_metadata in calibre.ebooks.metadata.extz.
    '''

    name        = 'Read HTMLZ metadata'
    file_types  = set(['htmlz'])
    description = _('Read metadata from %s files') % 'HTMLZ'
    author      = 'John Schember'

    def get_metadata(self, stream, ftype):
        '''
        Return the metadata read from stream.

        @stream: File like object holding the HTMLZ archive.
        @ftype: File type of the stream. Not used here.
        '''
        # Function-scope import: the extz module is only loaded when
        # metadata is actually requested.
        from calibre.ebooks.metadata.extz import get_metadata
        return get_metadata(stream)


class HTMLZMetadataWriter(MetadataWriterPlugin):
    '''
    Metadata writer plugin for HTMLZ files. The actual work is done by
    set_metadata in calibre.ebooks.metadata.extz.
    '''

    name        = 'Set HTMLZ metadata'
    file_types  = set(['htmlz'])
    # A writer stores metadata *in* the file; the previous wording was
    # copied from the reader plugin.
    description = _('Set metadata in %s files') % 'HTMLZ'
    author      = 'John Schember'

    def set_metadata(self, stream, mi, type):
        '''
        Write the metadata mi into stream.

        @stream: File like object holding the HTMLZ archive.
        @mi: The metadata to store.
        @type: File type of the stream. Not used here.
        '''
        # Function-scope import: the extz module is only loaded when
        # metadata is actually written.
        from calibre.ebooks.metadata.extz import set_metadata
        set_metadata(stream, mi)
class HTMLZInput(InputFormatPlugin):
    '''
    Input plugin for HTMLZ files: a zip archive containing an HTML file.
    The archive is extracted and the HTML is run through the HTML input
    plugin to produce the OEB book.
    '''

    name        = 'HTMLZ Input'
    author      = 'John Schember'
    description = 'Convert HTMLZ files to HTML'
    file_types  = set(['htmlz'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Convert the HTMLZ archive in stream into an OEB book.

        @stream: File like object holding the HTMLZ archive.
        @options: Conversion options. The options declared by the HTML
        input plugin are reset to their recommended values on this object.
        @file_ext: Extension of the input file, used to look up the
        metadata reader.
        @log: Logger passed on to the HTML input plugin.
        @accelerators: Not used here.

        Returns the OEB book produced by the HTML input plugin.
        '''
        self.log = log
        html = u''

        # Extract content from zip archive into the current directory.
        zf = ZipFile(stream)
        zf.extractall('.')

        # Use the first HTML file found in the extracted tree.
        for x in walk('.'):
            if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
                with open(x, 'rb') as tf:
                    html = tf.read()
                break

        # The file was read in binary mode. Calling .encode() on a byte
        # string makes Python 2 decode it as ASCII first, which fails on any
        # non ASCII character, so bytes are written back untouched.
        raw = html if isinstance(html, bytes) else html.encode('utf-8')

        # Run the HTML through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'

        # Pick a file name in the current directory that is not taken yet.
        # Every candidate is built from the same base directory.
        base = os.getcwdu()
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = os.path.join(base, 'index%d.html' % c)
        with open(fname, 'wb') as htmlfile:
            htmlfile.write(raw)

        # The nested conversion must not write debug output of its own.
        odi = options.debug_pipeline
        options.debug_pipeline = None
        try:
            # Generate oeb from html conversion. The stream is closed before
            # the file is removed.
            with open(fname, 'rb') as htmlstream:
                oeb = html_input.convert(htmlstream, options, 'html', log,
                        {})
        finally:
            # Restore the caller's setting and clean up even on failure.
            options.debug_pipeline = odi
            os.remove(fname)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb
class OEB2HTML(object):
    '''
    Base class. All subclasses should implement dump_text to actually transform
    content. Also, callers should use oeb2html to get the transformed html.
    links and images can be retrieved after calling oeb2html to get the mapping
    of OEB links and images to the new names used in the html returned by oeb2html.
    Images will always be referenced as if they are in an images directory.

    Use get_css to get the CSS classes for the OEB document as a string.
    '''

    def __init__(self, log=None):
        self.log = default_log if log is None else log
        # Maps 'href#id' of the OEB book to the id used in the output.
        self.links = {}
        # Maps the OEB href of an image to its file name in the output.
        self.images = {}

    def oeb2html(self, oeb_book, opts):
        '''
        Convert oeb_book to a single HTML string. The links and images
        mappings are reset on every call.
        '''
        self.log.info('Converting OEB book to HTML...')
        self.opts = opts
        self.links = {}
        self.images = {}

        return self.mlize_spine(oeb_book)

    def mlize_spine(self, oeb_book):
        '''
        Convert every spine item with dump_text and join the results.
        '''
        # The spine items are wrapped in one html element so the result has a
        # single root; the HTMLZ output plugin parses it with
        # etree.fromstring.
        output = [u'<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /></head><body>']
        for item in oeb_book.spine:
            self.log.debug('Converting %s to HTML...' % item.href)
            stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
            output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
            output.append('\n\n')
        output.append('</body></html>')
        return ''.join(output)

    def dump_text(self, elem, stylizer, page):
        raise NotImplementedError

    def get_link_id(self, href, aid):
        '''
        Return the output id for the anchor aid in the file href, creating
        a new one the first time the pair is seen.
        '''
        aid = '%s#%s' % (href, aid)
        if aid not in self.links:
            self.links[aid] = 'calibre_link-%s' % len(self.links)
        return self.links[aid]

    def rewrite_links(self, tag, attribs, page):
        '''
        Rewrite the id and, for anchors, the href in attribs so they use the
        ids of the single output file. attribs is modified in place and
        returned.
        '''
        # Rewrite ids.
        if 'id' in attribs:
            attribs['id'] = self.get_link_id(page.href, attribs['id'])
        # Rewrite links.
        if tag == 'a':
            # Anchors used only as targets have no href.
            href = attribs.get('href', None)
            if href:
                href = page.abshref(href)
                if self.url_is_relative(href):
                    # A link to a whole file uses the same key that
                    # get_link_id(href, '') produces.
                    if '#' not in href:
                        href += '#'
                    if href not in self.links:
                        self.links[href] = 'calibre_link-%s' % len(self.links)
                    href = '#%s' % self.links[href]
                attribs['href'] = href
        return attribs

    def rewrite_images(self, tag, attribs, page):
        '''
        Point the src of an img at images/<number><ext>, recording the
        mapping in self.images. attribs is modified in place and returned.
        '''
        if tag == 'img':
            src = attribs.get('src', None)
            if src:
                src = page.abshref(src)
                if src not in self.images:
                    ext = os.path.splitext(src)[1]
                    fname = '%s%s' % (len(self.images), ext)
                    fname = fname.zfill(10)
                    self.images[src] = fname
                attribs['src'] = 'images/%s' % self.images[src]
        return attribs

    def url_is_relative(self, url):
        '''
        Return True if url has no scheme.
        '''
        return not urlparse(url).scheme

    def get_css(self, oeb_book):
        '''
        Return the text of the first CSS item in the manifest, or an empty
        string if there is none.
        '''
        css = u''
        for item in oeb_book.manifest:
            if item.media_type == 'text/css':
                css = item.data.cssText
                break
        return css
class OEB2HTMLNoCSSizer(OEB2HTML):
    '''
    This will remap a small number of CSS styles to equivalent HTML tags.
    '''

    def dump_text(self, elem, stylizer, page):
        '''
        Return elem and everything below it as a list of HTML fragments.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: The spine item elem belongs to. Used to rewrite links and
        images.
        '''

        # We can only process tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, basestring) \
                or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [prepare_string_for_xml(elem.tail)]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib
        if tag == 'body':
            tag = 'div'
            attribs['id'] = self.get_link_id(page.href, '')
        tags.append(tag)

        # Ignore anything that is set to not be displayed. The tail is text
        # of the parent, not of the hidden element, so it is kept.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
                or style['visibility'] == 'hidden':
            if elem.tail:
                return [prepare_string_for_xml(elem.tail)]
            return ['']

        # Remove attributes we won't want.
        if 'class' in attribs:
            del attribs['class']
        if 'style' in attribs:
            del attribs['style']

        attribs = self.rewrite_links(tag, attribs, page)
        attribs = self.rewrite_images(tag, attribs, page)

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''.join(' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
                for k, v in attribs.items())

        # Write the tag.
        text.append('<%s%s>' % (tag, at))

        # Turn styles into tags. Every tag opened here is also recorded in
        # tags so it gets closed below.
        if style['font-weight'] in ('bold', 'bolder'):
            text.append('<b>')
            tags.append('b')
        if style['font-style'] == 'italic':
            text.append('<i>')
            tags.append('i')
        if style['text-decoration'] == 'underline':
            text.append('<u>')
            tags.append('u')
        if style['text-decoration'] == 'line-through':
            text.append('<s>')
            tags.append('s')

        # Process tags that contain text. The text is escaped because the
        # fragments are joined into markup.
        if hasattr(elem, 'text') and elem.text:
            text.append(prepare_string_for_xml(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags, innermost first.
        for t in reversed(tags):
            text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(prepare_string_for_xml(elem.tail))

        return text
class OEB2HTMLInlineCSSizer(OEB2HTML):
    '''
    Turns external CSS classes into inline style attributes.
    '''

    def dump_text(self, elem, stylizer, page):
        '''
        Return elem and everything below it as a list of HTML fragments.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: The spine item elem belongs to. Used to rewrite links and
        images.
        '''

        # We can only process tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, basestring) \
                or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [prepare_string_for_xml(elem.tail)]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        style_a = '%s' % style
        if tag == 'body':
            tag = 'div'
            attribs['id'] = self.get_link_id(page.href, '')
            if not style['page-break-before'] == 'always':
                # The parentheses matter: a conditional expression binds
                # looser than +, so without them either the existing style
                # or the page break is thrown away.
                style_a = 'page-break-before: always;' + (' ' if style_a else '') + style_a
        tags.append(tag)

        # Remove attributes we won't want.
        if 'class' in attribs:
            del attribs['class']
        if 'style' in attribs:
            del attribs['style']

        attribs = self.rewrite_links(tag, attribs, page)
        attribs = self.rewrite_images(tag, attribs, page)

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''.join(' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
                for k, v in attribs.items())

        # Turn style into strings for putting in the tag. The value is
        # escaped like every other attribute value.
        style_t = ''
        if style_a:
            style_t = ' style="%s"' % prepare_string_for_xml(style_a, attribute=True)

        # Write the tag.
        text.append('<%s%s%s>' % (tag, at, style_t))

        # Process tags that contain text. The text is escaped because the
        # fragments are joined into markup.
        if hasattr(elem, 'text') and elem.text:
            text.append(prepare_string_for_xml(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags, innermost first.
        for t in reversed(tags):
            text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(prepare_string_for_xml(elem.tail))

        return text
class OEB2HTMLClassCSSizer(OEB2HTML):
    '''
    Use CSS classes. css_style option can specify whether to use
    inline classes (style tag in the head) or reference an external
    CSS file called style.css.
    '''

    def mlize_spine(self, oeb_book):
        '''
        Convert every spine item with dump_text and wrap the result in an
        HTML document whose head carries the CSS.
        '''
        output = []
        for item in oeb_book.spine:
            self.log.debug('Converting %s to HTML...' % item.href)
            stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
            output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
            output.append('\n\n')
        if self.opts.class_style == 'external':
            # The stylesheet itself is written next to the HTML as style.css.
            css = u'<link href="style.css" rel="stylesheet" type="text/css" />'
        else:
            sheet = self.get_css(oeb_book)
            # NOTE(review): cssText may be a byte string; it is decoded as
            # UTF-8 here, confirm this matches the serializer's encoding.
            if isinstance(sheet, bytes):
                sheet = sheet.decode('utf-8')
            css = u'<style type="text/css">' + sheet + u'</style>'
        output = [u'<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'] + \
                [css] + [u'</head><body>'] + output + [u'</body></html>']
        return ''.join(output)

    def dump_text(self, elem, stylizer, page):
        '''
        Return elem and everything below it as a list of HTML fragments.
        The class attribute is kept so the elements keep referring to the
        CSS classes.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element. Not used
        by this implementation.
        @page: The spine item elem belongs to. Used to rewrite links and
        images.
        '''

        # We can only process tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, basestring) \
                or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [prepare_string_for_xml(elem.tail)]
            return ['']

        # Setup our variables.
        text = ['']
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        if tag == 'body':
            tag = 'div'
            attribs['id'] = self.get_link_id(page.href, '')
        tags.append(tag)

        # Remove attributes we won't want.
        if 'style' in attribs:
            del attribs['style']

        attribs = self.rewrite_links(tag, attribs, page)
        attribs = self.rewrite_images(tag, attribs, page)

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''.join(' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
                for k, v in attribs.items())

        # Write the tag.
        text.append('<%s%s>' % (tag, at))

        # Process tags that contain text. The text is escaped because the
        # fragments are joined into markup.
        if hasattr(elem, 'text') and elem.text:
            text.append(prepare_string_for_xml(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags, innermost first.
        for t in reversed(tags):
            text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(prepare_string_for_xml(elem.tail))

        return text
def oeb2html_no_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLNoCSSizer.

    Returns a tuple of the html and the image mapping of the converter.
    '''
    converter = OEB2HTMLNoCSSizer(log)
    return (converter.oeb2html(oeb_book, opts), converter.images)


def oeb2html_inline_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLInlineCSSizer.

    Returns a tuple of the html and the image mapping of the converter.
    '''
    converter = OEB2HTMLInlineCSSizer(log)
    return (converter.oeb2html(oeb_book, opts), converter.images)


def oeb2html_class_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLClassCSSizer. class_style is set to
    'inline' on opts before converting.

    Returns a tuple of the html and the image mapping of the converter.
    '''
    converter = OEB2HTMLClassCSSizer(log)
    opts.class_style = 'inline'
    return (converter.oeb2html(oeb_book, opts), converter.images)
class HTMLZOutput(OutputFormatPlugin):
    '''
    Output plugin for HTMLZ files: a zip archive holding index.html, an
    optional style.css, the images and metadata.opf.
    '''

    name = 'HTMLZ Output'
    author = 'John Schember'
    file_type = 'htmlz'

    options = set([
        OptionRecommendation(name='css_type', recommended_value='class',
            level=OptionRecommendation.LOW,
            choices=['class', 'inline', 'tag'],
            help=_('Specify the handling of CSS. Default is class.\n'
                   'class: Use CSS classes and have elements reference them.\n'
                   'inline: Write the CSS as an inline style attribute.\n'
                   'tag: Turn as many CSS styles into HTML tags.'
            )),
        OptionRecommendation(name='class_style', recommended_value='external',
            level=OptionRecommendation.LOW,
            choices=['external', 'inline'],
            help=_('How to handle the CSS when using css-type = \'class\'.\n'
                   'Default is external.\n'
                   'external: Use an external CSS file that is linked in the document.\n'
                   'inline: Place the CSS in the head section of the document.'
            )),
        ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write oeb_book to output_path as an HTMLZ archive.

        @oeb_book: The book to convert.
        @output_path: Path of the archive to create.
        @input_plugin: Not used here.
        @opts: Conversion options; css_type and class_style are read.
        @log: Logger handed to the HTML converter.
        '''
        with TemporaryDirectory('_htmlz_output') as tdir:
            # HTML
            if opts.css_type == 'inline':
                from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer as OEB2HTMLizer
            elif opts.css_type == 'tag':
                from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer as OEB2HTMLizer
            else:
                from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

            htmlizer = OEB2HTMLizer(log)
            html = htmlizer.oeb2html(oeb_book, opts)

            # Round trip through the parser to get pretty printed markup.
            html = etree.fromstring(html)
            html = tostring(html, pretty_print=True)

            with open(os.path.join(tdir, 'index.html'), 'wb') as tf:
                tf.write(html)

            # CSS
            if opts.css_type == 'class' and opts.class_style == 'external':
                css = htmlizer.get_css(oeb_book)
                # The file is opened in binary mode, so text has to be
                # encoded explicitly; byte strings are written as they are.
                if not isinstance(css, bytes):
                    css = css.encode('utf-8')
                with open(os.path.join(tdir, 'style.css'), 'wb') as tf:
                    tf.write(css)

            # Images
            images = htmlizer.images
            if images:
                if not os.path.exists(os.path.join(tdir, 'images')):
                    os.makedirs(os.path.join(tdir, 'images'))
                for item in oeb_book.manifest:
                    if item.media_type in OEB_IMAGES and item.href in images:
                        fname = os.path.join(tdir, 'images', images[item.href])
                        with open(fname, 'wb') as img:
                            img.write(item.data)

            # Metadata
            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))

            # Close the archive explicitly so it is complete on disk before
            # the temporary directory is removed.
            htmlz = ZipFile(output_path, 'w')
            try:
                htmlz.add_dir(tdir)
            finally:
                htmlz.close()
class PluginWidget(Widget, Ui_Form):
    '''
    Conversion dialog page for the HTMLZ output options css_type and
    class_style.
    '''

    TITLE = _('HTMLZ Output')
    HELP = _('Options specific to')+' HTMLZ '+_('output')
    COMMIT_NAME = 'htmlz_output'
    ICON = I('mimetypes/html.png')

    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
        Widget.__init__(self, parent, ['css_type', 'class_style'])
        self.db = db
        self.book_id = book_id
        # Fill each combo box (opt_<option name>) with the choices declared
        # for its option, css_type first.
        for option_name in ('css_type', 'class_style'):
            for choice in get_option(option_name).option.choices:
                getattr(self, 'opt_' + option_name).addItem(choice)
        self.initialize_options(get_option, get_help, db, book_id)