HTMLZ input and output plugins to produce single-file HTML content.

This commit is contained in:
John Schember 2011-04-02 12:37:19 -04:00
parent 5e2e6a9d30
commit 16d2ce68e8
8 changed files with 639 additions and 3 deletions

View File

@ -231,6 +231,17 @@ class HTMLMetadataReader(MetadataReaderPlugin):
from calibre.ebooks.metadata.html import get_metadata
return get_metadata(stream)
class HTMLZMetadataReader(MetadataReaderPlugin):
    '''Metadata reader plugin registered for the ``htmlz`` file type.'''

    name = 'Read HTMLZ metadata'
    file_types = set(['htmlz'])
    description = _('Read metadata from %s files') % 'HTMLZ'
    author = 'John Schember'

    def get_metadata(self, stream, ftype):
        '''
        Return the metadata the extZ reader extracts from ``stream``.

        ``ftype`` is part of the plugin signature but is not consulted here.
        '''
        # Imported on demand so the reader module is only loaded when used.
        from calibre.ebooks.metadata import extz
        return extz.get_metadata(stream)
class IMPMetadataReader(MetadataReaderPlugin):
name = 'Read IMP metadata'
@ -407,7 +418,7 @@ class TXTZMetadataReader(MetadataReaderPlugin):
author = 'John Schember'
def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.txtz import get_metadata
from calibre.ebooks.metadata.extz import get_metadata
return get_metadata(stream)
class ZipMetadataReader(MetadataReaderPlugin):
@ -433,6 +444,17 @@ class EPUBMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.metadata.epub import set_metadata
set_metadata(stream, mi, apply_null=self.apply_null)
class HTMLZMetadataWriter(MetadataWriterPlugin):
    '''Metadata writer plugin registered for the ``htmlz`` file type.'''

    name = 'Set HTMLZ metadata'
    file_types = set(['htmlz'])
    # A writer stores metadata *in* the file; the previous "from" wording
    # described reading rather than writing.
    description = _('Set metadata in %s files') % 'HTMLZ'
    author = 'John Schember'

    def set_metadata(self, stream, mi, type):
        '''
        Write the metadata ``mi`` into ``stream`` using the extZ writer.

        ``type`` is part of the plugin signature but is not consulted here.
        '''
        # Imported on demand so the writer module is only loaded when used.
        from calibre.ebooks.metadata.extz import set_metadata
        set_metadata(stream, mi)
class LRFMetadataWriter(MetadataWriterPlugin):
name = 'Set LRF metadata'
@ -505,7 +527,7 @@ class TXTZMetadataWriter(MetadataWriterPlugin):
author = 'John Schember'
def set_metadata(self, stream, mi, type):
from calibre.ebooks.metadata.txtz import set_metadata
from calibre.ebooks.metadata.extz import set_metadata
set_metadata(stream, mi)
# }}}
@ -514,6 +536,7 @@ from calibre.ebooks.comic.input import ComicInput
from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.fb2.input import FB2Input
from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.htmlz.input import HTMLZInput
from calibre.ebooks.lit.input import LITInput
from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.odt.input import ODTInput
@ -544,6 +567,7 @@ from calibre.ebooks.tcr.output import TCROutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.txt.output import TXTZOutput
from calibre.ebooks.html.output import HTMLOutput
from calibre.ebooks.htmlz.output import HTMLZOutput
from calibre.ebooks.snb.output import SNBOutput
from calibre.customize.profiles import input_profiles, output_profiles
@ -599,6 +623,7 @@ plugins += [
EPUBInput,
FB2Input,
HTMLInput,
HTMLZInput,
LITInput,
MOBIInput,
ODTInput,
@ -630,6 +655,7 @@ plugins += [
TXTOutput,
TXTZOutput,
HTMLOutput,
HTMLZOutput,
SNBOutput,
]
# Order here matters. The first matched device is the one used.

View File

View File

@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre import walk
from calibre.customize.conversion import InputFormatPlugin
from calibre.utils.zipfile import ZipFile
class HTMLZInput(InputFormatPlugin):
    '''
    Input plugin for HTMLZ files.

    The archive is extracted into the current directory, the first HTML
    file found is handed to the regular HTML input plugin, and the metadata
    read from the archive is applied to the resulting OEB book.
    '''

    name = 'HTMLZ Input'
    author = 'John Schember'
    description = 'Convert HTMLZ files to HTML'
    file_types = set(['htmlz'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Convert the HTMLZ archive in ``stream`` to an OEB book.

        @stream: File object for the HTMLZ archive.
        @options: Conversion options. The HTML input plugin's recommended
                  values are set on this object as a side effect.
        @file_ext: Extension used to look up the metadata reader.
        @log: Logger, stored on the plugin and passed to the HTML plugin.
        @accelerators: Not used by this plugin.

        Returns the OEB book produced by the HTML input plugin.
        '''
        self.log = log
        html = u''

        # Extract content from zip archive.
        zf = ZipFile(stream)
        zf.extractall('.')

        # Use the first HTML file found; any others are ignored.
        for x in walk('.'):
            if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
                with open(x, 'rb') as tf:
                    html = tf.read()
                break

        # Run the HTML through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'

        # Pick a file name that does not clash with the extracted content.
        base = os.getcwdu()
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = os.path.join(base, 'index%d.html' % c)

        # The file is read in binary mode, so html is normally already
        # bytes. Calling .encode() on bytes forces an implicit ASCII decode
        # and fails on any non-ASCII content; only encode real text.
        # NOTE(review): the bytes are assumed to be UTF-8 because
        # input_encoding is forced to utf-8 above -- confirm for archives
        # produced by other tools.
        if not isinstance(html, bytes):
            html = html.encode('utf-8')
        with open(fname, 'wb') as htmlfile:
            htmlfile.write(html)

        odi = options.debug_pipeline
        options.debug_pipeline = None
        try:
            # Generate oeb from html conversion. The handle is closed before
            # the temporary file is removed.
            with open(fname, 'rb') as htmlfile:
                oeb = html_input.convert(htmlfile, options, 'html', log, {})
        finally:
            # Always restore the caller's setting and clean up, even when
            # the HTML conversion raises.
            options.debug_pipeline = odi
            os.remove(fname)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb

View File

@ -0,0 +1,372 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
'''
Transform OEB content into a single (more or less) HTML file.
'''
import os
from urlparse import urlparse
from calibre import prepare_string_for_xml
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.utils.logging import default_log
class OEB2HTML(object):
    '''
    Base class. All subclasses should implement dump_text to actually transform
    content. Also, callers should use oeb2html to get the transformed html.
    links and images can be retrieved after calling oeb2html to get the mapping
    of OEB links and images to the new names used in the html returned by oeb2html.
    Images will always be referenced as if they are in an images directory.

    Use get_css to get the CSS classes for the OEB document as a string.
    '''

    def __init__(self, log=None):
        '''
        @log: Logger to use; default_log is used when None is given.
        '''
        self.log = default_log if log is None else log
        # Maps 'href#id' keys to the generated 'calibre_link-N' ids.
        self.links = {}
        # Maps absolute image hrefs to the file names used under images/.
        self.images = {}

    def oeb2html(self, oeb_book, opts):
        '''
        Convert oeb_book to a single HTML string.

        The links and images mappings are reset on every call.
        '''
        self.log.info('Converting OEB book to HTML...')
        self.opts = opts
        self.links = {}
        self.images = {}
        return self.mlize_spine(oeb_book)

    def mlize_spine(self, oeb_book):
        '''
        Run dump_text over the body of every spine item and wrap the result
        in a minimal HTML document.
        '''
        # The head must come before the body, not inside it.
        output = [u'<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /></head><body>']
        for item in oeb_book.spine:
            self.log.debug('Converting %s to HTML...' % item.href)
            stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
            output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
            output.append('\n\n')
        output.append('</body></html>')
        return ''.join(output)

    def dump_text(self, elem, stylizer, page):
        '''
        Return the list of strings making up the HTML for elem.
        Must be implemented by subclasses.
        '''
        raise NotImplementedError

    def get_link_id(self, href, aid):
        '''
        Return the generated id for the anchor aid in the file href,
        allocating a new 'calibre_link-N' id the first time it is seen.
        '''
        aid = '%s#%s' % (href, aid)
        if aid not in self.links:
            self.links[aid] = 'calibre_link-%s' % len(self.links)
        return self.links[aid]

    def rewrite_links(self, tag, attribs, page):
        '''
        Rewrite the id attribute and, for a tags, relative hrefs so that
        they point at the generated ids. attribs is modified and returned.
        '''
        # Rewrite ids.
        if 'id' in attribs:
            attribs['id'] = self.get_link_id(page.href, attribs['id'])
        # Rewrite links.
        if tag == 'a':
            # Anchors used purely as targets have no href; leave them alone
            # instead of failing with a KeyError.
            href = attribs.get('href', None)
            if href is not None:
                href = page.abshref(href)
                if self.url_is_relative(href):
                    # Links to a whole file use the 'href#' key, the same
                    # key get_link_id builds for an empty anchor id.
                    if '#' not in href:
                        href += '#'
                    if href not in self.links:
                        self.links[href] = 'calibre_link-%s' % len(self.links)
                    href = '#%s' % self.links[href]
                attribs['href'] = href
        return attribs

    def rewrite_images(self, tag, attribs, page):
        '''
        Point the src of img tags at images/<generated name> and record
        the mapping in self.images. attribs is modified and returned.
        '''
        if tag == 'img':
            src = attribs.get('src', None)
            if src:
                src = page.abshref(src)
                if src not in self.images:
                    ext = os.path.splitext(src)[1]
                    # Sequential, zero padded names keep the original extension.
                    fname = '%s%s' % (len(self.images), ext)
                    fname = fname.zfill(10)
                    self.images[src] = fname
                attribs['src'] = 'images/%s' % self.images[src]
        return attribs

    def url_is_relative(self, url):
        '''
        Return True when url has no scheme.
        '''
        o = urlparse(url)
        return not o.scheme

    def get_css(self, oeb_book):
        '''
        Return the text of the first text/css item in the manifest, or an
        empty string when there is none.
        '''
        css = u''
        for item in oeb_book.manifest:
            if item.media_type == 'text/css':
                css = item.data.cssText
                break
        return css
class OEB2HTMLNoCSSizer(OEB2HTML):
    '''
    This will remap a small number of CSS styles to equivalent HTML tags.
    '''

    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: The spine item elem belongs to; used to resolve links.

        Returns a list of strings that make up the HTML for elem, its
        children and its tail text.
        '''
        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, basestring) \
                or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [prepare_string_for_xml(elem.tail)]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        # NOTE(review): elem.attrib looks like the element's live attribute
        # mapping, so the edits below likely change the OEB tree itself --
        # confirm this is acceptable for callers.
        attribs = elem.attrib

        if tag == 'body':
            tag = 'div'
            attribs['id'] = self.get_link_id(page.href, '')
        tags.append(tag)

        # Ignore anything that is set to not be displayed. The tail is text
        # that follows the element, not part of it, so it is still emitted.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
                or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [prepare_string_for_xml(elem.tail)]
            return ['']

        # Remove attributes we won't want.
        if 'class' in attribs:
            del attribs['class']
        if 'style' in attribs:
            del attribs['style']
        attribs = self.rewrite_links(tag, attribs, page)
        attribs = self.rewrite_images(tag, attribs, page)

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''
        for k, v in attribs.items():
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))

        # Write the tag.
        text.append('<%s%s>' % (tag, at))

        # Turn styles into tags.
        if style['font-weight'] in ('bold', 'bolder'):
            text.append('<b>')
            tags.append('b')
        if style['font-style'] == 'italic':
            text.append('<i>')
            tags.append('i')
        if style['text-decoration'] == 'underline':
            text.append('<u>')
            tags.append('u')
        if style['text-decoration'] == 'line-through':
            text.append('<s>')
            tags.append('s')

        # Process tags that contain text. The text is escaped so that
        # characters such as & and < do not corrupt the generated markup.
        if hasattr(elem, 'text') and elem.text:
            text.append(prepare_string_for_xml(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(prepare_string_for_xml(elem.tail))

        return text
class OEB2HTMLInlineCSSizer(OEB2HTML):
    '''
    Turns external CSS classes into inline style attributes.
    '''

    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: The spine item elem belongs to; used to resolve links.

        Returns a list of strings that make up the HTML for elem, its
        children and its tail text.
        '''
        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, basestring) \
                or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [prepare_string_for_xml(elem.tail)]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        # NOTE(review): elem.attrib looks like the element's live attribute
        # mapping, so the edits below likely change the OEB tree itself --
        # confirm this is acceptable for callers.
        attribs = elem.attrib

        style_a = '%s' % style
        if tag == 'body':
            tag = 'div'
            attribs['id'] = self.get_link_id(page.href, '')
            if style['page-break-before'] != 'always':
                # Parenthesised on purpose: without the parentheses the
                # conditional expression swallows the whole concatenation
                # and either drops the computed style or the page break.
                style_a = 'page-break-before: always;' + (' ' if style_a else '') + style_a
        tags.append(tag)

        # Remove attributes we won't want.
        if 'class' in attribs:
            del attribs['class']
        if 'style' in attribs:
            del attribs['style']
        attribs = self.rewrite_links(tag, attribs, page)
        attribs = self.rewrite_images(tag, attribs, page)

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''
        for k, v in attribs.items():
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))

        # Turn style into strings for putting in the tag. The value is
        # escaped like every other attribute so quotes inside the style
        # (e.g. quoted font names) cannot end the attribute early.
        style_t = ''
        if style_a:
            style_t = ' style="%s"' % prepare_string_for_xml(style_a, attribute=True)

        # Write the tag.
        text.append('<%s%s%s>' % (tag, at, style_t))

        # Process tags that contain text. The text is escaped so that
        # characters such as & and < do not corrupt the generated markup.
        if hasattr(elem, 'text') and elem.text:
            text.append(prepare_string_for_xml(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(prepare_string_for_xml(elem.tail))

        return text
class OEB2HTMLClassCSSizer(OEB2HTML):
    '''
    Use CSS classes. css_style option can specify whether to use
    inline classes (style tag in the head) or reference an external
    CSS file called style.css.
    '''

    def mlize_spine(self, oeb_book):
        '''
        Convert every spine item and wrap the result in an HTML document
        whose head either links style.css or embeds the CSS, depending on
        opts.class_style.
        '''
        output = []
        for item in oeb_book.spine:
            self.log.debug('Converting %s to HTML...' % item.href)
            stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
            output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
            output.append('\n\n')
        if self.opts.class_style == 'external':
            css = u'<link href="style.css" rel="stylesheet" type="text/css" />'
        else:
            css = u'<style type="text/css">' + self.get_css(oeb_book) + u'</style>'
        output = [u'<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'] + [css] + [u'</head><body>'] + output + [u'</body></html>']
        return ''.join(output)

    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element. It is
                   not consulted here because class attributes are kept.
        @page: The spine item elem belongs to; used to resolve links.

        Returns a list of strings that make up the HTML for elem, its
        children and its tail text.
        '''
        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, basestring) \
                or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [prepare_string_for_xml(elem.tail)]
            return ['']

        # Setup our variables.
        text = ['']
        tags = []
        tag = barename(elem.tag)
        # NOTE(review): elem.attrib looks like the element's live attribute
        # mapping, so the edits below likely change the OEB tree itself --
        # confirm this is acceptable for callers.
        attribs = elem.attrib

        if tag == 'body':
            tag = 'div'
            attribs['id'] = self.get_link_id(page.href, '')
        tags.append(tag)

        # Remove attributes we won't want. class is kept on purpose: the
        # stylesheet is shipped with the document.
        if 'style' in attribs:
            del attribs['style']
        attribs = self.rewrite_links(tag, attribs, page)
        attribs = self.rewrite_images(tag, attribs, page)

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''
        for k, v in attribs.items():
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))

        # Write the tag.
        text.append('<%s%s>' % (tag, at))

        # Process tags that contain text. The text is escaped so that
        # characters such as & and < do not corrupt the generated markup.
        if hasattr(elem, 'text') and elem.text:
            text.append(prepare_string_for_xml(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(prepare_string_for_xml(elem.tail))

        return text
def oeb2html_no_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLNoCSSizer.

    Returns a (html, images) tuple, images being the converter's image
    mapping after the conversion.
    '''
    converter = OEB2HTMLNoCSSizer(log)
    markup = converter.oeb2html(oeb_book, opts)
    return (markup, converter.images)
def oeb2html_inline_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLInlineCSSizer.

    Returns a (html, images) tuple, images being the converter's image
    mapping after the conversion.
    '''
    converter = OEB2HTMLInlineCSSizer(log)
    markup = converter.oeb2html(oeb_book, opts)
    return (markup, converter.images)
def oeb2html_class_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLClassCSSizer.

    opts.class_style is set to 'inline' before converting. Returns a
    (html, images) tuple, images being the converter's image mapping after
    the conversion.
    '''
    converter = OEB2HTMLClassCSSizer(log)
    opts.class_style = 'inline'
    markup = converter.oeb2html(oeb_book, opts)
    return (markup, converter.images)

View File

@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from lxml import etree
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ebooks.html import tostring
from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile
class HTMLZOutput(OutputFormatPlugin):
    '''
    Output plugin that writes an OEB book as an HTMLZ archive: a zip file
    holding index.html, an optional style.css, an images directory and
    metadata.opf.
    '''

    name = 'HTMLZ Output'
    author = 'John Schember'
    file_type = 'htmlz'

    options = set([
        OptionRecommendation(name='css_type', recommended_value='class',
            level=OptionRecommendation.LOW,
            choices=['class', 'inline', 'tag'],
            help=_('Specify the handling of CSS. Default is class.\n'
                   'class: Use CSS classes and have elements reference them.\n'
                   'inline: Write the CSS as an inline style attribute.\n'
                   'tag: Turn as many CSS styles into HTML tags.'
            )),
        OptionRecommendation(name='class_style', recommended_value='external',
            level=OptionRecommendation.LOW,
            choices=['external', 'inline'],
            help=_('How to handle the CSS when using css-type = \'class\'.\n'
                   'Default is external.\n'
                   'external: Use an external CSS file that is linked in the document.\n'
                   'inline: Place the CSS in the head section of the document.'
            )),
        ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write oeb_book to output_path as an HTMLZ archive.

        @oeb_book: The OEB book to convert.
        @output_path: Path of the archive to create.
        @input_plugin: Not used by this plugin.
        @opts: Conversion options; css_type and class_style are read here.
        @log: Logger handed to the HTML converter.
        '''
        # The suffix names this plugin so leftover directories are
        # attributable to it.
        with TemporaryDirectory('_htmlz_output') as tdir:
            # HTML
            if opts.css_type == 'inline':
                from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer as OEB2HTMLizer
            elif opts.css_type == 'tag':
                from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer as OEB2HTMLizer
            else:
                from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

            htmlizer = OEB2HTMLizer(log)
            html = htmlizer.oeb2html(oeb_book, opts)

            # Round trip through the parser to get pretty printed output.
            html = etree.fromstring(html)
            html = tostring(html, pretty_print=True)

            with open(os.path.join(tdir, 'index.html'), 'wb') as tf:
                tf.write(html)

            # CSS
            if opts.css_type == 'class' and opts.class_style == 'external':
                css = htmlizer.get_css(oeb_book)
                # The file is opened in binary mode; text has to be encoded
                # explicitly or non-ASCII CSS fails to write.
                if not isinstance(css, bytes):
                    css = css.encode('utf-8')
                with open(os.path.join(tdir, 'style.css'), 'wb') as tf:
                    tf.write(css)

            # Images
            images = htmlizer.images
            if images:
                if not os.path.exists(os.path.join(tdir, 'images')):
                    os.makedirs(os.path.join(tdir, 'images'))
                for item in oeb_book.manifest:
                    if item.media_type in OEB_IMAGES and item.href in images:
                        fname = os.path.join(tdir, 'images', images[item.href])
                        with open(fname, 'wb') as img:
                            img.write(item.data)

            # Metadata
            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))

            # Close the archive explicitly so it is finalised before the
            # temporary directory goes away, instead of relying on garbage
            # collection.
            htmlz = ZipFile(output_path, 'w')
            try:
                htmlz.add_dir(tdir)
            finally:
                htmlz.close()

View File

@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
'''
Read meta information from TXT files
Read meta information from extZ (TXTZ, HTMLZ...) files.
'''
import os

View File

@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.gui2.convert.htmlz_output_ui import Ui_Form
from calibre.gui2.convert import Widget
format_model = None
class PluginWidget(Widget, Ui_Form):
    '''Conversion dialog page exposing the HTMLZ output options.'''

    TITLE = _('HTMLZ Output')
    HELP = _('Options specific to')+' HTMLZ '+_('output')
    COMMIT_NAME = 'htmlz_output'
    ICON = I('mimetypes/html.png')

    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
        '''
        Build the widget, fill both combo boxes with the choices declared
        by their options and then load the current option values.
        '''
        Widget.__init__(self, parent, ['css_type', 'class_style'])
        self.db = db
        self.book_id = book_id
        # Each option has a combo box named opt_<option name>.
        for opt_name in ('css_type', 'class_style'):
            combo = getattr(self, 'opt_' + opt_name)
            for choice in get_option(opt_name).option.choices:
                combo.addItem(choice)
        self.initialize_options(get_option, get_help, db, book_id)

View File

@ -0,0 +1,61 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>Form</class>
<widget class="QWidget" name="Form">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>438</width>
<height>300</height>
</rect>
</property>
<property name="windowTitle">
<string>Form</string>
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="2" column="0">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>246</height>
</size>
</property>
</spacer>
</item>
<item row="0" column="0">
<widget class="QLabel" name="label">
<property name="text">
<string>How to handle CSS</string>
</property>
<property name="buddy">
<cstring>opt_css_type</cstring>
</property>
</widget>
</item>
<item row="0" column="1">
<widget class="QComboBox" name="opt_css_type">
<property name="minimumContentsLength">
<number>20</number>
</property>
</widget>
</item>
<item row="1" column="0">
 <widget class="QLabel" name="label_2">
  <property name="text">
   <string>How to handle class based CSS</string>
  </property>
  <!-- Associate the label with its combo box, as the label in row 0 does. -->
  <property name="buddy">
   <cstring>opt_class_style</cstring>
  </property>
 </widget>
</item>
<item row="1" column="1">
<widget class="QComboBox" name="opt_class_style"/>
</item>
</layout>
</widget>
<resources/>
<connections/>
</ui>