from __future__ import with_statement

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

'''
Code to recursively parse HTML files and create an open ebook in a specified
directory or zip file. All the action starts in :function:`create_dir`.
'''

import sys, re, os, shutil, logging, tempfile, cStringIO, operator, functools

from urlparse import urlparse, urlunparse
from urllib import unquote

from lxml import etree
from lxml.html import HtmlElementClassLookup, HTMLParser as _HTMLParser, \
                      fromstring as _fromstring, tostring as _tostring, \
                      soupparser, HtmlElement
from lxml.etree import XPath
get_text = XPath("//text()")

from calibre import LoggingInterface, unicode_path, entity_to_unicode
from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS
from calibre.utils.config import Config, StringConfig
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.utils.zipfile import ZipFile
from cssutils import CSSParser

class HTMLElement(HtmlElement):

    @dynamic_property
    def specified_font_size(self):

        def fget(self):
            ans = self.get('specified_font_size', '')
            if not ans:
                return lambda x: x
            if ans.startswith('f'):
                return functools.partial(operator.mul, float(ans[1:]))
            return float(ans)

        def fset(self, val):
            self.set('specified_font_size', ('f'+repr(val(1))) if callable(val) else repr(val))

        return property(fget=fget, fset=fset)
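
    # Note on the encoding used by this property: the value is serialized into
    # the element's 'specified_font_size' attribute as a string, and a leading
    # 'f' marks a scale factor rather than an absolute size. For example,
    # assigning 12.0 stores '12.0', while assigning lambda x: 1.2*x stores
    # 'f1.2' and reads back as functools.partial(operator.mul, 1.2).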

    @dynamic_property
    def computed_font_size(self):
        def fget(self):
            ans = self.get('computed_font_size', '')
            if ans == '':
                return None
            return float(ans)

        def fset(self, val):
            self.set('computed_font_size', repr(val))

        return property(fget=fget, fset=fset)

    def remove_font_size_information(self):
        for elem in self.iter():
            for p in ('computed', 'specified'):
                elem.attrib.pop(p+'_font_size', None)

    def getpath(self):
        return self.getroottree().getpath(self)

class Lookup(HtmlElementClassLookup):

    def lookup(self, node_type, document, namespace, name):
        if node_type == 'element':
            return HTMLElement
        return HtmlElementClassLookup.lookup(self, node_type, document, namespace, name)

class HTMLParser(_HTMLParser):

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        self.set_element_class_lookup(Lookup())

parser = HTMLParser()

def fromstring(raw, **kw):
    return _fromstring(raw, parser=parser, **kw)

def tostring(root, pretty_print=False):
    return _tostring(root, encoding='utf-8', method='xml',
                     include_meta_content_type=True,
                     pretty_print=pretty_print)

class Link(object):
    '''
    Represents a link in an HTML file.
    '''

    @classmethod
    def url_to_local_path(cls, url, base):
        path = urlunparse(('', '', url.path, url.params, url.query, ''))
        path = unquote(path)
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(base, path))

    def __init__(self, url, base):
        '''
        :param url:  The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
        self.url = url
        self.parsed_url = urlparse(self.url)
        self.is_local = self.parsed_url.scheme in ('', 'file')
        self.is_internal = self.is_local and not bool(self.parsed_url.path)
        self.path = None
        self.fragment = unquote(self.parsed_url.fragment)
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(self.parsed_url, base)

    def __hash__(self):
        if self.path is None:
            return hash(self.url)
        return hash(self.path)

    def __eq__(self, other):
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'Link: %s --> %s'%(self.url, self.path)
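
# Usage sketch (hypothetical paths): resolving a relative href against the
# directory of the file that contains it:
#
#   link = Link(u'chap2.html#s1', u'/books/demo')
#   link.is_local  -> True
#   link.path      -> u'/books/demo/chap2.html'
#   link.fragment  -> u's1'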

class IgnoreFile(Exception):

    def __init__(self, msg, errno):
        Exception.__init__(self, msg)
        self.doesnt_exist = errno == 2
        self.errno = errno

class HTMLFile(object):
    '''
    Contains basic information about an HTML file. This
    includes a list of links to other files as well as
    the encoding of each file. Also tries to detect if the file is not an HTML
    file, in which case :member:`is_binary` is set to True.

    The encoding of the file is available as :member:`encoding`.
    '''

    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
    LINK_PAT = re.compile(
    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
    re.DOTALL|re.IGNORECASE)

    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
        '''
        :param level:    The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        :param referrer: The :class:`HTMLFile` that first refers to this file.
        '''
        self.path = unicode_path(path_to_html_file, abs=True)
        self.title = os.path.splitext(os.path.basename(self.path))[0]
        self.base = os.path.dirname(self.path)
        self.level = level
        self.referrer = referrer
        self.links = []

        try:
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError, err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
        if not self.is_binary:
            if encoding is None:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
                self.encoding = encoding
            else:
                self.encoding = encoding

            src = src.decode(encoding, 'replace')
            match = self.TITLE_PAT.search(src)
            self.title = match.group(1) if match is not None else self.title
            self.find_links(src)

    def __eq__(self, other):
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)

    def __repr__(self):
        return str(self)

    def find_links(self, src):
        for match in self.LINK_PAT.finditer(src):
            url = None
            for i in ('url1', 'url2', 'url3'):
                url = match.group(i)
                if url:
                    break
            link = self.resolve(url)
            if link not in self.links:
                self.links.append(link)

    def resolve(self, url):
        return Link(url, self.base)

def depth_first(root, flat, visited=None):
    if visited is None:  # use a fresh set per call; a mutable default would be shared across calls
        visited = set()
    yield root
    visited.add(root)
    for link in root.links:
        if link.path is not None and link not in visited:
            try:
                index = flat.index(link)
            except ValueError: # Can happen if max_levels is used
                continue
            hf = flat[index]
            if hf not in visited:
                yield hf
                visited.add(hf)
                for hf in depth_first(hf, flat, visited):
                    if hf not in visited:
                        yield hf
                        visited.add(hf)
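
# depth_first yields files in the order a reader following links would
# encounter them; links whose targets were pruned by max_levels never made it
# into `flat` and are skipped via the ValueError branch above.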

def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.

    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are followed.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    :return: A pair of lists (breadth_first, depth_first). Each list contains
             :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    level = 0
    flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
    next_level = list(flat)
    while level < max_levels and len(next_level) > 0:
        level += 1
        nl = []
        for hf in next_level:
            rejects = []
            for link in hf.links:
                if link.path is None or link.path in flat:
                    continue
                try:
                    nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
                    if nf.is_binary:
                        raise IgnoreFile('%s is a binary file'%nf.path, -1)
                    nl.append(nf)
                    flat.append(nf)
                except IgnoreFile, err:
                    rejects.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print repr(err)
            for link in rejects:
                hf.links.remove(link)

        next_level = list(nl)
    orec = sys.getrecursionlimit()
    sys.setrecursionlimit(500000)
    try:
        return flat, list(depth_first(flat[0], flat))
    finally:
        sys.setrecursionlimit(orec)
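
# Usage sketch (hypothetical path):
#
#   flat, deep = traverse('/books/demo/index.html', max_levels=2)
#   # `flat` lists HTMLFile objects breadth first, `deep` depth first.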

def opf_traverse(opf_reader, verbose=0, encoding=None):
    '''
    Return a list of :class:`HTMLFile` objects in the order specified by the
    `<spine>` element of the OPF.

    :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    '''
    if not opf_reader.spine:
        raise ValueError('OPF does not have a spine')
    flat = []
    for path in opf_reader.spine.items():
        path = os.path.abspath(path)
        if path not in flat:
            flat.append(os.path.abspath(path))
    for item in opf_reader.manifest:
        if 'html' in item.mime_type:
            path = os.path.abspath(item.path)
            if path not in flat:
                flat.append(path)
    for i, path in enumerate(flat):
        if not os.path.exists(path):
            path = path.replace('&', '%26')
            if os.path.exists(path):
                flat[i] = path
                for item in opf_reader.itermanifest():
                    item.set('href', item.get('href').replace('&', '%26'))
    ans = []
    for path in flat:
        if os.path.exists(path):
            ans.append(HTMLFile(path, 0, encoding, verbose))
        else:
            print 'WARNING: OPF spine item %s does not exist'%path
    ans = [f for f in ans if not f.is_binary]
    return ans
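
# Note: the '&' -> '%26' replacement above handles books whose on-disk file
# names contain a literal '%26' where the OPF href has '&'; the manifest
# hrefs are rewritten to match the names actually present on disk.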

convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)

def sanitize_head(match):
    x = match.group(1)
    x = _span_pat.sub('', x)
    return '<head>\n'+x+'\n</head>'

class PreProcessor(object):
    PREPROCESS = [
        # Some idiotic HTML generators (Frontpage I'm looking at you)
        # Put all sorts of crap into <head>. This messes up lxml
        (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
         sanitize_head),
        # Convert all entities, since lxml doesn't handle them well
        (re.compile(r'&(\S+?);'), convert_entities),
        # Remove the <![if/endif tags inserted by everybody's darling, MS Word
        (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
         lambda match: ''),
        ]

    # Fix pdftohtml markup
    PDFTOHTML = [
        # Replace <hr> tags with <br />
        (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
        # Remove page numbers
        (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
        # Remove <br> and replace <br><br> with <p>
        (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
        (re.compile(r'(.*)<br.*?>', re.IGNORECASE),
         lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
                       else match.group(1)),
        # Remove hyphenation
        (re.compile(r'-\n\r?'), lambda match: ''),

        # Remove gray background
        (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>')
        ]

    # Fix Book Designer markup
    BOOK_DESIGNER = [
        # HR
        (re.compile('<hr>', re.IGNORECASE),
         lambda match : '<span style="page-break-after:always"> </span>'),
        # Create header tags
        (re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
         lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
        (re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
         lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
        (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
         lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
        (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
         lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
        ]

    def is_baen(self, src):
        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
                          re.IGNORECASE).search(src) is not None

    def is_book_designer(self, raw):
        return re.search('<H2[^><]*id=BookTitle', raw) is not None

    def is_pdftohtml(self, src):
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

    def preprocess(self, html):
        opts = getattr(self, 'opts', False)
        if opts and hasattr(opts, 'profile') and getattr(opts.profile, 'remove_special_chars', False):
            html = opts.profile.remove_special_chars.sub('', html)
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = self.BOOK_DESIGNER
        elif self.is_pdftohtml(html):
            rules = self.PDFTOHTML
        else:
            rules = []
        for rule in self.PREPROCESS + rules:
            html = rule[0].sub(rule[1], html)
        return html
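
# Each rule above is a (pattern, replacement) pair applied as
# pattern.sub(replacement, html). For example, on pdftohtml output a bare
# page number like '12<br>' is dropped and end-of-line hyphenation ('-\n')
# is joined back together.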

class Parser(PreProcessor, LoggingInterface):
    # SELF_CLOSING_TAGS = 'hr|br|link|img|meta|input|area|base|basefont'
    # SELF_CLOSING_RULES = [re.compile(p[0]%SELF_CLOSING_TAGS, re.IGNORECASE) for p in
    #                       [
    #                        (r'<(?P<tag>%s)(?P<attrs>(\s+[^<>]*){0,1})(?<!/)>',
    #                         '<\g<tag>\g<attrs> />'),
    #                        (),
    #                       ]
    #                      ]

    def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
        LoggingInterface.__init__(self, logging.getLogger(name))
        self.setup_cli_handler(opts.verbose)
        self.htmlfile = htmlfile
        self.opts = opts
        self.tdir = tdir
        self.resource_map = resource_map
        self.htmlfiles = htmlfiles
        self.resource_dir = os.path.join(tdir, 'resources')
        save_counter = 1
        self.htmlfile_map = {}
        self.level = self.htmlfile.level
        for f in self.htmlfiles:
            name = os.path.basename(f.path)
            name = os.path.splitext(name)[0] + '.xhtml'
            if name in self.htmlfile_map.values():
                name = os.path.splitext(name)[0] + '_cr_%d'%save_counter + os.path.splitext(name)[1]
                save_counter += 1
            self.htmlfile_map[f.path] = name

        self.parse_html()
        # Handle <image> tags inside embedded <svg>
        # At least one source of EPUB files (Penguin) uses xlink:href
        # without declaring the xlink namespace
        for image in self.root.xpath('//image'):
            for attr in image.attrib.keys():
                if attr.endswith(':href'):
                    nhref = self.rewrite_links(image.get(attr))
                    image.set(attr, nhref)

        self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
        for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates
            if self.root.get(bad, None) is not None:
                self.root.attrib.pop(bad)

    def save_path(self):
        return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])

    def save(self, strip_comments=False):
        '''
        Save processed HTML into the content directory.
        Should be called after all HTML processing is finished.
        '''
        self.root.set('xmlns', 'http://www.w3.org/1999/xhtml')
        self.root.set('xmlns:xlink', 'http://www.w3.org/1999/xlink')
        for svg in self.root.xpath('//svg'):
            svg.set('xmlns', 'http://www.w3.org/2000/svg')

        ans = tostring(self.root, pretty_print=self.opts.pretty_print)
        ans = re.compile(r'<head>', re.IGNORECASE).sub(
            '<head>\n\t<meta http-equiv="Content-Type" '
            'content="text/html; charset=utf-8" />\n', ans[:1000])+ans[1000:]
        if strip_comments:
            ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
        with open(self.save_path(), 'wb') as f:
            f.write(ans)
        return f.name

    def parse_html(self):
        ''' Create lxml ElementTree from HTML '''
        self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
        if self.htmlfile.is_binary:
            raise ValueError('Not a valid HTML file: '+self.htmlfile.path)
        src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip()
        src = src.replace('\x00', '').replace('\r', ' ')
        src = self.preprocess(src)
        # lxml chokes on unicode input when it contains encoding declarations
        for pat in ENCODING_PATS:
            src = pat.sub('', src)
        src = src[src.find('<'):]
        # Remove unclosed <style> tag as that messes up lxml's parsing
        src = re.sub(r'<style>\s*</head>', '', src)
        try:
            self.root = fromstring(src)
        except:
            if self.opts.verbose:
                self.log_exception('lxml based parsing failed')
            self.root = soupparser.fromstring(src, makeelement=parser.makeelement)
        head = self.root.xpath('./head')
        if head:
            head = head[0]
        else:
            head = etree.SubElement(self.root, 'head')
            self.root.remove(head)
            self.root.insert(0, head)

        self.head = head
        try:
            self.body = self.root.body
        except:
            import traceback
            err = traceback.format_exc()
            self.root = fromstring(u'<html><head/><body><p>This page was too '
                        'severely malformed for calibre to handle. '
                        'It has been replaced by this error message.'
                        '</p><pre>%s</pre></body></html>'%err)
            self.head = self.root.xpath('./head')[0]
            self.body = self.root.body
        invalid_counter = 0
        for a in self.root.xpath('//a[@name]'):
            try:
                a.set('id', a.get('name'))
            except:
                invalid_counter += 1
                for x in ('id', 'name'):
                    a.set(x, 'calibre_invalid_id_%d'%invalid_counter)
        if not self.head.xpath('./title'):
            title = etree.SubElement(self.head, 'title')
            title.text = _('Unknown')

    def debug_tree(self, name):
        '''
        Dump source tree for later debugging.
        '''
        tdir = tempfile.gettempdir()
        if not os.path.exists(tdir):
            os.makedirs(tdir)
        with open(os.path.join(tdir, '%s-%s.html'%\
                (os.path.basename(self.htmlfile.path), name)), 'wb') as f:
            f.write(tostring(self.root))
            self.log_debug(_('Written processed HTML to ')+f.name)

    def rewrite_links(self, olink):
        '''
        Make all links in the document relative so that they work in the EPUB container.
        Also copies any resources (like images, stylesheets, scripts, etc.) into
        the local tree.
        '''
        if not isinstance(olink, unicode):
            olink = olink.decode(self.htmlfile.encoding)
        link = self.htmlfile.resolve(olink)
        frag = (('#'+link.fragment) if link.fragment else '')
        if link.path == self.htmlfile.path:
            return frag if frag else '#'
        if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
            return olink
        if link.path in self.htmlfiles:
            return self.htmlfile_map[link.path] + frag
        if re.match(r'\.(x){0,1}htm(l){0,1}', os.path.splitext(link.path)[1]) is not None:
            return olink # This happens when --max-levels is used
        if link.path in self.resource_map.keys():
            return self.resource_map[link.path] + frag
        name = os.path.basename(link.path)
        name, ext = os.path.splitext(name)
        name += ('_%d'%len(self.resource_map)) + ext
        shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
        name = 'resources/' + name
        self.resource_map[link.path] = name
        return name + frag
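
    # Illustration (hypothetical files): a reference to ../img/cover.jpg from
    # /books/demo/ch1.html is copied to <tdir>/resources/cover_0.jpg (the _0
    # suffix keeps names unique) and the href is rewritten to
    # 'resources/cover_0.jpg'; later references to the same file reuse the
    # entry cached in resource_map.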

class Processor(Parser):
    '''
    This class builds on :class:`Parser` to provide additional methods
    to perform various processing/modification tasks on HTML files.
    '''

    LINKS_PATH = XPath('//a[@href]')
    PIXEL_PAT = re.compile(r'([-]?\d+|[-]?\d*\.\d+)px')
    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')

    def __init__(self, *args, **kwargs):
        Parser.__init__(self, *args, **kwargs)
        temp = LoggingInterface(logging.getLogger('cssutils'))
        temp.setup_cli_handler(self.opts.verbose)
        self.css_parser = CSSParser(log=temp.logger, loglevel=logging.ERROR)
        self.stylesheet = self.font_css = self.override_css = None

    def detect_chapters(self):
        self.detected_chapters = self.opts.chapter(self.root)
        chapter_mark = self.opts.chapter_mark
        page_break_before = 'display: block; page-break-before: always'
        page_break_after = 'display: block; page-break-after: always'
        for elem in self.detected_chapters:
            text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
            self.log_info('\tDetected chapter: %s', text[:50])
            if chapter_mark == 'none':
                continue
            elif chapter_mark == 'rule':
                mark = etree.Element('hr')
            elif chapter_mark == 'pagebreak':
                mark = etree.Element('div', style=page_break_after)
            else: # chapter_mark == 'both':
                mark = etree.Element('hr', style=page_break_before)
            elem.addprevious(mark)
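
    # opts.chapter is assumed to be a callable (typically a compiled lxml
    # XPath) returning the elements that start chapters; chapter_mark selects
    # the separator inserted before each one: 'none', 'rule' (<hr>),
    # 'pagebreak' (a page-breaking <div>) or 'both'.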

    def save(self, strip_comments=False):
        style_path = os.path.splitext(os.path.basename(self.save_path()))[0]
        for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]):
            if sheet is not None:
                style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet',
                                         'href':'resources/%s_%d.css'%(style_path, i),
                                         'charset':'UTF-8'})
                style.tail = '\n'
                path = os.path.join(os.path.dirname(self.save_path()), *(style.get('href').split('/')))
                self.resource_map[path] = style.get('href')
                raw = getattr(sheet, 'cssText', sheet)
                if isinstance(raw, unicode):
                    raw = raw.encode('utf-8')
                open(path, 'wb').write(raw)
        return Parser.save(self, strip_comments=strip_comments)

    def populate_toc(self, toc):
        '''
        Populate the Table of Contents from detected chapters and links.
        '''
        class Adder(object):

            def __init__(self, toc):
                self.next_play_order = max([x.play_order for x in toc.flat()])

            def __call__(self, href, fragment, text, target, type='link'):
                for entry in toc.flat():
                    if entry.href == href and entry.fragment == fragment:
                        return entry
                if len(text) > 50:
                    text = text[:50] + u'\u2026'
                self.next_play_order += 1
                return target.add_item(href, fragment, text, type=type,
                                       play_order=self.next_play_order)
        add_item = Adder(toc)
        name = self.htmlfile_map[self.htmlfile.path]
        href = 'content/'+name

        # Add level* TOC items
        counter = 0

        def elem_to_link(elem, href, counter):
            text = (u''.join(elem.xpath('string()'))).strip()
            if not text:
                return None, None, None
            t = elem.xpath('descendant-or-self::a[@href]')
            if t:
                _href = 'content/' + t[0].get('href', '')
                parts = _href.split('#')
                _href = parts[0]
                frag = None if len(parts) == 1 else parts[-1]
            else:
                _href = href
                id = elem.get('id', 'calibre_chapter_%d'%counter)
                elem.set('id', id)
                frag = id
            return text, _href, frag

        if self.opts.level1_toc is not None:
            level1 = self.opts.level1_toc(self.root)
            level1_order = []
            if level1:
                added = {}
                for elem in level1:
                    text, _href, frag = elem_to_link(elem, href, counter)
                    counter += 1
                    if text:
                        level1_order.append(add_item(_href, frag, text, toc, type='chapter'))
                        added[elem] = level1_order[-1]
                        add_item(_href, frag, 'Top', added[elem], type='chapter')
                if self.opts.level2_toc is not None:
                    added2 = {}
                    level2 = list(self.opts.level2_toc(self.root))
                    for elem in level2:
                        level1 = None
                        for item in self.root.iterdescendants():
                            if item in added.keys():
                                level1 = added[item]
                            elif item == elem and level1 is not None:
                                text, _href, frag = elem_to_link(elem, href, counter)
                                counter += 1
                                if text:
                                    added2[elem] = \
                                        add_item(_href, frag, text, level1, type='chapter')
                    if self.opts.level3_toc is not None:
                        level3 = list(self.opts.level3_toc(self.root))
                        for elem in level3:
                            level2 = None
                            for item in self.root.iterdescendants():
                                if item in added2.keys():
                                    level2 = added2[item]
                                elif item == elem and level2 is not None:
                                    text, _href, frag = elem_to_link(elem, href, counter)
                                    counter += 1
                                    if text:
                                        add_item(_href, frag, text, level2, type='chapter')

            if level1_order: # Fix play order
                next_play_order = level1_order[0].play_order
                for x in level1_order:
                    for y in x.flat():
                        y.play_order = next_play_order
                        next_play_order += 1

        if len(toc) > 0:
            # Detected TOC entries using --level* options
            # so aborting all other toc processing
            return
        # Add chapters to TOC
        if not self.opts.no_chapters_in_toc:
            for elem in getattr(self, 'detected_chapters', []):
                text = (u''.join(elem.xpath('string()'))).strip()
                if text:
                    counter += 1
                    id = elem.get('id', 'calibre_chapter_%d'%counter)
                    elem.set('id', id)
                    add_item(href, id, text, toc, type='chapter')

        if len(list(toc.flat())) >= self.opts.toc_threshold:
            return
        referrer = toc
        if self.htmlfile.referrer is not None:
            try:
                name = self.htmlfile_map[self.htmlfile.referrer.path]
                href = 'content/'+name
                for i in toc.flat():
                    if href == i.href and i.fragment is None:
                        referrer = i
                        break
            except KeyError:
                pass
        if referrer is toc:
            text = self.htmlfile.title
            # The entry added here is for the current file, so use its own
            # path (the referrer's path would fail for root files, whose
            # referrer is None)
            name = self.htmlfile_map[self.htmlfile.path]
            href = 'content/'+name
            referrer = add_item(href, None, text, toc)

        # Add links to TOC
        if int(self.opts.max_toc_links) > 0:
            for link in list(self.LINKS_PATH(self.root))[:self.opts.max_toc_links]:
                text = (u''.join(link.xpath('string()'))).strip()
                if text:
                    href = link.get('href', '')
                    if href and not (href.startswith('http://') or href.startswith('https://')):
                        href = href.strip()
                        if href.startswith('#'):
                            href = self.htmlfile_map[self.htmlfile.path] + href
                        href = 'content/'+href
                        parts = href.split('#')
                        href, fragment = parts[0], None
                        if len(parts) > 1:
                            fragment = parts[1]
                        add_item(href, fragment, text, referrer)

    @classmethod
    def preprocess_css(cls, css, dpi=96):
        def rescale(match):
            val = match.group(1)
            try:
                val = float(val)
            except ValueError:
                return ''
            return '%fpt'%(72 * val/dpi)

        css = cls.PIXEL_PAT.sub(rescale, css)
        css = cls.PAGE_PAT.sub('', css)
        return css
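
    # Worked example: at the default dpi of 96, '16px' rescales to
    # 72 * 16 / 96 = 12pt (emitted as '12.000000pt' by the '%f' format);
    # @page rules are stripped entirely.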

    def extract_css(self, parsed_sheets):
        '''
        Remove all CSS information from the document and store it as
        :class:`StyleSheet` objects.
        '''

        def get_id(chapter, counter, prefix='calibre_css_'):
            new_id = '%s_%d'%(prefix, counter)
            if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
                chapter.attrib['id'] = id = chapter.get('name')
                if not id:
                    chapter.attrib['id'] = chapter.attrib['name'] = new_id
                    return new_id
            if 'id' in chapter.keys():
                id = chapter.get('id')
            else:
                id = new_id
                chapter.set('id', id)
            return id

        self.external_stylesheets, self.stylesheet = [], self.css_parser.parseString('')
        self.specified_override_css = []
        for link in self.root.xpath('//link'):
            ltype = link.get('type', link.get('rel', 'text/css')).lower()
            if 'css' in ltype or 'style' in ltype:
                file = os.path.join(self.tdir, *(link.get('href', '').split('/')))
                if file and not 'http:' in file:
                    if not parsed_sheets.has_key(file):
                        try:
                            self.log_info('Processing stylesheet %s...'%file)
                            css = self.preprocess_css(open(file).read())
                        except (IOError, OSError):
                            self.log_error('Failed to open stylesheet: %s'%file)
                        else:
                            try:
                                try:
                                    parsed_sheets[file] = self.css_parser.parseString(css)
                                except ValueError:
                                    parsed_sheets[file] = \
                                        self.css_parser.parseString(\
                                            css.decode('utf8', 'replace'))
                            except:
                                parsed_sheets[file] = css.decode('utf8', 'replace')
                                self.log_warning('Failed to parse stylesheet: %s'%file)
                                if self.opts.verbose > 1:
                                    self.log_exception('')
                    if parsed_sheets.has_key(file):
                        self.external_stylesheets.append(parsed_sheets[file])

        for style in self.root.xpath('//style'):
            if 'css' in style.get('type', 'text/css').lower():
                override_css = style.get('title', '') == 'override_css'
                raw = '\n'.join(style.xpath('./text()'))
                css = self.preprocess_css(raw)
                try:
                    sheet = self.css_parser.parseString(css)
                except:
                    self.log_debug('Failed to parse style element')
                else:
                    for rule in sheet:
                        if override_css:
                            self.specified_override_css.append(rule)
                        else:
                            self.stylesheet.add(rule)
                style.getparent().remove(style)
        cache = {}
        class_counter = 0
        for font in self.root.xpath('//font'):
            try:
                size = font.attrib.pop('size', '3')
            except:
                size = '3'
            if size and size.strip() and size.strip()[0] in ('+', '-'):
                size = re.search(r'[+-]{0,1}[\d\.]+', size)
                try:
                    size = float(size.group())
                except:
                    size = 0
                size += 3 # Hack assumes basefont=3
            try:
                setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
            except ValueError:
                setting = ''
            face = font.attrib.pop('face', None)
            if face:
                faces = []
                for face in face.split(','):
                    face = face.strip()
                    if ' ' in face and not (face[0] == face[-1] == '"'):
                        face = '"%s"' % face.replace('"', r'\"')
                    faces.append(face)
                for generic in ('serif', 'sans-serif', 'monospace'):
                    if generic in faces:
                        break
                else:
                    faces.append('serif')
                family = ', '.join(faces)
                setting += 'font-family: %s;' % family
            color = font.attrib.pop('color', None)
            if color is not None:
                setting += 'color:%s'%color
            classname = cache.get(setting, None)
            if classname is None:
                classname = 'calibre_class_%d'%class_counter
                class_counter += 1
                cache[setting] = classname
            cn = font.get('class', '')
            if cn: cn += ' '
            cn += classname
            font.set('class', cn)
            font.tag = 'span'

        id_css, id_css_counter = {}, 0
        for elem in self.root.xpath('//*[@style]'):
            setting = elem.get('style')
            if elem.get('id', False) or elem.get('class', False):
                elem.set('id', elem.get('id', 'calibre_css_id_%d'%id_css_counter))
                id_css_counter += 1
                id_css[elem.tag+'#'+elem.get('id')] = setting
            else:
                classname = cache.get(setting, None)
                if classname is None:
                    classname = 'calibre_class_%d'%class_counter
                    class_counter += 1
                    cache[setting] = classname
                cn = elem.get('class', classname)
                elem.set('class', cn)
            elem.attrib.pop('style')

        css = '\n'.join(['.%s {%s;}'%(cn, setting) for \
                         setting, cn in cache.items()])
        css += '\n\n'
        css += '\n'.join(['%s {%s;}'%(selector, setting) for \
                          selector, setting in id_css.items()])
        sheet = self.css_parser.parseString(self.preprocess_css(css.replace(';;}', ';}')))
        for rule in sheet:
            self.stylesheet.add(rule)
        css = ''
        css += '\n\n' + 'body {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt;}'
        css += '\n\n@page {margin-top: %fpt; margin-bottom: %fpt; }'%(self.opts.margin_top, self.opts.margin_bottom)
        css += '\n\nbody {margin-left: %fpt; margin-right: %fpt}'%(self.opts.margin_left, self.opts.margin_right)
        # Workaround for anchor rendering bug in ADE
        css += '\n\na { color: inherit; text-decoration: inherit; cursor: default; }\na[href] { color: blue; text-decoration: underline; cursor:pointer; }'
        if self.opts.remove_paragraph_spacing:
            css += '\n\np {text-indent: 1.5em; margin-top:0pt; margin-bottom:0pt; padding:0pt; border:0pt;}'
        if not self.opts.no_justification:
            css += '\n\nbody {text-align: justify}'
        if self.opts.override_css:
            css += '\n\n' + self.opts.override_css
        self.override_css = self.css_parser.parseString(self.preprocess_css(css))
        for rule in reversed(self.specified_override_css):
            self.override_css.insertRule(rule, index=0)

def config(defaults=None, config_name='html',
           desc=_('Options to control the traversal of HTML')):
    if defaults is None:
        c = Config(config_name, desc)
    else:
        c = StringConfig(defaults, desc)

    c.add_opt('output', ['-o', '--output'], default=None,
              help=_('The output directory. Default is the current directory.'))
    c.add_opt('encoding', ['--encoding'], default=None,
              help=_('Character encoding for HTML files. Default is to auto detect.'))
    c.add_opt('zip', ['--zip'], default=False,
              help=_('Create the output in a zip file. If this option is specified, --output should be the name of a file, not a directory.'))

    traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
    traversal('breadth_first', ['--breadth-first'], default=False,
              help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first.'))
    traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
              help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))

    metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
    metadata('title', ['-t', '--title'], default=None,
             help=_('Set the title. Default is to autodetect.'))
    metadata('authors', ['-a', '--authors'], default=None,
             help=_('The author(s) of the ebook, as a & separated list.'))
    metadata('tags', ['--subjects'], default=None,
             help=_('The subject(s) of this book, as a comma separated list.'))
    metadata('publisher', ['--publisher'], default=None,
             help=_('Set the publisher of this book.'))
    metadata('comments', ['--comment'], default=None,
             help=_('A summary of this book.'))
    metadata('from_opf', ['--metadata-from'], default=None,
             help=_('Load metadata from the specified OPF file'))

    debug = c.add_group('debug', _('Options useful for debugging'))
    debug('verbose', ['-v', '--verbose'], default=0, action='count',
          help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
    debug('pretty_print', ['--pretty-print'], default=False,
          help=_('Output HTML is "pretty printed" for easier parsing by humans'))

    return c
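
# Usage sketch: build the option set, then parse defaults or the command line.
#
#   c = config()
#   opts = c.parse()              # opts.output, opts.encoding, opts.max_levels, ...
#   parser = c.option_parser()    # optparse parser, as used in main() below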

def option_parser():
    c = config()
    return c.option_parser(usage=_('''\
%prog [options] file.html|opf

Follow all links in an HTML file and collect them into the specified directory.
Also collects any resources like images, stylesheets, scripts, etc.
If an OPF file is specified instead, the list of files in its <spine> element
is used.
'''))

def search_for_opf(dir):
    for f in os.listdir(dir):
        if f.lower().endswith('.opf'):
            return OPF(open(os.path.join(dir, f), 'rb'), dir)

def get_filelist(htmlfile, opts):
    '''
    Build a list of files referenced by the HTML file, or try to detect and
    use an OPF file instead.
    '''
    print 'Building file list...'
    dir = os.path.dirname(htmlfile)
    if not dir:
        dir = os.getcwd()
    opf = search_for_opf(dir)
    filelist = None
    if opf is not None:
        try:
            filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
        except:
            pass
    if not filelist:
        filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                            verbose=opts.verbose, encoding=opts.encoding)\
                   [0 if opts.breadth_first else 1]
    if opts.verbose:
        print '\tFound files...'
        for f in filelist:
            print '\t\t', f
    return opf, filelist

def parse_content(filelist, opts):
    '''
    Parse content, rewriting links and copying resources.
    '''
    if not opts.output:
        opts.output = '.'
    opts.output = os.path.abspath(opts.output)
    rdir = os.path.join(opts.output, 'content', 'resources')
    if not os.path.exists(rdir):
        os.makedirs(rdir)
    resource_map = {}
    for htmlfile in filelist:
        p = Parser(htmlfile, opts, os.path.join(opts.output, 'content'),
                   resource_map, filelist)
        p.save()
    return resource_map, p.htmlfile_map

def merge_metadata(htmlfile, opf, opts):
    '''
    Merge metadata from various sources.
    '''
    if opf:
        mi = MetaInformation(opf)
    elif htmlfile:
        from calibre.ebooks.metadata.meta import get_metadata
        try:
            mi = get_metadata(open(htmlfile, 'rb'), 'html')
        except:
            mi = MetaInformation(None, None)
    if opts.from_opf is not None and os.access(opts.from_opf, os.R_OK):
        mi.smart_update(OPF(open(opts.from_opf, 'rb'), os.path.abspath(os.path.dirname(opts.from_opf))))
    for attr in ('title', 'authors', 'publisher', 'tags', 'comments'):
        val = getattr(opts, attr, None)
        if val is None or val == _('Unknown') or val == [_('Unknown')]:
            continue
        if attr == 'authors':
            val = [i.strip() for i in val.split('&') if i.strip()]
        elif attr == 'tags':
            val = [i.strip() for i in val.split(',') if i.strip()]
        setattr(mi, attr, val)

    cover = getattr(opts, 'cover', False)
    if cover and os.path.exists(cover):
        mi.cover = os.path.abspath(cover)

    if not mi.title:
        if htmlfile:
            mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
        else:
            mi.title = _('Unknown')
    if not mi.authors:
        mi.authors = [_('Unknown')]
    return mi

def create_metadata(basepath, mi, filelist, resources):
    '''
    Create an OPF metadata object with correct spine and manifest.
    '''
    mi = OPFCreator(basepath, mi)
    mi.guide = None
    entries = [('content/'+f, 'application/xhtml+xml') for f in filelist] + \
              [(f, None) for f in resources]
    for f in filelist:
        if os.path.exists(os.path.join(basepath, 'content', 'resources', f+'.css')):
            entries.append(('content/resources/'+f+'.css', 'text/css'))
    mi.create_manifest(entries)
    mi.create_spine(['content/'+f for f in filelist])
    return mi

def rebase_toc(toc, htmlfile_map, basepath, root=True):
    '''
    Rebase a :class:`calibre.ebooks.metadata.toc.TOC` object. Maps all entries
    in the TOC to point to their new locations relative to the new OPF file.
    '''
    def fix_entry(entry):
        if entry.abspath in htmlfile_map.keys():
            entry.href = 'content/' + htmlfile_map[entry.abspath]

    for entry in toc:
        rebase_toc(entry, htmlfile_map, basepath, root=False)
        fix_entry(entry)
    if root:
        toc.base_path = basepath

def create_dir(htmlfile, opts):
    '''
    Create a directory that contains the open ebook.
    '''
    if htmlfile.lower().endswith('.opf'):
        opf = OPF(open(htmlfile, 'rb'), os.path.dirname(os.path.abspath(htmlfile)))
        filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
        mi = MetaInformation(opf)
    else:
        opf, filelist = get_filelist(htmlfile, opts)
        mi = merge_metadata(htmlfile, opf, opts)

    resource_map, htmlfile_map = parse_content(filelist, opts)
    resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]

    if opf and opf.cover and os.access(opf.cover, os.R_OK):
        cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1])
        shutil.copyfile(opf.cover, cpath)
        resources.append(cpath)
        mi.cover = cpath

    spine = [htmlfile_map[f.path] for f in filelist]
    mi = create_metadata(opts.output, mi, spine, resources)
    buf = cStringIO.StringIO()
    if mi.toc:
        rebase_toc(mi.toc, htmlfile_map, opts.output)
    with open(os.path.join(opts.output, 'metadata.opf'), 'wb') as f:
        mi.render(f, buf)
    toc = buf.getvalue()
    if toc:
        with open(os.path.join(opts.output, 'toc.ncx'), 'wb') as f:
            f.write(toc)
    print 'Open ebook created in', opts.output

def create_oebzip(htmlfile, opts):
    '''
    Create a zip file that contains the Open ebook.
    '''
    tdir = PersistentTemporaryDirectory('_create_oebzip')
    if opts.output is None:
        opts.output = os.path.join(os.path.splitext(htmlfile)[0]+'.oeb.zip')
    ofile = opts.output
    opts.output = tdir
    create_dir(htmlfile, opts)
    zf = ZipFile(ofile, 'w')
    zf.add_dir(opts.output)
    print 'Output saved to', ofile

def main(args=sys.argv):
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) < 2:
        parser.print_help()
        print _('You must specify an input HTML file')
        return 1

    htmlfile = args[1]
    if opts.zip:
        create_oebzip(htmlfile, opts)
    else:
        create_dir(htmlfile, opts)

    return 0

def gui_main(htmlfile, pt=None):
    '''
    Convenience wrapper for use in recursively importing HTML files.
    '''
    if pt is None:
        pt = PersistentTemporaryFile('_html2oeb_gui.oeb.zip')
    pt.close()
    opts = '''
pretty_print = True
max_levels = 5
output = %s
'''%repr(pt.name)
    c = config(defaults=opts)
    opts = c.parse()
    create_oebzip(htmlfile, opts)
    zf = ZipFile(pt.name, 'r')
    nontrivial = [f for f in zf.infolist() if f.compress_size > 1 and not f.filename.endswith('.opf')]
    if len(nontrivial) < 2:
        return None
    return pt.name

if __name__ == '__main__':
    sys.exit(main())