mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
IGN:Full implementation of HTML traversal
This commit is contained in:
parent
2efa1ec708
commit
39afcb27f7
@ -8,6 +8,7 @@ Conversion to EPUB.
|
||||
'''
|
||||
import sys
|
||||
from calibre.utils.config import Config, StringConfig
|
||||
from calibre.ebooks.html import config as common_config
|
||||
|
||||
def config(defaults=None):
|
||||
desc = _('Options to control the conversion to EPUB')
|
||||
@ -16,22 +17,11 @@ def config(defaults=None):
|
||||
else:
|
||||
c = StringConfig(defaults, desc)
|
||||
|
||||
c.update(common_config())
|
||||
c.remove_opt('output')
|
||||
|
||||
c.add_opt('output', ['-o', '--output'], default=None,
|
||||
help=_('The output EPUB file. If not specified, it is derived from the input file name.'))
|
||||
c.add_opt('encoding', ['--encoding'], default=None,
|
||||
help=_('Character encoding for HTML files. Default is to auto detect.'))
|
||||
|
||||
metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
|
||||
metadata('title', ['-t', '--title'], default=None,
|
||||
help=_('Set the title. Default is to autodetect.'))
|
||||
metadata('authors', ['-a', '--authors'], default=_('Unknown'),
|
||||
help=_('The author(s) of the ebook, as a comma separated list.'))
|
||||
|
||||
traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
|
||||
traversal('breadth_first', ['--breadth-first'], default=False,
|
||||
help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first'))
|
||||
traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
|
||||
help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))
|
||||
|
||||
structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
|
||||
structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
|
||||
@ -46,8 +36,5 @@ help on using this feature.
|
||||
help=_('Don\'t add detected chapters to the Table of Contents'))
|
||||
structure('no_links_in_toc', ['--no-links-in-toc'], default=False,
|
||||
help=_('Don\'t add links in the root HTML file to the Table of Contents'))
|
||||
debug = c.add_group('debug', _('Options useful for debugging'))
|
||||
debug('verbose', ['-v', '--verbose'], default=0, action='count',
|
||||
help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
|
||||
|
||||
return c
|
@ -2,44 +2,22 @@ from __future__ import with_statement
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
import os, sys, logging, re, shutil, tempfile
|
||||
from lxml import html
|
||||
import os, sys, re, shutil
|
||||
from lxml.etree import XPath
|
||||
get_text = XPath("//text()")
|
||||
|
||||
from calibre import LoggingInterface
|
||||
from calibre.ebooks.html import PreProcessor
|
||||
from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist
|
||||
from calibre.ebooks.epub import config as common_config
|
||||
from calibre.ebooks.epub.traverse import traverse, opf_traverse
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ebooks.metadata.opf import OPFReader
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
|
||||
|
||||
class HTMLProcessor(PreProcessor, LoggingInterface):
|
||||
|
||||
ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
|
||||
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]
|
||||
class HTMLProcessor(Parser):
|
||||
|
||||
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles):
|
||||
LoggingInterface.__init__(self, logging.getLogger('html2epub'))
|
||||
self.setup_cli_handler(opts.verbose)
|
||||
|
||||
self.htmlfile = htmlfile
|
||||
self.opts = opts
|
||||
self.tdir = tdir
|
||||
self.resource_map = resource_map
|
||||
self.resource_dir = os.path.join(tdir, 'resources')
|
||||
self.htmlfiles = htmlfiles
|
||||
|
||||
self.parse_html()
|
||||
|
||||
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
|
||||
|
||||
Parser.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles,
|
||||
name='html2epub')
|
||||
if opts.verbose > 2:
|
||||
self.debug_tree('parsed')
|
||||
|
||||
self.detected_chapters = self.opts.chapter(self.root)
|
||||
self.extract_css()
|
||||
|
||||
if opts.verbose > 2:
|
||||
@ -49,130 +27,6 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
|
||||
|
||||
self.split()
|
||||
|
||||
def debug_tree(self, name):
|
||||
'''
|
||||
Dump source tree for later debugging.
|
||||
'''
|
||||
tdir = tempfile.gettempdir()
|
||||
if not os.path.exists(tdir):
|
||||
os.makedirs(tdir)
|
||||
with open(os.path.join(tdir, 'html2epub-%s-%s.html'%\
|
||||
(os.path.basename(self.htmlfile.path), name)), 'wb') as f:
|
||||
f.write(html.tostring(self.root, encoding='utf-8'))
|
||||
self.log_debug(_('Written processed HTML to ')+f.name)
|
||||
|
||||
def parse_html(self):
|
||||
''' Create lxml ElementTree from HTML '''
|
||||
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
|
||||
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
|
||||
src = self.preprocess(src)
|
||||
# lxml chokes on unicode input when it contains encoding declarations
|
||||
for pat in self.ENCODING_PATS:
|
||||
src = pat.sub('', src)
|
||||
try:
|
||||
self.root = html.document_fromstring(src)
|
||||
except:
|
||||
if self.opts.verbose:
|
||||
self.log_exception('lxml based parsing failed')
|
||||
self.root = html.soupparser.fromstring()
|
||||
self.head = self.body = None
|
||||
head = self.root.xpath('//head')
|
||||
if head:
|
||||
self.head = head[0]
|
||||
body = self.root.xpath('//body')
|
||||
if body:
|
||||
self.body = body[0]
|
||||
self.detected_chapters = self.opts.chapter(self.root)
|
||||
|
||||
def rewrite_links(self, olink):
|
||||
'''
|
||||
Make all links in document relative so that they work in the EPUB container.
|
||||
Also copies any resources (like images, stylesheets, scripts, etc.) into
|
||||
the local tree.
|
||||
'''
|
||||
if not isinstance(olink, unicode):
|
||||
olink = olink.decode(self.htmlfile.encoding)
|
||||
link = self.htmlfile.resolve(olink)
|
||||
if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
|
||||
return olink
|
||||
if link.path in self.htmlfiles:
|
||||
return os.path.basename(link.path)
|
||||
if link.path in self.resource_map.keys():
|
||||
return self.resource_map[link.path]
|
||||
name = os.path.basename(link.path)
|
||||
name, ext = os.path.splitext(name)
|
||||
name += ('_%d'%len(self.resource_map)) + ext
|
||||
shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
|
||||
name = 'resources/' + name
|
||||
self.resource_map[link.path] = name
|
||||
return name
|
||||
|
||||
|
||||
def extract_css(self):
|
||||
'''
|
||||
Remove all CSS information from the document and store in self.raw_css.
|
||||
This includes <font> tags.
|
||||
'''
|
||||
css = []
|
||||
for link in self.root.xpath('//link'):
|
||||
if 'css' in link.get('type', 'text/css').lower():
|
||||
file = self.htmlfile.resolve(link.get('href', ''))
|
||||
if os.path.exists(file) and os.path.isfile(file):
|
||||
css.append(open(file, 'rb').read().decode('utf-8'))
|
||||
link.getparent().remove(link)
|
||||
|
||||
for style in self.root.xpath('//style'):
|
||||
if 'css' in style.get('type', 'text/css').lower():
|
||||
css.append('\n'.join(get_text(style)))
|
||||
style.getparent().remove(style)
|
||||
|
||||
font_id = 1
|
||||
for font in self.root.xpath('//font'):
|
||||
try:
|
||||
size = int(font.attrib.pop('size', '3'))
|
||||
except:
|
||||
size = 3
|
||||
setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
|
||||
face = font.attrib.pop('face', None)
|
||||
if face is not None:
|
||||
setting += 'font-face:%s;'%face
|
||||
color = font.attrib.pop('color', None)
|
||||
if color is not None:
|
||||
setting += 'color:%s'%color
|
||||
id = 'calibre_font_id_%d'%font_id
|
||||
font.set('id', 'calibre_font_id_%d'%font_id)
|
||||
font_id += 1
|
||||
css.append('#%s { %s }'%(id, setting))
|
||||
|
||||
|
||||
css_counter = 1
|
||||
for elem in self.root.xpath('//*[@style]'):
|
||||
if 'id' not in elem.keys():
|
||||
elem.set('id', 'calibre_css_id_%d'%css_counter)
|
||||
css_counter += 1
|
||||
css.append('#%s {%s}'%(elem.get('id'), elem.get('style')))
|
||||
elem.attrib.pop('style')
|
||||
chapter_counter = 1
|
||||
for chapter in self.detected_chapters:
|
||||
if chapter.tag.lower() == 'a':
|
||||
if 'name' in chapter.keys():
|
||||
chapter.attrib['id'] = id = chapter.get('name')
|
||||
elif 'id' in chapter.keys():
|
||||
id = chapter.get('id')
|
||||
else:
|
||||
id = 'calibre_detected_chapter_%d'%chapter_counter
|
||||
chapter_counter += 1
|
||||
chapter.set('id', id)
|
||||
else:
|
||||
if 'id' not in chapter.keys():
|
||||
id = 'calibre_detected_chapter_%d'%chapter_counter
|
||||
chapter_counter += 1
|
||||
chapter.set('id', id)
|
||||
css.append('#%s {%s}'%(id, 'page-break-before:always'))
|
||||
|
||||
self.raw_css = '\n\n'.join(css)
|
||||
# TODO: Figure out what to do about CSS imports from linked stylesheets
|
||||
|
||||
def collect_font_statistics(self):
|
||||
'''
|
||||
Collect font statistics to figure out the base font size used in this
|
||||
@ -191,8 +45,8 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
|
||||
pass
|
||||
|
||||
|
||||
def config():
|
||||
c = common_config()
|
||||
def config(defaults=None):
|
||||
c = common_config(defaults=defaults)
|
||||
return c
|
||||
|
||||
def option_parser():
|
||||
@ -203,11 +57,6 @@ def option_parser():
|
||||
Convert a HTML file to an EPUB ebook. Follows links in the HTML file.
|
||||
'''))
|
||||
|
||||
def search_for_opf(dir):
|
||||
for f in os.listdir(dir):
|
||||
if f.lower().endswith('.opf'):
|
||||
return OPFReader(open(os.path.join(dir, f), 'rb'), dir)
|
||||
|
||||
def parse_content(filelist, opts):
|
||||
tdir = PersistentTemporaryDirectory('_html2epub')
|
||||
os.makedirs(os.path.join(tdir, 'content', 'resources'))
|
||||
@ -221,39 +70,17 @@ def convert(htmlfile, opts, notification=None):
|
||||
if opts.output is None:
|
||||
opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub'
|
||||
opts.output = os.path.abspath(opts.output)
|
||||
opf = search_for_opf(os.path.dirname(htmlfile))
|
||||
if opf:
|
||||
mi = MetaInformation(opf)
|
||||
else:
|
||||
mi = get_metadata(open(htmlfile, 'rb'), 'html')
|
||||
if opts.title:
|
||||
mi.title = opts.title
|
||||
if opts.authors != _('Unknown'):
|
||||
opts.authors = opts.authors.split(',')
|
||||
opts.authors = [a.strip() for a in opts.authors]
|
||||
mi.authors = opts.authors
|
||||
|
||||
if not mi.title:
|
||||
mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
|
||||
if not mi.authors:
|
||||
mi.authors = [_('Unknown')]
|
||||
|
||||
opf, filelist = get_filelist(htmlfile, opts)
|
||||
mi = merge_metadata(htmlfile, opf, opts)
|
||||
opts.chapter = XPath(opts.chapter,
|
||||
namespaces={'re':'http://exslt.org/regular-expressions'})
|
||||
|
||||
filelist = None
|
||||
print 'Building file list...'
|
||||
if opf is not None:
|
||||
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
|
||||
if not filelist:
|
||||
filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\
|
||||
[0 if opts.breadth_first else 1]
|
||||
if opts.verbose:
|
||||
print '\tFound files...'
|
||||
for f in filelist:
|
||||
print '\t\t', f
|
||||
|
||||
parse_content(filelist, opts)
|
||||
resource_map = parse_content(filelist, opts)
|
||||
resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
|
||||
if opf.cover and os.access(opf.cover, os.R_OK):
|
||||
shutil.copyfile(opf.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)))
|
||||
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
|
||||
shutil.copyfile(opf.cover, cpath)
|
||||
resources.append(cpath)
|
||||
|
||||
def main(args=sys.argv):
|
||||
parser = option_parser()
|
||||
@ -267,4 +94,3 @@ def main(args=sys.argv):
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
|
||||
|
@ -1,8 +1,220 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
import sys, re, os, shutil, logging, tempfile
|
||||
from urlparse import urlparse
|
||||
from urllib import unquote
|
||||
|
||||
from lxml import html
|
||||
from lxml.etree import XPath
|
||||
get_text = XPath("//text()")
|
||||
|
||||
from calibre import LoggingInterface, unicode_path
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.config import Config, StringConfig
|
||||
from calibre.ebooks.metadata.opf import OPFReader, OPFCreator
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
|
||||
|
||||
class Link(object):
|
||||
'''
|
||||
Represents a link in a HTML file.
|
||||
'''
|
||||
|
||||
@classmethod
|
||||
def url_to_local_path(cls, url, base):
|
||||
path = url.path
|
||||
if os.path.isabs(path):
|
||||
return path
|
||||
return os.path.abspath(os.path.join(base, path))
|
||||
|
||||
def __init__(self, url, base):
|
||||
'''
|
||||
:param url: The url this link points to. Must be an unquoted unicode string.
|
||||
:param base: The base directory that relative URLs are with respect to.
|
||||
Must be a unicode string.
|
||||
'''
|
||||
assert isinstance(url, unicode) and isinstance(base, unicode)
|
||||
self.url = url
|
||||
self.parsed_url = urlparse(unquote(self.url))
|
||||
self.is_local = self.parsed_url.scheme in ('', 'file')
|
||||
self.is_internal = self.is_local and not bool(self.parsed_url.path)
|
||||
self.path = None
|
||||
self.fragment = self.parsed_url.fragment
|
||||
if self.is_local and not self.is_internal:
|
||||
self.path = self.url_to_local_path(self.parsed_url, base)
|
||||
|
||||
def __hash__(self):
|
||||
if self.path is None:
|
||||
return hash(self.url)
|
||||
return hash(self.path)
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.path == getattr(other, 'path', other)
|
||||
|
||||
def __str__(self):
|
||||
return u'Link: %s --> %s'%(self.url, self.path)
|
||||
|
||||
|
||||
class IgnoreFile(Exception):
|
||||
|
||||
def __init__(self, msg, errno):
|
||||
Exception.__init__(self, msg)
|
||||
self.doesnt_exist = errno == 2
|
||||
self.errno = errno
|
||||
|
||||
class HTMLFile(object):
|
||||
'''
|
||||
Contains basic information about an HTML file. This
|
||||
includes a list of links to other files as well as
|
||||
the encoding of each file. Also tries to detect if the file is not a HTML
|
||||
file in which case :member:`is_binary` is set to True.
|
||||
|
||||
The encoding of the file is available as :member:`encoding`.
|
||||
'''
|
||||
|
||||
HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
|
||||
LINK_PAT = re.compile(
|
||||
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
|
||||
re.DOTALL|re.IGNORECASE)
|
||||
|
||||
def __init__(self, path_to_html_file, level, encoding, verbose):
|
||||
'''
|
||||
:param level: The level of this file. Should be 0 for the root file.
|
||||
:param encoding: Use `encoding` to decode HTML.
|
||||
'''
|
||||
self.path = unicode_path(path_to_html_file, abs=True)
|
||||
self.base = os.path.dirname(self.path)
|
||||
self.level = level
|
||||
self.links = []
|
||||
|
||||
try:
|
||||
with open(self.path, 'rb') as f:
|
||||
src = f.read()
|
||||
except IOError, err:
|
||||
msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
|
||||
if level == 0:
|
||||
raise IOError(msg)
|
||||
raise IgnoreFile(msg, err.errno)
|
||||
|
||||
self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
|
||||
|
||||
if not self.is_binary:
|
||||
if encoding is None:
|
||||
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
|
||||
self.encoding = encoding
|
||||
|
||||
src = src.decode(encoding, 'replace')
|
||||
self.find_links(src)
|
||||
|
||||
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.path == getattr(other, 'path', other)
|
||||
|
||||
def __str__(self):
|
||||
return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
|
||||
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
|
||||
|
||||
def find_links(self, src):
|
||||
for match in self.LINK_PAT.finditer(src):
|
||||
url = None
|
||||
for i in ('url1', 'url2', 'url3'):
|
||||
url = match.group(i)
|
||||
if url:
|
||||
break
|
||||
link = self.resolve(url)
|
||||
if link not in self.links:
|
||||
self.links.append(link)
|
||||
|
||||
def resolve(self, url):
|
||||
return Link(url, self.base)
|
||||
|
||||
|
||||
def depth_first(root, flat, visited=set([])):
|
||||
yield root
|
||||
visited.add(root)
|
||||
for link in root.links:
|
||||
if link.path is not None and link not in visited:
|
||||
try:
|
||||
index = flat.index(link)
|
||||
except ValueError: # Can happen if max_levels is used
|
||||
continue
|
||||
hf = flat[index]
|
||||
if hf not in visited:
|
||||
yield hf
|
||||
visited.add(hf)
|
||||
for hf in depth_first(hf, flat, visited):
|
||||
if hf not in visited:
|
||||
yield hf
|
||||
visited.add(hf)
|
||||
|
||||
|
||||
def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
|
||||
'''
|
||||
Recursively traverse all links in the HTML file.
|
||||
|
||||
:param max_levels: Maximum levels of recursion. Must be non-negative. 0
|
||||
implies that no links in the root HTML file are followed.
|
||||
:param encoding: Specify character encoding of HTML files. If `None` it is
|
||||
auto-detected.
|
||||
:return: A pair of lists (breadth_first, depth_first). Each list contains
|
||||
:class:`HTMLFile` objects.
|
||||
'''
|
||||
assert max_levels >= 0
|
||||
level = 0
|
||||
flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
|
||||
next_level = list(flat)
|
||||
while level < max_levels and len(next_level) > 0:
|
||||
level += 1
|
||||
nl = []
|
||||
for hf in next_level:
|
||||
rejects = []
|
||||
for link in hf.links:
|
||||
if link.path is None or link.path in flat:
|
||||
continue
|
||||
try:
|
||||
nf = HTMLFile(link.path, level, encoding, verbose)
|
||||
nl.append(nf)
|
||||
flat.append(nf)
|
||||
except IgnoreFile, err:
|
||||
rejects.append(link)
|
||||
if not err.doesnt_exist or verbose > 1:
|
||||
print str(err)
|
||||
for link in rejects:
|
||||
hf.links.remove(link)
|
||||
|
||||
next_level = list(nl)
|
||||
|
||||
return flat, list(depth_first(flat[0], flat))
|
||||
|
||||
|
||||
def opf_traverse(opf_reader, verbose=0, encoding=None):
|
||||
'''
|
||||
Return a list of :class:`HTMLFile` objects in the order specified by the
|
||||
`<spine>` element of the OPF.
|
||||
|
||||
:param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
|
||||
:param encoding: Specify character encoding of HTML files. If `None` it is
|
||||
auto-detected.
|
||||
'''
|
||||
if not opf_reader.spine:
|
||||
raise ValueError('OPF does not have a spine')
|
||||
flat = []
|
||||
for path in opf_reader.spine.items():
|
||||
if path not in flat:
|
||||
flat.append(os.path.abspath(path))
|
||||
flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
|
||||
return flat
|
||||
|
||||
|
||||
|
||||
class PreProcessor(object):
|
||||
@ -72,3 +284,287 @@ class PreProcessor(object):
|
||||
|
||||
return html
|
||||
|
||||
class Parser(PreProcessor):
|
||||
|
||||
ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
|
||||
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]
|
||||
|
||||
|
||||
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
|
||||
LoggingInterface.__init__(self, logging.getLogger(name))
|
||||
self.setup_cli_handler(opts.verbose)
|
||||
self.htmlfile = htmlfile
|
||||
self.opts = opts
|
||||
self.tdir = tdir
|
||||
self.resource_map = resource_map
|
||||
self.htmlfiles = htmlfiles
|
||||
self.resource_dir = os.path.join(tdir, 'resources')
|
||||
|
||||
self.parse_html()
|
||||
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
|
||||
|
||||
def parse_html(self):
|
||||
''' Create lxml ElementTree from HTML '''
|
||||
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
|
||||
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
|
||||
src = self.preprocess(src)
|
||||
# lxml chokes on unicode input when it contains encoding declarations
|
||||
for pat in self.ENCODING_PATS:
|
||||
src = pat.sub('', src)
|
||||
try:
|
||||
self.root = html.document_fromstring(src)
|
||||
except:
|
||||
if self.opts.verbose:
|
||||
self.log_exception('lxml based parsing failed')
|
||||
self.root = html.soupparser.fromstring()
|
||||
self.head = self.body = None
|
||||
head = self.root.xpath('//head')
|
||||
if head:
|
||||
self.head = head[0]
|
||||
body = self.root.xpath('//body')
|
||||
if body:
|
||||
self.body = body[0]
|
||||
|
||||
def debug_tree(self, name):
|
||||
'''
|
||||
Dump source tree for later debugging.
|
||||
'''
|
||||
tdir = tempfile.gettempdir()
|
||||
if not os.path.exists(tdir):
|
||||
os.makedirs(tdir)
|
||||
with open(os.path.join(tdir, '%s-%s-%s.html'%\
|
||||
(self.name, os.path.basename(self.htmlfile.path), name)), 'wb') as f:
|
||||
f.write(html.tostring(self.root, encoding='utf-8'))
|
||||
self.log_debug(_('Written processed HTML to ')+f.name)
|
||||
|
||||
|
||||
def rewrite_links(self, olink):
|
||||
'''
|
||||
Make all links in document relative so that they work in the EPUB container.
|
||||
Also copies any resources (like images, stylesheets, scripts, etc.) into
|
||||
the local tree.
|
||||
'''
|
||||
if not isinstance(olink, unicode):
|
||||
olink = olink.decode(self.htmlfile.encoding)
|
||||
link = self.htmlfile.resolve(olink)
|
||||
if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
|
||||
return olink
|
||||
if link.path in self.htmlfiles:
|
||||
return os.path.basename(link.path)
|
||||
if link.path in self.resource_map.keys():
|
||||
return self.resource_map[link.path]
|
||||
name = os.path.basename(link.path)
|
||||
name, ext = os.path.splitext(name)
|
||||
name += ('_%d'%len(self.resource_map)) + ext
|
||||
shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
|
||||
name = 'resources/' + name
|
||||
self.resource_map[link.path] = name
|
||||
return name
|
||||
|
||||
def extract_css(self):
|
||||
'''
|
||||
Remove all CSS information from the document and store in self.raw_css.
|
||||
This includes <font> tags.
|
||||
'''
|
||||
css = []
|
||||
for link in self.root.xpath('//link'):
|
||||
if 'css' in link.get('type', 'text/css').lower():
|
||||
file = self.htmlfile.resolve(link.get('href', ''))
|
||||
if os.path.exists(file) and os.path.isfile(file):
|
||||
css.append(open(file, 'rb').read().decode('utf-8'))
|
||||
link.getparent().remove(link)
|
||||
|
||||
for style in self.root.xpath('//style'):
|
||||
if 'css' in style.get('type', 'text/css').lower():
|
||||
css.append('\n'.join(get_text(style)))
|
||||
style.getparent().remove(style)
|
||||
|
||||
font_id = 1
|
||||
for font in self.root.xpath('//font'):
|
||||
try:
|
||||
size = int(font.attrib.pop('size', '3'))
|
||||
except:
|
||||
size = 3
|
||||
setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
|
||||
face = font.attrib.pop('face', None)
|
||||
if face is not None:
|
||||
setting += 'font-face:%s;'%face
|
||||
color = font.attrib.pop('color', None)
|
||||
if color is not None:
|
||||
setting += 'color:%s'%color
|
||||
id = 'calibre_font_id_%d'%font_id
|
||||
font.set('id', 'calibre_font_id_%d'%font_id)
|
||||
font_id += 1
|
||||
css.append('#%s { %s }'%(id, setting))
|
||||
|
||||
|
||||
css_counter = 1
|
||||
for elem in self.root.xpath('//*[@style]'):
|
||||
if 'id' not in elem.keys():
|
||||
elem.set('id', 'calibre_css_id_%d'%css_counter)
|
||||
css_counter += 1
|
||||
css.append('#%s {%s}'%(elem.get('id'), elem.get('style')))
|
||||
elem.attrib.pop('style')
|
||||
chapter_counter = 1
|
||||
for chapter in self.detected_chapters:
|
||||
if chapter.tag.lower() == 'a':
|
||||
if 'name' in chapter.keys():
|
||||
chapter.attrib['id'] = id = chapter.get('name')
|
||||
elif 'id' in chapter.keys():
|
||||
id = chapter.get('id')
|
||||
else:
|
||||
id = 'calibre_detected_chapter_%d'%chapter_counter
|
||||
chapter_counter += 1
|
||||
chapter.set('id', id)
|
||||
else:
|
||||
if 'id' not in chapter.keys():
|
||||
id = 'calibre_detected_chapter_%d'%chapter_counter
|
||||
chapter_counter += 1
|
||||
chapter.set('id', id)
|
||||
css.append('#%s {%s}'%(id, 'page-break-before:always'))
|
||||
|
||||
self.raw_css = '\n\n'.join(css)
|
||||
# TODO: Figure out what to do about CSS imports from linked stylesheets
|
||||
|
||||
def config(defaults=None):
|
||||
desc = _('Options to control the traversal of HTML')
|
||||
if defaults is None:
|
||||
c = Config('html', desc)
|
||||
else:
|
||||
c = StringConfig(defaults, desc)
|
||||
|
||||
c.add_opt('output', ['-o', '--output'], default=None,
|
||||
help=_('The output directory. Default is the current directory.'))
|
||||
c.add_opt('encoding', ['--encoding'], default=None,
|
||||
help=_('Character encoding for HTML files. Default is to auto detect.'))
|
||||
|
||||
traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
|
||||
traversal('breadth_first', ['--breadth-first'], default=False,
|
||||
help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first'))
|
||||
traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
|
||||
help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))
|
||||
|
||||
metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
|
||||
metadata('title', ['-t', '--title'], default=None,
|
||||
help=_('Set the title. Default is to autodetect.'))
|
||||
metadata('authors', ['-a', '--authors'], default=_('Unknown'),
|
||||
help=_('The author(s) of the ebook, as a comma separated list.'))
|
||||
|
||||
debug = c.add_group('debug', _('Options useful for debugging'))
|
||||
debug('verbose', ['-v', '--verbose'], default=0, action='count',
|
||||
help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
|
||||
|
||||
return c
|
||||
|
||||
def option_parser():
|
||||
c = config()
|
||||
return c.option_parser(usage=_('''\
|
||||
%prog [options] file.html
|
||||
|
||||
Follow all links in an HTML file and collect them into the specified directory.
|
||||
Also collects any references resources like images, stylesheets, scripts, etc.
|
||||
'''))
|
||||
|
||||
def safe_option_parser():
|
||||
return option_parser(safe=True)
|
||||
|
||||
def search_for_opf(dir):
|
||||
for f in os.listdir(dir):
|
||||
if f.lower().endswith('.opf'):
|
||||
return OPFReader(open(os.path.join(dir, f), 'rb'), dir)
|
||||
|
||||
|
||||
def get_filelist(htmlfile, opts):
|
||||
print 'Building file list...'
|
||||
|
||||
opf = search_for_opf(os.path.dirname(htmlfile))
|
||||
if opf is not None:
|
||||
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
|
||||
if not filelist:
|
||||
filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\
|
||||
[0 if opts.breadth_first else 1]
|
||||
if opts.verbose:
|
||||
print '\tFound files...'
|
||||
for f in filelist:
|
||||
print '\t\t', f
|
||||
|
||||
return opf, filelist
|
||||
|
||||
def parse_content(filelist, opts):
|
||||
if not opts.output:
|
||||
opts.output = '.'
|
||||
opts.output = os.path.abspath(opts.output)
|
||||
rdir = os.path.join(opts.output, 'content', 'resources')
|
||||
if not os.path.exists(rdir):
|
||||
os.makedirs(rdir)
|
||||
resource_map = {}
|
||||
for htmlfile in filelist:
|
||||
Parser(htmlfile, opts, os.path.join(opts.output, 'content'),
|
||||
resource_map, filelist)
|
||||
return resource_map
|
||||
|
||||
def merge_metadata(htmlfile, opf, opts):
|
||||
if opf:
|
||||
mi = MetaInformation(opf)
|
||||
else:
|
||||
mi = get_metadata(open(htmlfile, 'rb'), 'html')
|
||||
if opts.title:
|
||||
mi.title = opts.title
|
||||
if opts.authors != _('Unknown'):
|
||||
opts.authors = opts.authors.split(',')
|
||||
opts.authors = [a.strip() for a in opts.authors]
|
||||
mi.authors = opts.authors
|
||||
|
||||
if not mi.title:
|
||||
mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
|
||||
if not mi.authors:
|
||||
mi.authors = [_('Unknown')]
|
||||
|
||||
def create_metadata(basepath, mi, filelist, resources):
|
||||
mi = OPFCreator(basepath, mi)
|
||||
entries = [(f.path, None) for f in filelist] + [(f, None) for f in resources]
|
||||
mi.create_manifest(entries)
|
||||
mi.create_spine([f.path for f in filelist])
|
||||
return mi
|
||||
|
||||
def create_dir(htmlfile, opts):
|
||||
opf, filelist = get_filelist(htmlfile, opts)
|
||||
mi = merge_metadata(htmlfile, opf, opts)
|
||||
resources = [os.path.join(opts.output, 'content', f) for f in parse_content(filelist, opts).values()]
|
||||
if opf.cover and os.access(opf.cover, os.R_OK):
|
||||
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
|
||||
shutil.copyfile(opf.cover, cpath)
|
||||
resources.append(cpath)
|
||||
mi = create_metadata(opts.output, mi, filelist, resources)
|
||||
with open(os.path.join(opts.output, 'metadata.opf'), 'wb') as f:
|
||||
mi.render(f)
|
||||
print 'Open ebook created in', opts.output
|
||||
|
||||
def create_oebzip(htmlfile, opts):
|
||||
tdir = PersistentTemporaryDirectory('_create_oebzip')
|
||||
if opts.output is None:
|
||||
opts.output = os.path.join(os.path.splitext(htmlfile)[0]+'.oeb.zip')
|
||||
ofile = opts.output
|
||||
opts.output = tdir
|
||||
create_dir(htmlfile, opts)
|
||||
zf = ZipFile(ofile, 'w')
|
||||
zf.add_dir(opts.output)
|
||||
print 'Output saved to', ofile
|
||||
|
||||
def main(args=sys.argv):
|
||||
parser = option_parser()
|
||||
opts, args = parser.parse_args(args)
|
||||
if len(args) < 2:
|
||||
parser.print_help()
|
||||
print _('You must specify an input HTML file')
|
||||
return 1
|
||||
|
||||
htmlfile = args[1]
|
||||
create_dir(htmlfile, opts)
|
||||
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
|
||||
|
||||
|
@ -25,6 +25,7 @@ entry_points = {
|
||||
'epub-meta = calibre.ebooks.metadata.epub:main',
|
||||
'txt2lrf = calibre.ebooks.lrf.txt.convert_from:main',
|
||||
'html2lrf = calibre.ebooks.lrf.html.convert_from:main',
|
||||
'html2oeb = calibre.ebooks.html:main',
|
||||
'html2epub = calibre.ebooks.epub.from_html:main',
|
||||
'markdown-calibre = calibre.ebooks.markdown.markdown:main',
|
||||
'lit2lrf = calibre.ebooks.lrf.lit.convert_from:main',
|
||||
@ -168,6 +169,8 @@ def setup_completion(fatal_errors):
|
||||
from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
|
||||
from calibre.ebooks.metadata.epub import option_parser as epub_meta
|
||||
from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
|
||||
from calibre.ebooks.epub.from_html import option_parser as html2epub
|
||||
from calibre.ebooks.html import option_parser as html2oeb
|
||||
|
||||
f = open_file('/etc/bash_completion.d/libprs500')
|
||||
f.close()
|
||||
@ -203,6 +206,8 @@ def setup_completion(fatal_errors):
|
||||
f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr']))
|
||||
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
|
||||
f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
|
||||
f.write(opts_and_exts('html2epub', html2epub, ['html', 'htm', 'xhtm', 'xhtml']))
|
||||
f.write(opts_and_exts('html2oeb', html2oeb, ['html', 'htm', 'xhtm', 'xhtml']))
|
||||
f.write('''
|
||||
_prs500_ls()
|
||||
{
|
||||
|
@ -169,7 +169,7 @@ class Option(object):
|
||||
self.metavar = metavar
|
||||
|
||||
def __eq__(self, other):
    '''
    Options compare equal by name. `other` may be another Option or a
    bare string naming an option: the getattr fallback returns `other`
    itself when it has no `name` attribute, so a plain string is
    compared directly against self.name.
    '''
    # NOTE(review): the original span contained two consecutive return
    # statements (pre/post-refactor diff residue); the unreachable old
    # one (fallback None) has been dropped in favor of the newer form.
    return self.name == getattr(other, 'name', other)
|
||||
|
||||
class OptionValues(object):
|
||||
|
||||
@ -203,6 +203,19 @@ class OptionSet(object):
|
||||
self.group_list.append(name)
|
||||
return partial(self.add_opt, group=name)
|
||||
|
||||
def update(self, other):
    '''
    Merge the groups and preferences from `other` (another OptionSet)
    into this one. A preference already present is replaced by the
    version from `other` (it is moved to the end of the list).
    '''
    for name in other.groups.keys():
        # Keep group_list in sync with groups: rendering iterates
        # group_list (not groups), so a group copied only into the
        # dict would never have its options rendered.
        if name not in self.groups:
            self.group_list.append(name)
        self.groups[name] = other.groups[name]
    for pref in other.preferences:
        if pref in self.preferences:
            self.preferences.remove(pref)
        self.preferences.append(pref)
|
||||
|
||||
def remove_opt(self, name):
    '''
    Remove the preference named `name` from this set. A no-op when no
    such preference exists.
    '''
    # list.remove matches via the same __eq__ used by the `in` test,
    # so removing by name works even though preferences hold objects.
    try:
        self.preferences.remove(name)
    except ValueError:
        pass
|
||||
|
||||
|
||||
def add_opt(self, name, switches=[], help=None, type=None, choices=None,
|
||||
group=None, default=None, action=None, metavar=None):
|
||||
'''
|
||||
@ -307,19 +320,34 @@ class OptionSet(object):
|
||||
for name in [None] + self.group_list]
|
||||
return src + '\n\n'.join(groups)
|
||||
|
||||
class ConfigInterface(object):
    '''
    Common behavior for configurations: all option management is
    delegated to an internal OptionSet. Subclasses must provide a
    parse() method returning the user defaults.
    '''

    def __init__(self, description):
        self.option_set = OptionSet(description=description)
        # Expose the OptionSet's management methods directly
        self.add_opt = self.option_set.add_opt
        self.add_group = self.option_set.add_group
        self.remove_opt = self.option_set.remove_opt

    def update(self, other):
        # Merge options/groups from another ConfigInterface
        self.option_set.update(other.option_set)

    def option_parser(self, usage='', gui_mode=False):
        # NOTE(review): the pre-refactor `class Config(object)` header and
        # its basename-taking __init__ were interleaved here as merge
        # residue; they are superseded by the Config subclass defined below.
        return self.option_set.option_parser(user_defaults=self.parse(),
                                             usage=usage, gui_mode=gui_mode)
||||
class Config(ConfigInterface):
|
||||
'''
|
||||
A file based configuration.
|
||||
'''
|
||||
|
||||
def __init__(self, basename, description=''):
|
||||
ConfigInterface.__init__(self, description)
|
||||
self.config_file_path = os.path.join(config_dir, basename+'.py')
|
||||
|
||||
|
||||
def parse(self):
|
||||
src = ''
|
||||
if os.path.exists(self.config_file_path):
|
||||
try:
|
||||
with ExclusiveFile(self.config_file_path) as f:
|
||||
src = f.read()
|
||||
@ -352,17 +380,14 @@ class Config(object):
|
||||
except LockError:
|
||||
raise IOError('Could not lock config file: %s'%self.config_file_path)
|
||||
|
||||
class StringConfig(ConfigInterface):
    '''
    A string based configuration
    '''

    def __init__(self, src, description=''):
        # NOTE(review): this span contained both the pre-refactor
        # (object-based, alias-assignment) and post-refactor versions of
        # the class interleaved as merge residue; only the refactored
        # ConfigInterface-based version is kept.
        ConfigInterface.__init__(self, description)
        # Raw configuration text parsed for user defaults
        self.src = src

    def parse(self):
        # Parse the stored configuration string into option values
        return self.option_set.parse_string(self.src)
|
@ -1034,6 +1034,7 @@ class ZipFile:
|
||||
os.makedirs(upperdirs)
|
||||
|
||||
source = self.open(member, pwd=pwd)
|
||||
if not os.path.exists(targetpath): # Could be a previously automatically created directory
|
||||
target = open(targetpath, "wb")
|
||||
shutil.copyfileobj(source, target)
|
||||
source.close()
|
||||
@ -1067,6 +1068,8 @@ class ZipFile:
|
||||
def write(self, filename, arcname=None, compress_type=None):
|
||||
"""Put the bytes from filename into the archive under the name
|
||||
arcname."""
|
||||
if isinstance(filename, unicode):
|
||||
filename = filename.encode('utf-8')
|
||||
if not self.fp:
|
||||
raise RuntimeError(
|
||||
"Attempt to write to ZIP archive that was already closed")
|
||||
@ -1133,15 +1136,17 @@ class ZipFile:
|
||||
self.filelist.append(zinfo)
|
||||
self.NameToInfo[zinfo.filename] = zinfo
|
||||
|
||||
def writestr(self, zinfo_or_arcname, bytes):
|
||||
def writestr(self, zinfo_or_arcname, bytes, permissions=0600):
|
||||
"""Write a file into the archive. The contents is the string
|
||||
'bytes'. 'zinfo_or_arcname' is either a ZipInfo instance or
|
||||
the name of the file in the archive."""
|
||||
if not isinstance(zinfo_or_arcname, ZipInfo):
|
||||
if isinstance(zinfo_or_arcname, unicode):
|
||||
zinfo_or_arcname = zinfo_or_arcname.encode('utf-8')
|
||||
zinfo = ZipInfo(filename=zinfo_or_arcname,
|
||||
date_time=time.localtime(time.time())[:6])
|
||||
zinfo.compress_type = self.compression
|
||||
zinfo.external_attr = 0600 << 16
|
||||
zinfo.external_attr = permissions << 16
|
||||
else:
|
||||
zinfo = zinfo_or_arcname
|
||||
|
||||
@ -1172,6 +1177,23 @@ class ZipFile:
|
||||
self.filelist.append(zinfo)
|
||||
self.NameToInfo[zinfo.filename] = zinfo
|
||||
|
||||
def add_dir(self, path, prefix=''):
    '''
    Recursively add the directory `path` to the archive. If `prefix`
    is given, entries are stored under that path inside the archive.
    '''
    if prefix:
        # Explicit directory entry with 0700 permissions
        self.writestr(prefix+'/', '', 0700)
    cwd = os.path.abspath(os.getcwd())
    try:
        # chdir so listdir/isdir results are naturally relative to `path`
        os.chdir(path)
        # Archive-name prefix, normalized to at most one trailing slash
        fp = (prefix + ('/' if prefix else '')).replace('//', '/')
        for f in os.listdir('.'):
            arcname = fp + f
            if os.path.isdir(f):
                self.add_dir(f, prefix=arcname)
            else:
                self.write(f, arcname)
    finally:
        # Always restore the working directory, even on error
        os.chdir(cwd)
|
||||
|
||||
|
||||
def __del__(self):
    """Call the "close()" method in case the user forgot."""
    # Best-effort cleanup only: __del__ timing is not guaranteed, so
    # callers should still close() explicitly.
    self.close()
|
||||
|
Loading…
x
Reference in New Issue
Block a user