IGN:Full implementation of HTML traversal

Kovid Goyal 2008-08-25 16:42:07 -07:00
parent 2efa1ec708
commit 39afcb27f7
6 changed files with 597 additions and 236 deletions

View File

@@ -8,6 +8,7 @@ Conversion to EPUB.
'''
import sys
from calibre.utils.config import Config, StringConfig
from calibre.ebooks.html import config as common_config
def config(defaults=None):
desc = _('Options to control the conversion to EPUB')
@@ -16,22 +17,11 @@ def config(defaults=None):
else:
c = StringConfig(defaults, desc)
c.update(common_config())
c.remove_opt('output')
c.add_opt('output', ['-o', '--output'], default=None,
help=_('The output EPUB file. If not specified, it is derived from the input file name.'))
c.add_opt('encoding', ['--encoding'], default=None,
help=_('Character encoding for HTML files. Default is to auto detect.'))
metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
metadata('title', ['-t', '--title'], default=None,
help=_('Set the title. Default is to autodetect.'))
metadata('authors', ['-a', '--authors'], default=_('Unknown'),
help=_('The author(s) of the ebook, as a comma separated list.'))
traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
traversal('breadth_first', ['--breadth-first'], default=False,
help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first.'))
traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))
structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
@@ -46,8 +36,5 @@ help on using this feature.
help=_('Don\'t add detected chapters to the Table of Contents'))
structure('no_links_in_toc', ['--no-links-in-toc'], default=False,
help=_('Don\'t add links in the root HTML file to the Table of Contents'))
debug = c.add_group('debug', _('Options useful for debugging'))
debug('verbose', ['-v', '--verbose'], default=0, action='count',
help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
return c
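
A quick usage sketch (not part of this commit; it assumes the config() above is importable as calibre.ebooks.epub.config): the EPUB config composes the shared HTML options via c.update(common_config()) and only swaps in its own output option, so the traversal and metadata switches come along for free.

from calibre.ebooks.epub import config  # assumed module path
parser = config().option_parser(usage='%prog [options] file.html')
opts, args = parser.parse_args(['--max-levels', '2', '--breadth-first', 'book.html'])
print opts.max_levels, opts.breadth_first  # expected: 2 True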

View File

@@ -2,44 +2,22 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
import os, sys, logging, re, shutil, tempfile
from lxml import html
import os, sys, re, shutil
from lxml.etree import XPath
get_text = XPath("//text()")
from calibre import LoggingInterface
from calibre.ebooks.html import PreProcessor
from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist
from calibre.ebooks.epub import config as common_config
from calibre.ebooks.epub.traverse import traverse, opf_traverse
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf import OPFReader
from calibre.ptempfile import PersistentTemporaryDirectory
class HTMLProcessor(PreProcessor, LoggingInterface):
ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]
class HTMLProcessor(Parser):
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles):
LoggingInterface.__init__(self, logging.getLogger('html2epub'))
self.setup_cli_handler(opts.verbose)
self.htmlfile = htmlfile
self.opts = opts
self.tdir = tdir
self.resource_map = resource_map
self.resource_dir = os.path.join(tdir, 'resources')
self.htmlfiles = htmlfiles
self.parse_html()
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
Parser.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles,
name='html2epub')
if opts.verbose > 2:
self.debug_tree('parsed')
self.detected_chapters = self.opts.chapter(self.root)
self.extract_css()
if opts.verbose > 2:
@@ -49,130 +27,6 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
self.split()
def debug_tree(self, name):
'''
Dump source tree for later debugging.
'''
tdir = tempfile.gettempdir()
if not os.path.exists(tdir):
os.makedirs(tdir)
with open(os.path.join(tdir, 'html2epub-%s-%s.html'%\
(os.path.basename(self.htmlfile.path), name)), 'wb') as f:
f.write(html.tostring(self.root, encoding='utf-8'))
self.log_debug(_('Wrote processed HTML to ')+f.name)
def parse_html(self):
''' Create lxml ElementTree from HTML '''
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
src = self.preprocess(src)
# lxml chokes on unicode input when it contains encoding declarations
for pat in self.ENCODING_PATS:
src = pat.sub('', src)
try:
self.root = html.document_fromstring(src)
except:
if self.opts.verbose:
self.log_exception('lxml based parsing failed')
self.root = html.soupparser.fromstring(src)
self.head = self.body = None
head = self.root.xpath('//head')
if head:
self.head = head[0]
body = self.root.xpath('//body')
if body:
self.body = body[0]
self.detected_chapters = self.opts.chapter(self.root)
def rewrite_links(self, olink):
'''
Make all links in document relative so that they work in the EPUB container.
Also copies any resources (like images, stylesheets, scripts, etc.) into
the local tree.
'''
if not isinstance(olink, unicode):
olink = olink.decode(self.htmlfile.encoding)
link = self.htmlfile.resolve(olink)
if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
return olink
if link.path in self.htmlfiles:
return os.path.basename(link.path)
if link.path in self.resource_map:
return self.resource_map[link.path]
name = os.path.basename(link.path)
name, ext = os.path.splitext(name)
name += ('_%d'%len(self.resource_map)) + ext
shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
name = 'resources/' + name
self.resource_map[link.path] = name
return name
def extract_css(self):
'''
Remove all CSS information from the document and store it in self.raw_css.
This includes <font> tags.
'''
css = []
for link in self.root.xpath('//link'):
if 'css' in link.get('type', 'text/css').lower():
file = self.htmlfile.resolve(link.get('href', '')).path
if file and os.path.exists(file) and os.path.isfile(file):
css.append(open(file, 'rb').read().decode('utf-8'))
link.getparent().remove(link)
for style in self.root.xpath('//style'):
if 'css' in style.get('type', 'text/css').lower():
css.append('\n'.join(get_text(style)))
style.getparent().remove(style)
font_id = 1
for font in self.root.xpath('//font'):
try:
size = int(font.attrib.pop('size', '3'))
except:
size = 3
setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
face = font.attrib.pop('face', None)
if face is not None:
setting += 'font-family:%s;'%face
color = font.attrib.pop('color', None)
if color is not None:
setting += 'color:%s'%color
id = 'calibre_font_id_%d'%font_id
font.set('id', id)
font_id += 1
css.append('#%s { %s }'%(id, setting))
css_counter = 1
for elem in self.root.xpath('//*[@style]'):
if 'id' not in elem.keys():
elem.set('id', 'calibre_css_id_%d'%css_counter)
css_counter += 1
css.append('#%s {%s}'%(elem.get('id'), elem.get('style')))
elem.attrib.pop('style')
chapter_counter = 1
for chapter in self.detected_chapters:
if chapter.tag.lower() == 'a':
if 'name' in chapter.keys():
chapter.attrib['id'] = id = chapter.get('name')
elif 'id' in chapter.keys():
id = chapter.get('id')
else:
id = 'calibre_detected_chapter_%d'%chapter_counter
chapter_counter += 1
chapter.set('id', id)
else:
if 'id' not in chapter.keys():
id = 'calibre_detected_chapter_%d'%chapter_counter
chapter_counter += 1
chapter.set('id', id)
css.append('#%s {%s}'%(id, 'page-break-before:always'))
self.raw_css = '\n\n'.join(css)
# TODO: Figure out what to do about CSS imports from linked stylesheets
def collect_font_statistics(self):
'''
Collect font statistics to figure out the base font size used in this
@@ -191,8 +45,8 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
pass
def config():
c = common_config()
def config(defaults=None):
c = common_config(defaults=defaults)
return c
def option_parser():
@@ -203,11 +57,6 @@ def option_parser():
Convert an HTML file to an EPUB ebook. Follows links in the HTML file.
'''))
def search_for_opf(dir):
for f in os.listdir(dir):
if f.lower().endswith('.opf'):
return OPFReader(open(os.path.join(dir, f), 'rb'), dir)
def parse_content(filelist, opts):
tdir = PersistentTemporaryDirectory('_html2epub')
os.makedirs(os.path.join(tdir, 'content', 'resources'))
@@ -221,39 +70,17 @@ def convert(htmlfile, opts, notification=None):
if opts.output is None:
opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub'
opts.output = os.path.abspath(opts.output)
opf = search_for_opf(os.path.dirname(htmlfile))
if opf:
mi = MetaInformation(opf)
else:
mi = get_metadata(open(htmlfile, 'rb'), 'html')
if opts.title:
mi.title = opts.title
if opts.authors != _('Unknown'):
opts.authors = opts.authors.split(',')
opts.authors = [a.strip() for a in opts.authors]
mi.authors = opts.authors
if not mi.title:
mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
opf, filelist = get_filelist(htmlfile, opts)
mi = merge_metadata(htmlfile, opf, opts)
opts.chapter = XPath(opts.chapter,
namespaces={'re':'http://exslt.org/regular-expressions'})
filelist = None
print 'Building file list...'
if opf is not None:
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
if not filelist:
filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\
[0 if opts.breadth_first else 1]
if opts.verbose:
print '\tFound files...'
for f in filelist:
print '\t\t', f
parse_content(filelist, opts)
resource_map = parse_content(filelist, opts)
resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
if opf is not None and opf.cover and os.access(opf.cover, os.R_OK):
shutil.copyfile(opf.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[1]))
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[1])
shutil.copyfile(opf.cover, cpath)
resources.append(cpath)
def main(args=sys.argv):
parser = option_parser()
@@ -267,4 +94,3 @@ def main(args=sys.argv):
if __name__ == '__main__':
sys.exit(main())

View File

@@ -1,8 +1,220 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
import re
import sys, re, os, shutil, logging, tempfile
from urlparse import urlparse
from urllib import unquote
from lxml import html
from lxml.etree import XPath
get_text = XPath("//text()")
from calibre import LoggingInterface, unicode_path
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.config import Config, StringConfig
from calibre.ebooks.metadata.opf import OPFReader, OPFCreator
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.zipfile import ZipFile
class Link(object):
'''
Represents a link in an HTML file.
'''
@classmethod
def url_to_local_path(cls, url, base):
path = url.path
if os.path.isabs(path):
return path
return os.path.abspath(os.path.join(base, path))
def __init__(self, url, base):
'''
:param url: The URL this link points to. Must be an unquoted unicode string.
:param base: The base directory that relative URLs are resolved against.
Must be a unicode string.
'''
assert isinstance(url, unicode) and isinstance(base, unicode)
self.url = url
self.parsed_url = urlparse(unquote(self.url))
self.is_local = self.parsed_url.scheme in ('', 'file')
self.is_internal = self.is_local and not bool(self.parsed_url.path)
self.path = None
self.fragment = self.parsed_url.fragment
if self.is_local and not self.is_internal:
self.path = self.url_to_local_path(self.parsed_url, base)
def __hash__(self):
if self.path is None:
return hash(self.url)
return hash(self.path)
def __eq__(self, other):
return self.path == getattr(other, 'path', other)
def __str__(self):
return u'Link: %s --> %s'%(self.url, self.path)
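
To make the resolution rules concrete, a small sketch with made-up paths (not part of the commit):

l = Link(u'../images/cover.jpg#top', u'/books/text')
print l.path        # /books/images/cover.jpg -- resolved against base
print l.fragment    # top
print l.is_local, l.is_internal   # True False
print Link(u'#ch2', u'/books/text').is_internal   # True: fragment-only link, path stays None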
class IgnoreFile(Exception):
def __init__(self, msg, errno):
Exception.__init__(self, msg)
self.doesnt_exist = errno == 2
self.errno = errno
class HTMLFile(object):
'''
Contains basic information about an HTML file. This
includes a list of links to other files as well as
the encoding of each file. Also tries to detect if the file is not an HTML
file, in which case :member:`is_binary` is set to True.
The encoding of the file is available as :member:`encoding`.
'''
HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
LINK_PAT = re.compile(
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
re.DOTALL|re.IGNORECASE)
def __init__(self, path_to_html_file, level, encoding, verbose):
'''
:param level: The level of this file. Should be 0 for the root file.
:param encoding: Use `encoding` to decode HTML.
'''
self.path = unicode_path(path_to_html_file, abs=True)
self.base = os.path.dirname(self.path)
self.level = level
self.links = []
try:
with open(self.path, 'rb') as f:
src = f.read()
except IOError, err:
msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
if level == 0:
raise IOError(msg)
raise IgnoreFile(msg, err.errno)
self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
if not self.is_binary:
if encoding is None:
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
self.encoding = encoding
src = src.decode(encoding, 'replace')
self.find_links(src)
def __eq__(self, other):
return self.path == getattr(other, 'path', other)
def __str__(self):
return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
def __repr__(self):
return str(self)
def find_links(self, src):
for match in self.LINK_PAT.finditer(src):
url = None
for i in ('url1', 'url2', 'url3'):
url = match.group(i)
if url:
break
link = self.resolve(url)
if link not in self.links:
self.links.append(link)
def resolve(self, url):
return Link(url, self.base)
def depth_first(root, flat, visited=None):
if visited is None: visited = set()
yield root
visited.add(root)
for link in root.links:
if link.path is not None and link not in visited:
try:
index = flat.index(link)
except ValueError: # Can happen if max_levels is used
continue
hf = flat[index]
if hf not in visited:
yield hf
visited.add(hf)
for hf in depth_first(hf, flat, visited):
if hf not in visited:
yield hf
visited.add(hf)
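
The generator performs a pre-order walk: flat (the discovery list built by traverse below) maps each Link back to its HTMLFile, and the shared visited set ensures every file is yielded once even when several files link to it. A sketch with stand-in objects carrying only the attributes depth_first touches (hypothetical, not real HTMLFile instances):

class _Node(object):
    def __init__(self, path):
        self.path, self.links = path, []
    def __eq__(self, other):
        return self.path == getattr(other, 'path', other)
    def __hash__(self):
        return hash(self.path)

a, b, c = _Node('a.html'), _Node('b.html'), _Node('c.html')
a.links, b.links = [b, c], [c]
print [n.path for n in depth_first(a, [a, b, c])]
# expected: ['a.html', 'b.html', 'c.html'] -- c is reached through b, not revisited from a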
def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
'''
Recursively traverse all links in the HTML file.
:param max_levels: Maximum levels of recursion. Must be non-negative. 0
implies that no links in the root HTML file are followed.
:param encoding: Specify character encoding of HTML files. If `None` it is
auto-detected.
:return: A pair of lists (breadth_first, depth_first). Each list contains
:class:`HTMLFile` objects.
'''
assert max_levels >= 0
level = 0
flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
next_level = list(flat)
while level < max_levels and len(next_level) > 0:
level += 1
nl = []
for hf in next_level:
rejects = []
for link in hf.links:
if link.path is None or link.path in flat:
continue
try:
nf = HTMLFile(link.path, level, encoding, verbose)
nl.append(nf)
flat.append(nf)
except IgnoreFile, err:
rejects.append(link)
if not err.doesnt_exist or verbose > 1:
print str(err)
for link in rejects:
hf.links.remove(link)
next_level = list(nl)
return flat, list(depth_first(flat[0], flat))
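
A hedged usage sketch (file name made up): the first element of the returned pair is the breadth-first discovery order, the second the depth-first reading order; convert() in from_html.py picks between them with opts.breadth_first.

breadth, depth = traverse('index.html', max_levels=2, verbose=1)
for hf in depth:
    print hf.level, hf.path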
def opf_traverse(opf_reader, verbose=0, encoding=None):
'''
Return a list of :class:`HTMLFile` objects in the order specified by the
`<spine>` element of the OPF.
:param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
:param encoding: Specify character encoding of HTML files. If `None` it is
auto-detected.
'''
if not opf_reader.spine:
raise ValueError('OPF does not have a spine')
flat = []
for path in opf_reader.spine.items():
path = os.path.abspath(path)
if path not in flat:
flat.append(path)
flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
return flat
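
Sketch of driving this from an OPF on disk (hypothetical file name), mirroring what search_for_opf and get_filelist below do:

opf = OPFReader(open('book.opf', 'rb'), os.path.dirname(os.path.abspath('book.opf')))
filelist = opf_traverse(opf, verbose=1)   # spine order, every file at level 0
print [hf.path for hf in filelist]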
class PreProcessor(object):
@@ -72,3 +284,287 @@ class PreProcessor(object):
return html
class Parser(PreProcessor):
ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
LoggingInterface.__init__(self, logging.getLogger(name))
self.name = name
self.setup_cli_handler(opts.verbose)
self.htmlfile = htmlfile
self.opts = opts
self.tdir = tdir
self.resource_map = resource_map
self.htmlfiles = htmlfiles
self.resource_dir = os.path.join(tdir, 'resources')
self.parse_html()
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
def parse_html(self):
''' Create lxml ElementTree from HTML '''
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
src = self.preprocess(src)
# lxml chokes on unicode input when it contains encoding declarations
for pat in self.ENCODING_PATS:
src = pat.sub('', src)
try:
self.root = html.document_fromstring(src)
except:
if self.opts.verbose:
self.log_exception('lxml based parsing failed')
self.root = html.soupparser.fromstring(src)
self.head = self.body = None
head = self.root.xpath('//head')
if head:
self.head = head[0]
body = self.root.xpath('//body')
if body:
self.body = body[0]
def debug_tree(self, name):
'''
Dump source tree for later debugging.
'''
tdir = tempfile.gettempdir()
if not os.path.exists(tdir):
os.makedirs(tdir)
with open(os.path.join(tdir, '%s-%s-%s.html'%\
(self.name, os.path.basename(self.htmlfile.path), name)), 'wb') as f:
f.write(html.tostring(self.root, encoding='utf-8'))
self.log_debug(_('Wrote processed HTML to ')+f.name)
def rewrite_links(self, olink):
'''
Make all links in document relative so that they work in the EPUB container.
Also copies any resources (like images, stylesheets, scripts, etc.) into
the local tree.
'''
if not isinstance(olink, unicode):
olink = olink.decode(self.htmlfile.encoding)
link = self.htmlfile.resolve(olink)
if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
return olink
if link.path in self.htmlfiles:
return os.path.basename(link.path)
if link.path in self.resource_map:
return self.resource_map[link.path]
name = os.path.basename(link.path)
name, ext = os.path.splitext(name)
name += ('_%d'%len(self.resource_map)) + ext
shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
name = 'resources/' + name
self.resource_map[link.path] = name
return name
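
The renaming scheme is easy to see in isolation. A hypothetical mimic (not calibre API): the suffix comes from the current size of the shared resource_map, so names stay unique across all HTML files in one conversion, and a resource referenced from several files is copied exactly once.

def _resource_name(path, resource_map):   # mimics the suffixing in rewrite_links
    name, ext = os.path.splitext(os.path.basename(path))
    return 'resources/%s_%d%s'%(name, len(resource_map), ext)

m = {}
m['/books/img/cover.jpg'] = _resource_name('/books/img/cover.jpg', m)
m['/books/img/fig.png'] = _resource_name('/books/img/fig.png', m)
print sorted(m.values())   # ['resources/cover_0.jpg', 'resources/fig_1.png']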
def extract_css(self):
'''
Remove all CSS information from the document and store it in self.raw_css.
This includes <font> tags.
'''
css = []
for link in self.root.xpath('//link'):
if 'css' in link.get('type', 'text/css').lower():
file = self.htmlfile.resolve(link.get('href', '')).path
if file and os.path.exists(file) and os.path.isfile(file):
css.append(open(file, 'rb').read().decode('utf-8'))
link.getparent().remove(link)
for style in self.root.xpath('//style'):
if 'css' in style.get('type', 'text/css').lower():
css.append('\n'.join(get_text(style)))
style.getparent().remove(style)
font_id = 1
for font in self.root.xpath('//font'):
try:
size = int(font.attrib.pop('size', '3'))
except:
size = 3
setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
face = font.attrib.pop('face', None)
if face is not None:
setting += 'font-family:%s;'%face
color = font.attrib.pop('color', None)
if color is not None:
setting += 'color:%s'%color
id = 'calibre_font_id_%d'%font_id
font.set('id', id)
font_id += 1
css.append('#%s { %s }'%(id, setting))
css_counter = 1
for elem in self.root.xpath('//*[@style]'):
if 'id' not in elem.keys():
elem.set('id', 'calibre_css_id_%d'%css_counter)
css_counter += 1
css.append('#%s {%s}'%(elem.get('id'), elem.get('style')))
elem.attrib.pop('style')
chapter_counter = 1
for chapter in self.detected_chapters:
if chapter.tag.lower() == 'a':
if 'name' in chapter.keys():
chapter.attrib['id'] = id = chapter.get('name')
elif 'id' in chapter.keys():
id = chapter.get('id')
else:
id = 'calibre_detected_chapter_%d'%chapter_counter
chapter_counter += 1
chapter.set('id', id)
else:
if 'id' not in chapter.keys():
id = 'calibre_detected_chapter_%d'%chapter_counter
chapter_counter += 1
chapter.set('id', id)
css.append('#%s {%s}'%(id, 'page-break-before:always'))
self.raw_css = '\n\n'.join(css)
# TODO: Figure out what to do about CSS imports from linked stylesheets
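
The percentage mapping used for <font size=N> above treats the HTML default size 3 as 100%. This snippet only evaluates that formula for the legal sizes:

for size in range(1, 8):
    print size, '->', 'font-size: %d%%'%int((float(size)/3) * 100)
# 1 -> 33%, 3 -> 100%, 5 -> 166%, 7 -> 233%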
def config(defaults=None):
desc = _('Options to control the traversal of HTML')
if defaults is None:
c = Config('html', desc)
else:
c = StringConfig(defaults, desc)
c.add_opt('output', ['-o', '--output'], default=None,
help=_('The output directory. Default is the current directory.'))
c.add_opt('encoding', ['--encoding'], default=None,
help=_('Character encoding for HTML files. Default is to auto detect.'))
traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
traversal('breadth_first', ['--breadth-first'], default=False,
help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first.'))
traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))
metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
metadata('title', ['-t', '--title'], default=None,
help=_('Set the title. Default is to autodetect.'))
metadata('authors', ['-a', '--authors'], default=_('Unknown'),
help=_('The author(s) of the ebook, as a comma separated list.'))
debug = c.add_group('debug', _('Options useful for debugging'))
debug('verbose', ['-v', '--verbose'], default=0, action='count',
help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
return c
def option_parser():
c = config()
return c.option_parser(usage=_('''\
%prog [options] file.html
Follow all links in an HTML file and collect them into the specified directory.
Also collects any referenced resources like images, stylesheets, scripts, etc.
'''))
def safe_option_parser():
return option_parser(safe=True)
def search_for_opf(dir):
for f in os.listdir(dir):
if f.lower().endswith('.opf'):
return OPFReader(open(os.path.join(dir, f), 'rb'), dir)
def get_filelist(htmlfile, opts):
print 'Building file list...'
opf = search_for_opf(os.path.dirname(htmlfile))
if opf is not None:
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
if not filelist:
filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\
[0 if opts.breadth_first else 1]
if opts.verbose:
print '\tFound files...'
for f in filelist:
print '\t\t', f
return opf, filelist
def parse_content(filelist, opts):
if not opts.output:
opts.output = '.'
opts.output = os.path.abspath(opts.output)
rdir = os.path.join(opts.output, 'content', 'resources')
if not os.path.exists(rdir):
os.makedirs(rdir)
resource_map = {}
for htmlfile in filelist:
Parser(htmlfile, opts, os.path.join(opts.output, 'content'),
resource_map, filelist)
return resource_map
def merge_metadata(htmlfile, opf, opts):
if opf:
mi = MetaInformation(opf)
else:
mi = get_metadata(open(htmlfile, 'rb'), 'html')
if opts.title:
mi.title = opts.title
if opts.authors != _('Unknown'):
opts.authors = opts.authors.split(',')
opts.authors = [a.strip() for a in opts.authors]
mi.authors = opts.authors
if not mi.title:
mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
return mi
def create_metadata(basepath, mi, filelist, resources):
mi = OPFCreator(basepath, mi)
entries = [(f.path, None) for f in filelist] + [(f, None) for f in resources]
mi.create_manifest(entries)
mi.create_spine([f.path for f in filelist])
return mi
def create_dir(htmlfile, opts):
opf, filelist = get_filelist(htmlfile, opts)
mi = merge_metadata(htmlfile, opf, opts)
resources = [os.path.join(opts.output, 'content', f) for f in parse_content(filelist, opts).values()]
if opf is not None and opf.cover and os.access(opf.cover, os.R_OK):
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[1])
shutil.copyfile(opf.cover, cpath)
resources.append(cpath)
mi = create_metadata(opts.output, mi, filelist, resources)
with open(os.path.join(opts.output, 'metadata.opf'), 'wb') as f:
mi.render(f)
print 'Open ebook created in', opts.output
def create_oebzip(htmlfile, opts):
tdir = PersistentTemporaryDirectory('_create_oebzip')
if opts.output is None:
opts.output = os.path.splitext(htmlfile)[0]+'.oeb.zip'
ofile = opts.output
opts.output = tdir
create_dir(htmlfile, opts)
zf = ZipFile(ofile, 'w')
zf.add_dir(opts.output)
print 'Output saved to', ofile
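
Putting the helpers in this module together, a minimal driver sketch (assumes a local book.html; not part of the commit):

parser = option_parser()
opts, args = parser.parse_args(['-o', 'oeb_out', 'book.html'])
create_dir('book.html', opts)   # writes content/, content/resources/ and metadata.opf
# or, for a single zipped artifact: opts.output = None; create_oebzip('book.html', opts)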
def main(args=sys.argv):
parser = option_parser()
opts, args = parser.parse_args(args)
if len(args) < 2:
parser.print_help()
print _('You must specify an input HTML file')
return 1
htmlfile = args[1]
create_dir(htmlfile, opts)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@@ -25,6 +25,7 @@ entry_points = {
'epub-meta = calibre.ebooks.metadata.epub:main',
'txt2lrf = calibre.ebooks.lrf.txt.convert_from:main',
'html2lrf = calibre.ebooks.lrf.html.convert_from:main',
'html2oeb = calibre.ebooks.html:main',
'html2epub = calibre.ebooks.epub.from_html:main',
'markdown-calibre = calibre.ebooks.markdown.markdown:main',
'lit2lrf = calibre.ebooks.lrf.lit.convert_from:main',
@@ -168,6 +169,8 @@ def setup_completion(fatal_errors):
from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
from calibre.ebooks.metadata.epub import option_parser as epub_meta
from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
from calibre.ebooks.epub.from_html import option_parser as html2epub
from calibre.ebooks.html import option_parser as html2oeb
f = open_file('/etc/bash_completion.d/libprs500')
f.close()
@@ -203,6 +206,8 @@ def setup_completion(fatal_errors):
f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr']))
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
f.write(opts_and_exts('html2epub', html2epub, ['html', 'htm', 'xhtm', 'xhtml']))
f.write(opts_and_exts('html2oeb', html2oeb, ['html', 'htm', 'xhtm', 'xhtml']))
f.write('''
_prs500_ls()
{

View File

@@ -169,7 +169,7 @@ class Option(object):
self.metavar = metavar
def __eq__(self, other):
return self.name == getattr(other, 'name', None)
return self.name == getattr(other, 'name', other)
class OptionValues(object):
@ -203,6 +203,19 @@ class OptionSet(object):
self.group_list.append(name)
return partial(self.add_opt, group=name)
def update(self, other):
for name in other.groups.keys():
self.groups[name] = other.groups[name]
for pref in other.preferences:
if pref in self.preferences:
self.preferences.remove(pref)
self.preferences.append(pref)
def remove_opt(self, name):
if name in self.preferences:
self.preferences.remove(name)
def add_opt(self, name, switches=[], help=None, type=None, choices=None,
group=None, default=None, action=None, metavar=None):
'''
@@ -307,19 +320,34 @@ class OptionSet(object):
for name in [None] + self.group_list]
return src + '\n\n'.join(groups)
class Config(object):
class ConfigInterface(object):
def __init__(self, basename, description=''):
self.config_file_path = os.path.join(config_dir, basename+'.py')
def __init__(self, description):
self.option_set = OptionSet(description=description)
self.add_opt = self.option_set.add_opt
self.add_group = self.option_set.add_group
self.remove_opt = self.option_set.remove_opt
def update(self, other):
self.option_set.update(other.option_set)
def option_parser(self, usage='', gui_mode=False):
return self.option_set.option_parser(user_defaults=self.parse(),
usage=usage, gui_mode=gui_mode)
class Config(ConfigInterface):
'''
A file based configuration.
'''
def __init__(self, basename, description=''):
ConfigInterface.__init__(self, description)
self.config_file_path = os.path.join(config_dir, basename+'.py')
def parse(self):
src = ''
if os.path.exists(self.config_file_path):
try:
with ExclusiveFile(self.config_file_path) as f:
src = f.read()
@@ -352,17 +380,14 @@ class Config(object):
except LockError:
raise IOError('Could not lock config file: %s'%self.config_file_path)
class StringConfig(object):
class StringConfig(ConfigInterface):
'''
A string based configuration
'''
def __init__(self, src, description=''):
ConfigInterface.__init__(self, description)
self.src = src
self.option_set = OptionSet(description=description)
self.add_opt = self.option_set.add_opt
self.option_parser = self.option_set.option_parser
def option_parser(self, usage='', gui_mode=False):
return self.option_set.option_parser(user_defaults=self.parse(),
usage=usage, gui_mode=gui_mode)
def parse(self):
return self.option_set.parse_string(self.src)
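
With ConfigInterface factored out, both flavours expose the same add_opt/add_group/remove_opt/update surface. A small sketch of the string-backed variant (option name made up; assumes parse_string evaluates the Python-syntax source string):

c = StringConfig('answer = 42', 'example')
c.add_opt('answer', ['--answer'], default=0, help='An example option')
opts = c.parse()     # values are parsed out of the source string
print opts.answer    # expected: 42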

View File

@@ -1034,6 +1034,7 @@ class ZipFile:
os.makedirs(upperdirs)
source = self.open(member, pwd=pwd)
if not os.path.exists(targetpath): # Could be a previously automatically created directory
target = open(targetpath, "wb")
shutil.copyfileobj(source, target)
source.close()
@@ -1067,6 +1068,8 @@ class ZipFile:
def write(self, filename, arcname=None, compress_type=None):
"""Put the bytes from filename into the archive under the name
arcname."""
if isinstance(filename, unicode):
filename = filename.encode('utf-8')
if not self.fp:
raise RuntimeError(
"Attempt to write to ZIP archive that was already closed")
@@ -1133,15 +1136,17 @@ class ZipFile:
self.filelist.append(zinfo)
self.NameToInfo[zinfo.filename] = zinfo
def writestr(self, zinfo_or_arcname, bytes):
def writestr(self, zinfo_or_arcname, bytes, permissions=0600):
"""Write a file into the archive. The contents is the string
'bytes'. 'zinfo_or_arcname' is either a ZipInfo instance or
the name of the file in the archive."""
if not isinstance(zinfo_or_arcname, ZipInfo):
if isinstance(zinfo_or_arcname, unicode):
zinfo_or_arcname = zinfo_or_arcname.encode('utf-8')
zinfo = ZipInfo(filename=zinfo_or_arcname,
date_time=time.localtime(time.time())[:6])
zinfo.compress_type = self.compression
zinfo.external_attr = 0600 << 16
zinfo.external_attr = permissions << 16
else:
zinfo = zinfo_or_arcname
@@ -1172,6 +1177,23 @@ class ZipFile:
self.filelist.append(zinfo)
self.NameToInfo[zinfo.filename] = zinfo
def add_dir(self, path, prefix=''):
if prefix:
self.writestr(prefix+'/', '', 0700)
cwd = os.path.abspath(os.getcwd())
try:
os.chdir(path)
fp = (prefix + ('/' if prefix else '')).replace('//', '/')
for f in os.listdir('.'):
arcname = fp + f
if os.path.isdir(f):
self.add_dir(f, prefix=arcname)
else:
self.write(f, arcname)
finally:
os.chdir(cwd)
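
Usage sketch for the new helper (hypothetical paths), matching how create_oebzip in calibre.ebooks.html uses it:

zf = ZipFile('book.oeb.zip', 'w')
zf.add_dir('/tmp/oeb_out')                  # tree stored at the archive root
zf.add_dir('/tmp/extras', prefix='extras')  # stored under extras/, dir entry mode 0700
zf.close()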
def __del__(self):
"""Call the "close()" method in case the user forgot."""
self.close()