mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
IGN:Full implementation of HTML traversal
This commit is contained in:
parent
2efa1ec708
commit
39afcb27f7
@ -8,6 +8,7 @@ Conversion to EPUB.
|
|||||||
'''
|
'''
|
||||||
import sys
|
import sys
|
||||||
from calibre.utils.config import Config, StringConfig
|
from calibre.utils.config import Config, StringConfig
|
||||||
|
from calibre.ebooks.html import config as common_config
|
||||||
|
|
||||||
def config(defaults=None):
|
def config(defaults=None):
|
||||||
desc = _('Options to control the conversion to EPUB')
|
desc = _('Options to control the conversion to EPUB')
|
||||||
@ -15,23 +16,12 @@ def config(defaults=None):
|
|||||||
c = Config('epub', desc)
|
c = Config('epub', desc)
|
||||||
else:
|
else:
|
||||||
c = StringConfig(defaults, desc)
|
c = StringConfig(defaults, desc)
|
||||||
|
|
||||||
|
c.update(common_config())
|
||||||
|
c.remove_opt('output')
|
||||||
|
|
||||||
c.add_opt('output', ['-o', '--output'], default=None,
|
c.add_opt('output', ['-o', '--output'], default=None,
|
||||||
help=_('The output EPUB file. If not specified, it is derived from the input file name.'))
|
help=_('The output EPUB file. If not specified, it is derived from the input file name.'))
|
||||||
c.add_opt('encoding', ['--encoding'], default=None,
|
|
||||||
help=_('Character encoding for HTML files. Default is to auto detect.'))
|
|
||||||
|
|
||||||
metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
|
|
||||||
metadata('title', ['-t', '--title'], default=None,
|
|
||||||
help=_('Set the title. Default is to autodetect.'))
|
|
||||||
metadata('authors', ['-a', '--authors'], default=_('Unknown'),
|
|
||||||
help=_('The author(s) of the ebook, as a comma separated list.'))
|
|
||||||
|
|
||||||
traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
|
|
||||||
traversal('breadth_first', ['--breadth-first'], default=False,
|
|
||||||
help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first'))
|
|
||||||
traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
|
|
||||||
help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))
|
|
||||||
|
|
||||||
structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
|
structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
|
||||||
structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
|
structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
|
||||||
@ -46,8 +36,5 @@ help on using this feature.
|
|||||||
help=_('Don\'t add detected chapters to the Table of Contents'))
|
help=_('Don\'t add detected chapters to the Table of Contents'))
|
||||||
structure('no_links_in_toc', ['--no-links-in-toc'], default=False,
|
structure('no_links_in_toc', ['--no-links-in-toc'], default=False,
|
||||||
help=_('Don\'t add links in the root HTML file to the Table of Contents'))
|
help=_('Don\'t add links in the root HTML file to the Table of Contents'))
|
||||||
debug = c.add_group('debug', _('Options useful for debugging'))
|
|
||||||
debug('verbose', ['-v', '--verbose'], default=0, action='count',
|
|
||||||
help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
|
|
||||||
|
|
||||||
return c
|
return c
|
@ -2,44 +2,22 @@ from __future__ import with_statement
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
import os, sys, logging, re, shutil, tempfile
|
import os, sys, re, shutil
|
||||||
from lxml import html
|
|
||||||
from lxml.etree import XPath
|
from lxml.etree import XPath
|
||||||
get_text = XPath("//text()")
|
|
||||||
|
|
||||||
from calibre import LoggingInterface
|
from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist
|
||||||
from calibre.ebooks.html import PreProcessor
|
|
||||||
from calibre.ebooks.epub import config as common_config
|
from calibre.ebooks.epub import config as common_config
|
||||||
from calibre.ebooks.epub.traverse import traverse, opf_traverse
|
|
||||||
from calibre.ebooks.metadata import MetaInformation
|
|
||||||
from calibre.ebooks.metadata.meta import get_metadata
|
|
||||||
from calibre.ebooks.metadata.opf import OPFReader
|
|
||||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||||
|
|
||||||
|
|
||||||
class HTMLProcessor(PreProcessor, LoggingInterface):
|
class HTMLProcessor(Parser):
|
||||||
|
|
||||||
ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
|
|
||||||
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]
|
|
||||||
|
|
||||||
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles):
|
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles):
|
||||||
LoggingInterface.__init__(self, logging.getLogger('html2epub'))
|
Parser.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles,
|
||||||
self.setup_cli_handler(opts.verbose)
|
name='html2epub')
|
||||||
|
|
||||||
self.htmlfile = htmlfile
|
|
||||||
self.opts = opts
|
|
||||||
self.tdir = tdir
|
|
||||||
self.resource_map = resource_map
|
|
||||||
self.resource_dir = os.path.join(tdir, 'resources')
|
|
||||||
self.htmlfiles = htmlfiles
|
|
||||||
|
|
||||||
self.parse_html()
|
|
||||||
|
|
||||||
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
|
|
||||||
|
|
||||||
if opts.verbose > 2:
|
if opts.verbose > 2:
|
||||||
self.debug_tree('parsed')
|
self.debug_tree('parsed')
|
||||||
|
self.detected_chapters = self.opts.chapter(self.root)
|
||||||
self.extract_css()
|
self.extract_css()
|
||||||
|
|
||||||
if opts.verbose > 2:
|
if opts.verbose > 2:
|
||||||
@ -49,130 +27,6 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
|
|||||||
|
|
||||||
self.split()
|
self.split()
|
||||||
|
|
||||||
def debug_tree(self, name):
|
|
||||||
'''
|
|
||||||
Dump source tree for later debugging.
|
|
||||||
'''
|
|
||||||
tdir = tempfile.gettempdir()
|
|
||||||
if not os.path.exists(tdir):
|
|
||||||
os.makedirs(tdir)
|
|
||||||
with open(os.path.join(tdir, 'html2epub-%s-%s.html'%\
|
|
||||||
(os.path.basename(self.htmlfile.path), name)), 'wb') as f:
|
|
||||||
f.write(html.tostring(self.root, encoding='utf-8'))
|
|
||||||
self.log_debug(_('Written processed HTML to ')+f.name)
|
|
||||||
|
|
||||||
def parse_html(self):
|
|
||||||
''' Create lxml ElementTree from HTML '''
|
|
||||||
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
|
|
||||||
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
|
|
||||||
src = self.preprocess(src)
|
|
||||||
# lxml chokes on unicode input when it contains encoding declarations
|
|
||||||
for pat in self.ENCODING_PATS:
|
|
||||||
src = pat.sub('', src)
|
|
||||||
try:
|
|
||||||
self.root = html.document_fromstring(src)
|
|
||||||
except:
|
|
||||||
if self.opts.verbose:
|
|
||||||
self.log_exception('lxml based parsing failed')
|
|
||||||
self.root = html.soupparser.fromstring()
|
|
||||||
self.head = self.body = None
|
|
||||||
head = self.root.xpath('//head')
|
|
||||||
if head:
|
|
||||||
self.head = head[0]
|
|
||||||
body = self.root.xpath('//body')
|
|
||||||
if body:
|
|
||||||
self.body = body[0]
|
|
||||||
self.detected_chapters = self.opts.chapter(self.root)
|
|
||||||
|
|
||||||
def rewrite_links(self, olink):
|
|
||||||
'''
|
|
||||||
Make all links in document relative so that they work in the EPUB container.
|
|
||||||
Also copies any resources (like images, stylesheets, scripts, etc.) into
|
|
||||||
the local tree.
|
|
||||||
'''
|
|
||||||
if not isinstance(olink, unicode):
|
|
||||||
olink = olink.decode(self.htmlfile.encoding)
|
|
||||||
link = self.htmlfile.resolve(olink)
|
|
||||||
if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
|
|
||||||
return olink
|
|
||||||
if link.path in self.htmlfiles:
|
|
||||||
return os.path.basename(link.path)
|
|
||||||
if link.path in self.resource_map.keys():
|
|
||||||
return self.resource_map[link.path]
|
|
||||||
name = os.path.basename(link.path)
|
|
||||||
name, ext = os.path.splitext(name)
|
|
||||||
name += ('_%d'%len(self.resource_map)) + ext
|
|
||||||
shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
|
|
||||||
name = 'resources/' + name
|
|
||||||
self.resource_map[link.path] = name
|
|
||||||
return name
|
|
||||||
|
|
||||||
|
|
||||||
def extract_css(self):
|
|
||||||
'''
|
|
||||||
Remove all CSS information from the document and store in self.raw_css.
|
|
||||||
This includes <font> tags.
|
|
||||||
'''
|
|
||||||
css = []
|
|
||||||
for link in self.root.xpath('//link'):
|
|
||||||
if 'css' in link.get('type', 'text/css').lower():
|
|
||||||
file = self.htmlfile.resolve(link.get('href', ''))
|
|
||||||
if os.path.exists(file) and os.path.isfile(file):
|
|
||||||
css.append(open(file, 'rb').read().decode('utf-8'))
|
|
||||||
link.getparent().remove(link)
|
|
||||||
|
|
||||||
for style in self.root.xpath('//style'):
|
|
||||||
if 'css' in style.get('type', 'text/css').lower():
|
|
||||||
css.append('\n'.join(get_text(style)))
|
|
||||||
style.getparent().remove(style)
|
|
||||||
|
|
||||||
font_id = 1
|
|
||||||
for font in self.root.xpath('//font'):
|
|
||||||
try:
|
|
||||||
size = int(font.attrib.pop('size', '3'))
|
|
||||||
except:
|
|
||||||
size = 3
|
|
||||||
setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
|
|
||||||
face = font.attrib.pop('face', None)
|
|
||||||
if face is not None:
|
|
||||||
setting += 'font-face:%s;'%face
|
|
||||||
color = font.attrib.pop('color', None)
|
|
||||||
if color is not None:
|
|
||||||
setting += 'color:%s'%color
|
|
||||||
id = 'calibre_font_id_%d'%font_id
|
|
||||||
font.set('id', 'calibre_font_id_%d'%font_id)
|
|
||||||
font_id += 1
|
|
||||||
css.append('#%s { %s }'%(id, setting))
|
|
||||||
|
|
||||||
|
|
||||||
css_counter = 1
|
|
||||||
for elem in self.root.xpath('//*[@style]'):
|
|
||||||
if 'id' not in elem.keys():
|
|
||||||
elem.set('id', 'calibre_css_id_%d'%css_counter)
|
|
||||||
css_counter += 1
|
|
||||||
css.append('#%s {%s}'%(elem.get('id'), elem.get('style')))
|
|
||||||
elem.attrib.pop('style')
|
|
||||||
chapter_counter = 1
|
|
||||||
for chapter in self.detected_chapters:
|
|
||||||
if chapter.tag.lower() == 'a':
|
|
||||||
if 'name' in chapter.keys():
|
|
||||||
chapter.attrib['id'] = id = chapter.get('name')
|
|
||||||
elif 'id' in chapter.keys():
|
|
||||||
id = chapter.get('id')
|
|
||||||
else:
|
|
||||||
id = 'calibre_detected_chapter_%d'%chapter_counter
|
|
||||||
chapter_counter += 1
|
|
||||||
chapter.set('id', id)
|
|
||||||
else:
|
|
||||||
if 'id' not in chapter.keys():
|
|
||||||
id = 'calibre_detected_chapter_%d'%chapter_counter
|
|
||||||
chapter_counter += 1
|
|
||||||
chapter.set('id', id)
|
|
||||||
css.append('#%s {%s}'%(id, 'page-break-before:always'))
|
|
||||||
|
|
||||||
self.raw_css = '\n\n'.join(css)
|
|
||||||
# TODO: Figure out what to do about CSS imports from linked stylesheets
|
|
||||||
|
|
||||||
def collect_font_statistics(self):
|
def collect_font_statistics(self):
|
||||||
'''
|
'''
|
||||||
Collect font statistics to figure out the base font size used in this
|
Collect font statistics to figure out the base font size used in this
|
||||||
@ -191,8 +45,8 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def config():
|
def config(defaults=None):
|
||||||
c = common_config()
|
c = common_config(defaults=defaults)
|
||||||
return c
|
return c
|
||||||
|
|
||||||
def option_parser():
|
def option_parser():
|
||||||
@ -203,11 +57,6 @@ def option_parser():
|
|||||||
Convert a HTML file to an EPUB ebook. Follows links in the HTML file.
|
Convert a HTML file to an EPUB ebook. Follows links in the HTML file.
|
||||||
'''))
|
'''))
|
||||||
|
|
||||||
def search_for_opf(dir):
|
|
||||||
for f in os.listdir(dir):
|
|
||||||
if f.lower().endswith('.opf'):
|
|
||||||
return OPFReader(open(os.path.join(dir, f), 'rb'), dir)
|
|
||||||
|
|
||||||
def parse_content(filelist, opts):
|
def parse_content(filelist, opts):
|
||||||
tdir = PersistentTemporaryDirectory('_html2epub')
|
tdir = PersistentTemporaryDirectory('_html2epub')
|
||||||
os.makedirs(os.path.join(tdir, 'content', 'resources'))
|
os.makedirs(os.path.join(tdir, 'content', 'resources'))
|
||||||
@ -221,39 +70,17 @@ def convert(htmlfile, opts, notification=None):
|
|||||||
if opts.output is None:
|
if opts.output is None:
|
||||||
opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub'
|
opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub'
|
||||||
opts.output = os.path.abspath(opts.output)
|
opts.output = os.path.abspath(opts.output)
|
||||||
opf = search_for_opf(os.path.dirname(htmlfile))
|
opf, filelist = get_filelist(htmlfile, opts)
|
||||||
if opf:
|
mi = merge_metadata(htmlfile, opf, opts)
|
||||||
mi = MetaInformation(opf)
|
|
||||||
else:
|
|
||||||
mi = get_metadata(open(htmlfile, 'rb'), 'html')
|
|
||||||
if opts.title:
|
|
||||||
mi.title = opts.title
|
|
||||||
if opts.authors != _('Unknown'):
|
|
||||||
opts.authors = opts.authors.split(',')
|
|
||||||
opts.authors = [a.strip() for a in opts.authors]
|
|
||||||
mi.authors = opts.authors
|
|
||||||
|
|
||||||
if not mi.title:
|
|
||||||
mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
|
|
||||||
if not mi.authors:
|
|
||||||
mi.authors = [_('Unknown')]
|
|
||||||
|
|
||||||
opts.chapter = XPath(opts.chapter,
|
opts.chapter = XPath(opts.chapter,
|
||||||
namespaces={'re':'http://exslt.org/regular-expressions'})
|
namespaces={'re':'http://exslt.org/regular-expressions'})
|
||||||
|
resource_map = parse_content(filelist, opts)
|
||||||
filelist = None
|
resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
|
||||||
print 'Building file list...'
|
if opf.cover and os.access(opf.cover, os.R_OK):
|
||||||
if opf is not None:
|
shutil.copyfile(opf.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)))
|
||||||
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
|
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
|
||||||
if not filelist:
|
shutil.copyfile(opf.cover, cpath)
|
||||||
filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\
|
resources.append(cpath)
|
||||||
[0 if opts.breadth_first else 1]
|
|
||||||
if opts.verbose:
|
|
||||||
print '\tFound files...'
|
|
||||||
for f in filelist:
|
|
||||||
print '\t\t', f
|
|
||||||
|
|
||||||
parse_content(filelist, opts)
|
|
||||||
|
|
||||||
def main(args=sys.argv):
|
def main(args=sys.argv):
|
||||||
parser = option_parser()
|
parser = option_parser()
|
||||||
@ -266,5 +93,4 @@ def main(args=sys.argv):
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main())
|
sys.exit(main())
|
||||||
|
|
@ -1,8 +1,220 @@
|
|||||||
|
from __future__ import with_statement
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
import sys, re, os, shutil, logging, tempfile
|
||||||
|
from urlparse import urlparse
|
||||||
|
from urllib import unquote
|
||||||
|
|
||||||
|
from lxml import html
|
||||||
|
from lxml.etree import XPath
|
||||||
|
get_text = XPath("//text()")
|
||||||
|
|
||||||
|
from calibre import LoggingInterface, unicode_path
|
||||||
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
from calibre.utils.config import Config, StringConfig
|
||||||
|
from calibre.ebooks.metadata.opf import OPFReader, OPFCreator
|
||||||
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
|
from calibre.ebooks.metadata.meta import get_metadata
|
||||||
|
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||||
|
from calibre.utils.zipfile import ZipFile
|
||||||
|
|
||||||
|
|
||||||
|
class Link(object):
    '''
    Represents a link in a HTML file.

    After construction the following attributes are available:

      * `url`: the URL exactly as passed in.
      * `parsed_url`: result of `urlparse` on the unquoted URL.
      * `is_local`: True if the URL scheme is empty or ``file``.
      * `is_internal`: True for a local URL with an empty path
        (for example a bare ``#fragment``).
      * `path`: absolute filesystem path for local, non-internal links,
        otherwise `None`.
      * `fragment`: the fragment part of the URL.
    '''

    @classmethod
    def url_to_local_path(cls, url, base):
        '''
        Return an absolute filesystem path for a parsed URL.

        :param url: A parsed URL; only its `path` attribute is used.
        :param base: Directory that a relative path is joined to.
        '''
        path = url.path
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(base, path))

    def __init__(self, url, base):
        '''
        :param url: The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
        self.url = url
        self.parsed_url = urlparse(unquote(self.url))
        self.is_local = self.parsed_url.scheme in ('', 'file')
        self.is_internal = self.is_local and not bool(self.parsed_url.path)
        self.path = None
        self.fragment = self.parsed_url.fragment
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(self.parsed_url, base)

    def __hash__(self):
        # Must stay in step with __eq__: links with a path are identified by
        # the path, links without one by their URL.
        if self.path is None:
            return hash(self.url)
        return hash(self.path)

    def __eq__(self, other):
        '''
        Compare against another object that has a `path` attribute, or
        against a plain path string.

        Two links that both lack a path are compared by URL. Comparing only
        the paths would make every external link equal to every other one
        while their hashes (based on the URL) differ.
        '''
        other_path = getattr(other, 'path', other)
        if self.path is None and other_path is None:
            return self.url == getattr(other, 'url', other)
        return self.path == other_path

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly.
        return not self.__eq__(other)

    def __str__(self):
        return u'Link: %s --> %s'%(self.url, self.path)
|
||||||
|
|
||||||
|
|
||||||
|
class IgnoreFile(Exception):
    '''
    Raised when a file cannot be read and should be skipped.

    The originating error number is kept in `errno`; `doesnt_exist` is True
    when that number is 2.
    '''

    def __init__(self, msg, errno):
        Exception.__init__(self, msg)
        self.errno = errno
        # Error number 2 is ENOENT ("no such file or directory").
        self.doesnt_exist = (errno == 2)
|
||||||
|
|
||||||
|
class HTMLFile(object):
    '''
    Contains basic information about an HTML file. This
    includes a list of links to other files as well as
    the encoding of each file. Also tries to detect if the file is not a HTML
    file in which case :member:`is_binary` is set to True.

    The encoding of the file is available as :member:`encoding`. It is `None`
    for files detected as binary.
    '''

    # A file counts as HTML if an opening <html tag occurs in its first 1024 bytes.
    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
    # Matches <a ... href=...> with a double quoted (url1), single quoted
    # (url2) or unquoted (url3) value.
    LINK_PAT = re.compile(
    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
    re.DOTALL|re.IGNORECASE)

    def __init__(self, path_to_html_file, level, encoding, verbose):
        '''
        :param path_to_html_file: Path of the file to examine.
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML. If `None` it is
                         detected from the first 4096 bytes of the file.
        :param verbose: Passed on to the encoding detection.

        Raises `IOError` if the file cannot be read and `level` is 0, and
        :class:`IgnoreFile` if it cannot be read at any other level.
        '''
        self.path  = unicode_path(path_to_html_file, abs=True)
        self.base  = os.path.dirname(self.path)
        self.level = level
        self.links = []
        # Always define the attribute so that binary files can be inspected
        # without an AttributeError.
        self.encoding = None

        try:
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError:
            # sys.exc_info() instead of "except IOError, err" keeps this
            # parseable by every Python version.
            err = sys.exc_info()[1]
            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))

        if not self.is_binary:
            if encoding is None:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
            self.encoding = encoding

            src = src.decode(encoding, 'replace')
            self.find_links(src)

    def __eq__(self, other):
        '''Equal to anything with the same `path`, or to the path itself.'''
        return self.path == getattr(other, 'path', other)

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly.
        return not self.__eq__(other)

    def __hash__(self):
        # Keep hashing consistent with __eq__ so that set membership tests
        # agree with equality.
        return hash(self.path)

    def __str__(self):
        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)

    def __repr__(self):
        return str(self)

    def find_links(self, src):
        '''
        Append a link object to :member:`links` for every distinct anchor
        target found in `src` (the decoded text of the file).
        '''
        for match in self.LINK_PAT.finditer(src):
            # Exactly one of the three alternatives matched.
            url = match.group('url1') or match.group('url2') or match.group('url3')
            link = self.resolve(url)
            if link not in self.links:
                self.links.append(link)

    def resolve(self, url):
        '''Return a :class:`Link` for `url`, relative to this file's directory.'''
        return Link(url, self.base)
|
||||||
|
|
||||||
|
|
||||||
|
def depth_first(root, flat, visited=None):
    '''
    Generate the files reachable from `root` in depth first order.

    :param root: The file to start from. It is always yielded first.
    :param flat: List of all known file objects. A link is followed only if
                 it compares equal to an entry of this list.
    :param visited: Set of files that have already been yielded. Used by the
                    recursion; callers normally leave it as `None`, which
                    starts every traversal with a fresh, empty set.
    '''
    # A set() literal as default value would be created once and shared by
    # every call, so a second traversal would skip everything seen by the first.
    if visited is None:
        visited = set()
    yield root
    visited.add(root)
    for link in root.links:
        if link.path is None or link in visited:
            continue
        try:
            hf = flat[flat.index(link)]
        except ValueError: # Can happen if max_levels is used
            continue
        if hf in visited:
            continue
        # The recursive call yields hf itself first, then its descendants.
        for node in depth_first(hf, flat, visited):
            yield node
|
||||||
|
|
||||||
|
|
||||||
|
def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.

    :param path_to_html_file: Path of the root HTML file.
    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are followed.
    :param verbose: Files that are skipped because they do not exist are
                    reported only if this is greater than 1; other read
                    failures are always reported.
    :param encoding: Specify character encoding of HTML files. If `None` it is
                     auto-detected.
    :return: A pair of lists (breadth_first, depth_first). Each list contains
             :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    level = 0
    flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
    # Paths of everything in `flat`, for constant time "already collected" tests.
    seen = set([flat[0].path])
    next_level = list(flat)
    while level < max_levels and next_level:
        level += 1
        nl = []
        for hf in next_level:
            rejects = []
            for link in hf.links:
                if link.path is None or link.path in seen:
                    continue
                try:
                    nf = HTMLFile(link.path, level, encoding, verbose)
                except IgnoreFile:
                    err = sys.exc_info()[1]
                    rejects.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print(str(err))
                    continue
                nl.append(nf)
                flat.append(nf)
                seen.add(nf.path)
            # Unreadable targets are dropped from the linking file.
            for link in rejects:
                hf.links.remove(link)

        next_level = nl

    # Pass an explicit, fresh visited set so that repeated calls can never
    # share traversal state.
    return flat, list(depth_first(flat[0], flat, visited=set()))
|
||||||
|
|
||||||
|
|
||||||
|
def opf_traverse(opf_reader, verbose=0, encoding=None):
    '''
    Return a list of :class:`HTMLFile` objects in the order specified by the
    `<spine>` element of the OPF.

    :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
    :param verbose: Passed on to every :class:`HTMLFile`.
    :param encoding: Specify character encoding of HTML files. If `None` it is
                     auto-detected.

    Raises `ValueError` if the OPF has no spine.
    '''
    if not opf_reader.spine:
        raise ValueError('OPF does not have a spine')
    flat = []
    for path in opf_reader.spine.items():
        # Normalise first: the list holds absolute paths, so testing the raw
        # spine entry against it would never detect a duplicate.
        path = os.path.abspath(path)
        if path not in flat:
            flat.append(path)
    return [HTMLFile(path, 0, encoding, verbose) for path in flat]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class PreProcessor(object):
|
class PreProcessor(object):
|
||||||
@ -70,5 +282,289 @@ class PreProcessor(object):
|
|||||||
for rule in self.PREPROCESS + rules:
|
for rule in self.PREPROCESS + rules:
|
||||||
html = rule[0].sub(rule[1], html)
|
html = rule[0].sub(rule[1], html)
|
||||||
|
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
class Parser(PreProcessor, LoggingInterface):
    '''
    Parse a single HTML file into an lxml tree and make it self contained.

    Construction parses the file (available afterwards as `root`, `head` and
    `body`) and rewrites every link in it via :meth:`rewrite_links`, which
    also copies referenced local resources into `resource_dir`.
    '''

    ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
                     re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]

    def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
        '''
        :param htmlfile: The file to parse; its `path`, `encoding` and
                         `resolve` are used.
        :param opts: Parsed options; `opts.verbose` is read.
        :param tdir: Output directory. Resources go into its `resources`
                     sub-directory.
        :param resource_map: Dictionary mapping the original path of every
                             copied resource to its new relative name. It is
                             updated in place.
        :param htmlfiles: All files that are part of the book. Links to them
                          are rewritten to their base name.
        :param name: Logger name, also used as prefix for debug dumps.
        '''
        LoggingInterface.__init__(self, logging.getLogger(name))
        self.setup_cli_handler(opts.verbose)
        self.name = name # read by debug_tree()
        self.htmlfile = htmlfile
        self.opts = opts
        self.tdir = tdir
        self.resource_map = resource_map
        self.htmlfiles = htmlfiles
        self.resource_dir = os.path.join(tdir, 'resources')

        self.parse_html()
        self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)

    def parse_html(self):
        ''' Create lxml ElementTree from HTML '''
        self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
        with open(self.htmlfile.path, 'rb') as f:
            src = f.read().decode(self.htmlfile.encoding, 'replace')
        src = self.preprocess(src)
        # lxml chokes on unicode input when it contains encoding declarations
        for pat in self.ENCODING_PATS:
            src = pat.sub('', src)
        try:
            self.root = html.document_fromstring(src)
        except Exception:
            if self.opts.verbose:
                self.log_exception('lxml based parsing failed')
            # soupparser is a submodule that has to be imported explicitly,
            # and it needs the markup that lxml failed on.
            from lxml.html import soupparser
            self.root = soupparser.fromstring(src)
        self.head = self.body = None
        head = self.root.xpath('//head')
        if head:
            self.head = head[0]
        body = self.root.xpath('//body')
        if body:
            self.body = body[0]

    def debug_tree(self, name):
        '''
        Dump source tree for later debugging.

        The tree is written to the system temporary directory as
        ``<parser name>-<html file name>-<name>.html``.
        '''
        tdir = tempfile.gettempdir()
        if not os.path.exists(tdir):
            os.makedirs(tdir)
        dump = os.path.join(tdir, '%s-%s-%s.html'%\
                (self.name, os.path.basename(self.htmlfile.path), name))
        with open(dump, 'wb') as f:
            f.write(html.tostring(self.root, encoding='utf-8'))
            self.log_debug(_('Written processed HTML to ')+f.name)

    def rewrite_links(self, olink):
        '''
        Make all links in document relative so that they work in the EPUB container.
        Also copies any resources (like images, stylesheets, scripts, etc.) into
        the local tree.

        :param olink: The link as found in the document.
        :return: The replacement link. Links that do not point to an existing
                 local file are returned unchanged.
        '''
        if not isinstance(olink, unicode):
            olink = olink.decode(self.htmlfile.encoding)
        link = self.htmlfile.resolve(olink)
        if not link.path or not os.path.isfile(link.path):
            return olink
        if link.path in self.htmlfiles:
            return os.path.basename(link.path)
        if link.path in self.resource_map:
            return self.resource_map[link.path]
        name, ext = os.path.splitext(os.path.basename(link.path))
        # The running count keeps equally named files from different
        # directories apart.
        name += ('_%d'%len(self.resource_map)) + ext
        shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
        name = 'resources/' + name
        self.resource_map[link.path] = name
        return name

    def _stylesheet_path(self, href):
        ''' Return the path of the local file `href` refers to, or `None`. '''
        if not href:
            return None
        if not isinstance(href, unicode):
            href = href.decode(self.htmlfile.encoding)
        # rewrite_links() has already run when CSS is extracted, so the href
        # of a copied stylesheet is relative to tdir, not to the HTML file.
        candidates = (self.htmlfile.resolve(href).path,
                      os.path.join(self.tdir, href))
        for path in candidates:
            if path and os.path.isfile(path):
                return path
        return None

    def extract_css(self):
        '''
        Remove all CSS information from the document and store in self.raw_css.
        This includes <font> tags.

        Every element in `self.detected_chapters` (if that attribute has been
        set) is given an id and a ``page-break-before`` rule.
        '''
        css = []
        for link in self.root.xpath('//link'):
            if 'css' in link.get('type', 'text/css').lower():
                path = self._stylesheet_path(link.get('href', ''))
                if path is not None:
                    with open(path, 'rb') as f:
                        css.append(f.read().decode('utf-8'))
                link.getparent().remove(link)

        for style in self.root.xpath('//style'):
            if 'css' in style.get('type', 'text/css').lower():
                css.append('\n'.join(get_text(style)))
                style.getparent().remove(style)

        font_id = 1
        for font in self.root.xpath('//font'):
            try:
                size = int(font.attrib.pop('size', '3'))
            except ValueError:
                size = 3
            setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
            face = font.attrib.pop('face', None)
            if face is not None:
                # The CSS property is font-family; font-face is not a property.
                setting += 'font-family:%s;'%face
            color = font.attrib.pop('color', None)
            if color is not None:
                setting += 'color:%s'%color
            elem_id = 'calibre_font_id_%d'%font_id
            font.set('id', elem_id)
            font_id += 1
            css.append('#%s { %s }'%(elem_id, setting))

        css_counter = 1
        for elem in self.root.xpath('//*[@style]'):
            if 'id' not in elem.keys():
                elem.set('id', 'calibre_css_id_%d'%css_counter)
                css_counter += 1
            css.append('#%s {%s}'%(elem.get('id'), elem.get('style')))
            elem.attrib.pop('style')

        chapter_counter = 1
        # detected_chapters is assigned by subclasses, not by this class.
        for chapter in getattr(self, 'detected_chapters', []):
            if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
                # Anchors are addressed by name, mirror it into the id.
                chapter_id = chapter.get('name')
                chapter.set('id', chapter_id)
            elif 'id' in chapter.keys():
                chapter_id = chapter.get('id')
            else:
                chapter_id = 'calibre_detected_chapter_%d'%chapter_counter
                chapter_counter += 1
                chapter.set('id', chapter_id)
            css.append('#%s {%s}'%(chapter_id, 'page-break-before:always'))

        self.raw_css = '\n\n'.join(css)
        # TODO: Figure out what to do about CSS imports from linked stylesheets
|
||||||
|
|
||||||
|
def config(defaults=None):
    '''
    Build the option set that controls the traversal of HTML files.

    :param defaults: If `None` the options are backed by the `html`
                     configuration, otherwise by a `StringConfig` created
                     from `defaults`.
    :return: The configuration object with the output, encoding, traversal,
             metadata and debug options added.
    '''
    desc = _('Options to control the traversal of HTML')
    c = Config('html', desc) if defaults is None else StringConfig(defaults, desc)

    # Ungrouped options
    c.add_opt('output', ['-o', '--output'], default=None,
              help=_('The output directory. Default is the current directory.'))
    c.add_opt('encoding', ['--encoding'], default=None,
              help=_('Character encoding for HTML files. Default is to auto detect.'))

    # Link traversal
    add_traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
    add_traversal('breadth_first', ['--breadth-first'], default=False,
                  help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first'))
    add_traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
                  help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))

    # Metadata overrides
    add_metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
    add_metadata('title', ['-t', '--title'], default=None,
                 help=_('Set the title. Default is to autodetect.'))
    add_metadata('authors', ['-a', '--authors'], default=_('Unknown'),
                 help=_('The author(s) of the ebook, as a comma separated list.'))

    # Debugging aids
    add_debug = c.add_group('debug', _('Options useful for debugging'))
    add_debug('verbose', ['-v', '--verbose'], default=0, action='count',
              help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))

    return c
|
||||||
|
|
||||||
|
def option_parser():
    '''
    Return the command line parser built from the options in `config()`,
    with the usage text of the HTML traversal tool.
    '''
    conf = config()
    usage = _('''\
%prog [options] file.html

Follow all links in an HTML file and collect them into the specified directory.
Also collects any references resources like images, stylesheets, scripts, etc.
''')
    return conf.option_parser(usage=usage)
|
||||||
|
|
||||||
|
def safe_option_parser():
    # Thin wrapper that asks option_parser() for a "safe" parser.
    # NOTE(review): option_parser() as defined in this module accepts no
    # parameters, so passing safe=True looks like it raises TypeError --
    # confirm the intended signature before relying on this function.
    return option_parser(safe=True)
|
||||||
|
|
||||||
|
def search_for_opf(dir):
    '''
    Scan the directory `dir` for a file whose name ends with ``.opf``
    (compared case insensitively).

    :return: An `OPFReader` for the first such file listed, constructed from
             the file opened in binary mode and `dir`. ``None`` if no such
             file exists.
    '''
    for name in os.listdir(dir):
        if not name.lower().endswith('.opf'):
            continue
        stream = open(os.path.join(dir, name), 'rb')
        return OPFReader(stream, dir)
|
||||||
|
|
||||||
|
|
||||||
|
def get_filelist(htmlfile, opts):
|
||||||
|
print 'Building file list...'
|
||||||
|
|
||||||
|
opf = search_for_opf(os.path.dirname(htmlfile))
|
||||||
|
if opf is not None:
|
||||||
|
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
|
||||||
|
if not filelist:
|
||||||
|
filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\
|
||||||
|
[0 if opts.breadth_first else 1]
|
||||||
|
if opts.verbose:
|
||||||
|
print '\tFound files...'
|
||||||
|
for f in filelist:
|
||||||
|
print '\t\t', f
|
||||||
|
|
||||||
|
return opf, filelist
|
||||||
|
|
||||||
|
def parse_content(filelist, opts):
    '''
    Run `Parser` over every file in `filelist`, writing into the ``content``
    sub-directory of the output directory.

    ``opts.output`` is replaced by its absolute path (``'.'`` is used when it
    is unset) and ``content/resources`` is created below it if missing.

    :return: The resource map dictionary that was handed to every `Parser`.
    '''
    opts.output = os.path.abspath(opts.output if opts.output else '.')

    resources_dir = os.path.join(opts.output, 'content', 'resources')
    if not os.path.exists(resources_dir):
        os.makedirs(resources_dir)

    resource_map = {}
    for htmlfile in filelist:
        content_dir = os.path.join(opts.output, 'content')
        Parser(htmlfile, opts, content_dir, resource_map, filelist)
    return resource_map
|
||||||
|
|
||||||
|
def merge_metadata(htmlfile, opf, opts):
    '''
    Combine metadata from the OPF file (or, failing that, from the HTML file
    itself) with the values given on the command line.

    :param htmlfile: Path to the root HTML file.
    :param opf: OPF object for the book, or a false value if there is none.
    :param opts: Parsed options. ``title`` and ``authors`` are read and
                 ``authors`` is replaced by a list of stripped names when it
                 was specified.
    :return: The merged metadata object. Its title falls back to the base
             name of `htmlfile` and its authors to ``[_('Unknown')]``.
    '''
    if opf:
        mi = MetaInformation(opf)
    else:
        # Close the stream once the metadata has been read
        with open(htmlfile, 'rb') as stream:
            mi = get_metadata(stream, 'html')

    if opts.title:
        mi.title = opts.title
    # An empty/None value means "not specified", just like the default
    if opts.authors and opts.authors != _('Unknown'):
        opts.authors = [a.strip() for a in opts.authors.split(',')]
        mi.authors = opts.authors

    if not mi.title:
        mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
    if not mi.authors:
        mi.authors = [_('Unknown')]

    # The caller uses the result, so the metadata must be returned
    return mi
|
||||||
|
|
||||||
|
def create_metadata(basepath, mi, filelist, resources):
    '''
    Wrap `mi` in an `OPFCreator` rooted at `basepath` and fill in its
    manifest and spine.

    The manifest lists the ``path`` of every entry in `filelist` followed by
    every entry of `resources`, each paired with ``None``. The spine lists
    the ``path`` of every entry in `filelist`.

    :return: The `OPFCreator` object.
    '''
    creator = OPFCreator(basepath, mi)

    manifest = [(item.path, None) for item in filelist]
    manifest.extend([(res, None) for res in resources])
    creator.create_manifest(manifest)

    spine = [item.path for item in filelist]
    creator.create_spine(spine)
    return creator
|
||||||
|
|
||||||
|
def create_dir(htmlfile, opts):
|
||||||
|
opf, filelist = get_filelist(htmlfile, opts)
|
||||||
|
mi = merge_metadata(htmlfile, opf, opts)
|
||||||
|
resources = [os.path.join(opts.output, 'content', f) for f in parse_content(filelist, opts).values()]
|
||||||
|
if opf.cover and os.access(opf.cover, os.R_OK):
|
||||||
|
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
|
||||||
|
shutil.copyfile(opf.cover, cpath)
|
||||||
|
resources.append(cpath)
|
||||||
|
mi = create_metadata(opts.output, mi, filelist, resources)
|
||||||
|
with open(os.path.join(opts.output, 'metadata.opf'), 'wb') as f:
|
||||||
|
mi.render(f)
|
||||||
|
print 'Open ebook created in', opts.output
|
||||||
|
|
||||||
|
def create_oebzip(htmlfile, opts):
|
||||||
|
tdir = PersistentTemporaryDirectory('_create_oebzip')
|
||||||
|
if opts.output is None:
|
||||||
|
opts.output = os.path.join(os.path.splitext(htmlfile)[0]+'.oeb.zip')
|
||||||
|
ofile = opts.output
|
||||||
|
opts.output = tdir
|
||||||
|
create_dir(htmlfile, opts)
|
||||||
|
zf = ZipFile(ofile, 'w')
|
||||||
|
zf.add_dir(opts.output)
|
||||||
|
print 'Output saved to', ofile
|
||||||
|
|
||||||
|
def main(args=sys.argv):
|
||||||
|
parser = option_parser()
|
||||||
|
opts, args = parser.parse_args(args)
|
||||||
|
if len(args) < 2:
|
||||||
|
parser.print_help()
|
||||||
|
print _('You must specify an input HTML file')
|
||||||
|
return 1
|
||||||
|
|
||||||
|
htmlfile = args[1]
|
||||||
|
create_dir(htmlfile, opts)
|
||||||
|
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# When executed as a script, run main() and use its return value as the
# process exit status.
if __name__ == '__main__':
    sys.exit(main())
|
||||||
|
|
||||||
|
|
||||||
|
@ -25,6 +25,7 @@ entry_points = {
|
|||||||
'epub-meta = calibre.ebooks.metadata.epub:main',
|
'epub-meta = calibre.ebooks.metadata.epub:main',
|
||||||
'txt2lrf = calibre.ebooks.lrf.txt.convert_from:main',
|
'txt2lrf = calibre.ebooks.lrf.txt.convert_from:main',
|
||||||
'html2lrf = calibre.ebooks.lrf.html.convert_from:main',
|
'html2lrf = calibre.ebooks.lrf.html.convert_from:main',
|
||||||
|
'html2oeb = calibre.ebooks.html:main',
|
||||||
'html2epub = calibre.ebooks.epub.from_html:main',
|
'html2epub = calibre.ebooks.epub.from_html:main',
|
||||||
'markdown-calibre = calibre.ebooks.markdown.markdown:main',
|
'markdown-calibre = calibre.ebooks.markdown.markdown:main',
|
||||||
'lit2lrf = calibre.ebooks.lrf.lit.convert_from:main',
|
'lit2lrf = calibre.ebooks.lrf.lit.convert_from:main',
|
||||||
@ -168,6 +169,8 @@ def setup_completion(fatal_errors):
|
|||||||
from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
|
from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
|
||||||
from calibre.ebooks.metadata.epub import option_parser as epub_meta
|
from calibre.ebooks.metadata.epub import option_parser as epub_meta
|
||||||
from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
|
from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
|
||||||
|
from calibre.ebooks.epub.from_html import option_parser as html2epub
|
||||||
|
from calibre.ebooks.html import option_parser as html2oeb
|
||||||
|
|
||||||
f = open_file('/etc/bash_completion.d/libprs500')
|
f = open_file('/etc/bash_completion.d/libprs500')
|
||||||
f.close()
|
f.close()
|
||||||
@ -203,6 +206,8 @@ def setup_completion(fatal_errors):
|
|||||||
f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr']))
|
f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr']))
|
||||||
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
|
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
|
||||||
f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
|
f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
|
||||||
|
f.write(opts_and_exts('html2epub', html2epub, ['html', 'htm', 'xhtm', 'xhtml']))
|
||||||
|
f.write(opts_and_exts('html2oeb', html2oeb, ['html', 'htm', 'xhtm', 'xhtml']))
|
||||||
f.write('''
|
f.write('''
|
||||||
_prs500_ls()
|
_prs500_ls()
|
||||||
{
|
{
|
||||||
|
@ -169,7 +169,7 @@ class Option(object):
|
|||||||
self.metavar = metavar
|
self.metavar = metavar
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
return self.name == getattr(other, 'name', None)
|
return self.name == getattr(other, 'name', other)
|
||||||
|
|
||||||
class OptionValues(object):
|
class OptionValues(object):
|
||||||
|
|
||||||
@ -202,6 +202,19 @@ class OptionSet(object):
|
|||||||
self.groups[name] = description
|
self.groups[name] = description
|
||||||
self.group_list.append(name)
|
self.group_list.append(name)
|
||||||
return partial(self.add_opt, group=name)
|
return partial(self.add_opt, group=name)
|
||||||
|
|
||||||
|
def update(self, other):
    '''
    Merge the groups and preferences of `other` into this option set.

    Group descriptions from `other` overwrite those already present under
    the same name. A preference from `other` replaces any equal preference
    already present and is moved to the end of the list.

    :param other: Object providing ``groups`` (mapping of group name to
                  description) and ``preferences`` (list).
    '''
    for name in other.groups.keys():
        self.groups[name] = other.groups[name]
        # Groups are rendered by walking group_list, so a group that is only
        # present in the groups mapping would silently be left out.
        if name not in self.group_list:
            self.group_list.append(name)
    for pref in other.preferences:
        if pref in self.preferences:
            self.preferences.remove(pref)
        self.preferences.append(pref)
|
||||||
|
|
||||||
|
def remove_opt(self, name):
    '''
    Remove the first preference that compares equal to `name`. Does nothing
    if there is no such preference.
    '''
    prefs = self.preferences
    if name not in prefs:
        return
    prefs.remove(name)
|
||||||
|
|
||||||
|
|
||||||
def add_opt(self, name, switches=[], help=None, type=None, choices=None,
|
def add_opt(self, name, switches=[], help=None, type=None, choices=None,
|
||||||
group=None, default=None, action=None, metavar=None):
|
group=None, default=None, action=None, metavar=None):
|
||||||
@ -306,25 +319,40 @@ class OptionSet(object):
|
|||||||
groups = [self.render_group(name, self.groups.get(name, ''), opts) \
|
groups = [self.render_group(name, self.groups.get(name, ''), opts) \
|
||||||
for name in [None] + self.group_list]
|
for name in [None] + self.group_list]
|
||||||
return src + '\n\n'.join(groups)
|
return src + '\n\n'.join(groups)
|
||||||
|
|
||||||
|
class ConfigInterface(object):
|
||||||
|
|
||||||
class Config(object):
|
def __init__(self, description):
|
||||||
|
|
||||||
def __init__(self, basename, description=''):
|
|
||||||
self.config_file_path = os.path.join(config_dir, basename+'.py')
|
|
||||||
self.option_set = OptionSet(description=description)
|
self.option_set = OptionSet(description=description)
|
||||||
self.add_opt = self.option_set.add_opt
|
self.add_opt = self.option_set.add_opt
|
||||||
self.add_group = self.option_set.add_group
|
self.add_group = self.option_set.add_group
|
||||||
|
self.remove_opt = self.option_set.remove_opt
|
||||||
|
|
||||||
|
def update(self, other):
|
||||||
|
self.option_set.update(other.option_set)
|
||||||
|
|
||||||
def option_parser(self, usage='', gui_mode=False):
|
def option_parser(self, usage='', gui_mode=False):
|
||||||
return self.option_set.option_parser(user_defaults=self.parse(),
|
return self.option_set.option_parser(user_defaults=self.parse(),
|
||||||
usage=usage, gui_mode=gui_mode)
|
usage=usage, gui_mode=gui_mode)
|
||||||
|
|
||||||
|
class Config(ConfigInterface):
|
||||||
|
'''
|
||||||
|
A file based configuration.
|
||||||
|
'''
|
||||||
|
|
||||||
|
def __init__(self, basename, description=''):
|
||||||
|
ConfigInterface.__init__(self, description)
|
||||||
|
self.config_file_path = os.path.join(config_dir, basename+'.py')
|
||||||
|
|
||||||
|
|
||||||
def parse(self):
|
def parse(self):
|
||||||
try:
|
src = ''
|
||||||
with ExclusiveFile(self.config_file_path) as f:
|
if os.path.exists(self.config_file_path):
|
||||||
src = f.read()
|
try:
|
||||||
except LockError:
|
with ExclusiveFile(self.config_file_path) as f:
|
||||||
raise IOError('Could not lock config file: %s'%self.config_file_path)
|
src = f.read()
|
||||||
|
except LockError:
|
||||||
|
raise IOError('Could not lock config file: %s'%self.config_file_path)
|
||||||
return self.option_set.parse_string(src)
|
return self.option_set.parse_string(src)
|
||||||
|
|
||||||
def as_string(self):
|
def as_string(self):
|
||||||
@ -352,18 +380,15 @@ class Config(object):
|
|||||||
except LockError:
|
except LockError:
|
||||||
raise IOError('Could not lock config file: %s'%self.config_file_path)
|
raise IOError('Could not lock config file: %s'%self.config_file_path)
|
||||||
|
|
||||||
class StringConfig(object):
|
class StringConfig(ConfigInterface):
|
||||||
|
'''
|
||||||
|
A string based configuration
|
||||||
|
'''
|
||||||
|
|
||||||
def __init__(self, src, description=''):
|
def __init__(self, src, description=''):
|
||||||
|
ConfigInterface.__init__(self, description)
|
||||||
self.src = src
|
self.src = src
|
||||||
self.option_set = OptionSet(description=description)
|
|
||||||
self.add_opt = self.option_set.add_opt
|
|
||||||
self.option_parser = self.option_set.option_parser
|
|
||||||
|
|
||||||
def option_parser(self, usage='', gui_mode=False):
|
|
||||||
return self.option_set.option_parser(user_defaults=self.parse(),
|
|
||||||
usage=usage, gui_mode=gui_mode)
|
|
||||||
|
|
||||||
def parse(self):
|
def parse(self):
|
||||||
return self.option_set.parse_string(self.src)
|
return self.option_set.parse_string(self.src)
|
||||||
|
|
||||||
|
@ -1034,10 +1034,11 @@ class ZipFile:
|
|||||||
os.makedirs(upperdirs)
|
os.makedirs(upperdirs)
|
||||||
|
|
||||||
source = self.open(member, pwd=pwd)
|
source = self.open(member, pwd=pwd)
|
||||||
target = open(targetpath, "wb")
|
if not os.path.exists(targetpath): # Could be a previously automatically created directory
|
||||||
shutil.copyfileobj(source, target)
|
target = open(targetpath, "wb")
|
||||||
source.close()
|
shutil.copyfileobj(source, target)
|
||||||
target.close()
|
source.close()
|
||||||
|
target.close()
|
||||||
|
|
||||||
return targetpath
|
return targetpath
|
||||||
|
|
||||||
@ -1067,6 +1068,8 @@ class ZipFile:
|
|||||||
def write(self, filename, arcname=None, compress_type=None):
|
def write(self, filename, arcname=None, compress_type=None):
|
||||||
"""Put the bytes from filename into the archive under the name
|
"""Put the bytes from filename into the archive under the name
|
||||||
arcname."""
|
arcname."""
|
||||||
|
if isinstance(filename, unicode):
|
||||||
|
filename = filename.encode('utf-8')
|
||||||
if not self.fp:
|
if not self.fp:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"Attempt to write to ZIP archive that was already closed")
|
"Attempt to write to ZIP archive that was already closed")
|
||||||
@ -1133,15 +1136,17 @@ class ZipFile:
|
|||||||
self.filelist.append(zinfo)
|
self.filelist.append(zinfo)
|
||||||
self.NameToInfo[zinfo.filename] = zinfo
|
self.NameToInfo[zinfo.filename] = zinfo
|
||||||
|
|
||||||
def writestr(self, zinfo_or_arcname, bytes):
|
def writestr(self, zinfo_or_arcname, bytes, permissions=0600):
|
||||||
"""Write a file into the archive. The contents is the string
|
"""Write a file into the archive. The contents is the string
|
||||||
'bytes'. 'zinfo_or_arcname' is either a ZipInfo instance or
|
'bytes'. 'zinfo_or_arcname' is either a ZipInfo instance or
|
||||||
the name of the file in the archive."""
|
the name of the file in the archive."""
|
||||||
if not isinstance(zinfo_or_arcname, ZipInfo):
|
if not isinstance(zinfo_or_arcname, ZipInfo):
|
||||||
|
if isinstance(zinfo_or_arcname, unicode):
|
||||||
|
zinfo_or_arcname = zinfo_or_arcname.encode('utf-8')
|
||||||
zinfo = ZipInfo(filename=zinfo_or_arcname,
|
zinfo = ZipInfo(filename=zinfo_or_arcname,
|
||||||
date_time=time.localtime(time.time())[:6])
|
date_time=time.localtime(time.time())[:6])
|
||||||
zinfo.compress_type = self.compression
|
zinfo.compress_type = self.compression
|
||||||
zinfo.external_attr = 0600 << 16
|
zinfo.external_attr = permissions << 16
|
||||||
else:
|
else:
|
||||||
zinfo = zinfo_or_arcname
|
zinfo = zinfo_or_arcname
|
||||||
|
|
||||||
@ -1171,6 +1176,23 @@ class ZipFile:
|
|||||||
zinfo.file_size))
|
zinfo.file_size))
|
||||||
self.filelist.append(zinfo)
|
self.filelist.append(zinfo)
|
||||||
self.NameToInfo[zinfo.filename] = zinfo
|
self.NameToInfo[zinfo.filename] = zinfo
|
||||||
|
|
||||||
|
def add_dir(self, path, prefix=''):
    '''
    Recursively add the contents of the directory `path` to the archive.

    :param path: Directory whose entries are added. For nested directories
                 this is a name relative to the parent that is currently the
                 working directory.
    :param prefix: Archive name prefix for the entries. When non-empty, an
                   empty entry named ``prefix + '/'`` is written first with
                   permissions 0700.

    The process working directory is changed to `path` while its entries are
    added and is restored afterwards, even if an error occurs.
    '''
    if prefix:
        self.writestr(prefix+'/', '', 0700)
    # Remember where we were so the finally clause can go back
    cwd = os.path.abspath(os.getcwd())
    try:
        os.chdir(path)
        # Prefix for archive names: 'prefix/' or '' (doubled slashes collapsed)
        fp = (prefix + ('/' if prefix else '')).replace('//', '/')
        for f in os.listdir('.'):
            arcname = fp + f
            if os.path.isdir(f):
                # f is relative to the directory we just changed into
                self.add_dir(f, prefix=arcname)
            else:
                self.write(f, arcname)
    finally:
        os.chdir(cwd)
|
||||||
|
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
"""Call the "close()" method in case the user forgot."""
|
"""Call the "close()" method in case the user forgot."""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user