diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py
index f13d6e64f5..ced1d268af 100644
--- a/src/calibre/ebooks/epub/__init__.py
+++ b/src/calibre/ebooks/epub/__init__.py
@@ -8,6 +8,7 @@ Conversion to EPUB.
 '''
 import sys
 from calibre.utils.config import Config, StringConfig
+from calibre.ebooks.html import config as common_config
 
 def config(defaults=None):
     desc = _('Options to control the conversion to EPUB')
@@ -15,23 +16,12 @@ def config(defaults=None):
         c = Config('epub', desc)
     else:
         c = StringConfig(defaults, desc)
-
+
+    c.update(common_config())
+    c.remove_opt('output')
+    c.add_opt('output', ['-o', '--output'], default=None,
              help=_('The output EPUB file. If not specified, it is derived from the input file name.'))
-    c.add_opt('encoding', ['--encoding'], default=None,
-              help=_('Character encoding for HTML files. Default is to auto detect.'))
-
-    metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
-    metadata('title', ['-t', '--title'], default=None,
-             help=_('Set the title. Default is to autodetect.'))
-    metadata('authors', ['-a', '--authors'], default=_('Unknown'),
-             help=_('The author(s) of the ebook, as a comma separated list.'))
-
-    traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
-    traversal('breadth_first', ['--breadth-first'], default=False,
-              help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first'))
-    traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
-              help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))
 
     structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
     structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
@@ -46,8 +36,5 @@ help on using this feature.
              help=_('Don\'t add detected chapters to the Table of Contents'))
     structure('no_links_in_toc', ['--no-links-in-toc'], default=False,
              help=_('Don\'t add links in the root HTML file to the Table of Contents'))
-    debug = c.add_group('debug', _('Options useful for debugging'))
-    debug('verbose', ['-v', '--verbose'], default=0, action='count',
-          help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
 
     return c
\ No newline at end of file
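A minimal usage sketch, not part of the patch, of the composed EPUB option set that config() now builds by pulling in the shared HTML options through update() and re-registering 'output'; the file names below are invented.

from calibre.ebooks.epub import config as epub_config

c = epub_config()
parser = c.option_parser(usage='%prog [options] file.html')
opts, args = parser.parse_args(['-o', 'book.epub', 'book.html'])
# 'output' now names the EPUB file, while options such as max_levels,
# breadth_first and verbose are inherited from calibre.ebooks.html.config()
print opts.output, opts.max_levels, opts.verbose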
diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py
index 3a3fb445d7..91351423ba 100644
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -2,44 +2,22 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 
-import os, sys, logging, re, shutil, tempfile
-from lxml import html
+import os, sys, re, shutil
 from lxml.etree import XPath
-get_text = XPath("//text()")
 
-from calibre import LoggingInterface
-from calibre.ebooks.html import PreProcessor
+from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist
 from calibre.ebooks.epub import config as common_config
-from calibre.ebooks.epub.traverse import traverse, opf_traverse
-from calibre.ebooks.metadata import MetaInformation
-from calibre.ebooks.metadata.meta import get_metadata
-from calibre.ebooks.metadata.opf import OPFReader
 from calibre.ptempfile import PersistentTemporaryDirectory
 
-class HTMLProcessor(PreProcessor, LoggingInterface):
-
-    ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
-                     re.compile(r'', re.IGNORECASE)]
+class HTMLProcessor(Parser):
 
     def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles):
-        LoggingInterface.__init__(self, logging.getLogger('html2epub'))
-        self.setup_cli_handler(opts.verbose)
-
-        self.htmlfile = htmlfile
-        self.opts = opts
-        self.tdir = tdir
-        self.resource_map = resource_map
-        self.resource_dir = os.path.join(tdir, 'resources')
-        self.htmlfiles = htmlfiles
-
-        self.parse_html()
-
-        self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
-
+        Parser.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles,
+                        name='html2epub')
         if opts.verbose > 2:
             self.debug_tree('parsed')
-
+
         self.detected_chapters = self.opts.chapter(self.root)
         self.extract_css()
         if opts.verbose > 2:
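The hunk above turns HTMLProcessor into a thin subclass of the new Parser from calibre.ebooks.html. A rough sketch of that extension pattern, illustrative only and with an invented subclass name:

from calibre.ebooks.html import Parser

class MyProcessor(Parser):  # hypothetical subclass, not in the patch
    def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles):
        Parser.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles,
                        name='myconverter')
        # format specific post-processing goes here, after the shared parsing
        # and link rewriting performed by Parser.__init__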
@@ -49,130 +27,6 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
         self.split()
 
-    def debug_tree(self, name):
-        '''
-        Dump source tree for later debugging.
-        '''
-        tdir = tempfile.gettempdir()
-        if not os.path.exists(tdir):
-            os.makedirs(tdir)
-        with open(os.path.join(tdir, 'html2epub-%s-%s.html'%\
-                    (os.path.basename(self.htmlfile.path), name)), 'wb') as f:
-            f.write(html.tostring(self.root, encoding='utf-8'))
-            self.log_debug(_('Written processed HTML to ')+f.name)
-
-    def parse_html(self):
-        ''' Create lxml ElementTree from HTML '''
-        self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
-        src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
-        src = self.preprocess(src)
-        # lxml chokes on unicode input when it contains encoding declarations
-        for pat in self.ENCODING_PATS:
-            src = pat.sub('', src)
-        try:
-            self.root = html.document_fromstring(src)
-        except:
-            if self.opts.verbose:
-                self.log_exception('lxml based parsing failed')
-            self.root = html.soupparser.fromstring()
-        self.head = self.body = None
-        head = self.root.xpath('//head')
-        if head:
-            self.head = head[0]
-        body = self.root.xpath('//body')
-        if body:
-            self.body = body[0]
-        self.detected_chapters = self.opts.chapter(self.root)
-
-    def rewrite_links(self, olink):
-        '''
-        Make all links in document relative so that they work in the EPUB container.
-        Also copies any resources (like images, stylesheets, scripts, etc.) into
-        the local tree.
-        '''
-        if not isinstance(olink, unicode):
-            olink = olink.decode(self.htmlfile.encoding)
-        link = self.htmlfile.resolve(olink)
-        if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
-            return olink
-        if link.path in self.htmlfiles:
-            return os.path.basename(link.path)
-        if link.path in self.resource_map.keys():
-            return self.resource_map[link.path]
-        name = os.path.basename(link.path)
-        name, ext = os.path.splitext(name)
-        name += ('_%d'%len(self.resource_map)) + ext
-        shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
-        name = 'resources/' + name
-        self.resource_map[link.path] = name
-        return name
-
-
-    def extract_css(self):
-        '''
-        Remove all CSS information from the document and store in self.raw_css.
-        This includes tags.
-        '''
-        css = []
-        for link in self.root.xpath('//link'):
-            if 'css' in link.get('type', 'text/css').lower():
-                file = self.htmlfile.resolve(link.get('href', ''))
-                if os.path.exists(file) and os.path.isfile(file):
-                    css.append(open(file, 'rb').read().decode('utf-8'))
-                link.getparent().remove(link)
-
-        for style in self.root.xpath('//style'):
-            if 'css' in style.get('type', 'text/css').lower():
-                css.append('\n'.join(get_text(style)))
-            style.getparent().remove(style)
-
-        font_id = 1
-        for font in self.root.xpath('//font'):
-            try:
-                size = int(font.attrib.pop('size', '3'))
-            except:
-                size = 3
-            setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
-            face = font.attrib.pop('face', None)
-            if face is not None:
-                setting += 'font-face:%s;'%face
-            color = font.attrib.pop('color', None)
-            if color is not None:
-                setting += 'color:%s'%color
-            id = 'calibre_font_id_%d'%font_id
-            font.set('id', 'calibre_font_id_%d'%font_id)
-            font_id += 1
-            css.append('#%s { %s }'%(id, setting))
-
-
-        css_counter = 1
-        for elem in self.root.xpath('//*[@style]'):
-            if 'id' not in elem.keys():
-                elem.set('id', 'calibre_css_id_%d'%css_counter)
-                css_counter += 1
-            css.append('#%s {%s}'%(elem.get('id'), elem.get('style')))
-            elem.attrib.pop('style')
-        chapter_counter = 1
-        for chapter in self.detected_chapters:
-            if chapter.tag.lower() == 'a':
-                if 'name' in chapter.keys():
-                    chapter.attrib['id'] = id = chapter.get('name')
-                elif 'id' in chapter.keys():
-                    id = chapter.get('id')
-                else:
-                    id = 'calibre_detected_chapter_%d'%chapter_counter
-                    chapter_counter += 1
-                    chapter.set('id', id)
-            else:
-                if 'id' not in chapter.keys():
-                    id = 'calibre_detected_chapter_%d'%chapter_counter
-                    chapter_counter += 1
-                    chapter.set('id', id)
-            css.append('#%s {%s}'%(id, 'page-break-before:always'))
-
-        self.raw_css = '\n\n'.join(css)
-        # TODO: Figure out what to do about CSS imports from linked stylesheets
-
     def collect_font_statistics(self):
         '''
         Collect font statistics to figure out the base font size used in this
@@ -191,8 +45,8 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
         pass
 
-def config():
-    c = common_config()
+def config(defaults=None):
+    c = common_config(defaults=defaults)
     return c
 
 def option_parser():
@@ -203,11 +57,6 @@ Convert a HTML file to an EPUB ebook.
 
 Follows links in the HTML file.
 '''))
 
-def search_for_opf(dir):
-    for f in os.listdir(dir):
-        if f.lower().endswith('.opf'):
-            return OPFReader(open(os.path.join(dir, f), 'rb'), dir)
-
 def parse_content(filelist, opts):
     tdir = PersistentTemporaryDirectory('_html2epub')
     os.makedirs(os.path.join(tdir, 'content', 'resources'))
@@ -221,39 +70,17 @@ def convert(htmlfile, opts, notification=None):
     if opts.output is None:
         opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub'
     opts.output = os.path.abspath(opts.output)
-    opf = search_for_opf(os.path.dirname(htmlfile))
-    if opf:
-        mi = MetaInformation(opf)
-    else:
-        mi = get_metadata(open(htmlfile, 'rb'), 'html')
-    if opts.title:
-        mi.title = opts.title
-    if opts.authors != _('Unknown'):
-        opts.authors = opts.authors.split(',')
-        opts.authors = [a.strip() for a in opts.authors]
-        mi.authors = opts.authors
-
-    if not mi.title:
-        mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
-    if not mi.authors:
-        mi.authors = [_('Unknown')]
-
+    opf, filelist = get_filelist(htmlfile, opts)
+    mi = merge_metadata(htmlfile, opf, opts)
     opts.chapter = XPath(opts.chapter, namespaces={'re':'http://exslt.org/regular-expressions'})
-
-    filelist = None
-    print 'Building file list...'
-    if opf is not None:
-        filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
-    if not filelist:
-        filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\
-                   [0 if opts.breadth_first else 1]
-    if opts.verbose:
-        print '\tFound files...'
-        for f in filelist:
-            print '\t\t', f
-
-    parse_content(filelist, opts)
+    resource_map = parse_content(filelist, opts)
+    resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
+    if opf is not None and opf.cover and os.access(opf.cover, os.R_OK):
+        # Copy the cover into the resources directory, keeping only its extension
+        cpath = os.path.join(opts.output, 'content', 'resources',
+                             '_cover_' + os.path.splitext(opf.cover)[1])
+        shutil.copyfile(opf.cover, cpath)
+        resources.append(cpath)
 
 def main(args=sys.argv):
     parser = option_parser()
@@ -266,5 +93,4 @@
     return 0
 
 if __name__ == '__main__':
-    sys.exit(main())
-
+    sys.exit(main())
\ No newline at end of file
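For orientation, a sketch (not part of the patch) of how the slimmed-down html2epub flow is driven; the file names are invented and the list passed to parse_args stands in for sys.argv, mirroring main() above.

from calibre.ebooks.epub.from_html import option_parser, convert

parser = option_parser()
opts, args = parser.parse_args(['html2epub', 'index.html', '-o', 'book.epub'])
convert(args[1], opts)   # get_filelist() -> merge_metadata() -> parse_content()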
diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py
index dc2114f14a..9b0345a799 100644
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -1,8 +1,220 @@
+from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 
-import re
+import sys, re, os, shutil, logging, tempfile
+from urlparse import urlparse
+from urllib import unquote
+
+from lxml import html
+from lxml.etree import XPath
+get_text = XPath("//text()")
+
+from calibre import LoggingInterface, unicode_path
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.utils.config import Config, StringConfig
+from calibre.ebooks.metadata.opf import OPFReader, OPFCreator
+from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.metadata.meta import get_metadata
+from calibre.ptempfile import PersistentTemporaryDirectory
+from calibre.utils.zipfile import ZipFile
+
+
+class Link(object):
+    '''
+    Represents a link in a HTML file.
+    '''
+
+    @classmethod
+    def url_to_local_path(cls, url, base):
+        path = url.path
+        if os.path.isabs(path):
+            return path
+        return os.path.abspath(os.path.join(base, path))
+
+    def __init__(self, url, base):
+        '''
+        :param url:  The url this link points to. Must be an unquoted unicode string.
+        :param base: The base directory that relative URLs are with respect to.
+                     Must be a unicode string.
+        '''
+        assert isinstance(url, unicode) and isinstance(base, unicode)
+        self.url         = url
+        self.parsed_url  = urlparse(unquote(self.url))
+        self.is_local    = self.parsed_url.scheme in ('', 'file')
+        self.is_internal = self.is_local and not bool(self.parsed_url.path)
+        self.path        = None
+        self.fragment    = self.parsed_url.fragment
+        if self.is_local and not self.is_internal:
+            self.path = self.url_to_local_path(self.parsed_url, base)
+
+    def __hash__(self):
+        if self.path is None:
+            return hash(self.url)
+        return hash(self.path)
+
+    def __eq__(self, other):
+        return self.path == getattr(other, 'path', other)
+
+    def __str__(self):
+        return u'Link: %s --> %s'%(self.url, self.path)
+
+
+class IgnoreFile(Exception):
+
+    def __init__(self, msg, errno):
+        Exception.__init__(self, msg)
+        self.doesnt_exist = errno == 2
+        self.errno = errno
+
+class HTMLFile(object):
+    '''
+    Contains basic information about an HTML file. This
+    includes a list of links to other files as well as
+    the encoding of the file. Also tries to detect if the file is not an HTML
+    file, in which case :member:`is_binary` is set to True.
+
+    The encoding of the file is available as :member:`encoding`.
+    '''
+
+    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
+    LINK_PAT = re.compile(
+        r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
+        re.DOTALL|re.IGNORECASE)
+
+    def __init__(self, path_to_html_file, level, encoding, verbose):
+        '''
+        :param level:    The level of this file. Should be 0 for the root file.
+        :param encoding: Use `encoding` to decode HTML.
+        '''
+        self.path  = unicode_path(path_to_html_file, abs=True)
+        self.base  = os.path.dirname(self.path)
+        self.level = level
+        self.links = []
+
+        try:
+            with open(self.path, 'rb') as f:
+                src = f.read()
+        except IOError, err:
+            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
+            if level == 0:
+                raise IOError(msg)
+            raise IgnoreFile(msg, err.errno)
+
+        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
+
+        if not self.is_binary:
+            if encoding is None:
+                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
+            self.encoding = encoding
+
+            src = src.decode(encoding, 'replace')
+            self.find_links(src)
+
+
+    def __eq__(self, other):
+        return self.path == getattr(other, 'path', other)
+
+    def __str__(self):
+        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
+
+    def __repr__(self):
+        return str(self)
+
+
+    def find_links(self, src):
+        for match in self.LINK_PAT.finditer(src):
+            url = None
+            for i in ('url1', 'url2', 'url3'):
+                url = match.group(i)
+                if url:
+                    break
+            link = self.resolve(url)
+            if link not in self.links:
+                self.links.append(link)
+
+    def resolve(self, url):
+        return Link(url, self.base)
+
+
+def depth_first(root, flat, visited=set([])):
+    yield root
+    visited.add(root)
+    for link in root.links:
+        if link.path is not None and link not in visited:
+            try:
+                index = flat.index(link)
+            except ValueError: # Can happen if max_levels is used
+                continue
+            hf = flat[index]
+            if hf not in visited:
+                yield hf
+                visited.add(hf)
+                for hf in depth_first(hf, flat, visited):
+                    if hf not in visited:
+                        yield hf
+                        visited.add(hf)
+
+
+def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
+    '''
+    Recursively traverse all links in the HTML file.
+
+    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
+                       implies that no links in the root HTML file are followed.
+    :param encoding:   Specify character encoding of HTML files. If `None` it is
+                       auto-detected.
+    :return:           A pair of lists (breadth_first, depth_first). Each list contains
+                       :class:`HTMLFile` objects.
+    '''
+    assert max_levels >= 0
+    level = 0
+    flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
+    next_level = list(flat)
+    while level < max_levels and len(next_level) > 0:
+        level += 1
+        nl = []
+        for hf in next_level:
+            rejects = []
+            for link in hf.links:
+                if link.path is None or link.path in flat:
+                    continue
+                try:
+                    nf = HTMLFile(link.path, level, encoding, verbose)
+                    nl.append(nf)
+                    flat.append(nf)
+                except IgnoreFile, err:
+                    rejects.append(link)
+                    if not err.doesnt_exist or verbose > 1:
+                        print str(err)
+            for link in rejects:
+                hf.links.remove(link)
+
+        next_level = list(nl)
+
+    return flat, list(depth_first(flat[0], flat))
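# Illustrative sketch, not part of the patch: how traverse() is meant to be
# called. The path is invented; flat is breadth-first, spine is depth-first.
from calibre.ebooks.html import traverse

flat, spine = traverse(u'/tmp/book/index.html', max_levels=2, verbose=1)
for f in spine:
    print f.level, f.path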
+
+
+def opf_traverse(opf_reader, verbose=0, encoding=None):
+    '''
+    Return a list of :class:`HTMLFile` objects in the order specified by the
+    `<spine>` element of the OPF.
+
+    :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
+    :param encoding:   Specify character encoding of HTML files. If `None` it is
+                       auto-detected.
+    '''
+    if not opf_reader.spine:
+        raise ValueError('OPF does not have a spine')
+    flat = []
+    for path in opf_reader.spine.items():
+        if path not in flat:
+            flat.append(os.path.abspath(path))
+    flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
+    return flat
+
 
 class PreProcessor(object):
@@ -70,5 +282,289 @@ class PreProcessor(object):
         for rule in self.PREPROCESS + rules:
             html = rule[0].sub(rule[1], html)
-        return html
+        return html
+
+class Parser(PreProcessor, LoggingInterface):
+
+    ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
+                     re.compile(r'', re.IGNORECASE)]
+
+    def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
+        LoggingInterface.__init__(self, logging.getLogger(name))
+        self.setup_cli_handler(opts.verbose)
+        self.name = name  # used to label the files written by debug_tree()
+        self.htmlfile = htmlfile
+        self.opts = opts
+        self.tdir = tdir
+        self.resource_map = resource_map
+        self.htmlfiles = htmlfiles
+        self.resource_dir = os.path.join(tdir, 'resources')
+
+        self.parse_html()
+        self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
+
+    def parse_html(self):
+        ''' Create lxml ElementTree from HTML '''
+        self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
+        src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
+        src = self.preprocess(src)
+        # lxml chokes on unicode input when it contains encoding declarations
+        for pat in self.ENCODING_PATS:
+            src = pat.sub('', src)
+        try:
+            self.root = html.document_fromstring(src)
+        except:
+            if self.opts.verbose:
+                self.log_exception('lxml based parsing failed')
+            self.root = html.soupparser.fromstring(src)
+        self.head = self.body = None
+        head = self.root.xpath('//head')
+        if head:
+            self.head = head[0]
+        body = self.root.xpath('//body')
+        if body:
+            self.body = body[0]
+
+    def debug_tree(self, name):
+        '''
+        Dump source tree for later debugging.
+        '''
+        tdir = tempfile.gettempdir()
+        if not os.path.exists(tdir):
+            os.makedirs(tdir)
+        with open(os.path.join(tdir, '%s-%s-%s.html'%\
+                    (self.name, os.path.basename(self.htmlfile.path), name)), 'wb') as f:
+            f.write(html.tostring(self.root, encoding='utf-8'))
+            self.log_debug(_('Written processed HTML to ')+f.name)
+
+
+    def rewrite_links(self, olink):
+        '''
+        Make all links in document relative so that they work in the EPUB container.
+        Also copies any resources (like images, stylesheets, scripts, etc.) into
+        the local tree.
+        '''
+        if not isinstance(olink, unicode):
+            olink = olink.decode(self.htmlfile.encoding)
+        link = self.htmlfile.resolve(olink)
+        if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
+            return olink
+        if link.path in self.htmlfiles:
+            return os.path.basename(link.path)
+        if link.path in self.resource_map.keys():
+            return self.resource_map[link.path]
+        name = os.path.basename(link.path)
+        name, ext = os.path.splitext(name)
+        name += ('_%d'%len(self.resource_map)) + ext
+        shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
+        name = 'resources/' + name
+        self.resource_map[link.path] = name
+        return name
+
+    def extract_css(self):
+        '''
+        Remove all CSS information from the document and store in self.raw_css.
+        This includes <link> and <style> tags.
+        '''
+        css = []
+        for link in self.root.xpath('//link'):
+            if 'css' in link.get('type', 'text/css').lower():
+                file = self.htmlfile.resolve(link.get('href', ''))
+                if os.path.exists(file) and os.path.isfile(file):
+                    css.append(open(file, 'rb').read().decode('utf-8'))
+                link.getparent().remove(link)
+
+        for style in self.root.xpath('//style'):
+            if 'css' in style.get('type', 'text/css').lower():
+                css.append('\n'.join(get_text(style)))
+            style.getparent().remove(style)
+
+        font_id = 1
+        for font in self.root.xpath('//font'):
+            try:
+                size = int(font.attrib.pop('size', '3'))
+            except:
+                size = 3
+            setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
+            face = font.attrib.pop('face', None)
+            if face is not None:
+                setting += 'font-face:%s;'%face
+            color = font.attrib.pop('color', None)
+            if color is not None:
+                setting += 'color:%s'%color
+            id = 'calibre_font_id_%d'%font_id
+            font.set('id', 'calibre_font_id_%d'%font_id)
+            font_id += 1
+            css.append('#%s { %s }'%(id, setting))
+
+
+        css_counter = 1
+        for elem in self.root.xpath('//*[@style]'):
+            if 'id' not in elem.keys():
+                elem.set('id', 'calibre_css_id_%d'%css_counter)
+                css_counter += 1
+            css.append('#%s {%s}'%(elem.get('id'), elem.get('style')))
+            elem.attrib.pop('style')
+        chapter_counter = 1
+        for chapter in self.detected_chapters:
+            if chapter.tag.lower() == 'a':
+                if 'name' in chapter.keys():
+                    chapter.attrib['id'] = id = chapter.get('name')
+                elif 'id' in chapter.keys():
+                    id = chapter.get('id')
+                else:
+                    id = 'calibre_detected_chapter_%d'%chapter_counter
+                    chapter_counter += 1
+                    chapter.set('id', id)
+            else:
+                if 'id' not in chapter.keys():
+                    id = 'calibre_detected_chapter_%d'%chapter_counter
+                    chapter_counter += 1
+                    chapter.set('id', id)
+            css.append('#%s {%s}'%(id, 'page-break-before:always'))
+
+        self.raw_css = '\n\n'.join(css)
+        # TODO: Figure out what to do about CSS imports from linked stylesheets
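# Illustrative check, not part of the patch: extract_css() converts legacy
# <font size="N"> attributes (HTML default size 3) into percentage font sizes.
for size in range(1, 8):
    print size, '->', 'font-size: %d%%' % int((float(size)/3) * 100)
# prints 1 -> 33%, 3 -> 100%, 6 -> 200%, and so on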
+
+def config(defaults=None):
+    desc = _('Options to control the traversal of HTML')
+    if defaults is None:
+        c = Config('html', desc)
+    else:
+        c = StringConfig(defaults, desc)
+
+    c.add_opt('output', ['-o', '--output'], default=None,
+              help=_('The output directory. Default is the current directory.'))
+    c.add_opt('encoding', ['--encoding'], default=None,
+              help=_('Character encoding for HTML files. Default is to auto detect.'))
+
+    traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
+    traversal('breadth_first', ['--breadth-first'], default=False,
+              help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first'))
+    traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
+              help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))
+
+    metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
+    metadata('title', ['-t', '--title'], default=None,
+             help=_('Set the title. Default is to autodetect.'))
+    metadata('authors', ['-a', '--authors'], default=_('Unknown'),
+             help=_('The author(s) of the ebook, as a comma separated list.'))
+
+    debug = c.add_group('debug', _('Options useful for debugging'))
+    debug('verbose', ['-v', '--verbose'], default=0, action='count',
+          help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
+
+    return c
+
+def option_parser():
+    c = config()
+    return c.option_parser(usage=_('''\
+%prog [options] file.html
+
+Follow all links in an HTML file and collect them into the specified directory.
+Also collects any referenced resources like images, stylesheets, scripts, etc.
+'''))
+
+def safe_option_parser():
+    return option_parser(safe=True)
+
+def search_for_opf(dir):
+    for f in os.listdir(dir):
+        if f.lower().endswith('.opf'):
+            return OPFReader(open(os.path.join(dir, f), 'rb'), dir)
+
+
+def get_filelist(htmlfile, opts):
+    print 'Building file list...'
+    filelist = None
+    opf = search_for_opf(os.path.dirname(htmlfile))
+    if opf is not None:
+        filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
+    if not filelist:
+        filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\
+                   [0 if opts.breadth_first else 1]
+    if opts.verbose:
+        print '\tFound files...'
+        for f in filelist:
+            print '\t\t', f
+
+    return opf, filelist
+
+def parse_content(filelist, opts):
+    if not opts.output:
+        opts.output = '.'
+    opts.output = os.path.abspath(opts.output)
+    rdir = os.path.join(opts.output, 'content', 'resources')
+    if not os.path.exists(rdir):
+        os.makedirs(rdir)
+    resource_map = {}
+    for htmlfile in filelist:
+        Parser(htmlfile, opts, os.path.join(opts.output, 'content'),
+               resource_map, filelist)
+    return resource_map
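# Illustrative sketch, not part of the patch: how get_filelist() and
# parse_content() fit together for the html2oeb case. Paths are invented;
# the list passed to parse_args stands in for sys.argv.
from calibre.ebooks.html import option_parser, get_filelist, parse_content

opts, args = option_parser().parse_args(['html2oeb', 'index.html', '-o', '/tmp/oeb'])
opf, filelist = get_filelist(args[1], opts)
resource_map = parse_content(filelist, opts)
# resource_map maps original resource paths to their new names under
# content/resources/ in the output directory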
+
+def merge_metadata(htmlfile, opf, opts):
+    if opf:
+        mi = MetaInformation(opf)
+    else:
+        mi = get_metadata(open(htmlfile, 'rb'), 'html')
+    if opts.title:
+        mi.title = opts.title
+    if opts.authors != _('Unknown'):
+        opts.authors = opts.authors.split(',')
+        opts.authors = [a.strip() for a in opts.authors]
+        mi.authors = opts.authors
+
+    if not mi.title:
+        mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
+    if not mi.authors:
+        mi.authors = [_('Unknown')]
+    return mi
+
+def create_metadata(basepath, mi, filelist, resources):
+    mi = OPFCreator(basepath, mi)
+    entries = [(f.path, None) for f in filelist] + [(f, None) for f in resources]
+    mi.create_manifest(entries)
+    mi.create_spine([f.path for f in filelist])
+    return mi
+
+def create_dir(htmlfile, opts):
+    opf, filelist = get_filelist(htmlfile, opts)
+    mi = merge_metadata(htmlfile, opf, opts)
+    resources = [os.path.join(opts.output, 'content', f) for f in parse_content(filelist, opts).values()]
+    if opf is not None and opf.cover and os.access(opf.cover, os.R_OK):
+        cpath = os.path.join(opts.output, 'content', 'resources',
+                             '_cover_' + os.path.splitext(opf.cover)[1])
+        shutil.copyfile(opf.cover, cpath)
+        resources.append(cpath)
+    mi = create_metadata(opts.output, mi, filelist, resources)
+    with open(os.path.join(opts.output, 'metadata.opf'), 'wb') as f:
+        mi.render(f)
+    print 'Open ebook created in', opts.output
+
+def create_oebzip(htmlfile, opts):
+    tdir = PersistentTemporaryDirectory('_create_oebzip')
+    if opts.output is None:
+        opts.output = os.path.join(os.path.splitext(htmlfile)[0]+'.oeb.zip')
+    ofile = opts.output
+    opts.output = tdir
+    create_dir(htmlfile, opts)
+    zf = ZipFile(ofile, 'w')
+    zf.add_dir(opts.output)
+    print 'Output saved to', ofile
+
+def main(args=sys.argv):
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    if len(args) < 2:
+        parser.print_help()
+        print _('You must specify an input HTML file')
+        return 1
+
+    htmlfile = args[1]
+    create_dir(htmlfile, opts)
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
+
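For orientation only, a sketch (not part of the patch) of the html2oeb entry point registered below in linux.py; the paths are invented.

from calibre.ebooks.html import option_parser, create_dir, create_oebzip

opts, args = option_parser().parse_args(['html2oeb', 'index.html', '-o', '/tmp/mybook'])
create_dir(args[1], opts)   # writes content/, content/resources/ and metadata.opf
# create_oebzip() follows the same path but zips the result via ZipFile.add_dir()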
diff --git a/src/calibre/linux.py b/src/calibre/linux.py
index 4b3a0bffaa..e5c883ab9d 100644
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@@ -25,6 +25,7 @@ entry_points = {
                 'epub-meta = calibre.ebooks.metadata.epub:main',
                 'txt2lrf = calibre.ebooks.lrf.txt.convert_from:main',
                 'html2lrf = calibre.ebooks.lrf.html.convert_from:main',
+                'html2oeb = calibre.ebooks.html:main',
                 'html2epub = calibre.ebooks.epub.from_html:main',
                 'markdown-calibre = calibre.ebooks.markdown.markdown:main',
                 'lit2lrf = calibre.ebooks.lrf.lit.convert_from:main',
@@ -168,6 +169,8 @@ def setup_completion(fatal_errors):
         from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
         from calibre.ebooks.metadata.epub import option_parser as epub_meta
         from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
+        from calibre.ebooks.epub.from_html import option_parser as html2epub
+        from calibre.ebooks.html import option_parser as html2oeb
 
         f = open_file('/etc/bash_completion.d/libprs500')
         f.close()
@@ -203,6 +206,8 @@ def setup_completion(fatal_errors):
         f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr']))
         f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
         f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
+        f.write(opts_and_exts('html2epub', html2epub, ['html', 'htm', 'xhtm', 'xhtml']))
+        f.write(opts_and_exts('html2oeb', html2oeb, ['html', 'htm', 'xhtm', 'xhtml']))
         f.write('''
 _prs500_ls()
 {
diff --git a/src/calibre/utils/config.py b/src/calibre/utils/config.py
index 61cc8516ff..ab15cc6ce5 100644
--- a/src/calibre/utils/config.py
+++ b/src/calibre/utils/config.py
@@ -169,7 +169,7 @@ class Option(object):
         self.metavar = metavar
 
     def __eq__(self, other):
-        return self.name == getattr(other, 'name', None)
+        return self.name == getattr(other, 'name', other)
 
 class OptionValues(object):
@@ -202,6 +202,19 @@ class OptionSet(object):
         self.groups[name] = description
         self.group_list.append(name)
         return partial(self.add_opt, group=name)
+
+    def update(self, other):
+        for name in other.groups.keys():
+            self.groups[name] = other.groups[name]
+        for pref in other.preferences:
+            if pref in self.preferences:
+                self.preferences.remove(pref)
+            self.preferences.append(pref)
+
+    def remove_opt(self, name):
+        if name in self.preferences:
+            self.preferences.remove(name)
+
     def add_opt(self, name, switches=[], help=None, type=None, choices=None,
                 group=None, default=None, action=None, metavar=None):
@@ -306,25 +319,40 @@ class OptionSet(object):
         groups = [self.render_group(name, self.groups.get(name, ''), opts) \
                                         for name in [None] + self.group_list]
         return src + '\n\n'.join(groups)
+
+class ConfigInterface(object):
 
-class Config(object):
-
-    def __init__(self, basename, description=''):
-        self.config_file_path = os.path.join(config_dir, basename+'.py')
+    def __init__(self, description):
         self.option_set = OptionSet(description=description)
         self.add_opt = self.option_set.add_opt
         self.add_group = self.option_set.add_group
+        self.remove_opt = self.option_set.remove_opt
+
+    def update(self, other):
+        self.option_set.update(other.option_set)
 
     def option_parser(self, usage='', gui_mode=False):
         return self.option_set.option_parser(user_defaults=self.parse(),
                                              usage=usage, gui_mode=gui_mode)
+
+class Config(ConfigInterface):
+    '''
+    A file based configuration.
+    '''
+
+    def __init__(self, basename, description=''):
+        ConfigInterface.__init__(self, description)
+        self.config_file_path = os.path.join(config_dir, basename+'.py')
+
     def parse(self):
-        try:
-            with ExclusiveFile(self.config_file_path) as f:
-                src = f.read()
-        except LockError:
-            raise IOError('Could not lock config file: %s'%self.config_file_path)
+        src = ''
+        if os.path.exists(self.config_file_path):
+            try:
+                with ExclusiveFile(self.config_file_path) as f:
+                    src = f.read()
+            except LockError:
+                raise IOError('Could not lock config file: %s'%self.config_file_path)
         return self.option_set.parse_string(src)
 
     def as_string(self):
@@ -352,18 +380,15 @@ class Config(object):
         except LockError:
             raise IOError('Could not lock config file: %s'%self.config_file_path)
 
-class StringConfig(object):
+class StringConfig(ConfigInterface):
+    '''
+    A string based configuration
+    '''
 
     def __init__(self, src, description=''):
+        ConfigInterface.__init__(self, description)
         self.src = src
-        self.option_set = OptionSet(description=description)
-        self.add_opt = self.option_set.add_opt
-        self.option_parser = self.option_set.option_parser
-
-    def option_parser(self, usage='', gui_mode=False):
-        return self.option_set.option_parser(user_defaults=self.parse(),
-                                             usage=usage, gui_mode=gui_mode)
-
+
     def parse(self):
         return self.option_set.parse_string(self.src)
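A small illustration, not part of the patch, of the update()/remove_opt() semantics the EPUB config relies on: update() replaces same-named options from the other set (which is why Option.__eq__ now falls back to comparing against the bare name), and remove_opt() lets a caller re-register an option with different help text. The option sets below are invented.

from calibre.utils.config import StringConfig

base = StringConfig('', 'base options')
base.add_opt('output', ['-o', '--output'], default=None, help='output directory')

epub = StringConfig('', 'epub options')
epub.update(base)                  # pulls in 'output' from the base set
epub.remove_opt('output')          # matches because Option == 'output' by name
epub.add_opt('output', ['-o', '--output'], default=None, help='output EPUB file')
print epub.option_set.preferences  # a single 'output' option with the new help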
diff --git a/src/calibre/utils/zipfile.py b/src/calibre/utils/zipfile.py
index ff9eacf158..3deba3a612 100644
--- a/src/calibre/utils/zipfile.py
+++ b/src/calibre/utils/zipfile.py
@@ -1034,10 +1034,11 @@ class ZipFile:
                 os.makedirs(upperdirs)
 
         source = self.open(member, pwd=pwd)
-        target = open(targetpath, "wb")
-        shutil.copyfileobj(source, target)
-        source.close()
-        target.close()
+        if not os.path.exists(targetpath): # Could be a previously automatically created directory
+            target = open(targetpath, "wb")
+            shutil.copyfileobj(source, target)
+            source.close()
+            target.close()
 
         return targetpath
 
@@ -1067,6 +1068,8 @@ class ZipFile:
     def write(self, filename, arcname=None, compress_type=None):
         """Put the bytes from filename into the archive under the name
         arcname."""
+        if isinstance(filename, unicode):
+            filename = filename.encode('utf-8')
         if not self.fp:
             raise RuntimeError(
                   "Attempt to write to ZIP archive that was already closed")
@@ -1133,15 +1136,17 @@ class ZipFile:
         self.filelist.append(zinfo)
         self.NameToInfo[zinfo.filename] = zinfo
 
-    def writestr(self, zinfo_or_arcname, bytes):
+    def writestr(self, zinfo_or_arcname, bytes, permissions=0600):
         """Write a file into the archive.  The contents is the string
         'bytes'.  'zinfo_or_arcname' is either a ZipInfo instance or
         the name of the file in the archive."""
         if not isinstance(zinfo_or_arcname, ZipInfo):
+            if isinstance(zinfo_or_arcname, unicode):
+                zinfo_or_arcname = zinfo_or_arcname.encode('utf-8')
             zinfo = ZipInfo(filename=zinfo_or_arcname,
                             date_time=time.localtime(time.time())[:6])
             zinfo.compress_type = self.compression
-            zinfo.external_attr = 0600 << 16
+            zinfo.external_attr = permissions << 16
         else:
            zinfo = zinfo_or_arcname
@@ -1171,6 +1176,23 @@ class ZipFile:
                                 zinfo.file_size))
         self.filelist.append(zinfo)
         self.NameToInfo[zinfo.filename] = zinfo
+
+    def add_dir(self, path, prefix=''):
+        if prefix:
+            self.writestr(prefix+'/', '', 0700)
+        cwd = os.path.abspath(os.getcwd())
+        try:
+            os.chdir(path)
+            fp = (prefix + ('/' if prefix else '')).replace('//', '/')
+            for f in os.listdir('.'):
+                arcname = fp + f
+                if os.path.isdir(f):
+                    self.add_dir(f, prefix=arcname)
+                else:
+                    self.write(f, arcname)
+        finally:
+            os.chdir(cwd)
+
 
     def __del__(self):
         """Call the "close()" method in case the user forgot."""
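Finally, a brief sketch (not part of the patch) of the new ZipFile.add_dir() that create_oebzip() builds on; the paths are invented.

from calibre.utils.zipfile import ZipFile

zf = ZipFile('/tmp/mybook.oeb.zip', 'w')
zf.add_dir('/tmp/mybook')                  # recursively add the directory contents
zf.add_dir('/tmp/extra', prefix='extra')   # optionally nest entries under extra/
zf.close()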