From 22859c604e5a170196da53a20d4d4f8527223d6f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Aug 2008 13:54:32 -0700 Subject: [PATCH] Implement #976 (recursive import of HTML into library as OEBPS zip) --- src/calibre/ebooks/html.py | 80 ++++++++++++++++++++++++++---- src/calibre/ebooks/metadata/toc.py | 1 + src/calibre/gui2/main.py | 12 ++++- 3 files changed, 83 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index fc0dd4e472..bf007b6a57 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -1,4 +1,5 @@ from __future__ import with_statement +import cStringIO __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' @@ -17,7 +18,7 @@ from calibre.utils.config import Config, StringConfig from calibre.ebooks.metadata.opf import OPFReader, OPFCreator from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.meta import get_metadata -from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile from calibre.utils.zipfile import ZipFile @@ -319,7 +320,7 @@ class Parser(PreProcessor, LoggingInterface): with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f: f.write(html.tostring(self.root, encoding='utf-8', method='xml', - include_meta_content_type=True, + include_meta_content_type=True, pretty_print=self.opts.pretty_print) ) return f.name @@ -491,9 +492,6 @@ Follow all links in an HTML file and collect them into the specified directory. Also collects any references resources like images, stylesheets, scripts, etc. 
''')) -def safe_option_parser(): - return option_parser(safe=True) - def search_for_opf(dir): for f in os.listdir(dir): if f.lower().endswith('.opf'): @@ -501,9 +499,16 @@ def search_for_opf(dir): def get_filelist(htmlfile, opts): + ''' + Build list of files referenced by html file or try to detect and use an + OPF file instead. + ''' print 'Building file list...' - - opf = search_for_opf(os.path.dirname(htmlfile)) + dir = os.path.dirname(htmlfile) + if not dir: + dir = os.getcwd() + opf = search_for_opf(dir) + filelist = None if opf is not None: filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding) if not filelist: @@ -516,6 +521,9 @@ def get_filelist(htmlfile, opts): return opf, filelist def parse_content(filelist, opts): + ''' + Parse content, rewriting links and copying resources. + ''' if not opts.output: opts.output = '.' opts.output = os.path.abspath(opts.output) @@ -530,6 +538,9 @@ def parse_content(filelist, opts): return resource_map, p.htmlfile_map def merge_metadata(htmlfile, opf, opts): + ''' + Merge metadata from various sources. + ''' if opf: mi = MetaInformation(opf) else: @@ -548,29 +559,59 @@ def merge_metadata(htmlfile, opf, opts): return mi def create_metadata(basepath, mi, filelist, resources): + ''' + Create an OPF metadata object with correct spine and manifest. + ''' mi = OPFCreator(basepath, mi) entries = [('content/'+f, None) for f in filelist] + [(f, None) for f in resources] mi.create_manifest(entries) mi.create_spine(['content/'+f for f in filelist]) return mi +def rebase_toc(toc, htmlfile_map, basepath, root=True): + ''' + Rebase a :class:`calibre.ebooks.metadata.toc.TOC` object. 
+ ''' + def fix_entry(entry): + if entry.abspath in htmlfile_map.keys(): + entry.href = 'content/' + htmlfile_map[entry.abspath] + + for entry in toc: + rebase_toc(entry, htmlfile_map, basepath, root=False) + fix_entry(entry) + if root: + toc.base_path = basepath + def create_dir(htmlfile, opts): + ''' + Create a directory that contains the open ebook + ''' opf, filelist = get_filelist(htmlfile, opts) mi = merge_metadata(htmlfile, opf, opts) resource_map, htmlfile_map = parse_content(filelist, opts) resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()] - if opf.cover and os.access(opf.cover, os.R_OK): + if opf and opf.cover and os.access(opf.cover, os.R_OK): cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1]) shutil.copyfile(opf.cover, cpath) resources.append(cpath) mi.cover = cpath spine = [htmlfile_map[f.path] for f in filelist] mi = create_metadata(opts.output, mi, spine, resources) + buf = cStringIO.StringIO() + if mi.toc: + rebase_toc(mi.toc, htmlfile_map, opts.output) with open(os.path.join(opts.output, 'metadata.opf'), 'wb') as f: - mi.render(f) + mi.render(f, buf) + toc = buf.getvalue() + if toc: + with open(os.path.join(opts.output, 'toc.ncx'), 'wb') as f: + f.write(toc) print 'Open ebook created in', opts.output def create_oebzip(htmlfile, opts): + ''' + Create a zip file that contains the Open ebook. + ''' tdir = PersistentTemporaryDirectory('_create_oebzip') if opts.output is None: opts.output = os.path.join(os.path.splitext(htmlfile)[0]+'.oeb.zip') @@ -597,5 +638,26 @@ def main(args=sys.argv): return 0 +def gui_main(htmlfile): + ''' + Convenience wrapper for use in recursively importing HTML files. 
+ ''' + pt = PersistentTemporaryFile('_html2oeb_gui.oeb.zip') + pt.close() + opts = ''' +pretty_print = True +max_levels = 5 +output = %s +'''%repr(pt.name) + c = config(defaults=opts) + opts = c.parse() + create_oebzip(htmlfile, opts) + zf = ZipFile(pt.name, 'r') + nontrivial = [f for f in zf.infolist() if f.compress_size > 1 and not f.filename.endswith('.opf')] + if len(nontrivial) < 2: + return None + return pt.name + + if __name__ == '__main__': sys.exit(main()) diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py index 5cecda1c6d..23ce77bcb6 100644 --- a/src/calibre/ebooks/metadata/toc.py +++ b/src/calibre/ebooks/metadata/toc.py @@ -56,6 +56,7 @@ class TOC(list): if not os.path.isabs(path): path = os.path.join(self.base_path, path) return path + return property(fget=fget, doc=doc) def read_from_opf(self, opfreader): diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index a8e9dca223..030d3e0250 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -1,6 +1,6 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import os, sys, textwrap, collections, traceback, time +import os, sys, textwrap, collections, traceback, time, re from xml.parsers.expat import ExpatError from functools import partial from PyQt4.QtCore import Qt, SIGNAL, QObject, QCoreApplication, QUrl @@ -43,6 +43,7 @@ from calibre.gui2.dialogs.choose_format import ChooseFormatDialog from calibre.gui2.dialogs.book_info import BookInfo from calibre.ebooks.metadata.meta import set_metadata from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.html import gui_main as html2oeb from calibre.ebooks import BOOK_EXTENSIONS from calibre.ebooks.lrf import preferred_source_formats as LRF_PREFERRED_SOURCE_FORMATS from calibre.library.database2 import LibraryDatabase2, CoverCache @@ -480,6 +481,15 @@ class Main(MainWindow, Ui_MainWindow): if not to_device: model = self.current_view().model() + html_pat = 
re.compile(r'\.x{0,1}htm(l{0,1})\s*$', re.IGNORECASE) + paths = list(paths) + for i, path in enumerate(paths): + if html_pat.search(path) is not None: + paths[i] = html2oeb(path) + if paths[i] is None: + paths[i] = path + else: + formats[i] = 'zip' duplicates = model.add_books(paths, formats, metadata) if duplicates: files = _('

Books with the same title as the following already exist in the database. Add them anyway?