From 8c53abe905d1086cce975586f6c9d638b1a723dc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 15 Sep 2008 12:48:54 -0700 Subject: [PATCH] IGN:Various fixes to html2epub --- src/calibre/ebooks/epub/__init__.py | 6 +++-- src/calibre/ebooks/epub/from_html.py | 17 +++++++++++--- src/calibre/ebooks/html.py | 35 ++++++++++++++++------------ src/calibre/ebooks/metadata/opf2.py | 16 ++++++------- src/calibre/linux.py | 2 +- src/calibre/utils/config.py | 14 +++++++++-- 6 files changed, 59 insertions(+), 31 deletions(-) diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index 0585385143..bcbc82f6c9 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -8,7 +8,7 @@ Conversion to EPUB. ''' import sys, textwrap from calibre.utils.config import Config, StringConfig -from calibre.utils.zipfile import ZipFile, ZIP_DEFLATED +from calibre.utils.zipfile import ZipFile, ZIP_STORED from calibre.ebooks.html import config as common_config def initialize_container(path_to_container, opf_name='metadata.opf'): @@ -24,7 +24,7 @@ def initialize_container(path_to_container, opf_name='metadata.opf'): '''%opf_name zf = ZipFile(path_to_container, 'w') - zf.writestr('mimetype', 'application/epub+zip', compression=ZIP_DEFLATED) + zf.writestr('mimetype', 'application/epub+zip', compression=ZIP_STORED) zf.writestr('META-INF/', '', 0700) zf.writestr('META-INF/container.xml', CONTAINER) return zf @@ -67,5 +67,7 @@ to auto-generate a Table of Contents. toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False, help=_("Don't add auto-detected chapters to the Table of Contents.")) + c.add_opt('show_opf', ['--show-opf'], default=False, group='debug', + help=_('Print generated OPF file to stdout')) return c \ No newline at end of file diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index 6abb45e858..32a86df4ed 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -12,6 +12,7 @@ from calibre.ebooks.epub import config as common_config from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.toc import TOC +from calibre.ebooks.epub import initialize_container class HTMLProcessor(Processor): @@ -93,10 +94,10 @@ def convert(htmlfile, opts, notification=None): with TemporaryDirectory('_html2epub') as tdir: resource_map, htmlfile_map, generated_toc = parse_content(filelist, opts, tdir) - resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()] + resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()] - if opf.cover and os.access(opf.cover, os.R_OK): - shutil.copyfile(opf.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))) + if mi.cover and os.access(mi.cover, os.R_OK): + shutil.copyfile(mi.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))) cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)) shutil.copyfile(opf.cover, cpath) resources.append(cpath) @@ -109,12 +110,22 @@ def convert(htmlfile, opts, notification=None): rebase_toc(mi.toc, htmlfile_map, opts.output) if mi.toc is None or len(mi.toc) < 2: mi.toc = generated_toc + for item in mi.manifest: + if getattr(item, 'mime_type', None) == 'text/html': + item.mime_type = 'application/xhtml+xml' with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f: mi.render(f, buf) + if opts.show_opf: + print open(os.path.join(tdir, 'metadata.opf')).read() toc = buf.getvalue() if toc: with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f: f.write(toc) + + epub = initialize_container(opts.output) + epub.add_dir(tdir) + print 'Output written to', opts.output + def main(args=sys.argv): parser = option_parser() diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index f96cde8623..742a7d3856 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -205,7 +205,6 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None) hf.links.remove(link) next_level = list(nl) - return flat, list(depth_first(flat[0], flat)) @@ -309,6 +308,7 @@ class Parser(PreProcessor, LoggingInterface): self.resource_dir = os.path.join(tdir, 'resources') save_counter = 1 self.htmlfile_map = {} + self.level = self.htmlfile.level for f in self.htmlfiles: name = os.path.basename(f.path) if name in self.htmlfile_map.values(): @@ -362,8 +362,8 @@ class Parser(PreProcessor, LoggingInterface): tdir = tempfile.gettempdir() if not os.path.exists(tdir): os.makedirs(tdir) - with open(os.path.join(tdir, '%s-%s-%s.html'%\ - (self.name, os.path.basename(self.htmlfile.path), name)), 'wb') as f: + with open(os.path.join(tdir, '%s-%s.html'%\ + (os.path.basename(self.htmlfile.path), name)), 'wb') as f: f.write(html.tostring(self.root, encoding='utf-8')) self.log_debug(_('Written processed HTML to ')+f.name) @@ -381,6 +381,8 @@ class Parser(PreProcessor, LoggingInterface): return olink if link.path in self.htmlfiles: return self.htmlfile_map[link.path] + if re.match(r'\.(x){0,1}htm(l){0,1}', os.path.splitext(link.path)[1]) is not None: + return olink # This happens when --max-levels is used if link.path in self.resource_map.keys(): return self.resource_map[link.path] name = os.path.basename(link.path) @@ -435,20 +437,20 @@ class Processor(Parser): def add_item(href, fragment, text, target): for entry in toc.flat(): - if entry.href == href and entry.fragment ==fragment: + if entry.href == href and entry.fragment == fragment: return entry if len(text) > 50: text = text[:50] + u'\u2026' return target.add_item(href, fragment, text) - name = self.htmlfile_map[self.htmlfile] + name = self.htmlfile_map[self.htmlfile.path] href = 'content/'+name if referrer.href != href: # Happens for root file target = add_item(href, None, self.htmlfile.title, referrer) # Add links to TOC - if self.opts.max_toc_links > 0: + if int(self.opts.max_toc_links) > 0: for link in list(self.LINKS_PATH(self.root))[:self.opts.max_toc_links]: text = (u''.join(link.xpath('string()'))).strip() if text: @@ -468,7 +470,7 @@ class Processor(Parser): for elem in getattr(self, 'detected_chapters', []): text = (u''.join(elem.xpath('string()'))).strip() if text: - name = self.htmlfile_map[self.path] + name = self.htmlfile_map[self.htmlfile.path] href = 'content/'+name add_item(href, None, text, target) @@ -479,9 +481,9 @@ class Processor(Parser): This includes tags. ''' counter = 0 - def get_id(chapter, prefix='calibre_css_'): + + def get_id(chapter, counter, prefix='calibre_css_'): new_id = '%s_%d'%(prefix, counter) - counter += 1 if chapter.tag.lower() == 'a' and 'name' in chapter.keys(): chapter.attrib['id'] = id = chapter.get('name') if not id: @@ -497,14 +499,14 @@ class Processor(Parser): css = [] for link in self.root.xpath('//link'): if 'css' in link.get('type', 'text/css').lower(): - file = self.htmlfile.resolve(link.get('href', '')) - if os.path.exists(file) and os.path.isfile(file): + file = self.htmlfile.resolve(unicode(link.get('href', ''), self.htmlfile.encoding)).path + if file and os.path.exists(file) and os.path.isfile(file): css.append(open(file, 'rb').read().decode('utf-8')) link.getparent().remove(link) for style in self.root.xpath('//style'): if 'css' in style.get('type', 'text/css').lower(): - css.append('\n'.join(get_text(style))) + css.append('\n'.join(style.xpath('./text()'))) style.getparent().remove(style) for font in self.root.xpath('//font'): @@ -519,12 +521,14 @@ class Processor(Parser): color = font.attrib.pop('color', None) if color is not None: setting += 'color:%s'%color - id = get_id(font) + id = get_id(font, counter) + counter += 1 css.append('#%s { %s }'%(id, setting)) for elem in self.root.xpath('//*[@style]'): if 'id' not in elem.keys(): - id = get_id(elem) + id = get_id(elem, counter) + counter += 1 css.append('#%s {%s}'%(id, elem.get('style'))) elem.attrib.pop('style') @@ -597,7 +601,8 @@ def get_filelist(htmlfile, opts): if opf is not None: filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding) if not filelist: - filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\ + filelist = traverse(htmlfile, max_levels=int(opts.max_levels), + verbose=opts.verbose, encoding=opts.encoding)\ [0 if opts.breadth_first else 1] if opts.verbose: print '\tFound files...' diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index 08ced86af9..009d5cfef8 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -252,14 +252,14 @@ class OPF(object): spine_path = XPath('/opf:package/*[re:match(name(), "spine", "i")]/*[re:match(name(), "itemref", "i")]') guide_path = XPath('/opf:package/*[re:match(name(), "guide", "i")]/*[re:match(name(), "reference", "i")]') - title = MetadataField('title') - publisher = MetadataField('publisher') - language = MetadataField('language') - comments = MetadataField('description') - category = MetadataField('category') - series = MetadataField('series', is_dc=False) - series_index = MetadataField('series_index', is_dc=False, formatter=int) - rating = MetadataField('rating', is_dc=False, formatter=int) + title = MetadataField('title') + publisher = MetadataField('publisher') + language = MetadataField('language') + comments = MetadataField('description') + category = MetadataField('category') + series = MetadataField('series', is_dc=False) + series_index = MetadataField('series_index', is_dc=False, formatter=int) + rating = MetadataField('rating', is_dc=False, formatter=int) def __init__(self, stream, basedir=os.getcwdu()): diff --git a/src/calibre/linux.py b/src/calibre/linux.py index b5fdfe558b..95729a0ee3 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -210,7 +210,7 @@ def setup_completion(fatal_errors): f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr'])) f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles)) f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles)) - f.write(opts_and_exts('html2epub', html2epub, ['html', 'htm', 'xhtm', 'xhtml'])) + f.write(opts_and_exts('html2epub', html2epub, ['html', 'htm', 'xhtm', 'xhtml', 'opf'])) f.write(opts_and_exts('html2oeb', html2oeb, ['html', 'htm', 'xhtm', 'xhtml'])) f.write(''' _prs500_ls() diff --git a/src/calibre/utils/config.py b/src/calibre/utils/config.py index c20a49bb09..865d628429 100644 --- a/src/calibre/utils/config.py +++ b/src/calibre/utils/config.py @@ -162,6 +162,12 @@ class Option(object): self.switches = switches self.help = help.replace('%default', repr(default)) if help else None self.type = type + if self.type is None and action is None and choices is None: + if isinstance(default, float): + self.type = 'float' + elif isinstance(default, int) and not isinstance(default, bool): + self.type = 'int' + self.choices = choices self.check = check self.group = group @@ -229,7 +235,7 @@ class OptionSet(object): option will not be added to the command line parser. :param help: Help text. :param type: Type checking of option values. Supported types are: - `None, 'choice', 'complex', 'float', 'int', 'long', 'string'`. + `None, 'choice', 'complex', 'float', 'int', 'string'`. :param choices: List of strings or `None`. :param group: Group this option belongs to. You must previously have created this group with a call to :method:`add_group`. @@ -289,7 +295,11 @@ class OptionSet(object): exec src in options opts = OptionValues() for pref in self.preferences: - setattr(opts, pref.name, options.get(pref.name, pref.default)) + val = options.get(pref.name, pref.default) + formatter = __builtins__.get(pref.type, None) + if callable(formatter): + val = formatter(val) + setattr(opts, pref.name, val) return opts