From 8c53abe905d1086cce975586f6c9d638b1a723dc Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 15 Sep 2008 12:48:54 -0700
Subject: [PATCH] IGN:Various fixes to html2epub

---
 src/calibre/ebooks/epub/__init__.py  |  6 +++--
 src/calibre/ebooks/epub/from_html.py | 17 +++++++++++---
 src/calibre/ebooks/html.py           | 35 ++++++++++++++++------------
 src/calibre/ebooks/metadata/opf2.py  | 16 ++++++-------
 src/calibre/linux.py                 |  2 +-
 src/calibre/utils/config.py          | 14 +++++++++--
 6 files changed, 59 insertions(+), 31 deletions(-)
diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py
index 0585385143..bcbc82f6c9 100644
--- a/src/calibre/ebooks/epub/__init__.py
+++ b/src/calibre/ebooks/epub/__init__.py
@@ -8,7 +8,7 @@ Conversion to EPUB.
 '''
 import sys, textwrap
 from calibre.utils.config import Config, StringConfig
-from calibre.utils.zipfile import ZipFile, ZIP_DEFLATED
+from calibre.utils.zipfile import ZipFile, ZIP_STORED
 from calibre.ebooks.html import config as common_config
 
 def initialize_container(path_to_container, opf_name='metadata.opf'):
@@ -24,7 +24,7 @@ def initialize_container(path_to_container, opf_name='metadata.opf'):
 </container>
     '''%opf_name
     zf = ZipFile(path_to_container, 'w')
-    zf.writestr('mimetype', 'application/epub+zip', compression=ZIP_DEFLATED)
+    zf.writestr('mimetype', 'application/epub+zip', compression=ZIP_STORED)
     zf.writestr('META-INF/', '', 0700)
     zf.writestr('META-INF/container.xml', CONTAINER)
     return zf
@@ -67,5 +67,7 @@ to auto-generate a Table of Contents.
     toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
         help=_("Don't add auto-detected chapters to the Table of Contents."))
     
+    c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
+              help=_('Print generated OPF file to stdout'))
     
     return c
\ No newline at end of file
diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py
index 6abb45e858..32a86df4ed 100644
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -12,6 +12,7 @@ from calibre.ebooks.epub import config as common_config
 from calibre.ptempfile import TemporaryDirectory
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.epub import initialize_container
 
 
 class HTMLProcessor(Processor):
@@ -93,10 +94,10 @@ def convert(htmlfile, opts, notification=None):
     
     with TemporaryDirectory('_html2epub') as tdir:
         resource_map, htmlfile_map, generated_toc = parse_content(filelist, opts, tdir)
-        resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
+        resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]
         
-        if opf.cover and os.access(opf.cover, os.R_OK):
-            shutil.copyfile(opf.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)))
+        if mi.cover and os.access(mi.cover, os.R_OK):
+            shutil.copyfile(mi.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)))
             cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
             shutil.copyfile(opf.cover, cpath)
             resources.append(cpath)
@@ -109,12 +110,22 @@ def convert(htmlfile, opts, notification=None):
             rebase_toc(mi.toc, htmlfile_map, opts.output)
         if mi.toc is None or len(mi.toc) < 2:
             mi.toc = generated_toc
+        for item in mi.manifest:
+            if getattr(item, 'mime_type', None) == 'text/html':
+                item.mime_type = 'application/xhtml+xml'
         with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f:
             mi.render(f, buf)
+        if opts.show_opf:
+            print open(os.path.join(tdir, 'metadata.opf')).read()
         toc = buf.getvalue()
         if toc:
             with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f:
                 f.write(toc)
+                
+        epub = initialize_container(opts.output)
+        epub.add_dir(tdir)
+        print 'Output written to', opts.output
+        
             
 def main(args=sys.argv):
     parser = option_parser()
diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py
index f96cde8623..742a7d3856 100644
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -205,7 +205,6 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
                 hf.links.remove(link)
                 
         next_level = list(nl)
-        
     return flat, list(depth_first(flat[0], flat))
     
     
@@ -309,6 +308,7 @@ class Parser(PreProcessor, LoggingInterface):
         self.resource_dir = os.path.join(tdir, 'resources')
         save_counter = 1
         self.htmlfile_map = {}
+        self.level = self.htmlfile.level
         for f in self.htmlfiles:
             name = os.path.basename(f.path)
             if name in self.htmlfile_map.values():
@@ -362,8 +362,8 @@ class Parser(PreProcessor, LoggingInterface):
         tdir = tempfile.gettempdir()
         if not os.path.exists(tdir):
             os.makedirs(tdir)
-        with open(os.path.join(tdir, '%s-%s-%s.html'%\
-                    (self.name, os.path.basename(self.htmlfile.path), name)), 'wb') as f:
+        with open(os.path.join(tdir, '%s-%s.html'%\
+                    (os.path.basename(self.htmlfile.path), name)), 'wb') as f:
             f.write(html.tostring(self.root, encoding='utf-8'))
             self.log_debug(_('Written processed HTML to ')+f.name)
     
@@ -381,6 +381,8 @@ class Parser(PreProcessor, LoggingInterface):
             return olink
         if link.path in self.htmlfiles:
             return self.htmlfile_map[link.path]
+        if re.match(r'\.(x){0,1}htm(l){0,1}', os.path.splitext(link.path)[1]) is not None:
+            return olink # This happens when --max-levels is used
         if link.path in self.resource_map.keys():
             return self.resource_map[link.path]
         name = os.path.basename(link.path)
@@ -435,20 +437,20 @@ class Processor(Parser):
         
         def add_item(href, fragment, text, target):
             for entry in toc.flat():
-                if entry.href == href and entry.fragment ==fragment:
+                if entry.href == href and entry.fragment == fragment:
                     return entry
             if len(text) > 50:
                 text = text[:50] + u'\u2026'
             return target.add_item(href, fragment, text)
             
-        name = self.htmlfile_map[self.htmlfile]
+        name = self.htmlfile_map[self.htmlfile.path]
         href = 'content/'+name
         
         if referrer.href != href: # Happens for root file
             target = add_item(href, None, self.htmlfile.title, referrer)
             
         # Add links to TOC
-        if self.opts.max_toc_links > 0:
+        if int(self.opts.max_toc_links) > 0:
             for link in list(self.LINKS_PATH(self.root))[:self.opts.max_toc_links]:
                 text = (u''.join(link.xpath('string()'))).strip()
                 if text:
@@ -468,7 +470,7 @@ class Processor(Parser):
             for elem in getattr(self, 'detected_chapters', []):
                 text = (u''.join(elem.xpath('string()'))).strip()
                 if text:
-                    name = self.htmlfile_map[self.path]
+                    name = self.htmlfile_map[self.htmlfile.path]
                     href = 'content/'+name
                     add_item(href, None, text, target)
                     
@@ -479,9 +481,9 @@ class Processor(Parser):
         This includes <font> tags.
         '''
         counter = 0
-        def get_id(chapter, prefix='calibre_css_'):
+        
+        def get_id(chapter, counter, prefix='calibre_css_'):
             new_id = '%s_%d'%(prefix, counter)
-            counter  += 1 
             if chapter.tag.lower() == 'a' and  'name' in chapter.keys():
                 chapter.attrib['id'] = id = chapter.get('name')
                 if not id:
@@ -497,14 +499,14 @@ class Processor(Parser):
         css = []
         for link in self.root.xpath('//link'):
             if 'css' in link.get('type', 'text/css').lower():
-                file = self.htmlfile.resolve(link.get('href', ''))
-                if os.path.exists(file) and os.path.isfile(file):
+                file = self.htmlfile.resolve(unicode(link.get('href', ''), self.htmlfile.encoding)).path
+                if file and os.path.exists(file) and os.path.isfile(file):
                     css.append(open(file, 'rb').read().decode('utf-8'))
                 link.getparent().remove(link)
                     
         for style in self.root.xpath('//style'):
             if 'css' in style.get('type', 'text/css').lower():
-                css.append('\n'.join(get_text(style)))
+                css.append('\n'.join(style.xpath('./text()')))
                 style.getparent().remove(style)
         
         for font in self.root.xpath('//font'):
@@ -519,12 +521,14 @@ class Processor(Parser):
             color = font.attrib.pop('color', None)
             if color is not None:
                 setting += 'color:%s'%color
-            id = get_id(font)
+            id = get_id(font, counter)
+            counter += 1
             css.append('#%s { %s }'%(id, setting))
             
         for elem in self.root.xpath('//*[@style]'):
             if 'id' not in elem.keys():
-                id = get_id(elem)
+                id = get_id(elem, counter)
+                counter += 1 
             css.append('#%s {%s}'%(id, elem.get('style')))
             elem.attrib.pop('style')
             
@@ -597,7 +601,8 @@ def get_filelist(htmlfile, opts):
     if opf is not None:
         filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
     if not filelist:
-        filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\
+        filelist = traverse(htmlfile, max_levels=int(opts.max_levels), 
+                            verbose=opts.verbose, encoding=opts.encoding)\
                     [0 if opts.breadth_first else 1]
     if opts.verbose:
         print '\tFound files...'
diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py
index 08ced86af9..009d5cfef8 100644
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@@ -252,14 +252,14 @@ class OPF(object):
     spine_path      = XPath('/opf:package/*[re:match(name(), "spine", "i")]/*[re:match(name(), "itemref", "i")]')
     guide_path      = XPath('/opf:package/*[re:match(name(), "guide", "i")]/*[re:match(name(), "reference", "i")]')
     
-    title             = MetadataField('title')
-    publisher         = MetadataField('publisher')
-    language          = MetadataField('language')
-    comments          = MetadataField('description')
-    category          = MetadataField('category')
-    series            = MetadataField('series', is_dc=False)
-    series_index      = MetadataField('series_index', is_dc=False, formatter=int)
-    rating            = MetadataField('rating', is_dc=False, formatter=int)
+    title           = MetadataField('title')
+    publisher       = MetadataField('publisher')
+    language        = MetadataField('language')
+    comments        = MetadataField('description')
+    category        = MetadataField('category')
+    series          = MetadataField('series', is_dc=False)
+    series_index    = MetadataField('series_index', is_dc=False, formatter=int)
+    rating          = MetadataField('rating', is_dc=False, formatter=int)
     
     
     def __init__(self, stream, basedir=os.getcwdu()):
diff --git a/src/calibre/linux.py b/src/calibre/linux.py
index b5fdfe558b..95729a0ee3 100644
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@@ -210,7 +210,7 @@ def setup_completion(fatal_errors):
         f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr']))
         f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
         f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
-        f.write(opts_and_exts('html2epub', html2epub, ['html', 'htm', 'xhtm', 'xhtml']))
+        f.write(opts_and_exts('html2epub', html2epub, ['html', 'htm', 'xhtm', 'xhtml', 'opf']))
         f.write(opts_and_exts('html2oeb', html2oeb, ['html', 'htm', 'xhtm', 'xhtml']))
         f.write('''
 _prs500_ls()
diff --git a/src/calibre/utils/config.py b/src/calibre/utils/config.py
index c20a49bb09..865d628429 100644
--- a/src/calibre/utils/config.py
+++ b/src/calibre/utils/config.py
@@ -162,6 +162,12 @@ class Option(object):
         self.switches = switches
         self.help     = help.replace('%default', repr(default)) if help else None
         self.type     = type
+        if self.type is None and action is None and choices is None:
+            if isinstance(default, float):
+                self.type = 'float'
+            elif isinstance(default, int) and not isinstance(default, bool):
+                self.type = 'int'
+            
         self.choices  = choices
         self.check    = check
         self.group    = group
@@ -229,7 +235,7 @@ class OptionSet(object):
                            option will not be added to the command line parser.
         :param help:       Help text.
         :param type:       Type checking of option values. Supported types are:
-                           `None, 'choice', 'complex', 'float', 'int', 'long', 'string'`.
+                           `None, 'choice', 'complex', 'float', 'int', 'string'`.
         :param choices:    List of strings or `None`.
         :param group:      Group this option belongs to. You must previously 
                            have created this group with a call to :method:`add_group`.
@@ -289,7 +295,11 @@ class OptionSet(object):
             exec src in options
         opts = OptionValues()
         for pref in self.preferences:
-            setattr(opts, pref.name, options.get(pref.name, pref.default))
+            val = options.get(pref.name, pref.default)
+            formatter = __builtins__.get(pref.type, None)
+            if callable(formatter):
+                val = formatter(val)
+            setattr(opts, pref.name, val)
             
         return opts