From 41a938aef021b963397347f4355c2ac109dd3b8f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 26 Aug 2008 12:11:03 -0700
Subject: [PATCH] IGN:Working html2oeb

---
 src/calibre/__init__.py              |   3 +
 src/calibre/ebooks/epub/__init__.py  |   1 +
 src/calibre/ebooks/epub/traverse.py  | 218 ---------------------------
 src/calibre/ebooks/html.py           |  61 ++++++--
 src/calibre/gui2/jobs2.py            |  24 +--
 src/calibre/trac/plugins/download.py |   2 +-
 6 files changed, 63 insertions(+), 246 deletions(-)
 delete mode 100644 src/calibre/ebooks/epub/traverse.py

diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index e7bc7dd472..2703431835 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -268,6 +268,9 @@ class LoggingInterface:
         self.__logger = logger
         
     def setup_cli_handler(self, verbosity):
+        for handler in self.__logger.handlers:
+            if isinstance(handler, logging.StreamHandler):
+                return
         if os.environ.get('CALIBRE_WORKER', None) is not None and self.__logger.handlers:
             return
         stream    = sys.stdout
diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py
index ced1d268af..45d5d44296 100644
--- a/src/calibre/ebooks/epub/__init__.py
+++ b/src/calibre/ebooks/epub/__init__.py
@@ -19,6 +19,7 @@ def config(defaults=None):
     
     c.update(common_config())
     c.remove_opt('output')
+    c.remove_opt('zip')
     
     c.add_opt('output', ['-o', '--output'], default=None,
              help=_('The output EPUB file. If not specified, it is derived from the input file name.'))
diff --git a/src/calibre/ebooks/epub/traverse.py b/src/calibre/ebooks/epub/traverse.py
deleted file mode 100644
index 6f942de3b9..0000000000
--- a/src/calibre/ebooks/epub/traverse.py
+++ /dev/null
@@ -1,218 +0,0 @@
-from __future__ import with_statement
-__license__   = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
-__docformat__ = 'restructuredtext en'
-
-'''
-Recursively parse HTML files to find all linked files. See :function:`traverse`.
-'''
-
-import sys, os, re
-from urlparse import urlparse
-from urllib import unquote
-from calibre import unicode_path
-from calibre.ebooks.chardet import xml_to_unicode
-
-class Link(object):
-    '''
-    Represents a link in a HTML file.
-    '''
-    
-    @classmethod
-    def url_to_local_path(cls, url, base):
-        path = url.path
-        if os.path.isabs(path):
-            return path
-        return os.path.abspath(os.path.join(base, path))
-    
-    def __init__(self, url, base):
-        '''
-        :param url:  The url this link points to. Must be an unquoted unicode string.
-        :param base: The base directory that relative URLs are with respect to.
-                     Must be a unicode string.
-        '''
-        assert isinstance(url, unicode) and isinstance(base, unicode)
-        self.url         = url
-        self.parsed_url  = urlparse(unquote(self.url))
-        self.is_local    = self.parsed_url.scheme in ('', 'file')
-        self.is_internal = self.is_local and not bool(self.parsed_url.path)
-        self.path        = None
-        self.fragment    = self.parsed_url.fragment 
-        if self.is_local and not self.is_internal:
-            self.path = self.url_to_local_path(self.parsed_url, base)
-
-    def __hash__(self):
-        if self.path is None:
-            return hash(self.url)
-        return hash(self.path)
-
-    def __eq__(self, other):
-        return self.path == getattr(other, 'path', other)
-    
-    def __str__(self):
-        return u'Link: %s --> %s'%(self.url, self.path) 
-        
-
-class IgnoreFile(Exception):
-    
-    def __init__(self, msg, errno):
-        Exception.__init__(self, msg)
-        self.doesnt_exist = errno == 2
-        self.errno = errno
-
-class HTMLFile(object):
-    '''
-    Contains basic information about an HTML file. This
-    includes a list of links to other files as well as
-    the encoding of each file. Also tries to detect if the file is not a HTML
-    file in which case :member:`is_binary` is set to True.
-
-    The encoding of the file is available as :member:`encoding`.
-    '''
-    
-    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
-    LINK_PAT = re.compile(
-    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
-    re.DOTALL|re.IGNORECASE)
-    
-    def __init__(self, path_to_html_file, level, encoding, verbose):
-        '''
-        :param level: The level of this file. Should be 0 for the root file.
-        :param encoding: Use `encoding` to decode HTML.
-        '''
-        self.path  = unicode_path(path_to_html_file, abs=True)
-        self.base  = os.path.dirname(self.path)
-        self.level = level
-        self.links = []
-        
-        try:
-            with open(self.path, 'rb') as f:
-                src = f.read()
-        except IOError, err:
-            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
-            if level == 0:
-                raise IOError(msg)
-            raise IgnoreFile(msg, err.errno)
-        
-        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
-        
-        if not self.is_binary:
-            if encoding is None:
-                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
-                self.encoding = encoding
-
-            src = src.decode(encoding, 'replace')
-            self.find_links(src)
-                
-        
-                    
-    def __eq__(self, other):
-        return self.path == getattr(other, 'path', other)
-    
-    def __str__(self):
-        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
-    
-    def __repr__(self):
-        return str(self)
-                    
-        
-    def find_links(self, src):
-        for match in self.LINK_PAT.finditer(src):
-            url = None
-            for i in ('url1', 'url2', 'url3'):
-                url = match.group(i)
-                if url:
-                    break
-            link = self.resolve(url)
-            if link not in self.links:
-                self.links.append(link)
-                
-    def resolve(self, url):
-        return Link(url, self.base)
-
-
-def depth_first(root, flat, visited=set([])):
-    yield root
-    visited.add(root)
-    for link in root.links:
-        if link.path is not None and link not in visited:
-            try:
-                index = flat.index(link)
-            except ValueError: # Can happen if max_levels is used
-                continue
-            hf = flat[index]
-            if hf not in visited:
-                yield hf
-                visited.add(hf)
-                for hf in depth_first(hf, flat, visited):
-                    if hf not in visited:
-                        yield hf
-                        visited.add(hf)
-        
-                                
-def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
-    '''
-    Recursively traverse all links in the HTML file.
-    
-    :param max_levels: Maximum levels of recursion. Must be non-negative. 0 
-                       implies that no links in the root HTML file are followed.
-    :param encoding:   Specify character encoding of HTML files. If `None` it is
-                       auto-detected.
-    :return:           A pair of lists (breadth_first, depth_first). Each list contains
-                       :class:`HTMLFile` objects.
-    '''
-    assert max_levels >= 0
-    level = 0
-    flat =  [HTMLFile(path_to_html_file, level, encoding, verbose)]
-    next_level = list(flat)
-    while level < max_levels and len(next_level) > 0:
-        level += 1
-        nl = []
-        for hf in next_level:
-            rejects = []
-            for link in hf.links:
-                if link.path is None or link.path in flat:
-                    continue
-                try:
-                    nf = HTMLFile(link.path, level, encoding, verbose)
-                    nl.append(nf)
-                    flat.append(nf)
-                except IgnoreFile, err:
-                    rejects.append(link)
-                    if not err.doesnt_exist or verbose > 1:
-                        print str(err)
-            for link in rejects:
-                hf.links.remove(link)
-                
-        next_level = list(nl)
-        
-    return flat, list(depth_first(flat[0], flat))
-    
-    
-def opf_traverse(opf_reader, verbose=0, encoding=None):
-    '''
-    Return a list of :class:`HTMLFile` objects in the order specified by the
-    `<spine>` element of the OPF.
-    
-    :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.  
-    :param encoding:   Specify character encoding of HTML files. If `None` it is
-                       auto-detected.
-    '''
-    if not opf_reader.spine:
-        raise ValueError('OPF does not have a spine')
-    flat = []
-    for path in opf_reader.spine.items():
-        if path not in flat:
-            flat.append(os.path.abspath(path))
-    flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
-    return flat
-            
-    
-
-if __name__ == '__main__':
-    breadth_first, depth_first = traverse(sys.argv[1], verbose=2)
-    print 'Breadth first...'
-    for f in breadth_first: print f
-    print '\n\nDepth first...'
-    for f in depth_first: print f
-    
diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py
index 9b0345a799..fc0dd4e472 100644
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -284,7 +284,7 @@ class PreProcessor(object):
         
         return html
     
-class Parser(PreProcessor):
+class Parser(PreProcessor, LoggingInterface):
     
     ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
                      re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]
@@ -299,17 +299,39 @@ class Parser(PreProcessor):
         self.resource_map = resource_map
         self.htmlfiles = htmlfiles
         self.resource_dir = os.path.join(tdir, 'resources')
+        save_counter = 1
+        self.htmlfile_map = {}
+        for f in self.htmlfiles:
+            name = os.path.basename(f.path)
+            if name in self.htmlfile_map.values():
+                name = os.path.splitext(name)[0] + '_cr_%d'%save_counter + os.path.splitext(name)[1]
+                save_counter += 1
+            self.htmlfile_map[f.path] = name
         
         self.parse_html()
         self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
         
+    def save(self):
+        '''
+        Serialize the processed HTML tree (UTF-8, XML method) into `self.tdir`,
+        using the output name recorded for this file in `self.htmlfile_map`.
+        Call only after all HTML processing is finished. Returns the path written.
+        '''
+        dest = os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])
+        with open(dest, 'wb') as f:
+            f.write(html.tostring(self.root, encoding='utf-8', method='xml',
+                                  include_meta_content_type=True,
+                                  pretty_print=self.opts.pretty_print))
+        return dest
+
+
     def parse_html(self):
         ''' Create lxml ElementTree from HTML '''
         self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
         src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
         src = self.preprocess(src)
         # lxml chokes on unicode input when it contains encoding declarations
-        for pat in self.ENCODING_PATS: 
+        for pat in self.ENCODING_PATS:
             src = pat.sub('', src)
         try:
             self.root = html.document_fromstring(src)
@@ -350,7 +372,7 @@ class Parser(PreProcessor):
         if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
             return olink
         if link.path in self.htmlfiles:
-            return os.path.basename(link.path)
+            return self.htmlfile_map[link.path]
         if link.path in self.resource_map.keys():
             return self.resource_map[link.path]
         name = os.path.basename(link.path)
@@ -358,7 +380,7 @@ class Parser(PreProcessor):
         name += ('_%d'%len(self.resource_map)) + ext
         shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
         name = 'resources/' + name
-        self.resource_map[link.path] = name 
+        self.resource_map[link.path] = name
         return name
     
     def extract_css(self):
@@ -437,6 +459,8 @@ def config(defaults=None):
              help=_('The output directory. Default is the current directory.'))
     c.add_opt('encoding', ['--encoding'], default=None, 
               help=_('Character encoding for HTML files. Default is to auto detect.'))
+    c.add_opt('zip', ['--zip'], default=False,
+              help=_('Create the output in a zip file. If this option is specified, the --output should be the name of a file not a directory.'))
     
     traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
     traversal('breadth_first', ['--breadth-first'], default=False,
@@ -453,6 +477,8 @@ def config(defaults=None):
     debug = c.add_group('debug', _('Options useful for debugging'))
     debug('verbose', ['-v', '--verbose'], default=0, action='count',
           help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
+    debug('pretty_print', ['--pretty-print'], default=False,
+          help=_('Output HTML is "pretty printed" for easier parsing by humans'))
     
     return c
 
@@ -487,7 +513,6 @@ def get_filelist(htmlfile, opts):
         print '\tFound files...'
         for f in filelist:
             print '\t\t', f
-    
     return opf, filelist
 
 def parse_content(filelist, opts):
@@ -499,9 +524,10 @@ def parse_content(filelist, opts):
         os.makedirs(rdir)
     resource_map = {}
     for htmlfile in filelist:
-        Parser(htmlfile, opts, os.path.join(opts.output, 'content'), 
+        p = Parser(htmlfile, opts, os.path.join(opts.output, 'content'),
                            resource_map, filelist)
-    return resource_map
+        p.save()
+    return resource_map, p.htmlfile_map
 
 def merge_metadata(htmlfile, opf, opts):
     if opf:
@@ -519,23 +545,27 @@ def merge_metadata(htmlfile, opf, opts):
         mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
     if not mi.authors:
         mi.authors = [_('Unknown')]
+    return mi
 
 def create_metadata(basepath, mi, filelist, resources):
     mi = OPFCreator(basepath, mi)
-    entries = [(f.path, None) for f in filelist] + [(f, None) for f in resources]
-    mi.create_manifest(entries)
-    mi.create_spine([f.path for f in filelist])
+    spine = ['content/'+name for name in filelist]
+    mi.create_manifest([(href, None) for href in spine + list(resources)])
+    mi.create_spine(spine)
     return mi
 
 def create_dir(htmlfile, opts):
     opf, filelist = get_filelist(htmlfile, opts)
     mi = merge_metadata(htmlfile, opf, opts)
-    resources = [os.path.join(opts.output, 'content', f) for f in parse_content(filelist, opts).values()]
+    resource_map, htmlfile_map = parse_content(filelist, opts)
+    resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
-    if opf.cover and os.access(opf.cover, os.R_OK):
-        cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
+    if opf and opf.cover and os.access(opf.cover, os.R_OK): # opf may be falsy; merge_metadata() tests it too
+        cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1])
         shutil.copyfile(opf.cover, cpath)
         resources.append(cpath)
-    mi = create_metadata(opts.output, mi, filelist, resources)
+        mi.cover = cpath
+    spine = [htmlfile_map[f.path] for f in filelist]
+    mi = create_metadata(opts.output, mi, spine, resources)
     with open(os.path.join(opts.output, 'metadata.opf'), 'wb') as f:
         mi.render(f)
     print 'Open ebook created in', opts.output
@@ -560,11 +590,12 @@ def main(args=sys.argv):
         return 1
     
     htmlfile = args[1]
-    create_dir(htmlfile, opts)
+    if opts.zip:
+        create_oebzip(htmlfile, opts)
+    else:
+        create_dir(htmlfile, opts)
         
     return 0
 
 if __name__ == '__main__':
     sys.exit(main())
-        
-    
diff --git a/src/calibre/gui2/jobs2.py b/src/calibre/gui2/jobs2.py
index 9b30a3190e..109c4eaa33 100644
--- a/src/calibre/gui2/jobs2.py
+++ b/src/calibre/gui2/jobs2.py
@@ -19,13 +19,13 @@ NONE = QVariant()
 
 class JobManager(QAbstractTableModel):
     
-    wait_icon     = QVariant(QIcon(':/images/jobs.svg'))
-    running_icon  = QVariant(QIcon(':/images/exec.svg'))
-    error_icon    = QVariant(QIcon(':/images/dialog_error.svg'))
-    done_icon     = QVariant(QIcon(':/images/ok.svg'))
-    
     def __init__(self):
         QAbstractTableModel.__init__(self)
+        self.wait_icon     = QVariant(QIcon(':/images/jobs.svg'))
+        self.running_icon  = QVariant(QIcon(':/images/exec.svg'))
+        self.error_icon    = QVariant(QIcon(':/images/dialog_error.svg'))
+        self.done_icon     = QVariant(QIcon(':/images/ok.svg'))
+    
         self.jobs          = []
         self.server        = Server()
         self.add_job       = Dispatcher(self._add_job)
@@ -42,13 +42,13 @@ class JobManager(QAbstractTableModel):
     def headerData(self, section, orientation, role):
         if role != Qt.DisplayRole:
             return NONE
-        if orientation == Qt.Horizontal:      
+        if orientation == Qt.Horizontal:
             if   section == 0: text = _("Job")
             elif section == 1: text = _("Status")
             elif section == 2: text = _("Progress")
             elif section == 3: text = _('Running time')
             return QVariant(text)
-        else: 
+        else:
             return QVariant(section+1)
         
     def data(self, index, role):
@@ -58,7 +58,7 @@ class JobManager(QAbstractTableModel):
             row, col = index.row(), index.column()
             job = self.jobs[row]
             
-            if role == Qt.DisplayRole:            
+            if role == Qt.DisplayRole:
                 if col == 0:
                     desc = job.description
                     if not desc:
@@ -145,7 +145,7 @@ class JobManager(QAbstractTableModel):
                 return True
         return False
     
-    def run_job(self, done, func, args=[], kwargs={}, 
+    def run_job(self, done, func, args=[], kwargs={},
                            description=None):
         job = ParallelJob(func, done, self, args=args, kwargs=kwargs,
                           description=description)
@@ -159,15 +159,15 @@ class JobManager(QAbstractTableModel):
     def kill_job(self, row, view):
         job = self.jobs[row]
         if isinstance(job, DeviceJob):
-            error_dialog(view, _('Cannot kill job'), 
+            error_dialog(view, _('Cannot kill job'),
                          _('Cannot kill jobs that communicate with the device')).exec_()
             return
         if job.has_run:
-            error_dialog(view, _('Cannot kill job'), 
+            error_dialog(view, _('Cannot kill job'),
                          _('Job has already run')).exec_()
             return
         if not job.is_running:
-            error_dialog(view, _('Cannot kill job'), 
+            error_dialog(view, _('Cannot kill job'),
                          _('Cannot kill waiting job')).exec_()
             return
 
diff --git a/src/calibre/trac/plugins/download.py b/src/calibre/trac/plugins/download.py
index 15ee30b09a..d2667fcb38 100644
--- a/src/calibre/trac/plugins/download.py
+++ b/src/calibre/trac/plugins/download.py
@@ -35,7 +35,7 @@ class Distribution(object):
         ('ImageMagick', '6.3.5', 'imagemagick', 'imagemagick', 'ImageMagick'),
         ('xdg-utils', '1.0.2', 'xdg-utils', 'xdg-utils', 'xdg-utils'),
         ('dbus-python', '0.82.2', 'dbus-python', 'python-dbus', 'dbus-python'),
-        ('lxml', '1.3.3', 'lxml', 'python-lxml', 'python-lxml'),
+        ('lxml', '2.0.5', 'lxml', 'python-lxml', 'python-lxml'),
         ('help2man', '1.36.4', 'help2man', 'help2man', 'help2man'),
         ]