IGN:Working html2oeb

This commit is contained in:
Kovid Goyal 2008-08-26 12:11:03 -07:00
parent 39afcb27f7
commit 41a938aef0
6 changed files with 63 additions and 246 deletions

View File

@ -268,6 +268,9 @@ class LoggingInterface:
self.__logger = logger
def setup_cli_handler(self, verbosity):
for handler in self.__logger.handlers:
if isinstance(handler, logging.StreamHandler):
return
if os.environ.get('CALIBRE_WORKER', None) is not None and self.__logger.handlers:
return
stream = sys.stdout

View File

@ -19,6 +19,7 @@ def config(defaults=None):
c.update(common_config())
c.remove_opt('output')
c.remove_opt('zip')
c.add_opt('output', ['-o', '--output'], default=None,
help=_('The output EPUB file. If not specified, it is derived from the input file name.'))

View File

@ -1,218 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Recursively parse HTML files to find all linked files. See :function:`traverse`.
'''
import sys, os, re
from urlparse import urlparse
from urllib import unquote
from calibre import unicode_path
from calibre.ebooks.chardet import xml_to_unicode
class Link(object):
    '''
    A single hyperlink discovered in an HTML file, resolved against a
    base directory so local targets can be mapped to filesystem paths.
    '''

    @classmethod
    def url_to_local_path(cls, url, base):
        # Absolute paths pass through untouched; relative ones are
        # anchored at `base` and normalized.
        candidate = url.path
        if os.path.isabs(candidate):
            return candidate
        return os.path.abspath(os.path.join(base, candidate))

    def __init__(self, url, base):
        '''
        :param url: The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
        self.url = url
        self.parsed_url = urlparse(unquote(self.url))
        # Local means no scheme or an explicit file:// scheme.
        self.is_local = self.parsed_url.scheme in ('', 'file')
        # Internal links (e.g. "#anchor") have no path component at all.
        self.is_internal = self.is_local and not bool(self.parsed_url.path)
        self.fragment = self.parsed_url.fragment
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(self.parsed_url, base)
        else:
            self.path = None

    def __hash__(self):
        # Hash by resolved path when one exists, otherwise by the raw URL,
        # mirroring __eq__ which compares paths.
        return hash(self.url) if self.path is None else hash(self.path)

    def __eq__(self, other):
        # Also compares equal to a bare path string.
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'Link: %s --> %s'%(self.url, self.path)
class IgnoreFile(Exception):
    '''
    Raised when a linked HTML file could not be read and should simply
    be dropped from the traversal.  ``doesnt_exist`` is True when the
    OS reported ENOENT (errno 2).
    '''

    def __init__(self, msg, errno):
        super(IgnoreFile, self).__init__(msg)
        self.errno = errno
        self.doesnt_exist = (errno == 2)
class HTMLFile(object):
    '''
    Contains basic information about an HTML file. This
    includes a list of links to other files as well as
    the encoding of each file. Also tries to detect if the file is not a HTML
    file in which case :member:`is_binary` is set to True.

    The encoding of the file is available as :member:`encoding`.
    NOTE: :member:`encoding` is only assigned for non-binary files.
    '''

    # Cheap sniff: an "<html" tag anywhere in the first 1024 bytes marks
    # the file as HTML; otherwise it is treated as binary.
    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
    # href extractor for <a> tags: double-quoted (url1), single-quoted
    # (url2) or unquoted (url3) attribute values.
    LINK_PAT = re.compile(
        r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
        re.DOTALL|re.IGNORECASE)

    def __init__(self, path_to_html_file, level, encoding, verbose):
        '''
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML. If None it is
                         auto-detected from the first 4096 bytes.
        '''
        self.path = unicode_path(path_to_html_file, abs=True)
        self.base = os.path.dirname(self.path)
        self.level = level
        self.links = []
        try:
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError, err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
            if level == 0:
                # The root file must be readable; failure here is fatal.
                raise IOError(msg)
            # Unreadable linked files are skipped by the caller (traverse).
            raise IgnoreFile(msg, err.errno)
        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
        if not self.is_binary:
            if encoding is None:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
            self.encoding = encoding
            # Decode with 'replace' so malformed bytes never abort parsing.
            src = src.decode(encoding, 'replace')
            self.find_links(src)

    def __eq__(self, other):
        # Compares by filesystem path; also equal to a bare path string.
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)

    def __repr__(self):
        return str(self)

    def find_links(self, src):
        # Collect unique Links in first-seen order (uniqueness via Link.__eq__).
        for match in self.LINK_PAT.finditer(src):
            url = None
            # The three groups are mutually exclusive; take whichever matched.
            for i in ('url1', 'url2', 'url3'):
                url = match.group(i)
                if url:
                    break
            link = self.resolve(url)
            if link not in self.links:
                self.links.append(link)

    def resolve(self, url):
        # Resolve a raw href relative to this file's directory.
        return Link(url, self.base)
def depth_first(root, flat, visited=None):
    '''
    Yield `root` and every file reachable from it via its links, in
    depth-first order.  Only links whose target is present in `flat`
    are followed.

    :param root:    The file object to start from (must have ``links``).
    :param flat:    Flat list of all known files; used to map a link back
                    to its file object.  Links to files pruned from flat
                    (e.g. by max_levels) are silently skipped.
    :param visited: Set of already-yielded files, used internally by the
                    recursion.  Callers should not pass it.
    '''
    # Bug fix: the original used a mutable default argument
    # (visited=set([])), so visited state leaked between *separate*
    # top-level calls — a second traversal would yield only the root.
    if visited is None:
        visited = set()
    yield root
    visited.add(root)
    for link in root.links:
        if link.path is not None and link not in visited:
            try:
                index = flat.index(link)
            except ValueError: # Can happen if max_levels is used
                continue
            hf = flat[index]
            if hf not in visited:
                yield hf
                visited.add(hf)
                for hf in depth_first(hf, flat, visited):
                    if hf not in visited:
                        yield hf
                        visited.add(hf)
def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.

    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are followed.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    :return: A pair of lists (breadth_first, depth_first). Each list contains
             :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    level = 0
    # `flat` accumulates every discovered file in breadth-first order.
    flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
    next_level = list(flat)
    while level < max_levels and len(next_level) > 0:
        level += 1
        nl = []
        for hf in next_level:
            rejects = []
            for link in hf.links:
                # Skip non-local links and files already seen
                # (HTMLFile.__eq__ compares against a bare path string).
                if link.path is None or link.path in flat:
                    continue
                try:
                    nf = HTMLFile(link.path, level, encoding, verbose)
                    nl.append(nf)
                    flat.append(nf)
                except IgnoreFile, err:
                    # Unreadable target: remember it so the dangling link
                    # can be removed from its parent below.
                    rejects.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print str(err)
            # Remove after the loop so hf.links is not mutated while iterated.
            for link in rejects:
                hf.links.remove(link)
        next_level = list(nl)
    # Depth-first ordering is derived from the same flat list.
    return flat, list(depth_first(flat[0], flat))
def opf_traverse(opf_reader, verbose=0, encoding=None):
    '''
    Return a list of :class:`HTMLFile` objects in the order specified by the
    `<spine>` element of the OPF.

    :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
    :param encoding: Specify character encoding of HTML files. If `None` it is
                     auto-detected.
    :raises ValueError: If the OPF has no spine.
    '''
    if not opf_reader.spine:
        raise ValueError('OPF does not have a spine')
    flat = []
    for path in opf_reader.spine.items():
        # Bug fix: normalize BEFORE the membership test.  The original
        # checked the raw spine path against already-absolutized entries
        # (`if path not in flat: flat.append(os.path.abspath(path))`),
        # so duplicate spine items were never actually deduplicated.
        path = os.path.abspath(path)
        if path not in flat:
            flat.append(path)
    return [HTMLFile(path, 0, encoding, verbose) for path in flat]
if __name__ == '__main__':
    # Smoke test: traverse the HTML file named on the command line and
    # print both orderings.  NOTE: the local name `depth_first` shadows
    # the depth_first() generator defined above.
    breadth_first, depth_first = traverse(sys.argv[1], verbose=2)
    print 'Breadth first...'
    for f in breadth_first: print f
    print '\n\nDepth first...'
    for f in depth_first: print f

View File

@ -284,7 +284,7 @@ class PreProcessor(object):
return html
class Parser(PreProcessor):
class Parser(PreProcessor, LoggingInterface):
ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]
@ -299,10 +299,32 @@ class Parser(PreProcessor):
self.resource_map = resource_map
self.htmlfiles = htmlfiles
self.resource_dir = os.path.join(tdir, 'resources')
save_counter = 1
self.htmlfile_map = {}
for f in self.htmlfiles:
name = os.path.basename(f.path)
if name in self.htmlfile_map.values():
name = os.path.splitext(name)[0] + '_cr_%d'%save_counter + os.path.splitext(name)[1]
save_counter += 1
self.htmlfile_map[f.path] = name
self.parse_html()
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
def save(self):
'''
Save processed HTML into the content directory.
Should be called after all HTML processing is finished.
'''
with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f:
f.write(html.tostring(self.root,
encoding='utf-8', method='xml',
include_meta_content_type=True,
pretty_print=self.opts.pretty_print)
)
return f.name
def parse_html(self):
''' Create lxml ElementTree from HTML '''
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
@ -350,7 +372,7 @@ class Parser(PreProcessor):
if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
return olink
if link.path in self.htmlfiles:
return os.path.basename(link.path)
return self.htmlfile_map[link.path]
if link.path in self.resource_map.keys():
return self.resource_map[link.path]
name = os.path.basename(link.path)
@ -437,6 +459,8 @@ def config(defaults=None):
help=_('The output directory. Default is the current directory.'))
c.add_opt('encoding', ['--encoding'], default=None,
help=_('Character encoding for HTML files. Default is to auto detect.'))
c.add_opt('zip', ['--zip'], default=False,
help=_('Create the output in a zip file. If this option is specified, the --output should be the name of a file not a directory.'))
traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
traversal('breadth_first', ['--breadth-first'], default=False,
@ -453,6 +477,8 @@ def config(defaults=None):
debug = c.add_group('debug', _('Options useful for debugging'))
debug('verbose', ['-v', '--verbose'], default=0, action='count',
help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
debug('pretty_print', ['--pretty-print'], default=False,
help=_('Output HTML is "pretty printed" for easier parsing by humans'))
return c
@ -487,7 +513,6 @@ def get_filelist(htmlfile, opts):
print '\tFound files...'
for f in filelist:
print '\t\t', f
return opf, filelist
def parse_content(filelist, opts):
@ -499,9 +524,10 @@ def parse_content(filelist, opts):
os.makedirs(rdir)
resource_map = {}
for htmlfile in filelist:
Parser(htmlfile, opts, os.path.join(opts.output, 'content'),
p = Parser(htmlfile, opts, os.path.join(opts.output, 'content'),
resource_map, filelist)
return resource_map
p.save()
return resource_map, p.htmlfile_map
def merge_metadata(htmlfile, opf, opts):
if opf:
@ -519,23 +545,27 @@ def merge_metadata(htmlfile, opf, opts):
mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
return mi
def create_metadata(basepath, mi, filelist, resources):
mi = OPFCreator(basepath, mi)
entries = [(f.path, None) for f in filelist] + [(f, None) for f in resources]
entries = [('content/'+f, None) for f in filelist] + [(f, None) for f in resources]
mi.create_manifest(entries)
mi.create_spine([f.path for f in filelist])
mi.create_spine(['content/'+f for f in filelist])
return mi
def create_dir(htmlfile, opts):
opf, filelist = get_filelist(htmlfile, opts)
mi = merge_metadata(htmlfile, opf, opts)
resources = [os.path.join(opts.output, 'content', f) for f in parse_content(filelist, opts).values()]
resource_map, htmlfile_map = parse_content(filelist, opts)
resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
if opf.cover and os.access(opf.cover, os.R_OK):
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1])
shutil.copyfile(opf.cover, cpath)
resources.append(cpath)
mi = create_metadata(opts.output, mi, filelist, resources)
mi.cover = cpath
spine = [htmlfile_map[f.path] for f in filelist]
mi = create_metadata(opts.output, mi, spine, resources)
with open(os.path.join(opts.output, 'metadata.opf'), 'wb') as f:
mi.render(f)
print 'Open ebook created in', opts.output
@ -560,11 +590,12 @@ def main(args=sys.argv):
return 1
htmlfile = args[1]
if opts.zip:
create_oebzip(htmlfile, opts)
else:
create_dir(htmlfile, opts)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -19,13 +19,13 @@ NONE = QVariant()
class JobManager(QAbstractTableModel):
wait_icon = QVariant(QIcon(':/images/jobs.svg'))
running_icon = QVariant(QIcon(':/images/exec.svg'))
error_icon = QVariant(QIcon(':/images/dialog_error.svg'))
done_icon = QVariant(QIcon(':/images/ok.svg'))
def __init__(self):
QAbstractTableModel.__init__(self)
self.wait_icon = QVariant(QIcon(':/images/jobs.svg'))
self.running_icon = QVariant(QIcon(':/images/exec.svg'))
self.error_icon = QVariant(QIcon(':/images/dialog_error.svg'))
self.done_icon = QVariant(QIcon(':/images/ok.svg'))
self.jobs = []
self.server = Server()
self.add_job = Dispatcher(self._add_job)

View File

@ -35,7 +35,7 @@ class Distribution(object):
('ImageMagick', '6.3.5', 'imagemagick', 'imagemagick', 'ImageMagick'),
('xdg-utils', '1.0.2', 'xdg-utils', 'xdg-utils', 'xdg-utils'),
('dbus-python', '0.82.2', 'dbus-python', 'python-dbus', 'dbus-python'),
('lxml', '1.3.3', 'lxml', 'python-lxml', 'python-lxml'),
('lxml', '2.0.5', 'lxml', 'python-lxml', 'python-lxml'),
('help2man', '1.36.4', 'help2man', 'help2man', 'help2man'),
]