IGN:Working HTML traversal and misc bug fixes

2025-11-09 16:23:22 -05:00 · 2008-08-04 22:15:37 -07:00 · 2008-08-04 22:15:37 -07:00 · 6eb005d5c6
commit 6eb005d5c6
parent 2adea64882
3 changed files with 114 additions and 99 deletions
--- a/src/calibre/ebooks/epub/traverse.py
+++ b/src/calibre/ebooks/epub/traverse.py
@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 '''
-Recursively parse HTML files to find all linked files.
+Recursively parse HTML files to find all linked files. See :function:`traverse`.
 '''
 import sys, os, re
@ -23,7 +23,7 @@ class Link(object):
        path = url.path
        if os.path.isabs(path):
            return path
-        return os.path.abspath(os.path.join(base, url))
+        return os.path.abspath(os.path.join(base, path))
    def __init__(self, url, base):
        '''
@ -32,12 +32,13 @@ class Link(object):
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
-        self.url        = url
+        self.url         = url
-        self.parsed_url = urlparse(unquote(self.url))
+        self.parsed_url  = urlparse(unquote(self.url))
-        self.is_local   = self.parsed_url.scheme in ('', 'file')
+        self.is_local    = self.parsed_url.scheme in ('', 'file')
-        self.path = None
+        self.is_internal = self.is_local and not bool(self.parsed_url.path)
-        self.fragment = self.parsed_url.fragment 
+        self.path        = None
-        if self.is_local:
+        self.fragment    = self.parsed_url.fragment 
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(self.parsed_url, base)
    def __hash__(self):
@ -46,89 +47,77 @@ class Link(object):
        return hash(self.path)
    def __eq__(self, other):
-        if not (hasattr(other, 'url') and hasattr(other, 'path')):
+        return self.path == getattr(other, 'path', other)
-            return False
+    
-        if self.path is None:
+    def __str__(self):
-            return self.url == other.url
+        return u'Link: %s --> %s'%(self.url, self.path) 
        return self.path == other.path 
 class IgnoreFile(Exception):
-    pass
+    
    def __init__(self, msg, errno):
        Exception.__init__(self, msg)
        self.doesnt_exist = errno == 2
        self.errno = errno
 class HTMLFile(object):
    '''
-    Contains basic traversal information about an HTML file. This
+    Contains basic information about an HTML file. This
-    includes a recursive list of links to other files as well as
+    includes a list of links to other files as well as
-    the encoding of each file.
+    the encoding of each file. Also tries to detect if the file is not a HTML
-
+    file in which case :member:`is_binary` is set to True.
    You can iterate over the tree of files rooted at this file
    by calling either :method:`breadth_first` or :method:`depth_first`.
    The encoding of the file is available as :member:`encoding`.
    If the file is a binary file (i.e. if conversion to unicode fails)
    :member:`is_binary` is set to `True`.
    '''
-
+    
    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
    LINK_PAT = re.compile(
    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
    re.DOTALL|re.IGNORECASE)
-    def __init__(self, path_to_html_file, level, max_levels=sys.maxint,
+    def __init__(self, path_to_html_file, level, encoding, verbose):
                 encoding=None, verbose=0):
        '''
        :param level: The level of this file. Should be 0 for the root file.
        :param max_levels: `level >= max_levels` the links in this file
                            will not be followed. 
        :param encoding: Use `encoding` to decode HTML.
        '''
        self.path  = unicode_path(path_to_html_file, abs=True)
        self.base  = os.path.dirname(self.path)
        self.level = level
        self.links = []
-        self.map   = {}
+        
        self.is_binary  = False
        try:
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError, err:
-            msg = 'Could not read from file: %s with error: %s'%
+            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
                            (self.path, unicode(err))
            if level == 0:
                raise IOError(msg)
-            if verbose:
+            raise IgnoreFile(msg, err.errno)
-                print msg
+        
-            raise IgnoreFile
+        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
-        if encoding is None:
+        
-            encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
+        if not self.is_binary:
-        self.encoding = encoding
+            if encoding is None:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
                self.encoding = encoding
        try:
            src = src.decode(encoding, 'replace')
        except UnicodeDecodeError:
            self.is_binary = True
            if verbose > 1:
                print self.path, 'is a binary file.'
        else:
            self.find_links(src)
-        if self.level < max_levels:
+                    
-            rejects = []
+    def __eq__(self, other):
-            for link in self.links:
+        return self.path == getattr(other, 'path', other)
-                if link.path is not None:
+    
-                    try:
+    def __str__(self):
-                        self.map[link.url] = HTMLFile(link.path, level+1,
+        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
-                            max_levels, encoding=encoding, verbose=verbose)
+    
-                    except IgnoreFile:
+    def __repr__(self):
-                        rejects.append(link)
+        return str(self)
            for link in rejects:
                self.links.remove(link)
    def find_links(self, src):
-        for match in self.LINK_PAT.finditer():
+        for match in self.LINK_PAT.finditer(src):
            url = None
            for i in ('url1', 'url2', 'url3'):
                url = match.group(i)
@ -138,47 +127,73 @@ class HTMLFile(object):
            if link not in self.links:
                self.links.append(link)
    def breadth_first(self, root=True):
        '''
        Walk over the tree of linked files (by `<a href>` links) breadth
        first.
-        :param root: If `True` return `self` as the first file.
+def depth_first(root, flat, visited=set([])):
-        :return: A breadth-first iterator.
+    yield root
-        '''
+    visited.add(root)
-        if root:
+    for link in root.links:
-            yield self
+        if link.path is not None and link not in visited:
-        for link in self.links:
+            try:
-            if link.path is not None:
+                index = flat.index(link)
-                yield self.map[link.url]
+            except ValueError: # Can happen if max_levels is used
-
+                continue
-        for link in self.links:
+            hf = flat[index]
-            if link.path is not None:
+            if hf not in visited:
-                for hf in self.map[link.url].breadth_first(root=False):
+                yield hf
-                    yield hf
+                visited.add(hf)
-
+                for hf in depth_first(hf, flat, visited):
-    def depth_first(self, root=True):
+                    if hf not in visited:
-        '''
+                        yield hf
-        Walk over the tree of linked files (by `<a href>` links) depth
+                        visited.add(hf)
-        first.
+        
-
+                                
-        :param root: If `True` return `self` as the first file.
+def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
-        :return: A depth-first iterator.
+    '''
-        '''
+    Recursively traverse all links in the HTML file.
        if root:
            yield self
        for link in self.links:
            if link.path is not None:
                yield self.map[link.url]
                for hf in self.map[link.url].depth_first(root=False):
                    yield hf
    :param max_levels: Maximum levels of recursion. Must be non-negative. 0 
                       implies that no links in the root HTML file are followed.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    :return:           A pair of lists (breadth_first, depth_first). Each list contains
                       :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    level = 0
    flat =  [HTMLFile(path_to_html_file, level, encoding, verbose)]
    next_level = list(flat)
    while level < max_levels and len(next_level) > 0:
        level += 1
        nl = []
        for hf in next_level:
            rejects = []
            for link in hf.links:
                if link.path is None or link.path in flat:
                    continue
                try:
                    nf = HTMLFile(link.path, level, encoding, verbose)
                    nl.append(nf)
                    flat.append(nf)
                except IgnoreFile, err:
                    rejects.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print str(err)
            for link in rejects:
                hf.links.remove(link)
        next_level = list(nl)
    return flat, list(depth_first(flat[0], flat))
 if __name__ == '__main__':
-    root = HTMLFile(sys.argv[1], 0, verbose=2)
+    breadth_first, depth_first = traverse(sys.argv[1], verbose=2)
-    print 'Depth first...'
+    print 'Breadth first...'
-    for f in root.depth_first():
+    for f in breadth_first: print f
-        print f.path
+    print '\n\nDepth first...'
-    print '\n\nBreadth first...'
+    for f in depth_first: print f
    for f in root.breadth_first():
        print f.path
--- a/src/calibre/gui2/make.py
+++ b/src/calibre/gui2/make.py
@ -42,7 +42,7 @@ def build_forms(forms):
            dat = re.compile(r'QtGui.QApplication.translate\(.+?,\s+"(.+?)(?<!\\)",.+?\)', re.DOTALL).sub(r'_("\1")', dat)
            # Workaround bug in Qt 4.4 on Windows
-            if form.endswith('dialogs/config.ui') or form.endswith('dialogs/lrf_single.ui'):
+            if form.endswith('dialogs%sconfig.ui'%os.sep) or form.endswith('dialogs%slrf_single.ui'%os.sep):
                print 'Implementing Workaround for buggy pyuic in form', form
                dat = re.sub(r'= QtGui\.QTextEdit\(self\..*?\)', '= QtGui.QTextEdit()', dat) 
                dat = re.sub(r'= QtGui\.QListWidget\(self\..*?\)', '= QtGui.QListWidget()', dat)
--- a/src/calibre/library/cli.py
+++ b/src/calibre/library/cli.py
@ -95,7 +95,7 @@ List the books available in the calibre database.
    parser.add_option('-s', '--search', default=None, 
                      help=_('Filter the results by the search query. For the format of the search query, please see the search related documentation in the User Manual. Default is to do no filtering.'))
    opts, args = parser.parse_args(sys.argv[:1] + args)
-    fields = [f.strip().lower() for f in opts.fields.split(',')]
+    fields = [str(f.strip().lower()) for f in opts.fields.split(',')]
    if not set(fields).issubset(FIELDS):
        parser.print_help()