From 6eb005d5c6e0186cf90431f39c9c172187f1cb4c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 4 Aug 2008 22:15:37 -0700
Subject: [PATCH] IGN:Working HTML traversal and misc bug fixes

---
 src/calibre/ebooks/epub/traverse.py | 209 +++++++++++++++-------------
 src/calibre/gui2/make.py            |   2 +-
 src/calibre/library/cli.py          |   2 +-
 3 files changed, 114 insertions(+), 99 deletions(-)
diff --git a/src/calibre/ebooks/epub/traverse.py b/src/calibre/ebooks/epub/traverse.py
index 670e056028..d5d019f376 100644
--- a/src/calibre/ebooks/epub/traverse.py
+++ b/src/calibre/ebooks/epub/traverse.py
@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 
 '''
-Recursively parse HTML files to find all linked files.
+Recursively parse HTML files to find all linked files. See :function:`traverse`.
 '''
 
 import sys, os, re
@@ -23,7 +23,7 @@ class Link(object):
         path = url.path
         if os.path.isabs(path):
             return path
-        return os.path.abspath(os.path.join(base, url))
+        return os.path.abspath(os.path.join(base, path))
     
     def __init__(self, url, base):
         '''
@@ -32,12 +32,13 @@ class Link(object):
                      Must be a unicode string.
         '''
         assert isinstance(url, unicode) and isinstance(base, unicode)
-        self.url        = url
-        self.parsed_url = urlparse(unquote(self.url))
-        self.is_local   = self.parsed_url.scheme in ('', 'file')
-        self.path = None
-        self.fragment = self.parsed_url.fragment 
-        if self.is_local:
+        self.url         = url
+        self.parsed_url  = urlparse(unquote(self.url))
+        self.is_local    = self.parsed_url.scheme in ('', 'file')
+        self.is_internal = self.is_local and not bool(self.parsed_url.path)
+        self.path        = None
+        self.fragment    = self.parsed_url.fragment 
+        if self.is_local and not self.is_internal:
             self.path = self.url_to_local_path(self.parsed_url, base)
 
     def __hash__(self):
@@ -46,89 +47,77 @@ class Link(object):
         return hash(self.path)
 
     def __eq__(self, other):
-        if not (hasattr(other, 'url') and hasattr(other, 'path')):
-            return False
-        if self.path is None:
-            return self.url == other.url
-        return self.path == other.path 
+        return self.path == getattr(other, 'path', other)
+    
+    def __str__(self):
+        return u'Link: %s --> %s'%(self.url, self.path) 
         
 
 class IgnoreFile(Exception):
-    pass
+    
+    def __init__(self, msg, errno):
+        Exception.__init__(self, msg)
+        self.doesnt_exist = errno == 2
+        self.errno = errno
 
 class HTMLFile(object):
     '''
-    Contains basic traversal information about an HTML file. This
-    includes a recursive list of links to other files as well as
-    the encoding of each file.
-
-    You can iterate over the tree of files rooted at this file
-    by calling either :method:`breadth_first` or :method:`depth_first`.
+    Contains basic information about an HTML file. This
+    includes a list of links to other files as well as
+    the encoding of each file. Also tries to detect if the file is not a HTML
+    file in which case :member:`is_binary` is set to True.
 
     The encoding of the file is available as :member:`encoding`.
-
-    If the file is a binary file (i.e. if conversion to unicode fails)
-    :member:`is_binary` is set to `True`.
     '''
-
+    
+    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
     LINK_PAT = re.compile(
     r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
     re.DOTALL|re.IGNORECASE)
     
-    def __init__(self, path_to_html_file, level, max_levels=sys.maxint,
-                 encoding=None, verbose=0):
+    def __init__(self, path_to_html_file, level, encoding, verbose):
         '''
         :param level: The level of this file. Should be 0 for the root file.
-        :param max_levels: `level >= max_levels` the links in this file
-                            will not be followed. 
         :param encoding: Use `encoding` to decode HTML.
         '''
         self.path  = unicode_path(path_to_html_file, abs=True)
         self.base  = os.path.dirname(self.path)
         self.level = level
         self.links = []
-        self.map   = {}
-        self.is_binary  = False
+        
         try:
             with open(self.path, 'rb') as f:
                 src = f.read()
         except IOError, err:
-            msg = 'Could not read from file: %s with error: %s'%
-                            (self.path, unicode(err))
+            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
             if level == 0:
                 raise IOError(msg)
-            if verbose:
-                print msg
-            raise IgnoreFile
-        if encoding is None:
-            encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
-        self.encoding = encoding
+            raise IgnoreFile(msg, err.errno)
+        
+        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
+        
+        if not self.is_binary:
+            if encoding is None:
+                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
+                self.encoding = encoding
 
-        
-        try:
             src = src.decode(encoding, 'replace')
-        except UnicodeDecodeError:
-            self.is_binary = True
-            if verbose > 1:
-                print self.path, 'is a binary file.'
-        else:
             self.find_links(src)
+                
         
-        if self.level < max_levels:
-            rejects = []
-            for link in self.links:
-                if link.path is not None:
-                    try:
-                        self.map[link.url] = HTMLFile(link.path, level+1,
-                            max_levels, encoding=encoding, verbose=verbose)
-                    except IgnoreFile:
-                        rejects.append(link)
-            for link in rejects:
-                self.links.remove(link)
+                    
+    def __eq__(self, other):
+        return self.path == getattr(other, 'path', other)
+    
+    def __str__(self):
+        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
+    
+    def __repr__(self):
+        return str(self)
                     
         
     def find_links(self, src):
-        for match in self.LINK_PAT.finditer():
+        for match in self.LINK_PAT.finditer(src):
             url = None
             for i in ('url1', 'url2', 'url3'):
                 url = match.group(i)
@@ -138,47 +127,73 @@ class HTMLFile(object):
             if link not in self.links:
                 self.links.append(link)
 
-    def breadth_first(self, root=True):
-        '''
-        Walk over the tree of linked files (by `<a href>` links) breadth
-        first.
 
-        :param root: If `True` return `self` as the first file.
-        :return: A breadth-first iterator.
-        '''
-        if root:
-            yield self
-        for link in self.links:
-            if link.path is not None:
-                yield self.map[link.url]
-
-        for link in self.links:
-            if link.path is not None:
-                for hf in self.map[link.url].breadth_first(root=False):
-                    yield hf
-
-    def depth_first(self, root=True):
-        '''
-        Walk over the tree of linked files (by `<a href>` links) depth
-        first.
-
-        :param root: If `True` return `self` as the first file.
-        :return: A depth-first iterator.
-        '''
-        if root:
-            yield self
-        for link in self.links:
-            if link.path is not None:
-                yield self.map[link.url]
-                for hf in self.map[link.url].depth_first(root=False):
-                    yield hf
+def depth_first(root, flat, visited=set([])):
+    yield root
+    visited.add(root)
+    for link in root.links:
+        if link.path is not None and link not in visited:
+            try:
+                index = flat.index(link)
+            except ValueError: # Can happen if max_levels is used
+                continue
+            hf = flat[index]
+            if hf not in visited:
+                yield hf
+                visited.add(hf)
+                for hf in depth_first(hf, flat, visited):
+                    if hf not in visited:
+                        yield hf
+                        visited.add(hf)
+        
+                                
+def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
+    '''
+    Recursively traverse all links in the HTML file.
     
+    :param max_levels: Maximum levels of recursion. Must be non-negative. 0 
+                       implies that no links in hte root HTML file are followed.
+    :param encoding:   Specify character encoding of HTML files. If `None` it is
+                       auto-detected.
+    :return:           A pair of lists (breadth_first, depth_first). Each list contains
+                       :class:`HTMLFile` objects.
+    '''
+    assert max_levels >= 0
+    level = 0
+    flat =  [HTMLFile(path_to_html_file, level, encoding, verbose)]
+    next_level = list(flat)
+    while level < max_levels and len(next_level) > 0:
+        level += 1
+        nl = []
+        for hf in next_level:
+            rejects = []
+            for link in hf.links:
+                if link.path is None or link.path in flat:
+                    continue
+                try:
+                    nf = HTMLFile(link.path, level, encoding, verbose)
+                    nl.append(nf)
+                    flat.append(nf)
+                except IgnoreFile, err:
+                    rejects.append(link)
+                    if not err.doesnt_exist or verbose > 1:
+                        print str(err)
+            for link in rejects:
+                hf.links.remove(link)
+                
+        next_level = list(nl)
+        
+    return flat, list(depth_first(flat[0], flat))
+    
+    
+                
+            
+    
+
 if __name__ == '__main__':
-    root = HTMLFile(sys.argv[1], 0, verbose=2)
-    print 'Depth first...'
-    for f in root.depth_first():
-        print f.path
-    print '\n\nBreadth first...'
-    for f in root.breadth_first():
-        print f.path
+    breadth_first, depth_first = traverse(sys.argv[1], verbose=2)
+    print 'Breadth first...'
+    for f in breadth_first: print f
+    print '\n\nDepth first...'
+    for f in depth_first: print f
     
diff --git a/src/calibre/gui2/make.py b/src/calibre/gui2/make.py
index 64b957b46b..fea040ca71 100644
--- a/src/calibre/gui2/make.py
+++ b/src/calibre/gui2/make.py
@@ -42,7 +42,7 @@ def build_forms(forms):
             dat = re.compile(r'QtGui.QApplication.translate\(.+?,\s+"(.+?)(?<!\\)",.+?\)', re.DOTALL).sub(r'_("\1")', dat)
             
             # Workaround bug in Qt 4.4 on Windows
-            if form.endswith('dialogs/config.ui') or form.endswith('dialogs/lrf_single.ui'):
+            if form.endswith('dialogs%sconfig.ui'%os.sep) or form.endswith('dialogs%slrf_single.ui'%os.sep):
                 print 'Implementing Workaround for buggy pyuic in form', form
                 dat = re.sub(r'= QtGui\.QTextEdit\(self\..*?\)', '= QtGui.QTextEdit()', dat) 
                 dat = re.sub(r'= QtGui\.QListWidget\(self\..*?\)', '= QtGui.QListWidget()', dat)
diff --git a/src/calibre/library/cli.py b/src/calibre/library/cli.py
index d5eea9696e..fe1fe6580b 100644
--- a/src/calibre/library/cli.py
+++ b/src/calibre/library/cli.py
@@ -95,7 +95,7 @@ List the books available in the calibre database.
     parser.add_option('-s', '--search', default=None, 
                       help=_('Filter the results by the search query. For the format of the search query, please see the search related documentation in the User Manual. Default is to do no filtering.'))
     opts, args = parser.parse_args(sys.argv[:1] + args)
-    fields = [f.strip().lower() for f in opts.fields.split(',')]
+    fields = [str(f.strip().lower()) for f in opts.fields.split(',')]
     
     if not set(fields).issubset(FIELDS):
         parser.print_help()