IGN:Working HTML traversal and misc bug fixes

This commit is contained in:
Kovid Goyal 2008-08-04 22:15:37 -07:00
parent 2adea64882
commit 6eb005d5c6
3 changed files with 114 additions and 99 deletions

View File

@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
''' '''
Recursively parse HTML files to find all linked files. Recursively parse HTML files to find all linked files. See :function:`traverse`.
''' '''
import sys, os, re import sys, os, re
@ -23,7 +23,7 @@ class Link(object):
path = url.path path = url.path
if os.path.isabs(path): if os.path.isabs(path):
return path return path
return os.path.abspath(os.path.join(base, url)) return os.path.abspath(os.path.join(base, path))
def __init__(self, url, base): def __init__(self, url, base):
''' '''
@ -32,12 +32,13 @@ class Link(object):
Must be a unicode string. Must be a unicode string.
''' '''
assert isinstance(url, unicode) and isinstance(base, unicode) assert isinstance(url, unicode) and isinstance(base, unicode)
self.url = url self.url = url
self.parsed_url = urlparse(unquote(self.url)) self.parsed_url = urlparse(unquote(self.url))
self.is_local = self.parsed_url.scheme in ('', 'file') self.is_local = self.parsed_url.scheme in ('', 'file')
self.path = None self.is_internal = self.is_local and not bool(self.parsed_url.path)
self.fragment = self.parsed_url.fragment self.path = None
if self.is_local: self.fragment = self.parsed_url.fragment
if self.is_local and not self.is_internal:
self.path = self.url_to_local_path(self.parsed_url, base) self.path = self.url_to_local_path(self.parsed_url, base)
def __hash__(self): def __hash__(self):
@ -46,89 +47,77 @@ class Link(object):
return hash(self.path) return hash(self.path)
def __eq__(self, other): def __eq__(self, other):
if not (hasattr(other, 'url') and hasattr(other, 'path')): return self.path == getattr(other, 'path', other)
return False
if self.path is None: def __str__(self):
return self.url == other.url return u'Link: %s --> %s'%(self.url, self.path)
return self.path == other.path
class IgnoreFile(Exception): class IgnoreFile(Exception):
pass
def __init__(self, msg, errno):
Exception.__init__(self, msg)
self.doesnt_exist = errno == 2
self.errno = errno
class HTMLFile(object): class HTMLFile(object):
''' '''
Contains basic traversal information about an HTML file. This Contains basic information about an HTML file. This
includes a recursive list of links to other files as well as includes a list of links to other files as well as
the encoding of each file. the encoding of each file. Also tries to detect if the file is not an HTML
file in which case :member:`is_binary` is set to True.
You can iterate over the tree of files rooted at this file
by calling either :method:`breadth_first` or :method:`depth_first`.
The encoding of the file is available as :member:`encoding`. The encoding of the file is available as :member:`encoding`.
If the file is a binary file (i.e. if conversion to unicode fails)
:member:`is_binary` is set to `True`.
''' '''
HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
LINK_PAT = re.compile( LINK_PAT = re.compile(
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))', r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
re.DOTALL|re.IGNORECASE) re.DOTALL|re.IGNORECASE)
def __init__(self, path_to_html_file, level, max_levels=sys.maxint, def __init__(self, path_to_html_file, level, encoding, verbose):
encoding=None, verbose=0):
''' '''
:param level: The level of this file. Should be 0 for the root file. :param level: The level of this file. Should be 0 for the root file.
:param max_levels: `level >= max_levels` the links in this file
will not be followed.
:param encoding: Use `encoding` to decode HTML. :param encoding: Use `encoding` to decode HTML.
''' '''
self.path = unicode_path(path_to_html_file, abs=True) self.path = unicode_path(path_to_html_file, abs=True)
self.base = os.path.dirname(self.path) self.base = os.path.dirname(self.path)
self.level = level self.level = level
self.links = [] self.links = []
self.map = {}
self.is_binary = False
try: try:
with open(self.path, 'rb') as f: with open(self.path, 'rb') as f:
src = f.read() src = f.read()
except IOError, err: except IOError, err:
msg = 'Could not read from file: %s with error: %s'% msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
(self.path, unicode(err))
if level == 0: if level == 0:
raise IOError(msg) raise IOError(msg)
if verbose: raise IgnoreFile(msg, err.errno)
print msg
raise IgnoreFile self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
if encoding is None:
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1] if not self.is_binary:
self.encoding = encoding if encoding is None:
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
self.encoding = encoding
try:
src = src.decode(encoding, 'replace') src = src.decode(encoding, 'replace')
except UnicodeDecodeError:
self.is_binary = True
if verbose > 1:
print self.path, 'is a binary file.'
else:
self.find_links(src) self.find_links(src)
if self.level < max_levels:
rejects = [] def __eq__(self, other):
for link in self.links: return self.path == getattr(other, 'path', other)
if link.path is not None:
try: def __str__(self):
self.map[link.url] = HTMLFile(link.path, level+1, return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
max_levels, encoding=encoding, verbose=verbose)
except IgnoreFile: def __repr__(self):
rejects.append(link) return str(self)
for link in rejects:
self.links.remove(link)
def find_links(self, src): def find_links(self, src):
for match in self.LINK_PAT.finditer(): for match in self.LINK_PAT.finditer(src):
url = None url = None
for i in ('url1', 'url2', 'url3'): for i in ('url1', 'url2', 'url3'):
url = match.group(i) url = match.group(i)
@ -138,47 +127,73 @@ class HTMLFile(object):
if link not in self.links: if link not in self.links:
self.links.append(link) self.links.append(link)
def breadth_first(self, root=True):
'''
Walk over the tree of linked files (by `<a href>` links) breadth
first.
:param root: If `True` return `self` as the first file. def depth_first(root, flat, visited=set([])):
:return: A breadth-first iterator. yield root
''' visited.add(root)
if root: for link in root.links:
yield self if link.path is not None and link not in visited:
for link in self.links: try:
if link.path is not None: index = flat.index(link)
yield self.map[link.url] except ValueError: # Can happen if max_levels is used
continue
for link in self.links: hf = flat[index]
if link.path is not None: if hf not in visited:
for hf in self.map[link.url].breadth_first(root=False): yield hf
yield hf visited.add(hf)
for hf in depth_first(hf, flat, visited):
def depth_first(self, root=True): if hf not in visited:
''' yield hf
Walk over the tree of linked files (by `<a href>` links) depth visited.add(hf)
first.
:param root: If `True` return `self` as the first file. def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
:return: A depth-first iterator. '''
''' Recursively traverse all links in the HTML file.
if root:
yield self
for link in self.links:
if link.path is not None:
yield self.map[link.url]
for hf in self.map[link.url].depth_first(root=False):
yield hf
:param max_levels: Maximum levels of recursion. Must be non-negative. 0
implies that no links in the root HTML file are followed.
:param encoding: Specify character encoding of HTML files. If `None` it is
auto-detected.
:return: A pair of lists (breadth_first, depth_first). Each list contains
:class:`HTMLFile` objects.
'''
assert max_levels >= 0
level = 0
flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
next_level = list(flat)
while level < max_levels and len(next_level) > 0:
level += 1
nl = []
for hf in next_level:
rejects = []
for link in hf.links:
if link.path is None or link.path in flat:
continue
try:
nf = HTMLFile(link.path, level, encoding, verbose)
nl.append(nf)
flat.append(nf)
except IgnoreFile, err:
rejects.append(link)
if not err.doesnt_exist or verbose > 1:
print str(err)
for link in rejects:
hf.links.remove(link)
next_level = list(nl)
return flat, list(depth_first(flat[0], flat))
if __name__ == '__main__': if __name__ == '__main__':
root = HTMLFile(sys.argv[1], 0, verbose=2) breadth_first, depth_first = traverse(sys.argv[1], verbose=2)
print 'Depth first...' print 'Breadth first...'
for f in root.depth_first(): for f in breadth_first: print f
print f.path print '\n\nDepth first...'
print '\n\nBreadth first...' for f in depth_first: print f
for f in root.breadth_first():
print f.path

View File

@ -42,7 +42,7 @@ def build_forms(forms):
dat = re.compile(r'QtGui.QApplication.translate\(.+?,\s+"(.+?)(?<!\\)",.+?\)', re.DOTALL).sub(r'_("\1")', dat) dat = re.compile(r'QtGui.QApplication.translate\(.+?,\s+"(.+?)(?<!\\)",.+?\)', re.DOTALL).sub(r'_("\1")', dat)
# Workaround bug in Qt 4.4 on Windows # Workaround bug in Qt 4.4 on Windows
if form.endswith('dialogs/config.ui') or form.endswith('dialogs/lrf_single.ui'): if form.endswith('dialogs%sconfig.ui'%os.sep) or form.endswith('dialogs%slrf_single.ui'%os.sep):
print 'Implementing Workaround for buggy pyuic in form', form print 'Implementing Workaround for buggy pyuic in form', form
dat = re.sub(r'= QtGui\.QTextEdit\(self\..*?\)', '= QtGui.QTextEdit()', dat) dat = re.sub(r'= QtGui\.QTextEdit\(self\..*?\)', '= QtGui.QTextEdit()', dat)
dat = re.sub(r'= QtGui\.QListWidget\(self\..*?\)', '= QtGui.QListWidget()', dat) dat = re.sub(r'= QtGui\.QListWidget\(self\..*?\)', '= QtGui.QListWidget()', dat)

View File

@ -95,7 +95,7 @@ List the books available in the calibre database.
parser.add_option('-s', '--search', default=None, parser.add_option('-s', '--search', default=None,
help=_('Filter the results by the search query. For the format of the search query, please see the search related documentation in the User Manual. Default is to do no filtering.')) help=_('Filter the results by the search query. For the format of the search query, please see the search related documentation in the User Manual. Default is to do no filtering.'))
opts, args = parser.parse_args(sys.argv[:1] + args) opts, args = parser.parse_args(sys.argv[:1] + args)
fields = [f.strip().lower() for f in opts.fields.split(',')] fields = [str(f.strip().lower()) for f in opts.fields.split(',')]
if not set(fields).issubset(FIELDS): if not set(fields).issubset(FIELDS):
parser.print_help() parser.print_help()