IGN: Working HTML traversal and misc bug fixes

commit 6eb005d5c6 (parent 2adea64882)
@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 
 '''
-Recursively parse HTML files to find all linked files.
+Recursively parse HTML files to find all linked files. See :function:`traverse`.
 '''
 
 import sys, os, re
@@ -23,7 +23,7 @@ class Link(object):
         path = url.path
         if os.path.isabs(path):
             return path
-        return os.path.abspath(os.path.join(base, url))
+        return os.path.abspath(os.path.join(base, path))
 
     def __init__(self, url, base):
         '''
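
A minimal sketch (not part of the commit) of the bug this hunk fixes: joining `base` with the whole URL drags the fragment into the filesystem path, while joining with the parsed path component does not. The imports and sample values below are assumptions for illustration.

    # Sketch only: illustrates the join bug fixed above (POSIX paths assumed).
    import os
    from urlparse import urlparse
    from urllib import unquote

    base = u'/books'
    url = u'ch1.html#intro'
    parsed = urlparse(unquote(url))
    print os.path.abspath(os.path.join(base, url))          # /books/ch1.html#intro (old, wrong)
    print os.path.abspath(os.path.join(base, parsed.path))  # /books/ch1.html (fixed)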
@@ -32,12 +32,13 @@ class Link(object):
         Must be a unicode string.
         '''
         assert isinstance(url, unicode) and isinstance(base, unicode)
         self.url = url
         self.parsed_url = urlparse(unquote(self.url))
         self.is_local = self.parsed_url.scheme in ('', 'file')
-        self.path = None
-        self.fragment = self.parsed_url.fragment
-        if self.is_local:
+        self.is_internal = self.is_local and not bool(self.parsed_url.path)
+        self.path = None
+        self.fragment = self.parsed_url.fragment
+        if self.is_local and not self.is_internal:
             self.path = self.url_to_local_path(self.parsed_url, base)
 
     def __hash__(self):
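
A minimal sketch of the classification this hunk introduces, assuming the Python 2 stdlib urlparse/unquote and made-up URLs: a local link whose parsed path is empty (a bare fragment) is now flagged `is_internal` and is never resolved to a local file.

    # Sketch only: mirrors the is_local / is_internal checks added above.
    from urlparse import urlparse
    from urllib import unquote

    for url in (u'#section1', u'chapter2.html#top', u'http://example.com/x.html'):
        parsed = urlparse(unquote(url))
        is_local = parsed.scheme in ('', 'file')
        is_internal = is_local and not bool(parsed.path)
        print url, 'local:', is_local, 'internal:', is_internal
    # '#section1' -> local and internal (fragment only); 'chapter2.html#top' ->
    # local but not internal; the http:// URL -> neither.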
@ -46,89 +47,77 @@ class Link(object):
|
|||||||
return hash(self.path)
|
return hash(self.path)
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
if not (hasattr(other, 'url') and hasattr(other, 'path')):
|
return self.path == getattr(other, 'path', other)
|
||||||
return False
|
|
||||||
if self.path is None:
|
def __str__(self):
|
||||||
return self.url == other.url
|
return u'Link: %s --> %s'%(self.url, self.path)
|
||||||
return self.path == other.path
|
|
||||||
|
|
||||||
|
|
||||||
class IgnoreFile(Exception):
|
class IgnoreFile(Exception):
|
||||||
pass
|
|
||||||
|
def __init__(self, msg, errno):
|
||||||
|
Exception.__init__(self, msg)
|
||||||
|
self.doesnt_exist = errno == 2
|
||||||
|
self.errno = errno
|
||||||
|
|
||||||
class HTMLFile(object):
|
class HTMLFile(object):
|
||||||
'''
|
'''
|
||||||
Contains basic traversal information about an HTML file. This
|
Contains basic information about an HTML file. This
|
||||||
includes a recursive list of links to other files as well as
|
includes a list of links to other files as well as
|
||||||
the encoding of each file.
|
the encoding of each file. Also tries to detect if the file is not a HTML
|
||||||
|
file in which case :member:`is_binary` is set to True.
|
||||||
You can iterate over the tree of files rooted at this file
|
|
||||||
by calling either :method:`breadth_first` or :method:`depth_first`.
|
|
||||||
|
|
||||||
The encoding of the file is available as :member:`encoding`.
|
The encoding of the file is available as :member:`encoding`.
|
||||||
|
|
||||||
If the file is a binary file (i.e. if conversion to unicode fails)
|
|
||||||
:member:`is_binary` is set to `True`.
|
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
|
||||||
LINK_PAT = re.compile(
|
LINK_PAT = re.compile(
|
||||||
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
|
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
|
||||||
re.DOTALL|re.IGNORECASE)
|
re.DOTALL|re.IGNORECASE)
|
||||||
|
|
||||||
def __init__(self, path_to_html_file, level, max_levels=sys.maxint,
|
def __init__(self, path_to_html_file, level, encoding, verbose):
|
||||||
encoding=None, verbose=0):
|
|
||||||
'''
|
'''
|
||||||
:param level: The level of this file. Should be 0 for the root file.
|
:param level: The level of this file. Should be 0 for the root file.
|
||||||
:param max_levels: `level >= max_levels` the links in this file
|
|
||||||
will not be followed.
|
|
||||||
:param encoding: Use `encoding` to decode HTML.
|
:param encoding: Use `encoding` to decode HTML.
|
||||||
'''
|
'''
|
||||||
self.path = unicode_path(path_to_html_file, abs=True)
|
self.path = unicode_path(path_to_html_file, abs=True)
|
||||||
self.base = os.path.dirname(self.path)
|
self.base = os.path.dirname(self.path)
|
||||||
self.level = level
|
self.level = level
|
||||||
self.links = []
|
self.links = []
|
||||||
self.map = {}
|
|
||||||
self.is_binary = False
|
|
||||||
try:
|
try:
|
||||||
with open(self.path, 'rb') as f:
|
with open(self.path, 'rb') as f:
|
||||||
src = f.read()
|
src = f.read()
|
||||||
except IOError, err:
|
except IOError, err:
|
||||||
msg = 'Could not read from file: %s with error: %s'%
|
msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
|
||||||
(self.path, unicode(err))
|
|
||||||
if level == 0:
|
if level == 0:
|
||||||
raise IOError(msg)
|
raise IOError(msg)
|
||||||
if verbose:
|
raise IgnoreFile(msg, err.errno)
|
||||||
print msg
|
|
||||||
raise IgnoreFile
|
self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
|
||||||
if encoding is None:
|
|
||||||
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
|
if not self.is_binary:
|
||||||
self.encoding = encoding
|
if encoding is None:
|
||||||
|
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
|
||||||
|
self.encoding = encoding
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
src = src.decode(encoding, 'replace')
|
src = src.decode(encoding, 'replace')
|
||||||
except UnicodeDecodeError:
|
|
||||||
self.is_binary = True
|
|
||||||
if verbose > 1:
|
|
||||||
print self.path, 'is a binary file.'
|
|
||||||
else:
|
|
||||||
self.find_links(src)
|
self.find_links(src)
|
||||||
|
|
||||||
|
|
||||||
if self.level < max_levels:
|
|
||||||
rejects = []
|
def __eq__(self, other):
|
||||||
for link in self.links:
|
return self.path == getattr(other, 'path', other)
|
||||||
if link.path is not None:
|
|
||||||
try:
|
def __str__(self):
|
||||||
self.map[link.url] = HTMLFile(link.path, level+1,
|
return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
|
||||||
max_levels, encoding=encoding, verbose=verbose)
|
|
||||||
except IgnoreFile:
|
def __repr__(self):
|
||||||
rejects.append(link)
|
return str(self)
|
||||||
for link in rejects:
|
|
||||||
self.links.remove(link)
|
|
||||||
|
|
||||||
|
|
||||||
def find_links(self, src):
|
def find_links(self, src):
|
||||||
for match in self.LINK_PAT.finditer():
|
for match in self.LINK_PAT.finditer(src):
|
||||||
url = None
|
url = None
|
||||||
for i in ('url1', 'url2', 'url3'):
|
for i in ('url1', 'url2', 'url3'):
|
||||||
url = match.group(i)
|
url = match.group(i)
|
||||||
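
A minimal sketch, not part of the commit, exercising the same HTML_PAT and LINK_PAT regular expressions on a made-up snippet: it shows how the rewritten constructor decides a file is binary (no `<html` tag near the start) and how `find_links` pulls href values out of anchor tags.

    # Sketch only: HTML detection and href extraction on a made-up snippet.
    import re

    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
    LINK_PAT = re.compile(
        r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
        re.DOTALL|re.IGNORECASE)

    src = '<html><body><a href="ch1.html">one</a> <a href=\'ch2.html#s2\'>two</a></body></html>'
    print 'is_binary:', not bool(HTML_PAT.search(src[:1024]))  # False, it looks like HTML
    for match in LINK_PAT.finditer(src):
        url = None
        for i in ('url1', 'url2', 'url3'):
            url = match.group(i)
            if url:
                break
        print 'found href:', url   # ch1.html, then ch2.html#s2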
@@ -138,47 +127,73 @@ class HTMLFile(object):
                 if link not in self.links:
                     self.links.append(link)
 
-    def breadth_first(self, root=True):
-        '''
-        Walk over the tree of linked files (by `<a href>` links) breadth
-        first.
-
-        :param root: If `True` return `self` as the first file.
-        :return: A breadth-first iterator.
-        '''
-        if root:
-            yield self
-        for link in self.links:
-            if link.path is not None:
-                yield self.map[link.url]
-
-        for link in self.links:
-            if link.path is not None:
-                for hf in self.map[link.url].breadth_first(root=False):
-                    yield hf
-
-    def depth_first(self, root=True):
-        '''
-        Walk over the tree of linked files (by `<a href>` links) depth
-        first.
-
-        :param root: If `True` return `self` as the first file.
-        :return: A depth-first iterator.
-        '''
-        if root:
-            yield self
-        for link in self.links:
-            if link.path is not None:
-                yield self.map[link.url]
-                for hf in self.map[link.url].depth_first(root=False):
-                    yield hf
+
+def depth_first(root, flat, visited=set([])):
+    yield root
+    visited.add(root)
+    for link in root.links:
+        if link.path is not None and link not in visited:
+            try:
+                index = flat.index(link)
+            except ValueError: # Can happen if max_levels is used
+                continue
+            hf = flat[index]
+            if hf not in visited:
+                yield hf
+                visited.add(hf)
+                for hf in depth_first(hf, flat, visited):
+                    if hf not in visited:
+                        yield hf
+                        visited.add(hf)
+
+
+def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
+    '''
+    Recursively traverse all links in the HTML file.
+
+    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
+                       implies that no links in the root HTML file are followed.
+    :param encoding: Specify character encoding of HTML files. If `None` it is
+                     auto-detected.
+    :return: A pair of lists (breadth_first, depth_first). Each list contains
+             :class:`HTMLFile` objects.
+    '''
+    assert max_levels >= 0
+    level = 0
+    flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
+    next_level = list(flat)
+    while level < max_levels and len(next_level) > 0:
+        level += 1
+        nl = []
+        for hf in next_level:
+            rejects = []
+            for link in hf.links:
+                if link.path is None or link.path in flat:
+                    continue
+                try:
+                    nf = HTMLFile(link.path, level, encoding, verbose)
+                    nl.append(nf)
+                    flat.append(nf)
+                except IgnoreFile, err:
+                    rejects.append(link)
+                    if not err.doesnt_exist or verbose > 1:
+                        print str(err)
+            for link in rejects:
+                hf.links.remove(link)
+
+        next_level = list(nl)
+
+    return flat, list(depth_first(flat[0], flat))
 
 
 if __name__ == '__main__':
-    root = HTMLFile(sys.argv[1], 0, verbose=2)
-    print 'Depth first...'
-    for f in root.depth_first():
-        print f.path
-    print '\n\nBreadth first...'
-    for f in root.breadth_first():
-        print f.path
+    breadth_first, depth_first = traverse(sys.argv[1], verbose=2)
+    print 'Breadth first...'
+    for f in breadth_first: print f
+    print '\n\nDepth first...'
+    for f in depth_first: print f
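
A hedged usage sketch of the new module-level API: `traverse()` builds the breadth-first list level by level, and the `depth_first()` generator reorders it. The import path below is an assumption (the file name is not shown in this diff) and `index.html` is a hypothetical root file.

    # Sketch only: possible use of the new traverse() API.
    from calibre.ebooks.html import traverse   # assumed module path

    flat, depth = traverse(u'index.html', max_levels=1, verbose=2)
    print 'Breadth first:'
    for hf in flat: print hf     # HTMLFile:<level>:<a|b>:<path>
    print 'Depth first:'
    for hf in depth: print hf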
@@ -42,7 +42,7 @@ def build_forms(forms):
         dat = re.compile(r'QtGui.QApplication.translate\(.+?,\s+"(.+?)(?<!\\)",.+?\)', re.DOTALL).sub(r'_("\1")', dat)
 
         # Workaround bug in Qt 4.4 on Windows
-        if form.endswith('dialogs/config.ui') or form.endswith('dialogs/lrf_single.ui'):
+        if form.endswith('dialogs%sconfig.ui'%os.sep) or form.endswith('dialogs%slrf_single.ui'%os.sep):
             print 'Implementing Workaround for buggy pyuic in form', form
             dat = re.sub(r'= QtGui\.QTextEdit\(self\..*?\)', '= QtGui.QTextEdit()', dat)
             dat = re.sub(r'= QtGui\.QListWidget\(self\..*?\)', '= QtGui.QListWidget()', dat)
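
A minimal sketch of why the `os.sep` change matters: the form paths are built with `os.path.join`, so on Windows they contain backslashes and the old '/'-based `endswith` test never matched.

    # Sketch only: '/' vs os.sep in the endswith check.
    import os

    form = os.path.join('dialogs', 'config.ui')          # 'dialogs\\config.ui' on Windows
    print form.endswith('dialogs/config.ui')             # False on Windows
    print form.endswith('dialogs%sconfig.ui' % os.sep)   # True on any platform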
@@ -95,7 +95,7 @@ List the books available in the calibre database.
     parser.add_option('-s', '--search', default=None,
                       help=_('Filter the results by the search query. For the format of the search query, please see the search related documentation in the User Manual. Default is to do no filtering.'))
     opts, args = parser.parse_args(sys.argv[:1] + args)
-    fields = [f.strip().lower() for f in opts.fields.split(',')]
+    fields = [str(f.strip().lower()) for f in opts.fields.split(',')]
 
     if not set(fields).issubset(FIELDS):
         parser.print_help()