diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py
index f405040c77..d515c606d3 100644
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -30,10 +30,10 @@ from calibre.utils.zipfile import ZipFile
from cssutils import CSSParser
class HTMLElement(HtmlElement):
-
+
@apply
def specified_font_size():
-
+
def fget(self):
ans = self.get('specified_font_size', '')
if not ans:
@@ -41,12 +41,12 @@ class HTMLElement(HtmlElement):
if ans.startswith('f'):
return functools.partial(operator.mul, float(ans[1:]))
return float(ans)
-
+
def fset(self, val):
self.set('specified_font_size', ('f'+repr(val(1))) if callable(val) else repr(val))
-
+
return property(fget=fget, fset=fset)
-
+
@apply
def computed_font_size():
def fget(self):
@@ -54,48 +54,48 @@ class HTMLElement(HtmlElement):
if ans == '':
return None
return float(ans)
-
+
def fset(self, val):
self.set('computed_font_size', repr(val))
-
+
return property(fget=fget, fset=fset)
-
+
def remove_font_size_information(self):
    '''
    Strip the font size bookkeeping attributes from this element and from
    every element yielded by ``self.iter()``.

    The attributes removed are ``computed_font_size`` and
    ``specified_font_size``. Elements that carry neither are left untouched.
    '''
    attribute_names = tuple(
        prefix + '_font_size' for prefix in ('computed', 'specified'))
    for element in self.iter():
        attributes = element.attrib
        for attribute in attribute_names:
            # A default is supplied so that a missing attribute is not an error
            attributes.pop(attribute, None)
-
+
def getpath(self):
    '''
    Return the path of this element, as computed by the ``getpath`` method
    of the tree returned by ``self.getroottree()``.
    '''
    tree = self.getroottree()
    return tree.getpath(self)
class Lookup(HtmlElementClassLookup):
-
+
def lookup(self, node_type, document, namespace, name):
    '''
    Return the class to instantiate for a node.

    Nodes of type ``'element'`` are always mapped to :class:`HTMLElement`.
    Every other node type is delegated to
    ``HtmlElementClassLookup.lookup``.
    '''
    if node_type != 'element':
        return HtmlElementClassLookup.lookup(
            self, node_type, document, namespace, name)
    return HTMLElement
class HTMLParser(_HTMLParser):
-
+
def __init__(self, **kwargs):
    '''
    Initialise the base parser with ``kwargs`` and install a
    :class:`Lookup` instance as the element class lookup.
    '''
    super(HTMLParser, self).__init__(**kwargs)
    element_lookup = Lookup()
    self.set_element_class_lookup(element_lookup)
-
+
parser = HTMLParser()
def fromstring(raw, **kw):
    '''
    Parse ``raw`` with ``_fromstring``, always using the module level
    ``parser`` instance.

    Any extra keyword arguments are forwarded unchanged. Passing ``parser``
    as a keyword is an error, since it is already supplied here.
    '''
    root = _fromstring(raw, parser=parser, **kw)
    return root
def tostring(root, pretty_print=False):
- return _tostring(root, encoding='utf-8', method='xml',
- include_meta_content_type=True,
+ return _tostring(root, encoding='utf-8', method='xml',
+ include_meta_content_type=True,
pretty_print=pretty_print)
-
+
class Link(object):
'''
Represents a link in a HTML file.
'''
-
+
@classmethod
def url_to_local_path(cls, url, base):
path = urlunparse(('', '', url.path, url.params, url.query, ''))
@@ -103,7 +103,7 @@ class Link(object):
if os.path.isabs(path):
return path
return os.path.abspath(os.path.join(base, path))
-
+
def __init__(self, url, base):
'''
:param url: The url this link points to. Must be an unquoted unicode string.
@@ -127,13 +127,13 @@ class Link(object):
def __eq__(self, other):
    '''
    Compare by path.

    ``other`` may be any object with a ``path`` attribute. If it has none,
    ``other`` itself is compared against ``self.path``.
    '''
    other_path = getattr(other, 'path', other)
    return self.path == other_path
-
+
def __str__(self):
- return u'Link: %s --> %s'%(self.url, self.path)
-
+ return u'Link: %s --> %s'%(self.url, self.path)
+
class IgnoreFile(Exception):
-
+
def __init__(self, msg, errno):
Exception.__init__(self, msg)
self.doesnt_exist = errno == 2
@@ -148,13 +148,13 @@ class HTMLFile(object):
The encoding of the file is available as :member:`encoding`.
'''
-
+
HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
TITLE_PAT = re.compile('
([^<>]+)', re.IGNORECASE)
LINK_PAT = re.compile(
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P[^"]+)")|(?:\'(?P[^\']+)\')|(?P[^\s>]+))',
re.DOTALL|re.IGNORECASE)
-
+
def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
'''
:param level: The level of this file. Should be 0 for the root file.
@@ -167,7 +167,7 @@ class HTMLFile(object):
self.level = level
self.referrer = referrer
self.links = []
-
+
try:
with open(self.path, 'rb') as f:
src = f.read()
@@ -176,7 +176,7 @@ class HTMLFile(object):
if level == 0:
raise IOError(msg)
raise IgnoreFile(msg, err.errno)
-
+
self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
if not self.is_binary:
if encoding is None:
@@ -189,19 +189,19 @@ class HTMLFile(object):
match = self.TITLE_PAT.search(src)
self.title = match.group(1) if match is not None else self.title
self.find_links(src)
-
-
-
+
+
+
def __eq__(self, other):
    '''
    Compare by path.

    ``other`` may be any object with a ``path`` attribute. If it has none,
    ``other`` itself is compared against ``self.path``.
    '''
    candidate = getattr(other, 'path', other)
    return self.path == candidate
-
+
def __str__(self):
    '''
    Return a short description of the form ``HTMLFile:<level>:<flag>:<path>``,
    where the flag is ``b`` when ``self.is_binary`` is true and ``a``
    otherwise.
    '''
    level = self.level
    flag = 'b' if self.is_binary else 'a'
    return u'HTMLFile:%d:%s:%s'%(level, flag, self.path)
-
+
def __repr__(self):
    '''
    Use the same text for ``repr()`` as for ``str()``.
    '''
    description = str(self)
    return description
-
-
+
+
def find_links(self, src):
for match in self.LINK_PAT.finditer(src):
url = None
@@ -212,7 +212,7 @@ class HTMLFile(object):
link = self.resolve(url)
if link not in self.links:
self.links.append(link)
-
+
def resolve(self, url):
    '''
    Wrap ``url`` in a :class:`Link`, using ``self.base`` as the base
    against which the link is resolved.
    '''
    base = self.base
    return Link(url, base)
@@ -234,13 +234,13 @@ def depth_first(root, flat, visited=set([])):
if hf not in visited:
yield hf
visited.add(hf)
-
-
+
+
def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
'''
Recursively traverse all links in the HTML file.
-
- :param max_levels: Maximum levels of recursion. Must be non-negative. 0
+
+ :param max_levels: Maximum levels of recursion. Must be non-negative. 0
implies that no links in the root HTML file are followed.
:param encoding: Specify character encoding of HTML files. If `None` it is
auto-detected.
@@ -271,7 +271,7 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
print repr(err)
for link in rejects:
hf.links.remove(link)
-
+
next_level = list(nl)
orec = sys.getrecursionlimit()
sys.setrecursionlimit(500000)
@@ -279,14 +279,14 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
return flat, list(depth_first(flat[0], flat))
finally:
sys.setrecursionlimit(orec)
-
-
+
+
def opf_traverse(opf_reader, verbose=0, encoding=None):
'''
Return a list of :class:`HTMLFile` objects in the order specified by the
`<spine>` element of the OPF.
-
- :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
+
+ :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
:param encoding: Specify character encoding of HTML files. If `None` it is
auto-detected.
'''
@@ -317,7 +317,7 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
print 'WARNING: OPF spine item %s does not exist'%path
ans = [f for f in ans if not f.is_binary]
return ans
-
+
convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
_span_pat = re.compile('', re.DOTALL|re.IGNORECASE)
@@ -326,20 +326,20 @@ def sanitize_head(match):
x = match.group(1)
x = _span_pat.sub('', x)
return '\n'+x+'\n'
-
+
class PreProcessor(object):
PREPROCESS = [
# Some idiotic HTML generators (Frontpage I'm looking at you)
# Put all sorts of crap into <head>. This messes up lxml
- (re.compile(r']*>(.*?)', re.IGNORECASE|re.DOTALL),
+ (re.compile(r']*>(.*?)', re.IGNORECASE|re.DOTALL),
sanitize_head),
# Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities),
# Remove the ', re.IGNORECASE),
+ (re.compile(r'{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
lambda match: ''),
]
-
+
# Fix pdftohtml markup
PDFTOHTML = [
# Remove tags
@@ -348,20 +348,20 @@ class PreProcessor(object):
(re.compile(r'\d+ ', re.IGNORECASE), lambda match: ''),
# Remove and replace