MOBI Output: Fix bug that caused MOBI conversion to sometimes fail when linking to an external stylesheet

This commit is contained in:
Kovid Goyal 2009-04-21 14:10:00 -07:00
parent 8dd974ff42
commit 241a2fc099
2 changed files with 130 additions and 131 deletions

View File

@ -30,10 +30,10 @@ from calibre.utils.zipfile import ZipFile
from cssutils import CSSParser from cssutils import CSSParser
class HTMLElement(HtmlElement): class HTMLElement(HtmlElement):
@apply @apply
def specified_font_size(): def specified_font_size():
def fget(self): def fget(self):
ans = self.get('specified_font_size', '') ans = self.get('specified_font_size', '')
if not ans: if not ans:
@ -41,12 +41,12 @@ class HTMLElement(HtmlElement):
if ans.startswith('f'): if ans.startswith('f'):
return functools.partial(operator.mul, float(ans[1:])) return functools.partial(operator.mul, float(ans[1:]))
return float(ans) return float(ans)
def fset(self, val): def fset(self, val):
self.set('specified_font_size', ('f'+repr(val(1))) if callable(val) else repr(val)) self.set('specified_font_size', ('f'+repr(val(1))) if callable(val) else repr(val))
return property(fget=fget, fset=fset) return property(fget=fget, fset=fset)
@apply @apply
def computed_font_size(): def computed_font_size():
def fget(self): def fget(self):
@ -54,48 +54,48 @@ class HTMLElement(HtmlElement):
if ans == '': if ans == '':
return None return None
return float(ans) return float(ans)
def fset(self, val): def fset(self, val):
self.set('computed_font_size', repr(val)) self.set('computed_font_size', repr(val))
return property(fget=fget, fset=fset) return property(fget=fget, fset=fset)
def remove_font_size_information(self): def remove_font_size_information(self):
for elem in self.iter(): for elem in self.iter():
for p in ('computed', 'specified'): for p in ('computed', 'specified'):
elem.attrib.pop(p+'_font_size', None) elem.attrib.pop(p+'_font_size', None)
def getpath(self): def getpath(self):
return self.getroottree().getpath(self) return self.getroottree().getpath(self)
class Lookup(HtmlElementClassLookup):
    """Element class lookup that serves :class:`HTMLElement` for every
    element node, so parsed trees expose the font-size helpers."""

    def lookup(self, node_type, document, namespace, name):
        # Non-element nodes (comments, PIs, entities) fall through to
        # lxml's default resolution.
        if node_type != 'element':
            return HtmlElementClassLookup.lookup(
                self, node_type, document, namespace, name)
        return HTMLElement
class HTMLParser(_HTMLParser):
    """lxml HTML parser wired to the :class:`Lookup` class lookup, so all
    element nodes it produces are :class:`HTMLElement` instances."""

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        # Install the custom element-class resolver on this parser instance.
        self.set_element_class_lookup(Lookup())
parser = HTMLParser() parser = HTMLParser()
def fromstring(raw, **kwargs):
    """Parse *raw* HTML with the module-level parser (custom element
    classes) and return the document root."""
    return _fromstring(raw, parser=parser, **kwargs)
def tostring(root, pretty_print=False):
    """Serialize *root* to a UTF-8 encoded XML string, preserving the
    ``<meta http-equiv="Content-Type">`` tag."""
    serialize_opts = dict(encoding='utf-8', method='xml',
                          include_meta_content_type=True,
                          pretty_print=pretty_print)
    return _tostring(root, **serialize_opts)
class Link(object): class Link(object):
''' '''
Represents a link in a HTML file. Represents a link in a HTML file.
''' '''
@classmethod @classmethod
def url_to_local_path(cls, url, base): def url_to_local_path(cls, url, base):
path = urlunparse(('', '', url.path, url.params, url.query, '')) path = urlunparse(('', '', url.path, url.params, url.query, ''))
@ -103,7 +103,7 @@ class Link(object):
if os.path.isabs(path): if os.path.isabs(path):
return path return path
return os.path.abspath(os.path.join(base, path)) return os.path.abspath(os.path.join(base, path))
def __init__(self, url, base): def __init__(self, url, base):
''' '''
:param url: The url this link points to. Must be an unquoted unicode string. :param url: The url this link points to. Must be an unquoted unicode string.
@ -127,13 +127,13 @@ class Link(object):
def __eq__(self, other):
    """Compare by resolved path; *other* may be a Link or a bare path."""
    other_path = getattr(other, 'path', other)
    return self.path == other_path
def __str__(self):
    # Human-readable form: the original URL and the path it resolved to.
    return u'Link: %s --> %s' % (self.url, self.path)
class IgnoreFile(Exception):
    """Raised to signal that an HTML file should be skipped rather than
    processed.

    The flag ``doesnt_exist`` records whether the underlying OS error was
    ENOENT (errno 2), i.e. the linked file is missing entirely.
    """

    def __init__(self, msg, errno):
        Exception.__init__(self, msg)
        # errno 2 == ENOENT: the file does not exist at all.
        self.doesnt_exist = (errno == 2)
@ -148,13 +148,13 @@ class HTMLFile(object):
The encoding of the file is available as :member:`encoding`. The encoding of the file is available as :member:`encoding`.
''' '''
HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE) HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE) TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
LINK_PAT = re.compile( LINK_PAT = re.compile(
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))', r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
re.DOTALL|re.IGNORECASE) re.DOTALL|re.IGNORECASE)
def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None): def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
''' '''
:param level: The level of this file. Should be 0 for the root file. :param level: The level of this file. Should be 0 for the root file.
@ -167,7 +167,7 @@ class HTMLFile(object):
self.level = level self.level = level
self.referrer = referrer self.referrer = referrer
self.links = [] self.links = []
try: try:
with open(self.path, 'rb') as f: with open(self.path, 'rb') as f:
src = f.read() src = f.read()
@ -176,7 +176,7 @@ class HTMLFile(object):
if level == 0: if level == 0:
raise IOError(msg) raise IOError(msg)
raise IgnoreFile(msg, err.errno) raise IgnoreFile(msg, err.errno)
self.is_binary = not bool(self.HTML_PAT.search(src[:1024])) self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
if not self.is_binary: if not self.is_binary:
if encoding is None: if encoding is None:
@ -189,19 +189,19 @@ class HTMLFile(object):
match = self.TITLE_PAT.search(src) match = self.TITLE_PAT.search(src)
self.title = match.group(1) if match is not None else self.title self.title = match.group(1) if match is not None else self.title
self.find_links(src) self.find_links(src)
def __eq__(self, other):
    """Two HTMLFiles are equal when they resolve to the same path;
    *other* may also be a bare path string."""
    return self.path == getattr(other, 'path', other)

def __str__(self):
    # 'b' marks binary files, 'a' ordinary (ASCII/HTML) files.
    kind = 'b' if self.is_binary else 'a'
    return u'HTMLFile:%d:%s:%s' % (self.level, kind, self.path)

def __repr__(self):
    return str(self)
def find_links(self, src): def find_links(self, src):
for match in self.LINK_PAT.finditer(src): for match in self.LINK_PAT.finditer(src):
url = None url = None
@ -212,7 +212,7 @@ class HTMLFile(object):
link = self.resolve(url) link = self.resolve(url)
if link not in self.links: if link not in self.links:
self.links.append(link) self.links.append(link)
def resolve(self, url):
    """Return a :class:`Link` for *url*, resolved against this file's
    base directory."""
    return Link(url, self.base)
@ -234,13 +234,13 @@ def depth_first(root, flat, visited=set([])):
if hf not in visited: if hf not in visited:
yield hf yield hf
visited.add(hf) visited.add(hf)
def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None): def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
''' '''
Recursively traverse all links in the HTML file. Recursively traverse all links in the HTML file.
:param max_levels: Maximum levels of recursion. Must be non-negative. 0 :param max_levels: Maximum levels of recursion. Must be non-negative. 0
implies that no links in the root HTML file are followed. implies that no links in the root HTML file are followed.
:param encoding: Specify character encoding of HTML files. If `None` it is :param encoding: Specify character encoding of HTML files. If `None` it is
auto-detected. auto-detected.
@ -271,7 +271,7 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
print repr(err) print repr(err)
for link in rejects: for link in rejects:
hf.links.remove(link) hf.links.remove(link)
next_level = list(nl) next_level = list(nl)
orec = sys.getrecursionlimit() orec = sys.getrecursionlimit()
sys.setrecursionlimit(500000) sys.setrecursionlimit(500000)
@ -279,14 +279,14 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
return flat, list(depth_first(flat[0], flat)) return flat, list(depth_first(flat[0], flat))
finally: finally:
sys.setrecursionlimit(orec) sys.setrecursionlimit(orec)
def opf_traverse(opf_reader, verbose=0, encoding=None): def opf_traverse(opf_reader, verbose=0, encoding=None):
''' '''
Return a list of :class:`HTMLFile` objects in the order specified by the Return a list of :class:`HTMLFile` objects in the order specified by the
`<spine>` element of the OPF. `<spine>` element of the OPF.
:param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance. :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
:param encoding: Specify character encoding of HTML files. If `None` it is :param encoding: Specify character encoding of HTML files. If `None` it is
auto-detected. auto-detected.
''' '''
@ -317,7 +317,7 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
print 'WARNING: OPF spine item %s does not exist'%path print 'WARNING: OPF spine item %s does not exist'%path
ans = [f for f in ans if not f.is_binary] ans = [f for f in ans if not f.is_binary]
return ans return ans
convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp']) convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE) _span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
@ -326,20 +326,20 @@ def sanitize_head(match):
x = match.group(1) x = match.group(1)
x = _span_pat.sub('', x) x = _span_pat.sub('', x)
return '<head>\n'+x+'\n</head>' return '<head>\n'+x+'\n</head>'
class PreProcessor(object): class PreProcessor(object):
PREPROCESS = [ PREPROCESS = [
# Some idiotic HTML generators (Frontpage I'm looking at you) # Some idiotic HTML generators (Frontpage I'm looking at you)
# Put all sorts of crap into <head>. This messes up lxml # Put all sorts of crap into <head>. This messes up lxml
(re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL), (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
sanitize_head), sanitize_head),
# Convert all entities, since lxml doesn't handle them well # Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities), (re.compile(r'&(\S+?);'), convert_entities),
# Remove the <![if/endif tags inserted by everybody's darling, MS Word # Remove the <![if/endif tags inserted by everybody's darling, MS Word
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
lambda match: ''), lambda match: ''),
] ]
# Fix pdftohtml markup # Fix pdftohtml markup
PDFTOHTML = [ PDFTOHTML = [
# Remove <hr> tags # Remove <hr> tags
@ -348,20 +348,20 @@ class PreProcessor(object):
(re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''), (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
# Remove <br> and replace <br><br> with <p> # Remove <br> and replace <br><br> with <p>
(re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'), (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
(re.compile(r'(.*)<br.*?>', re.IGNORECASE), (re.compile(r'(.*)<br.*?>', re.IGNORECASE),
lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
else match.group(1)), else match.group(1)),
# Remove hyphenation # Remove hyphenation
(re.compile(r'-\n\r?'), lambda match: ''), (re.compile(r'-\n\r?'), lambda match: ''),
# Remove gray background # Remove gray background
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
# Remove non breaking spaces # Remove non breaking spaces
(re.compile(ur'\u00a0'), lambda match : ' '), (re.compile(ur'\u00a0'), lambda match : ' '),
] ]
# Fix Book Designer markup # Fix Book Designer markup
BOOK_DESIGNER = [ BOOK_DESIGNER = [
# HR # HR
@ -377,17 +377,17 @@ class PreProcessor(object):
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)), lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
] ]
def is_baen(self, src):
    """Detect Baen books by their Publisher ``<meta>`` tag (any case)."""
    pat = re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
                     re.IGNORECASE)
    return pat.search(src) is not None
def is_book_designer(self, raw):
    """Detect Book Designer output by its characteristic BookTitle
    header (case sensitive, matching the tool's exact markup)."""
    match = re.search('<H2[^><]*id=BookTitle', raw)
    return match is not None
def is_pdftohtml(self, src):
    """Detect calibre's pdftohtml output by the marker comment it puts
    near the top of the file (only the first 1000 bytes are checked)."""
    marker = "<!-- created by calibre's pdftohtml -->"
    return marker in src[:1000]
def preprocess(self, html): def preprocess(self, html):
opts = getattr(self, 'opts', False) opts = getattr(self, 'opts', False)
if opts and hasattr(opts, 'profile') and getattr(opts.profile, 'remove_special_chars', False): if opts and hasattr(opts, 'profile') and getattr(opts.profile, 'remove_special_chars', False):
@ -403,17 +403,17 @@ class PreProcessor(object):
for rule in self.PREPROCESS + rules: for rule in self.PREPROCESS + rules:
html = rule[0].sub(rule[1], html) html = rule[0].sub(rule[1], html)
return html return html
class Parser(PreProcessor, LoggingInterface): class Parser(PreProcessor, LoggingInterface):
# SELF_CLOSING_TAGS = 'hr|br|link|img|meta|input|area|base|basefont' # SELF_CLOSING_TAGS = 'hr|br|link|img|meta|input|area|base|basefont'
# SELF_CLOSING_RULES = [re.compile(p[0]%SELF_CLOSING_TAGS, re.IGNORECASE) for p in # SELF_CLOSING_RULES = [re.compile(p[0]%SELF_CLOSING_TAGS, re.IGNORECASE) for p in
# [ # [
# (r'<(?P<tag>%s)(?P<attrs>(\s+[^<>]*){0,1})(?<!/)>', # (r'<(?P<tag>%s)(?P<attrs>(\s+[^<>]*){0,1})(?<!/)>',
# '<\g<tag>\g<attrs> />'), # '<\g<tag>\g<attrs> />'),
# (), # (),
# ] # ]
# ] # ]
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'): def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
LoggingInterface.__init__(self, logging.getLogger(name)) LoggingInterface.__init__(self, logging.getLogger(name))
self.setup_cli_handler(opts.verbose) self.setup_cli_handler(opts.verbose)
@ -433,27 +433,27 @@ class Parser(PreProcessor, LoggingInterface):
name = os.path.splitext(name)[0] + '_cr_%d'%save_counter + os.path.splitext(name)[1] name = os.path.splitext(name)[0] + '_cr_%d'%save_counter + os.path.splitext(name)[1]
save_counter += 1 save_counter += 1
self.htmlfile_map[f.path] = name self.htmlfile_map[f.path] = name
self.parse_html() self.parse_html()
# Handle <image> tags inside embedded <svg> # Handle <image> tags inside embedded <svg>
# At least one source of EPUB files (Penguin) uses xlink:href # At least one source of EPUB files (Penguin) uses xlink:href
# without declaring the xlink namespace # without declaring the xlink namespace
for image in self.root.xpath('//image'): for image in self.root.xpath('//image'):
for attr in image.attrib.keys(): for attr in image.attrib.keys():
if attr.endswith(':href'): if attr.endswith(':href'):
nhref = self.rewrite_links(image.get(attr)) nhref = self.rewrite_links(image.get(attr))
image.set(attr, nhref) image.set(attr, nhref)
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False) self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates
if self.root.get(bad, None) is not None: if self.root.get(bad, None) is not None:
self.root.attrib.pop(bad) self.root.attrib.pop(bad)
def save_path(self): def save_path(self):
return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]) return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])
def save(self, strip_comments=False): def save(self, strip_comments=False):
''' '''
Save processed HTML into the content directory. Save processed HTML into the content directory.
@ -463,7 +463,7 @@ class Parser(PreProcessor, LoggingInterface):
self.root.set('xmlns:xlink', 'http://www.w3.org/1999/xlink') self.root.set('xmlns:xlink', 'http://www.w3.org/1999/xlink')
for svg in self.root.xpath('//svg'): for svg in self.root.xpath('//svg'):
svg.set('xmlns', 'http://www.w3.org/2000/svg') svg.set('xmlns', 'http://www.w3.org/2000/svg')
ans = tostring(self.root, pretty_print=self.opts.pretty_print) ans = tostring(self.root, pretty_print=self.opts.pretty_print)
ans = re.compile(r'<head>', re.IGNORECASE).sub( ans = re.compile(r'<head>', re.IGNORECASE).sub(
'<head>\n\t<meta http-equiv="Content-Type" ' '<head>\n\t<meta http-equiv="Content-Type" '
@ -503,7 +503,7 @@ class Parser(PreProcessor, LoggingInterface):
self.root.remove(head) self.root.remove(head)
self.root.insert(0, head) self.root.insert(0, head)
self.head = head self.head = head
try: try:
self.body = self.root.body self.body = self.root.body
except: except:
@ -526,7 +526,7 @@ class Parser(PreProcessor, LoggingInterface):
if not self.head.xpath('./title'): if not self.head.xpath('./title'):
title = etree.SubElement(self.head, 'title') title = etree.SubElement(self.head, 'title')
title.text = _('Unknown') title.text = _('Unknown')
def debug_tree(self, name): def debug_tree(self, name):
''' '''
Dump source tree for later debugging. Dump source tree for later debugging.
@ -538,8 +538,8 @@ class Parser(PreProcessor, LoggingInterface):
(os.path.basename(self.htmlfile.path), name)), 'wb') as f: (os.path.basename(self.htmlfile.path), name)), 'wb') as f:
f.write(tostring(self.root)) f.write(tostring(self.root))
self.log_debug(_('Written processed HTML to ')+f.name) self.log_debug(_('Written processed HTML to ')+f.name)
def rewrite_links(self, olink): def rewrite_links(self, olink):
''' '''
Make all links in document relative so that they work in the EPUB container. Make all links in document relative so that they work in the EPUB container.
@ -555,7 +555,7 @@ class Parser(PreProcessor, LoggingInterface):
if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path): if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
return olink return olink
if link.path in self.htmlfiles: if link.path in self.htmlfiles:
return self.htmlfile_map[link.path] + frag return self.htmlfile_map[link.path] + frag
if re.match(r'\.(x){0,1}htm(l){0,1}', os.path.splitext(link.path)[1]) is not None: if re.match(r'\.(x){0,1}htm(l){0,1}', os.path.splitext(link.path)[1]) is not None:
return olink # This happens when --max-levels is used return olink # This happens when --max-levels is used
if link.path in self.resource_map.keys(): if link.path in self.resource_map.keys():
@ -567,26 +567,26 @@ class Parser(PreProcessor, LoggingInterface):
name = 'resources/' + name name = 'resources/' + name
self.resource_map[link.path] = name self.resource_map[link.path] = name
return name + frag return name + frag
class Processor(Parser): class Processor(Parser):
''' '''
This class builds on :class:`Parser` to provide additional methods This class builds on :class:`Parser` to provide additional methods
to perform various processing/modification tasks on HTML files. to perform various processing/modification tasks on HTML files.
''' '''
LINKS_PATH = XPath('//a[@href]') LINKS_PATH = XPath('//a[@href]')
PIXEL_PAT = re.compile(r'([-]?\d+|[-]?\d*\.\d+)px') PIXEL_PAT = re.compile(r'([-]?\d+|[-]?\d*\.\d+)px')
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
Parser.__init__(self, *args, **kwargs) Parser.__init__(self, *args, **kwargs)
temp = LoggingInterface(logging.getLogger('cssutils')) temp = LoggingInterface(logging.getLogger('cssutils'))
temp.setup_cli_handler(self.opts.verbose) temp.setup_cli_handler(self.opts.verbose)
self.css_parser = CSSParser(log=temp.logger, loglevel=logging.ERROR) self.css_parser = CSSParser(log=temp.logger, loglevel=logging.ERROR)
self.stylesheet = self.font_css = self.override_css = None self.stylesheet = self.font_css = self.override_css = None
def detect_chapters(self): def detect_chapters(self):
self.detected_chapters = self.opts.chapter(self.root) self.detected_chapters = self.opts.chapter(self.root)
chapter_mark = self.opts.chapter_mark chapter_mark = self.opts.chapter_mark
@ -604,12 +604,12 @@ class Processor(Parser):
else: # chapter_mark == 'both': else: # chapter_mark == 'both':
mark = etree.Element('hr', style=page_break_before) mark = etree.Element('hr', style=page_break_before)
elem.addprevious(mark) elem.addprevious(mark)
def save(self, strip_comments=False): def save(self, strip_comments=False):
style_path = os.path.splitext(os.path.basename(self.save_path()))[0] style_path = os.path.splitext(os.path.basename(self.save_path()))[0]+'_calibre'
for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]): for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]):
if sheet is not None: if sheet is not None:
style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet', style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet',
'href':'resources/%s_%d.css'%(style_path, i), 'href':'resources/%s_%d.css'%(style_path, i),
'charset':'UTF-8'}) 'charset':'UTF-8'})
style.tail = '\n' style.tail = '\n'
@ -620,16 +620,16 @@ class Processor(Parser):
raw = raw.encode('utf-8') raw = raw.encode('utf-8')
open(path, 'wb').write(raw) open(path, 'wb').write(raw)
return Parser.save(self, strip_comments=strip_comments) return Parser.save(self, strip_comments=strip_comments)
def populate_toc(self, toc): def populate_toc(self, toc):
''' '''
Populate the Table of Contents from detected chapters and links. Populate the Table of Contents from detected chapters and links.
''' '''
class Adder(object): class Adder(object):
def __init__(self, toc): def __init__(self, toc):
self.next_play_order = max([x.play_order for x in toc.flat()]) self.next_play_order = max([x.play_order for x in toc.flat()])
def __call__(self, href, fragment, text, target, type='link'): def __call__(self, href, fragment, text, target, type='link'):
for entry in toc.flat(): for entry in toc.flat():
if entry.href == href and entry.fragment == fragment: if entry.href == href and entry.fragment == fragment:
@ -637,15 +637,15 @@ class Processor(Parser):
if len(text) > 50: if len(text) > 50:
text = text[:50] + u'\u2026' text = text[:50] + u'\u2026'
self.next_play_order += 1 self.next_play_order += 1
return target.add_item(href, fragment, text, type=type, return target.add_item(href, fragment, text, type=type,
play_order=self.next_play_order) play_order=self.next_play_order)
add_item = Adder(toc) add_item = Adder(toc)
name = self.htmlfile_map[self.htmlfile.path] name = self.htmlfile_map[self.htmlfile.path]
href = 'content/'+name href = 'content/'+name
# Add level* TOC items # Add level* TOC items
counter = 0 counter = 0
def elem_to_link(elem, href, counter): def elem_to_link(elem, href, counter):
text = (u''.join(elem.xpath('string()'))).strip() text = (u''.join(elem.xpath('string()'))).strip()
if not text: if not text:
@ -662,8 +662,8 @@ class Processor(Parser):
elem.set('id', id) elem.set('id', id)
frag = id frag = id
return text, _href, frag return text, _href, frag
if self.opts.level1_toc is not None: if self.opts.level1_toc is not None:
level1 = self.opts.level1_toc(self.root) level1 = self.opts.level1_toc(self.root)
level1_order = [] level1_order = []
@ -702,17 +702,17 @@ class Processor(Parser):
counter += 1 counter += 1
if text: if text:
add_item(_href, frag, text, level2, type='chapter') add_item(_href, frag, text, level2, type='chapter')
if level1_order: # Fix play order if level1_order: # Fix play order
next_play_order = level1_order[0].play_order next_play_order = level1_order[0].play_order
for x in level1_order: for x in level1_order:
for y in x.flat(): for y in x.flat():
y.play_order = next_play_order y.play_order = next_play_order
next_play_order += 1 next_play_order += 1
if len(toc) > 0: if len(toc) > 0:
# Detected TOC entries using --level* options # Detected TOC entries using --level* options
# so aborting all other toc processing # so aborting all other toc processing
@ -726,7 +726,7 @@ class Processor(Parser):
id = elem.get('id', 'calibre_chapter_%d'%counter) id = elem.get('id', 'calibre_chapter_%d'%counter)
elem.set('id', id) elem.set('id', id)
add_item(href, id, text, toc, type='chapter') add_item(href, id, text, toc, type='chapter')
if len(list(toc.flat())) >= self.opts.toc_threshold: if len(list(toc.flat())) >= self.opts.toc_threshold:
return return
referrer = toc referrer = toc
@ -745,7 +745,7 @@ class Processor(Parser):
name = self.htmlfile_map[self.htmlfile.referrer.path] name = self.htmlfile_map[self.htmlfile.referrer.path]
href = 'content/'+name href = 'content/'+name
referrer = add_item(href, None, text, toc) referrer = add_item(href, None, text, toc)
# Add links to TOC # Add links to TOC
if int(self.opts.max_toc_links) > 0: if int(self.opts.max_toc_links) > 0:
for link in list(self.LINKS_PATH(self.root))[:self.opts.max_toc_links]: for link in list(self.LINKS_PATH(self.root))[:self.opts.max_toc_links]:
@ -762,7 +762,7 @@ class Processor(Parser):
if len(parts) > 1: if len(parts) > 1:
fragment = parts[1] fragment = parts[1]
add_item(href, fragment, text, referrer) add_item(href, fragment, text, referrer)
@classmethod @classmethod
def preprocess_css(cls, css, dpi=96): def preprocess_css(cls, css, dpi=96):
def rescale(match): def rescale(match):
@ -772,17 +772,17 @@ class Processor(Parser):
except ValueError: except ValueError:
return '' return ''
return '%fpt'%(72 * val/dpi) return '%fpt'%(72 * val/dpi)
css = cls.PIXEL_PAT.sub(rescale, css) css = cls.PIXEL_PAT.sub(rescale, css)
css = cls.PAGE_PAT.sub('', css) css = cls.PAGE_PAT.sub('', css)
return css return css
def extract_css(self, parsed_sheets): def extract_css(self, parsed_sheets):
''' '''
Remove all CSS information from the document and store it as Remove all CSS information from the document and store it as
:class:`StyleSheet` objects. :class:`StyleSheet` objects.
''' '''
def get_id(chapter, counter, prefix='calibre_css_'): def get_id(chapter, counter, prefix='calibre_css_'):
new_id = '%s_%d'%(prefix, counter) new_id = '%s_%d'%(prefix, counter)
if chapter.tag.lower() == 'a' and 'name' in chapter.keys(): if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
@ -796,7 +796,7 @@ class Processor(Parser):
id = new_id id = new_id
chapter.set('id', id) chapter.set('id', id)
return id return id
self.external_stylesheets, self.stylesheet = [], self.css_parser.parseString('') self.external_stylesheets, self.stylesheet = [], self.css_parser.parseString('')
self.specified_override_css = [] self.specified_override_css = []
for link in self.root.xpath('//link'): for link in self.root.xpath('//link'):
@ -825,8 +825,7 @@ class Processor(Parser):
self.log_exception('') self.log_exception('')
if parsed_sheets.has_key(file): if parsed_sheets.has_key(file):
self.external_stylesheets.append(parsed_sheets[file]) self.external_stylesheets.append(parsed_sheets[file])
for style in self.root.xpath('//style'): for style in self.root.xpath('//style'):
if 'css' in style.get('type', 'text/css').lower(): if 'css' in style.get('type', 'text/css').lower():
override_css = style.get('title', '') == 'override_css' override_css = style.get('title', '') == 'override_css'
@ -889,7 +888,7 @@ class Processor(Parser):
cn += classname cn += classname
font.set('class', cn) font.set('class', cn)
font.tag = 'span' font.tag = 'span'
id_css, id_css_counter = {}, 0 id_css, id_css_counter = {}, 0
for elem in self.root.xpath('//*[@style]'): for elem in self.root.xpath('//*[@style]'):
setting = elem.get('style') setting = elem.get('style')
@ -906,7 +905,7 @@ class Processor(Parser):
cn = elem.get('class', classname) cn = elem.get('class', classname)
elem.set('class', cn) elem.set('class', cn)
elem.attrib.pop('style') elem.attrib.pop('style')
css = '\n'.join(['.%s {%s;}'%(cn, setting) for \ css = '\n'.join(['.%s {%s;}'%(cn, setting) for \
setting, cn in cache.items()]) setting, cn in cache.items()])
css += '\n\n' css += '\n\n'
@ -930,28 +929,28 @@ class Processor(Parser):
self.override_css = self.css_parser.parseString(self.preprocess_css(css)) self.override_css = self.css_parser.parseString(self.preprocess_css(css))
for rule in reversed(self.specified_override_css): for rule in reversed(self.specified_override_css):
self.override_css.insertRule(rule, index=0) self.override_css.insertRule(rule, index=0)
def config(defaults=None, config_name='html', def config(defaults=None, config_name='html',
desc=_('Options to control the traversal of HTML')): desc=_('Options to control the traversal of HTML')):
if defaults is None: if defaults is None:
c = Config(config_name, desc) c = Config(config_name, desc)
else: else:
c = StringConfig(defaults, desc) c = StringConfig(defaults, desc)
c.add_opt('output', ['-o', '--output'], default=None, c.add_opt('output', ['-o', '--output'], default=None,
help=_('The output directory. Default is the current directory.')) help=_('The output directory. Default is the current directory.'))
c.add_opt('encoding', ['--encoding'], default=None, c.add_opt('encoding', ['--encoding'], default=None,
help=_('Character encoding for HTML files. Default is to auto detect.')) help=_('Character encoding for HTML files. Default is to auto detect.'))
c.add_opt('zip', ['--zip'], default=False, c.add_opt('zip', ['--zip'], default=False,
help=_('Create the output in a zip file. If this option is specified, the --output should be the name of a file not a directory.')) help=_('Create the output in a zip file. If this option is specified, the --output should be the name of a file not a directory.'))
traversal = c.add_group('traversal', _('Control the following of links in HTML files.')) traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
traversal('breadth_first', ['--breadth-first'], default=False, traversal('breadth_first', ['--breadth-first'], default=False,
help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first')) help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first'))
traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal', traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.')) help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))
metadata = c.add_group('metadata', _('Set metadata of the generated ebook')) metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
metadata('title', ['-t', '--title'], default=None, metadata('title', ['-t', '--title'], default=None,
help=_('Set the title. Default is to autodetect.')) help=_('Set the title. Default is to autodetect.'))
@ -965,13 +964,13 @@ def config(defaults=None, config_name='html',
help=_('A summary of this book.')) help=_('A summary of this book.'))
metadata('from_opf', ['--metadata-from'], default=None, metadata('from_opf', ['--metadata-from'], default=None,
help=_('Load metadata from the specified OPF file')) help=_('Load metadata from the specified OPF file'))
debug = c.add_group('debug', _('Options useful for debugging')) debug = c.add_group('debug', _('Options useful for debugging'))
debug('verbose', ['-v', '--verbose'], default=0, action='count', debug('verbose', ['-v', '--verbose'], default=0, action='count',
help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.')) help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
debug('pretty_print', ['--pretty-print'], default=False, debug('pretty_print', ['--pretty-print'], default=False,
help=_('Output HTML is "pretty printed" for easier parsing by humans')) help=_('Output HTML is "pretty printed" for easier parsing by humans'))
return c return c
def option_parser(): def option_parser():
@ -980,7 +979,7 @@ def option_parser():
%prog [options] file.html|opf %prog [options] file.html|opf
Follow all links in an HTML file and collect them into the specified directory. Follow all links in an HTML file and collect them into the specified directory.
Also collects any resources like images, stylesheets, scripts, etc. Also collects any resources like images, stylesheets, scripts, etc.
If an OPF file is specified instead, the list of files in its <spine> element If an OPF file is specified instead, the list of files in its <spine> element
is used. is used.
''')) '''))
@ -1056,11 +1055,11 @@ def merge_metadata(htmlfile, opf, opts):
elif attr == 'tags': elif attr == 'tags':
val = [i.strip() for i in val.split(',') if i.strip()] val = [i.strip() for i in val.split(',') if i.strip()]
setattr(mi, attr, val) setattr(mi, attr, val)
cover = getattr(opts, 'cover', False) cover = getattr(opts, 'cover', False)
if cover and os.path.exists(cover): if cover and os.path.exists(cover):
mi.cover = os.path.abspath(cover) mi.cover = os.path.abspath(cover)
if not mi.title: if not mi.title:
if htmlfile: if htmlfile:
mi.title = os.path.splitext(os.path.basename(htmlfile))[0] mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
@ -1092,13 +1091,13 @@ def rebase_toc(toc, htmlfile_map, basepath, root=True):
def fix_entry(entry): def fix_entry(entry):
if entry.abspath in htmlfile_map.keys(): if entry.abspath in htmlfile_map.keys():
entry.href = 'content/' + htmlfile_map[entry.abspath] entry.href = 'content/' + htmlfile_map[entry.abspath]
for entry in toc: for entry in toc:
rebase_toc(entry, htmlfile_map, basepath, root=False) rebase_toc(entry, htmlfile_map, basepath, root=False)
fix_entry(entry) fix_entry(entry)
if root: if root:
toc.base_path = basepath toc.base_path = basepath
def create_dir(htmlfile, opts): def create_dir(htmlfile, opts):
''' '''
Create a directory that contains the open ebook Create a directory that contains the open ebook
@ -1110,16 +1109,16 @@ def create_dir(htmlfile, opts):
else: else:
opf, filelist = get_filelist(htmlfile, opts) opf, filelist = get_filelist(htmlfile, opts)
mi = merge_metadata(htmlfile, opf, opts) mi = merge_metadata(htmlfile, opf, opts)
resource_map, htmlfile_map = parse_content(filelist, opts) resource_map, htmlfile_map = parse_content(filelist, opts)
resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()] resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
if opf and opf.cover and os.access(opf.cover, os.R_OK): if opf and opf.cover and os.access(opf.cover, os.R_OK):
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1]) cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1])
shutil.copyfile(opf.cover, cpath) shutil.copyfile(opf.cover, cpath)
resources.append(cpath) resources.append(cpath)
mi.cover = cpath mi.cover = cpath
spine = [htmlfile_map[f.path] for f in filelist] spine = [htmlfile_map[f.path] for f in filelist]
mi = create_metadata(opts.output, mi, spine, resources) mi = create_metadata(opts.output, mi, spine, resources)
buf = cStringIO.StringIO() buf = cStringIO.StringIO()
@ -1132,7 +1131,7 @@ def create_dir(htmlfile, opts):
with open(os.path.join(opts.output, 'toc.ncx'), 'wb') as f: with open(os.path.join(opts.output, 'toc.ncx'), 'wb') as f:
f.write(toc) f.write(toc)
print 'Open ebook created in', opts.output print 'Open ebook created in', opts.output
def create_oebzip(htmlfile, opts): def create_oebzip(htmlfile, opts):
''' '''
Create a zip file that contains the Open ebook. Create a zip file that contains the Open ebook.
@ -1154,13 +1153,13 @@ def main(args=sys.argv):
parser.print_help() parser.print_help()
print _('You must specify an input HTML file') print _('You must specify an input HTML file')
return 1 return 1
htmlfile = args[1] htmlfile = args[1]
if opts.zip: if opts.zip:
create_oebzip(htmlfile, opts) create_oebzip(htmlfile, opts)
else: else:
create_dir(htmlfile, opts) create_dir(htmlfile, opts)
return 0 return 0
def gui_main(htmlfile, pt=None): def gui_main(htmlfile, pt=None):
@ -1183,7 +1182,7 @@ output = %s
if len(nontrivial) < 2: if len(nontrivial) < 2:
return None return None
return pt.name return pt.name
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main()) sys.exit(main())

View File

@ -21,7 +21,7 @@ def config(defaults=None):
c.remove_opt('profile') c.remove_opt('profile')
mobic = mobi_config(defaults=defaults) mobic = mobi_config(defaults=defaults)
c.update(mobic) c.update(mobic)
return c return c
def option_parser(usage=USAGE): def option_parser(usage=USAGE):
usage = usage % ('Mobipocket', formats()) usage = usage % ('Mobipocket', formats())
@ -33,13 +33,13 @@ def any2mobi(opts, path, notification=None):
if not ext: if not ext:
raise ValueError('Unknown file type: '+path) raise ValueError('Unknown file type: '+path)
ext = ext.lower()[1:] ext = ext.lower()[1:]
if opts.output is None: if opts.output is None:
opts.output = os.path.splitext(os.path.basename(path))[0]+'.mobi' opts.output = os.path.splitext(os.path.basename(path))[0]+'.mobi'
opts.output = os.path.abspath(opts.output) opts.output = os.path.abspath(opts.output)
orig_output = opts.output orig_output = opts.output
with TemporaryDirectory('_any2mobi') as tdir: with TemporaryDirectory('_any2mobi') as tdir:
oebdir = os.path.join(tdir, 'oeb') oebdir = os.path.join(tdir, 'oeb')
os.mkdir(oebdir) os.mkdir(oebdir)
@ -54,7 +54,7 @@ def any2mobi(opts, path, notification=None):
opts.output = orig_output opts.output = orig_output
logging.getLogger('html2epub').info(_('Creating Mobipocket file from EPUB...')) logging.getLogger('html2epub').info(_('Creating Mobipocket file from EPUB...'))
oeb2mobi(opts, opf) oeb2mobi(opts, opf)
def main(args=sys.argv): def main(args=sys.argv):
parser = option_parser() parser = option_parser()