mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
MOBI Output: Fix bug that caused MOBI conversion to sometimes fail when linking to an external stylesheet
This commit is contained in:
parent
8dd974ff42
commit
241a2fc099
@ -30,10 +30,10 @@ from calibre.utils.zipfile import ZipFile
|
||||
from cssutils import CSSParser
|
||||
|
||||
class HTMLElement(HtmlElement):
|
||||
|
||||
|
||||
@apply
|
||||
def specified_font_size():
|
||||
|
||||
|
||||
def fget(self):
|
||||
ans = self.get('specified_font_size', '')
|
||||
if not ans:
|
||||
@ -41,12 +41,12 @@ class HTMLElement(HtmlElement):
|
||||
if ans.startswith('f'):
|
||||
return functools.partial(operator.mul, float(ans[1:]))
|
||||
return float(ans)
|
||||
|
||||
|
||||
def fset(self, val):
|
||||
self.set('specified_font_size', ('f'+repr(val(1))) if callable(val) else repr(val))
|
||||
|
||||
|
||||
return property(fget=fget, fset=fset)
|
||||
|
||||
|
||||
@apply
|
||||
def computed_font_size():
|
||||
def fget(self):
|
||||
@ -54,48 +54,48 @@ class HTMLElement(HtmlElement):
|
||||
if ans == '':
|
||||
return None
|
||||
return float(ans)
|
||||
|
||||
|
||||
def fset(self, val):
|
||||
self.set('computed_font_size', repr(val))
|
||||
|
||||
|
||||
return property(fget=fget, fset=fset)
|
||||
|
||||
|
||||
def remove_font_size_information(self):
    '''
    Remove the ``computed_font_size`` and ``specified_font_size``
    attributes from every element yielded by ``self.iter()``.

    Elements that do not carry one of the attributes are left untouched.
    '''
    attr_names = tuple(p + '_font_size' for p in ('computed', 'specified'))
    for elem in self.iter():
        for attr_name in attr_names:
            # pop() with a default so a missing attribute is not an error
            elem.attrib.pop(attr_name, None)
|
||||
|
||||
|
||||
def getpath(self):
    '''
    Return the path of this element as reported by the ``getpath()``
    method of the tree returned by ``self.getroottree()``.
    '''
    tree = self.getroottree()
    return tree.getpath(self)
|
||||
|
||||
class Lookup(HtmlElementClassLookup):
    '''
    Element class lookup that maps every element node to
    :class:`HTMLElement` and defers to :class:`HtmlElementClassLookup`
    for all other node types.
    '''

    def lookup(self, node_type, document, namespace, name):
        if node_type == 'element':
            return HTMLElement
        # Anything that is not an element goes through the stock lookup
        return HtmlElementClassLookup.lookup(self, node_type, document,
                                             namespace, name)
|
||||
|
||||
class HTMLParser(_HTMLParser):
    '''
    ``_HTMLParser`` subclass that installs :class:`Lookup` as its element
    class lookup right after construction.
    '''

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded unchanged to the base parser
        super(HTMLParser, self).__init__(**kwargs)
        self.set_element_class_lookup(Lookup())
|
||||
|
||||
|
||||
parser = HTMLParser()
|
||||
|
||||
def fromstring(raw, **kw):
    '''
    Parse ``raw`` with ``_fromstring`` using the module level ``parser``.

    Any additional keyword arguments are forwarded to ``_fromstring``.
    '''
    return _fromstring(raw, parser=parser, **kw)
|
||||
|
||||
def tostring(root, pretty_print=False):
    '''
    Serialize ``root`` with ``_tostring`` as UTF-8 using the ``xml`` method,
    asking for a meta content-type declaration to be included.

    :param pretty_print: Forwarded to ``_tostring`` unchanged.
    '''
    return _tostring(root, encoding='utf-8', method='xml',
                     include_meta_content_type=True,
                     pretty_print=pretty_print)
|
||||
|
||||
|
||||
class Link(object):
|
||||
'''
|
||||
Represents a link in a HTML file.
|
||||
'''
|
||||
|
||||
|
||||
@classmethod
|
||||
def url_to_local_path(cls, url, base):
|
||||
path = urlunparse(('', '', url.path, url.params, url.query, ''))
|
||||
@ -103,7 +103,7 @@ class Link(object):
|
||||
if os.path.isabs(path):
|
||||
return path
|
||||
return os.path.abspath(os.path.join(base, path))
|
||||
|
||||
|
||||
def __init__(self, url, base):
|
||||
'''
|
||||
:param url: The url this link points to. Must be an unquoted unicode string.
|
||||
@ -127,13 +127,13 @@ class Link(object):
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.path == getattr(other, 'path', other)
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return u'Link: %s --> %s'%(self.url, self.path)
|
||||
|
||||
return u'Link: %s --> %s'%(self.url, self.path)
|
||||
|
||||
|
||||
class IgnoreFile(Exception):
|
||||
|
||||
|
||||
def __init__(self, msg, errno):
|
||||
Exception.__init__(self, msg)
|
||||
self.doesnt_exist = errno == 2
|
||||
@ -148,13 +148,13 @@ class HTMLFile(object):
|
||||
|
||||
The encoding of the file is available as :member:`encoding`.
|
||||
'''
|
||||
|
||||
|
||||
HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
|
||||
TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
|
||||
LINK_PAT = re.compile(
|
||||
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
|
||||
re.DOTALL|re.IGNORECASE)
|
||||
|
||||
|
||||
def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
|
||||
'''
|
||||
:param level: The level of this file. Should be 0 for the root file.
|
||||
@ -167,7 +167,7 @@ class HTMLFile(object):
|
||||
self.level = level
|
||||
self.referrer = referrer
|
||||
self.links = []
|
||||
|
||||
|
||||
try:
|
||||
with open(self.path, 'rb') as f:
|
||||
src = f.read()
|
||||
@ -176,7 +176,7 @@ class HTMLFile(object):
|
||||
if level == 0:
|
||||
raise IOError(msg)
|
||||
raise IgnoreFile(msg, err.errno)
|
||||
|
||||
|
||||
self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
|
||||
if not self.is_binary:
|
||||
if encoding is None:
|
||||
@ -189,19 +189,19 @@ class HTMLFile(object):
|
||||
match = self.TITLE_PAT.search(src)
|
||||
self.title = match.group(1) if match is not None else self.title
|
||||
self.find_links(src)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.path == getattr(other, 'path', other)
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
|
||||
|
||||
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
|
||||
|
||||
|
||||
|
||||
def find_links(self, src):
|
||||
for match in self.LINK_PAT.finditer(src):
|
||||
url = None
|
||||
@ -212,7 +212,7 @@ class HTMLFile(object):
|
||||
link = self.resolve(url)
|
||||
if link not in self.links:
|
||||
self.links.append(link)
|
||||
|
||||
|
||||
def resolve(self, url):
    '''
    Return a :class:`Link` built from ``url`` and this file's ``base``.
    '''
    return Link(url, self.base)
|
||||
|
||||
@ -234,13 +234,13 @@ def depth_first(root, flat, visited=set([])):
|
||||
if hf not in visited:
|
||||
yield hf
|
||||
visited.add(hf)
|
||||
|
||||
|
||||
|
||||
|
||||
def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
|
||||
'''
|
||||
Recursively traverse all links in the HTML file.
|
||||
|
||||
:param max_levels: Maximum levels of recursion. Must be non-negative. 0
|
||||
|
||||
:param max_levels: Maximum levels of recursion. Must be non-negative. 0
|
||||
implies that no links in the root HTML file are followed.
|
||||
:param encoding: Specify character encoding of HTML files. If `None` it is
|
||||
auto-detected.
|
||||
@ -271,7 +271,7 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
|
||||
print repr(err)
|
||||
for link in rejects:
|
||||
hf.links.remove(link)
|
||||
|
||||
|
||||
next_level = list(nl)
|
||||
orec = sys.getrecursionlimit()
|
||||
sys.setrecursionlimit(500000)
|
||||
@ -279,14 +279,14 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None)
|
||||
return flat, list(depth_first(flat[0], flat))
|
||||
finally:
|
||||
sys.setrecursionlimit(orec)
|
||||
|
||||
|
||||
|
||||
|
||||
def opf_traverse(opf_reader, verbose=0, encoding=None):
|
||||
'''
|
||||
Return a list of :class:`HTMLFile` objects in the order specified by the
|
||||
`<spine>` element of the OPF.
|
||||
|
||||
:param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
|
||||
|
||||
:param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
|
||||
:param encoding: Specify character encoding of HTML files. If `None` it is
|
||||
auto-detected.
|
||||
'''
|
||||
@ -317,7 +317,7 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
|
||||
print 'WARNING: OPF spine item %s does not exist'%path
|
||||
ans = [f for f in ans if not f.is_binary]
|
||||
return ans
|
||||
|
||||
|
||||
|
||||
convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
|
||||
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
|
||||
@ -326,20 +326,20 @@ def sanitize_head(match):
|
||||
x = match.group(1)
|
||||
x = _span_pat.sub('', x)
|
||||
return '<head>\n'+x+'\n</head>'
|
||||
|
||||
|
||||
class PreProcessor(object):
|
||||
PREPROCESS = [
|
||||
# Some idiotic HTML generators (Frontpage I'm looking at you)
|
||||
# Put all sorts of crap into <head>. This messes up lxml
|
||||
(re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
|
||||
(re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
|
||||
sanitize_head),
|
||||
# Convert all entities, since lxml doesn't handle them well
|
||||
(re.compile(r'&(\S+?);'), convert_entities),
|
||||
# Remove the <![if/endif tags inserted by everybody's darling, MS Word
|
||||
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
|
||||
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
|
||||
lambda match: ''),
|
||||
]
|
||||
|
||||
|
||||
# Fix pdftohtml markup
|
||||
PDFTOHTML = [
|
||||
# Remove <hr> tags
|
||||
@ -348,20 +348,20 @@ class PreProcessor(object):
|
||||
(re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
|
||||
# Remove <br> and replace <br><br> with <p>
|
||||
(re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
|
||||
(re.compile(r'(.*)<br.*?>', re.IGNORECASE),
|
||||
lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
|
||||
(re.compile(r'(.*)<br.*?>', re.IGNORECASE),
|
||||
lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
|
||||
else match.group(1)),
|
||||
# Remove hyphenation
|
||||
(re.compile(r'-\n\r?'), lambda match: ''),
|
||||
|
||||
|
||||
# Remove gray background
|
||||
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
||||
|
||||
|
||||
# Remove non breaking spaces
|
||||
(re.compile(ur'\u00a0'), lambda match : ' '),
|
||||
|
||||
|
||||
]
|
||||
|
||||
|
||||
# Fix Book Designer markup
|
||||
BOOK_DESIGNER = [
|
||||
# HR
|
||||
@ -377,17 +377,17 @@ class PreProcessor(object):
|
||||
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
||||
]
|
||||
|
||||
|
||||
def is_baen(self, src):
    '''
    Return True if ``src`` contains a ``<meta name="Publisher">`` tag whose
    ``content`` attribute mentions Baen (matched case-insensitively).
    '''
    publisher_pat = re.compile(
        r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
        re.IGNORECASE)
    return publisher_pat.search(src) is not None
|
||||
|
||||
|
||||
def is_book_designer(self, raw):
    '''
    Return True if ``raw`` contains an ``<H2>`` tag (upper case, the match
    is case-sensitive) carrying ``id=BookTitle``.
    '''
    match = re.search('<H2[^><]*id=BookTitle', raw)
    return match is not None
|
||||
|
||||
|
||||
def is_pdftohtml(self, src):
    '''
    Return True if the pdftohtml marker comment appears within the first
    1000 characters of ``src``.
    '''
    head = src[:1000]
    return '<!-- created by calibre\'s pdftohtml -->' in head
|
||||
|
||||
|
||||
def preprocess(self, html):
|
||||
opts = getattr(self, 'opts', False)
|
||||
if opts and hasattr(opts, 'profile') and getattr(opts.profile, 'remove_special_chars', False):
|
||||
@ -403,17 +403,17 @@ class PreProcessor(object):
|
||||
for rule in self.PREPROCESS + rules:
|
||||
html = rule[0].sub(rule[1], html)
|
||||
return html
|
||||
|
||||
|
||||
class Parser(PreProcessor, LoggingInterface):
|
||||
# SELF_CLOSING_TAGS = 'hr|br|link|img|meta|input|area|base|basefont'
|
||||
# SELF_CLOSING_RULES = [re.compile(p[0]%SELF_CLOSING_TAGS, re.IGNORECASE) for p in
|
||||
# SELF_CLOSING_RULES = [re.compile(p[0]%SELF_CLOSING_TAGS, re.IGNORECASE) for p in
|
||||
# [
|
||||
# (r'<(?P<tag>%s)(?P<attrs>(\s+[^<>]*){0,1})(?<!/)>',
|
||||
# '<\g<tag>\g<attrs> />'),
|
||||
# (),
|
||||
# ]
|
||||
# ]
|
||||
|
||||
|
||||
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
|
||||
LoggingInterface.__init__(self, logging.getLogger(name))
|
||||
self.setup_cli_handler(opts.verbose)
|
||||
@ -433,27 +433,27 @@ class Parser(PreProcessor, LoggingInterface):
|
||||
name = os.path.splitext(name)[0] + '_cr_%d'%save_counter + os.path.splitext(name)[1]
|
||||
save_counter += 1
|
||||
self.htmlfile_map[f.path] = name
|
||||
|
||||
|
||||
self.parse_html()
|
||||
# Handle <image> tags inside embedded <svg>
|
||||
# At least one source of EPUB files (Penguin) uses xlink:href
|
||||
# without declaring the xlink namespace
|
||||
for image in self.root.xpath('//image'):
|
||||
for image in self.root.xpath('//image'):
|
||||
for attr in image.attrib.keys():
|
||||
if attr.endswith(':href'):
|
||||
nhref = self.rewrite_links(image.get(attr))
|
||||
image.set(attr, nhref)
|
||||
|
||||
|
||||
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
|
||||
for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates
|
||||
if self.root.get(bad, None) is not None:
|
||||
self.root.attrib.pop(bad)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def save_path(self):
    '''
    Return the path, inside ``self.tdir``, under which this HTML file is
    saved. The file name is looked up in ``self.htmlfile_map`` using the
    path of ``self.htmlfile``.
    '''
    saved_name = self.htmlfile_map[self.htmlfile.path]
    return os.path.join(self.tdir, saved_name)
|
||||
|
||||
|
||||
def save(self, strip_comments=False):
|
||||
'''
|
||||
Save processed HTML into the content directory.
|
||||
@ -463,7 +463,7 @@ class Parser(PreProcessor, LoggingInterface):
|
||||
self.root.set('xmlns:xlink', 'http://www.w3.org/1999/xlink')
|
||||
for svg in self.root.xpath('//svg'):
|
||||
svg.set('xmlns', 'http://www.w3.org/2000/svg')
|
||||
|
||||
|
||||
ans = tostring(self.root, pretty_print=self.opts.pretty_print)
|
||||
ans = re.compile(r'<head>', re.IGNORECASE).sub(
|
||||
'<head>\n\t<meta http-equiv="Content-Type" '
|
||||
@ -503,7 +503,7 @@ class Parser(PreProcessor, LoggingInterface):
|
||||
self.root.remove(head)
|
||||
self.root.insert(0, head)
|
||||
|
||||
self.head = head
|
||||
self.head = head
|
||||
try:
|
||||
self.body = self.root.body
|
||||
except:
|
||||
@ -526,7 +526,7 @@ class Parser(PreProcessor, LoggingInterface):
|
||||
if not self.head.xpath('./title'):
|
||||
title = etree.SubElement(self.head, 'title')
|
||||
title.text = _('Unknown')
|
||||
|
||||
|
||||
def debug_tree(self, name):
|
||||
'''
|
||||
Dump source tree for later debugging.
|
||||
@ -538,8 +538,8 @@ class Parser(PreProcessor, LoggingInterface):
|
||||
(os.path.basename(self.htmlfile.path), name)), 'wb') as f:
|
||||
f.write(tostring(self.root))
|
||||
self.log_debug(_('Written processed HTML to ')+f.name)
|
||||
|
||||
|
||||
|
||||
|
||||
def rewrite_links(self, olink):
|
||||
'''
|
||||
Make all links in document relative so that they work in the EPUB container.
|
||||
@ -555,7 +555,7 @@ class Parser(PreProcessor, LoggingInterface):
|
||||
if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
|
||||
return olink
|
||||
if link.path in self.htmlfiles:
|
||||
return self.htmlfile_map[link.path] + frag
|
||||
return self.htmlfile_map[link.path] + frag
|
||||
if re.match(r'\.(x){0,1}htm(l){0,1}', os.path.splitext(link.path)[1]) is not None:
|
||||
return olink # This happens when --max-levels is used
|
||||
if link.path in self.resource_map.keys():
|
||||
@ -567,26 +567,26 @@ class Parser(PreProcessor, LoggingInterface):
|
||||
name = 'resources/' + name
|
||||
self.resource_map[link.path] = name
|
||||
return name + frag
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class Processor(Parser):
|
||||
'''
|
||||
This class builds on :class:`Parser` to provide additional methods
|
||||
to perform various processing/modification tasks on HTML files.
|
||||
'''
|
||||
|
||||
|
||||
LINKS_PATH = XPath('//a[@href]')
|
||||
PIXEL_PAT = re.compile(r'([-]?\d+|[-]?\d*\.\d+)px')
|
||||
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
|
||||
|
||||
|
||||
def __init__(self, *args, **kwargs):
    '''
    Initialise the underlying :class:`Parser` with the given arguments,
    create the CSS parser and reset the three stylesheet slots.
    '''
    Parser.__init__(self, *args, **kwargs)
    # Give the 'cssutils' logger its own CLI handler, using the same
    # verbosity as self.opts (presumably set by Parser.__init__ -- confirm)
    css_log = LoggingInterface(logging.getLogger('cssutils'))
    css_log.setup_cli_handler(self.opts.verbose)
    self.css_parser = CSSParser(log=css_log.logger, loglevel=logging.ERROR)
    # No stylesheets exist until they are assigned elsewhere in the class
    self.stylesheet = None
    self.font_css = None
    self.override_css = None
|
||||
|
||||
|
||||
def detect_chapters(self):
|
||||
self.detected_chapters = self.opts.chapter(self.root)
|
||||
chapter_mark = self.opts.chapter_mark
|
||||
@ -604,12 +604,12 @@ class Processor(Parser):
|
||||
else: # chapter_mark == 'both':
|
||||
mark = etree.Element('hr', style=page_break_before)
|
||||
elem.addprevious(mark)
|
||||
|
||||
|
||||
def save(self, strip_comments=False):
|
||||
style_path = os.path.splitext(os.path.basename(self.save_path()))[0]
|
||||
style_path = os.path.splitext(os.path.basename(self.save_path()))[0]+'_calibre'
|
||||
for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]):
|
||||
if sheet is not None:
|
||||
style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet',
|
||||
style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet',
|
||||
'href':'resources/%s_%d.css'%(style_path, i),
|
||||
'charset':'UTF-8'})
|
||||
style.tail = '\n'
|
||||
@ -620,16 +620,16 @@ class Processor(Parser):
|
||||
raw = raw.encode('utf-8')
|
||||
open(path, 'wb').write(raw)
|
||||
return Parser.save(self, strip_comments=strip_comments)
|
||||
|
||||
|
||||
def populate_toc(self, toc):
|
||||
'''
|
||||
Populate the Table of Contents from detected chapters and links.
|
||||
'''
|
||||
class Adder(object):
|
||||
|
||||
|
||||
def __init__(self, toc):
|
||||
self.next_play_order = max([x.play_order for x in toc.flat()])
|
||||
|
||||
|
||||
def __call__(self, href, fragment, text, target, type='link'):
|
||||
for entry in toc.flat():
|
||||
if entry.href == href and entry.fragment == fragment:
|
||||
@ -637,15 +637,15 @@ class Processor(Parser):
|
||||
if len(text) > 50:
|
||||
text = text[:50] + u'\u2026'
|
||||
self.next_play_order += 1
|
||||
return target.add_item(href, fragment, text, type=type,
|
||||
return target.add_item(href, fragment, text, type=type,
|
||||
play_order=self.next_play_order)
|
||||
add_item = Adder(toc)
|
||||
name = self.htmlfile_map[self.htmlfile.path]
|
||||
href = 'content/'+name
|
||||
|
||||
|
||||
# Add level* TOC items
|
||||
counter = 0
|
||||
|
||||
|
||||
def elem_to_link(elem, href, counter):
|
||||
text = (u''.join(elem.xpath('string()'))).strip()
|
||||
if not text:
|
||||
@ -662,8 +662,8 @@ class Processor(Parser):
|
||||
elem.set('id', id)
|
||||
frag = id
|
||||
return text, _href, frag
|
||||
|
||||
|
||||
|
||||
|
||||
if self.opts.level1_toc is not None:
|
||||
level1 = self.opts.level1_toc(self.root)
|
||||
level1_order = []
|
||||
@ -702,17 +702,17 @@ class Processor(Parser):
|
||||
counter += 1
|
||||
if text:
|
||||
add_item(_href, frag, text, level2, type='chapter')
|
||||
|
||||
|
||||
|
||||
|
||||
if level1_order: # Fix play order
|
||||
next_play_order = level1_order[0].play_order
|
||||
for x in level1_order:
|
||||
for y in x.flat():
|
||||
y.play_order = next_play_order
|
||||
next_play_order += 1
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if len(toc) > 0:
|
||||
# Detected TOC entries using --level* options
|
||||
# so aborting all other toc processing
|
||||
@ -726,7 +726,7 @@ class Processor(Parser):
|
||||
id = elem.get('id', 'calibre_chapter_%d'%counter)
|
||||
elem.set('id', id)
|
||||
add_item(href, id, text, toc, type='chapter')
|
||||
|
||||
|
||||
if len(list(toc.flat())) >= self.opts.toc_threshold:
|
||||
return
|
||||
referrer = toc
|
||||
@ -745,7 +745,7 @@ class Processor(Parser):
|
||||
name = self.htmlfile_map[self.htmlfile.referrer.path]
|
||||
href = 'content/'+name
|
||||
referrer = add_item(href, None, text, toc)
|
||||
|
||||
|
||||
# Add links to TOC
|
||||
if int(self.opts.max_toc_links) > 0:
|
||||
for link in list(self.LINKS_PATH(self.root))[:self.opts.max_toc_links]:
|
||||
@ -762,7 +762,7 @@ class Processor(Parser):
|
||||
if len(parts) > 1:
|
||||
fragment = parts[1]
|
||||
add_item(href, fragment, text, referrer)
|
||||
|
||||
|
||||
@classmethod
|
||||
def preprocess_css(cls, css, dpi=96):
|
||||
def rescale(match):
|
||||
@ -772,17 +772,17 @@ class Processor(Parser):
|
||||
except ValueError:
|
||||
return ''
|
||||
return '%fpt'%(72 * val/dpi)
|
||||
|
||||
|
||||
css = cls.PIXEL_PAT.sub(rescale, css)
|
||||
css = cls.PAGE_PAT.sub('', css)
|
||||
return css
|
||||
|
||||
|
||||
def extract_css(self, parsed_sheets):
|
||||
'''
|
||||
Remove all CSS information from the document and store it as
|
||||
Remove all CSS information from the document and store it as
|
||||
:class:`StyleSheet` objects.
|
||||
'''
|
||||
|
||||
|
||||
def get_id(chapter, counter, prefix='calibre_css_'):
|
||||
new_id = '%s_%d'%(prefix, counter)
|
||||
if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
|
||||
@ -796,7 +796,7 @@ class Processor(Parser):
|
||||
id = new_id
|
||||
chapter.set('id', id)
|
||||
return id
|
||||
|
||||
|
||||
self.external_stylesheets, self.stylesheet = [], self.css_parser.parseString('')
|
||||
self.specified_override_css = []
|
||||
for link in self.root.xpath('//link'):
|
||||
@ -825,8 +825,7 @@ class Processor(Parser):
|
||||
self.log_exception('')
|
||||
if parsed_sheets.has_key(file):
|
||||
self.external_stylesheets.append(parsed_sheets[file])
|
||||
|
||||
|
||||
|
||||
for style in self.root.xpath('//style'):
|
||||
if 'css' in style.get('type', 'text/css').lower():
|
||||
override_css = style.get('title', '') == 'override_css'
|
||||
@ -889,7 +888,7 @@ class Processor(Parser):
|
||||
cn += classname
|
||||
font.set('class', cn)
|
||||
font.tag = 'span'
|
||||
|
||||
|
||||
id_css, id_css_counter = {}, 0
|
||||
for elem in self.root.xpath('//*[@style]'):
|
||||
setting = elem.get('style')
|
||||
@ -906,7 +905,7 @@ class Processor(Parser):
|
||||
cn = elem.get('class', classname)
|
||||
elem.set('class', cn)
|
||||
elem.attrib.pop('style')
|
||||
|
||||
|
||||
css = '\n'.join(['.%s {%s;}'%(cn, setting) for \
|
||||
setting, cn in cache.items()])
|
||||
css += '\n\n'
|
||||
@ -930,28 +929,28 @@ class Processor(Parser):
|
||||
self.override_css = self.css_parser.parseString(self.preprocess_css(css))
|
||||
for rule in reversed(self.specified_override_css):
|
||||
self.override_css.insertRule(rule, index=0)
|
||||
|
||||
|
||||
|
||||
|
||||
def config(defaults=None, config_name='html',
|
||||
desc=_('Options to control the traversal of HTML')):
|
||||
if defaults is None:
|
||||
c = Config(config_name, desc)
|
||||
else:
|
||||
c = StringConfig(defaults, desc)
|
||||
|
||||
|
||||
c.add_opt('output', ['-o', '--output'], default=None,
|
||||
help=_('The output directory. Default is the current directory.'))
|
||||
c.add_opt('encoding', ['--encoding'], default=None,
|
||||
help=_('Character encoding for HTML files. Default is to auto detect.'))
|
||||
c.add_opt('zip', ['--zip'], default=False,
|
||||
help=_('Create the output in a zip file. If this option is specified, the --output should be the name of a file not a directory.'))
|
||||
|
||||
|
||||
traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
|
||||
traversal('breadth_first', ['--breadth-first'], default=False,
|
||||
help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first'))
|
||||
traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
|
||||
help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))
|
||||
|
||||
|
||||
metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
|
||||
metadata('title', ['-t', '--title'], default=None,
|
||||
help=_('Set the title. Default is to autodetect.'))
|
||||
@ -965,13 +964,13 @@ def config(defaults=None, config_name='html',
|
||||
help=_('A summary of this book.'))
|
||||
metadata('from_opf', ['--metadata-from'], default=None,
|
||||
help=_('Load metadata from the specified OPF file'))
|
||||
|
||||
|
||||
debug = c.add_group('debug', _('Options useful for debugging'))
|
||||
debug('verbose', ['-v', '--verbose'], default=0, action='count',
|
||||
help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
|
||||
debug('pretty_print', ['--pretty-print'], default=False,
|
||||
help=_('Output HTML is "pretty printed" for easier parsing by humans'))
|
||||
|
||||
|
||||
return c
|
||||
|
||||
def option_parser():
|
||||
@ -980,7 +979,7 @@ def option_parser():
|
||||
%prog [options] file.html|opf
|
||||
|
||||
Follow all links in an HTML file and collect them into the specified directory.
|
||||
Also collects any resources like images, stylesheets, scripts, etc.
|
||||
Also collects any resources like images, stylesheets, scripts, etc.
|
||||
If an OPF file is specified instead, the list of files in its <spine> element
|
||||
is used.
|
||||
'''))
|
||||
@ -1056,11 +1055,11 @@ def merge_metadata(htmlfile, opf, opts):
|
||||
elif attr == 'tags':
|
||||
val = [i.strip() for i in val.split(',') if i.strip()]
|
||||
setattr(mi, attr, val)
|
||||
|
||||
|
||||
cover = getattr(opts, 'cover', False)
|
||||
if cover and os.path.exists(cover):
|
||||
mi.cover = os.path.abspath(cover)
|
||||
|
||||
|
||||
if not mi.title:
|
||||
if htmlfile:
|
||||
mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
|
||||
@ -1092,13 +1091,13 @@ def rebase_toc(toc, htmlfile_map, basepath, root=True):
|
||||
def fix_entry(entry):
|
||||
if entry.abspath in htmlfile_map.keys():
|
||||
entry.href = 'content/' + htmlfile_map[entry.abspath]
|
||||
|
||||
|
||||
for entry in toc:
|
||||
rebase_toc(entry, htmlfile_map, basepath, root=False)
|
||||
fix_entry(entry)
|
||||
if root:
|
||||
toc.base_path = basepath
|
||||
|
||||
|
||||
def create_dir(htmlfile, opts):
|
||||
'''
|
||||
Create a directory that contains the open ebook
|
||||
@ -1110,16 +1109,16 @@ def create_dir(htmlfile, opts):
|
||||
else:
|
||||
opf, filelist = get_filelist(htmlfile, opts)
|
||||
mi = merge_metadata(htmlfile, opf, opts)
|
||||
|
||||
|
||||
resource_map, htmlfile_map = parse_content(filelist, opts)
|
||||
resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
|
||||
|
||||
|
||||
if opf and opf.cover and os.access(opf.cover, os.R_OK):
|
||||
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1])
|
||||
shutil.copyfile(opf.cover, cpath)
|
||||
resources.append(cpath)
|
||||
mi.cover = cpath
|
||||
|
||||
|
||||
spine = [htmlfile_map[f.path] for f in filelist]
|
||||
mi = create_metadata(opts.output, mi, spine, resources)
|
||||
buf = cStringIO.StringIO()
|
||||
@ -1132,7 +1131,7 @@ def create_dir(htmlfile, opts):
|
||||
with open(os.path.join(opts.output, 'toc.ncx'), 'wb') as f:
|
||||
f.write(toc)
|
||||
print 'Open ebook created in', opts.output
|
||||
|
||||
|
||||
def create_oebzip(htmlfile, opts):
|
||||
'''
|
||||
Create a zip file that contains the Open ebook.
|
||||
@ -1154,13 +1153,13 @@ def main(args=sys.argv):
|
||||
parser.print_help()
|
||||
print _('You must specify an input HTML file')
|
||||
return 1
|
||||
|
||||
|
||||
htmlfile = args[1]
|
||||
if opts.zip:
|
||||
create_oebzip(htmlfile, opts)
|
||||
else:
|
||||
create_dir(htmlfile, opts)
|
||||
|
||||
|
||||
return 0
|
||||
|
||||
def gui_main(htmlfile, pt=None):
|
||||
@ -1183,7 +1182,7 @@ output = %s
|
||||
if len(nontrivial) < 2:
|
||||
return None
|
||||
return pt.name
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
|
@ -21,7 +21,7 @@ def config(defaults=None):
|
||||
c.remove_opt('profile')
|
||||
mobic = mobi_config(defaults=defaults)
|
||||
c.update(mobic)
|
||||
return c
|
||||
return c
|
||||
|
||||
def option_parser(usage=USAGE):
|
||||
usage = usage % ('Mobipocket', formats())
|
||||
@ -33,13 +33,13 @@ def any2mobi(opts, path, notification=None):
|
||||
if not ext:
|
||||
raise ValueError('Unknown file type: '+path)
|
||||
ext = ext.lower()[1:]
|
||||
|
||||
|
||||
if opts.output is None:
|
||||
opts.output = os.path.splitext(os.path.basename(path))[0]+'.mobi'
|
||||
|
||||
|
||||
opts.output = os.path.abspath(opts.output)
|
||||
orig_output = opts.output
|
||||
|
||||
|
||||
with TemporaryDirectory('_any2mobi') as tdir:
|
||||
oebdir = os.path.join(tdir, 'oeb')
|
||||
os.mkdir(oebdir)
|
||||
@ -54,7 +54,7 @@ def any2mobi(opts, path, notification=None):
|
||||
opts.output = orig_output
|
||||
logging.getLogger('html2epub').info(_('Creating Mobipocket file from EPUB...'))
|
||||
oeb2mobi(opts, opf)
|
||||
|
||||
|
||||
|
||||
def main(args=sys.argv):
|
||||
parser = option_parser()
|
||||
|
Loading…
x
Reference in New Issue
Block a user