From 241a2fc09911709065c8ffe42cf84e1803ad10c2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 21 Apr 2009 14:10:00 -0700 Subject: [PATCH] MOBI Output:Fix bug that cause MOBI conversion to sometimes fail when linking to an external stylesheet --- src/calibre/ebooks/html.py | 251 ++++++++++++++-------------- src/calibre/ebooks/mobi/from_any.py | 10 +- 2 files changed, 130 insertions(+), 131 deletions(-) diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index f405040c77..d515c606d3 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -30,10 +30,10 @@ from calibre.utils.zipfile import ZipFile from cssutils import CSSParser class HTMLElement(HtmlElement): - + @apply def specified_font_size(): - + def fget(self): ans = self.get('specified_font_size', '') if not ans: @@ -41,12 +41,12 @@ class HTMLElement(HtmlElement): if ans.startswith('f'): return functools.partial(operator.mul, float(ans[1:])) return float(ans) - + def fset(self, val): self.set('specified_font_size', ('f'+repr(val(1))) if callable(val) else repr(val)) - + return property(fget=fget, fset=fset) - + @apply def computed_font_size(): def fget(self): @@ -54,48 +54,48 @@ class HTMLElement(HtmlElement): if ans == '': return None return float(ans) - + def fset(self, val): self.set('computed_font_size', repr(val)) - + return property(fget=fget, fset=fset) - + def remove_font_size_information(self): for elem in self.iter(): for p in ('computed', 'specified'): elem.attrib.pop(p+'_font_size', None) - + def getpath(self): return self.getroottree().getpath(self) class Lookup(HtmlElementClassLookup): - + def lookup(self, node_type, document, namespace, name): if node_type == 'element': return HTMLElement return HtmlElementClassLookup.lookup(self, node_type, document, namespace, name) class HTMLParser(_HTMLParser): - + def __init__(self, **kwargs): super(HTMLParser, self).__init__(**kwargs) self.set_element_class_lookup(Lookup()) - + parser = HTMLParser() def fromstring(raw, **kw): return _fromstring(raw, parser=parser, **kw) def tostring(root, pretty_print=False): - return _tostring(root, encoding='utf-8', method='xml', - include_meta_content_type=True, + return _tostring(root, encoding='utf-8', method='xml', + include_meta_content_type=True, pretty_print=pretty_print) - + class Link(object): ''' Represents a link in a HTML file. ''' - + @classmethod def url_to_local_path(cls, url, base): path = urlunparse(('', '', url.path, url.params, url.query, '')) @@ -103,7 +103,7 @@ class Link(object): if os.path.isabs(path): return path return os.path.abspath(os.path.join(base, path)) - + def __init__(self, url, base): ''' :param url: The url this link points to. Must be an unquoted unicode string. @@ -127,13 +127,13 @@ class Link(object): def __eq__(self, other): return self.path == getattr(other, 'path', other) - + def __str__(self): - return u'Link: %s --> %s'%(self.url, self.path) - + return u'Link: %s --> %s'%(self.url, self.path) + class IgnoreFile(Exception): - + def __init__(self, msg, errno): Exception.__init__(self, msg) self.doesnt_exist = errno == 2 @@ -148,13 +148,13 @@ class HTMLFile(object): The encoding of the file is available as :member:`encoding`. 
''' - + HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE) TITLE_PAT = re.compile('([^<>]+)', re.IGNORECASE) LINK_PAT = re.compile( r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P[^"]+)")|(?:\'(?P[^\']+)\')|(?P[^\s>]+))', re.DOTALL|re.IGNORECASE) - + def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None): ''' :param level: The level of this file. Should be 0 for the root file. @@ -167,7 +167,7 @@ class HTMLFile(object): self.level = level self.referrer = referrer self.links = [] - + try: with open(self.path, 'rb') as f: src = f.read() @@ -176,7 +176,7 @@ class HTMLFile(object): if level == 0: raise IOError(msg) raise IgnoreFile(msg, err.errno) - + self.is_binary = not bool(self.HTML_PAT.search(src[:1024])) if not self.is_binary: if encoding is None: @@ -189,19 +189,19 @@ class HTMLFile(object): match = self.TITLE_PAT.search(src) self.title = match.group(1) if match is not None else self.title self.find_links(src) - - - + + + def __eq__(self, other): return self.path == getattr(other, 'path', other) - + def __str__(self): return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path) - + def __repr__(self): return str(self) - - + + def find_links(self, src): for match in self.LINK_PAT.finditer(src): url = None @@ -212,7 +212,7 @@ class HTMLFile(object): link = self.resolve(url) if link not in self.links: self.links.append(link) - + def resolve(self, url): return Link(url, self.base) @@ -234,13 +234,13 @@ def depth_first(root, flat, visited=set([])): if hf not in visited: yield hf visited.add(hf) - - + + def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None): ''' Recursively traverse all links in the HTML file. - - :param max_levels: Maximum levels of recursion. Must be non-negative. 0 + + :param max_levels: Maximum levels of recursion. Must be non-negative. 0 implies that no links in the root HTML file are followed. :param encoding: Specify character encoding of HTML files. If `None` it is auto-detected. @@ -271,7 +271,7 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None) print repr(err) for link in rejects: hf.links.remove(link) - + next_level = list(nl) orec = sys.getrecursionlimit() sys.setrecursionlimit(500000) @@ -279,14 +279,14 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None) return flat, list(depth_first(flat[0], flat)) finally: sys.setrecursionlimit(orec) - - + + def opf_traverse(opf_reader, verbose=0, encoding=None): ''' Return a list of :class:`HTMLFile` objects in the order specified by the `` element of the OPF. - - :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance. + + :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance. :param encoding: Specify character encoding of HTML files. If `None` it is auto-detected. ''' @@ -317,7 +317,7 @@ def opf_traverse(opf_reader, verbose=0, encoding=None): print 'WARNING: OPF spine item %s does not exist'%path ans = [f for f in ans if not f.is_binary] return ans - + convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp']) _span_pat = re.compile('', re.DOTALL|re.IGNORECASE) @@ -326,20 +326,20 @@ def sanitize_head(match): x = match.group(1) x = _span_pat.sub('', x) return '\n'+x+'\n' - + class PreProcessor(object): PREPROCESS = [ # Some idiotic HTML generators (Frontpage I'm looking at you) # Put all sorts of crap into . 
This messes up lxml
-        (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
+        (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
          sanitize_head),
         # Convert all entities, since lxml doesn't handle them well
         (re.compile(r'&(\S+?);'), convert_entities),
         # Remove the
-        (re.compile(r'', re.IGNORECASE),
+        (re.compile(r'', re.IGNORECASE),
          lambda match: ''),
         ]
-
+
     # Fix pdftohtml markup
     PDFTOHTML = [
                  # Remove <hr> tags
@@ -348,20 +348,20 @@ class PreProcessor(object):
                  (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
                  # Remove <br> and replace <br><br> with <p>
                  (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
-                 (re.compile(r'<i>(.*)</i>', re.IGNORECASE),
-                  lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
+                 (re.compile(r'<i>(.*)</i>', re.IGNORECASE),
+                  lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
                                else match.group(1)),
                  # Remove hyphenation
                  (re.compile(r'-\n\r?'), lambda match: ''),
-
+
                  # Remove gray background
                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
-
+
                  # Remove non breaking spaces
                  (re.compile(ur'\u00a0'), lambda match : ' '),
-
+
                  ]
-
+
     # Fix Book Designer markup
     BOOK_DESIGNER = [
                     # HR
@@ -377,17 +377,17 @@ class PreProcessor(object):
                     (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                      lambda match : '<h3 class="subtitle">%s</h3>
'%(match.group(1),)), ] - + def is_baen(self, src): - return re.compile(r'<]*id=BookTitle', raw) is not None - + def is_pdftohtml(self, src): return '' in src[:1000] - + def preprocess(self, html): opts = getattr(self, 'opts', False) if opts and hasattr(opts, 'profile') and getattr(opts.profile, 'remove_special_chars', False): @@ -403,17 +403,17 @@ class PreProcessor(object): for rule in self.PREPROCESS + rules: html = rule[0].sub(rule[1], html) return html - + class Parser(PreProcessor, LoggingInterface): # SELF_CLOSING_TAGS = 'hr|br|link|img|meta|input|area|base|basefont' -# SELF_CLOSING_RULES = [re.compile(p[0]%SELF_CLOSING_TAGS, re.IGNORECASE) for p in +# SELF_CLOSING_RULES = [re.compile(p[0]%SELF_CLOSING_TAGS, re.IGNORECASE) for p in # [ # (r'<(?P%s)(?P(\s+[^<>]*){0,1})(?', # '<\g\g />'), # (), # ] # ] - + def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'): LoggingInterface.__init__(self, logging.getLogger(name)) self.setup_cli_handler(opts.verbose) @@ -433,27 +433,27 @@ class Parser(PreProcessor, LoggingInterface): name = os.path.splitext(name)[0] + '_cr_%d'%save_counter + os.path.splitext(name)[1] save_counter += 1 self.htmlfile_map[f.path] = name - + self.parse_html() # Handle tags inside embedded # At least one source of EPUB files (Penguin) uses xlink:href # without declaring the xlink namespace - for image in self.root.xpath('//image'): + for image in self.root.xpath('//image'): for attr in image.attrib.keys(): if attr.endswith(':href'): nhref = self.rewrite_links(image.get(attr)) image.set(attr, nhref) - + self.root.rewrite_links(self.rewrite_links, resolve_base_href=False) for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates if self.root.get(bad, None) is not None: self.root.attrib.pop(bad) - - - + + + def save_path(self): return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]) - + def save(self, strip_comments=False): ''' Save processed HTML into the content directory. 
@@ -463,7 +463,7 @@ class Parser(PreProcessor, LoggingInterface): self.root.set('xmlns:xlink', 'http://www.w3.org/1999/xlink') for svg in self.root.xpath('//svg'): svg.set('xmlns', 'http://www.w3.org/2000/svg') - + ans = tostring(self.root, pretty_print=self.opts.pretty_print) ans = re.compile(r'', re.IGNORECASE).sub( '\n\t 50: text = text[:50] + u'\u2026' self.next_play_order += 1 - return target.add_item(href, fragment, text, type=type, + return target.add_item(href, fragment, text, type=type, play_order=self.next_play_order) add_item = Adder(toc) name = self.htmlfile_map[self.htmlfile.path] href = 'content/'+name - + # Add level* TOC items counter = 0 - + def elem_to_link(elem, href, counter): text = (u''.join(elem.xpath('string()'))).strip() if not text: @@ -662,8 +662,8 @@ class Processor(Parser): elem.set('id', id) frag = id return text, _href, frag - - + + if self.opts.level1_toc is not None: level1 = self.opts.level1_toc(self.root) level1_order = [] @@ -702,17 +702,17 @@ class Processor(Parser): counter += 1 if text: add_item(_href, frag, text, level2, type='chapter') - - + + if level1_order: # Fix play order next_play_order = level1_order[0].play_order for x in level1_order: for y in x.flat(): y.play_order = next_play_order next_play_order += 1 - - - + + + if len(toc) > 0: # Detected TOC entries using --level* options # so aborting all other toc processing @@ -726,7 +726,7 @@ class Processor(Parser): id = elem.get('id', 'calibre_chapter_%d'%counter) elem.set('id', id) add_item(href, id, text, toc, type='chapter') - + if len(list(toc.flat())) >= self.opts.toc_threshold: return referrer = toc @@ -745,7 +745,7 @@ class Processor(Parser): name = self.htmlfile_map[self.htmlfile.referrer.path] href = 'content/'+name referrer = add_item(href, None, text, toc) - + # Add links to TOC if int(self.opts.max_toc_links) > 0: for link in list(self.LINKS_PATH(self.root))[:self.opts.max_toc_links]: @@ -762,7 +762,7 @@ class Processor(Parser): if len(parts) > 1: fragment = parts[1] add_item(href, fragment, text, referrer) - + @classmethod def preprocess_css(cls, css, dpi=96): def rescale(match): @@ -772,17 +772,17 @@ class Processor(Parser): except ValueError: return '' return '%fpt'%(72 * val/dpi) - + css = cls.PIXEL_PAT.sub(rescale, css) css = cls.PAGE_PAT.sub('', css) return css - + def extract_css(self, parsed_sheets): ''' - Remove all CSS information from the document and store it as + Remove all CSS information from the document and store it as :class:`StyleSheet` objects. 
''' - + def get_id(chapter, counter, prefix='calibre_css_'): new_id = '%s_%d'%(prefix, counter) if chapter.tag.lower() == 'a' and 'name' in chapter.keys(): @@ -796,7 +796,7 @@ class Processor(Parser): id = new_id chapter.set('id', id) return id - + self.external_stylesheets, self.stylesheet = [], self.css_parser.parseString('') self.specified_override_css = [] for link in self.root.xpath('//link'): @@ -825,8 +825,7 @@ class Processor(Parser): self.log_exception('') if parsed_sheets.has_key(file): self.external_stylesheets.append(parsed_sheets[file]) - - + for style in self.root.xpath('//style'): if 'css' in style.get('type', 'text/css').lower(): override_css = style.get('title', '') == 'override_css' @@ -889,7 +888,7 @@ class Processor(Parser): cn += classname font.set('class', cn) font.tag = 'span' - + id_css, id_css_counter = {}, 0 for elem in self.root.xpath('//*[@style]'): setting = elem.get('style') @@ -906,7 +905,7 @@ class Processor(Parser): cn = elem.get('class', classname) elem.set('class', cn) elem.attrib.pop('style') - + css = '\n'.join(['.%s {%s;}'%(cn, setting) for \ setting, cn in cache.items()]) css += '\n\n' @@ -930,28 +929,28 @@ class Processor(Parser): self.override_css = self.css_parser.parseString(self.preprocess_css(css)) for rule in reversed(self.specified_override_css): self.override_css.insertRule(rule, index=0) - - + + def config(defaults=None, config_name='html', desc=_('Options to control the traversal of HTML')): if defaults is None: c = Config(config_name, desc) else: c = StringConfig(defaults, desc) - + c.add_opt('output', ['-o', '--output'], default=None, help=_('The output directory. Default is the current directory.')) c.add_opt('encoding', ['--encoding'], default=None, help=_('Character encoding for HTML files. Default is to auto detect.')) c.add_opt('zip', ['--zip'], default=False, help=_('Create the output in a zip file. If this option is specified, the --output should be the name of a file not a directory.')) - + traversal = c.add_group('traversal', _('Control the following of links in HTML files.')) traversal('breadth_first', ['--breadth-first'], default=False, help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first')) traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal', help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.')) - + metadata = c.add_group('metadata', _('Set metadata of the generated ebook')) metadata('title', ['-t', '--title'], default=None, help=_('Set the title. Default is to autodetect.')) @@ -965,13 +964,13 @@ def config(defaults=None, config_name='html', help=_('A summary of this book.')) metadata('from_opf', ['--metadata-from'], default=None, help=_('Load metadata from the specified OPF file')) - + debug = c.add_group('debug', _('Options useful for debugging')) debug('verbose', ['-v', '--verbose'], default=0, action='count', help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.')) debug('pretty_print', ['--pretty-print'], default=False, help=_('Output HTML is "pretty printed" for easier parsing by humans')) - + return c def option_parser(): @@ -980,7 +979,7 @@ def option_parser(): %prog [options] file.html|opf Follow all links in an HTML file and collect them into the specified directory. -Also collects any resources like images, stylesheets, scripts, etc. 
+Also collects any resources like images, stylesheets, scripts, etc. If an OPF file is specified instead, the list of files in its element is used. ''')) @@ -1056,11 +1055,11 @@ def merge_metadata(htmlfile, opf, opts): elif attr == 'tags': val = [i.strip() for i in val.split(',') if i.strip()] setattr(mi, attr, val) - + cover = getattr(opts, 'cover', False) if cover and os.path.exists(cover): mi.cover = os.path.abspath(cover) - + if not mi.title: if htmlfile: mi.title = os.path.splitext(os.path.basename(htmlfile))[0] @@ -1092,13 +1091,13 @@ def rebase_toc(toc, htmlfile_map, basepath, root=True): def fix_entry(entry): if entry.abspath in htmlfile_map.keys(): entry.href = 'content/' + htmlfile_map[entry.abspath] - + for entry in toc: rebase_toc(entry, htmlfile_map, basepath, root=False) fix_entry(entry) if root: toc.base_path = basepath - + def create_dir(htmlfile, opts): ''' Create a directory that contains the open ebook @@ -1110,16 +1109,16 @@ def create_dir(htmlfile, opts): else: opf, filelist = get_filelist(htmlfile, opts) mi = merge_metadata(htmlfile, opf, opts) - + resource_map, htmlfile_map = parse_content(filelist, opts) resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()] - + if opf and opf.cover and os.access(opf.cover, os.R_OK): cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1]) shutil.copyfile(opf.cover, cpath) resources.append(cpath) mi.cover = cpath - + spine = [htmlfile_map[f.path] for f in filelist] mi = create_metadata(opts.output, mi, spine, resources) buf = cStringIO.StringIO() @@ -1132,7 +1131,7 @@ def create_dir(htmlfile, opts): with open(os.path.join(opts.output, 'toc.ncx'), 'wb') as f: f.write(toc) print 'Open ebook created in', opts.output - + def create_oebzip(htmlfile, opts): ''' Create a zip file that contains the Open ebook. @@ -1154,13 +1153,13 @@ def main(args=sys.argv): parser.print_help() print _('You must specify an input HTML file') return 1 - + htmlfile = args[1] if opts.zip: create_oebzip(htmlfile, opts) else: create_dir(htmlfile, opts) - + return 0 def gui_main(htmlfile, pt=None): @@ -1183,7 +1182,7 @@ output = %s if len(nontrivial) < 2: return None return pt.name - + if __name__ == '__main__': sys.exit(main()) diff --git a/src/calibre/ebooks/mobi/from_any.py b/src/calibre/ebooks/mobi/from_any.py index 5607690e21..fc9e94dafb 100644 --- a/src/calibre/ebooks/mobi/from_any.py +++ b/src/calibre/ebooks/mobi/from_any.py @@ -21,7 +21,7 @@ def config(defaults=None): c.remove_opt('profile') mobic = mobi_config(defaults=defaults) c.update(mobic) - return c + return c def option_parser(usage=USAGE): usage = usage % ('Mobipocket', formats()) @@ -33,13 +33,13 @@ def any2mobi(opts, path, notification=None): if not ext: raise ValueError('Unknown file type: '+path) ext = ext.lower()[1:] - + if opts.output is None: opts.output = os.path.splitext(os.path.basename(path))[0]+'.mobi' - + opts.output = os.path.abspath(opts.output) orig_output = opts.output - + with TemporaryDirectory('_any2mobi') as tdir: oebdir = os.path.join(tdir, 'oeb') os.mkdir(oebdir) @@ -54,7 +54,7 @@ def any2mobi(opts, path, notification=None): opts.output = orig_output logging.getLogger('html2epub').info(_('Creating Mobipocket file from EPUB...')) oeb2mobi(opts, opf) - + def main(args=sys.argv): parser = option_parser()