Conversion pipeline: Fix broken link rewriting for inline CSS embedded in HTML

This commit is contained in:
Kovid Goyal 2010-12-21 19:16:10 -07:00
parent 6abc12cf18
commit 523185f7a9

View File

@ -11,12 +11,11 @@ import os, re, uuid, logging
from mimetypes import types_map from mimetypes import types_map
from collections import defaultdict from collections import defaultdict
from itertools import count from itertools import count
from urlparse import urldefrag, urlparse, urlunparse from urlparse import urldefrag, urlparse, urlunparse, urljoin
from urllib import unquote as urlunquote from urllib import unquote as urlunquote
from urlparse import urljoin
from lxml import etree, html from lxml import etree, html
from cssutils import CSSParser from cssutils import CSSParser, parseString, parseStyle, replaceUrls
from cssutils.css import CSSRule from cssutils.css import CSSRule
import calibre import calibre
@ -88,11 +87,11 @@ def XLINK(name):
def CALIBRE(name): def CALIBRE(name):
return '{%s}%s' % (CALIBRE_NS, name) return '{%s}%s' % (CALIBRE_NS, name)
_css_url_re = re.compile(r'url\((.*?)\)', re.I) _css_url_re = re.compile(r'url\s*\((.*?)\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"') _css_import_re = re.compile(r'@import "(.*?)"')
_archive_re = re.compile(r'[^ ]+') _archive_re = re.compile(r'[^ ]+')
def iterlinks(root): def iterlinks(root, find_links_in_css=True):
''' '''
Iterate over all links in a OEB Document. Iterate over all links in a OEB Document.
@ -134,6 +133,8 @@ def iterlinks(root):
yield (el, attr, attribs[attr], 0) yield (el, attr, attribs[attr], 0)
if not find_links_in_css:
continue
if tag == XHTML('style') and el.text: if tag == XHTML('style') and el.text:
for match in _css_url_re.finditer(el.text): for match in _css_url_re.finditer(el.text):
yield (el, None, match.group(1), match.start(1)) yield (el, None, match.group(1), match.start(1))
@ -180,7 +181,7 @@ def rewrite_links(root, link_repl_func, resolve_base_href=False):
''' '''
if resolve_base_href: if resolve_base_href:
resolve_base_href(root) resolve_base_href(root)
for el, attrib, link, pos in iterlinks(root): for el, attrib, link, pos in iterlinks(root, find_links_in_css=False):
new_link = link_repl_func(link.strip()) new_link = link_repl_func(link.strip())
if new_link == link: if new_link == link:
continue continue
@ -203,6 +204,44 @@ def rewrite_links(root, link_repl_func, resolve_base_href=False):
new = cur[:pos] + new_link + cur[pos+len(link):] new = cur[:pos] + new_link + cur[pos+len(link):]
el.attrib[attrib] = new el.attrib[attrib] = new
# Helper: rewrite a single CSS value object in place.
# When `v` is a primitive value whose primitive type is a URI, its current
# string is passed through link_repl_func and the result is stored back as
# a URI. Any other kind of value is left untouched.
# NOTE(review): link_repl_func is not a parameter here; presumably it is the
# argument of the enclosing rewrite_links() -- confirm against the full file.
def set_property(v):
if v.CSS_PRIMITIVE_VALUE == v.cssValueType and \
v.CSS_URI == v.primitiveType:
v.setStringValue(v.CSS_URI,
link_repl_func(v.getStringValue()))
# Walk every element under root and rewrite links that live inside CSS:
# the text of <style> elements and the value of style="" attributes.
for el in root.iter():
try:
tag = el.tag
except UnicodeDecodeError:
# Reading the tag failed to decode; skip this element entirely.
continue
# <style> element: only parse the sheet when it appears to contain a
# url(...) reference or an @import rule.
if tag == XHTML('style') and el.text and \
(_css_url_re.search(el.text) is not None or '@import' in
el.text):
stylesheet = parseString(el.text)
# NOTE(review): replaceUrls is a cssutils helper; presumably it calls
# link_repl_func on each URL in the sheet -- confirm in cssutils docs.
replaceUrls(stylesheet, link_repl_func)
# Replace the element text with the re-serialized sheet, padded with
# a newline on each side.
el.text = '\n'+stylesheet.cssText + '\n'
# Inline style attribute: only parse when it contains a url(...) reference.
if 'style' in el.attrib:
text = el.attrib['style']
if _css_url_re.search(text) is not None:
stext = parseStyle(text)
changed = False
for p in stext.getProperties(all=True):
v = p.cssValue
# A value list is handled item by item; a lone primitive value
# is handled directly. set_property itself decides whether the
# value is a URI that needs rewriting.
if v.CSS_VALUE_LIST == v.cssValueType:
for item in v:
changed = True
set_property(item)
elif v.CSS_PRIMITIVE_VALUE == v.cssValueType:
changed = True
set_property(v)
# NOTE(review): `changed` is set whenever a list item or primitive
# value is visited, not only when a URI was actually replaced, so the
# attribute is re-serialized even if no link changed.
if changed:
# Store the declaration back on one line: newlines and carriage
# returns in the serialized CSS are replaced with spaces.
el.attrib['style'] = stext.cssText.replace('\n', ' ').replace('\r',
' ')
EPUB_MIME = types_map['.epub'] EPUB_MIME = types_map['.epub']
XHTML_MIME = types_map['.xhtml'] XHTML_MIME = types_map['.xhtml']