Fix #1746. Improve handling of encoding.

This commit is contained in:
Marshall T. Vandegrift 2009-02-01 21:20:41 -05:00
parent 2f3680e563
commit 6fbf78aa7f
2 changed files with 32 additions and 6 deletions

View File

@ -24,6 +24,7 @@ import calibre
from calibre import LoggingInterface from calibre import LoggingInterface
from calibre.translations.dynamic import translate from calibre.translations.dynamic import translate
from calibre.startup import get_lang from calibre.startup import get_lang
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.metadata.epub import CoverRenderer from calibre.ebooks.metadata.epub import CoverRenderer
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
@ -87,6 +88,7 @@ ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$') QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+') PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
def element(parent, *args, **kwargs): def element(parent, *args, **kwargs):
if parent is not None: if parent is not None:
@ -447,9 +449,10 @@ class Manifest(object):
% (self.id, self.href, self.media_type) % (self.id, self.href, self.media_type)
def _force_xhtml(self, data): def _force_xhtml(self, data):
# Possibly decode in user-specified encoding # Convert to Unicode and normalize line endings
if self.oeb.encoding is not None: data = self.oeb.decode(data)
data = data.decode(self.oeb.encoding, 'replace') data = XMLDECL_RE.sub('', data)
data = data.replace('\r\n', '\n').replace('\r', '\n')
# Handle broken XHTML w/ SVG (ugh) # Handle broken XHTML w/ SVG (ugh)
if 'svg:' in data and SVG_NS not in data: if 'svg:' in data and SVG_NS not in data:
data = data.replace( data = data.replace(
@ -1381,6 +1384,20 @@ class OEBBook(object):
lang = lang.split('-', 1)[0].lower() lang = lang.split('-', 1)[0].lower()
return translate(lang, text) return translate(lang, text)
def decode(self, data):
    """Return `data` as a `unicode` object, guessing the encoding if needed.

    `unicode` input is returned unchanged.  Byte strings are tried, in
    order, as UTF-8, as UTF-16 (only when the data starts with a UTF-16
    byte-order mark) and as `self.encoding` (when it is not None).  If
    none of those decodes without error, the data is passed to
    `xml_to_unicode` and the first element of its result is returned.
    """
    if isinstance(data, unicode):
        return data
    import codecs
    encodings = ['utf-8']
    # Only attempt UTF-16 when a BOM is present: without one, nearly any
    # even-length byte string "decodes" as UTF-16 without raising, which
    # would turn legacy 8-bit text into garbage and prevent the
    # user-specified encoding below from ever being consulted.
    if data.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        encodings.append('utf-16')
    if self.encoding is not None:
        encodings.append(self.encoding)
    for encoding in encodings:
        try:
            return data.decode(encoding)
        except UnicodeDecodeError:
            # Not this encoding; fall through to the next candidate.
            pass
    # Last resort: let xml_to_unicode work out the encoding.
    data, _ = xml_to_unicode(data)
    return data
def to_opf1(self): def to_opf1(self):
package = etree.Element('package', package = etree.Element('package',
attrib={'unique-identifier': self.uid.id}) attrib={'unique-identifier': self.uid.id})

View File

@ -109,6 +109,7 @@ class Stylizer(object):
STYLESHEETS = {} STYLESHEETS = {}
def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']): def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']):
self.oeb = oeb
self.profile = profile self.profile = profile
self.logger = oeb.logger self.logger = oeb.logger
item = oeb.manifest.hrefs[path] item = oeb.manifest.hrefs[path]
@ -117,7 +118,7 @@ class Stylizer(object):
stylesheets = [HTML_CSS_STYLESHEET] stylesheets = [HTML_CSS_STYLESHEET]
head = xpath(tree, '/h:html/h:head')[0] head = xpath(tree, '/h:html/h:head')[0]
parser = cssutils.CSSParser() parser = cssutils.CSSParser()
parser.setFetcher(lambda path: ('utf-8', oeb.container.read(path))) parser.setFetcher(self._fetch_css_file)
for elem in head: for elem in head:
if elem.tag == XHTML('style') and elem.text \ if elem.tag == XHTML('style') and elem.text \
and elem.get('type', CSS_MIME) in OEB_STYLES: and elem.get('type', CSS_MIME) in OEB_STYLES:
@ -138,8 +139,7 @@ class Stylizer(object):
if path in self.STYLESHEETS: if path in self.STYLESHEETS:
stylesheet = self.STYLESHEETS[path] stylesheet = self.STYLESHEETS[path]
else: else:
data = XHTML_CSS_NAMESPACE data = self._fetch_css_file(path)[1]
data += oeb.manifest.hrefs[path].data
stylesheet = parser.parseString(data, href=path) stylesheet = parser.parseString(data, href=path)
stylesheet.namespaces['h'] = XHTML_NS stylesheet.namespaces['h'] = XHTML_NS
self.STYLESHEETS[path] = stylesheet self.STYLESHEETS[path] = stylesheet
@ -167,6 +167,15 @@ class Stylizer(object):
for elem in xpath(tree, '//h:*[@style]'): for elem in xpath(tree, '//h:*[@style]'):
self.style(elem)._apply_style_attr() self.style(elem)._apply_style_attr()
def _fetch_css_file(self, path):
    """Fetch the stylesheet stored in the manifest under `path`.

    Returns a 2-tuple whose first element is always None.  The second
    element is None when `path` is not a key of the manifest's `hrefs`
    mapping; otherwise it is the item's data, decoded with
    `self.oeb.decode` and prefixed with XHTML_CSS_NAMESPACE.
    """
    manifest_items = self.oeb.manifest.hrefs
    if path in manifest_items:
        css_text = self.oeb.decode(manifest_items[path].data)
        return (None, XHTML_CSS_NAMESPACE + css_text)
    # Unknown path: nothing to hand back.
    return (None, None)
def flatten_rule(self, rule, href, index): def flatten_rule(self, rule, href, index):
results = [] results = []
if isinstance(rule, CSSStyleRule): if isinstance(rule, CSSStyleRule):