mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
Fix #1746. Improve handling of encoding.
This commit is contained in:
parent
2f3680e563
commit
6fbf78aa7f
@ -24,6 +24,7 @@ import calibre
|
||||
from calibre import LoggingInterface
|
||||
from calibre.translations.dynamic import translate
|
||||
from calibre.startup import get_lang
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
|
||||
from calibre.ebooks.metadata.epub import CoverRenderer
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
@ -87,6 +88,7 @@ ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
|
||||
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
|
||||
QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
|
||||
PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
|
||||
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
||||
|
||||
def element(parent, *args, **kwargs):
|
||||
if parent is not None:
|
||||
@ -447,9 +449,10 @@ class Manifest(object):
|
||||
% (self.id, self.href, self.media_type)
|
||||
|
||||
def _force_xhtml(self, data):
|
||||
# Possibly decode in user-specified encoding
|
||||
if self.oeb.encoding is not None:
|
||||
data = data.decode(self.oeb.encoding, 'replace')
|
||||
# Convert to Unicode and normalize line endings
|
||||
data = self.oeb.decode(data)
|
||||
data = XMLDECL_RE.sub('', data)
|
||||
data = data.replace('\r\n', '\n').replace('\r', '\n')
|
||||
# Handle broken XHTML w/ SVG (ugh)
|
||||
if 'svg:' in data and SVG_NS not in data:
|
||||
data = data.replace(
|
||||
@ -1381,6 +1384,20 @@ class OEBBook(object):
|
||||
lang = lang.split('-', 1)[0].lower()
|
||||
return translate(lang, text)
|
||||
|
||||
def decode(self, data):
    """Return `data` as a `unicode` object, guessing the encoding if needed.

    `data` is returned unchanged when it is already `unicode`. Otherwise
    the following are tried in order, and the first strict decode that
    succeeds wins:

      1. UTF-8
      2. UTF-16, but only when `data` starts with a UTF-16 byte-order mark
      3. `self.encoding`, when it is not None

    If none of them succeed, the first element of the pair returned by
    `xml_to_unicode(data)` is returned.
    """
    import codecs

    if isinstance(data, unicode):
        return data
    encodings = ['utf-8']
    # Only trust UTF-16 when a BOM is present. Without one, the codec
    # accepts nearly any even-length byte string, so legacy 8-bit text
    # would be "successfully" decoded into garbage and the user-specified
    # encoding below would never get a chance.
    if data[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        encodings.append('utf-16')
    if self.encoding is not None:
        encodings.append(self.encoding)
    for encoding in encodings:
        try:
            return data.decode(encoding)
        except UnicodeDecodeError:
            # Wrong guess; fall through to the next candidate.
            pass
    # Last resort: let the detection helper work it out.
    data, _ = xml_to_unicode(data)
    return data
|
||||
|
||||
def to_opf1(self):
|
||||
package = etree.Element('package',
|
||||
attrib={'unique-identifier': self.uid.id})
|
||||
|
@ -109,6 +109,7 @@ class Stylizer(object):
|
||||
STYLESHEETS = {}
|
||||
|
||||
def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']):
|
||||
self.oeb = oeb
|
||||
self.profile = profile
|
||||
self.logger = oeb.logger
|
||||
item = oeb.manifest.hrefs[path]
|
||||
@ -117,7 +118,7 @@ class Stylizer(object):
|
||||
stylesheets = [HTML_CSS_STYLESHEET]
|
||||
head = xpath(tree, '/h:html/h:head')[0]
|
||||
parser = cssutils.CSSParser()
|
||||
parser.setFetcher(lambda path: ('utf-8', oeb.container.read(path)))
|
||||
parser.setFetcher(self._fetch_css_file)
|
||||
for elem in head:
|
||||
if elem.tag == XHTML('style') and elem.text \
|
||||
and elem.get('type', CSS_MIME) in OEB_STYLES:
|
||||
@ -138,8 +139,7 @@ class Stylizer(object):
|
||||
if path in self.STYLESHEETS:
|
||||
stylesheet = self.STYLESHEETS[path]
|
||||
else:
|
||||
data = XHTML_CSS_NAMESPACE
|
||||
data += oeb.manifest.hrefs[path].data
|
||||
data = self._fetch_css_file(path)[1]
|
||||
stylesheet = parser.parseString(data, href=path)
|
||||
stylesheet.namespaces['h'] = XHTML_NS
|
||||
self.STYLESHEETS[path] = stylesheet
|
||||
@ -167,6 +167,15 @@ class Stylizer(object):
|
||||
for elem in xpath(tree, '//h:*[@style]'):
|
||||
self.style(elem)._apply_style_attr()
|
||||
|
||||
def _fetch_css_file(self, path):
    """Look up the stylesheet at `path` in the book's manifest.

    Returns a 2-tuple whose first element is always None. The second
    element is the manifest item's data, passed through
    `self.oeb.decode` and prefixed with `XHTML_CSS_NAMESPACE`, or None
    when `path` is not present in the manifest's hrefs.
    """
    manifest_items = self.oeb.manifest.hrefs
    if path in manifest_items:
        css_text = self.oeb.decode(manifest_items[path].data)
        return (None, XHTML_CSS_NAMESPACE + css_text)
    # Unknown path: nothing to hand back.
    return (None, None)
|
||||
|
||||
def flatten_rule(self, rule, href, index):
|
||||
results = []
|
||||
if isinstance(rule, CSSStyleRule):
|
||||
|
Loading…
x
Reference in New Issue
Block a user