mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
Fix #1746. Improve handling of encoding.
This commit is contained in:
parent
2f3680e563
commit
6fbf78aa7f
@ -24,6 +24,7 @@ import calibre
|
|||||||
from calibre import LoggingInterface
|
from calibre import LoggingInterface
|
||||||
from calibre.translations.dynamic import translate
|
from calibre.translations.dynamic import translate
|
||||||
from calibre.startup import get_lang
|
from calibre.startup import get_lang
|
||||||
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
|
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
|
||||||
from calibre.ebooks.metadata.epub import CoverRenderer
|
from calibre.ebooks.metadata.epub import CoverRenderer
|
||||||
from calibre.ptempfile import TemporaryDirectory
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
@ -87,6 +88,7 @@ ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
|
|||||||
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
|
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
|
||||||
QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
|
QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
|
||||||
PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
|
PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
|
||||||
|
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
||||||
|
|
||||||
def element(parent, *args, **kwargs):
|
def element(parent, *args, **kwargs):
|
||||||
if parent is not None:
|
if parent is not None:
|
||||||
@ -447,9 +449,10 @@ class Manifest(object):
|
|||||||
% (self.id, self.href, self.media_type)
|
% (self.id, self.href, self.media_type)
|
||||||
|
|
||||||
def _force_xhtml(self, data):
|
def _force_xhtml(self, data):
|
||||||
# Possibly decode in user-specified encoding
|
# Convert to Unicode and normalize line endings
|
||||||
if self.oeb.encoding is not None:
|
data = self.oeb.decode(data)
|
||||||
data = data.decode(self.oeb.encoding, 'replace')
|
data = XMLDECL_RE.sub('', data)
|
||||||
|
data = data.replace('\r\n', '\n').replace('\r', '\n')
|
||||||
# Handle broken XHTML w/ SVG (ugh)
|
# Handle broken XHTML w/ SVG (ugh)
|
||||||
if 'svg:' in data and SVG_NS not in data:
|
if 'svg:' in data and SVG_NS not in data:
|
||||||
data = data.replace(
|
data = data.replace(
|
||||||
@ -1381,6 +1384,20 @@ class OEBBook(object):
|
|||||||
lang = lang.split('-', 1)[0].lower()
|
lang = lang.split('-', 1)[0].lower()
|
||||||
return translate(lang, text)
|
return translate(lang, text)
|
||||||
|
|
||||||
|
def decode(self, data):
    """Return `data` as a `unicode` object, guessing the encoding if needed.

    `data` is returned unchanged when it is already `unicode`.  Otherwise
    the following encodings are tried in order, and the first one that
    decodes without a `UnicodeDecodeError` wins:

    1. UTF-8.
    2. UTF-16, but only when `data` starts with a UTF-16 byte-order mark.
    3. `self.encoding`, when it is not None.

    If none of them succeed, the first element of the result of
    `xml_to_unicode(data)` is returned (presumably that helper performs
    encoding detection -- confirm against calibre.ebooks.chardet).
    """
    # Function-scope import so this block does not depend on a file-level
    # `import codecs` being present.
    import codecs
    if isinstance(data, unicode):
        return data
    encodings = ['utf-8']
    # Without a BOM the 'utf-16' codec assumes a byte order and will
    # "successfully" decode almost any even-length byte string into
    # garbage, which would shadow both the user-specified encoding and the
    # detection fallback below.  Only trust UTF-16 when a BOM says so.
    if data.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        encodings.append('utf-16')
    if self.encoding is not None:
        encodings.append(self.encoding)
    for encoding in encodings:
        try:
            return data.decode(encoding)
        except UnicodeDecodeError:
            # Wrong guess; fall through to the next candidate.
            pass
    data, _ = xml_to_unicode(data)
    return data
|
||||||
|
|
||||||
def to_opf1(self):
|
def to_opf1(self):
|
||||||
package = etree.Element('package',
|
package = etree.Element('package',
|
||||||
attrib={'unique-identifier': self.uid.id})
|
attrib={'unique-identifier': self.uid.id})
|
||||||
|
@ -109,6 +109,7 @@ class Stylizer(object):
|
|||||||
STYLESHEETS = {}
|
STYLESHEETS = {}
|
||||||
|
|
||||||
def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']):
|
def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']):
|
||||||
|
self.oeb = oeb
|
||||||
self.profile = profile
|
self.profile = profile
|
||||||
self.logger = oeb.logger
|
self.logger = oeb.logger
|
||||||
item = oeb.manifest.hrefs[path]
|
item = oeb.manifest.hrefs[path]
|
||||||
@ -117,7 +118,7 @@ class Stylizer(object):
|
|||||||
stylesheets = [HTML_CSS_STYLESHEET]
|
stylesheets = [HTML_CSS_STYLESHEET]
|
||||||
head = xpath(tree, '/h:html/h:head')[0]
|
head = xpath(tree, '/h:html/h:head')[0]
|
||||||
parser = cssutils.CSSParser()
|
parser = cssutils.CSSParser()
|
||||||
parser.setFetcher(lambda path: ('utf-8', oeb.container.read(path)))
|
parser.setFetcher(self._fetch_css_file)
|
||||||
for elem in head:
|
for elem in head:
|
||||||
if elem.tag == XHTML('style') and elem.text \
|
if elem.tag == XHTML('style') and elem.text \
|
||||||
and elem.get('type', CSS_MIME) in OEB_STYLES:
|
and elem.get('type', CSS_MIME) in OEB_STYLES:
|
||||||
@ -138,8 +139,7 @@ class Stylizer(object):
|
|||||||
if path in self.STYLESHEETS:
|
if path in self.STYLESHEETS:
|
||||||
stylesheet = self.STYLESHEETS[path]
|
stylesheet = self.STYLESHEETS[path]
|
||||||
else:
|
else:
|
||||||
data = XHTML_CSS_NAMESPACE
|
data = self._fetch_css_file(path)[1]
|
||||||
data += oeb.manifest.hrefs[path].data
|
|
||||||
stylesheet = parser.parseString(data, href=path)
|
stylesheet = parser.parseString(data, href=path)
|
||||||
stylesheet.namespaces['h'] = XHTML_NS
|
stylesheet.namespaces['h'] = XHTML_NS
|
||||||
self.STYLESHEETS[path] = stylesheet
|
self.STYLESHEETS[path] = stylesheet
|
||||||
@ -167,6 +167,15 @@ class Stylizer(object):
|
|||||||
for elem in xpath(tree, '//h:*[@style]'):
|
for elem in xpath(tree, '//h:*[@style]'):
|
||||||
self.style(elem)._apply_style_attr()
|
self.style(elem)._apply_style_attr()
|
||||||
|
|
||||||
|
def _fetch_css_file(self, path):
    """Look up the stylesheet at `path` in the book manifest.

    Returns a 2-tuple whose first element is always None (presumably the
    encoding slot expected of a cssutils fetcher -- confirm) and whose
    second element is the manifest item's data, decoded via
    `self.oeb.decode` and prefixed with `XHTML_CSS_NAMESPACE`.  When `path`
    is not in the manifest's hrefs, `(None, None)` is returned.
    """
    manifest_hrefs = self.oeb.manifest.hrefs
    if path in manifest_hrefs:
        css_text = self.oeb.decode(manifest_hrefs[path].data)
        return (None, XHTML_CSS_NAMESPACE + css_text)
    return (None, None)
|
||||||
|
|
||||||
def flatten_rule(self, rule, href, index):
|
def flatten_rule(self, rule, href, index):
|
||||||
results = []
|
results = []
|
||||||
if isinstance(rule, CSSStyleRule):
|
if isinstance(rule, CSSStyleRule):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user