Conversion pipeline: When decoding XML (but not XHTML), if no encoding is specified, assume UTF-8. Make entity conversion more robust. When splitting HTML, correctly handle ids that contain quotes.

This commit is contained in:
Kovid Goyal 2010-04-24 07:30:50 -06:00
parent 8daf5561a3
commit b2f590b6b3
5 changed files with 16 additions and 25 deletions

View File

@ -7,6 +7,7 @@ import sys, os, re, logging, time, mimetypes, \
__builtin__.__dict__['dynamic_property'] = lambda(func): func(None) __builtin__.__dict__['dynamic_property'] = lambda(func): func(None)
from htmlentitydefs import name2codepoint from htmlentitydefs import name2codepoint
from math import floor from math import floor
from functools import partial
warnings.simplefilter('ignore', DeprecationWarning) warnings.simplefilter('ignore', DeprecationWarning)
@ -446,6 +447,12 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
return '&'+ent+';' return '&'+ent+';'
_ent_pat = re.compile(r'&(\S+?);') _ent_pat = re.compile(r'&(\S+?);')
xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions = {
'"' : '&quot;',
"'" : '&apos;',
'<' : '&lt;',
'>' : '&gt;',
'&' : '&amp;'})
def prepare_string_for_xml(raw, attribute=False): def prepare_string_for_xml(raw, attribute=False):
raw = _ent_pat.sub(entity_to_unicode, raw) raw = _ent_pat.sub(entity_to_unicode, raw)

View File

@ -43,11 +43,8 @@ def strip_encoding_declarations(raw):
return raw return raw
def substitute_entites(raw): def substitute_entites(raw):
from calibre import entity_to_unicode from calibre import xml_entity_to_unicode
from functools import partial return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
f = partial(entity_to_unicode, exceptions=
['amp', 'apos', 'quot', 'lt', 'gt'])
return ENTITY_PATTERN.sub(f, raw)
_CHARSET_ALIASES = { "macintosh" : "mac-roman", _CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" } "x-sjis" : "shift-jis" }

View File

@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
Read data from .mobi files Read data from .mobi files
''' '''
import functools, shutil, os, re, struct, textwrap, cStringIO, sys import shutil, os, re, struct, textwrap, cStringIO, sys
try: try:
from PIL import Image as PILImage from PIL import Image as PILImage
@ -14,7 +14,7 @@ except ImportError:
from lxml import html, etree from lxml import html, etree
from calibre import entity_to_unicode, CurrentDir from calibre import xml_entity_to_unicode, CurrentDir, entity_to_unicode
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.utils.date import parse_date from calibre.utils.date import parse_date
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
@ -302,14 +302,7 @@ class MobiReader(object):
for pat in ENCODING_PATS: for pat in ENCODING_PATS:
self.processed_html = pat.sub('', self.processed_html) self.processed_html = pat.sub('', self.processed_html)
e2u = functools.partial(entity_to_unicode, self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
result_exceptions={
'<' : u'&lt;',
'>' : u'&gt;',
'&' : u'&amp;',
'"' : u'&quot;',
"'" : u'&apos;'})
self.processed_html = re.sub(r'&(\S+?);', e2u,
self.processed_html) self.processed_html)
self.extract_images(processed_records, output_dir) self.extract_images(processed_records, output_dir)
self.replace_page_breaks() self.replace_page_breaks()

View File

@ -771,18 +771,12 @@ class Manifest(object):
% (self.id, self.href, self.media_type) % (self.id, self.href, self.media_type)
def _parse_xml(self, data): def _parse_xml(self, data):
data = xml_to_unicode(data, strip_encoding_pats=True)[0] data = xml_to_unicode(data, strip_encoding_pats=True,
assume_utf8=True, resolve_entities=True)[0]
if not data: if not data:
return None return None
parser = etree.XMLParser(recover=True) parser = etree.XMLParser(recover=True)
try: return etree.fromstring(data, parser=parser)
return etree.fromstring(data, parser=parser)
except etree.XMLSyntaxError, err:
if getattr(err, 'code', 0) == 26 or str(err).startswith('Entity'):
data = xml_to_unicode(data, strip_encoding_pats=True,
resolve_entities=True)[0]
return etree.fromstring(data)
raise
def _parse_xhtml(self, data): def _parse_xhtml(self, data):
self.oeb.log.debug('Parsing', self.href, '...') self.oeb.log.debug('Parsing', self.href, '...')

View File

@ -115,7 +115,7 @@ class Split(object):
for i, x in enumerate(page_breaks): for i, x in enumerate(page_breaks):
x.set('id', x.get('id', 'calibre_pb_%d'%i)) x.set('id', x.get('id', 'calibre_pb_%d'%i))
id = x.get('id') id = x.get('id')
page_breaks_.append((XPath('//*[@id="%s"]'%id), page_breaks_.append((XPath('//*[@id=%r]'%id),
x.get('pb_before', False))) x.get('pb_before', False)))
page_break_ids.append(id) page_break_ids.append(id)