Refactor parts of OEBBook to sanely handle much more broken OPF.

This commit is contained in:
Marshall T. Vandegrift 2009-01-26 23:08:04 -05:00
parent a3ad3a07dd
commit 076ec9bbad
2 changed files with 337 additions and 54 deletions

View File

@ -10,7 +10,7 @@ import os
import sys import sys
from collections import defaultdict from collections import defaultdict
from types import StringTypes from types import StringTypes
from itertools import izip, count from itertools import izip, count, chain
from urlparse import urldefrag, urlparse, urlunparse from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote from urllib import unquote as urlunquote
import logging import logging
@ -22,6 +22,7 @@ from lxml import html
from calibre import LoggingInterface from calibre import LoggingInterface
from calibre.translations.dynamic import translate from calibre.translations.dynamic import translate
from calibre.startup import get_lang from calibre.startup import get_lang
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
XML_NS = 'http://www.w3.org/XML/1998/namespace' XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml' XHTML_NS = 'http://www.w3.org/1999/xhtml'
@ -40,6 +41,7 @@ XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS,
'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,
'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS, 'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS,
'svg': SVG_NS, 'xl': XLINK_NS} 'svg': SVG_NS, 'xl': XLINK_NS}
DC_PREFIXES = ('d11', 'd10', 'd09')
def XML(name): return '{%s}%s' % (XML_NS, name) def XML(name): return '{%s}%s' % (XML_NS, name)
def XHTML(name): return '{%s}%s' % (XHTML_NS, name) def XHTML(name): return '{%s}%s' % (XHTML_NS, name)
@ -61,6 +63,7 @@ GIF_MIME = 'image/gif'
JPEG_MIME = 'image/jpeg' JPEG_MIME = 'image/jpeg'
PNG_MIME = 'image/png' PNG_MIME = 'image/png'
SVG_MIME = 'image/svg+xml' SVG_MIME = 'image/svg+xml'
BINARY_MIME = 'application/octet-stream'
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document']) OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
@ -69,6 +72,8 @@ OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME])
MS_COVER_TYPE = 'other.ms-coverimage-standard' MS_COVER_TYPE = 'other.ms-coverimage-standard'
# Matches a named entity reference such as ``&nbsp;``; group 1 is the bare
# entity name (a name-start character followed by one or more name
# characters).  The hyphen is deliberately the LAST character of the class so
# that it is a literal '-': written as ``.-_`` it formed the range '.'..'_',
# which admitted ';', '<', '=', '>' etc. (letting a single match swallow
# text such as ``&nbsp;b;``) while rejecting '-' itself.
ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9._:-]+);')
# Matches a run of one or more whitespace characters (space, tab, CR, LF,
# vertical tab).
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def element(parent, *args, **kwargs): def element(parent, *args, **kwargs):
if parent is not None: if parent is not None:
@ -191,11 +196,8 @@ class Metadata(object):
def __init__(self, term, value, fq_attrib={}, **kwargs): def __init__(self, term, value, fq_attrib={}, **kwargs):
self.fq_attrib = fq_attrib = dict(fq_attrib) self.fq_attrib = fq_attrib = dict(fq_attrib)
fq_attrib.update(kwargs) fq_attrib.update(kwargs)
if term == OPF('meta') and not value: if barename(term).lower() in Metadata.TERMS and \
term = self.fq_attrib.pop('name') (not namespace(term) or namespace(term) in DC_NSES):
value = self.fq_attrib.pop('content')
elif barename(term).lower() in Metadata.TERMS and \
(not namespace(term) or namespace(term) in DC_NSES):
# Anything looking like Dublin Core is coerced # Anything looking like Dublin Core is coerced
term = DC(barename(term).lower()) term = DC(barename(term).lower())
elif namespace(term) == OPF2_NS: elif namespace(term) == OPF2_NS:
@ -329,8 +331,11 @@ class Manifest(object):
% (self.id, self.href, self.media_type) % (self.id, self.href, self.media_type)
def _force_xhtml(self, data): def _force_xhtml(self, data):
# Possibly decode in user-specified encoding
if self.oeb.encoding is not None: if self.oeb.encoding is not None:
data = data.decode(self.oeb.encoding, 'replace') data = data.decode(self.oeb.encoding, 'replace')
# Force to UNIX line endings
data = data.replace('\r\n', '\n').replace('\r', '\n')
# Handle broken XHTML w/ SVG (ugh) # Handle broken XHTML w/ SVG (ugh)
if 'svg:' in data and SVG_NS not in data: if 'svg:' in data and SVG_NS not in data:
data = data.replace( data = data.replace(
@ -338,18 +343,29 @@ class Manifest(object):
if 'xlink:' in data and XLINK_NS not in data: if 'xlink:' in data and XLINK_NS not in data:
data = data.replace( data = data.replace(
'<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1) '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
# Try with more & more drastic measures to parse
try: try:
data = etree.fromstring(data) data = etree.fromstring(data)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
data = html.fromstring(data) repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
data = etree.tostring(data, encoding=unicode) data = ENTITY_RE.sub(repl, data)
data = etree.fromstring(data) try:
data = etree.fromstring(data)
except etree.XMLSyntaxError:
self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
data = html.fromstring(data)
data.attrib.pop('xmlns', None)
data = etree.tostring(data, encoding=unicode)
data = etree.fromstring(data)
# Force into the XHTML namespace
if namespace(data.tag) != XHTML_NS: if namespace(data.tag) != XHTML_NS:
data.attrib['xmlns'] = XHTML_NS data.attrib['xmlns'] = XHTML_NS
data = etree.tostring(data, encoding=unicode) data = etree.tostring(data, encoding=unicode)
data = etree.fromstring(data) data = etree.fromstring(data)
# Remove any encoding-specifying <meta/> elements
for meta in self.META_XP(data): for meta in self.META_XP(data):
meta.getparent().remove(meta) meta.getparent().remove(meta)
# Ensure has a <head/>
head = xpath(data, '/h:html/h:head') head = xpath(data, '/h:html/h:head')
head = head[0] if head else None head = head[0] if head else None
if head is None: if head is None:
@ -364,6 +380,7 @@ class Manifest(object):
'File %r missing <title/> element' % self.href) 'File %r missing <title/> element' % self.href)
title = etree.SubElement(head, XHTML('title')) title = etree.SubElement(head, XHTML('title'))
title.text = self.oeb.translate(__('Unknown')) title.text = self.oeb.translate(__('Unknown'))
# Ensure has a <body/>
if not xpath(data, '/h:html/h:body'): if not xpath(data, '/h:html/h:body'):
self.oeb.logger.warn( self.oeb.logger.warn(
'File %r missing <body/> element' % self.href) 'File %r missing <body/> element' % self.href)
@ -494,9 +511,9 @@ class Manifest(object):
elem = element(parent, 'manifest') elem = element(parent, 'manifest')
for item in self.ids.values(): for item in self.ids.values():
media_type = item.media_type media_type = item.media_type
if media_type == XHTML_MIME: if media_type in OEB_DOCS:
media_type = OEB_DOC_MIME media_type = OEB_DOC_MIME
elif media_type == CSS_MIME: elif media_type in OEB_STYLES:
media_type = OEB_CSS_MIME media_type = OEB_CSS_MIME
attrib = {'id': item.id, 'href': item.href, attrib = {'id': item.id, 'href': item.href,
'media-type': media_type} 'media-type': media_type}
@ -508,6 +525,11 @@ class Manifest(object):
def to_opf2(self, parent=None): def to_opf2(self, parent=None):
elem = element(parent, OPF('manifest')) elem = element(parent, OPF('manifest'))
for item in self.ids.values(): for item in self.ids.values():
media_type = item.media_type
if media_type in OEB_DOCS:
media_type = XHTML_MIME
elif media_type in OEB_STYLES:
media_type = CSS_MIME
attrib = {'id': item.id, 'href': item.href, attrib = {'id': item.id, 'href': item.href,
'media-type': item.media_type} 'media-type': item.media_type}
if item.fallback: if item.fallback:
@ -771,25 +793,19 @@ class OEBBook(object):
opf = self._read_opf(opfpath) opf = self._read_opf(opfpath)
self._all_from_opf(opf) self._all_from_opf(opf)
def _convert_opf1(self, opf): def _clean_opf(self, opf):
# Seriously, seriously wrong for elem in opf.iter():
if namespace(opf.tag) == OPF1_NS: if isinstance(elem.tag, basestring) \
opf.tag = barename(opf.tag) and namespace(elem.tag) in ('', OPF1_NS):
for elem in opf.iterdescendants(): elem.tag = OPF(barename(elem.tag))
if isinstance(elem.tag, basestring) \
and namespace(elem.tag) == OPF1_NS:
elem.tag = barename(elem.tag)
attrib = dict(opf.attrib) attrib = dict(opf.attrib)
attrib['version'] = '2.0'
nroot = etree.Element(OPF('package'), nroot = etree.Element(OPF('package'),
nsmap={None: OPF2_NS}, attrib=attrib) nsmap={None: OPF2_NS}, attrib=attrib)
metadata = etree.SubElement(nroot, OPF('metadata'), metadata = etree.SubElement(nroot, OPF('metadata'),
nsmap={'opf': OPF2_NS, 'dc': DC11_NS, nsmap={'opf': OPF2_NS, 'dc': DC11_NS,
'xsi': XSI_NS, 'dcterms': DCTERMS_NS}) 'xsi': XSI_NS, 'dcterms': DCTERMS_NS})
for prefix in ('d11', 'd10', 'd09'): dc = lambda prefix: xpath(opf, 'o2:metadata//%s:*' % prefix)
elements = xpath(opf, 'metadata//%s:*' % prefix) for element in chain(*(dc(prefix) for prefix in DC_PREFIXES)):
if elements: break
for element in elements:
if not element.text: continue if not element.text: continue
tag = barename(element.tag).lower() tag = barename(element.tag).lower()
element.tag = '{%s}%s' % (DC11_NS, tag) element.tag = '{%s}%s' % (DC11_NS, tag)
@ -799,28 +815,27 @@ class OEBBook(object):
element.attrib[nsname] = element.attrib[name] element.attrib[nsname] = element.attrib[name]
del element.attrib[name] del element.attrib[name]
metadata.append(element) metadata.append(element)
for element in opf.xpath('metadata//meta'): for element in xpath(opf, 'o2:metadata//o2:meta'):
metadata.append(element) metadata.append(element)
for item in opf.xpath('manifest/item'): for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
media_type = item.attrib['media-type'].lower() for element in xpath(opf, tag):
if media_type in OEB_DOCS:
media_type = XHTML_MIME
elif media_type in OEB_STYLES:
media_type = CSS_MIME
item.attrib['media-type'] = media_type
for tag in ('manifest', 'spine', 'tours', 'guide'):
for element in opf.xpath(tag):
nroot.append(element) nroot.append(element)
return etree.fromstring(etree.tostring(nroot)) return nroot
def _read_opf(self, opfpath): def _read_opf(self, opfpath):
opf = self.container.read_xml(opfpath) opf = self.container.read(opfpath)
version = float(opf.get('version', 1.0)) opf = opf.replace('\r\n', '\n').replace('\r', '\n')
try:
opf = etree.fromstring(opf)
except etree.XMLSyntaxError:
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
opf = ENTITY_RE.sub(repl, opf)
opf = etree.fromstring(opf)
self.logger.warn('OPF contains invalid HTML named entities')
ns = namespace(opf.tag) ns = namespace(opf.tag)
if ns not in ('', OPF1_NS, OPF2_NS): if ns not in ('', OPF1_NS, OPF2_NS):
raise OEBError('Invalid namespace %r for OPF document' % ns) raise OEBError('Invalid namespace %r for OPF document' % ns)
if ns != OPF2_NS or version < 2.0: opf = self._clean_opf(opf)
opf = self._convert_opf1(opf)
return opf return opf
def _metadata_from_opf(self, opf): def _metadata_from_opf(self, opf):
@ -829,8 +844,16 @@ class OEBBook(object):
self.metadata = metadata = Metadata(self) self.metadata = metadata = Metadata(self)
ignored = (OPF('dc-metadata'), OPF('x-metadata')) ignored = (OPF('dc-metadata'), OPF('x-metadata'))
for elem in xpath(opf, '/o2:package/o2:metadata//*'): for elem in xpath(opf, '/o2:package/o2:metadata//*'):
if elem.tag not in ignored and (elem.text or elem.attrib): if elem.tag in ignored: continue
metadata.add(elem.tag, elem.text, elem.attrib) term = elem.tag
value = elem.text
if term == OPF('meta'):
term = elem.attrib.pop('name', None)
value = elem.attrib.pop('content', None)
if value:
value = COLLAPSE_RE.sub(' ', value.strip())
if term and (value or elem.attrib):
metadata.add(term, value, elem.attrib)
haveuuid = haveid = False haveuuid = haveid = False
for ident in metadata.identifier: for ident in metadata.identifier:
if unicode(ident).startswith('urn:uuid:'): if unicode(ident).startswith('urn:uuid:'):
@ -845,36 +868,38 @@ class OEBBook(object):
self.uid = item self.uid = item
break break
else: else:
self.logger.warn(u'Unique-identifier %r not found.' % uid) self.logger.warn(u'Unique-identifier %r not found' % uid)
for ident in metadata.identifier: for ident in metadata.identifier:
if 'id' in ident.attrib: if 'id' in ident.attrib:
self.uid = metadata.identifier[0] self.uid = metadata.identifier[0]
break break
if not metadata.language: if not metadata.language:
self.logger.warn(u'Language not specified.') self.logger.warn(u'Language not specified')
metadata.add('language', get_lang()) metadata.add('language', get_lang())
if not metadata.creator: if not metadata.creator:
self.logger.warn(u'Creator not specified.') self.logger.warn('Creator not specified')
metadata.add('creator', _('Unknown')) metadata.add('creator', self.translate(__('Unknown')))
if not metadata.title: if not metadata.title:
self.logger.warn(u'Title not specified.') self.logger.warn('Title not specified')
metadata.add('title', _('Unknown')) metadata.add('title', self.translate(__('Unknown')))
def _manifest_from_opf(self, opf): def _manifest_from_opf(self, opf):
self.manifest = manifest = Manifest(self) self.manifest = manifest = Manifest(self)
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
id = elem.get('id') id = elem.get('id')
href = elem.get('href') href = elem.get('href')
media_type = elem.get('media-type') media_type = elem.get('media-type', None)
if media_type is None:
media_type = elem.get('mediatype', BINARY_MIME)
fallback = elem.get('fallback') fallback = elem.get('fallback')
if href in manifest.hrefs: if href in manifest.hrefs:
self.logger.warn(u'Duplicate manifest entry for %r.' % href) self.logger.warn(u'Duplicate manifest entry for %r' % href)
continue continue
if not self.container.exists(href): if not self.container.exists(href):
self.logger.warn(u'Manifest item %r not found.' % href) self.logger.warn(u'Manifest item %r not found' % href)
continue continue
if id in manifest.ids: if id in manifest.ids:
self.logger.warn(u'Duplicate manifest id %r.' % id) self.logger.warn(u'Duplicate manifest id %r' % id)
id, href = manifest.generate(id, href) id, href = manifest.generate(id, href)
manifest.add(id, href, media_type, fallback) manifest.add(id, href, media_type, fallback)
@ -883,7 +908,7 @@ class OEBBook(object):
for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
idref = elem.get('idref') idref = elem.get('idref')
if idref not in self.manifest: if idref not in self.manifest:
self.logger.warn(u'Spine item %r not found.' % idref) self.logger.warn(u'Spine item %r not found' % idref)
continue continue
item = self.manifest[idref] item = self.manifest[idref]
spine.add(item, elem.get('linear')) spine.add(item, elem.get('linear'))
@ -931,7 +956,8 @@ class OEBBook(object):
item = self.manifest.ids[id] item = self.manifest.ids[id]
ncx = item.data ncx = item.data
self.manifest.remove(item) self.manifest.remove(item)
title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')[0] title = xpath(ncx, 'ncx:docTitle/ncx:text/text()')
title = title[0].strip() if title else unicode(self.metadata.title)
self.toc = toc = TOC(title) self.toc = toc = TOC(title)
navmaps = xpath(ncx, 'ncx:navMap') navmaps = xpath(ncx, 'ncx:navMap')
for navmap in navmaps: for navmap in navmaps:
@ -988,7 +1014,8 @@ class OEBBook(object):
if not item.linear: continue if not item.linear: continue
html = item.data html = item.data
title = xpath(html, '/h:html/h:head/h:title/text()') title = xpath(html, '/h:html/h:head/h:title/text()')
if title: titles.append(title[0]) title = title[0].strip() if title else None
if title: titles.append(title)
headers.append('(unlabled)') headers.append('(unlabled)')
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
expr = '/h:html/h:body//h:%s[position()=1]/text()' % (tag,) expr = '/h:html/h:body//h:%s[position()=1]/text()' % (tag,)

View File

@ -0,0 +1,256 @@
"""
Replacement for htmlentitydefs which uses purely numeric entities.
"""
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
# Entity name -> Unicode code point.  The table carries no entries for
# 'amp', 'lt', 'gt' or 'quot'.
_NAME2CODEPOINT = {
    'AElig': 198, 'Aacute': 193, 'Acirc': 194, 'Agrave': 192,
    'Alpha': 913, 'Aring': 197, 'Atilde': 195, 'Auml': 196,
    'Beta': 914, 'Ccedil': 199, 'Chi': 935, 'Dagger': 8225,
    'Delta': 916, 'ETH': 208, 'Eacute': 201, 'Ecirc': 202,
    'Egrave': 200, 'Epsilon': 917, 'Eta': 919, 'Euml': 203,
    'Gamma': 915, 'Iacute': 205, 'Icirc': 206, 'Igrave': 204,
    'Iota': 921, 'Iuml': 207, 'Kappa': 922, 'Lambda': 923,
    'Mu': 924, 'Ntilde': 209, 'Nu': 925, 'OElig': 338,
    'Oacute': 211, 'Ocirc': 212, 'Ograve': 210, 'Omega': 937,
    'Omicron': 927, 'Oslash': 216, 'Otilde': 213, 'Ouml': 214,
    'Phi': 934, 'Pi': 928, 'Prime': 8243, 'Psi': 936,
    'Rho': 929, 'Scaron': 352, 'Sigma': 931, 'THORN': 222,
    'Tau': 932, 'Theta': 920, 'Uacute': 218, 'Ucirc': 219,
    'Ugrave': 217, 'Upsilon': 933, 'Uuml': 220, 'Xi': 926,
    'Yacute': 221, 'Yuml': 376, 'Zeta': 918,
    'aacute': 225, 'acirc': 226, 'acute': 180, 'aelig': 230,
    'agrave': 224, 'alefsym': 8501, 'alpha': 945, 'and': 8743,
    'ang': 8736, 'aring': 229, 'asymp': 8776, 'atilde': 227,
    'auml': 228,
    'bdquo': 8222, 'beta': 946, 'brvbar': 166, 'bull': 8226,
    'cap': 8745, 'ccedil': 231, 'cedil': 184, 'cent': 162,
    'chi': 967, 'circ': 710, 'clubs': 9827, 'cong': 8773,
    'copy': 169, 'crarr': 8629, 'cup': 8746, 'curren': 164,
    'dArr': 8659, 'dagger': 8224, 'darr': 8595, 'deg': 176,
    'delta': 948, 'diams': 9830, 'divide': 247,
    'eacute': 233, 'ecirc': 234, 'egrave': 232, 'empty': 8709,
    'emsp': 8195, 'ensp': 8194, 'epsilon': 949, 'equiv': 8801,
    'eta': 951, 'eth': 240, 'euml': 235, 'euro': 8364,
    'exist': 8707,
    'fnof': 402, 'forall': 8704, 'frac12': 189, 'frac14': 188,
    'frac34': 190, 'frasl': 8260,
    'gamma': 947, 'ge': 8805,
    'hArr': 8660, 'harr': 8596, 'hearts': 9829, 'hellip': 8230,
    'iacute': 237, 'icirc': 238, 'iexcl': 161, 'igrave': 236,
    'image': 8465, 'infin': 8734, 'int': 8747, 'iota': 953,
    'iquest': 191, 'isin': 8712, 'iuml': 239,
    'kappa': 954,
    'lArr': 8656, 'lambda': 955, 'lang': 9001, 'laquo': 171,
    'larr': 8592, 'lceil': 8968, 'ldquo': 8220, 'le': 8804,
    'lfloor': 8970, 'lowast': 8727, 'loz': 9674, 'lrm': 8206,
    'lsaquo': 8249, 'lsquo': 8216,
    'macr': 175, 'mdash': 8212, 'micro': 181, 'middot': 183,
    'minus': 8722, 'mu': 956,
    'nabla': 8711, 'nbsp': 160, 'ndash': 8211, 'ne': 8800,
    'ni': 8715, 'not': 172, 'notin': 8713, 'nsub': 8836,
    'ntilde': 241, 'nu': 957,
    'oacute': 243, 'ocirc': 244, 'oelig': 339, 'ograve': 242,
    'oline': 8254, 'omega': 969, 'omicron': 959, 'oplus': 8853,
    'or': 8744, 'ordf': 170, 'ordm': 186, 'oslash': 248,
    'otilde': 245, 'otimes': 8855, 'ouml': 246,
    'para': 182, 'part': 8706, 'permil': 8240, 'perp': 8869,
    'phi': 966, 'pi': 960, 'piv': 982, 'plusmn': 177,
    'pound': 163, 'prime': 8242, 'prod': 8719, 'prop': 8733,
    'psi': 968,
    'rArr': 8658, 'radic': 8730, 'rang': 9002, 'raquo': 187,
    'rarr': 8594, 'rceil': 8969, 'rdquo': 8221, 'real': 8476,
    'reg': 174, 'rfloor': 8971, 'rho': 961, 'rlm': 8207,
    'rsaquo': 8250, 'rsquo': 8217,
    'sbquo': 8218, 'scaron': 353, 'sdot': 8901, 'sect': 167,
    'shy': 173, 'sigma': 963, 'sigmaf': 962, 'sim': 8764,
    'spades': 9824, 'sub': 8834, 'sube': 8838, 'sum': 8721,
    'sup': 8835, 'sup1': 185, 'sup2': 178, 'sup3': 179,
    'supe': 8839, 'szlig': 223,
    'tau': 964, 'there4': 8756, 'theta': 952, 'thetasym': 977,
    'thinsp': 8201, 'thorn': 254, 'tilde': 732, 'times': 215,
    'trade': 8482,
    'uArr': 8657, 'uacute': 250, 'uarr': 8593, 'ucirc': 251,
    'ugrave': 249, 'uml': 168, 'upsih': 978, 'upsilon': 965,
    'uuml': 252,
    'weierp': 8472,
    'xi': 958,
    'yacute': 253, 'yen': 165, 'yuml': 255,
    'zeta': 950, 'zwj': 8205, 'zwnj': 8204,
}

# Public table: entity name -> decimal numeric character reference string
# (for example 'nbsp' -> '&#160;').
ENTITYDEFS = dict(
    (name, '&#%d;' % codepoint)
    for name, codepoint in _NAME2CODEPOINT.items())

# The intermediate table is an implementation detail; drop it so the module
# exposes only ENTITYDEFS.
del _NAME2CODEPOINT