Properly escape invalid characters in TOC labels when generating LRF files

This commit is contained in:
Kovid Goyal 2008-05-29 13:14:14 -07:00
parent 96aa2c8b0b
commit 8670321901
3 changed files with 38 additions and 32 deletions

View File

@ -9,6 +9,7 @@ __appname__ = 'calibre'
import sys, os, logging, mechanize, locale, copy, cStringIO, re, subprocess, \ import sys, os, logging, mechanize, locale, copy, cStringIO, re, subprocess, \
textwrap, atexit, cPickle, codecs, time textwrap, atexit, cPickle, codecs, time
from gettext import GNUTranslations from gettext import GNUTranslations
from htmlentitydefs import name2codepoint
from math import floor from math import floor
from optparse import OptionParser as _OptionParser from optparse import OptionParser as _OptionParser
from optparse import IndentedHelpFormatter from optparse import IndentedHelpFormatter
@ -569,4 +570,35 @@ except Exception, err:
pictureflowerror = str(err) pictureflowerror = str(err)
if islinux: if islinux:
os.chdir(cwd) os.chdir(cwd)
def entity_to_unicode(match, exceptions=(), encoding='cp1252'):
    '''
    Convert one matched HTML/XML entity into the character it denotes.

    Designed to be used as the replacement callback of C{re.sub}.

    @param match: A match object such that '&'+match.group(1)+';' is the entity.
    @param exceptions: A collection of entities to not convert (each entry is the
                       name of the entity, e.g. 'apos' or '#1234').
    @param encoding: The encoding used to decode numeric entities whose value is
                     at most 255. If None, the number is always treated as a
                     Unicode code point. A common encoding is cp1252.
    @return: The decoded character. The original entity text ('&...;') is
             returned unchanged when the entity is listed in C{exceptions}, is
             an unknown name, or is a malformed or out-of-range numeric reference.
    '''
    import struct

    ent = match.group(1)
    unchanged = '&' + ent + ';'
    if ent in exceptions:
        return unchanged
    if ent == 'apos':
        # Handled explicitly instead of going through name2codepoint.
        return "'"

    if ent.startswith('#'):
        # Numeric reference: '#x...' / '#X...' is hexadecimal, otherwise decimal.
        try:
            if ent[1:2] in ('x', 'X'):
                num = int(ent[2:], 16)
            else:
                num = int(ent[1:])
        except ValueError:
            # Not a number at all (e.g. '&#xZZ;'); leave the text as it was.
            return unchanged

        if encoding is not None and 0 <= num <= 255:
            # Interpret the value as a single byte in the legacy encoding.
            # struct.pack gives a one-byte string that has .decode().
            try:
                return struct.pack('B', num).decode(encoding)
            except UnicodeDecodeError:
                # Byte is undefined in this encoding; fall back to the code
                # point. Both the hex and the decimal form now do this.
                pass
        try:
            return unichr(num)
        except (ValueError, OverflowError):
            # Negative or beyond the range unichr() supports.
            return unchanged

    try:
        return unichr(name2codepoint[ent])
    except KeyError:
        return unchanged

View File

@ -110,7 +110,7 @@ def option_parser(usage, gui_mode=False):
help=_('Render HTML tables as blocks of text instead of actual tables. This is neccessary if the HTML contains very large or complex tables.')) help=_('Render HTML tables as blocks of text instead of actual tables. This is neccessary if the HTML contains very large or complex tables.'))
laf = parser.add_option_group('LOOK AND FEEL') laf = parser.add_option_group('LOOK AND FEEL')
laf.add_option('--base-font-size', action='store', type='float', default=10., laf.add_option('--base-font-size', action='store', type='float', default=10.,
help=_('''Specify the base font size in pts. All fonts are rescaled accordingly. This option obsoletes the --font-delta option and takes precedence over it. To use --font-delta, set this to 0.''')) help=_('''Specify the base font size in pts. All fonts are rescaled accordingly. This option obsoletes the --font-delta option and takes precedence over it. To use --font-delta, set this to 0. Default: %defaultpt'''))
laf.add_option('--enable-autorotation', action='store_true', default=False, laf.add_option('--enable-autorotation', action='store_true', default=False,
help=_('Enable autorotation of images that are wider than the screen width.'), help=_('Enable autorotation of images that are wider than the screen width.'),
dest='autorotation') dest='autorotation')
@ -324,32 +324,4 @@ def Book(options, logger, font_delta=0, header=None,
raise ConversionError, 'Could not find the normal version of the ' + family + ' font' raise ConversionError, 'Could not find the normal version of the ' + family + ' font'
return book, fonts return book, fonts
def entity_to_unicode(match, exceptions=[], encoding='cp1252'): from calibre import entity_to_unicode
'''
@param match: A match object such that '&'+match.group(1)';' is the entity.
@param exceptions: A list of entities to not convert (Each entry is the name of the entity, for e.g. 'apos' or '#1234'
@param encoding: The encoding to use to decode numeric entities between 128 and 256.
If None, the Unicode UCS encoding is used. A common encoding is cp1252.
'''
ent = match.group(1)
if ent in exceptions:
return '&'+ent+';'
if ent == 'apos':
return "'"
if ent.startswith(u'#x'):
num = int(ent[2:], 16)
if encoding is None or num > 255:
return unichr(num)
return chr(num).decode(encoding)
if ent.startswith(u'#'):
num = int(ent[1:])
if encoding is None or num > 255:
return unichr(num)
try:
return chr(num).decode(encoding)
except UnicodeDecodeError:
return unichr(num)
try:
return unichr(name2codepoint[ent])
except KeyError:
return '&'+ent+';'

View File

@ -37,6 +37,7 @@
# EmpLine, EmpDots # EmpLine, EmpDots
import os, re, codecs, operator import os, re, codecs, operator
from xml.sax.saxutils import escape
from datetime import date from datetime import date
try: try:
from elementtree.ElementTree import (Element, SubElement) from elementtree.ElementTree import (Element, SubElement)
@ -53,6 +54,7 @@ DEFAULT_SOURCE_ENCODING = "cp1252" # defualt is us-windows character set
DEFAULT_GENREADING = "fs" # default is yes to both lrf and lrs DEFAULT_GENREADING = "fs" # default is yes to both lrf and lrs
from calibre import __appname__, __version__ from calibre import __appname__, __version__
from calibre import entity_to_unicode
class LrsError(Exception): class LrsError(Exception):
pass pass
@ -786,7 +788,7 @@ class TableOfContents(object):
class TocLabel(object): class TocLabel(object):
def __init__(self, label, textBlock): def __init__(self, label, textBlock):
self.label = label self.label = escape(re.sub(r'&(\S+);', entity_to_unicode, label))
self.textBlock = textBlock self.textBlock = textBlock