Properly escape invalid characters in TOC labels when generating LRF files

2025-07-31 14:33:54 -04:00 · 2008-05-29 13:14:14 -07:00 · 2008-05-29 13:14:14 -07:00 · 8670321901
commit 8670321901
parent 96aa2c8b0b
3 changed files with 38 additions and 32 deletions
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -9,6 +9,7 @@ __appname__   = 'calibre'
 import sys, os, logging, mechanize, locale, copy, cStringIO, re, subprocess, \
       textwrap, atexit, cPickle, codecs, time
 from gettext import GNUTranslations
+from htmlentitydefs import name2codepoint
 from math import floor
 from optparse import OptionParser as _OptionParser
 from optparse import IndentedHelpFormatter
@@ -569,4 +570,35 @@ except Exception, err:
    pictureflowerror = str(err)

 if islinux:
-    os.chdir(cwd) 
+    os.chdir(cwd)
+    
+def entity_to_unicode(match, exceptions=[], encoding='cp1252'):
+    '''
+    @param match: A match object such that '&'+match.group(1)+';' is the entity.
+    @param exceptions: A list of entities to not convert (each entry is the name of the entity, e.g. 'apos' or '#1234').
+    @param encoding: The encoding used to decode numeric entities whose value is at most 255. A common encoding is cp1252.
+    If None, unichr() is used for those as well. Unknown named entities are returned unchanged as '&name;'.
+    '''
+    ent = match.group(1)
+    if ent in exceptions:
+        return '&'+ent+';'
+    if ent == 'apos':  # handled explicitly, before the name2codepoint lookup below
+        return "'"
+    if ent.startswith(u'#x'):  # hexadecimal numeric entity, e.g. '#x41'
+        num = int(ent[2:], 16)
+        if encoding is None or num > 255:
+            return unichr(num)
+        return chr(num).decode(encoding)  # NOTE(review): no UnicodeDecodeError fallback here, unlike the decimal branch
+    if ent.startswith(u'#'):  # decimal numeric entity, e.g. '#65'
+        num = int(ent[1:])
+        if encoding is None or num > 255:
+            return unichr(num)
+        try:
+            return chr(num).decode(encoding)
+        except UnicodeDecodeError:
+            return unichr(num)  # byte not defined in `encoding`: fall back to the raw code point
+    try:
+        return unichr(name2codepoint[ent])
+    except KeyError:
+        return '&'+ent+';'
+ 
--- a/src/calibre/ebooks/lrf/__init__.py
+++ b/src/calibre/ebooks/lrf/__init__.py
@@ -110,7 +110,7 @@ def option_parser(usage, gui_mode=False):
                      help=_('Render HTML tables as blocks of text instead of actual tables. This is neccessary if the HTML contains very large or complex tables.'))
    laf = parser.add_option_group('LOOK AND FEEL')
    laf.add_option('--base-font-size', action='store', type='float', default=10.,
-                   help=_('''Specify the base font size in pts. All fonts are rescaled accordingly. This option obsoletes the --font-delta option and takes precedence over it. To use --font-delta, set this to 0.'''))
+                   help=_('''Specify the base font size in pts. All fonts are rescaled accordingly. This option obsoletes the --font-delta option and takes precedence over it. To use --font-delta, set this to 0. Default: %defaultpt'''))
    laf.add_option('--enable-autorotation', action='store_true', default=False, 
                   help=_('Enable autorotation of images that are wider than the screen width.'), 
                   dest='autorotation')
@@ -324,32 +324,4 @@ def Book(options, logger, font_delta=0, header=None,
            raise ConversionError, 'Could not find the normal version of the ' + family + ' font'
    return book, fonts

-def entity_to_unicode(match, exceptions=[], encoding='cp1252'):
-    '''
-    @param match: A match object such that '&'+match.group(1)';' is the entity.
-    @param exceptions: A list of entities to not convert (Each entry is the name of the entity, for e.g. 'apos' or '#1234' 
-    @param encoding: The encoding to use to decode numeric entities between 128 and 256. 
-    If None, the Unicode UCS encoding is used. A common encoding is cp1252.
-    '''
-    ent = match.group(1)
-    if ent in exceptions:
-        return '&'+ent+';'
-    if ent == 'apos':
-        return "'"
-    if ent.startswith(u'#x'):
-        num = int(ent[2:], 16)
-        if encoding is None or num > 255:
-            return unichr(num)
-        return chr(num).decode(encoding)
-    if ent.startswith(u'#'):
-        num = int(ent[1:])
-        if encoding is None or num > 255:
-            return unichr(num)
-        try:
-            return chr(num).decode(encoding)
-        except UnicodeDecodeError:
-            return unichr(num)
-    try:
-        return unichr(name2codepoint[ent])
-    except KeyError:
-        return '&'+ent+';'
+from calibre import entity_to_unicode
--- a/src/calibre/ebooks/lrf/pylrs/pylrs.py
+++ b/src/calibre/ebooks/lrf/pylrs/pylrs.py
@@ -37,6 +37,7 @@
 #                           EmpLine, EmpDots

 import os, re, codecs, operator
+from xml.sax.saxutils import escape
 from datetime import date
 try:
    from elementtree.ElementTree import (Element, SubElement)
@@ -53,6 +54,7 @@ DEFAULT_SOURCE_ENCODING = "cp1252"      # defualt is us-windows character set
 DEFAULT_GENREADING      = "fs"          # default is yes to both lrf and lrs

 from calibre import __appname__, __version__
+from calibre import entity_to_unicode

 class LrsError(Exception):
    pass
@@ -786,7 +788,7 @@ class TableOfContents(object):

 class TocLabel(object):
    def __init__(self, label, textBlock):
-        self.label = label
+        self.label = escape(re.sub(r'&(\S+);', entity_to_unicode, label))
        self.textBlock = textBlock