News download: Handle titles with ASCII control codes in them. Fixes #739322 (News fetching - NULL bytes problem)

2025-12-02 11:15:04 -05:00 · 2011-03-21 10:09:09 -06:00 · 2011-03-21 10:09:09 -06:00 · 9385758a28
commit 9385758a28
parent de1e2369b3
2 changed files with 9 additions and 5 deletions
--- a/src/calibre/utils/cleantext.py
+++ b/src/calibre/utils/cleantext.py
@@ -8,15 +8,18 @@ import re, htmlentitydefs
 _ascii_pat = None

 def clean_ascii_chars(txt, charlist=None):
-    '''
-    Remove ASCII control chars: 0 to 8 and 11, 12, 14-31 by default
-    This is all control chars except \\t,\\n and \\r
+    r'''
+    Remove ASCII control chars.
+    This is all control chars except \t, \n and \r
    '''
    if not txt:
        return ''
    global _ascii_pat
    if _ascii_pat is None:
-        chars = list(range(8)) + [0x0B, 0x0C] + list(range(0x0E, 0x1F))
+        chars = set(xrange(32))
+        chars.add(127)
+        for x in (9, 10, 13):
+            chars.remove(x)
        _ascii_pat = re.compile(u'|'.join(map(unichr, chars)))

    if charlist is None:
--- a/src/calibre/web/feeds/__init__.py
+++ b/src/calibre/web/feeds/__init__.py
@@ -28,6 +28,7 @@ class Article(object):
            pass
        if not isinstance(self._title, unicode):
            self._title = self._title.decode('utf-8', 'replace')
+        self._title = clean_ascii_chars(self._title)
        self.url = url
        self.author = author
        if author and not isinstance(author, unicode):
@@ -75,7 +76,7 @@ class Article(object):
                t = t.decode('utf-8', 'replace')
            return t
        def fset(self, val):
-            self._title = val
+            self._title = clean_ascii_chars(val)
        return property(fget=fget, fset=fset)