From 9385758a28f6257fe7a4adb263fddfc7ed924888 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 21 Mar 2011 10:09:09 -0600
Subject: [PATCH] News download: Handle titles with ASCII control codes in them. Fixes #739322 (News fetching - NULL bytes problem)

---
 src/calibre/utils/cleantext.py    | 11 +++++++----
 src/calibre/web/feeds/__init__.py |  3 ++-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py
index 89101a6219..27e667612e 100644
--- a/src/calibre/utils/cleantext.py
+++ b/src/calibre/utils/cleantext.py
@@ -8,15 +8,18 @@ import re, htmlentitydefs
 _ascii_pat = None
 
 def clean_ascii_chars(txt, charlist=None):
-    '''
-    Remove ASCII control chars: 0 to 8 and 11, 12, 14-31 by default
-    This is all control chars except \\t,\\n and \\r
+    r'''
+    Remove ASCII control chars.
+    This is all control chars except \t, \n and \r
     '''
     if not txt:
         return ''
     global _ascii_pat
     if _ascii_pat is None:
-        chars = list(range(8)) + [0x0B, 0x0C] + list(range(0x0E, 0x1F))
+        chars = set(xrange(32))
+        chars.add(127)
+        for x in (9, 10, 13):
+            chars.remove(x)
         _ascii_pat = re.compile(u'|'.join(map(unichr, chars)))
 
     if charlist is None:
diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py
index cddb776b4c..a10fb03f91 100644
--- a/src/calibre/web/feeds/__init__.py
+++ b/src/calibre/web/feeds/__init__.py
@@ -28,6 +28,7 @@ class Article(object):
             pass
         if not isinstance(self._title, unicode):
             self._title = self._title.decode('utf-8', 'replace')
+        self._title = clean_ascii_chars(self._title)
         self.url = url
         self.author = author
         if author and not isinstance(author, unicode):
@@ -75,7 +76,7 @@ class Article(object):
                 t = t.decode('utf-8', 'replace')
             return t
         def fset(self, val):
-            self._title = val
+            self._title = clean_ascii_chars(val)
         return property(fget=fget, fset=fset)
 
 