News download: Handle titles with ASCII control codes in them. Fixes #739322 (News fetching - NULL bytes problem)

This commit is contained in:
Kovid Goyal 2011-03-21 10:09:09 -06:00
parent de1e2369b3
commit 9385758a28
2 changed files with 9 additions and 5 deletions

View File

@ -8,15 +8,18 @@ import re, htmlentitydefs
_ascii_pat = None
def clean_ascii_chars(txt, charlist=None):
'''
Remove ASCII control chars: 0 to 8 and 11, 12, 14-31 by default
This is all control chars except \\t,\\n and \\r
r'''
Remove ASCII control chars (code points 0-31 and 127).
This is all control chars except \t, \n and \r
'''
if not txt:
return ''
global _ascii_pat
if _ascii_pat is None:
chars = list(range(8)) + [0x0B, 0x0C] + list(range(0x0E, 0x1F))
chars = set(xrange(32))
chars.add(127)
for x in (9, 10, 13):
chars.remove(x)
_ascii_pat = re.compile(u'|'.join(map(unichr, chars)))
if charlist is None:

View File

@ -28,6 +28,7 @@ class Article(object):
pass
if not isinstance(self._title, unicode):
self._title = self._title.decode('utf-8', 'replace')
self._title = clean_ascii_chars(self._title)
self.url = url
self.author = author
if author and not isinstance(author, unicode):
@ -75,7 +76,7 @@ class Article(object):
t = t.decode('utf-8', 'replace')
return t
def fset(self, val):
self._title = val
self._title = clean_ascii_chars(val)
return property(fget=fget, fset=fset)