News download: Handle titles with ASCII control codes in them. Fixes #739322 (News fetching - NULL bytes problem)

This commit is contained in:
Kovid Goyal 2011-03-21 10:09:09 -06:00
parent de1e2369b3
commit 9385758a28
2 changed files with 9 additions and 5 deletions

View File

@ -8,15 +8,18 @@ import re, htmlentitydefs
_ascii_pat = None _ascii_pat = None
def clean_ascii_chars(txt, charlist=None): def clean_ascii_chars(txt, charlist=None):
''' r'''
Remove ASCII control chars: 0 to 8 and 11, 12, 14-31 by default Remove ASCII control chars.
This is all control chars except \\t,\\n and \\r This is all control chars except \t, \n and \r
''' '''
if not txt: if not txt:
return '' return ''
global _ascii_pat global _ascii_pat
if _ascii_pat is None: if _ascii_pat is None:
chars = list(range(8)) + [0x0B, 0x0C] + list(range(0x0E, 0x1F)) chars = set(xrange(32))
chars.add(127)
for x in (9, 10, 13):
chars.remove(x)
_ascii_pat = re.compile(u'|'.join(map(unichr, chars))) _ascii_pat = re.compile(u'|'.join(map(unichr, chars)))
if charlist is None: if charlist is None:

View File

@ -28,6 +28,7 @@ class Article(object):
pass pass
if not isinstance(self._title, unicode): if not isinstance(self._title, unicode):
self._title = self._title.decode('utf-8', 'replace') self._title = self._title.decode('utf-8', 'replace')
self._title = clean_ascii_chars(self._title)
self.url = url self.url = url
self.author = author self.author = author
if author and not isinstance(author, unicode): if author and not isinstance(author, unicode):
@ -75,7 +76,7 @@ class Article(object):
t = t.decode('utf-8', 'replace') t = t.decode('utf-8', 'replace')
return t return t
def fset(self, val): def fset(self, val):
self._title = val self._title = clean_ascii_chars(val)
return property(fget=fget, fset=fset) return property(fget=fget, fset=fset)