mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
News download: Handle titles with ASCII control codes in them. Fixes #739322 (News fetching - NULL bytes problem)
This commit is contained in:
parent
de1e2369b3
commit
9385758a28
@ -8,15 +8,18 @@ import re, htmlentitydefs
|
|||||||
_ascii_pat = None
|
_ascii_pat = None
|
||||||
|
|
||||||
def clean_ascii_chars(txt, charlist=None):
|
def clean_ascii_chars(txt, charlist=None):
|
||||||
'''
|
r'''
|
||||||
Remove ASCII control chars: 0 to 8 and 11, 12, 14-31 by default
|
Remove ASCII control chars.
|
||||||
This is all control chars except \\t,\\n and \\r
|
This is all control chars except \t, \n and \r
|
||||||
'''
|
'''
|
||||||
if not txt:
|
if not txt:
|
||||||
return ''
|
return ''
|
||||||
global _ascii_pat
|
global _ascii_pat
|
||||||
if _ascii_pat is None:
|
if _ascii_pat is None:
|
||||||
chars = list(range(8)) + [0x0B, 0x0C] + list(range(0x0E, 0x1F))
|
chars = set(xrange(32))
|
||||||
|
chars.add(127)
|
||||||
|
for x in (9, 10, 13):
|
||||||
|
chars.remove(x)
|
||||||
_ascii_pat = re.compile(u'|'.join(map(unichr, chars)))
|
_ascii_pat = re.compile(u'|'.join(map(unichr, chars)))
|
||||||
|
|
||||||
if charlist is None:
|
if charlist is None:
|
||||||
|
@ -28,6 +28,7 @@ class Article(object):
|
|||||||
pass
|
pass
|
||||||
if not isinstance(self._title, unicode):
|
if not isinstance(self._title, unicode):
|
||||||
self._title = self._title.decode('utf-8', 'replace')
|
self._title = self._title.decode('utf-8', 'replace')
|
||||||
|
self._title = clean_ascii_chars(self._title)
|
||||||
self.url = url
|
self.url = url
|
||||||
self.author = author
|
self.author = author
|
||||||
if author and not isinstance(author, unicode):
|
if author and not isinstance(author, unicode):
|
||||||
@ -75,7 +76,7 @@ class Article(object):
|
|||||||
t = t.decode('utf-8', 'replace')
|
t = t.decode('utf-8', 'replace')
|
||||||
return t
|
return t
|
||||||
def fset(self, val):
|
def fset(self, val):
|
||||||
self._title = val
|
self._title = clean_ascii_chars(val)
|
||||||
return property(fget=fget, fset=fset)
|
return property(fget=fget, fset=fset)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user