News download: Remove all invalid ASCII control characters from article descriptions as they cause XML parsing to fail

This commit is contained in:
Kovid Goyal 2011-02-10 18:08:15 -07:00
parent 40345e0628
commit b2e5b6679b
2 changed files with 7 additions and 4 deletions

View File

@ -8,11 +8,13 @@ import re, htmlentitydefs
_ascii_pat = None _ascii_pat = None
def clean_ascii_chars(txt, charlist=None): def clean_ascii_chars(txt, charlist=None):
'remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27 by default' '''
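Remove ASCII control chars: 0 to 7 and 11, 12, 14-30 by default
This is all control chars except \\t,\\n and \\r, minus 8 (backspace) and 31
(unit separator), which the default ranges range(8) and range(0x0E, 0x1F) leave out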
'''
global _ascii_pat global _ascii_pat
if _ascii_pat is None: if _ascii_pat is None:
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) \ chars = list(range(8)) + [0x0B, 0x0C] + list(range(0x0E, 0x1F))
+ [0x1A, 0x1B]
_ascii_pat = re.compile(u'|'.join(map(unichr, chars))) _ascii_pat = re.compile(u'|'.join(map(unichr, chars)))
if charlist is None: if charlist is None:

View File

@ -13,6 +13,7 @@ from calibre.web.feeds.feedparser import parse
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
from calibre import entity_to_unicode, strftime from calibre import entity_to_unicode, strftime
from calibre.utils.date import dt_factory, utcnow, local_tz from calibre.utils.date import dt_factory, utcnow, local_tz
from calibre.utils.cleantext import clean_ascii_chars
class Article(object): class Article(object):
@ -43,7 +44,7 @@ class Article(object):
print summary.encode('utf-8') print summary.encode('utf-8')
traceback.print_exc() traceback.print_exc()
summary = u'' summary = u''
self.text_summary = summary self.text_summary = clean_ascii_chars(summary)
self.author = author self.author = author
self.content = content self.content = content
self.date = published self.date = published