mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
News download: Remove all invalid ASCII control characters from article descriptions as they cause XML parsing to fail
This commit is contained in:
parent
40345e0628
commit
b2e5b6679b
@ -8,11 +8,13 @@ import re, htmlentitydefs
|
||||
_ascii_pat = None
|
||||
|
||||
def clean_ascii_chars(txt, charlist=None):
|
||||
'remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27 by default'
|
||||
'''
|
||||
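Remove ASCII control chars: 0 to 8 and 11, 12, 14-31 by default (intended set; note that the list built below from range(8) and range(0x0E, 0x1F) stops at 7 and 30, so 0x08 and 0x1F are not actually removed)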
|
||||
This is all control chars except \\t,\\n and \\r
|
||||
'''
|
||||
global _ascii_pat
|
||||
if _ascii_pat is None:
|
||||
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) \
|
||||
+ [0x1A, 0x1B]
|
||||
chars = list(range(8)) + [0x0B, 0x0C] + list(range(0x0E, 0x1F))
|
||||
_ascii_pat = re.compile(u'|'.join(map(unichr, chars)))
|
||||
|
||||
if charlist is None:
|
||||
|
@ -13,6 +13,7 @@ from calibre.web.feeds.feedparser import parse
|
||||
from calibre.utils.logging import default_log
|
||||
from calibre import entity_to_unicode, strftime
|
||||
from calibre.utils.date import dt_factory, utcnow, local_tz
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
|
||||
class Article(object):
|
||||
|
||||
@ -43,7 +44,7 @@ class Article(object):
|
||||
print summary.encode('utf-8')
|
||||
traceback.print_exc()
|
||||
summary = u''
|
||||
self.text_summary = summary
|
||||
self.text_summary = clean_ascii_chars(summary)
|
||||
self.author = author
|
||||
self.content = content
|
||||
self.date = published
|
||||
|
Loading…
x
Reference in New Issue
Block a user