Strip ASCII control characters from feed article summaries.

clean_ascii_chars() by default targets every ASCII control character
except \t, \n and \r, i.e. code points 0-8, 11, 12 and 14-31, and
Article.text_summary is now passed through it.

Review note: range() excludes its end value, so the character list is
built with range(9) and range(0x0E, 0x20); the earlier range(8) /
range(0x0E, 0x1F) left 0x08 and 0x1F out, contradicting the docstring.
The post-image blob id on the first "index" line is kept from the
original patch and does not reflect this amendment.

diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py
index 938960df93..f421fdcba2 100644
--- a/src/calibre/utils/cleantext.py
+++ b/src/calibre/utils/cleantext.py
@@ -8,11 +8,14 @@ import re, htmlentitydefs
 _ascii_pat = None
 
 def clean_ascii_chars(txt, charlist=None):
-    'remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27 by default'
+    '''
+    Remove ASCII control chars: 0 to 8 and 11, 12, 14-31 by default
+    This is all control chars except \\t,\\n and \\r
+    '''
     global _ascii_pat
     if _ascii_pat is None:
-        chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) \
-                               + [0x1A, 0x1B]
+        # range() stops before its end value: 9 -> 0..8, 0x20 -> 14..31
+        chars = list(range(9)) + [0x0B, 0x0C] + list(range(0x0E, 0x20))
         _ascii_pat = re.compile(u'|'.join(map(unichr, chars)))
 
     if charlist is None:
diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py
index 478dd5015b..cddb776b4c 100644
--- a/src/calibre/web/feeds/__init__.py
+++ b/src/calibre/web/feeds/__init__.py
@@ -13,6 +13,7 @@ from calibre.web.feeds.feedparser import parse
 from calibre.utils.logging import default_log
 from calibre import entity_to_unicode, strftime
 from calibre.utils.date import dt_factory, utcnow, local_tz
+from calibre.utils.cleantext import clean_ascii_chars
 
 class Article(object):
 
@@ -43,7 +44,7 @@ class Article(object):
                 print summary.encode('utf-8')
                 traceback.print_exc()
                 summary = u''
-        self.text_summary = summary
+        self.text_summary = clean_ascii_chars(summary)
         self.author = author
         self.content = content
         self.date = published