From b2e5b6679bb3e3a7f63f320d5c9f1161087dd99d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 10 Feb 2011 18:08:15 -0700 Subject: [PATCH] News download: Remove all invalid ASCII control characters from article descriptions as they cause XML parsing to fail --- src/calibre/utils/cleantext.py | 8 +++++--- src/calibre/web/feeds/__init__.py | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py index 938960df93..f421fdcba2 100644 --- a/src/calibre/utils/cleantext.py +++ b/src/calibre/utils/cleantext.py @@ -8,11 +8,13 @@ import re, htmlentitydefs _ascii_pat = None def clean_ascii_chars(txt, charlist=None): - 'remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27 by default' + ''' + Remove ASCII control chars: 0 to 8 and 11, 12, 14-31 by default + This is all control chars except \\t,\\n and \\r + ''' global _ascii_pat if _ascii_pat is None: - chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) \ - + [0x1A, 0x1B] + chars = list(range(9)) + [0x0B, 0x0C] + list(range(0x0E, 0x20)) _ascii_pat = re.compile(u'|'.join(map(unichr, chars))) if charlist is None: diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py index 478dd5015b..cddb776b4c 100644 --- a/src/calibre/web/feeds/__init__.py +++ b/src/calibre/web/feeds/__init__.py @@ -13,6 +13,7 @@ from calibre.web.feeds.feedparser import parse from calibre.utils.logging import default_log from calibre import entity_to_unicode, strftime from calibre.utils.date import dt_factory, utcnow, local_tz +from calibre.utils.cleantext import clean_ascii_chars class Article(object): @@ -43,7 +44,7 @@ class Article(object): print summary.encode('utf-8') traceback.print_exc() summary = u'' - self.text_summary = summary + self.text_summary = clean_ascii_chars(summary) self.author = author self.content = content self.date = published