From 3172795aa20cadbcc0507f0f40019c779173580b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 24 Mar 2008 07:29:07 +0000 Subject: [PATCH] Fix index parsing of non-ascii web sites --- src/libprs500/web/feeds/news.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py index f1e8f3b9f0..68ca49d5ac 100644 --- a/src/libprs500/web/feeds/news.py +++ b/src/libprs500/web/feeds/news.py @@ -265,10 +265,9 @@ class BasicNewsRecipe(object): raw = url_or_raw if not isinstance(raw, unicode) and self.encoding: raw = raw.decode(self.encoding) - raw = re.sub(r'&(\S+?);', - lambda match: entity_to_unicode(match, encoding=self.encoding), - raw) - return BeautifulSoup(raw) + massage = list(BeautifulSoup.MARKUP_MASSAGE) + massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding))) + return BeautifulSoup(raw, markupMassage=massage) def sort_index_by(self, index, weights):