Fix index parsing of non-ASCII web sites

This commit is contained in:
Kovid Goyal 2008-03-24 07:29:07 +00:00
parent 366258b571
commit 3172795aa2

View File

@@ -265,10 +265,9 @@ class BasicNewsRecipe(object):
         raw = url_or_raw
         if not isinstance(raw, unicode) and self.encoding:
             raw = raw.decode(self.encoding)
-        raw = re.sub(r'&(\S+?);',
-                     lambda match: entity_to_unicode(match, encoding=self.encoding),
-                     raw)
-        return BeautifulSoup(raw)
+        massage = list(BeautifulSoup.MARKUP_MASSAGE)
+        massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
+        return BeautifulSoup(raw, markupMassage=massage)
     def sort_index_by(self, index, weights):