News download: Strip all comments before parsing article HTML. If your recipe depended on the presence of comments, it will have to be adapted.

This commit is contained in:
Kovid Goyal 2010-05-29 19:36:20 -06:00
parent 150487d0e1
commit 9cd0fc1a52

View File

@ -148,6 +148,9 @@ class RecursiveFetcher(object):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
# Remove comments up front: when extracting tags leaves multiple nested
# comments behind, those comments can leave detritus in the output
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
if self.keep_only_tags: