From 9cd0fc1a52425f15d4b17994cebdda7dc2197ee7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 29 May 2010 19:36:20 -0600 Subject: [PATCH] News download: Strip all comments before parsing article HTML. If your recipe depended on the presence of comments, it will have to be adapted. --- src/calibre/web/fetch/simple.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index 24b7027420..93fb516f2d 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -148,6 +148,9 @@ class RecursiveFetcher(object): nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) nmassage.extend(self.preprocess_regexps) nmassage += [(re.compile(r'', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup + # Remove comments as they can leave detritus when extracting tags leaves + # multiple nested comments + nmassage.append((re.compile(r'', re.DOTALL), lambda m: '')) soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage) if self.keep_only_tags: