News download: Strip all comments before parsing article HTML. If your recipe depended on the presence of comments, it will have to be adapted.

2025-07-09 03:04:10 -04:00 · 2010-05-29 19:36:20 -06:00 · 2010-05-29 19:36:20 -06:00 · 9cd0fc1a52
commit 9cd0fc1a52
parent 150487d0e1
1 changed files with 3 additions and 0 deletions
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@ -148,6 +148,9 @@ class RecursiveFetcher(object):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
+        # Remove HTML comments up front: they can leave detritus behind when
+        # extracting tags leaves multiple nested comments
+        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

        if self.keep_only_tags: