From 9cd0fc1a52425f15d4b17994cebdda7dc2197ee7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 29 May 2010 19:36:20 -0600
Subject: [PATCH] News download: Strip all comments before parsing article
 HTML. If your recipe depended on the presence of comments, it will have to be
 adapted.

---
 src/calibre/web/fetch/simple.py | 3 +++
 1 file changed, 3 insertions(+)
diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index 24b7027420..93fb516f2d 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -148,6 +148,9 @@ class RecursiveFetcher(object):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
         nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
+        # Remove comments as they can leave detritus when extracting tags leaves
+        # multiple nested comments
+        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
         soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
 
         if self.keep_only_tags: