mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
News download: Strip all comments before parsing article HTML. If your recipe depended ont eh presence of comemnts, it will have to be adapted.
This commit is contained in:
parent
150487d0e1
commit
9cd0fc1a52
@ -148,6 +148,9 @@ class RecursiveFetcher(object):
|
||||
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
|
||||
nmassage.extend(self.preprocess_regexps)
|
||||
nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
|
||||
# Remove comments as they can leave detritus when extracting tags leaves
|
||||
# multiple nested comments
|
||||
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
|
||||
soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
|
||||
|
||||
if self.keep_only_tags:
|
||||
|
Loading…
x
Reference in New Issue
Block a user