Revert regex and add a filter to strip the <head> element

This commit is contained in:
bobbysteel 2018-05-22 14:11:57 +01:00 committed by GitHub
parent 10ab8bfa26
commit af4934d6b4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -86,12 +86,13 @@ class NewYorkTimes(BasicNewsRecipe):
dict(attrs={'data-videoid':True}), dict(attrs={'data-videoid':True}),
dict(name='button meta link'.split()), dict(name='button meta link'.split()),
dict(id=lambda x: x and x.startswith('story-ad-')), dict(id=lambda x: x and x.startswith('story-ad-')),
dict(name='head'),
dict(name='a', href=lambda x: x and '#story-continues-' in x), dict(name='a', href=lambda x: x and '#story-continues-' in x),
dict(name='a', href=lambda x: x and '#whats-next' in x), dict(name='a', href=lambda x: x and '#whats-next' in x),
dict(id=lambda x: x and 'sharetools-' in x), dict(id=lambda x: x and 'sharetools-' in x),
dict(id='newsletter-promo supported-by-ad bottom-wrapper'.split()), dict(id='newsletter-promo supported-by-ad bottom-wrapper'.split()),
classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'), classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
dict(attrs={'class': lambda x: x and ('SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x)}), dict(attrs={'class': lambda x: x and ('SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}),
] ]
def preprocess_html(self, soup): def preprocess_html(self, soup):