Cleanup after latest changes to site

This commit is contained in:
bobbysteel 2017-02-28 20:39:45 +00:00 committed by GitHub
parent 0ac9ac2e15
commit 2a6bc4f501

View File

@@ -59,23 +59,26 @@ class Economist(BasicNewsRecipe):
oldest_article = 7.0 oldest_article = 7.0
resolve_internal_links = True resolve_internal_links = True
remove_tags = [ remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']), dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
dict( dict(attrs={
attrs={
'class': [ 'class': [
'dblClkTrk', 'ec-article-info', 'share_inline_header', 'dblClkTrk', 'ec-article-info', 'share_inline_header',
'related-items', 'main-content-container', 'ec-topic-widget' 'related-items', 'main-content-container', 'ec-topic-widget',
'teaser', 'blog-post__bottom-panel-bottom', 'blog-post__comments-label',
'blog-post__foot-note'
] ]
} }
), ),
{ dict(attrs={'class': lambda x: x and 'share-links-header' in x.split()}),
'class': lambda x: x and 'share-links-header' in x dict(attrs={'class': lambda x: x and 'teaser--wrapped' in x.split()}),
},
] ]
keep_only_tags = [dict(name='article', id=lambda x: not x)] keep_only_tags = [dict(name='article', id=lambda x: not x)]
no_stylesheets = True no_stylesheets = True
preprocess_regexps = [(re.compile('</html>.*', re.DOTALL), lambda x: '</html>')] preprocess_regexps = [
(re.compile('</html>.*', re.DOTALL), lambda x: '</html>'),
(re.compile('<h1 class="flytitle-and-title__body".*?><span class="flytitle-and-title__flytitle".*?>',re.DOTALL|re.IGNORECASE), lambda x: '<h2><br />'),
(re.compile('</span><span class="flytitle-and-title__title".*?>',re.DOTALL|re.IGNORECASE), lambda x: '</h2><h1><span class="flytitle-and-title__title">')
]
# economist.com has started throttling after about 60% of the total has # economist.com has started throttling after about 60% of the total has
# downloaded with connection reset by peer (104) errors. # downloaded with connection reset by peer (104) errors.
delay = 1 delay = 1