Update The Guardian

This commit is contained in:
Kovid Goyal 2016-05-10 08:28:58 +05:30
parent 2db8cac86a
commit c82f04c3b1

View File

@@ -40,15 +40,22 @@ class Guardian(BasicNewsRecipe):
]
remove_tags = [
    # Each entry is a tag-matching spec. A callable under 'class' receives the
    # element's class attribute value (falsy values short-circuit and are
    # returned as-is by the ``value and ...`` form).

    # Class attribute containing the substring '--twitter'.
    {'attrs': {'class': lambda value: value and '--twitter' in value}},
    # Class attribute with 'submeta' as one whitespace-separated token.
    {'attrs': {'class': lambda value: value and 'submeta' in value.split()}},
    # data-component attribute of 'share' or 'social'.
    {'attrs': {'data-component': ['share', 'social']}},
    # data-link-name attribute of exactly 'block share'.
    {'attrs': {'data-link-name': 'block share'}},
    # Class attribute containing the substring 'inline-expand-image'.
    {'attrs': {'class': lambda value: value and 'inline-expand-image' in value}},
    # Class attribute with 'modern-visible' as one whitespace-separated token.
    {'attrs': {'class': lambda value: value and 'modern-visible' in value.split()}},
    # <link>, <meta> and <style> elements, selected by tag name.
    {'name': ['link', 'meta', 'style']},
]
remove_tags_after = [
    # Matches an element whose class attribute has 'content__article-body' as
    # one whitespace-separated token; a falsy class value is returned as-is.
    {'attrs': {'class': lambda value: value and 'content__article-body' in value.split()}},
]
def preprocess_raw_html(self, raw, url):
    """Re-parse the raw page with html5lib and return it re-serialized.

    ``raw`` is parsed by html5lib using its lxml tree builder, with HTML
    elements left un-namespaced, and the resulting tree is serialized back
    to markup by ``lxml.html.tostring`` with ``encoding=unicode``.

    ``url`` is accepted as part of the signature but is not used here.
    """
    # Imports are kept local to the method, as in the original.
    from lxml import html as lxml_html
    import html5lib

    tree = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    return lxml_html.tostring(tree, encoding=unicode)
def preprocess_html(self, soup):
for img in soup.findAll('img', srcset=True):
img['src'] = img['srcset'].partition(' ')[0]