Fix images not working for Guardian and Independent

Apparently both sites serve images in JPEG-XR format if the user agent is IE.
This commit is contained in:
Kovid Goyal 2016-12-13 11:02:45 +05:30
parent bcbac05d04
commit d700523080
2 changed files with 12 additions and 0 deletions

View File

@ -52,6 +52,12 @@ class Guardian(BasicNewsRecipe):
dict(attrs={'class': lambda x: x and 'content__article-body' in x.split()}), dict(attrs={'class': lambda x: x and 'content__article-body' in x.split()}),
] ]
def get_browser(self, *a, **kw):
    """Return the base recipe browser, identifying itself as Chrome on Linux.

    This site returns images in JPEG-XR format if the user agent is IE, so
    the browser's header list is replaced with a single Chrome User-agent
    entry. All positional and keyword arguments are forwarded unchanged to
    ``BasicNewsRecipe.get_browser``.
    """
    chrome_user_agent = (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36'
    )
    browser = BasicNewsRecipe.get_browser(self, *a, **kw)
    # Assigning (not appending) discards whatever headers the base browser set.
    browser.addheaders = [('User-agent', chrome_user_agent)]
    return browser
def preprocess_raw_html(self, raw, url): def preprocess_raw_html(self, raw, url):
import html5lib import html5lib
from lxml import html from lxml import html

View File

@ -36,6 +36,12 @@ class TheIndependentNew(BasicNewsRecipe):
remove_attributes = ['style'] remove_attributes = ['style']
def get_browser(self, *a, **kw):
    """Build the recipe's browser with a Chrome-on-Linux User-agent header.

    This site returns images in JPEG-XR format if the user agent is IE, so
    the header list of the browser obtained from
    ``BasicNewsRecipe.get_browser`` is overwritten with one User-agent
    entry. Arguments are passed through to the base implementation as-is.
    """
    user_agent_header = (
        'User-agent',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36',
    )
    browser = BasicNewsRecipe.get_browser(self, *a, **kw)
    # The whole header list is replaced, leaving only the User-agent entry.
    browser.addheaders = [user_agent_header]
    return browser
def preprocess_html(self, soup): def preprocess_html(self, soup):
for div in soup.findAll(attrs={'class': 'full-gallery'}): for div in soup.findAll(attrs={'class': 'full-gallery'}):
imgs = {} imgs = {}