From d700523080a05dc37c613e301aad5c75c92a666a Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 13 Dec 2016 11:02:45 +0530
Subject: [PATCH] Fix images not working for Guardian and Independent

Apparently they serve images in JPEG-XR format if the user agent is IE
---
 recipes/guardian.recipe    | 6 ++++++
 recipes/independent.recipe | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe
index c6f814952f..d0619b052d 100644
--- a/recipes/guardian.recipe
+++ b/recipes/guardian.recipe
@@ -52,6 +52,12 @@ class Guardian(BasicNewsRecipe):
         dict(attrs={'class': lambda x: x and 'content__article-body' in x.split()}),
     ]
 
+    def get_browser(self, *a, **kw):
+        # This site returns images in JPEG-XR format if the user agent is IE
+        br = BasicNewsRecipe.get_browser(self, *a, **kw)
+        br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36')]
+        return br
+
     def preprocess_raw_html(self, raw, url):
         import html5lib
         from lxml import html
diff --git a/recipes/independent.recipe b/recipes/independent.recipe
index 6194f05891..da97af0453 100644
--- a/recipes/independent.recipe
+++ b/recipes/independent.recipe
@@ -36,6 +36,12 @@ class TheIndependentNew(BasicNewsRecipe):
 
     remove_attributes = ['style']
 
+    def get_browser(self, *a, **kw):
+        # This site returns images in JPEG-XR format if the user agent is IE
+        br = BasicNewsRecipe.get_browser(self, *a, **kw)
+        br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36')]
+        return br
+
     def preprocess_html(self, soup):
         for div in soup.findAll(attrs={'class': 'full-gallery'}):
             imgs = {}