From cc5d806c6e00e862f698ed9f612c0b6b5c4bdb72 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 13 Dec 2016 12:14:30 +0530 Subject: [PATCH] Use a random non-IE user agent --- recipes/guardian.recipe | 8 +++++++- recipes/independent.recipe | 10 +++++++++- src/calibre/__init__.py | 29 +++++++---------------------- 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe index d0619b052d..3572beee8a 100644 --- a/recipes/guardian.recipe +++ b/recipes/guardian.recipe @@ -6,6 +6,7 @@ __docformat__ = 'restructuredtext en' ''' www.guardian.co.uk ''' +from calibre import random_user_agent from calibre.web.feeds.news import BasicNewsRecipe from datetime import date @@ -54,8 +55,13 @@ class Guardian(BasicNewsRecipe): def get_browser(self, *a, **kw): # This site returns images in JPEG-XR format if the user agent is IE + if not hasattr(self, 'non_ie_ua'): + try: + self.non_ie_ua = random_user_agent(allow_ie=False) + except TypeError: + self.non_ie_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36' + kw['user_agent'] = self.non_ie_ua br = BasicNewsRecipe.get_browser(self, *a, **kw) - br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36')] return br def preprocess_raw_html(self, raw, url): diff --git a/recipes/independent.recipe b/recipes/independent.recipe index da97af0453..1a393578a0 100644 --- a/recipes/independent.recipe +++ b/recipes/independent.recipe @@ -1,3 +1,6 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +from calibre import random_user_agent from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Tag @@ -38,8 +41,13 @@ class TheIndependentNew(BasicNewsRecipe): def get_browser(self, *a, **kw): # This site returns images in JPEG-XR format if the user agent is IE + if not hasattr(self, 'non_ie_ua'): + try: + self.non_ie_ua = random_user_agent(allow_ie=False) + except TypeError: + self.non_ie_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36' + kw['user_agent'] = self.non_ie_ua br = BasicNewsRecipe.get_browser(self, *a, **kw) - br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36')] return br def preprocess_html(self, soup): diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index c37c0fcf86..dfd897ba07 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -103,6 +103,7 @@ def osx_version(): def confirm_config_name(name): return name + '_again' + _filename_sanitize = re.compile(r'[\xae\0\\|\?\*<":>\+/]') _filename_sanitize_unicode = frozenset([u'\\', u'|', u'?', u'*', u'<', u'"', u':', u'>', u'+', u'/'] + list(map(unichr, xrange(32)))) @@ -389,35 +390,19 @@ def get_proxy_info(proxy_scheme, proxy_string): return None return ans + # IE 11 on windows 7 USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko' USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' -def random_user_agent(choose=None): +def random_user_agent(choose=None, allow_ie=True): try: ua_list = random_user_agent.ua_list except AttributeError: - try: - ua_list = random_user_agent.ua_list = P('common-user-agents.txt', data=True, allow_user_override=False).decode('utf-8').splitlines() - except IOError: - # People running from source checkout - ua_list = random_user_agent.ua_list = [ - # IE 11 - windows 10 - 'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko', - # IE 11 - windows 8.1 - 'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko', - # IE 11 - windows 8 - 'Mozilla/5.0 (Windows NT 6.2; Trident/7.0; rv:11.0) like Gecko', - # IE 11 - windows 7 - 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko', - # 32bit IE 11 on 64 bit win 10 - 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko', - # 32bit IE 11 on 64 bit win 8.1 - 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko', - # 32bit IE 11 on 64 bit win 7 - 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko', - ] + ua_list = random_user_agent.ua_list = P('common-user-agents.txt', data=True, allow_user_override=False).decode('utf-8').splitlines() + if not allow_ie: + ua_list = filter(lambda x: 'Firefox/' in x or 'Chrome/' in x, ua_list) return random.choice(ua_list) if choose is None else ua_list[choose] @@ -631,6 +616,7 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252', except KeyError: return '&'+ent+';' + _ent_pat = re.compile(r'&(\S+?);') xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions={ '"' : '"', @@ -739,4 +725,3 @@ def ipython(user_ns=None): def fsync(fileobj): fileobj.flush() os.fsync(fileobj.fileno()) -