Use a random non-IE user agent

2025-07-09 03:04:10 -04:00 · 2016-12-13 12:14:30 +05:30 · 2016-12-13 12:14:30 +05:30 · cc5d806c6e
commit cc5d806c6e
parent d700523080
3 changed files with 23 additions and 24 deletions
--- a/recipes/guardian.recipe
+++ b/recipes/guardian.recipe
@ -6,6 +6,7 @@ __docformat__ = 'restructuredtext en'
 '''
 www.guardian.co.uk
 '''
 from calibre import random_user_agent
 from calibre.web.feeds.news import BasicNewsRecipe
 from datetime import date
@ -54,8 +55,13 @@ class Guardian(BasicNewsRecipe):
    def get_browser(self, *a, **kw):
        '''
        Return a browser that identifies itself with a non-IE user agent.

        This site returns images in JPEG-XR format if the user agent is IE,
        so a non-IE user agent is picked once, cached on the recipe instance
        as ``non_ie_ua`` and passed to BasicNewsRecipe.get_browser() through
        the ``user_agent`` keyword (replacing any value the caller supplied).
        All other positional and keyword arguments are forwarded unchanged.
        '''
        if not hasattr(self, 'non_ie_ua'):
            try:
                self.non_ie_ua = random_user_agent(allow_ie=False)
            except TypeError:
                # random_user_agent() variants that do not accept the
                # allow_ie keyword raise TypeError; use a fixed Chrome agent
                self.non_ie_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36'
        kw['user_agent'] = self.non_ie_ua
        # br.addheaders must not be reassigned afterwards: doing so would
        # throw away the agent chosen above together with every other header
        # the base class configured.
        return BasicNewsRecipe.get_browser(self, *a, **kw)
    def preprocess_raw_html(self, raw, url):
--- a/recipes/independent.recipe
+++ b/recipes/independent.recipe
@ -1,3 +1,6 @@
 #!/usr/bin/env python2
 # vim:fileencoding=utf-8
 from calibre import random_user_agent
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Tag
@ -38,8 +41,13 @@ class TheIndependentNew(BasicNewsRecipe):
    def get_browser(self, *a, **kw):
        '''
        Return a browser that identifies itself with a non-IE user agent.

        This site returns images in JPEG-XR format if the user agent is IE,
        so a non-IE user agent is picked once, cached on the recipe instance
        as ``non_ie_ua`` and passed to BasicNewsRecipe.get_browser() through
        the ``user_agent`` keyword (replacing any value the caller supplied).
        All other positional and keyword arguments are forwarded unchanged.
        '''
        if not hasattr(self, 'non_ie_ua'):
            try:
                self.non_ie_ua = random_user_agent(allow_ie=False)
            except TypeError:
                # random_user_agent() variants that do not accept the
                # allow_ie keyword raise TypeError; use a fixed Chrome agent
                self.non_ie_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36'
        kw['user_agent'] = self.non_ie_ua
        # br.addheaders must not be reassigned afterwards: doing so would
        # throw away the agent chosen above together with every other header
        # the base class configured.
        return BasicNewsRecipe.get_browser(self, *a, **kw)
    def preprocess_html(self, soup):
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -103,6 +103,7 @@ def osx_version():
 def confirm_config_name(name):
    '''
    Return *name* with the fixed ``_again`` suffix appended.
    '''
    # NOTE(review): presumably the key under which a "show this
    # confirmation again" preference is stored - confirm against callers.
    suffix = '_again'
    return name + suffix
 _filename_sanitize = re.compile(r'[\xae\0\\|\?\*<":>\+/]')
 _filename_sanitize_unicode = frozenset([u'\\', u'|', u'?', u'*', u'<',
    u'"', u':', u'>', u'+', u'/'] + list(map(unichr, xrange(32))))
@ -389,35 +390,19 @@ def get_proxy_info(proxy_scheme, proxy_string):
        return None
    return ans
 # IE 11 on windows 7
 USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'
 USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'
-def random_user_agent(choose=None):
+def random_user_agent(choose=None, allow_ie=True):
    try:
        ua_list = random_user_agent.ua_list
    except AttributeError:
-        try:
+        ua_list = random_user_agent.ua_list = P('common-user-agents.txt', data=True, allow_user_override=False).decode('utf-8').splitlines()
-            ua_list = random_user_agent.ua_list = P('common-user-agents.txt', data=True, allow_user_override=False).decode('utf-8').splitlines()
+    if not allow_ie:
-        except IOError:
+        ua_list = filter(lambda x: 'Firefox/' in x or 'Chrome/' in x, ua_list)
            # People running from source checkout
            ua_list = random_user_agent.ua_list = [
                 # IE 11 - windows 10
                 'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko',
                 # IE 11 - windows 8.1
                 'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
                 # IE 11 - windows 8
                 'Mozilla/5.0 (Windows NT 6.2; Trident/7.0; rv:11.0) like Gecko',
                 # IE 11 - windows 7
                 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
                 # 32bit IE 11 on 64 bit win 10
                 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
                 # 32bit IE 11 on 64 bit win 8.1
                 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                 # 32bit IE 11 on 64 bit win 7
                 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
            ]
    return random.choice(ua_list) if choose is None else ua_list[choose]
@ -631,6 +616,7 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
    except KeyError:
        return '&'+ent+';'
 _ent_pat = re.compile(r'&(\S+?);')
 xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions={
    '"' : '&quot;',
@ -739,4 +725,3 @@ def ipython(user_ns=None):
 def fsync(fileobj):
    '''
    Flush *fileobj* and then call os.fsync() on its file descriptor.

    *fileobj* must provide ``flush()`` and ``fileno()``. Returns None.
    '''
    # Flush first so the object's own buffered data is written out before
    # os.fsync() is called on the descriptor.
    fileobj.flush()
    descriptor = fileobj.fileno()
    os.fsync(descriptor)