Use a random non-IE user agent

This commit is contained in:
Kovid Goyal 2016-12-13 12:14:30 +05:30
parent d700523080
commit cc5d806c6e
3 changed files with 23 additions and 24 deletions

View File

@ -6,6 +6,7 @@ __docformat__ = 'restructuredtext en'
''' '''
www.guardian.co.uk www.guardian.co.uk
''' '''
from calibre import random_user_agent
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from datetime import date from datetime import date
@ -54,8 +55,13 @@ class Guardian(BasicNewsRecipe):
def get_browser(self, *a, **kw): def get_browser(self, *a, **kw):
# This site returns images in JPEG-XR format if the user agent is IE
if not hasattr(self, 'non_ie_ua'):
try:
self.non_ie_ua = random_user_agent(allow_ie=False)
except TypeError:
self.non_ie_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36'
kw['user_agent'] = self.non_ie_ua
br = BasicNewsRecipe.get_browser(self, *a, **kw) br = BasicNewsRecipe.get_browser(self, *a, **kw)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36')]
return br return br
def preprocess_raw_html(self, raw, url): def preprocess_raw_html(self, raw, url):

View File

@ -1,3 +1,6 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from calibre import random_user_agent
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag from calibre.ebooks.BeautifulSoup import Tag
@ -38,8 +41,13 @@ class TheIndependentNew(BasicNewsRecipe):
def get_browser(self, *a, **kw): def get_browser(self, *a, **kw):
# This site returns images in JPEG-XR format if the user agent is IE
if not hasattr(self, 'non_ie_ua'):
try:
self.non_ie_ua = random_user_agent(allow_ie=False)
except TypeError:
self.non_ie_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36'
kw['user_agent'] = self.non_ie_ua
br = BasicNewsRecipe.get_browser(self, *a, **kw) br = BasicNewsRecipe.get_browser(self, *a, **kw)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36')]
return br return br
def preprocess_html(self, soup): def preprocess_html(self, soup):

View File

@ -103,6 +103,7 @@ def osx_version():
def confirm_config_name(name): def confirm_config_name(name):
return name + '_again' return name + '_again'
_filename_sanitize = re.compile(r'[\xae\0\\|\?\*<":>\+/]') _filename_sanitize = re.compile(r'[\xae\0\\|\?\*<":>\+/]')
_filename_sanitize_unicode = frozenset([u'\\', u'|', u'?', u'*', u'<', _filename_sanitize_unicode = frozenset([u'\\', u'|', u'?', u'*', u'<',
u'"', u':', u'>', u'+', u'/'] + list(map(unichr, xrange(32)))) u'"', u':', u'>', u'+', u'/'] + list(map(unichr, xrange(32))))
@ -389,35 +390,19 @@ def get_proxy_info(proxy_scheme, proxy_string):
return None return None
return ans return ans
# IE 11 on windows 7
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko' USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'
USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'
def random_user_agent(choose=None): def random_user_agent(choose=None, allow_ie=True):
try: try:
ua_list = random_user_agent.ua_list ua_list = random_user_agent.ua_list
except AttributeError: except AttributeError:
try: ua_list = random_user_agent.ua_list = P('common-user-agents.txt', data=True, allow_user_override=False).decode('utf-8').splitlines()
ua_list = random_user_agent.ua_list = P('common-user-agents.txt', data=True, allow_user_override=False).decode('utf-8').splitlines() if not allow_ie:
except IOError: ua_list = filter(lambda x: 'Firefox/' in x or 'Chrome/' in x, ua_list)
# People running from source checkout
ua_list = random_user_agent.ua_list = [
# IE 11 - windows 10
'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko',
# IE 11 - windows 8.1
'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
# IE 11 - windows 8
'Mozilla/5.0 (Windows NT 6.2; Trident/7.0; rv:11.0) like Gecko',
# IE 11 - windows 7
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
# 32bit IE 11 on 64 bit win 10
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
# 32bit IE 11 on 64 bit win 8.1
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
# 32bit IE 11 on 64 bit win 7
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
]
return random.choice(ua_list) if choose is None else ua_list[choose] return random.choice(ua_list) if choose is None else ua_list[choose]
@ -631,6 +616,7 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
except KeyError: except KeyError:
return '&'+ent+';' return '&'+ent+';'
_ent_pat = re.compile(r'&(\S+?);') _ent_pat = re.compile(r'&(\S+?);')
xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions={ xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions={
'"' : '&quot;', '"' : '&quot;',
@ -739,4 +725,3 @@ def ipython(user_ns=None):
def fsync(fileobj): def fsync(fileobj):
fileobj.flush() fileobj.flush()
os.fsync(fileobj.fileno()) os.fsync(fileobj.fileno())