Use a random non-IE user agent

This commit is contained in:
Kovid Goyal 2016-12-13 12:14:30 +05:30
parent d700523080
commit cc5d806c6e
3 changed files with 23 additions and 24 deletions

View File

@ -6,6 +6,7 @@ __docformat__ = 'restructuredtext en'
'''
www.guardian.co.uk
'''
from calibre import random_user_agent
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import date
@ -54,8 +55,13 @@ class Guardian(BasicNewsRecipe):
def get_browser(self, *a, **kw):
    """Return a browser that identifies itself with a non-IE user agent.

    All positional and keyword arguments are forwarded to
    BasicNewsRecipe.get_browser(); the ``user_agent`` keyword is always
    replaced with the user agent chosen for this recipe instance.
    """
    # This site returns images in JPEG-XR format if the user agent is IE
    if not hasattr(self, 'non_ie_ua'):
        # Choose the user agent once and cache it on the instance so every
        # browser created by this recipe reports the same one.
        try:
            self.non_ie_ua = random_user_agent(allow_ie=False)
        except TypeError:
            # random_user_agent() rejected the allow_ie keyword (presumably
            # an older calibre without that parameter -- TODO confirm);
            # fall back to a fixed Chrome user agent.
            self.non_ie_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36'
    kw['user_agent'] = self.non_ie_ua
    # The user agent is supplied through kw; br.addheaders is deliberately
    # not overwritten afterwards, as that would discard the choice above.
    br = BasicNewsRecipe.get_browser(self, *a, **kw)
    return br
def preprocess_raw_html(self, raw, url):

View File

@ -1,3 +1,6 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from calibre import random_user_agent
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
@ -38,8 +41,13 @@ class TheIndependentNew(BasicNewsRecipe):
def get_browser(self, *a, **kw):
    """Return a browser that identifies itself with a non-IE user agent.

    All positional and keyword arguments are forwarded to
    BasicNewsRecipe.get_browser(); the ``user_agent`` keyword is always
    replaced with the user agent chosen for this recipe instance.
    """
    # This site returns images in JPEG-XR format if the user agent is IE
    if not hasattr(self, 'non_ie_ua'):
        # Choose the user agent once and cache it on the instance so every
        # browser created by this recipe reports the same one.
        try:
            self.non_ie_ua = random_user_agent(allow_ie=False)
        except TypeError:
            # random_user_agent() rejected the allow_ie keyword (presumably
            # an older calibre without that parameter -- TODO confirm);
            # fall back to a fixed Chrome user agent.
            self.non_ie_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36'
    kw['user_agent'] = self.non_ie_ua
    # The user agent is supplied through kw; br.addheaders is deliberately
    # not overwritten afterwards, as that would discard the choice above.
    br = BasicNewsRecipe.get_browser(self, *a, **kw)
    return br
def preprocess_html(self, soup):

View File

@ -103,6 +103,7 @@ def osx_version():
def confirm_config_name(name):
    """Return *name* with the suffix ``_again`` appended."""
    suffix = '_again'
    return name + suffix
# Matches any single one of: \xae, NUL, backslash, |, ?, *, <, ", :, >, + and /
# NOTE(review): presumably consumed by filename sanitizing helpers defined
# elsewhere in this module -- confirm against the callers.
_filename_sanitize = re.compile(r'[\xae\0\\|\?\*<":>\+/]')
# Unicode character set: backslash, |, ?, *, <, ", :, >, + and / plus the
# code points 0-31. Unlike the pattern above it does not contain \xae, and it
# covers every code point below 32 rather than NUL alone.
_filename_sanitize_unicode = frozenset([u'\\', u'|', u'?', u'*', u'<',
u'"', u':', u'>', u'+', u'/'] + list(map(unichr, xrange(32))))
@ -389,35 +390,19 @@ def get_proxy_info(proxy_scheme, proxy_string):
return None
return ans
# Desktop user agent string: IE 11 on windows 7
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'
# Mobile user agent string: identifies as Minimo 0.016 on Windows CE 5.1
USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'
def random_user_agent(choose=None, allow_ie=True):
    """Return a user agent string taken from the list of common user agents.

    The list is read once from the ``common-user-agents.txt`` resource (via
    ``P()``) and cached as the ``ua_list`` attribute of this function.

    :param choose: If None, a random entry is returned. Otherwise it is used
        as an index into the (possibly filtered) list.
    :param allow_ie: If False, only entries containing ``Firefox/`` or
        ``Chrome/`` are considered.
    :return: The selected user agent string.
    """
    try:
        ua_list = random_user_agent.ua_list
    except AttributeError:
        ua_list = random_user_agent.ua_list = P(
            'common-user-agents.txt', data=True, allow_user_override=False
        ).decode('utf-8').splitlines()
    if not allow_ie:
        # Build a real list rather than a lazy filter object so that both
        # random.choice() and indexing keep working; the cached full list is
        # left untouched.
        ua_list = [ua for ua in ua_list if 'Firefox/' in ua or 'Chrome/' in ua]
    return random.choice(ua_list) if choose is None else ua_list[choose]
@ -631,6 +616,7 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
except KeyError:
return '&'+ent+';'
_ent_pat = re.compile(r'&(\S+?);')
xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions={
'"' : '&quot;',
@ -739,4 +725,3 @@ def ipython(user_ns=None):
def fsync(fileobj):
    """Flush *fileobj* and then call os.fsync() on its file descriptor."""
    fileobj.flush()
    descriptor = fileobj.fileno()
    os.fsync(descriptor)