mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Use a random non-IE user agent
This commit is contained in:
parent
d700523080
commit
cc5d806c6e
@@ -6,6 +6,7 @@ __docformat__ = 'restructuredtext en'
|
||||
'''
|
||||
www.guardian.co.uk
|
||||
'''
|
||||
from calibre import random_user_agent
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from datetime import date
|
||||
|
||||
@@ -54,8 +55,13 @@ class Guardian(BasicNewsRecipe):
|
||||
|
||||
def get_browser(self, *a, **kw):
    """Return a browser that identifies itself with a non-IE user agent.

    This site returns images in JPEG-XR format if the user agent is IE, so
    a non-IE user agent is chosen once per recipe instance, cached on
    ``self.non_ie_ua`` and handed to ``BasicNewsRecipe.get_browser`` through
    the ``user_agent`` keyword argument. Any ``user_agent`` supplied by the
    caller in ``kw`` is overwritten.
    """
    if not hasattr(self, 'non_ie_ua'):
        try:
            self.non_ie_ua = random_user_agent(allow_ie=False)
        except TypeError:
            # Raised when random_user_agent() does not accept the allow_ie
            # keyword; use a fixed Chrome user agent instead.
            self.non_ie_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36'
    kw['user_agent'] = self.non_ie_ua
    # Do not reassign br.addheaders afterwards: that would replace the user
    # agent selected above (and every other default header) with a single
    # hard-coded value.
    return BasicNewsRecipe.get_browser(self, *a, **kw)
|
||||
|
||||
def preprocess_raw_html(self, raw, url):
|
||||
|
@@ -1,3 +1,6 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from calibre import random_user_agent
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
|
||||
@@ -38,8 +41,13 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
|
||||
def get_browser(self, *a, **kw):
    """Return a browser that identifies itself with a non-IE user agent.

    This site returns images in JPEG-XR format if the user agent is IE, so
    a non-IE user agent is chosen once per recipe instance, cached on
    ``self.non_ie_ua`` and handed to ``BasicNewsRecipe.get_browser`` through
    the ``user_agent`` keyword argument. Any ``user_agent`` supplied by the
    caller in ``kw`` is overwritten.
    """
    if not hasattr(self, 'non_ie_ua'):
        try:
            self.non_ie_ua = random_user_agent(allow_ie=False)
        except TypeError:
            # Raised when random_user_agent() does not accept the allow_ie
            # keyword; use a fixed Chrome user agent instead.
            self.non_ie_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36'
    kw['user_agent'] = self.non_ie_ua
    # Do not reassign br.addheaders afterwards: that would replace the user
    # agent selected above (and every other default header) with a single
    # hard-coded value.
    return BasicNewsRecipe.get_browser(self, *a, **kw)
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
@@ -103,6 +103,7 @@ def osx_version():
|
||||
def confirm_config_name(name):
    """Return *name* with the ``_again`` suffix appended."""
    suffix = '_again'
    return name + suffix
|
||||
|
||||
|
||||
_filename_sanitize = re.compile(r'[\xae\0\\|\?\*<":>\+/]')
|
||||
_filename_sanitize_unicode = frozenset([u'\\', u'|', u'?', u'*', u'<',
|
||||
u'"', u':', u'>', u'+', u'/'] + list(map(unichr, xrange(32))))
|
||||
@@ -389,35 +390,19 @@ def get_proxy_info(proxy_scheme, proxy_string):
|
||||
return None
|
||||
return ans
|
||||
|
||||
|
||||
# IE 11 on windows 7
|
||||
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'
|
||||
USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'
|
||||
|
||||
|
||||
def random_user_agent(choose=None, allow_ie=True):
    """Return a user agent string taken from the list of common user agents.

    The list is read once from the ``common-user-agents.txt`` resource and
    cached on the function object as ``random_user_agent.ua_list``.

    :param choose: ``None`` to pick an entry at random, otherwise the index
        of the entry to return. When ``allow_ie`` is false the index refers
        to the filtered list, not the full one.
    :param allow_ie: When false, only entries containing ``Firefox/`` or
        ``Chrome/`` are considered.
    :return: The selected user agent string.
    """
    try:
        ua_list = random_user_agent.ua_list
    except AttributeError:
        ua_list = random_user_agent.ua_list = P(
            'common-user-agents.txt', data=True,
            allow_user_override=False).decode('utf-8').splitlines()
    if not allow_ie:
        # Build a real list (not a lazy filter object) so that both
        # random.choice() and indexing work; the cached list is left intact.
        ua_list = [ua for ua in ua_list if 'Firefox/' in ua or 'Chrome/' in ua]
    return random.choice(ua_list) if choose is None else ua_list[choose]
|
||||
|
||||
|
||||
@@ -631,6 +616,7 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
|
||||
except KeyError:
|
||||
return '&'+ent+';'
|
||||
|
||||
|
||||
_ent_pat = re.compile(r'&(\S+?);')
|
||||
xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions={
|
||||
'"' : '"',
|
||||
@@ -739,4 +725,3 @@ def ipython(user_ns=None):
|
||||
def fsync(fileobj):
    """Flush *fileobj* and then call ``os.fsync`` on its file descriptor."""
    fileobj.flush()
    descriptor = fileobj.fileno()
    os.fsync(descriptor)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user