mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Use a random non-IE user agent
This commit is contained in:
parent
d700523080
commit
cc5d806c6e
@ -6,6 +6,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
'''
|
'''
|
||||||
www.guardian.co.uk
|
www.guardian.co.uk
|
||||||
'''
|
'''
|
||||||
|
from calibre import random_user_agent
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from datetime import date
|
from datetime import date
|
||||||
|
|
||||||
@ -54,8 +55,13 @@ class Guardian(BasicNewsRecipe):
|
|||||||
|
|
||||||
def get_browser(self, *a, **kw):
|
def get_browser(self, *a, **kw):
|
||||||
# This site returns images in JPEG-XR format if the user agent is IE
|
# This site returns images in JPEG-XR format if the user agent is IE
|
||||||
|
if not hasattr(self, 'non_ie_ua'):
|
||||||
|
try:
|
||||||
|
self.non_ie_ua = random_user_agent(allow_ie=False)
|
||||||
|
except TypeError:
|
||||||
|
self.non_ie_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36'
|
||||||
|
kw['user_agent'] = self.non_ie_ua
|
||||||
br = BasicNewsRecipe.get_browser(self, *a, **kw)
|
br = BasicNewsRecipe.get_browser(self, *a, **kw)
|
||||||
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36')]
|
|
||||||
return br
|
return br
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw, url):
|
def preprocess_raw_html(self, raw, url):
|
||||||
|
@ -1,3 +1,6 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
from calibre import random_user_agent
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import Tag
|
from calibre.ebooks.BeautifulSoup import Tag
|
||||||
|
|
||||||
@ -38,8 +41,13 @@ class TheIndependentNew(BasicNewsRecipe):
|
|||||||
|
|
||||||
def get_browser(self, *a, **kw):
|
def get_browser(self, *a, **kw):
|
||||||
# This site returns images in JPEG-XR format if the user agent is IE
|
# This site returns images in JPEG-XR format if the user agent is IE
|
||||||
|
if not hasattr(self, 'non_ie_ua'):
|
||||||
|
try:
|
||||||
|
self.non_ie_ua = random_user_agent(allow_ie=False)
|
||||||
|
except TypeError:
|
||||||
|
self.non_ie_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36'
|
||||||
|
kw['user_agent'] = self.non_ie_ua
|
||||||
br = BasicNewsRecipe.get_browser(self, *a, **kw)
|
br = BasicNewsRecipe.get_browser(self, *a, **kw)
|
||||||
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36')]
|
|
||||||
return br
|
return br
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
@ -103,6 +103,7 @@ def osx_version():
|
|||||||
def confirm_config_name(name):
|
def confirm_config_name(name):
|
||||||
return name + '_again'
|
return name + '_again'
|
||||||
|
|
||||||
|
|
||||||
_filename_sanitize = re.compile(r'[\xae\0\\|\?\*<":>\+/]')
|
_filename_sanitize = re.compile(r'[\xae\0\\|\?\*<":>\+/]')
|
||||||
_filename_sanitize_unicode = frozenset([u'\\', u'|', u'?', u'*', u'<',
|
_filename_sanitize_unicode = frozenset([u'\\', u'|', u'?', u'*', u'<',
|
||||||
u'"', u':', u'>', u'+', u'/'] + list(map(unichr, xrange(32))))
|
u'"', u':', u'>', u'+', u'/'] + list(map(unichr, xrange(32))))
|
||||||
@ -389,35 +390,19 @@ def get_proxy_info(proxy_scheme, proxy_string):
|
|||||||
return None
|
return None
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
# IE 11 on windows 7
|
# IE 11 on windows 7
|
||||||
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'
|
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'
|
||||||
USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'
|
USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'
|
||||||
|
|
||||||
|
|
||||||
def random_user_agent(choose=None):
|
def random_user_agent(choose=None, allow_ie=True):
|
||||||
try:
|
try:
|
||||||
ua_list = random_user_agent.ua_list
|
ua_list = random_user_agent.ua_list
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
try:
|
ua_list = random_user_agent.ua_list = P('common-user-agents.txt', data=True, allow_user_override=False).decode('utf-8').splitlines()
|
||||||
ua_list = random_user_agent.ua_list = P('common-user-agents.txt', data=True, allow_user_override=False).decode('utf-8').splitlines()
|
if not allow_ie:
|
||||||
except IOError:
|
ua_list = filter(lambda x: 'Firefox/' in x or 'Chrome/' in x, ua_list)
|
||||||
# People running from source checkout
|
|
||||||
ua_list = random_user_agent.ua_list = [
|
|
||||||
# IE 11 - windows 10
|
|
||||||
'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko',
|
|
||||||
# IE 11 - windows 8.1
|
|
||||||
'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
|
|
||||||
# IE 11 - windows 8
|
|
||||||
'Mozilla/5.0 (Windows NT 6.2; Trident/7.0; rv:11.0) like Gecko',
|
|
||||||
# IE 11 - windows 7
|
|
||||||
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
|
|
||||||
# 32bit IE 11 on 64 bit win 10
|
|
||||||
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
|
||||||
# 32bit IE 11 on 64 bit win 8.1
|
|
||||||
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
|
||||||
# 32bit IE 11 on 64 bit win 7
|
|
||||||
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
|
||||||
]
|
|
||||||
return random.choice(ua_list) if choose is None else ua_list[choose]
|
return random.choice(ua_list) if choose is None else ua_list[choose]
|
||||||
|
|
||||||
|
|
||||||
@ -631,6 +616,7 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
|
|||||||
except KeyError:
|
except KeyError:
|
||||||
return '&'+ent+';'
|
return '&'+ent+';'
|
||||||
|
|
||||||
|
|
||||||
_ent_pat = re.compile(r'&(\S+?);')
|
_ent_pat = re.compile(r'&(\S+?);')
|
||||||
xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions={
|
xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions={
|
||||||
'"' : '"',
|
'"' : '"',
|
||||||
@ -739,4 +725,3 @@ def ipython(user_ns=None):
|
|||||||
def fsync(fileobj):
|
def fsync(fileobj):
|
||||||
fileobj.flush()
|
fileobj.flush()
|
||||||
os.fsync(fileobj.fileno())
|
os.fsync(fileobj.fileno())
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user