mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
A larger pool of random UAs
This commit is contained in:
parent
3cd8b3f89d
commit
caac92bbd8
2
.gitignore
vendored
2
.gitignore
vendored
@ -26,7 +26,7 @@ resources/content-server/locales.zip
|
||||
resources/content-server/mathjax.zip.xz
|
||||
resources/content-server/mathjax.version
|
||||
resources/mozilla-ca-certs.pem
|
||||
resources/common-user-agents.txt
|
||||
resources/user-agent-data.json
|
||||
icons/icns/*.iconset
|
||||
setup/installer/windows/calibre/build.log
|
||||
tags
|
||||
|
115
setup/browser_data.py
Normal file
115
setup/browser_data.py
Normal file
@ -0,0 +1,115 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
from setup import download_securely
|
||||
|
||||
# True when the CI environment variable is set to "true" (case-insensitive).
# The scraping functions in this module check this flag and return canned
# data instead of downloading anything.
is_ci = os.environ.get('CI', '').lower() == 'true'
|
||||
|
||||
|
||||
def filter_ans(ans):
    """Strip every string in ``ans`` and drop the ones that end up empty.

    A real list is always returned. The callers in this module test the
    result with ``if not ans`` and later serialise it as JSON; a lazy
    ``filter`` object (what ``filter()`` yields on Python 3) is always truthy
    and is not JSON serialisable, so it must not leak out of here.
    """
    stripped = (x.strip() for x in ans)
    return [x for x in stripped if x]
|
||||
|
||||
|
||||
def common_user_agents():
    """Return a list of common browser user agent strings.

    When ``is_ci`` is true a fixed list of IE 11 user agents is returned
    before any download is attempted. Otherwise the list is scraped from the
    "get-the-list" textarea of the techblog.willshouse.com page.

    Raises ValueError if the textarea cannot be found in the downloaded page
    or if it contains no non-blank lines.
    """
    if is_ci:
        return [
            # IE 11 - windows 10
            'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko',
            # IE 11 - windows 8.1
            'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
            # IE 11 - windows 8
            'Mozilla/5.0 (Windows NT 6.2; Trident/7.0; rv:11.0) like Gecko',
            # IE 11 - windows 7
            'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
            # 32bit IE 11 on 64 bit win 10
            'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
            # 32bit IE 11 on 64 bit win 8.1
            'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
            # 32bit IE 11 on 64 bit win 7
            'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        ]
    print('Getting recent UAs...')
    raw = download_securely(
        'https://techblog.willshouse.com/2012/01/03/most-common-user-agents/').decode('utf-8')
    match = re.search(
        r'<textarea.+"get-the-list".+>([^<]+)</textarea>', raw)
    # re.search() returns None when the page layout changes; treat that the
    # same as an empty list so the caller gets the ValueError below rather
    # than an AttributeError from calling .group() on None.
    lines = match.group(1).splitlines() if match is not None else ()
    # list() keeps the emptiness test meaningful even if filter_ans() hands
    # back a lazy iterable.
    ans = list(filter_ans(lines))
    if not ans:
        raise ValueError('Failed to download list of common UAs')
    return ans
|
||||
|
||||
|
||||
def firefox_versions():
    """Return a list of Firefox version strings.

    When ``is_ci` is true the fixed list ``['51.0', '50.0']`` is returned
    without downloading anything. Otherwise the versions are read from the
    link texts inside the first ``<ol>`` under ``div#main-content`` on the
    mozilla.org releases page, in document order.

    Raises ValueError if that list element is missing or yields no versions.
    """
    if is_ci:
        return '51.0 50.0'.split()
    print('Getting firefox versions...')
    import html5lib
    raw = download_securely(
        'https://www.mozilla.org/en-US/firefox/releases/').decode('utf-8')
    root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    lists = root.xpath('//div[@id="main-content"]/ol')
    if not lists:
        # Page layout changed: report it the same way as an empty result
        # instead of failing with a bare IndexError.
        raise ValueError('Failed to download list of firefox versions')
    # list() keeps the emptiness test meaningful even if filter_ans() hands
    # back a lazy iterable.
    ans = list(filter_ans(
        lists[0].xpath('descendant::li/strong/a[@href]/text()')))
    if not ans:
        raise ValueError('Failed to download list of firefox versions')
    return ans
|
||||
|
||||
|
||||
def chrome_versions():
    """Return Chrome release data scraped from Wikipedia.

    When ``is_ci`` is true an empty list is returned without downloading
    anything. Otherwise the rows of the last ``<tbody>`` inside
    ``#mw-content-text`` on the "Google Chrome version history" page are
    read, and each usable row becomes a dict with the keys ``date``
    (``YYYY-MM-DD``), ``chrome_version`` and ``webkit_version``. The returned
    list is in the reverse of the order in which rows were collected.
    """
    if is_ci:
        return []
    print('Getting chrome versions...')
    import html5lib
    raw = download_securely(
        'https://en.wikipedia.org/wiki/Google_Chrome_version_history').decode('utf-8')
    root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    table = root.xpath('//*[@id="mw-content-text"]//tbody')[-1]
    ans = []
    for tr in table.iterchildren('tr'):
        cells = tuple(tr.iterchildren('td'))
        # Header rows have no <td>; rows with fewer than three cells cannot
        # carry the layout-engine column read below, so skip them instead of
        # failing with IndexError.
        if len(cells) < 3:
            continue
        if not cells[2].text or not cells[2].text.strip():
            continue
        # A cell without a style attribute yields None; treat it as an empty
        # style so the membership tests below cannot raise TypeError.
        s = cells[0].get('style') or ''
        # NOTE(review): these look like the background colours the wiki table
        # uses to mark release status -- confirm against the page. Scanning
        # stops at the first row carrying neither marker.
        if '#a0e75a' not in s and 'salmon' not in s:
            break
        chrome_version = cells[0].text.strip()
        # Round-trip through strptime so a malformed date raises instead of
        # being stored verbatim.
        ts = datetime.strptime(
            cells[1].text.strip().split()[0], '%Y-%m-%d').date().strftime('%Y-%m-%d')
        try:
            # The second whitespace-separated token of the cell is used as
            # the version number.
            webkit_version = cells[2].text.strip().split()[1]
        except IndexError:
            continue
        ans.append({'date': ts, 'chrome_version': chrome_version,
                    'webkit_version': webkit_version})
    return list(reversed(ans))
|
||||
|
||||
|
||||
def all_desktop_platforms(user_agents):
    """Collect the distinct platform strings found in ``user_agents``.

    Only user agents that contain ``Firefox/`` or ``Chrome/`` and do not
    contain ``Mobile/`` are considered. The platform is the text between the
    first ``(`` and the following ``)``; for Firefox user agents the last
    ``;``-separated token of that text is dropped. Returns a set.
    """
    platforms = set()
    for agent in user_agents:
        is_firefox = 'Firefox/' in agent
        if 'Mobile/' in agent or not (is_firefox or 'Chrome/' in agent):
            continue
        inside_parens = agent.partition('(')[2].partition(')')[0]
        tokens = inside_parens.split(';')
        if is_firefox:
            # split() always yields at least one token, so pop() is safe.
            tokens.pop()
        platforms.add(';'.join(tokens))
    return platforms
|
||||
|
||||
|
||||
def get_data():
    """Gather all browser data into a single dict.

    The result has the keys ``chrome_versions``, ``firefox_versions``,
    ``common_user_agents`` and ``desktop_platforms``; the last one is derived
    from the downloaded common user agents.
    """
    ans = {
        'chrome_versions': chrome_versions(),
        'firefox_versions': firefox_versions(),
        'common_user_agents': common_user_agents(),
    }
    # all_desktop_platforms() returns a set, whose iteration order is
    # arbitrary; sort it so the same input always produces the same output.
    ans['desktop_platforms'] = sorted(
        all_desktop_platforms(ans['common_user_agents']))
    return ans
|
@ -257,38 +257,14 @@ class CACerts(Command): # {{{
|
||||
|
||||
class RecentUAs(Command): # {{{
|
||||
|
||||
description = 'Get updated list of recent browser user agents'
|
||||
UA_PATH = os.path.join(Command.RESOURCES, 'common-user-agents.txt')
|
||||
|
||||
def get_list(self):
|
||||
if is_ci:
|
||||
# Don't hammer the server from CI
|
||||
return [
|
||||
# IE 11 - windows 10
|
||||
'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko',
|
||||
# IE 11 - windows 8.1
|
||||
'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
|
||||
# IE 11 - windows 8
|
||||
'Mozilla/5.0 (Windows NT 6.2; Trident/7.0; rv:11.0) like Gecko',
|
||||
# IE 11 - windows 7
|
||||
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
|
||||
# 32bit IE 11 on 64 bit win 10
|
||||
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
||||
# 32bit IE 11 on 64 bit win 8.1
|
||||
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
||||
# 32bit IE 11 on 64 bit win 7
|
||||
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
||||
]
|
||||
raw = download_securely('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/').decode('utf-8')
|
||||
lines = re.search(r'<textarea.+"get-the-list".+>([^<]+)</textarea>', raw).group(1).splitlines()
|
||||
return [x.strip() for x in lines if x.strip()]
|
||||
description = 'Get updated list of common browser user agents'
|
||||
UA_PATH = os.path.join(Command.RESOURCES, 'user-agent-data.json')
|
||||
|
||||
def run(self, opts):
|
||||
lines = self.get_list()[:10]
|
||||
if not lines:
|
||||
raise RuntimeError('Failed to download list of common user agents')
|
||||
from setup.browser_data import get_data
|
||||
data = get_data()
|
||||
with open(self.UA_PATH, 'wb') as f:
|
||||
f.write('\n'.join(lines).encode('ascii'))
|
||||
f.write(json.dumps(data, indent=2))
|
||||
# }}}
|
||||
|
||||
|
||||
|
@ -397,12 +397,11 @@ USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/
|
||||
|
||||
|
||||
def random_user_agent(choose=None, allow_ie=True):
|
||||
try:
|
||||
ua_list = random_user_agent.ua_list
|
||||
except AttributeError:
|
||||
ua_list = random_user_agent.ua_list = P('common-user-agents.txt', data=True, allow_user_override=False).decode('utf-8').splitlines()
|
||||
from calibre.utils.random_ua import common_user_agents
|
||||
ua_list = common_user_agents()
|
||||
ua_list = filter(lambda x: 'Mobile/' not in x, ua_list)
|
||||
if not allow_ie:
|
||||
ua_list = filter(lambda x: 'Firefox/' in x or 'Chrome/' in x, ua_list)
|
||||
ua_list = filter(lambda x: 'Trident/' not in x and 'Edge/' not in x, ua_list)
|
||||
return random.choice(ua_list) if choose is None else ua_list[choose]
|
||||
|
||||
|
||||
|
50
src/calibre/utils/random_ua.py
Normal file
50
src/calibre/utils/random_ua.py
Normal file
@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import json
|
||||
import random
|
||||
|
||||
|
||||
def user_agent_data():
    """Return the parsed contents of the ``user-agent-data.json`` resource.

    The parsed object is stored on the function itself as the ``ans``
    attribute, so the resource is loaded and decoded only when no non-None
    value is stored there yet.
    """
    cached = getattr(user_agent_data, 'ans', None)
    if cached is not None:
        return cached
    raw = P('user-agent-data.json', data=True, allow_user_override=False)
    user_agent_data.ans = parsed = json.loads(raw)
    return parsed
|
||||
|
||||
|
||||
def common_user_agents():
    """Return the ``common_user_agents`` entry of the user agent data."""
    data = user_agent_data()
    return data['common_user_agents']
|
||||
|
||||
|
||||
def random_firefox_version():
    """Pick at random one of the first seven ``firefox_versions`` entries."""
    candidates = user_agent_data()['firefox_versions']
    return random.choice(candidates[:7])
|
||||
|
||||
|
||||
def random_desktop_platform():
    """Pick at random one entry of ``desktop_platforms``."""
    platforms = user_agent_data()['desktop_platforms']
    return random.choice(platforms)
|
||||
|
||||
|
||||
def random_firefox_ua():
    """Build a Firefox user agent from a random platform and version."""
    # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent/Firefox
    # The platform is drawn before the version, keeping the order in which
    # the random generator is consumed.
    platform = random_desktop_platform()
    version = random_firefox_version()
    template = 'Mozilla/5.0 ({p}; rv:{ver}) Gecko/20100101 Firefox/{ver}'
    return template.format(p=platform, ver=version)
|
||||
|
||||
|
||||
def random_chrome_version():
    """Pick at random one of the first seven ``chrome_versions`` entries."""
    candidates = user_agent_data()['chrome_versions']
    return random.choice(candidates[:7])
|
||||
|
||||
|
||||
def random_chrome_ua():
    """Build a Chrome user agent from a random version entry and platform."""
    # The version entry is drawn before the platform, keeping the order in
    # which the random generator is consumed.
    release = random_chrome_version()
    platform = random_desktop_platform()
    template = 'Mozilla/5.0 ({p}) AppleWebKit/{wv} (KHTML, like Gecko) Chrome/{cv} Safari/{wv}'
    return template.format(
        p=platform, wv=release['webkit_version'], cv=release['chrome_version'])
|
||||
|
||||
|
||||
def random_user_agent():
    """Return a random Chrome or Firefox user agent string."""
    generators = (random_chrome_ua, random_firefox_ua)
    make_ua = random.choice(generators)
    return make_ua()
|
Loading…
x
Reference in New Issue
Block a user