diff --git a/setup/browser_data.py b/setup/browser_data.py index 7ee2e6263f..3338913e20 100644 --- a/setup/browser_data.py +++ b/setup/browser_data.py @@ -5,7 +5,9 @@ from __future__ import absolute_import, division, print_function, unicode_literals import os -import re +import json +import gzip +import io from datetime import datetime from setup import download_securely @@ -39,10 +41,14 @@ def common_user_agents(): ] print('Getting recent UAs...') raw = download_securely( - 'https://techblog.willshouse.com/2012/01/03/most-common-user-agents/').decode('utf-8') - lines = re.search( - r'([^<]+)', raw).group(1).splitlines() - ans = filter_ans(lines) + 'https://raw.githubusercontent.com/intoli/user-agents/master/src/user-agents.json.gz') + data = json.loads(gzip.GzipFile(fileobj=io.BytesIO(raw)).read()) + uas = [] + for item in data: + ua = item['userAgent'] + if not ua.startswith('Opera'): + uas.append(ua) + ans = filter_ans(uas)[:256] if not ans: raise ValueError('Failed to download list of common UAs') return ans diff --git a/setup/resources.py b/setup/resources.py index f8e8d5dacd..da7c246a01 100644 --- a/setup/resources.py +++ b/setup/resources.py @@ -270,7 +270,7 @@ class RecentUAs(Command): # {{{ from setup.browser_data import get_data data = get_data() with open(self.UA_PATH, 'wb') as f: - f.write(json.dumps(data, indent=2, ensure_ascii=False).encode('utf-8')) + f.write(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True).encode('utf-8')) # }}}