From 2c8e5a3a36d45c14c285d8d010c22ab34eccc932 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 24 Mar 2022 17:35:48 +0530
Subject: [PATCH] Start working on using Chromiums network stack to download HTML

A bunch of websites have started using TLS fingerprinting to deny access.
Bloody morons.
---
Review note: create_profile() in src/calibre/scraper/simple.py returns the
QWebEngineProfile it builds (ans) rather than that profile's settings object
(s), because SimpleScraper.__init__ passes the return value to
QWebEnginePage.__init__ as the profile argument. The index line for
simple.py was not regenerated after this one-line change.

 src/calibre/__init__.py         | 10 ++-----
 src/calibre/scraper/__init__.py |  0
 src/calibre/scraper/simple.py   | 50 +++++++++++++++++++++++++++++++++
 src/calibre/utils/random_ua.py  | 18 ++++++++++++
 4 files changed, 71 insertions(+), 7 deletions(-)
 create mode 100644 src/calibre/scraper/__init__.py
 create mode 100644 src/calibre/scraper/simple.py

diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index e03ada8cc5..bea87b67ef 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -3,7 +3,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal '
 __docformat__ = 'restructuredtext en'
 
-import sys, os, re, time, random, warnings
+import sys, os, re, time, warnings
 from polyglot.builtins import codepoint_to_chr, hasenv, native_string_type
 from math import floor
 from functools import partial
@@ -290,18 +290,14 @@ def is_mobile_ua(ua):
 
 
 def random_user_agent(choose=None, allow_ie=True):
-    from calibre.utils.random_ua import common_user_agents, user_agents_popularity_map
+    from calibre.utils.random_ua import common_user_agents, choose_randomly_by_popularity
     ua_list = common_user_agents()
     ua_list = tuple(x for x in ua_list if not is_mobile_ua(x))
     if not allow_ie:
         ua_list = tuple(x for x in ua_list if 'Trident/' not in x)
     if choose is not None:
         return ua_list[choose]
-    pm = user_agents_popularity_map()
-    weights = None
-    if pm:
-        weights = tuple(map(pm.__getitem__, ua_list))
-    return random.choices(ua_list, weights=weights)[0]
+    return choose_randomly_by_popularity(ua_list)
 
 
 def browser(honor_time=True, max_time=2, user_agent=None, verify_ssl_certificates=True, handle_refresh=True, **kw):
diff --git a/src/calibre/scraper/__init__.py b/src/calibre/scraper/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/calibre/scraper/simple.py b/src/calibre/scraper/simple.py
new file mode 100644
index 0000000000..f45ba8d89e
--- /dev/null
+++ b/src/calibre/scraper/simple.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2022, Kovid Goyal
+
+
+import os
+from functools import lru_cache
+from qt.core import QApplication
+from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings
+
+from calibre.constants import cache_dir
+
+
+@lru_cache(maxsize=4)
+def create_profile(cache_name='simple', allow_js=False):
+    from calibre.utils.random_ua import random_common_chrome_user_agent
+    ans = QWebEngineProfile(cache_name, QApplication.instance())
+    ans.setHttpUserAgent(random_common_chrome_user_agent())
+    ans.setHttpCacheMaximumSize(0)  # managed by webengine
+    ans.setCachePath(os.path.join(cache_dir(), 'scraper', cache_name))
+    s = ans.settings()
+    a = s.setAttribute
+    a(QWebEngineSettings.WebAttribute.PluginsEnabled, False)
+    a(QWebEngineSettings.WebAttribute.JavascriptEnabled, allow_js)
+    s.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes)
+    a(QWebEngineSettings.WebAttribute.JavascriptCanOpenWindows, False)
+    a(QWebEngineSettings.WebAttribute.JavascriptCanAccessClipboard, False)
+    # ensure javascript cannot read from local files
+    a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
+    a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
+    return ans  # the profile, not its settings: SimpleScraper passes this to QWebEnginePage
+
+
+class SimpleScraper(QWebEnginePage):
+
+    def __init__(self, source, parent=None):
+        super().__init__(create_profile(source), parent=parent)
+        self.setAudioMuted(True)
+
+    def javaScriptAlert(self, url, msg):
+        pass
+
+    def javaScriptConfirm(self, url, msg):
+        return True
+
+    def javaScriptPrompt(self, url, msg, defval):
+        return True, defval
+
+    def javaScriptConsoleMessage(self, level, message, line_num, source_id):
+        pass
diff --git a/src/calibre/utils/random_ua.py b/src/calibre/utils/random_ua.py
index d6e76bcaf1..96f5bea650 100644
--- a/src/calibre/utils/random_ua.py
+++ b/src/calibre/utils/random_ua.py
@@ -25,6 +25,24 @@ def common_user_agents():
     return user_agent_data()['common_user_agents']
 
 
+def common_chrome_user_agents():
+    for x in user_agent_data()['common_user_agents']:
+        if 'Chrome/' in x:
+            yield x
+
+
+def choose_randomly_by_popularity(ua_list):
+    pm = user_agents_popularity_map()
+    weights = None
+    if pm:
+        weights = tuple(map(pm.__getitem__, ua_list))
+    return random.choices(ua_list, weights=weights)[0]
+
+
+def random_common_chrome_user_agent():
+    return choose_randomly_by_popularity(tuple(common_chrome_user_agents()))
+
+
 def user_agents_popularity_map():
     return user_agent_data().get('user_agents_popularity', {})
 