mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Start working on using Chromium's network stack to download HTML
A bunch of websites have started using TLS fingerprinting to deny access. Bloody morons.
This commit is contained in:
parent
b5370e0270
commit
2c8e5a3a36
@ -3,7 +3,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import sys, os, re, time, random, warnings
|
import sys, os, re, time, warnings
|
||||||
from polyglot.builtins import codepoint_to_chr, hasenv, native_string_type
|
from polyglot.builtins import codepoint_to_chr, hasenv, native_string_type
|
||||||
from math import floor
|
from math import floor
|
||||||
from functools import partial
|
from functools import partial
|
||||||
@ -290,18 +290,14 @@ def is_mobile_ua(ua):
|
|||||||
|
|
||||||
|
|
||||||
def random_user_agent(choose=None, allow_ie=True):
    '''
    Return a non-mobile user agent string from the list of common user agents.

    :param choose: If not None, used as an index into the filtered list of
        user agents and that entry is returned. Otherwise an entry is picked
        by ``choose_randomly_by_popularity()``.
    :param allow_ie: If False, user agents containing ``Trident/`` are
        excluded as well.
    '''
    from calibre.utils.random_ua import choose_randomly_by_popularity, common_user_agents

    def acceptable(ua):
        # Mobile user agents are always dropped; Trident ones only on request
        if is_mobile_ua(ua):
            return False
        return allow_ie or 'Trident/' not in ua

    ua_list = tuple(filter(acceptable, common_user_agents()))
    if choose is None:
        return choose_randomly_by_popularity(ua_list)
    return ua_list[choose]
|
|
||||||
|
|
||||||
|
|
||||||
def browser(honor_time=True, max_time=2, user_agent=None, verify_ssl_certificates=True, handle_refresh=True, **kw):
|
def browser(honor_time=True, max_time=2, user_agent=None, verify_ssl_certificates=True, handle_refresh=True, **kw):
|
||||||
|
0
src/calibre/scraper/__init__.py
Normal file
0
src/calibre/scraper/__init__.py
Normal file
50
src/calibre/scraper/simple.py
Normal file
50
src/calibre/scraper/simple.py
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
|
||||||
|
import os
|
||||||
|
from functools import lru_cache
|
||||||
|
from qt.core import QApplication
|
||||||
|
from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings
|
||||||
|
|
||||||
|
from calibre.constants import cache_dir
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=4)
def create_profile(cache_name='simple', allow_js=False):
    '''
    Create a restricted QWebEngineProfile for scraping. Results are memoized
    per ``(cache_name, allow_js)`` by ``lru_cache``.

    :param cache_name: Storage name of the profile, also used as the last
        component of its cache path under ``cache_dir()/scraper``.
    :param allow_js: Whether JavaScript is enabled in the profile's settings.
    :return: The configured QWebEngineProfile, parented to the QApplication
        instance.
    '''
    from calibre.utils.random_ua import random_common_chrome_user_agent
    ans = QWebEngineProfile(cache_name, QApplication.instance())
    ans.setHttpUserAgent(random_common_chrome_user_agent())
    ans.setHttpCacheMaximumSize(0)  # managed by webengine
    ans.setCachePath(os.path.join(cache_dir(), 'scraper', cache_name))
    s = ans.settings()
    a = s.setAttribute
    a(QWebEngineSettings.WebAttribute.PluginsEnabled, False)
    a(QWebEngineSettings.WebAttribute.JavascriptEnabled, allow_js)
    s.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes)
    a(QWebEngineSettings.WebAttribute.JavascriptCanOpenWindows, False)
    a(QWebEngineSettings.WebAttribute.JavascriptCanAccessClipboard, False)
    # ensure javascript cannot read from local files
    a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
    a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
    # Return the profile itself, not its settings object: the result is
    # handed to the QWebEnginePage constructor, which requires a profile.
    return ans
|
||||||
|
|
||||||
|
|
||||||
|
class SimpleScraper(QWebEnginePage):
    '''
    A muted QWebEnginePage whose JavaScript dialog and console hooks are
    overridden so that they answer immediately with fixed values.
    '''

    def __init__(self, source, parent=None):
        # source is forwarded to create_profile() as its cache name
        profile = create_profile(source)
        super().__init__(profile, parent=parent)
        self.setAudioMuted(True)

    def javaScriptAlert(self, url, msg):
        '''Ignore alert requests.'''

    def javaScriptConfirm(self, url, msg):
        '''Answer every confirm request with True.'''
        return True

    def javaScriptPrompt(self, url, msg, defval):
        '''Answer every prompt request with True and the supplied default value.'''
        accepted = True
        return accepted, defval

    def javaScriptConsoleMessage(self, level, message, line_num, source_id):
        '''Discard console messages.'''
|
@ -25,6 +25,24 @@ def common_user_agents():
|
|||||||
return user_agent_data()['common_user_agents']
|
return user_agent_data()['common_user_agents']
|
||||||
|
|
||||||
|
|
||||||
|
def common_chrome_user_agents():
    '''Yield each common user agent string that contains ``Chrome/``.'''
    agents = user_agent_data()['common_user_agents']
    yield from (ua for ua in agents if 'Chrome/' in ua)
|
||||||
|
|
||||||
|
|
||||||
|
def choose_randomly_by_popularity(ua_list):
    '''
    Pick one entry from ua_list at random.

    When the popularity map is non-empty, each entry is weighted by its value
    in that map; otherwise no weights are passed to ``random.choices``.
    '''
    popularity = user_agents_popularity_map()
    weights = tuple(popularity[ua] for ua in ua_list) if popularity else None
    picked = random.choices(ua_list, weights=weights)
    return picked[0]
|
||||||
|
|
||||||
|
|
||||||
|
def random_common_chrome_user_agent():
    '''Return one of the common Chrome user agents, chosen by ``choose_randomly_by_popularity()``.'''
    chrome_agents = tuple(common_chrome_user_agents())
    return choose_randomly_by_popularity(chrome_agents)
|
||||||
|
|
||||||
|
|
||||||
def user_agents_popularity_map():
    '''Return the ``user_agents_popularity`` entry of the user agent data, or an empty dict if it is absent.'''
    data = user_agent_data()
    return data.get('user_agents_popularity', {})
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user