From 5b00e588b29168442a238835c3508c2e4096a83d Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 16 Aug 2024 20:01:50 +0530
Subject: [PATCH] Allow using the webengine backend in recipes

---
 recipes/science_journal.recipe           |  1 -
 src/calibre/scraper/webengine_backend.py |  2 +-
 src/calibre/web/feeds/news.py            | 14 ++++++++------
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/recipes/science_journal.recipe b/recipes/science_journal.recipe
index c0e9e2e458..2f5d0097f4 100644
--- a/recipes/science_journal.recipe
+++ b/recipes/science_journal.recipe
@@ -43,7 +43,6 @@ class science(BasicNewsRecipe):
         classes('pb-ad')
     ]
     browser_type = 'qt'
-    simultaneous_downloads = 1  # server returns invalid data on HTTP2 connections when multiple requests are queued on the same connection
 
     def preprocess_html(self, soup):
         for p in soup.findAll(attrs={'role':'paragraph'}):
diff --git a/src/calibre/scraper/webengine_backend.py b/src/calibre/scraper/webengine_backend.py
index 15b837faa7..fc594dcae9 100644
--- a/src/calibre/scraper/webengine_backend.py
+++ b/src/calibre/scraper/webengine_backend.py
@@ -216,7 +216,7 @@ class FetchBackend(QObject):
             self.timeout_timer.start()
             return
         if len(self.workers) < 5:
-            self.workers.append(self.create_worker)
+            self.workers.append(self.create_worker())
             self.workers[-1].start_download(self.output_dir, req, data)
             self.timeout_timer.start()
             return
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index c4f33e4c6a..c6bbb5d52d 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -428,9 +428,10 @@ class BasicNewsRecipe(Recipe):
     recipe_specific_options = None
 
     #: The simulated browser engine to use when downloading from servers. The default is to use the Python mechanize
-    #: browser engine. An alternate is "qt" which will use the network engine from the Qt toolkit.
-    #: The mechanize engine supports logging in, the Qt engine does not. However, the Qt engine supports HTTP/2 and
-    #: similar technologies and also is harder for bot interception services to fingerprint.
+    #: browser engine, which supports logging in. However, if you don't need logging in, consider changing this
+    #: to either 'webengine' which uses an actual Chromium browser to do the network requests or 'qt' which
+    #: uses the Qt Networking backend. Both 'webengine' and 'qt' support HTTP/2, which mechanize does not and
+    #: are thus harder to fingerprint for bot protection services.
     browser_type = 'mechanize'
 
     #: Set to False if you do not want to use gzipped transfers with the mechanize browser.
@@ -571,9 +572,10 @@ class BasicNewsRecipe(Recipe):
         ua = getattr(self, 'last_used_user_agent', None) or self.calibre_most_common_ua or random_user_agent(allow_ie=False)
         kwargs['user_agent'] = self.last_used_user_agent = ua
         self.log('Using user agent:', kwargs['user_agent'])
-        if self.browser_type == 'qt':
-            from calibre.scraper.qt import Browser
-            return Browser(user_agent=kwargs['user_agent'], verify_ssl_certificates=kwargs.get('verify_ssl_certificates', False))
+        if self.browser_type != 'mechanize':
+            from calibre.scraper.qt import Browser, WebEngineBrowser
+            return {'qt': Browser, 'webengine': WebEngineBrowser}[self.browser_type](
+                user_agent=kwargs['user_agent'], verify_ssl_certificates=kwargs.get('verify_ssl_certificates', False))
         br = browser(*args, **kwargs)
         br.addheaders += [('Accept', '*/*')]
         if self.handle_gzip:
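
Usage note (not part of the patch above): a minimal sketch of a recipe that opts in to the
new backend introduced by this change. The recipe class name, title and feed URL below are
hypothetical placeholders; only the browser_type attribute and the BasicNewsRecipe base
class come from the patched code.

from calibre.web.feeds.news import BasicNewsRecipe


class ExampleWebengineRecipe(BasicNewsRecipe):
    title = 'Example HTTP/2 Site'  # hypothetical title

    # Use the Chromium-based network stack for downloads instead of mechanize.
    # 'qt' would select the Qt Networking backend instead; per the patched
    # docstring, both support HTTP/2 but neither supports logging in.
    browser_type = 'webengine'

    feeds = [
        ('Front page', 'https://example.com/feed.xml'),  # hypothetical feed URL
    ]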