diff --git a/src/calibre/ebooks/metadata/sources/search_engines.py b/src/calibre/ebooks/metadata/sources/search_engines.py
index 5377085b6e..769828dea8 100644
--- a/src/calibre/ebooks/metadata/sources/search_engines.py
+++ b/src/calibre/ebooks/metadata/sources/search_engines.py
@@ -7,9 +7,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import json
 import re
 import time
+from threading import Lock
 from collections import defaultdict, namedtuple
+
 try:
-    from urllib.parse import parse_qs, quote_plus, urlencode, unquote
+    from urllib.parse import parse_qs, quote_plus, unquote, urlencode
 except ImportError:
     from urlparse import parse_qs
     from urllib import quote_plus, urlencode, unquote
@@ -17,10 +19,11 @@ except ImportError:
 from lxml import etree
 
 from calibre import browser as _browser, prints, random_user_agent
+from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.monotonic import monotonic
-from calibre.utils.random_ua import accept_header_for_ua
+from calibre.utils.random_ua import accept_header_for_ua, random_common_chrome_user_agent
 
-current_version = (1, 0, 12)
+current_version = (1, 0, 13)
 minimum_calibre_version = (2, 80, 0)
 
 
@@ -60,16 +63,26 @@ def parse_html(raw):
         return parse(raw)
 
 
-def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None):
-    delta = monotonic() - last_visited[key]
+last_visited_lock = Lock()
+
+
+def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None, simple_scraper=None):
+    with last_visited_lock:
+        lv = last_visited[key]
+    delta = monotonic() - lv
     if delta < limit and delta > 0:
         time.sleep(delta)
     try:
-        raw = br.open_novisit(url, timeout=timeout).read()
+        if simple_scraper is None:
+            raw = br.open_novisit(url, timeout=timeout).read()
+            raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
+        else:
+            raw = simple_scraper(url, timeout=timeout)
     finally:
-        last_visited[key] = monotonic()
+        with last_visited_lock:
+            last_visited[key] = monotonic()
     if dump_raw is not None:
-        with open(dump_raw, 'wb') as f:
+        with open(dump_raw, 'w') as f:
             f.write(raw)
     if save_raw is not None:
         save_raw(raw)
@@ -169,7 +182,7 @@ def bing_url_processor(url):
     return url
 
 
-def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
+def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60, show_user_agent=False):
     # http://vlaurie.com/computers2/Articles/bing_advanced_search.htm
     terms = [quote_term(bing_term(t)) for t in terms]
     if site is not None:
@@ -178,6 +191,14 @@ def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_r
     url = 'https://www.bing.com/search?q={q}'.format(q=q)
     log('Making bing query: ' + url)
     br = br or browser()
+    br.addheaders = [x for x in br.addheaders if x[0].lower() != 'user-agent']
+    ua = ''
+    while not ua or 'Edg/' in ua:
+        ua = random_common_chrome_user_agent()
+    if show_user_agent:
+        print('User-agent:', ua)
+    br.addheaders.append(('User-agent', ua))
+
     root = query(br, url, 'bing', dump_raw, timeout=timeout)
     ans = []
     for li in root.xpath('//*[@id="b_results"]/li[@class="b_algo"]'):
@@ -200,8 +221,7 @@ def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_r
 
 
 def bing_develop():
-    br = browser()
-    for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:
+    for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', show_user_agent=True)[0]:
         if '/dp/' in result.url:
             print(result.title)
             print(' ', result.url)
@@ -314,3 +334,9 @@ def resolve_url(url):
     if prefix == 'wayback':
         return wayback_url_processor(rest)
     return url
+
+
+# if __name__ == '__main__':
+#     import sys
+#     func = sys.argv[-1]
+#     globals()[func]()
diff --git a/src/calibre/ebooks/metadata/sources/worker.py b/src/calibre/ebooks/metadata/sources/worker.py
index 25d91bf1c6..98d3ddf686 100644
--- a/src/calibre/ebooks/metadata/sources/worker.py
+++ b/src/calibre/ebooks/metadata/sources/worker.py
@@ -1,11 +1,10 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 # License: GPLv3 Copyright: 2012, Kovid Goyal
-from __future__ import absolute_import, division, print_function, unicode_literals
-
 import os
 from collections import Counter
 from io import BytesIO
+from functools import wraps
 from threading import Event, Thread
 
 from calibre.customize.ui import metadata_plugins
@@ -17,8 +16,8 @@ from calibre.ebooks.metadata.sources.identify import identify, msprefs
 from calibre.ebooks.metadata.sources.update import patch_plugins
 from calibre.utils.date import as_utc
 from calibre.utils.logging import GUILog
-from polyglot.queue import Empty, Queue
 from polyglot.builtins import iteritems
+from polyglot.queue import Empty, Queue
 
 
 def merge_result(oldmi, newmi, ensure_fields=None):
@@ -51,6 +50,18 @@ def merge_result(oldmi, newmi, ensure_fields=None):
     return newmi
 
 
+def shutdown_webengine_workers(func):
+    @wraps(func)
+    def wrapper(*a, **k):
+        from calibre.scraper.simple import cleanup_overseers
+        try:
+            return func(*a, **k)
+        finally:
+            cleanup_overseers()()
+    return wrapper
+
+
+@shutdown_webengine_workers
 def main(do_identify, covers, metadata, ensure_fields, tdir):
     failed_ids = set()
     failed_covers = set()
@@ -101,6 +112,7 @@ def main(do_identify, covers, metadata, ensure_fields, tdir):
     return failed_ids, failed_covers, all_failed
 
 
+@shutdown_webengine_workers
 def single_identify(title, authors, identifiers):
     log = GUILog()
     patch_plugins()
@@ -110,6 +122,7 @@ def single_identify(title, authors, identifiers):
             r in results], dump_caches(), log.dump()
 
 
+@shutdown_webengine_workers
 def single_covers(title, authors, identifiers, caches, tdir):
     patch_plugins()
     load_caches(caches)
diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py
index 0e03be953c..11252d81e4 100644
--- a/src/calibre/gui2/__init__.py
+++ b/src/calibre/gui2/__init__.py
@@ -1555,6 +1555,7 @@ def ensure_app(headless=True):
                     os.environ['QT_MAC_DISABLE_FOREGROUND_APPLICATION_TRANSFORM'] = '1'
             if headless and iswindows:
                 QApplication.setAttribute(Qt.ApplicationAttribute.AA_UseSoftwareOpenGL, True)
+            QApplication.setAttribute(Qt.ApplicationAttribute.AA_ShareOpenGLContexts)
             _store_app = QApplication(args)
             if headless and has_headless:
                 _store_app.headless = True
diff --git a/src/calibre/scraper/simple.py b/src/calibre/scraper/simple.py
index d00d598b4c..328fb33bc3 100644
--- a/src/calibre/scraper/simple.py
+++ b/src/calibre/scraper/simple.py
@@ -7,8 +7,6 @@ import json
 import os
 import sys
 import weakref
-from contextlib import suppress
-from qt.core import QLoggingCategory, QUrl
 from threading import Lock, Thread, get_ident
 
 from calibre.constants import iswindows
@@ -18,6 +16,7 @@ from calibre.utils.ipc.simple_worker import start_pipe_worker
 
 
 def worker_main(source):
+    from qt.core import QLoggingCategory, QUrl
     QLoggingCategory.setFilterRules('''\
 qt.webenginecontext.info=false
 ''')
@@ -53,11 +52,6 @@ qt.webenginecontext.info=false
 overseers = []
 
 
-def safe_wait(w, timeout):
-    with suppress(Exception):
-        return w.wait(timeout)
-
-
 class Overseer:
 
     def __init__(self):
@@ -65,6 +59,12 @@ class Overseer:
         self.workers = {}
         overseers.append(weakref.ref(self))
 
+    def safe_wait(self, w, timeout):
+        try:
+            return w.wait(timeout)
+        except Exception:
+            pass
+
     def worker_for_source(self, source):
         wname = f'{source}::{get_ident()}'
         with self.lock:
@@ -75,6 +75,7 @@ class Overseer:
         return ans
 
     def fetch_url(self, url_or_qurl, source='', timeout=60):
+        from qt.core import QUrl
         w = self.worker_for_source(source)
         if isinstance(url_or_qurl, str):
             url_or_qurl = QUrl(url_or_qurl)
@@ -96,10 +97,10 @@ class Overseer:
             w.stdin.write(b'EXIT:0\n')
             w.stdin.flush()
         for w in self.workers.values():
-            if safe_wait(w, 0.2) is None:
+            if self.safe_wait(w, 0.2) is None:
                 w.terminate()
                 if not iswindows:
-                    if safe_wait(w, 0.1) is None:
+                    if self.safe_wait(w, 0.1) is None:
                         w.kill()
         self.workers.clear()
     close = __del__
@@ -148,6 +149,7 @@ def find_tests():
     class TestSimpleWebEngineScraper(unittest.TestCase):
 
         def test_dom_load(self):
+            from qt.core import QUrl
             overseer = Overseer()
             for f in ('book', 'nav'):
                 path = P(f'templates/new_{f}.html', allow_user_override=False)