Merge branch 'kovidgoyal:master' into translation-fixes

commit 28b8601875
413Michele, 2022-07-17 11:35:47 +02:00, committed by GitHub
4 changed files with 65 additions and 23 deletions

View File

@@ -7,9 +7,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import json
 import re
 import time
+from threading import Lock
 from collections import defaultdict, namedtuple
 try:
-    from urllib.parse import parse_qs, quote_plus, urlencode, unquote
+    from urllib.parse import parse_qs, quote_plus, unquote, urlencode
 except ImportError:
     from urlparse import parse_qs
     from urllib import quote_plus, urlencode, unquote
@@ -17,10 +19,11 @@ except ImportError:
 from lxml import etree

 from calibre import browser as _browser, prints, random_user_agent
+from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.monotonic import monotonic
-from calibre.utils.random_ua import accept_header_for_ua
+from calibre.utils.random_ua import accept_header_for_ua, random_common_chrome_user_agent

-current_version = (1, 0, 12)
+current_version = (1, 0, 13)
 minimum_calibre_version = (2, 80, 0)
@@ -60,16 +63,26 @@ def parse_html(raw):
     return parse(raw)


-def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None):
-    delta = monotonic() - last_visited[key]
+last_visited_lock = Lock()
+
+
+def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None, simple_scraper=None):
+    with last_visited_lock:
+        lv = last_visited[key]
+    delta = monotonic() - lv
     if delta < limit and delta > 0:
         time.sleep(delta)
     try:
-        raw = br.open_novisit(url, timeout=timeout).read()
+        if simple_scraper is None:
+            raw = br.open_novisit(url, timeout=timeout).read()
+            raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
+        else:
+            raw = simple_scraper(url, timeout=timeout)
     finally:
-        last_visited[key] = monotonic()
+        with last_visited_lock:
+            last_visited[key] = monotonic()
     if dump_raw is not None:
-        with open(dump_raw, 'wb') as f:
+        with open(dump_raw, 'w') as f:
             f.write(raw)
     if save_raw is not None:
         save_raw(raw)
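Note: the hunk above makes the per-key rate limiter thread-safe. A minimal standalone sketch of the same pattern, using the stdlib time.monotonic in place of calibre's monotonic wrapper (the fetch argument is a hypothetical stand-in for br.open_novisit):

    import time
    from collections import defaultdict
    from threading import Lock

    last_visited = defaultdict(lambda: 0)  # key -> timestamp of the last request
    last_visited_lock = Lock()             # guards last_visited across threads

    def rate_limited(fetch, url, key, limit=1):
        # Read the shared timestamp under the lock, but sleep outside it,
        # so waiting on one key never blocks requests for other keys.
        with last_visited_lock:
            lv = last_visited[key]
        delta = time.monotonic() - lv
        if 0 < delta < limit:
            time.sleep(delta)  # same back-off rule as the diff above
        try:
            return fetch(url)
        finally:
            with last_visited_lock:
                last_visited[key] = time.monotonic()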
@@ -169,7 +182,7 @@ def bing_url_processor(url):
     return url


-def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
+def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60, show_user_agent=False):
     # http://vlaurie.com/computers2/Articles/bing_advanced_search.htm
     terms = [quote_term(bing_term(t)) for t in terms]
     if site is not None:
@@ -178,6 +191,14 @@ def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_r
     url = 'https://www.bing.com/search?q={q}'.format(q=q)
     log('Making bing query: ' + url)
     br = br or browser()
+    br.addheaders = [x for x in br.addheaders if x[0].lower() != 'user-agent']
+    ua = ''
+    while not ua or 'Edg/' in ua:
+        ua = random_common_chrome_user_agent()
+    if show_user_agent:
+        print('User-agent:', ua)
+    br.addheaders.append(('User-agent', ua))
+
     root = query(br, url, 'bing', dump_raw, timeout=timeout)
     ans = []
     for li in root.xpath('//*[@id="b_results"]/li[@class="b_algo"]'):
@@ -200,8 +221,7 @@ def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_r

 def bing_develop():
-    br = browser()
-    for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:
+    for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', show_user_agent=True)[0]:
         if '/dp/' in result.url:
             print(result.title)
             print(' ', result.url)
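The user-agent hunks amount to: strip any User-agent header the browser already carries, then attach a random desktop-Chrome string, retrying while the candidate contains 'Edg/' so Edge-flavoured variants are never sent. A hedged sketch against a mechanize-style browser whose addheaders is a list of (name, value) tuples; pick_chrome_ua is a hypothetical stand-in for calibre's random_common_chrome_user_agent:

    import random

    # Hypothetical pool; calibre draws from a maintained list of common UAs.
    CHROME_UAS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        ' (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    ]

    def pick_chrome_ua():
        return random.choice(CHROME_UAS)

    def set_random_chrome_ua(br, show_user_agent=False):
        # Remove any existing User-agent header, matching case-insensitively.
        br.addheaders = [x for x in br.addheaders if x[0].lower() != 'user-agent']
        ua = ''
        while not ua or 'Edg/' in ua:  # reject Edge-flavoured Chrome strings
            ua = pick_chrome_ua()
        if show_user_agent:
            print('User-agent:', ua)
        br.addheaders.append(('User-agent', ua))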
@@ -314,3 +334,9 @@ def resolve_url(url):
     if prefix == 'wayback':
         return wayback_url_processor(rest)
     return url
+
+
+# if __name__ == '__main__':
+#     import sys
+#     func = sys.argv[-1]
+#     globals()[func]()
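The commented-out block is a development dispatcher: run the module with a function name as the last argument and that function is looked up in globals() and called. An uncommented, lightly guarded variant (the error handling is an addition, not in the diff):

    if __name__ == '__main__':
        import sys
        func = globals().get(sys.argv[-1])
        if not callable(func):
            raise SystemExit('no such function: ' + sys.argv[-1])
        func()  # e.g. `python search_engines.py bing_develop`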

View File

@@ -1,11 +1,10 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 # License: GPLv3 Copyright: 2012, Kovid Goyal <kovid at kovidgoyal.net>
-from __future__ import absolute_import, division, print_function, unicode_literals

 import os
 from collections import Counter
 from io import BytesIO
+from functools import wraps
 from threading import Event, Thread

 from calibre.customize.ui import metadata_plugins
@@ -17,8 +16,8 @@ from calibre.ebooks.metadata.sources.identify import identify, msprefs
 from calibre.ebooks.metadata.sources.update import patch_plugins
 from calibre.utils.date import as_utc
 from calibre.utils.logging import GUILog
-from polyglot.queue import Empty, Queue
 from polyglot.builtins import iteritems
+from polyglot.queue import Empty, Queue


 def merge_result(oldmi, newmi, ensure_fields=None):
@@ -51,6 +50,18 @@ def merge_result(oldmi, newmi, ensure_fields=None):
     return newmi


+def shutdown_webengine_workers(func):
+    @wraps(func)
+    def wrapper(*a, **k):
+        from calibre.scraper.simple import cleanup_overseers
+        try:
+            return func(*a, **k)
+        finally:
+            cleanup_overseers()()
+    return wrapper
+
+
+@shutdown_webengine_workers
 def main(do_identify, covers, metadata, ensure_fields, tdir):
     failed_ids = set()
     failed_covers = set()
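shutdown_webengine_workers is a cleanup decorator: the wrapped worker entry point runs inside try/finally, so the WebEngine overseers are torn down even when it raises. The double call cleanup_overseers()() suggests cleanup_overseers returns a join/wait callable rather than doing the waiting itself. A generic sketch of the pattern (run_with_cleanup and the demo are hypothetical):

    from functools import wraps

    def run_with_cleanup(cleanup):
        # `cleanup` is any zero-argument callable; it runs whether the
        # wrapped function returns normally or raises.
        def decorator(func):
            @wraps(func)
            def wrapper(*a, **k):
                try:
                    return func(*a, **k)
                finally:
                    cleanup()
            return wrapper
        return decorator

    @run_with_cleanup(lambda: print('workers shut down'))
    def demo():
        return 42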
@@ -101,6 +112,7 @@ def main(do_identify, covers, metadata, ensure_fields, tdir):
     return failed_ids, failed_covers, all_failed


+@shutdown_webengine_workers
 def single_identify(title, authors, identifiers):
     log = GUILog()
     patch_plugins()
@@ -110,6 +122,7 @@ def single_identify(title, authors, identifiers):
             r in results], dump_caches(), log.dump()


+@shutdown_webengine_workers
 def single_covers(title, authors, identifiers, caches, tdir):
     patch_plugins()
     load_caches(caches)

View File

@@ -1555,6 +1555,7 @@ def ensure_app(headless=True):
             os.environ['QT_MAC_DISABLE_FOREGROUND_APPLICATION_TRANSFORM'] = '1'
         if headless and iswindows:
             QApplication.setAttribute(Qt.ApplicationAttribute.AA_UseSoftwareOpenGL, True)
+        QApplication.setAttribute(Qt.ApplicationAttribute.AA_ShareOpenGLContexts)
         _store_app = QApplication(args)
         if headless and has_headless:
             _store_app.headless = True
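Qt requires application-wide attributes such as AA_ShareOpenGLContexts to be set before the Q(Gui)Application instance is constructed, which is why the new line sits just ahead of QApplication(args). A standalone equivalent with stock PyQt6 (calibre itself goes through its qt.core wrapper):

    from PyQt6.QtCore import Qt
    from PyQt6.QtWidgets import QApplication

    # Must happen before the QApplication below is created.
    QApplication.setAttribute(Qt.ApplicationAttribute.AA_ShareOpenGLContexts)
    app = QApplication([])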

View File

@@ -7,8 +7,6 @@ import json
 import os
 import sys
 import weakref
-from contextlib import suppress
-from qt.core import QLoggingCategory, QUrl
 from threading import Lock, Thread, get_ident

 from calibre.constants import iswindows
@@ -18,6 +16,7 @@ from calibre.utils.ipc.simple_worker import start_pipe_worker

 def worker_main(source):
+    from qt.core import QLoggingCategory, QUrl
     QLoggingCategory.setFilterRules('''\
 qt.webenginecontext.info=false
 ''')
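These hunks defer the qt.core imports from module scope into the functions that use them, so merely importing this module no longer loads Qt; only the spawned worker process pays that cost. The idiom in isolation (function name hypothetical):

    def needs_qt(url):
        # Qt is imported only when the function actually runs, keeping
        # `import this_module` cheap in processes that never call it.
        from qt.core import QUrl
        return QUrl(url).host()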
@@ -53,11 +52,6 @@ qt.webenginecontext.info=false
 overseers = []


-def safe_wait(w, timeout):
-    with suppress(Exception):
-        return w.wait(timeout)


 class Overseer:

     def __init__(self):
@@ -65,6 +59,12 @@ class Overseer:
         self.workers = {}
         overseers.append(weakref.ref(self))

+    def safe_wait(self, w, timeout):
+        try:
+            return w.wait(timeout)
+        except Exception:
+            pass
+
     def worker_for_source(self, source):
         wname = f'{source}::{get_ident()}'
         with self.lock:
@@ -75,6 +75,7 @@ class Overseer:
         return ans

     def fetch_url(self, url_or_qurl, source='', timeout=60):
+        from qt.core import QUrl
         w = self.worker_for_source(source)
         if isinstance(url_or_qurl, str):
             url_or_qurl = QUrl(url_or_qurl)
@@ -96,10 +97,10 @@ class Overseer:
                 w.stdin.write(b'EXIT:0\n')
                 w.stdin.flush()
             for w in self.workers.values():
-                if safe_wait(w, 0.2) is None:
+                if self.safe_wait(w, 0.2) is None:
                     w.terminate()
                     if not iswindows:
-                        if safe_wait(w, 0.1) is None:
+                        if self.safe_wait(w, 0.1) is None:
                             w.kill()
             self.workers.clear()
     close = __del__
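The shutdown loop escalates politely: ask each worker to exit over stdin, wait briefly, terminate() stragglers, then kill() those that still have not exited; on Windows kill() is an alias for terminate(), hence the `not iswindows` guard. A sketch of the same escalation for a plain subprocess.Popen (names hypothetical):

    import subprocess

    def shutdown(proc: subprocess.Popen, grace=0.2):
        def safe_wait(timeout):
            # Swallow TimeoutExpired (and anything else), as safe_wait above does.
            try:
                return proc.wait(timeout)
            except Exception:
                return None
        if safe_wait(grace) is None:
            proc.terminate()          # SIGTERM (TerminateProcess on Windows)
            if safe_wait(0.1) is None:
                proc.kill()           # SIGKILL; redundant on Windows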
@@ -148,6 +149,7 @@ def find_tests():
     class TestSimpleWebEngineScraper(unittest.TestCase):

         def test_dom_load(self):
+            from qt.core import QUrl
             overseer = Overseer()
             for f in ('book', 'nav'):
                 path = P(f'templates/new_{f}.html', allow_user_override=False)