mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Merge branch 'kovidgoyal:master' into translation-fixes
This commit is contained in:
commit
28b8601875
@ -7,9 +7,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
from threading import Lock
|
||||||
from collections import defaultdict, namedtuple
|
from collections import defaultdict, namedtuple
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from urllib.parse import parse_qs, quote_plus, urlencode, unquote
|
from urllib.parse import parse_qs, quote_plus, unquote, urlencode
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from urlparse import parse_qs
|
from urlparse import parse_qs
|
||||||
from urllib import quote_plus, urlencode, unquote
|
from urllib import quote_plus, urlencode, unquote
|
||||||
@ -17,10 +19,11 @@ except ImportError:
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre import browser as _browser, prints, random_user_agent
|
from calibre import browser as _browser, prints, random_user_agent
|
||||||
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.utils.monotonic import monotonic
|
from calibre.utils.monotonic import monotonic
|
||||||
from calibre.utils.random_ua import accept_header_for_ua
|
from calibre.utils.random_ua import accept_header_for_ua, random_common_chrome_user_agent
|
||||||
|
|
||||||
current_version = (1, 0, 12)
|
current_version = (1, 0, 13)
|
||||||
minimum_calibre_version = (2, 80, 0)
|
minimum_calibre_version = (2, 80, 0)
|
||||||
|
|
||||||
|
|
||||||
@ -60,16 +63,26 @@ def parse_html(raw):
|
|||||||
return parse(raw)
|
return parse(raw)
|
||||||
|
|
||||||
|
|
||||||
def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None):
|
last_visited_lock = Lock()
|
||||||
delta = monotonic() - last_visited[key]
|
|
||||||
|
|
||||||
|
def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None, simple_scraper=None):
|
||||||
|
with last_visited_lock:
|
||||||
|
lv = last_visited[key]
|
||||||
|
delta = monotonic() - lv
|
||||||
if delta < limit and delta > 0:
|
if delta < limit and delta > 0:
|
||||||
time.sleep(delta)
|
time.sleep(delta)
|
||||||
try:
|
try:
|
||||||
raw = br.open_novisit(url, timeout=timeout).read()
|
if simple_scraper is None:
|
||||||
|
raw = br.open_novisit(url, timeout=timeout).read()
|
||||||
|
raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
|
||||||
|
else:
|
||||||
|
raw = simple_scraper(url, timeout=timeout)
|
||||||
finally:
|
finally:
|
||||||
last_visited[key] = monotonic()
|
with last_visited_lock:
|
||||||
|
last_visited[key] = monotonic()
|
||||||
if dump_raw is not None:
|
if dump_raw is not None:
|
||||||
with open(dump_raw, 'wb') as f:
|
with open(dump_raw, 'w') as f:
|
||||||
f.write(raw)
|
f.write(raw)
|
||||||
if save_raw is not None:
|
if save_raw is not None:
|
||||||
save_raw(raw)
|
save_raw(raw)
|
||||||
@ -169,7 +182,7 @@ def bing_url_processor(url):
|
|||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
|
def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60, show_user_agent=False):
|
||||||
# http://vlaurie.com/computers2/Articles/bing_advanced_search.htm
|
# http://vlaurie.com/computers2/Articles/bing_advanced_search.htm
|
||||||
terms = [quote_term(bing_term(t)) for t in terms]
|
terms = [quote_term(bing_term(t)) for t in terms]
|
||||||
if site is not None:
|
if site is not None:
|
||||||
@ -178,6 +191,14 @@ def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_r
|
|||||||
url = 'https://www.bing.com/search?q={q}'.format(q=q)
|
url = 'https://www.bing.com/search?q={q}'.format(q=q)
|
||||||
log('Making bing query: ' + url)
|
log('Making bing query: ' + url)
|
||||||
br = br or browser()
|
br = br or browser()
|
||||||
|
br.addheaders = [x for x in br.addheaders if x[0].lower() != 'user-agent']
|
||||||
|
ua = ''
|
||||||
|
while not ua or 'Edg/' in ua:
|
||||||
|
ua = random_common_chrome_user_agent()
|
||||||
|
if show_user_agent:
|
||||||
|
print('User-agent:', ua)
|
||||||
|
br.addheaders.append(('User-agent', ua))
|
||||||
|
|
||||||
root = query(br, url, 'bing', dump_raw, timeout=timeout)
|
root = query(br, url, 'bing', dump_raw, timeout=timeout)
|
||||||
ans = []
|
ans = []
|
||||||
for li in root.xpath('//*[@id="b_results"]/li[@class="b_algo"]'):
|
for li in root.xpath('//*[@id="b_results"]/li[@class="b_algo"]'):
|
||||||
@ -200,8 +221,7 @@ def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_r
|
|||||||
|
|
||||||
|
|
||||||
def bing_develop():
|
def bing_develop():
|
||||||
br = browser()
|
for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', show_user_agent=True)[0]:
|
||||||
for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:
|
|
||||||
if '/dp/' in result.url:
|
if '/dp/' in result.url:
|
||||||
print(result.title)
|
print(result.title)
|
||||||
print(' ', result.url)
|
print(' ', result.url)
|
||||||
@ -314,3 +334,9 @@ def resolve_url(url):
|
|||||||
if prefix == 'wayback':
|
if prefix == 'wayback':
|
||||||
return wayback_url_processor(rest)
|
return wayback_url_processor(rest)
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
# if __name__ == '__main__':
|
||||||
|
# import sys
|
||||||
|
# func = sys.argv[-1]
|
||||||
|
# globals()[func]()
|
||||||
|
@ -1,11 +1,10 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
# License: GPLv3 Copyright: 2012, Kovid Goyal <kovid at kovidgoyal.net>
|
# License: GPLv3 Copyright: 2012, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
from functools import wraps
|
||||||
from threading import Event, Thread
|
from threading import Event, Thread
|
||||||
|
|
||||||
from calibre.customize.ui import metadata_plugins
|
from calibre.customize.ui import metadata_plugins
|
||||||
@ -17,8 +16,8 @@ from calibre.ebooks.metadata.sources.identify import identify, msprefs
|
|||||||
from calibre.ebooks.metadata.sources.update import patch_plugins
|
from calibre.ebooks.metadata.sources.update import patch_plugins
|
||||||
from calibre.utils.date import as_utc
|
from calibre.utils.date import as_utc
|
||||||
from calibre.utils.logging import GUILog
|
from calibre.utils.logging import GUILog
|
||||||
from polyglot.queue import Empty, Queue
|
|
||||||
from polyglot.builtins import iteritems
|
from polyglot.builtins import iteritems
|
||||||
|
from polyglot.queue import Empty, Queue
|
||||||
|
|
||||||
|
|
||||||
def merge_result(oldmi, newmi, ensure_fields=None):
|
def merge_result(oldmi, newmi, ensure_fields=None):
|
||||||
@ -51,6 +50,18 @@ def merge_result(oldmi, newmi, ensure_fields=None):
|
|||||||
return newmi
|
return newmi
|
||||||
|
|
||||||
|
|
||||||
|
def shutdown_webengine_workers(func):
|
||||||
|
@wraps(func)
|
||||||
|
def wrapper(*a, **k):
|
||||||
|
from calibre.scraper.simple import cleanup_overseers
|
||||||
|
try:
|
||||||
|
return func(*a, **k)
|
||||||
|
finally:
|
||||||
|
cleanup_overseers()()
|
||||||
|
return wrapper
|
||||||
|
|
||||||
|
|
||||||
|
@shutdown_webengine_workers
|
||||||
def main(do_identify, covers, metadata, ensure_fields, tdir):
|
def main(do_identify, covers, metadata, ensure_fields, tdir):
|
||||||
failed_ids = set()
|
failed_ids = set()
|
||||||
failed_covers = set()
|
failed_covers = set()
|
||||||
@ -101,6 +112,7 @@ def main(do_identify, covers, metadata, ensure_fields, tdir):
|
|||||||
return failed_ids, failed_covers, all_failed
|
return failed_ids, failed_covers, all_failed
|
||||||
|
|
||||||
|
|
||||||
|
@shutdown_webengine_workers
|
||||||
def single_identify(title, authors, identifiers):
|
def single_identify(title, authors, identifiers):
|
||||||
log = GUILog()
|
log = GUILog()
|
||||||
patch_plugins()
|
patch_plugins()
|
||||||
@ -110,6 +122,7 @@ def single_identify(title, authors, identifiers):
|
|||||||
r in results], dump_caches(), log.dump()
|
r in results], dump_caches(), log.dump()
|
||||||
|
|
||||||
|
|
||||||
|
@shutdown_webengine_workers
|
||||||
def single_covers(title, authors, identifiers, caches, tdir):
|
def single_covers(title, authors, identifiers, caches, tdir):
|
||||||
patch_plugins()
|
patch_plugins()
|
||||||
load_caches(caches)
|
load_caches(caches)
|
||||||
|
@ -1555,6 +1555,7 @@ def ensure_app(headless=True):
|
|||||||
os.environ['QT_MAC_DISABLE_FOREGROUND_APPLICATION_TRANSFORM'] = '1'
|
os.environ['QT_MAC_DISABLE_FOREGROUND_APPLICATION_TRANSFORM'] = '1'
|
||||||
if headless and iswindows:
|
if headless and iswindows:
|
||||||
QApplication.setAttribute(Qt.ApplicationAttribute.AA_UseSoftwareOpenGL, True)
|
QApplication.setAttribute(Qt.ApplicationAttribute.AA_UseSoftwareOpenGL, True)
|
||||||
|
QApplication.setAttribute(Qt.ApplicationAttribute.AA_ShareOpenGLContexts)
|
||||||
_store_app = QApplication(args)
|
_store_app = QApplication(args)
|
||||||
if headless and has_headless:
|
if headless and has_headless:
|
||||||
_store_app.headless = True
|
_store_app.headless = True
|
||||||
|
@ -7,8 +7,6 @@ import json
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import weakref
|
import weakref
|
||||||
from contextlib import suppress
|
|
||||||
from qt.core import QLoggingCategory, QUrl
|
|
||||||
from threading import Lock, Thread, get_ident
|
from threading import Lock, Thread, get_ident
|
||||||
|
|
||||||
from calibre.constants import iswindows
|
from calibre.constants import iswindows
|
||||||
@ -18,6 +16,7 @@ from calibre.utils.ipc.simple_worker import start_pipe_worker
|
|||||||
|
|
||||||
|
|
||||||
def worker_main(source):
|
def worker_main(source):
|
||||||
|
from qt.core import QLoggingCategory, QUrl
|
||||||
QLoggingCategory.setFilterRules('''\
|
QLoggingCategory.setFilterRules('''\
|
||||||
qt.webenginecontext.info=false
|
qt.webenginecontext.info=false
|
||||||
''')
|
''')
|
||||||
@ -53,11 +52,6 @@ qt.webenginecontext.info=false
|
|||||||
overseers = []
|
overseers = []
|
||||||
|
|
||||||
|
|
||||||
def safe_wait(w, timeout):
|
|
||||||
with suppress(Exception):
|
|
||||||
return w.wait(timeout)
|
|
||||||
|
|
||||||
|
|
||||||
class Overseer:
|
class Overseer:
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -65,6 +59,12 @@ class Overseer:
|
|||||||
self.workers = {}
|
self.workers = {}
|
||||||
overseers.append(weakref.ref(self))
|
overseers.append(weakref.ref(self))
|
||||||
|
|
||||||
|
def safe_wait(self, w, timeout):
|
||||||
|
try:
|
||||||
|
return w.wait(timeout)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
def worker_for_source(self, source):
|
def worker_for_source(self, source):
|
||||||
wname = f'{source}::{get_ident()}'
|
wname = f'{source}::{get_ident()}'
|
||||||
with self.lock:
|
with self.lock:
|
||||||
@ -75,6 +75,7 @@ class Overseer:
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
def fetch_url(self, url_or_qurl, source='', timeout=60):
|
def fetch_url(self, url_or_qurl, source='', timeout=60):
|
||||||
|
from qt.core import QUrl
|
||||||
w = self.worker_for_source(source)
|
w = self.worker_for_source(source)
|
||||||
if isinstance(url_or_qurl, str):
|
if isinstance(url_or_qurl, str):
|
||||||
url_or_qurl = QUrl(url_or_qurl)
|
url_or_qurl = QUrl(url_or_qurl)
|
||||||
@ -96,10 +97,10 @@ class Overseer:
|
|||||||
w.stdin.write(b'EXIT:0\n')
|
w.stdin.write(b'EXIT:0\n')
|
||||||
w.stdin.flush()
|
w.stdin.flush()
|
||||||
for w in self.workers.values():
|
for w in self.workers.values():
|
||||||
if safe_wait(w, 0.2) is None:
|
if self.safe_wait(w, 0.2) is None:
|
||||||
w.terminate()
|
w.terminate()
|
||||||
if not iswindows:
|
if not iswindows:
|
||||||
if safe_wait(w, 0.1) is None:
|
if self.safe_wait(w, 0.1) is None:
|
||||||
w.kill()
|
w.kill()
|
||||||
self.workers.clear()
|
self.workers.clear()
|
||||||
close = __del__
|
close = __del__
|
||||||
@ -148,6 +149,7 @@ def find_tests():
|
|||||||
class TestSimpleWebEngineScraper(unittest.TestCase):
|
class TestSimpleWebEngineScraper(unittest.TestCase):
|
||||||
|
|
||||||
def test_dom_load(self):
|
def test_dom_load(self):
|
||||||
|
from qt.core import QUrl
|
||||||
overseer = Overseer()
|
overseer = Overseer()
|
||||||
for f in ('book', 'nav'):
|
for f in ('book', 'nav'):
|
||||||
path = P(f'templates/new_{f}.html', allow_user_override=False)
|
path = P(f'templates/new_{f}.html', allow_user_override=False)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user