mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
Various improvements for jsbrowser
Add APIs to get DOM elements by selector, get resources (images/stylesheets/etc) and get image data for displayed images. Fix waiting till the DOM is ready. Make the default timeout a per browser-object setting.
This commit is contained in:
parent
d9635111d2
commit
eaed92987f
@ -7,24 +7,27 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os, pprint, time
|
import os, pprint, time, uuid
|
||||||
from cookielib import Cookie
|
from cookielib import Cookie
|
||||||
from threading import current_thread
|
from threading import current_thread
|
||||||
|
|
||||||
from PyQt4.Qt import (QObject, QNetworkAccessManager, QNetworkDiskCache,
|
from PyQt4.Qt import (QObject, QNetworkAccessManager, QNetworkDiskCache,
|
||||||
QNetworkProxy, QNetworkProxyFactory, QEventLoop, QUrl, pyqtSignal,
|
QNetworkProxy, QNetworkProxyFactory, QEventLoop, QUrl, pyqtSignal,
|
||||||
QDialog, QVBoxLayout, QSize, QNetworkCookieJar, Qt, pyqtSlot)
|
QDialog, QVBoxLayout, QSize, QNetworkCookieJar, Qt, pyqtSlot, QPixmap)
|
||||||
from PyQt4.QtWebKit import QWebPage, QWebSettings, QWebView, QWebElement
|
from PyQt4.QtWebKit import QWebPage, QWebSettings, QWebView, QWebElement
|
||||||
|
|
||||||
from calibre import USER_AGENT, prints, get_proxies, get_proxy_info
|
from calibre import USER_AGENT, prints, get_proxies, get_proxy_info, prepare_string_for_xml
|
||||||
from calibre.constants import ispy3, cache_dir
|
from calibre.constants import ispy3, cache_dir
|
||||||
from calibre.utils.logging import ThreadSafeLog
|
from calibre.utils.logging import ThreadSafeLog
|
||||||
from calibre.gui2 import must_use_qt
|
from calibre.gui2 import must_use_qt
|
||||||
from calibre.web.jsbrowser.forms import FormsMixin
|
from calibre.web.jsbrowser.forms import FormsMixin, default_timeout
|
||||||
|
|
||||||
class Timeout(Exception): pass
|
class Timeout(Exception):
    '''
    Raised by the browser's wait helpers when the thing being waited for
    (page load, DOM ready, element present) does not happen within the
    timeout.
    '''
|
||||||
|
|
||||||
|
class LoadError(Exception):
    '''
    Raised when a page load that was being waited on fails, for example
    when clicking an element results in a failed load.
    '''
|
||||||
|
|
||||||
class LoadError(Exception): pass
|
|
||||||
|
|
||||||
class WebPage(QWebPage): # {{{
|
class WebPage(QWebPage): # {{{
|
||||||
|
|
||||||
@ -48,6 +51,24 @@ class WebPage(QWebPage): # {{{
|
|||||||
QWebSettings.enablePersistentStorage(os.path.join(cache_dir(),
|
QWebSettings.enablePersistentStorage(os.path.join(cache_dir(),
|
||||||
'webkit-persistence'))
|
'webkit-persistence'))
|
||||||
QWebSettings.setMaximumPagesInCache(0)
|
QWebSettings.setMaximumPagesInCache(0)
|
||||||
|
self.bridge_name = 'b' + uuid.uuid4().get_hex()
|
||||||
|
self.mainFrame().javaScriptWindowObjectCleared.connect(
|
||||||
|
self.add_window_objects)
|
||||||
|
self.dom_loaded = False
|
||||||
|
|
||||||
|
def add_window_objects(self):
    '''
    (Re)install the Python bridge into the main frame's JavaScript window
    object and hook its content_loaded slot up to DOMContentLoaded.

    This is connected to the main frame's javaScriptWindowObjectCleared
    signal, so the DOM-loaded flag is reset here as well.
    '''
    self.dom_loaded = False
    frame = self.mainFrame()
    # Expose this page object to page scripts under the per-page bridge name
    frame.addToJavaScriptWindowObject(self.bridge_name, self)
    listener_js = (
        'document.addEventListener( "DOMContentLoaded", %s.content_loaded, false )'
        % self.bridge_name)
    frame.evaluateJavaScript(listener_js)
|
||||||
|
|
||||||
|
def load_url(self, url):
    '''
    Start loading url in the main frame.

    The DOM-loaded flag is cleared before the load is started, so that a
    flag set by a previously loaded page is not mistaken for this one.
    '''
    self.dom_loaded = False
    frame = self.mainFrame()
    frame.load(QUrl(url))
|
||||||
|
|
||||||
|
@pyqtSlot()
|
||||||
|
def content_loaded(self):
|
||||||
|
# Slot registered by add_window_objects (through the JavaScript bridge
# object) as the document's DOMContentLoaded listener; it records that the
# DOM of the current page has finished loading.
self.dom_loaded = True
|
||||||
|
|
||||||
def userAgentForUrl(self, url):
|
def userAgentForUrl(self, url):
|
||||||
return self.user_agent
|
return self.user_agent
|
||||||
@ -96,6 +117,25 @@ class WebPage(QWebPage): # {{{
|
|||||||
def ready_state(self):
|
def ready_state(self):
|
||||||
return unicode(self.mainFrame().evaluateJavaScript('document.readyState').toString())
|
return unicode(self.mainFrame().evaluateJavaScript('document.readyState').toString())
|
||||||
|
|
||||||
|
@pyqtSlot(QPixmap)
|
||||||
|
def transfer_image(self, img):
|
||||||
|
# Slot called from page JavaScript by get_image, as
# `<bridge>.transfer_image(this)`; it stashes the received pixmap on the
# page object so that get_image can hand it back to its caller.
# NOTE(review): presumably the QtWebKit bridge converts the <img> element
# into the QPixmap argument -- confirm against the Qt documentation.
self.saved_img = img
|
||||||
|
|
||||||
|
def get_image(self, qwe_or_selector):
    '''
    Return the image for qwe_or_selector as a QPixmap.

    qwe_or_selector is either a QWebElement or a CSS selector that is
    looked up in the main frame. A ValueError is raised when a selector
    matches no element.

    The element is handed to the transfer_image slot from JavaScript. If
    that slot is never invoked, the null QPixmap stored beforehand is
    what gets returned.
    '''
    element = qwe_or_selector
    if not isinstance(element, QWebElement):
        element = self.mainFrame().findFirstElement(element)
        if element.isNull():
            raise ValueError('Failed to find element with selector: %r'
                    % qwe_or_selector)
    # Start from a null pixmap; transfer_image overwrites it when called
    self.saved_img = QPixmap()
    element.evaluateJavaScript('%s.transfer_image(this)' % self.bridge_name)
    # Do not keep a reference to the pixmap on the page after returning it
    pixmap = self.saved_img
    del self.saved_img
    return pixmap
|
||||||
|
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
class ProxyFactory(QNetworkProxyFactory): # {{{
|
class ProxyFactory(QNetworkProxyFactory): # {{{
|
||||||
@ -107,9 +147,11 @@ class ProxyFactory(QNetworkProxyFactory): # {{{
|
|||||||
for scheme, proxy_string in proxies.iteritems():
|
for scheme, proxy_string in proxies.iteritems():
|
||||||
scheme = scheme.lower()
|
scheme = scheme.lower()
|
||||||
info = get_proxy_info(scheme, proxy_string)
|
info = get_proxy_info(scheme, proxy_string)
|
||||||
if info is None: continue
|
if info is None:
|
||||||
|
continue
|
||||||
hn, port = info['hostname'], info['port']
|
hn, port = info['hostname'], info['port']
|
||||||
if not hn or not port: continue
|
if not hn or not port:
|
||||||
|
continue
|
||||||
log.debug('JSBrowser using proxy:', pprint.pformat(info))
|
log.debug('JSBrowser using proxy:', pprint.pformat(info))
|
||||||
pt = {'socks5':QNetworkProxy.Socks5Proxy}.get(scheme,
|
pt = {'socks5':QNetworkProxy.Socks5Proxy}.get(scheme,
|
||||||
QNetworkProxy.HttpProxy)
|
QNetworkProxy.HttpProxy)
|
||||||
@ -130,7 +172,7 @@ class ProxyFactory(QNetworkProxyFactory): # {{{
|
|||||||
|
|
||||||
class NetworkAccessManager(QNetworkAccessManager): # {{{
|
class NetworkAccessManager(QNetworkAccessManager): # {{{
|
||||||
|
|
||||||
OPERATION_NAMES = { getattr(QNetworkAccessManager, '%sOperation'%x) :
|
OPERATION_NAMES = {getattr(QNetworkAccessManager, '%sOperation'%x) :
|
||||||
x.upper() for x in ('Head', 'Get', 'Put', 'Post', 'Delete',
|
x.upper() for x in ('Head', 'Get', 'Put', 'Post', 'Delete',
|
||||||
'Custom')
|
'Custom')
|
||||||
}
|
}
|
||||||
@ -283,7 +325,7 @@ class Browser(QObject, FormsMixin):
|
|||||||
def __init__(self,
|
def __init__(self,
|
||||||
# Logging. If None, uses a default log, which does not output
|
# Logging. If None, uses a default log, which does not output
|
||||||
# debugging info
|
# debugging info
|
||||||
log = None,
|
log=None,
|
||||||
# Receives a string and returns True/False. By default, returns
|
# Receives a string and returns True/False. By default, returns
|
||||||
# True for all strings
|
# True for all strings
|
||||||
confirm_callback=None,
|
confirm_callback=None,
|
||||||
@ -303,7 +345,10 @@ class Browser(QObject, FormsMixin):
|
|||||||
enable_developer_tools=False,
|
enable_developer_tools=False,
|
||||||
|
|
||||||
# Verbosity
|
# Verbosity
|
||||||
verbosity = 0
|
verbosity=0,
|
||||||
|
|
||||||
|
# The default timeout (in seconds)
|
||||||
|
default_timeout=30
|
||||||
):
|
):
|
||||||
must_use_qt()
|
must_use_qt()
|
||||||
QObject.__init__(self)
|
QObject.__init__(self)
|
||||||
@ -314,6 +359,7 @@ class Browser(QObject, FormsMixin):
|
|||||||
if verbosity:
|
if verbosity:
|
||||||
log.filter_level = log.DEBUG
|
log.filter_level = log.DEBUG
|
||||||
self.log = log
|
self.log = log
|
||||||
|
self.default_timeout = default_timeout
|
||||||
|
|
||||||
self.page = WebPage(log, confirm_callback=confirm_callback,
|
self.page = WebPage(log, confirm_callback=confirm_callback,
|
||||||
prompt_callback=prompt_callback, user_agent=user_agent,
|
prompt_callback=prompt_callback, user_agent=user_agent,
|
||||||
@ -327,6 +373,7 @@ class Browser(QObject, FormsMixin):
|
|||||||
return self.page.user_agent
|
return self.page.user_agent
|
||||||
|
|
||||||
def _wait_for_load(self, timeout, url=None):
|
def _wait_for_load(self, timeout, url=None):
|
||||||
|
timeout = self.default_timeout if timeout is default_timeout else timeout
|
||||||
loop = QEventLoop(self)
|
loop = QEventLoop(self)
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
end_time = start_time + timeout
|
end_time = start_time + timeout
|
||||||
@ -358,7 +405,16 @@ class Browser(QObject, FormsMixin):
|
|||||||
if not loop.processEvents():
|
if not loop.processEvents():
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
|
|
||||||
def visit(self, url, timeout=30.0):
|
def wait_for_element(self, selector, timeout=default_timeout):
    '''
    Wait until an element matching the CSS selector exists in the page and
    return it.

    :param selector: CSS selector, passed to :meth:`css_select`
    :param timeout: Maximum time to wait, in seconds. If not specified,
        the default timeout of this browser object is used.
    :return: The first matching element
    :raises Timeout: If no matching element shows up in time
    '''
    timeout = self.default_timeout if timeout is default_timeout else timeout
    start_time = time.time()
    while True:
        ans = self.css_select(selector)
        if ans is not None:
            # Return the element that was just found instead of querying
            # the page a second time
            return ans
        self.run_for_a_time(0.1)
        if time.time() - start_time > timeout:
            # %g, not %.1g: the latter renders 30 as '3e+01'
            raise Timeout('DOM failed to load in %g seconds' % timeout)
|
||||||
|
|
||||||
|
def visit(self, url, timeout=default_timeout):
|
||||||
'''
|
'''
|
||||||
Open the page specified in URL and wait for it to complete loading.
|
Open the page specified in URL and wait for it to complete loading.
|
||||||
Note that when this method returns, there may still be javascript
|
Note that when this method returns, there may still be javascript
|
||||||
@ -369,14 +425,26 @@ class Browser(QObject, FormsMixin):
|
|||||||
Returns True if loading was successful, False otherwise.
|
Returns True if loading was successful, False otherwise.
|
||||||
'''
|
'''
|
||||||
self.current_form = None
|
self.current_form = None
|
||||||
self.page.mainFrame().load(QUrl(url))
|
self.page.load_url(url)
|
||||||
return self._wait_for_load(timeout, url)
|
return self._wait_for_load(timeout, url)
|
||||||
|
|
||||||
|
def back(self, wait_for_load=True, timeout=default_timeout):
    '''
    Go back one page, like clicking the back button in a browser.

    If wait_for_load is True, waits for loading to complete, raising a
    Timeout exception if that takes more than timeout seconds, and
    returns True if loading was successful, False otherwise.

    If wait_for_load is False, returns None immediately after triggering
    the action.
    '''
    page = self.page
    page.triggerAction(page.Back)
    if not wait_for_load:
        return None
    return self._wait_for_load(timeout)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def dom_ready(self):
|
def dom_ready(self):
|
||||||
return self.page.ready_state in {'complete', 'interactive'}
|
return self.page.dom_loaded
|
||||||
|
|
||||||
def wait_till_dom_ready(self, timeout=30.0, url=None):
|
def wait_till_dom_ready(self, timeout=default_timeout, url=None):
|
||||||
|
timeout = self.default_timeout if timeout is default_timeout else timeout
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
while not self.dom_ready:
|
while not self.dom_ready:
|
||||||
if time.time() - start_time > timeout:
|
if time.time() - start_time > timeout:
|
||||||
@ -384,18 +452,30 @@ class Browser(QObject, FormsMixin):
|
|||||||
url, timeout))
|
url, timeout))
|
||||||
self.run_for_a_time(0.1)
|
self.run_for_a_time(0.1)
|
||||||
|
|
||||||
def start_load(self, url, timeout=30.0):
|
def wait_till_element_exists(self, selector, timeout=default_timeout, url=None):
    '''
    Block until css_select() finds an element matching selector.

    :param timeout: Maximum time to wait, in seconds. If not specified,
        the default timeout of this browser object is used.
    :param url: Only used in the error message
    :raises Timeout: If no matching element exists after timeout seconds
    '''
    if timeout is default_timeout:
        timeout = self.default_timeout
    began = time.time()
    while True:
        if self.css_select(selector) is not None:
            return
        if time.time() - began > timeout:
            raise Timeout('Loading of %r took longer than %d seconds'%(
                url, timeout))
        self.run_for_a_time(0.1)
|
||||||
|
|
||||||
|
def start_load(self, url, timeout=default_timeout, selector=None):
|
||||||
'''
|
'''
|
||||||
Start the loading of the page at url and return once the DOM is ready,
|
Start the loading of the page at url and return once the DOM is ready,
|
||||||
sub-resources such as scripts/stylesheets/images/etc. may not have all
|
sub-resources such as scripts/stylesheets/images/etc. may not have all
|
||||||
loaded.
|
loaded.
|
||||||
'''
|
'''
|
||||||
self.current_form = None
|
self.current_form = None
|
||||||
self.page.mainFrame().load(QUrl(url))
|
self.page.load_url(url)
|
||||||
self.run_for_a_time(0.01)
|
self.run_for_a_time(0.01)
|
||||||
|
if selector is not None:
|
||||||
|
self.wait_till_element_exists(selector, timeout=timeout, url=url)
|
||||||
|
else:
|
||||||
self.wait_till_dom_ready(timeout=timeout, url=url)
|
self.wait_till_dom_ready(timeout=timeout, url=url)
|
||||||
|
|
||||||
def click(self, qwe_or_selector, wait_for_load=True, ajax_replies=0, timeout=30.0):
|
def click(self, qwe_or_selector, wait_for_load=True, ajax_replies=0, timeout=default_timeout):
|
||||||
'''
|
'''
|
||||||
Click the :class:`QWebElement` pointed to by qwe_or_selector.
|
Click the :class:`QWebElement` pointed to by qwe_or_selector.
|
||||||
|
|
||||||
@ -408,8 +488,8 @@ class Browser(QObject, FormsMixin):
|
|||||||
initial_count = self.nam.reply_count
|
initial_count = self.nam.reply_count
|
||||||
qwe = qwe_or_selector
|
qwe = qwe_or_selector
|
||||||
if not isinstance(qwe, QWebElement):
|
if not isinstance(qwe, QWebElement):
|
||||||
qwe = self.page.mainFrame().findFirstElement(qwe)
|
qwe = self.css_select(qwe)
|
||||||
if qwe.isNull():
|
if qwe is None:
|
||||||
raise ValueError('Failed to find element with selector: %r'
|
raise ValueError('Failed to find element with selector: %r'
|
||||||
% qwe_or_selector)
|
% qwe_or_selector)
|
||||||
js = '''
|
js = '''
|
||||||
@ -425,7 +505,7 @@ class Browser(QObject, FormsMixin):
|
|||||||
raise LoadError('Clicking resulted in a failed load')
|
raise LoadError('Clicking resulted in a failed load')
|
||||||
|
|
||||||
def click_text_link(self, text_or_regex, selector='a[href]',
|
def click_text_link(self, text_or_regex, selector='a[href]',
|
||||||
wait_for_load=True, ajax_replies=0, timeout=30.0):
|
wait_for_load=True, ajax_replies=0, timeout=default_timeout):
|
||||||
target = None
|
target = None
|
||||||
for qwe in self.page.mainFrame().findAllElements(selector):
|
for qwe in self.page.mainFrame().findAllElements(selector):
|
||||||
src = unicode(qwe.toPlainText())
|
src = unicode(qwe.toPlainText())
|
||||||
@ -441,6 +521,59 @@ class Browser(QObject, FormsMixin):
|
|||||||
return self.click(target, wait_for_load=wait_for_load,
|
return self.click(target, wait_for_load=wait_for_load,
|
||||||
ajax_replies=ajax_replies, timeout=timeout)
|
ajax_replies=ajax_replies, timeout=timeout)
|
||||||
|
|
||||||
|
def css_select(self, selector, all=False):
    '''
    Find elements in the main frame using a CSS selector.

    If all is False (the default), returns the first matching element or
    None if nothing matches. If all is True, returns a tuple of all
    matching elements (empty if nothing matches).
    '''
    frame = self.page.mainFrame()
    if not all:
        first = frame.findFirstElement(selector)
        # Translate a null element into None for the caller
        return None if first.isNull() else first
    return tuple(frame.findAllElements(selector).toList())
|
||||||
|
|
||||||
|
def get_image(self, qwe_or_selector):
    '''
    Return the image identified by qwe_or_selector (a QWebElement or a
    CSS selector) as a QPixmap. This simply delegates to the page object;
    the page-level lookup raises a ValueError when a selector matches no
    element. If no image data could be obtained for the element, the
    returned pixmap will be null.
    '''
    page = self.page
    return page.get_image(qwe_or_selector)
|
||||||
|
|
||||||
|
def get_cached(self, url):
    '''
    Return the data stored for url in the network access manager's cache
    as a bytestring, or None if the cache has no data for that URL.
    '''
    device = self.nam.cache.data(QUrl(url))
    if device is None:
        return None
    # readAll() result -> bytearray -> immutable bytestring
    return bytes(bytearray(device.readAll()))
|
||||||
|
|
||||||
|
def get_resource(self, url, rtype='img', use_cache=True, timeout=default_timeout):
    '''
    Download a resource (image/stylesheet/script). The resource is
    downloaded by visiting a simple HTML page that contains only that
    resource. The resource is then returned from the cache (therefore, to
    use this method you must not disable the cache). If use_cache is True
    then the cache is queried before loading the resource. This can result
    in a stale object if the resource has changed on the server, however,
    it is a big performance boost in the common case, by avoiding a
    roundtrip to the server. The resource is returned as a bytestring or None
    if it could not be loaded.

    :param rtype: One of 'img', 'link' or 'script'; selects the tag used
        to reference the resource. Any other value raises a ValueError
        (unless the resource was already served from the cache).
    :raises RuntimeError: If the network access manager has no cache

    NOTE(review): the 'link' template carries no rel attribute -- confirm
    that WebKit actually fetches the resource in that case.
    '''
    if not hasattr(self.nam, 'cache'):
        raise RuntimeError('Cannot get resources when the cache is disabled')
    if use_cache:
        cached = self.get_cached(url)
        if cached is not None:
            return cached

    templates = {
        'img': '<img src="%s">',
        'link': '<link href="%s"></link>',
        'script': '<script src="%s"></script>',
    }
    try:
        tag = templates[rtype] % prepare_string_for_xml(url, attribute=True)
    except KeyError:
        raise ValueError('Unknown resource type: %s' % rtype)

    # Load a minimal page referencing only the resource, so that it ends
    # up in the cache, then read it back from there
    html = '''<!DOCTYPE html><html><body><div>{0}</div></body></html>'''.format(tag)
    self.page.mainFrame().setHtml(html)
    self._wait_for_load(timeout)
    return self.get_cached(url)
|
||||||
|
|
||||||
def show_browser(self):
|
def show_browser(self):
|
||||||
'''
|
'''
|
||||||
@ -474,3 +607,5 @@ class Browser(QObject, FormsMixin):
|
|||||||
def __exit__(self, *args):
|
def __exit__(self, *args):
|
||||||
self.close()
|
self.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -10,6 +10,8 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
from calibre import as_unicode
|
from calibre import as_unicode
|
||||||
|
|
||||||
|
default_timeout = object()
|
||||||
|
|
||||||
# Forms {{{
|
# Forms {{{
|
||||||
class Control(object):
|
class Control(object):
|
||||||
|
|
||||||
@ -221,7 +223,7 @@ class FormsMixin(object):
|
|||||||
return self.current_form
|
return self.current_form
|
||||||
|
|
||||||
def submit(self, submit_control_selector=None, wait_for_load=True,
|
def submit(self, submit_control_selector=None, wait_for_load=True,
|
||||||
ajax_replies=0, timeout=30.0):
|
ajax_replies=0, timeout=default_timeout):
|
||||||
'''
|
'''
|
||||||
Submit the currently selected form. Tries to autodetect the submit
|
Submit the currently selected form. Tries to autodetect the submit
|
||||||
control. You can override auto-detection by specifying a CSS2 selector
|
control. You can override auto-detection by specifying a CSS2 selector
|
||||||
@ -238,7 +240,7 @@ class FormsMixin(object):
|
|||||||
ajax_replies=ajax_replies, timeout=timeout)
|
ajax_replies=ajax_replies, timeout=timeout)
|
||||||
|
|
||||||
def ajax_submit(self, submit_control_selector=None,
|
def ajax_submit(self, submit_control_selector=None,
|
||||||
num_of_replies=1, timeout=30.0):
|
num_of_replies=1, timeout=default_timeout):
|
||||||
'''
|
'''
|
||||||
Submit the current form. This method is meant for those forms that
|
Submit the current form. This method is meant for those forms that
|
||||||
use AJAX rather than a plain submit. It will block until the specified
|
use AJAX rather than a plain submit. It will block until the specified
|
||||||
@ -249,3 +251,4 @@ class FormsMixin(object):
|
|||||||
wait_for_load=False, ajax_replies=num_of_replies,
|
wait_for_load=False, ajax_replies=num_of_replies,
|
||||||
timeout=timeout)
|
timeout=timeout)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user