Various improvements for jsbrowser

Add APIs to get DOM elements by CSS selector, to get resources
(images/stylesheets/scripts/etc.) and to get the image data for displayed
images. Fix waiting until the DOM is ready. Make the default timeout a
per-browser-object setting.
This commit is contained in:
Kovid Goyal 2013-06-08 22:42:03 +05:30
parent d9635111d2
commit eaed92987f
2 changed files with 172 additions and 34 deletions

View File

@ -7,26 +7,29 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, pprint, time import os, pprint, time, uuid
from cookielib import Cookie from cookielib import Cookie
from threading import current_thread from threading import current_thread
from PyQt4.Qt import (QObject, QNetworkAccessManager, QNetworkDiskCache, from PyQt4.Qt import (QObject, QNetworkAccessManager, QNetworkDiskCache,
QNetworkProxy, QNetworkProxyFactory, QEventLoop, QUrl, pyqtSignal, QNetworkProxy, QNetworkProxyFactory, QEventLoop, QUrl, pyqtSignal,
QDialog, QVBoxLayout, QSize, QNetworkCookieJar, Qt, pyqtSlot) QDialog, QVBoxLayout, QSize, QNetworkCookieJar, Qt, pyqtSlot, QPixmap)
from PyQt4.QtWebKit import QWebPage, QWebSettings, QWebView, QWebElement from PyQt4.QtWebKit import QWebPage, QWebSettings, QWebView, QWebElement
from calibre import USER_AGENT, prints, get_proxies, get_proxy_info from calibre import USER_AGENT, prints, get_proxies, get_proxy_info, prepare_string_for_xml
from calibre.constants import ispy3, cache_dir from calibre.constants import ispy3, cache_dir
from calibre.utils.logging import ThreadSafeLog from calibre.utils.logging import ThreadSafeLog
from calibre.gui2 import must_use_qt from calibre.gui2 import must_use_qt
from calibre.web.jsbrowser.forms import FormsMixin from calibre.web.jsbrowser.forms import FormsMixin, default_timeout
class Timeout(Exception): pass class Timeout(Exception):
pass
class LoadError(Exception): pass class LoadError(Exception):
pass
class WebPage(QWebPage): # {{{
class WebPage(QWebPage): # {{{
def __init__(self, log, def __init__(self, log,
confirm_callback=None, confirm_callback=None,
@ -48,6 +51,24 @@ class WebPage(QWebPage): # {{{
QWebSettings.enablePersistentStorage(os.path.join(cache_dir(), QWebSettings.enablePersistentStorage(os.path.join(cache_dir(),
'webkit-persistence')) 'webkit-persistence'))
QWebSettings.setMaximumPagesInCache(0) QWebSettings.setMaximumPagesInCache(0)
self.bridge_name = 'b' + uuid.uuid4().get_hex()
self.mainFrame().javaScriptWindowObjectCleared.connect(
self.add_window_objects)
self.dom_loaded = False
def add_window_objects(self):
    '''
    Install the Python bridge object into the main frame's JavaScript
    window and register a DOMContentLoaded listener that calls back into
    :meth:`content_loaded`.

    Connected to the main frame's ``javaScriptWindowObjectCleared``
    signal, so the DOM-loaded flag is reset every time this runs.
    '''
    self.dom_loaded = False
    frame = self.mainFrame()
    bridge = self.bridge_name
    # Expose this page object to JavaScript under its unique bridge name
    frame.addToJavaScriptWindowObject(bridge, self)
    listener_js = 'document.addEventListener( "DOMContentLoaded", %s.content_loaded, false )' % bridge
    frame.evaluateJavaScript(listener_js)
def load_url(self, url):
    '''
    Start loading ``url`` in the main frame. The DOM-loaded flag is
    cleared first so that waiters do not see the state of the previous
    page.
    '''
    self.dom_loaded = False
    frame = self.mainFrame()
    frame.load(QUrl(url))
@pyqtSlot()
def content_loaded(self):
    '''
    Slot called from JavaScript (via the DOMContentLoaded listener
    installed by add_window_objects) to record that the DOM is loaded.
    '''
    self.dom_loaded = True
def userAgentForUrl(self, url): def userAgentForUrl(self, url):
return self.user_agent return self.user_agent
@ -96,9 +117,28 @@ class WebPage(QWebPage): # {{{
def ready_state(self): def ready_state(self):
return unicode(self.mainFrame().evaluateJavaScript('document.readyState').toString()) return unicode(self.mainFrame().evaluateJavaScript('document.readyState').toString())
@pyqtSlot(QPixmap)
def transfer_image(self, img):
    '''
    Slot called from JavaScript by get_image(): stores the received
    pixmap on the page object so that get_image() can return it.
    '''
    self.saved_img = img
def get_image(self, qwe_or_selector):
    '''
    Return the image displayed by an element as a QPixmap.

    :param qwe_or_selector: A :class:`QWebElement` or a CSS selector
        string. A selector is resolved against the main frame.
    :return: The pixmap handed over by :meth:`transfer_image`. If the
        slot was never called, the (null) QPixmap created here is
        returned.
    :raises ValueError: If a selector was given and it matches nothing.
    '''
    qwe = qwe_or_selector
    if not isinstance(qwe, QWebElement):
        qwe = self.mainFrame().findFirstElement(qwe)
        # NOTE(review): the check is kept under the selector branch since
        # the message talks about a selector -- confirm against the
        # original indentation.
        if qwe.isNull():
            raise ValueError('Failed to find element with selector: %r'
                    % qwe_or_selector)
    self.saved_img = QPixmap()
    try:
        # The JavaScript side passes the element back to transfer_image(),
        # which replaces self.saved_img.
        qwe.evaluateJavaScript('%s.transfer_image(this)' % self.bridge_name)
        return self.saved_img
    finally:
        # Always drop the temporary attribute, even if evaluating the
        # JavaScript raised, so no stale pixmap stays on the page object.
        del self.saved_img
# }}} # }}}
class ProxyFactory(QNetworkProxyFactory): # {{{ class ProxyFactory(QNetworkProxyFactory): # {{{
def __init__(self, log): def __init__(self, log):
QNetworkProxyFactory.__init__(self) QNetworkProxyFactory.__init__(self)
@ -107,9 +147,11 @@ class ProxyFactory(QNetworkProxyFactory): # {{{
for scheme, proxy_string in proxies.iteritems(): for scheme, proxy_string in proxies.iteritems():
scheme = scheme.lower() scheme = scheme.lower()
info = get_proxy_info(scheme, proxy_string) info = get_proxy_info(scheme, proxy_string)
if info is None: continue if info is None:
continue
hn, port = info['hostname'], info['port'] hn, port = info['hostname'], info['port']
if not hn or not port: continue if not hn or not port:
continue
log.debug('JSBrowser using proxy:', pprint.pformat(info)) log.debug('JSBrowser using proxy:', pprint.pformat(info))
pt = {'socks5':QNetworkProxy.Socks5Proxy}.get(scheme, pt = {'socks5':QNetworkProxy.Socks5Proxy}.get(scheme,
QNetworkProxy.HttpProxy) QNetworkProxy.HttpProxy)
@ -128,9 +170,9 @@ class ProxyFactory(QNetworkProxyFactory): # {{{
return [self.proxies.get(scheme, self.default_proxy)] return [self.proxies.get(scheme, self.default_proxy)]
# }}} # }}}
class NetworkAccessManager(QNetworkAccessManager): # {{{ class NetworkAccessManager(QNetworkAccessManager): # {{{
OPERATION_NAMES = { getattr(QNetworkAccessManager, '%sOperation'%x) : OPERATION_NAMES = {getattr(QNetworkAccessManager, '%sOperation'%x) :
x.upper() for x in ('Head', 'Get', 'Put', 'Post', 'Delete', x.upper() for x in ('Head', 'Get', 'Put', 'Post', 'Delete',
'Custom') 'Custom')
} }
@ -230,18 +272,18 @@ class NetworkAccessManager(QNetworkAccessManager): # {{{
c = Cookie(0, # version c = Cookie(0, # version
name, value, name, value,
None, # port None, # port
False, # port specified False, # port specified
domain, domain_specified, initial_dot, path, domain, domain_specified, initial_dot, path,
path_specified, path_specified,
secure, expires, is_session_cookie, secure, expires, is_session_cookie,
None, # Comment None, # Comment
None, # Comment URL None, # Comment URL
{} # rest {} # rest
) )
yield c yield c
# }}} # }}}
class LoadWatcher(QObject): # {{{ class LoadWatcher(QObject): # {{{
def __init__(self, page, parent=None): def __init__(self, page, parent=None):
QObject.__init__(self, parent) QObject.__init__(self, parent)
@ -257,7 +299,7 @@ class LoadWatcher(QObject): # {{{
self.page = None self.page = None
# }}} # }}}
class BrowserView(QDialog): # {{{ class BrowserView(QDialog): # {{{
def __init__(self, page, parent=None): def __init__(self, page, parent=None):
QDialog.__init__(self, parent) QDialog.__init__(self, parent)
@ -283,7 +325,7 @@ class Browser(QObject, FormsMixin):
def __init__(self, def __init__(self,
# Logging. If None, uses a default log, which does not output # Logging. If None, uses a default log, which does not output
# debugging info # debugging info
log = None, log=None,
# Receives a string and returns True/False. By default, returns # Receives a string and returns True/False. By default, returns
# True for all strings # True for all strings
confirm_callback=None, confirm_callback=None,
@ -303,7 +345,10 @@ class Browser(QObject, FormsMixin):
enable_developer_tools=False, enable_developer_tools=False,
# Verbosity # Verbosity
verbosity = 0 verbosity=0,
# The default timeout (in seconds)
default_timeout=30
): ):
must_use_qt() must_use_qt()
QObject.__init__(self) QObject.__init__(self)
@ -314,6 +359,7 @@ class Browser(QObject, FormsMixin):
if verbosity: if verbosity:
log.filter_level = log.DEBUG log.filter_level = log.DEBUG
self.log = log self.log = log
self.default_timeout = default_timeout
self.page = WebPage(log, confirm_callback=confirm_callback, self.page = WebPage(log, confirm_callback=confirm_callback,
prompt_callback=prompt_callback, user_agent=user_agent, prompt_callback=prompt_callback, user_agent=user_agent,
@ -327,6 +373,7 @@ class Browser(QObject, FormsMixin):
return self.page.user_agent return self.page.user_agent
def _wait_for_load(self, timeout, url=None): def _wait_for_load(self, timeout, url=None):
timeout = self.default_timeout if timeout is default_timeout else timeout
loop = QEventLoop(self) loop = QEventLoop(self)
start_time = time.time() start_time = time.time()
end_time = start_time + timeout end_time = start_time + timeout
@ -358,7 +405,16 @@ class Browser(QObject, FormsMixin):
if not loop.processEvents(): if not loop.processEvents():
time.sleep(0.1) time.sleep(0.1)
def wait_for_element(self, selector, timeout=default_timeout):
    '''
    Block (while running the event loop) until an element matching
    ``selector`` exists and return it.

    :param selector: CSS selector, passed to :meth:`css_select`.
    :param timeout: Maximum time to wait in seconds. When left at the
        ``default_timeout`` sentinel, ``self.default_timeout`` is used.
    :return: The first matching element, as returned by :meth:`css_select`.
    :raises Timeout: If no matching element shows up in time.
    '''
    timeout = self.default_timeout if timeout is default_timeout else timeout
    start_time = time.time()
    while True:
        ans = self.css_select(selector)
        if ans is not None:
            # Return the element that was actually found; a second lookup
            # could come back empty if the page changed in between.
            return ans
        self.run_for_a_time(0.1)
        if time.time() - start_time > timeout:
            # %.1f rather than %.1g: the latter renders 30 as '3e+01'
            raise Timeout('No element matching %r appeared in %.1f seconds'
                    % (selector, timeout))
def visit(self, url, timeout=default_timeout):
''' '''
Open the page specified in URL and wait for it to complete loading. Open the page specified in URL and wait for it to complete loading.
Note that when this method returns, there may still be javascript Note that when this method returns, there may still be javascript
@ -369,14 +425,26 @@ class Browser(QObject, FormsMixin):
Returns True if loading was successful, False otherwise. Returns True if loading was successful, False otherwise.
''' '''
self.current_form = None self.current_form = None
self.page.mainFrame().load(QUrl(url)) self.page.load_url(url)
return self._wait_for_load(timeout, url) return self._wait_for_load(timeout, url)
def back(self, wait_for_load=True, timeout=default_timeout):
    '''
    Go back one page, like clicking the back button in a browser.

    When wait_for_load is True (the default) this waits for loading to
    complete and returns the result of the wait: True if loading was
    successful, False otherwise. A Timeout exception is raised if loading
    takes more than timeout seconds. When wait_for_load is False, the
    navigation is only triggered and None is returned.
    '''
    page = self.page
    page.triggerAction(page.Back)
    if not wait_for_load:
        return None
    return self._wait_for_load(timeout)
@property @property
def dom_ready(self): def dom_ready(self):
return self.page.ready_state in {'complete', 'interactive'} return self.page.dom_loaded
def wait_till_dom_ready(self, timeout=30.0, url=None): def wait_till_dom_ready(self, timeout=default_timeout, url=None):
timeout = self.default_timeout if timeout is default_timeout else timeout
start_time = time.time() start_time = time.time()
while not self.dom_ready: while not self.dom_ready:
if time.time() - start_time > timeout: if time.time() - start_time > timeout:
@ -384,18 +452,30 @@ class Browser(QObject, FormsMixin):
url, timeout)) url, timeout))
self.run_for_a_time(0.1) self.run_for_a_time(0.1)
def wait_till_element_exists(self, selector, timeout=default_timeout, url=None):
    '''
    Keep running the event loop in short slices until an element matching
    the CSS selector exists. Returns nothing.

    timeout is in seconds; the default_timeout sentinel selects
    self.default_timeout. url is only used in the error message. Raises
    Timeout when the element has not appeared in time.
    '''
    if timeout is default_timeout:
        timeout = self.default_timeout
    began = time.time()
    while self.css_select(selector) is None:
        elapsed = time.time() - began
        if elapsed > timeout:
            raise Timeout('Loading of %r took longer than %d seconds'%(
                url, timeout))
        self.run_for_a_time(0.1)
def start_load(self, url, timeout=default_timeout, selector=None):
''' '''
Start the loading of the page at url and return once the DOM is ready, Start the loading of the page at url and return once the DOM is ready,
sub-resources such as scripts/stylesheets/images/etc. may not have all sub-resources such as scripts/stylesheets/images/etc. may not have all
loaded. loaded.
''' '''
self.current_form = None self.current_form = None
self.page.mainFrame().load(QUrl(url)) self.page.load_url(url)
self.run_for_a_time(0.01) self.run_for_a_time(0.01)
self.wait_till_dom_ready(timeout=timeout, url=url) if selector is not None:
self.wait_till_element_exists(selector, timeout=timeout, url=url)
else:
self.wait_till_dom_ready(timeout=timeout, url=url)
def click(self, qwe_or_selector, wait_for_load=True, ajax_replies=0, timeout=30.0): def click(self, qwe_or_selector, wait_for_load=True, ajax_replies=0, timeout=default_timeout):
''' '''
Click the :class:`QWebElement` pointed to by qwe_or_selector. Click the :class:`QWebElement` pointed to by qwe_or_selector.
@ -408,8 +488,8 @@ class Browser(QObject, FormsMixin):
initial_count = self.nam.reply_count initial_count = self.nam.reply_count
qwe = qwe_or_selector qwe = qwe_or_selector
if not isinstance(qwe, QWebElement): if not isinstance(qwe, QWebElement):
qwe = self.page.mainFrame().findFirstElement(qwe) qwe = self.css_select(qwe)
if qwe.isNull(): if qwe is None:
raise ValueError('Failed to find element with selector: %r' raise ValueError('Failed to find element with selector: %r'
% qwe_or_selector) % qwe_or_selector)
js = ''' js = '''
@ -425,7 +505,7 @@ class Browser(QObject, FormsMixin):
raise LoadError('Clicking resulted in a failed load') raise LoadError('Clicking resulted in a failed load')
def click_text_link(self, text_or_regex, selector='a[href]', def click_text_link(self, text_or_regex, selector='a[href]',
wait_for_load=True, ajax_replies=0, timeout=30.0): wait_for_load=True, ajax_replies=0, timeout=default_timeout):
target = None target = None
for qwe in self.page.mainFrame().findAllElements(selector): for qwe in self.page.mainFrame().findAllElements(selector):
src = unicode(qwe.toPlainText()) src = unicode(qwe.toPlainText())
@ -441,6 +521,59 @@ class Browser(QObject, FormsMixin):
return self.click(target, wait_for_load=wait_for_load, return self.click(target, wait_for_load=wait_for_load,
ajax_replies=ajax_replies, timeout=timeout) ajax_replies=ajax_replies, timeout=timeout)
def css_select(self, selector, all=False):
    '''
    Find elements in the main frame using a CSS selector.

    With all=True a tuple of every matching element is returned (empty
    if nothing matches). Otherwise the first matching element is
    returned, or None if there is no match.
    '''
    frame = self.page.mainFrame()
    if all:
        matches = frame.findAllElements(selector)
        return tuple(matches.toList())
    first = frame.findFirstElement(selector)
    return None if first.isNull() else first
def get_image(self, qwe_or_selector):
    '''
    Return the image identified by qwe_or_selector (a QWebElement or a
    CSS selector) as a QPixmap. The work is delegated to the page object:
    a selector that matches no element raises ValueError, while an
    element that yields no image data gives a null pixmap.
    '''
    page = self.page
    return page.get_image(qwe_or_selector)
def get_cached(self, url):
    '''
    Return the data stored in the network cache for url as a bytestring,
    or None if the cache has no data for it.
    '''
    device = self.nam.cache.data(QUrl(url))
    if device is None:
        return None
    return bytes(bytearray(device.readAll()))
def get_resource(self, url, rtype='img', use_cache=True, timeout=default_timeout):
    '''
    Download a resource (image/stylesheet/script). The resource is
    downloaded by visiting a simple HTML page that contains only that
    resource. The resource is then returned from the cache (therefore, to
    use this method you must not disable the cache). If use_cache is True
    then the cache is queried before loading the resource. This can result
    in a stale object if the resource has changed on the server, however,
    it is a big performance boost in the common case, by avoiding a
    roundtrip to the server.

    :param url: URL of the resource.
    :param rtype: One of 'img', 'link' (stylesheet) or 'script'.
    :param use_cache: Whether to try the cache before loading.
    :param timeout: Passed to the load wait (seconds, or the
        default_timeout sentinel).
    :return: The resource as a bytestring or None if it could not be
        loaded.
    :raises RuntimeError: If the network access manager has no cache.
    :raises ValueError: If rtype is not a known resource type.
    '''
    if not hasattr(self.nam, 'cache'):
        raise RuntimeError('Cannot get resources when the cache is disabled')
    if use_cache:
        ans = self.get_cached(url)
        if ans is not None:
            return ans
    templates = {
        'img': '<img src="%s">',
        # A <link> element without a rel attribute creates no link, so
        # nothing would be fetched; mark it as a stylesheet.
        'link': '<link rel="stylesheet" href="%s"></link>',
        'script': '<script src="%s"></script>',
    }
    # Only the lookup is guarded, so that a KeyError raised while escaping
    # the URL is not misreported as an unknown resource type.
    try:
        template = templates[rtype]
    except KeyError:
        raise ValueError('Unknown resource type: %s' % rtype)
    tag = template % prepare_string_for_xml(url, attribute=True)
    self.page.mainFrame().setHtml(
        '''<!DOCTYPE html><html><body><div>{0}</div></body></html>'''.format(tag))
    self._wait_for_load(timeout)
    # None when the resource still is not in the cache after loading
    return self.get_cached(url)
def show_browser(self): def show_browser(self):
''' '''
@ -474,3 +607,5 @@ class Browser(QObject, FormsMixin):
def __exit__(self, *args): def __exit__(self, *args):
self.close() self.close()

View File

@ -10,6 +10,8 @@ __docformat__ = 'restructuredtext en'
from calibre import as_unicode from calibre import as_unicode
# Sentinel used as the default value of ``timeout`` parameters. Callers
# compare against it by identity (``timeout is default_timeout``) and, when
# it matches, substitute the browser object's own ``default_timeout``
# setting, so an explicitly passed number is never mistaken for "unset".
default_timeout = object()
# Forms {{{ # Forms {{{
class Control(object): class Control(object):
@ -43,7 +45,7 @@ class Control(object):
self.qwe.setAttribute('value', as_unicode(val)) self.qwe.setAttribute('value', as_unicode(val))
elif self.type in ('number', 'range'): elif self.type in ('number', 'range'):
self.qwe.setAttribute('value', '%d'%int(val)) self.qwe.setAttribute('value', '%d'%int(val))
else: # Unknown type treat as text else: # Unknown type treat as text
self.qwe.setAttribute('value', as_unicode(val)) self.qwe.setAttribute('value', as_unicode(val))
return property(fget=fget, fset=fset) return property(fget=fget, fset=fset)
@ -221,7 +223,7 @@ class FormsMixin(object):
return self.current_form return self.current_form
def submit(self, submit_control_selector=None, wait_for_load=True, def submit(self, submit_control_selector=None, wait_for_load=True,
ajax_replies=0, timeout=30.0): ajax_replies=0, timeout=default_timeout):
''' '''
Submit the currently selected form. Tries to autodetect the submit Submit the currently selected form. Tries to autodetect the submit
control. You can override auto-detection by specifying a CSS2 selector control. You can override auto-detection by specifying a CSS2 selector
@ -238,7 +240,7 @@ class FormsMixin(object):
ajax_replies=ajax_replies, timeout=timeout) ajax_replies=ajax_replies, timeout=timeout)
def ajax_submit(self, submit_control_selector=None, def ajax_submit(self, submit_control_selector=None,
num_of_replies=1, timeout=30.0): num_of_replies=1, timeout=default_timeout):
''' '''
Submit the current form. This method is meant for those forms that Submit the current form. This method is meant for those forms that
use AJAX rather than a plain submit. It will block until the specified use AJAX rather than a plain submit. It will block until the specified
@ -249,3 +251,4 @@ class FormsMixin(object):
wait_for_load=False, ajax_replies=num_of_replies, wait_for_load=False, ajax_replies=num_of_replies,
timeout=timeout) timeout=timeout)