Various improvements for jsbrowser

Add APIs to get DOM elements by selector, get resources
(images/stylesheets/etc) and get image data for displayed images. Fix
waiting till the DOM is ready. Make the default timeout a per
browser-object setting.
This commit is contained in:
Kovid Goyal 2013-06-08 22:42:03 +05:30
parent d9635111d2
commit eaed92987f
2 changed files with 172 additions and 34 deletions

View File

@ -7,26 +7,29 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, pprint, time
import os, pprint, time, uuid
from cookielib import Cookie
from threading import current_thread
from PyQt4.Qt import (QObject, QNetworkAccessManager, QNetworkDiskCache,
QNetworkProxy, QNetworkProxyFactory, QEventLoop, QUrl, pyqtSignal,
QDialog, QVBoxLayout, QSize, QNetworkCookieJar, Qt, pyqtSlot)
QDialog, QVBoxLayout, QSize, QNetworkCookieJar, Qt, pyqtSlot, QPixmap)
from PyQt4.QtWebKit import QWebPage, QWebSettings, QWebView, QWebElement
from calibre import USER_AGENT, prints, get_proxies, get_proxy_info
from calibre import USER_AGENT, prints, get_proxies, get_proxy_info, prepare_string_for_xml
from calibre.constants import ispy3, cache_dir
from calibre.utils.logging import ThreadSafeLog
from calibre.gui2 import must_use_qt
from calibre.web.jsbrowser.forms import FormsMixin
from calibre.web.jsbrowser.forms import FormsMixin, default_timeout
class Timeout(Exception): pass
class Timeout(Exception):
pass
class LoadError(Exception): pass
class LoadError(Exception):
pass
class WebPage(QWebPage): # {{{
class WebPage(QWebPage): # {{{
def __init__(self, log,
confirm_callback=None,
@ -48,6 +51,24 @@ class WebPage(QWebPage): # {{{
QWebSettings.enablePersistentStorage(os.path.join(cache_dir(),
'webkit-persistence'))
QWebSettings.setMaximumPagesInCache(0)
self.bridge_name = 'b' + uuid.uuid4().get_hex()
self.mainFrame().javaScriptWindowObjectCleared.connect(
self.add_window_objects)
self.dom_loaded = False
def add_window_objects(self):
self.dom_loaded = False
mf = self.mainFrame()
mf.addToJavaScriptWindowObject(self.bridge_name, self)
mf.evaluateJavaScript('document.addEventListener( "DOMContentLoaded", %s.content_loaded, false )' % self.bridge_name)
def load_url(self, url):
self.dom_loaded = False
self.mainFrame().load(QUrl(url))
@pyqtSlot()
def content_loaded(self):
self.dom_loaded = True
def userAgentForUrl(self, url):
return self.user_agent
@ -96,9 +117,28 @@ class WebPage(QWebPage): # {{{
def ready_state(self):
return unicode(self.mainFrame().evaluateJavaScript('document.readyState').toString())
@pyqtSlot(QPixmap)
def transfer_image(self, img):
self.saved_img = img
def get_image(self, qwe_or_selector):
qwe = qwe_or_selector
if not isinstance(qwe, QWebElement):
qwe = self.mainFrame().findFirstElement(qwe)
if qwe.isNull():
raise ValueError('Failed to find element with selector: %r'
% qwe_or_selector)
self.saved_img = QPixmap()
qwe.evaluateJavaScript('%s.transfer_image(this)' % self.bridge_name)
try:
return self.saved_img
finally:
del self.saved_img
# }}}
class ProxyFactory(QNetworkProxyFactory): # {{{
class ProxyFactory(QNetworkProxyFactory): # {{{
def __init__(self, log):
QNetworkProxyFactory.__init__(self)
@ -107,9 +147,11 @@ class ProxyFactory(QNetworkProxyFactory): # {{{
for scheme, proxy_string in proxies.iteritems():
scheme = scheme.lower()
info = get_proxy_info(scheme, proxy_string)
if info is None: continue
if info is None:
continue
hn, port = info['hostname'], info['port']
if not hn or not port: continue
if not hn or not port:
continue
log.debug('JSBrowser using proxy:', pprint.pformat(info))
pt = {'socks5':QNetworkProxy.Socks5Proxy}.get(scheme,
QNetworkProxy.HttpProxy)
@ -128,9 +170,9 @@ class ProxyFactory(QNetworkProxyFactory): # {{{
return [self.proxies.get(scheme, self.default_proxy)]
# }}}
class NetworkAccessManager(QNetworkAccessManager): # {{{
class NetworkAccessManager(QNetworkAccessManager): # {{{
OPERATION_NAMES = { getattr(QNetworkAccessManager, '%sOperation'%x) :
OPERATION_NAMES = {getattr(QNetworkAccessManager, '%sOperation'%x) :
x.upper() for x in ('Head', 'Get', 'Put', 'Post', 'Delete',
'Custom')
}
@ -230,18 +272,18 @@ class NetworkAccessManager(QNetworkAccessManager): # {{{
c = Cookie(0, # version
name, value,
None, # port
False, # port specified
False, # port specified
domain, domain_specified, initial_dot, path,
path_specified,
secure, expires, is_session_cookie,
None, # Comment
None, # Comment URL
{} # rest
None, # Comment
None, # Comment URL
{} # rest
)
yield c
# }}}
class LoadWatcher(QObject): # {{{
class LoadWatcher(QObject): # {{{
def __init__(self, page, parent=None):
QObject.__init__(self, parent)
@ -257,7 +299,7 @@ class LoadWatcher(QObject): # {{{
self.page = None
# }}}
class BrowserView(QDialog): # {{{
class BrowserView(QDialog): # {{{
def __init__(self, page, parent=None):
QDialog.__init__(self, parent)
@ -283,7 +325,7 @@ class Browser(QObject, FormsMixin):
def __init__(self,
# Logging. If None, uses a default log, which does not output
# debugging info
log = None,
log=None,
# Receives a string and returns True/False. By default, returns
# True for all strings
confirm_callback=None,
@ -303,7 +345,10 @@ class Browser(QObject, FormsMixin):
enable_developer_tools=False,
# Verbosity
verbosity = 0
verbosity=0,
# The default timeout (in seconds)
default_timeout=30
):
must_use_qt()
QObject.__init__(self)
@ -314,6 +359,7 @@ class Browser(QObject, FormsMixin):
if verbosity:
log.filter_level = log.DEBUG
self.log = log
self.default_timeout = default_timeout
self.page = WebPage(log, confirm_callback=confirm_callback,
prompt_callback=prompt_callback, user_agent=user_agent,
@ -327,6 +373,7 @@ class Browser(QObject, FormsMixin):
return self.page.user_agent
def _wait_for_load(self, timeout, url=None):
timeout = self.default_timeout if timeout is default_timeout else timeout
loop = QEventLoop(self)
start_time = time.time()
end_time = start_time + timeout
@ -358,7 +405,16 @@ class Browser(QObject, FormsMixin):
if not loop.processEvents():
time.sleep(0.1)
def visit(self, url, timeout=30.0):
def wait_for_element(self, selector, timeout=default_timeout):
timeout = self.default_timeout if timeout is default_timeout else timeout
start_time = time.time()
while self.css_select(selector) is None:
self.run_for_a_time(0.1)
if time.time() - start_time > timeout:
raise Timeout('DOM failed to load in %.1g seconds' % timeout)
return self.css_select(selector)
def visit(self, url, timeout=default_timeout):
'''
Open the page specified in URL and wait for it to complete loading.
Note that when this method returns, there may still be javascript
@ -369,14 +425,26 @@ class Browser(QObject, FormsMixin):
Returns True if loading was successful, False otherwise.
'''
self.current_form = None
self.page.mainFrame().load(QUrl(url))
self.page.load_url(url)
return self._wait_for_load(timeout, url)
def back(self, wait_for_load=True, timeout=default_timeout):
'''
Like clicking the back button in the browser. Waits for loading to complete.
This method will raise a Timeout exception if loading takes more than timeout seconds.
Returns True if loading was successful, False otherwise.
'''
self.page.triggerAction(self.page.Back)
if wait_for_load:
return self._wait_for_load(timeout)
@property
def dom_ready(self):
return self.page.ready_state in {'complete', 'interactive'}
return self.page.dom_loaded
def wait_till_dom_ready(self, timeout=30.0, url=None):
def wait_till_dom_ready(self, timeout=default_timeout, url=None):
timeout = self.default_timeout if timeout is default_timeout else timeout
start_time = time.time()
while not self.dom_ready:
if time.time() - start_time > timeout:
@ -384,18 +452,30 @@ class Browser(QObject, FormsMixin):
url, timeout))
self.run_for_a_time(0.1)
def start_load(self, url, timeout=30.0):
def wait_till_element_exists(self, selector, timeout=default_timeout, url=None):
timeout = self.default_timeout if timeout is default_timeout else timeout
start_time = time.time()
while self.css_select(selector) is None:
if time.time() - start_time > timeout:
raise Timeout('Loading of %r took longer than %d seconds'%(
url, timeout))
self.run_for_a_time(0.1)
def start_load(self, url, timeout=default_timeout, selector=None):
'''
Start the loading of the page at url and return once the DOM is ready,
sub-resources such as scripts/stylesheets/images/etc. may not have all
loaded.
'''
self.current_form = None
self.page.mainFrame().load(QUrl(url))
self.page.load_url(url)
self.run_for_a_time(0.01)
self.wait_till_dom_ready(timeout=timeout, url=url)
if selector is not None:
self.wait_till_element_exists(selector, timeout=timeout, url=url)
else:
self.wait_till_dom_ready(timeout=timeout, url=url)
def click(self, qwe_or_selector, wait_for_load=True, ajax_replies=0, timeout=30.0):
def click(self, qwe_or_selector, wait_for_load=True, ajax_replies=0, timeout=default_timeout):
'''
Click the :class:`QWebElement` pointed to by qwe_or_selector.
@ -408,8 +488,8 @@ class Browser(QObject, FormsMixin):
initial_count = self.nam.reply_count
qwe = qwe_or_selector
if not isinstance(qwe, QWebElement):
qwe = self.page.mainFrame().findFirstElement(qwe)
if qwe.isNull():
qwe = self.css_select(qwe)
if qwe is None:
raise ValueError('Failed to find element with selector: %r'
% qwe_or_selector)
js = '''
@ -425,7 +505,7 @@ class Browser(QObject, FormsMixin):
raise LoadError('Clicking resulted in a failed load')
def click_text_link(self, text_or_regex, selector='a[href]',
wait_for_load=True, ajax_replies=0, timeout=30.0):
wait_for_load=True, ajax_replies=0, timeout=default_timeout):
target = None
for qwe in self.page.mainFrame().findAllElements(selector):
src = unicode(qwe.toPlainText())
@ -441,6 +521,59 @@ class Browser(QObject, FormsMixin):
return self.click(target, wait_for_load=wait_for_load,
ajax_replies=ajax_replies, timeout=timeout)
def css_select(self, selector, all=False):
if all:
return tuple(self.page.mainFrame().findAllElements(selector).toList())
ans = self.page.mainFrame().findFirstElement(selector)
if ans.isNull():
ans = None
return ans
def get_image(self, qwe_or_selector):
'''
Return the image identified by qwe_or_selector as a QPixmap. If no such
image exists, the returned pixmap will be null.
'''
return self.page.get_image(qwe_or_selector)
def get_cached(self, url):
iod = self.nam.cache.data(QUrl(url))
if iod is not None:
return bytes(bytearray(iod.readAll()))
def get_resource(self, url, rtype='img', use_cache=True, timeout=default_timeout):
'''
Download a resource (image/stylesheet/script). The resource is
downloaded by visiting an simple HTML page that contains only that
resource. The resource is then returned from the cache (therefore, to
use this method you must not disable the cache). If use_cache is True
then the cache is queried before loading the resource. This can result
in a stale object if the resource has changed on the server, however,
it is a big performance boost in the common case, by avoiding a
roundtrip to the server. The resource is returned as a bytestring or None
if it could not be loaded.
'''
if not hasattr(self.nam, 'cache'):
raise RuntimeError('Cannot get resources when the cache is disabled')
if use_cache:
ans = self.get_cached(url)
if ans is not None:
return ans
try:
tag = {
'img': '<img src="%s">',
'link': '<link href="%s"></link>',
'script': '<script src="%s"></script>',
}[rtype] % prepare_string_for_xml(url, attribute=True)
except KeyError:
raise ValueError('Unknown resource type: %s' % rtype)
self.page.mainFrame().setHtml(
'''<!DOCTYPE html><html><body><div>{0}</div></body></html>'''.format(tag))
self._wait_for_load(timeout)
ans = self.get_cached(url)
if ans is not None:
return ans
def show_browser(self):
'''
@ -474,3 +607,5 @@ class Browser(QObject, FormsMixin):
def __exit__(self, *args):
self.close()

View File

@ -10,6 +10,8 @@ __docformat__ = 'restructuredtext en'
from calibre import as_unicode
default_timeout = object()
# Forms {{{
class Control(object):
@ -43,7 +45,7 @@ class Control(object):
self.qwe.setAttribute('value', as_unicode(val))
elif self.type in ('number', 'range'):
self.qwe.setAttribute('value', '%d'%int(val))
else: # Unknown type treat as text
else: # Unknown type treat as text
self.qwe.setAttribute('value', as_unicode(val))
return property(fget=fget, fset=fset)
@ -221,7 +223,7 @@ class FormsMixin(object):
return self.current_form
def submit(self, submit_control_selector=None, wait_for_load=True,
ajax_replies=0, timeout=30.0):
ajax_replies=0, timeout=default_timeout):
'''
Submit the currently selected form. Tries to autodetect the submit
control. You can override auto-detection by specifying a CSS2 selector
@ -238,7 +240,7 @@ class FormsMixin(object):
ajax_replies=ajax_replies, timeout=timeout)
def ajax_submit(self, submit_control_selector=None,
num_of_replies=1, timeout=30.0):
num_of_replies=1, timeout=default_timeout):
'''
Submit the current form. This method is meant for those forms that
use AJAX rather than a plain submit. It will block until the specified
@ -249,3 +251,4 @@ class FormsMixin(object):
wait_for_load=False, ajax_replies=num_of_replies,
timeout=timeout)