Start work on a JavaScript-enabled web scraper

This commit is contained in:
Kovid Goyal 2011-09-16 15:32:02 -06:00
parent a6f0210dd5
commit c894d504a3
5 changed files with 266 additions and 3 deletions

View File

@ -320,6 +320,28 @@ def get_parsed_proxy(typ='http', debug=True):
prints('Using http proxy', str(ans)) prints('Using http proxy', str(ans))
return ans return ans
def get_proxy_info(proxy_scheme, proxy_string):
    '''
    Parse all proxy information from a proxy string (as returned by
    get_proxies). The returned dict will have members set to None when the info
    is not available in the string. If an exception occurs parsing the string
    this method returns None.

    :param proxy_scheme: Scheme that is prefixed to ``proxy_string`` to build
                         the URL that gets parsed.
    :param proxy_string: The proxy location, without a scheme.
    :return: A dict with the keys ``scheme``, ``hostname``, ``port``,
             ``username`` and ``password``, or None if parsing failed.
    '''
    # The module was renamed in Python 3; support both so that the function
    # does not blow up with an ImportError before parsing even starts.
    try:
        from urlparse import urlparse
    except ImportError:
        from urllib.parse import urlparse
    try:
        urlinfo = urlparse(u'%s://%s' % (proxy_scheme, proxy_string))
        # Attribute access is inside the try block on purpose: reading
        # ``port`` is what raises for a non numeric port.
        return {
            u'scheme': urlinfo.scheme,
            u'hostname': urlinfo.hostname,
            u'port': urlinfo.port,
            u'username': urlinfo.username,
            u'password': urlinfo.password,
        }
    except Exception:
        # Deliberately best effort, but do not swallow KeyboardInterrupt or
        # SystemExit the way a bare except would.
        return None
USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13' USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'

View File

@ -33,6 +33,7 @@ islinux = not(iswindows or isosx or isbsd)
isfrozen = hasattr(sys, 'frozen') isfrozen = hasattr(sys, 'frozen')
isunix = isosx or islinux isunix = isosx or islinux
isportable = os.environ.get('CALIBRE_PORTABLE_BUILD', None) is not None isportable = os.environ.get('CALIBRE_PORTABLE_BUILD', None) is not None
ispy3 = sys.version_info[0] > 2
try: try:
preferred_encoding = locale.getpreferredencoding() preferred_encoding = locale.getpreferredencoding()

View File

@ -753,15 +753,24 @@ def open_local_file(path):
url = QUrl.fromLocalFile(path) url = QUrl.fromLocalFile(path)
open_url(url) open_url(url)
def is_ok_to_use_qt(): def must_use_qt():
global gui_thread, _store_app global gui_thread, _store_app
if (islinux or isbsd) and ':' not in os.environ.get('DISPLAY', ''): if (islinux or isbsd) and ':' not in os.environ.get('DISPLAY', ''):
return False raise RuntimeError('X server required. If you are running on a'
' headless machine, use xvfb')
if _store_app is None and QApplication.instance() is None: if _store_app is None and QApplication.instance() is None:
_store_app = QApplication([]) _store_app = QApplication([])
if gui_thread is None: if gui_thread is None:
gui_thread = QThread.currentThread() gui_thread = QThread.currentThread()
return gui_thread is QThread.currentThread() if gui_thread is not QThread.currentThread():
raise RuntimeError('Cannot use Qt in non GUI thread')
def is_ok_to_use_qt():
    '''
    Boolean wrapper around must_use_qt(): returns False if must_use_qt()
    raises a RuntimeError and True if it returns normally. Any other
    exception is propagated unchanged.
    '''
    try:
        must_use_qt()
    except RuntimeError:
        usable = False
    else:
        usable = True
    return usable
def is_gui_thread(): def is_gui_thread():
global gui_thread global gui_thread

View File

@ -0,0 +1,10 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@ -0,0 +1,221 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, pprint
from PyQt4.Qt import (QObject, QNetworkAccessManager, QNetworkDiskCache,
QNetworkProxy, QNetworkProxyFactory)
from PyQt4.QtWebKit import QWebPage
from calibre import USER_AGENT, prints, get_proxies, get_proxy_info
from calibre.constants import ispy3, config_dir
from calibre.utils.logging import ThreadSafeLog
from calibre.gui2 import must_use_qt
class WebPage(QWebPage): # {{{

    '''
    A QWebPage that can run without an attached view.

    When a view is attached (``self.view()`` is not None) the JavaScript
    dialog hooks defer to the stock QWebPage implementations. Without a view
    they are answered programmatically, using the optional callbacks passed to
    the constructor.
    '''

    def __init__(self, log,
            confirm_callback=None,
            prompt_callback=None,
            user_agent=USER_AGENT,
            parent=None):
        '''
        :param log: Log object, used to report unsupported content.
        :param confirm_callback: Called with the message of a JavaScript
            confirm() when there is no view; its return value is returned.
        :param prompt_callback: Called with the message and default value of a
            JavaScript prompt() when there is no view. Returning None means
            the prompt was cancelled.
        :param user_agent: User agent string. A false value selects USER_AGENT.
        :param parent: Passed on to QWebPage.
        '''
        QWebPage.__init__(self, parent)

        self.log = log
        self.user_agent = user_agent if user_agent else USER_AGENT
        self.confirm_callback = confirm_callback
        self.prompt_callback = prompt_callback
        # Have the page emit unsupportedContent so that it can be logged
        self.setForwardUnsupportedContent(True)
        self.unsupportedContent.connect(self.on_unsupported_content)

    def userAgentForUrl(self, url):
        ''' The same user agent is used for every URL. '''
        return self.user_agent

    def javaScriptAlert(self, frame, msg):
        ''' Without a view, alert() messages are simply printed. '''
        if self.view() is not None:
            return QWebPage.javaScriptAlert(self, frame, msg)
        prints('JSBrowser alert():', unicode(msg))

    def javaScriptConfirm(self, frame, msg):
        '''
        Without a view, ask confirm_callback. If there is no callback either,
        the confirmation is accepted.
        '''
        if self.view() is not None:
            return QWebPage.javaScriptConfirm(self, frame, msg)
        if self.confirm_callback is not None:
            return self.confirm_callback(unicode(msg))
        return True

    def javaScriptConsoleMessage(self, msg, lineno, source_id):
        ''' Print JavaScript console messages together with their origin. '''
        prints('JSBrowser msg():%s:%s:'%(unicode(source_id), lineno), unicode(msg))

    def javaScriptPrompt(self, frame, msg, default_value, *args):
        '''
        Without a view, ask prompt_callback for the value.

        The shape of the result depends on ispy3: when it is true an
        ``(ok, value)`` tuple is returned, otherwise only ``ok`` is returned
        and the value is written into the extra positional argument.
        '''
        if self.view() is not None:
            return QWebPage.javaScriptPrompt(self, frame, msg, default_value,
                    *args)
        if self.prompt_callback is None:
            return (False, default_value) if ispy3 else False
        value = self.prompt_callback(unicode(msg), unicode(default_value))
        # A None answer from the callback means the prompt was cancelled
        ok = value is not None
        if ispy3:
            return ok, value
        if ok:
            # The result is handed back through the output argument
            result = args[0]
            result.clear()
            result.append(value)
        return ok

    def shouldInterruptJavaScript(self):
        ''' Without a view, always answer True. '''
        if self.view() is not None:
            return QWebPage.shouldInterruptJavaScript(self)
        return True

    def on_unsupported_content(self, reply):
        ''' Log replies forwarded as unsupported content; nothing else is done with them. '''
        # Use the textual form of the URL. Interpolating the QUrl object
        # itself puts its repr into the log.
        self.log.warn('Unsupported content, ignoring: %s'%
                unicode(reply.url().toString()))

# }}}
class ProxyFactory(QNetworkProxyFactory): # {{{

    '''
    Proxy factory holding one QNetworkProxy per scheme, built from the
    mapping returned by get_proxies(). Schemes without a usable entry get a
    QNetworkProxy of type DefaultProxy.
    '''

    def __init__(self, log):
        QNetworkProxyFactory.__init__(self)
        configured = get_proxies()
        self.proxies = {}
        for raw_scheme, location in configured.iteritems():
            scheme = raw_scheme.lower()
            details = get_proxy_info(scheme, location)
            if details is None:
                # The proxy string could not be parsed
                continue
            host, port = details['hostname'], details['port']
            if not (host and port):
                # Both a host name and a port are needed
                continue
            log.debug('JSBrowser using proxy:', pprint.pformat(details))
            # Anything that is not socks5 is treated as an HTTP proxy
            if scheme == 'socks5':
                proxy_type = QNetworkProxy.Socks5Proxy
            else:
                proxy_type = QNetworkProxy.HttpProxy
            proxy = QNetworkProxy(proxy_type, host, port)
            user, password = details['username'], details['password']
            if user:
                proxy.setUser(user)
            if password:
                proxy.setPassword(password)
            self.proxies[scheme] = proxy

        self.default_proxy = QNetworkProxy(QNetworkProxy.DefaultProxy)

    def queryProxy(self, query):
        ''' Return a one element list with the proxy for the protocol tag of query. '''
        tag = unicode(query.protocolTag()).lower()
        chosen = self.proxies.get(tag, self.default_proxy)
        return [chosen]

# }}}
class NetworkAccessManager(QNetworkAccessManager): # {{{

    '''
    QNetworkAccessManager that logs every request and reply, routes traffic
    through a ProxyFactory, optionally uses a disk cache and ignores SSL
    errors.
    '''

    # Maps the QNetworkAccessManager operation constants to upper case names,
    # used only for the debug output
    OPERATION_NAMES = { getattr(QNetworkAccessManager, '%sOperation'%x) :
            x.upper() for x in ('Head', 'Get', 'Put', 'Post', 'Delete',
                'Custom')
    }

    def __init__(self, log, use_disk_cache=True, parent=None):
        '''
        :param log: Log object that receives the request/reply output.
        :param use_disk_cache: If True a QNetworkDiskCache stored in
            ``caches/jsbrowser`` under config_dir is installed.
        :param parent: Passed on to QNetworkAccessManager.
        '''
        QNetworkAccessManager.__init__(self, parent)
        self.log = log
        if use_disk_cache:
            self.cache = QNetworkDiskCache(self)
            self.cache.setCacheDirectory(os.path.join(config_dir, 'caches',
                'jsbrowser'))
            self.setCache(self.cache)
        self.sslErrors.connect(self.on_ssl_errors)
        # Keep a reference to the factory on the instance
        self.pf = ProxyFactory(log)
        self.setProxyFactory(self.pf)
        self.finished.connect(self.on_finished)

    def on_ssl_errors(self, reply, errors):
        # NOTE(review): every SSL error is ignored, without logging, so
        # certificates are effectively not verified. Confirm this is intended.
        reply.ignoreSslErrors()

    def _header_lines(self, message):
        ''' Return one formatted line per raw header of a request or reply. '''
        lines = []
        for h in message.rawHeaderList():
            try:
                d = ' %s: %s' % (h, message.rawHeader(h))
            except Exception:
                # Fall back to the repr if the header cannot be formatted
                d = ' %r: %r' % (h, message.rawHeader(h))
            lines.append(d)
        return lines

    def createRequest(self, operation, request, data):
        ''' Log the outgoing request, then let the base class create it. '''
        url = unicode(request.url().toString())
        # Never let an operation missing from the table break a request just
        # because of the debug output
        operation_name = self.OPERATION_NAMES.get(operation, 'UNKNOWN')
        debug = []
        debug.append(('Request: %s %s' % (operation_name, url)))
        debug.extend(self._header_lines(request))

        if data is not None:
            # peek() so that the request data itself is not consumed
            raw = data.peek(1024)
            try:
                raw = raw.decode('utf-8')
            except Exception:
                raw = repr(raw)
            debug.append(' Request data: %s'%raw)

        self.log.debug('\n'.join(debug))
        return QNetworkAccessManager.createRequest(self, operation, request,
                data)

    def on_finished(self, reply):
        ''' Log failed replies as warnings and successful ones as debug output. '''
        reply_url = unicode(reply.url().toString())

        if reply.error():
            self.log.warn("Reply error: %s - %d (%s)" %
                (reply_url, reply.error(), reply.errorString()))
        else:
            debug = []
            debug.append("Reply successful: %s" % reply_url)
            debug.extend(self._header_lines(reply))
            self.log.debug('\n'.join(debug))

# }}}
class Browser(QObject):

    '''
    Owns a WebPage together with the NetworkAccessManager installed on it,
    and keeps the text of the jQuery and jquery.simulate scripts.
    '''

    def __init__(self,
            log=None,
            confirm_callback=None,
            prompt_callback=None,
            user_agent=USER_AGENT,
            use_disk_cache=True,
            verbosity=0
        ):
        '''
        :param log: Logging. If None, uses a default log, which does not
            output debugging info.
        :param confirm_callback: Receives a string and returns True/False. By
            default, returns True for all strings.
        :param prompt_callback: Prompt callback. Receives a msg string and a
            default value string. Should return the user input value or None
            if the user canceled the prompt. By default returns None.
        :param user_agent: User agent to be used.
        :param use_disk_cache: If True a disk cache is used.
        :param verbosity: Verbosity. Any true value sets the filter level of
            the log to DEBUG.
        '''
        must_use_qt()
        QObject.__init__(self)

        if log is None:
            log = ThreadSafeLog()
        if verbosity:
            log.filter_level = log.DEBUG

        def load_script(resource):
            # Both scripts are read as data and decoded as UTF-8
            return P(resource, data=True,
                    allow_user_override=False).decode('utf-8')

        self.jquery_lib = load_script('content_server/jquery.js')
        self.simulate_lib = load_script('jquery.simulate.js')

        self.page = WebPage(log, confirm_callback=confirm_callback,
                prompt_callback=prompt_callback, user_agent=user_agent,
                parent=self)
        self.nam = NetworkAccessManager(log, use_disk_cache=use_disk_cache,
                parent=self)
        self.page.setNetworkAccessManager(self.nam)

    def visit(self, url):
        ''' Placeholder, currently does nothing. '''
        pass