mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Start work on Javascript enabled web scraper
This commit is contained in:
parent
a6f0210dd5
commit
c894d504a3
@ -320,6 +320,28 @@ def get_parsed_proxy(typ='http', debug=True):
|
||||
prints('Using http proxy', str(ans))
|
||||
return ans
|
||||
|
||||
def get_proxy_info(proxy_scheme, proxy_string):
    '''
    Parse all proxy information from a proxy string (as returned by
    get_proxies). The returned dict will have members set to None when the info
    is not available in the string. If an exception occurs parsing the string
    this method returns None.

    :param proxy_scheme: The scheme the proxy is used for, e.g. ``http``. It is
                         prepended to proxy_string to build a parseable URL.
    :param proxy_string: The proxy specification, e.g. ``user:pw@host:port``
    :return: A dict with the keys scheme, hostname, port, username and
             password, or None if the string could not be parsed.
    '''
    # The module was renamed in Python 3, support both interpreters
    try:
        from urllib.parse import urlparse
    except ImportError:
        from urlparse import urlparse
    try:
        proxy_url = u'%s://%s'%(proxy_scheme, proxy_string)
        urlinfo = urlparse(proxy_url)
        ans = {
            u'scheme': urlinfo.scheme,
            u'hostname': urlinfo.hostname,
            # Accessing .port raises ValueError for a non-numeric port
            u'port': urlinfo.port,
            u'username': urlinfo.username,
            u'password': urlinfo.password,
        }
    except Exception:
        # Best effort: a malformed proxy string is reported as None, but
        # KeyboardInterrupt/SystemExit are no longer swallowed
        return None
    return ans
|
||||
|
||||
USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
|
||||
USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'
|
||||
|
||||
|
@ -33,6 +33,7 @@ islinux = not(iswindows or isosx or isbsd)
|
||||
isfrozen = hasattr(sys, 'frozen')
|
||||
isunix = isosx or islinux
|
||||
isportable = os.environ.get('CALIBRE_PORTABLE_BUILD', None) is not None
|
||||
ispy3 = sys.version_info[0] > 2
|
||||
|
||||
try:
|
||||
preferred_encoding = locale.getpreferredencoding()
|
||||
|
@ -753,15 +753,24 @@ def open_local_file(path):
|
||||
url = QUrl.fromLocalFile(path)
|
||||
open_url(url)
|
||||
|
||||
def is_ok_to_use_qt():
|
||||
def must_use_qt():
    '''
    Make sure Qt can be used from the calling thread, creating the
    QApplication if there is not one already.

    The first thread to call this function is recorded as the GUI thread.

    :raises RuntimeError: If running on linux/bsd without a usable DISPLAY or
                          if called from a thread other than the GUI thread.
    '''
    global gui_thread, _store_app
    if (islinux or isbsd) and ':' not in os.environ.get('DISPLAY', ''):
        raise RuntimeError('X server required. If you are running on a'
                ' headless machine, use xvfb')
    if _store_app is None and QApplication.instance() is None:
        # Keep a module level reference so the application object stays alive
        _store_app = QApplication([])
    if gui_thread is None:
        gui_thread = QThread.currentThread()
    if gui_thread is not QThread.currentThread():
        raise RuntimeError('Cannot use Qt in non GUI thread')
|
||||
|
||||
def is_ok_to_use_qt():
    '''
    Non raising variant of must_use_qt(). Returns True if must_use_qt()
    succeeded and False if it raised a RuntimeError.
    '''
    try:
        must_use_qt()
    except RuntimeError:
        usable = False
    else:
        usable = True
    return usable
|
||||
|
||||
def is_gui_thread():
|
||||
global gui_thread
|
||||
|
10
src/calibre/web/jsbrowser/__init__.py
Normal file
10
src/calibre/web/jsbrowser/__init__.py
Normal file
@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
221
src/calibre/web/jsbrowser/browser.py
Normal file
221
src/calibre/web/jsbrowser/browser.py
Normal file
@ -0,0 +1,221 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, pprint
|
||||
|
||||
from PyQt4.Qt import (QObject, QNetworkAccessManager, QNetworkDiskCache,
|
||||
QNetworkProxy, QNetworkProxyFactory)
|
||||
from PyQt4.QtWebKit import QWebPage
|
||||
|
||||
from calibre import USER_AGENT, prints, get_proxies, get_proxy_info
|
||||
from calibre.constants import ispy3, config_dir
|
||||
from calibre.utils.logging import ThreadSafeLog
|
||||
from calibre.gui2 import must_use_qt
|
||||
|
||||
class WebPage(QWebPage): # {{{

    '''
    A QWebPage that reports a configurable user agent and that answers
    JavaScript dialogs (alert/confirm/prompt) itself when no view is attached
    to the page. When a view is attached, the default QWebPage behavior is
    used for the dialogs instead.
    '''

    def __init__(self, log,
            confirm_callback=None,
            prompt_callback=None,
            user_agent=USER_AGENT,
            parent=None):
        '''
        :param log: Log object, used to report unsupported content
        :param confirm_callback: Called with the message of a JavaScript
                                 confirm(), its return value is the answer.
                                 If None, confirm() is always answered True.
        :param prompt_callback: Called with the message and default value of a
                                JavaScript prompt(). Returns the value entered
                                or None to cancel. If None, prompt() is always
                                canceled.
        :param user_agent: User agent string, a false value means USER_AGENT
        :param parent: Parent QObject
        '''
        QWebPage.__init__(self, parent)

        self.log = log
        self.user_agent = user_agent if user_agent else USER_AGENT
        self.confirm_callback = confirm_callback
        self.prompt_callback = prompt_callback
        # Have content the page cannot handle reported via a signal
        self.setForwardUnsupportedContent(True)
        self.unsupportedContent.connect(self.on_unsupported_content)

    def userAgentForUrl(self, url):
        ''' The same user agent is used for all URLs '''
        return self.user_agent

    def javaScriptAlert(self, frame, msg):
        ''' Without a view, alert() messages are only printed '''
        if self.view() is not None:
            return QWebPage.javaScriptAlert(self, frame, msg)
        prints('JSBrowser alert():', unicode(msg))

    def javaScriptConfirm(self, frame, msg):
        ''' Without a view, ask confirm_callback, defaulting to True '''
        if self.view() is not None:
            return QWebPage.javaScriptConfirm(self, frame, msg)
        if self.confirm_callback is not None:
            return self.confirm_callback(unicode(msg))
        return True

    def javaScriptConsoleMessage(self, msg, lineno, source_id):
        ''' Print JavaScript console messages along with their origin '''
        prints('JSBrowser msg():%s:%s:'%(unicode(source_id), lineno), unicode(msg))

    def javaScriptPrompt(self, frame, msg, default_value, *args):
        '''
        Without a view, ask prompt_callback for the value.

        When ispy3 is True an (ok, value) tuple is returned. Otherwise only ok
        is returned and the value is stored in the result object passed as the
        first extra argument.
        '''
        if self.view() is not None:
            return QWebPage.javaScriptPrompt(self, frame, msg, default_value,
                    *args)
        if self.prompt_callback is None:
            return (False, default_value) if ispy3 else False
        value = self.prompt_callback(unicode(msg), unicode(default_value))
        # A value of None from the callback means the prompt was canceled
        ok = value is not None
        if ispy3:
            return ok, value
        if ok:
            result = args[0]
            result.clear()
            result.append(value)
        return ok

    def shouldInterruptJavaScript(self):
        ''' Without a view, always answer True '''
        if self.view() is not None:
            return QWebPage.shouldInterruptJavaScript(self)
        return True

    def on_unsupported_content(self, reply):
        ''' Content the page cannot handle is logged and otherwise ignored '''
        # Convert the QUrl to a string explicitly, formatting the QUrl object
        # itself would put its repr into the log
        self.log.warn('Unsupported content, ignoring: %s'%
                unicode(reply.url().toString()))

# }}}
|
||||
|
||||
class ProxyFactory(QNetworkProxyFactory): # {{{

    '''
    Proxy factory that maps the proxies returned by get_proxies() to
    QNetworkProxy objects, keyed by lower case scheme. Queries for a scheme
    without a configured proxy get QNetworkProxy.DefaultProxy.
    '''

    def __init__(self, log):
        '''
        :param log: Log object, the proxies that will be used are reported at
                    debug level
        '''
        QNetworkProxyFactory.__init__(self)
        proxies = get_proxies()
        self.proxies = {}
        for scheme, proxy_string in proxies.items():
            scheme = scheme.lower()
            info = get_proxy_info(scheme, proxy_string)
            if info is None: continue
            hn, port = info['hostname'], info['port']
            # A proxy without both a host and a port is unusable
            if not hn or not port: continue
            # Never write the proxy password to the log in clear text
            loggable = dict(info)
            if loggable.get('password'):
                loggable['password'] = '********'
            log.debug('JSBrowser using proxy:', pprint.pformat(loggable))
            # Everything other than socks5 is treated as an HTTP proxy
            pt = {'socks5':QNetworkProxy.Socks5Proxy}.get(scheme,
                    QNetworkProxy.HttpProxy)
            proxy = QNetworkProxy(pt, hn, port)
            un, pw = info['username'], info['password']
            if un:
                proxy.setUser(un)
            if pw:
                proxy.setPassword(pw)
            self.proxies[scheme] = proxy

        self.default_proxy = QNetworkProxy(QNetworkProxy.DefaultProxy)

    def queryProxy(self, query):
        ''' Return a one element list with the proxy for the query's scheme '''
        scheme = unicode(query.protocolTag()).lower()
        return [self.proxies.get(scheme, self.default_proxy)]
# }}}
|
||||
|
||||
class NetworkAccessManager(QNetworkAccessManager): # {{{

    '''
    QNetworkAccessManager that logs all requests and replies, uses the
    proxies from ProxyFactory, optionally uses an on disk cache and ignores
    SSL errors.
    '''

    # Map the operation constants to names for use in log messages
    OPERATION_NAMES = { getattr(QNetworkAccessManager, '%sOperation'%x) :
            x.upper() for x in ('Head', 'Get', 'Put', 'Post', 'Delete',
                'Custom')
    }

    def __init__(self, log, use_disk_cache=True, parent=None):
        '''
        :param log: Log object used for request/reply logging
        :param use_disk_cache: If True, cache data in the caches/jsbrowser
                               sub-directory of the calibre config directory
        :param parent: Parent QObject
        '''
        QNetworkAccessManager.__init__(self, parent)
        self.log = log
        if use_disk_cache:
            self.cache = QNetworkDiskCache(self)
            self.cache.setCacheDirectory(os.path.join(config_dir, 'caches',
                'jsbrowser'))
            self.setCache(self.cache)
        self.sslErrors.connect(self.on_ssl_errors)
        # Keep a reference to the factory on the python side
        self.pf = ProxyFactory(log)
        self.setProxyFactory(self.pf)
        self.finished.connect(self.on_finished)

    def on_ssl_errors(self, reply, errors):
        # NOTE: Ignoring SSL errors means server certificates are not
        # verified. The errors are logged so that this is at least visible.
        reply.ignoreSslErrors()
        self.log.warn('Ignoring SSL errors for %s: %s' % (
            unicode(reply.url().toString()),
            ', '.join(unicode(e.errorString()) for e in errors)))

    @staticmethod
    def _format_headers(message):
        ''' Return one log line per raw header of a request or reply '''
        lines = []
        for h in message.rawHeaderList():
            try:
                d = ' %s: %s' % (h, message.rawHeader(h))
            except Exception:
                # Fall back to the repr for headers that cannot be formatted
                d = ' %r: %r' % (h, message.rawHeader(h))
            lines.append(d)
        return lines

    def createRequest(self, operation, request, data):
        ''' Log the request (headers and start of the data), then create it '''
        url = unicode(request.url().toString())
        operation_name = self.OPERATION_NAMES[operation]
        debug = []
        debug.append(('Request: %s %s' % (operation_name, url)))
        debug.extend(self._format_headers(request))

        if data is not None:
            # peek() so that the data is still available for the real request
            raw = data.peek(1024)
            try:
                raw = raw.decode('utf-8')
            except Exception:
                raw = repr(raw)
            debug.append(' Request data: %s'%raw)

        self.log.debug('\n'.join(debug))
        return QNetworkAccessManager.createRequest(self, operation, request,
                data)

    def on_finished(self, reply):
        ''' Log failed replies as warnings, successful ones at debug level '''
        reply_url = unicode(reply.url().toString())

        if reply.error():
            self.log.warn("Reply error: %s - %d (%s)" %
                (reply_url, reply.error(), reply.errorString()))
        else:
            debug = []
            debug.append("Reply successful: %s" % reply_url)
            debug.extend(self._format_headers(reply))
            self.log.debug('\n'.join(debug))
# }}}
|
||||
|
||||
class Browser(QObject):

    '''
    Ties together a WebPage and a NetworkAccessManager and loads the
    JavaScript libraries (jquery and jquery.simulate) as unicode text.
    Requires Qt to be usable from the calling thread, see must_use_qt().
    '''

    def __init__(self,
            # Logging. If None, uses a default log, which does not output
            # debugging info
            log = None,
            # Receives a string and returns True/False. By default, returns
            # True for all strings
            confirm_callback=None,

            # Prompt callback. Receives a msg string and a default value
            # string. Should return the user input value or None if the user
            # canceled the prompt. By default returns None.
            prompt_callback=None,

            # User agent to be used
            user_agent=USER_AGENT,

            # If True a disk cache is used
            use_disk_cache=True,

            # Verbosity
            verbosity = 0
        ):
        must_use_qt()
        QObject.__init__(self)

        log = ThreadSafeLog() if log is None else log
        if verbosity:
            # Any non-zero verbosity turns on debug output
            log.filter_level = log.DEBUG

        # Both libraries are bundled resources stored as UTF-8
        for attr, resource in (
                ('jquery_lib', 'content_server/jquery.js'),
                ('simulate_lib', 'jquery.simulate.js'),
        ):
            raw = P(resource, data=True, allow_user_override=False)
            setattr(self, attr, raw.decode('utf-8'))

        page_options = dict(confirm_callback=confirm_callback,
                prompt_callback=prompt_callback, user_agent=user_agent,
                parent=self)
        self.page = WebPage(log, **page_options)
        self.nam = NetworkAccessManager(log, use_disk_cache=use_disk_cache, parent=self)
        self.page.setNetworkAccessManager(self.nam)

    def visit(self, url):
        # Not implemented yet
        pass
|
||||
|
Loading…
x
Reference in New Issue
Block a user