Refactor Woblink store plugin to not use Qt WebKit

This commit is contained in:
Kovid Goyal 2016-04-27 08:09:45 +05:30
parent 60a531283c
commit c82c1235e1

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function) from __future__ import (unicode_literals, division, absolute_import, print_function)
store_version = 10 # Needed for dynamic plugin loading store_version = 11 # Needed for dynamic plugin loading
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2011-2014, Tomasz Długosz <tomek3d@gmail.com>' __copyright__ = '2011-2014, Tomasz Długosz <tomek3d@gmail.com>'
@ -11,29 +11,66 @@ import urllib
from base64 import b64encode from base64 import b64encode
from lxml import html from lxml import html
from mechanize import Request
from PyQt5.Qt import QUrl from PyQt5.Qt import QUrl
from calibre import url_slash_cleaner from calibre import url_slash_cleaner, browser
from calibre.gui2 import open_url from calibre.gui2 import open_url
from calibre.gui2.store import StorePlugin from calibre.gui2.store import StorePlugin
from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.search_result import SearchResult
from calibre.gui2.store.web_store_dialog import WebStoreDialog from calibre.gui2.store.web_store_dialog import WebStoreDialog
from calibre.ebooks.chardet import strip_encoding_declarations def search(query, max_results=10, timeout=60):
from calibre.utils.ipc.simple_worker import fork_job, WorkerError url = 'http://woblink.com/publication/ajax?mode=none&query=' + urllib.quote_plus(query.encode('utf-8'))
if max_results > 10:
if max_results > 20:
url += '&limit=30'
else:
url += '&limit=20'
br = browser(user_agent='CalibreCrawler/1.0')
br.set_handle_gzip(True)
rq = Request(url, headers={
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'X-Requested-With': 'XMLHttpRequest',
'Referrer':'http://woblink.com/ebooki-kategorie',
'Cache-Control':'max-age=0',
}, data=urllib.urlencode({
'nw_filtry_filtr_zakrescen_formularz[min]':'0',
'nw_filtry_filtr_zakrescen_formularz[max]':'350',
}))
r = br.open(rq)
raw = r.read()
doc = html.fromstring('<html><body>' + raw.decode('utf-8') + '</body></html>')
counter = max_results
js_browser = ''' for data in doc.xpath('//div[@class="nw_katalog_lista_ksiazka "]'):
from calibre.web.jsbrowser.browser import Browser, Timeout if counter <= 0:
import urllib break
id = ''.join(data.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'))
if not id:
continue
cover_url = ''.join(data.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'))
title = ''.join(data.xpath('.//h3[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'))
author = ', '.join(data.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'))
price = ''.join(data.xpath('.//div[@class="nw_opcjezakupu_cena"]/text()'))
formats = ', '.join(data.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_format"]/span/text()'))
s = SearchResult()
s.cover_url = 'http://woblink.com' + cover_url
s.title = title.strip()
s.author = author.strip()
s.price = price + ''
s.detail_item = id.strip()
s.formats = formats
counter -= 1
s.drm = SearchResult.DRM_LOCKED if 'DRM' in formats else SearchResult.DRM_UNLOCKED
yield s
def get_results(url, timeout):
browser = Browser(default_timeout=timeout, user_agent='CalibreCrawler/1.0')
browser.visit(url)
browser.wait_for_element('#nw_kontent_main')
return browser.html
'''
class WoblinkStore(BasicStoreConfig, StorePlugin): class WoblinkStore(BasicStoreConfig, StorePlugin):
@ -56,49 +93,9 @@ class WoblinkStore(BasicStoreConfig, StorePlugin):
d.exec_() d.exec_()
def search(self, query, max_results=10, timeout=60): def search(self, query, max_results=10, timeout=60):
url = 'http://woblink.com/ebooki-kategorie?query=' + urllib.quote_plus(query.encode('utf-8')) for s in search(query, max_results, timeout):
if max_results > 10: yield s
if max_results > 20:
url += '&limit=30'
else:
url += '&limit=20'
counter = max_results if __name__ == '__main__':
from pprint import pprint
try: pprint(list(search('Franciszek')))
results = fork_job(js_browser,'get_results', (url, timeout,), module_is_source_code=True)
except WorkerError as e:
raise Exception('Could not get results: %s'%e.orig_tb)
doc = html.fromstring(strip_encoding_declarations(results['result']))
for data in doc.xpath('//div[@class="nw_katalog_lista_ksiazka "]'):
if counter <= 0:
break
id = ''.join(data.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'))
if not id:
continue
cover_url = ''.join(data.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'))
title = ''.join(data.xpath('.//h3[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'))
author = ', '.join(data.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'))
price = ''.join(data.xpath('.//div[@class="nw_opcjezakupu_cena"]/text()'))
formats = ', '.join(data.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_format"]/span/text()'))
s = SearchResult()
s.cover_url = 'http://woblink.com' + cover_url
s.title = title.strip()
s.author = author.strip()
s.price = price + ''
s.detail_item = id.strip()
s.formats = formats
if 'DRM' in formats:
s.drm = SearchResult.DRM_LOCKED
counter -= 1
yield s
else:
s.drm = SearchResult.DRM_UNLOCKED
counter -= 1
yield s