Get Books: Update Google Books plugin for website changes

This commit is contained in:
Kovid Goyal 2017-03-07 09:34:58 +05:30
parent c4a949bfd7
commit f086a48a4a

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function) from __future__ import (unicode_literals, division, absolute_import, print_function)
store_version = 3 # Needed for dynamic plugin loading store_version = 4 # Needed for dynamic plugin loading
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
@ -11,9 +11,9 @@ import urllib
from contextlib import closing from contextlib import closing
from lxml import html from lxml import html
from PyQt5.Qt import QUrl from PyQt5.Qt import QUrl
import html5lib
from calibre import browser, url_slash_cleaner from calibre import browser, url_slash_cleaner
from calibre.gui2 import open_url from calibre.gui2 import open_url
from calibre.gui2.store import StorePlugin from calibre.gui2.store import StorePlugin
@ -22,6 +22,49 @@ from calibre.gui2.store.search_result import SearchResult
from calibre.gui2.store.web_store_dialog import WebStoreDialog from calibre.gui2.store.web_store_dialog import WebStoreDialog
def parse_html(raw):
    """Parse the HTML in *raw* with html5lib and return the resulting tree.

    The tree is built with html5lib's ``lxml`` treebuilder, and element
    namespacing is switched off (``namespaceHTMLElements=False``); callers
    in this module query the result with un-prefixed tag names in xpath.
    """
    tree = html5lib.parse(
        raw,
        treebuilder='lxml',
        namespaceHTMLElements=False,
    )
    return tree
def search_google(query, max_results=10, timeout=60, write_html_to=None):
    """Search Google Books for *query* and yield a SearchResult per usable hit.

    :param query: Search terms; URL-quoted before being sent.
    :param max_results: Upper bound on the number of results yielded. A
        value of zero or less yields nothing.
    :param timeout: Timeout passed to the browser when opening the URL.
    :param write_html_to: Optional path. When not None, the parsed page is
        serialized as UTF-8 and written there (useful when debugging the
        xpath expressions against the live markup).

    Hits without a title link, or without any author link left after the
    trailing non-author links are dropped, are skipped and do not count
    towards *max_results*.
    """
    url = 'https://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(query)

    br = browser()

    # Read the whole body up front so the response is closed before this
    # generator starts yielding (a consumer may pause it indefinitely).
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()

    doc = parse_html(raw)
    if write_html_to is not None:
        praw = html.tostring(doc, encoding='utf-8')
        # Use a context manager so the dump file is flushed and closed
        # deterministically instead of relying on garbage collection.
        with open(write_html_to, 'wb') as dump:
            dump.write(praw)

    # Link texts that trail the author links in a hit but are not authors.
    non_author_labels = ('preview', 'read', 'more editions')

    counter = max_results
    for data in doc.xpath('//div[@id="rso"]//div[@class="g"]'):
        if counter <= 0:
            break

        detail_url = ''.join(data.xpath('.//h3/a/@href'))
        if not detail_url:
            continue

        title = ''.join(data.xpath('.//h3/a//text()'))
        authors = data.xpath('descendant::div[@class="s"]//a[@class="fl" and @href]//text()')
        # Only trailing entries are stripped; a matching label in the
        # middle of the list is left alone.
        while authors and authors[-1].strip().lower() in non_author_labels:
            authors = authors[:-1]
        if not authors:
            continue
        author = ' & '.join(authors)

        counter -= 1

        s = SearchResult()
        s.title = title.strip()
        s.author = author.strip()
        s.detail_item = detail_url.strip()
        s.drm = SearchResult.DRM_UNKNOWN

        yield s
class GoogleBooksStore(BasicStoreConfig, StorePlugin): class GoogleBooksStore(BasicStoreConfig, StorePlugin):
def open(self, parent=None, detail_item=None, external=False): def open(self, parent=None, detail_item=None, external=False):
@ -35,43 +78,13 @@ class GoogleBooksStore(BasicStoreConfig, StorePlugin):
d.exec_() d.exec_()
def search(self, query, max_results=10, timeout=60): def search(self, query, max_results=10, timeout=60):
url = 'https://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(query) for result in search_google(query, max_results=max_results, timeout=timeout):
yield result
br = browser()
counter = max_results
with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())
for data in doc.xpath('//ol/li'):
if counter <= 0:
break
id = ''.join(data.xpath('.//h3/a/@href'))
if not id:
continue
title = ''.join(data.xpath('.//h3/a//text()'))
authors = data.xpath('.//span[contains(@class, "f")]//a//text()')
while authors and authors[-1].strip().lower() in ('preview', 'read', 'more editions'):
authors = authors[:-1]
if not authors:
continue
author = ', '.join(authors)
counter -= 1
s = SearchResult()
s.title = title.strip()
s.author = author.strip()
s.detail_item = id.strip()
s.drm = SearchResult.DRM_UNKNOWN
yield s
def get_details(self, search_result, timeout): def get_details(self, search_result, timeout):
br = browser() br = browser()
with closing(br.open(search_result.detail_item, timeout=timeout)) as nf: with closing(br.open(search_result.detail_item, timeout=timeout)) as nf:
doc = html.fromstring(nf.read()) doc = parse_html(nf.read())
search_result.cover_url = ''.join(doc.xpath('//div[@class="sidebarcover"]//img/@src')) search_result.cover_url = ''.join(doc.xpath('//div[@class="sidebarcover"]//img/@src'))
@ -90,3 +103,9 @@ class GoogleBooksStore(BasicStoreConfig, StorePlugin):
search_result.formats = _('Unknown') search_result.formats = _('Unknown')
return True return True
if __name__ == '__main__':
    # Manual check from the command line: search for the words given as
    # arguments, dump the fetched page to disk and print every result.
    import sys
    terms = ' '.join(sys.argv[1:])
    hits = search_google(terms, write_html_to='/t/google.html')
    for hit in hits:
        print(hit)