From f086a48a4a105a2e46b9a4c3786a0d7b0f53448e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 7 Mar 2017 09:34:58 +0530 Subject: [PATCH] GetBooks; Update Google Books plugin for website changes --- .../gui2/store/stores/google_books_plugin.py | 89 +++++++++++-------- 1 file changed, 54 insertions(+), 35 deletions(-) diff --git a/src/calibre/gui2/store/stores/google_books_plugin.py b/src/calibre/gui2/store/stores/google_books_plugin.py index c16fca163f..d225a84e8e 100644 --- a/src/calibre/gui2/store/stores/google_books_plugin.py +++ b/src/calibre/gui2/store/stores/google_books_plugin.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import (unicode_literals, division, absolute_import, print_function) -store_version = 3 # Needed for dynamic plugin loading +store_version = 4 # Needed for dynamic plugin loading __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' @@ -11,9 +11,9 @@ import urllib from contextlib import closing from lxml import html - from PyQt5.Qt import QUrl +import html5lib from calibre import browser, url_slash_cleaner from calibre.gui2 import open_url from calibre.gui2.store import StorePlugin @@ -22,6 +22,49 @@ from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog +def parse_html(raw): + return html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml') + + +def search_google(query, max_results=10, timeout=60, write_html_to=None): + url = 'https://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(query) + + br = browser() + + counter = max_results + with closing(br.open(url, timeout=timeout)) as f: + raw = f.read() + doc = parse_html(raw) + if write_html_to is not None: + praw = html.tostring(doc, encoding='utf-8') + open(write_html_to, 'wb').write(praw) + for data in doc.xpath('//div[@id="rso"]//div[@class="g"]'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//h3/a/@href')) + if not id: + continue + + title = ''.join(data.xpath('.//h3/a//text()')) + authors = data.xpath('descendant::div[@class="s"]//a[@class="fl" and @href]//text()') + while authors and authors[-1].strip().lower() in ('preview', 'read', 'more editions'): + authors = authors[:-1] + if not authors: + continue + author = ' & '.join(authors) + + counter -= 1 + + s = SearchResult() + s.title = title.strip() + s.author = author.strip() + s.detail_item = id.strip() + s.drm = SearchResult.DRM_UNKNOWN + + yield s + + class GoogleBooksStore(BasicStoreConfig, StorePlugin): def open(self, parent=None, detail_item=None, external=False): @@ -35,43 +78,13 @@ class GoogleBooksStore(BasicStoreConfig, StorePlugin): d.exec_() def search(self, query, max_results=10, timeout=60): - url = 'https://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(query) - - br = browser() - - counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//ol/li'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//h3/a/@href')) - if not id: - continue - - title = ''.join(data.xpath('.//h3/a//text()')) - authors = data.xpath('.//span[contains(@class, "f")]//a//text()') - while authors and authors[-1].strip().lower() in ('preview', 'read', 'more editions'): - authors = authors[:-1] - if not authors: - continue - author = ', '.join(authors) - - counter -= 1 - - s = SearchResult() - s.title = title.strip() - s.author = author.strip() - s.detail_item = id.strip() - s.drm = SearchResult.DRM_UNKNOWN - - yield s + for result in search_google(query, max_results=max_results, timeout=timeout): + yield result def get_details(self, search_result, timeout): br = browser() with closing(br.open(search_result.detail_item, timeout=timeout)) as nf: - doc = html.fromstring(nf.read()) + doc = parse_html(nf.read()) search_result.cover_url = ''.join(doc.xpath('//div[@class="sidebarcover"]//img/@src')) @@ -90,3 +103,9 @@ class GoogleBooksStore(BasicStoreConfig, StorePlugin): search_result.formats = _('Unknown') return True + + +if __name__ == '__main__': + import sys + for result in search_google(' '.join(sys.argv[1:]), write_html_to='/t/google.html'): + print (result)