From 7d9ebf412d76370833a6cdb6cb7160cd12f7d9d6 Mon Sep 17 00:00:00 2001 From: josdion Date: Sun, 21 Feb 2021 01:00:56 +0200 Subject: [PATCH] Fix chitanka store Update html parser to be consistent with the current version of the site. --- .../gui2/store/stores/chitanka_plugin.py | 117 ++++++++---------- 1 file changed, 54 insertions(+), 63 deletions(-) diff --git a/src/calibre/gui2/store/stores/chitanka_plugin.py b/src/calibre/gui2/store/stores/chitanka_plugin.py index e2f3dc800d..7caf858200 100644 --- a/src/calibre/gui2/store/stores/chitanka_plugin.py +++ b/src/calibre/gui2/store/stores/chitanka_plugin.py @@ -27,6 +27,44 @@ from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog +def parse_book_page(doc, base_url, counter): + + for data in doc.xpath('//div[@class="booklist"]/div/div'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//div[@class="media-body"]/a[@class="booklink"]/@href')).strip() + if not id: + continue + + counter -= 1 + + s = SearchResult() + s.cover_url = 'http:' + ''.join( + data.xpath('.//div[@class="media-left"]/a[@class="booklink"]/div/img/@src')).strip() + + s.title = ''.join(data.xpath('.//div[@class="media-body"]/a[@class="booklink"]/i/text()')).strip() + alternative_headline = data.xpath('.//div[@class="media-body"]/div[@itemprop="alternativeHeadline"]/text()') + if len(alternative_headline) > 0: + s.title = "{} ({})".format(s.title, ''.join(alternative_headline).strip()) + + s.author = ', '.join(data.xpath('.//div[@class="media-body"]/div[@class="bookauthor"]/span/a/text()')).strip(', ') + s.detail_item = id + s.drm = SearchResult.DRM_UNLOCKED + s.downloads['FB2'] = base_url + ''.join(data.xpath( + './/div[@class="media-body"]/div[@class="download-links"]/div/a[contains(@class,"dl-fb2")]/@href')).strip().replace( + '.zip', '') + s.downloads['EPUB'] = base_url + ''.join(data.xpath( + './/div[@class="media-body"]/div[@class="download-links"]/div/a[contains(@class,"dl-epub")]/@href')).strip().replace( + '.zip', '') + s.downloads['TXT'] = base_url + ''.join(data.xpath( + './/div[@class="media-body"]/div[@class="download-links"]/div/a[contains(@class,"dl-txt")]/@href')).strip().replace( + '.zip', '') + s.formats = 'FB2, EPUB, TXT' + yield s + + return counter + class ChitankaStore(BasicStoreConfig, StorePlugin): def open(self, parent=None, detail_item=None, external=False): @@ -46,12 +84,10 @@ class ChitankaStore(BasicStoreConfig, StorePlugin): d.exec_() def search(self, query, max_results=10, timeout=60): - # check for cyrillic symbols before performing search if isinstance(query, bytes): query = query.decode('utf-8') - uquery = query.strip() - reObj = re.search(u'^[а-яА-Я\\d\\s]{3,}$', uquery) - if not reObj: + + if len(query) < 3: return base_url = 'http://chitanka.info' @@ -64,73 +100,28 @@ class ChitankaStore(BasicStoreConfig, StorePlugin): with closing(br.open(url, timeout=timeout)) as f: f = f.read().decode('utf-8') doc = html.fromstring(f) + counter = yield from parse_book_page(doc, base_url, counter) + if counter <= 0: + return - for data in doc.xpath('//ul[@class="superlist booklist"]/li'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip() - if not id: + # search for author names + for data in doc.xpath('//ul[@class="superlist"][1]/li/dl/dt'): + author_url = ''.join(data.xpath('.//a[contains(@href,"/person/")]/@href')) + if author_url == '': continue - counter -= 1 + br2 = browser() + with closing(br2.open(base_url + author_url, timeout=timeout)) as f: + f = f.read().decode('utf-8') + doc = html.fromstring(f) + counter = yield from parse_book_page(doc, base_url, counter) + if counter <= 0: + break - s = SearchResult() - s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip() - s.title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip() - s.author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip() - s.detail_item = id - s.drm = SearchResult.DRM_UNLOCKED - s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '') - s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '') - s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '') - s.formats = 'FB2, EPUB, TXT, SFB' - yield s except HTTPError as e: if e.code == 404: return else: raise - # search for author names - for data in doc.xpath('//ul[@class="superlist"][1]/li/dl/dt'): - author_url = ''.join(data.xpath('.//a[contains(@href,"/person/")]/@href')) - if author_url == '': - continue - if counter <= 0: - break - br2 = browser() - with closing(br2.open(base_url + author_url, timeout=timeout)) as f: - if counter <= 0: - break - f = f.read().decode('utf-8') - doc2 = html.fromstring(f) - - # search for book title - for data in doc2.xpath('//ul[@class="superlist booklist"]/li'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip() - if not id: - continue - - title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip() - author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip() - if title.lower().find(query.lower()) == -1 and author.lower().find(query.lower()) == -1: - continue - - counter -= 1 - - s = SearchResult() - s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip() - s.title = title - s.author = author - s.detail_item = id - s.drm = SearchResult.DRM_UNLOCKED - s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '') - s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '') - s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '') - s.formats = 'FB2, EPUB, TXT, SFB' - yield s