mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix chitanka store
Update the HTML parser to be consistent with the current version of the site.
This commit is contained in:
parent
8e98cb061f
commit
7d9ebf412d
@ -27,6 +27,44 @@ from calibre.gui2.store.search_result import SearchResult
|
|||||||
from calibre.gui2.store.web_store_dialog import WebStoreDialog
|
from calibre.gui2.store.web_store_dialog import WebStoreDialog
|
||||||
|
|
||||||
|
|
||||||
|
def parse_book_page(doc, base_url, counter):
    """Yield a SearchResult for every book entry found in a parsed page.

    This is a generator intended to be driven with ``yield from``: besides
    yielding results, it *returns* the number of results that may still be
    produced, so the caller can carry the budget across several pages.

    :param doc: parsed document exposing ``xpath()`` (presumably an lxml
        HTML element -- confirm against the caller). Book entries are looked
        up under ``//div[@class="booklist"]/div/div``.
    :param base_url: site root; prepended to download hrefs and used to
        resolve the cover image URL.
    :param counter: how many results may still be yielded. Nothing is
        yielded once it reaches zero.
    :return: (generator return value) the remaining result budget.
    """
    # Local import keeps this block self-contained; the rest of the module's
    # import list is not touched.
    from urllib.parse import urljoin

    body = './/div[@class="media-body"]'
    links = body + '/div[@class="download-links"]/div/a'

    for data in doc.xpath('//div[@class="booklist"]/div/div'):
        if counter <= 0:
            break

        book_href = ''.join(data.xpath(body + '/a[@class="booklink"]/@href')).strip()
        if not book_href:
            # Not a book entry (no link to a book page): skip without
            # consuming any of the result budget.
            continue

        counter -= 1

        s = SearchResult()

        cover_src = ''.join(data.xpath(
            './/div[@class="media-left"]/a[@class="booklink"]/div/img/@src')).strip()
        # urljoin turns a protocol-relative "//host/img" into "http://host/img"
        # (taking the scheme from base_url) and also copes with absolute or
        # site-relative sources. Leave the cover empty when there is no image
        # instead of producing the bare string "http:".
        s.cover_url = urljoin(base_url, cover_src) if cover_src else ''

        s.title = ''.join(data.xpath(body + '/a[@class="booklink"]/i/text()')).strip()
        subtitle = ''.join(data.xpath(
            body + '/div[@itemprop="alternativeHeadline"]/text()')).strip()
        if subtitle:
            # Only append when there is real text, so a whitespace-only node
            # does not produce a title ending in " ()".
            s.title = "{} ({})".format(s.title, subtitle)

        s.author = ', '.join(data.xpath(
            body + '/div[@class="bookauthor"]/span/a/text()')).strip(', ')
        s.detail_item = book_href
        s.drm = SearchResult.DRM_UNLOCKED

        # Register only the formats that actually have a download link, so a
        # missing link never becomes a download pointing at the bare site root.
        available = []
        for fmt in ('FB2', 'EPUB', 'TXT'):
            href = ''.join(data.xpath(
                links + '[contains(@class,"dl-' + fmt.lower() + '")]/@href')).strip()
            if not href:
                continue
            # The ".zip" part is dropped from the link, as before.
            s.downloads[fmt] = base_url + href.replace('.zip', '')
            available.append(fmt)
        s.formats = ', '.join(available)

        yield s

    return counter
|
||||||
|
|
||||||
class ChitankaStore(BasicStoreConfig, StorePlugin):
|
class ChitankaStore(BasicStoreConfig, StorePlugin):
|
||||||
|
|
||||||
def open(self, parent=None, detail_item=None, external=False):
|
def open(self, parent=None, detail_item=None, external=False):
|
||||||
@ -46,12 +84,10 @@ class ChitankaStore(BasicStoreConfig, StorePlugin):
|
|||||||
d.exec_()
|
d.exec_()
|
||||||
|
|
||||||
def search(self, query, max_results=10, timeout=60):
|
def search(self, query, max_results=10, timeout=60):
|
||||||
# check for cyrillic symbols before performing search
|
|
||||||
if isinstance(query, bytes):
|
if isinstance(query, bytes):
|
||||||
query = query.decode('utf-8')
|
query = query.decode('utf-8')
|
||||||
uquery = query.strip()
|
|
||||||
reObj = re.search(u'^[а-яА-Я\\d\\s]{3,}$', uquery)
|
if len(query) < 3:
|
||||||
if not reObj:
|
|
||||||
return
|
return
|
||||||
|
|
||||||
base_url = 'http://chitanka.info'
|
base_url = 'http://chitanka.info'
|
||||||
@ -64,73 +100,28 @@ class ChitankaStore(BasicStoreConfig, StorePlugin):
|
|||||||
with closing(br.open(url, timeout=timeout)) as f:
|
with closing(br.open(url, timeout=timeout)) as f:
|
||||||
f = f.read().decode('utf-8')
|
f = f.read().decode('utf-8')
|
||||||
doc = html.fromstring(f)
|
doc = html.fromstring(f)
|
||||||
|
counter = yield from parse_book_page(doc, base_url, counter)
|
||||||
|
if counter <= 0:
|
||||||
|
return
|
||||||
|
|
||||||
for data in doc.xpath('//ul[@class="superlist booklist"]/li'):
|
# search for author names
|
||||||
if counter <= 0:
|
for data in doc.xpath('//ul[@class="superlist"][1]/li/dl/dt'):
|
||||||
break
|
author_url = ''.join(data.xpath('.//a[contains(@href,"/person/")]/@href'))
|
||||||
|
if author_url == '':
|
||||||
id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip()
|
|
||||||
if not id:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
counter -= 1
|
br2 = browser()
|
||||||
|
with closing(br2.open(base_url + author_url, timeout=timeout)) as f:
|
||||||
|
f = f.read().decode('utf-8')
|
||||||
|
doc = html.fromstring(f)
|
||||||
|
counter = yield from parse_book_page(doc, base_url, counter)
|
||||||
|
if counter <= 0:
|
||||||
|
break
|
||||||
|
|
||||||
s = SearchResult()
|
|
||||||
s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip()
|
|
||||||
s.title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip()
|
|
||||||
s.author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip()
|
|
||||||
s.detail_item = id
|
|
||||||
s.drm = SearchResult.DRM_UNLOCKED
|
|
||||||
s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '')
|
|
||||||
s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '')
|
|
||||||
s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '')
|
|
||||||
s.formats = 'FB2, EPUB, TXT, SFB'
|
|
||||||
yield s
|
|
||||||
except HTTPError as e:
|
except HTTPError as e:
|
||||||
if e.code == 404:
|
if e.code == 404:
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
# search for author names
|
|
||||||
for data in doc.xpath('//ul[@class="superlist"][1]/li/dl/dt'):
|
|
||||||
author_url = ''.join(data.xpath('.//a[contains(@href,"/person/")]/@href'))
|
|
||||||
if author_url == '':
|
|
||||||
continue
|
|
||||||
if counter <= 0:
|
|
||||||
break
|
|
||||||
|
|
||||||
br2 = browser()
|
|
||||||
with closing(br2.open(base_url + author_url, timeout=timeout)) as f:
|
|
||||||
if counter <= 0:
|
|
||||||
break
|
|
||||||
f = f.read().decode('utf-8')
|
|
||||||
doc2 = html.fromstring(f)
|
|
||||||
|
|
||||||
# search for book title
|
|
||||||
for data in doc2.xpath('//ul[@class="superlist booklist"]/li'):
|
|
||||||
if counter <= 0:
|
|
||||||
break
|
|
||||||
|
|
||||||
id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip()
|
|
||||||
if not id:
|
|
||||||
continue
|
|
||||||
|
|
||||||
title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip()
|
|
||||||
author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip()
|
|
||||||
if title.lower().find(query.lower()) == -1 and author.lower().find(query.lower()) == -1:
|
|
||||||
continue
|
|
||||||
|
|
||||||
counter -= 1
|
|
||||||
|
|
||||||
s = SearchResult()
|
|
||||||
s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip()
|
|
||||||
s.title = title
|
|
||||||
s.author = author
|
|
||||||
s.detail_item = id
|
|
||||||
s.drm = SearchResult.DRM_UNLOCKED
|
|
||||||
s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '')
|
|
||||||
s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '')
|
|
||||||
s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '')
|
|
||||||
s.formats = 'FB2, EPUB, TXT, SFB'
|
|
||||||
yield s
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user