Update Edelweiss metadata plugin

Getting book data using Edelweiss identifiers works. However, I cannot
get the search to work; for some reason the server returns random
results, ignoring the passed-in query.
This commit is contained in:
Kovid Goyal 2018-04-05 23:30:07 +05:30
parent d72b06b29b
commit 19ca524e47
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -25,14 +25,8 @@ def clean_html(raw):
def parse_html(raw): def parse_html(raw):
raw = clean_html(raw) raw = clean_html(raw)
try: from html5_parser import parse
from html5_parser import parse return parse(raw)
except ImportError:
# Old versions of calibre
import html5lib
return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
else:
return parse(raw)
def astext(node): def astext(node):
@ -43,18 +37,21 @@ def astext(node):
class Worker(Thread): # {{{ class Worker(Thread): # {{{
def __init__(self, sku, url, relevance, result_queue, br, timeout, log, plugin): def __init__(self, basic_data, relevance, result_queue, br, timeout, log, plugin):
Thread.__init__(self) Thread.__init__(self)
self.daemon = True self.daemon = True
self.url, self.br, self.log, self.timeout = url, br, log, timeout self.basic_data = basic_data
self.result_queue, self.plugin, self.sku = result_queue, plugin, sku self.br, self.log, self.timeout = br, log, timeout
self.result_queue, self.plugin, self.sku = result_queue, plugin, self.basic_data['sku']
self.relevance = relevance self.relevance = relevance
def run(self): def run(self):
url = ('https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/product/two_Enhanced.ascx&'
'sku={0}&idPrefix=content_1_{0}&mode=0'.format(self.sku))
try: try:
raw = self.br.open_novisit(self.url, timeout=self.timeout).read() raw = self.br.open_novisit(url, timeout=self.timeout).read()
except: except:
self.log.exception('Failed to load details page: %r'%self.url) self.log.exception('Failed to load comments page: %r'%url)
return return
try: try:
@ -63,91 +60,46 @@ class Worker(Thread): # {{{
self.plugin.clean_downloaded_metadata(mi) self.plugin.clean_downloaded_metadata(mi)
self.result_queue.put(mi) self.result_queue.put(mi)
except: except:
self.log.exception('Failed to parse details page: %r'%self.url) self.log.exception('Failed to parse details for sku: %s'%self.sku)
def parse(self, raw): def parse(self, raw):
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import parse_only_date, UNDEFINED_DATE from calibre.utils.date import UNDEFINED_DATE
from css_selectors import Select
root = parse_html(raw) root = parse_html(raw)
selector = Select(root) mi = Metadata(self.basic_data['title'], self.basic_data['authors'])
sku = next(selector('div.sku.attGroup'))
info = sku.getparent()
top = info.getparent().getparent()
banner = top.find('div')
spans = banner.findall('span')
title = ''
for i, span in enumerate(spans):
if i == 0 or '12pt' in span.get('style', ''):
title += astext(span)
else:
break
authors = [re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',')]
mi = Metadata(title.strip(), authors)
# Identifiers # Identifiers
isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')] if self.basic_data['isbns']:
for isbn in isbns: mi.isbn = self.basic_data['isbns'][0]
if isbn:
self.plugin.cache_isbn_to_identifier(isbn, self.sku)
isbns = sorted(isbns, key=lambda x:len(x) if x else 0, reverse=True)
if isbns and isbns[0]:
mi.isbn = isbns[0]
mi.set_identifier('edelweiss', self.sku) mi.set_identifier('edelweiss', self.sku)
# Tags # Tags
bisac = tuple(selector('div.bisac.attGroup')) if self.basic_data['tags']:
if bisac: mi.tags = self.basic_data['tags']
bisac = astext(bisac[0])
mi.tags = [x.strip() for x in bisac.split(',')]
mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags] mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]
# Publisher # Publisher
pub = tuple(selector('div.supplier.attGroup')) mi.publisher = self.basic_data['publisher']
if pub:
pub = astext(pub[0])
mi.publisher = pub
# Pubdate # Pubdate
pub = tuple(selector('div.shipDate.attGroupItem')) if self.basic_data['pubdate'] and self.basic_data['pubdate'].year != UNDEFINED_DATE:
if pub: mi.pubdate = self.basic_data['pubdate']
pub = astext(pub[0])
parts = pub.partition(':')[0::2] # Rating
pub = parts[1] or parts[0] if self.basic_data['rating']:
try: mi.rating = self.basic_data['rating']
if ', Ship Date:' in pub:
pub = pub.partition(', Ship Date:')[0]
q = parse_only_date(pub, assume_utc=True)
if q.year != UNDEFINED_DATE:
mi.pubdate = q
except:
self.log.exception('Error parsing published date: %r'%pub)
# Comments # Comments
comm = '' comments = ''
general = tuple(selector('div#pd-general-overview-content')) for cid in ('summary', 'contributorbio', 'quotes_reviews'):
if general: cid = 'desc_{}{}-content'.format(cid, self.sku)
q = self.render_comments(general[0]) div = root.xpath('//*[@id="{}"]'.format(cid))
if q != '<p>No title summary available. </p>': if div:
comm += q comments += self.render_comments(div[0])
general = tuple(selector('div#pd-general-contributor-content')) if comments:
if general: mi.comments = comments
comm += self.render_comments(general[0])
general = tuple(selector('div#pd-general-quotes-content'))
if general:
comm += self.render_comments(general[0])
if comm:
mi.comments = comm
# Cover
img = tuple(selector('img.title-image[src]'))
if img:
href = img[0].get('src').replace('jacket_covers/medium/',
'jacket_covers/flyout/')
self.plugin.cache_identifier_to_cover_url(self.sku, href)
mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
return mi return mi
def render_comments(self, desc): def render_comments(self, desc):
@ -171,17 +123,84 @@ class Worker(Thread): # {{{
# }}} # }}}
def get_basic_data(browser, log, *skus):
from calibre.utils.date import parse_only_date
from mechanize import Request
zeroes = ','.join('0' for sku in skus)
data = {
'skus': ','.join(skus),
'drc': zeroes,
'startPosition': '0',
'sequence': '1',
'selected': zeroes,
'itemID': '0',
'orderID': '0',
'mailingID': '',
'tContentWidth': '926',
'originalOrder': ','.join(str(i) for i in range(len(skus))),
'selectedOrderID': '0',
'selectedSortColumn': '0',
'listType': '1',
'resultType': '32',
'blockView': '1',
}
items_data_url = 'https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/listviews/ListView_Title_Multi.ascx'
req = Request(items_data_url, data)
response = browser.open_novisit(req)
raw = response.read()
root = parse_html(raw)
for item in root.xpath('//div[@data-priority]'):
row = item.getparent().getparent()
sku = item.get('id').split('-')[-1]
isbns = [x.strip() for x in row.xpath('descendant::*[contains(@class, "pev_sku")]/text()')[0].split(',') if check_isbn(x.strip())]
isbns.sort(key=len, reverse=True)
try:
tags = [x.strip() for x in astext(row.xpath('descendant::*[contains(@class, "pev_categories")]')[0]).split('/')]
except IndexError:
tags = []
rating = 0
for bar in row.xpath('descendant::*[contains(@class, "bgdColorCommunity")]/@style'):
m = re.search('width: (\d+)px;.*max-width: (\d+)px', bar)
if m is not None:
rating = float(m.group(1)) / float(m.group(2))
break
try:
pubdate = parse_only_date(astext(row.xpath('descendant::*[contains(@class, "pev_shipDate")]')[0]
).split(':')[-1].split(u'\xa0')[-1].strip(), assume_utc=True)
except Exception:
log.exception('Error parsing published date')
pubdate = None
authors = []
for x in [x.strip() for x in row.xpath('descendant::*[contains(@class, "pev_contributor")]/@title')]:
authors.extend(a.strip() for a in x.split(','))
entry = {
'sku': sku,
'cover': row.xpath('descendant::img/@src')[0].split('?')[0],
'publisher': astext(row.xpath('descendant::*[contains(@class, "headerPublisher")]')[0]),
'title': astext(row.xpath('descendant::*[@id="title_{}"]'.format(sku))[0]),
'authors': authors,
'isbns': isbns,
'tags': tags,
'pubdate': pubdate,
'format': ' '.join(row.xpath('descendant::*[contains(@class, "pev_format")]/text()')).strip(),
'rating': rating,
}
if entry['cover'].startswith('/'):
entry['cover'] = None
yield entry
class Edelweiss(Source): class Edelweiss(Source):
name = 'Edelweiss' name = 'Edelweiss'
version = (1, 0, 0) version = (2, 0, 0)
minimum_calibre_version = (2, 80, 0) minimum_calibre_version = (3, 6, 0)
description = _('Downloads metadata and covers from Edelweiss - A catalog updated by book publishers') description = _('Downloads metadata and covers from Edelweiss - A catalog updated by book publishers')
capabilities = frozenset(['identify', 'cover']) capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset([ touched_fields = frozenset([
'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher', 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
'identifier:isbn', 'identifier:edelweiss']) 'identifier:isbn', 'identifier:edelweiss', 'rating'])
supports_gzip_transfer_encoding = True supports_gzip_transfer_encoding = True
has_html_comments = True has_html_comments = True
@ -189,11 +208,11 @@ class Edelweiss(Source):
def user_agent(self): def user_agent(self):
# Pass in an index to random_user_agent() to test with a particular # Pass in an index to random_user_agent() to test with a particular
# user agent # user agent
return random_user_agent() return random_user_agent(allow_ie=False)
def _get_book_url(self, sku): def _get_book_url(self, sku):
if sku: if sku:
return 'https://edelweiss.abovethetreeline.com/ProductDetailPage.aspx?sku=%s'%sku return 'https://www.edelweiss.plus/#sku={}&page=1'.format(sku)
def get_book_url(self, identifiers): # {{{ def get_book_url(self, identifiers): # {{{
sku = identifiers.get('edelweiss', None) sku = identifiers.get('edelweiss', None)
@ -213,10 +232,9 @@ class Edelweiss(Source):
def create_query(self, log, title=None, authors=None, identifiers={}): def create_query(self, log, title=None, authors=None, identifiers={}):
from urllib import urlencode from urllib import urlencode
BASE_URL = 'https://edelweiss.abovethetreeline.com/Browse.aspx?source=catalog&rg=4187&group=browse&pg=0&' import time
params = { BASE_URL = ('https://www.edelweiss.plus/GetTreelineControl.aspx?'
'browseType':'title', 'startIndex':0, 'savecook':1, 'sord':20, 'secSord':20, 'tertSord':20, 'controlName=/uc/listviews/controls/ListView_data.ascx&itemID=0&resultType=32&dashboardType=8&itemType=1&dataType=products&keywordSearch&')
}
keywords = [] keywords = []
isbn = check_isbn(identifiers.get('isbn', None)) isbn = check_isbn(identifiers.get('isbn', None))
if isbn is not None: if isbn is not None:
@ -225,28 +243,33 @@ class Edelweiss(Source):
title_tokens = list(self.get_title_tokens(title)) title_tokens = list(self.get_title_tokens(title))
if title_tokens: if title_tokens:
keywords.extend(title_tokens) keywords.extend(title_tokens)
# Searching with author names does not work on edelweiss author_tokens = self.get_author_tokens(authors, only_first_author=True)
# author_tokens = self.get_author_tokens(authors, if author_tokens:
# only_first_author=True) keywords.extend(author_tokens)
# if author_tokens:
# keywords.extend(author_tokens)
if not keywords: if not keywords:
return None return None
params['bsk'] = (' '.join(keywords)).encode('utf-8') params = {
'q': (' '.join(keywords)).encode('utf-8'),
'_': str(int(time.time()))
}
return BASE_URL+urlencode(params) return BASE_URL+urlencode(params)
# }}} # }}}
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
identifiers={}, timeout=30): identifiers={}, timeout=30):
from urlparse import parse_qs import json
book_url = self._get_book_url(identifiers.get('edelweiss', None))
br = self.browser br = self.browser
if book_url: br.addheaders = [
entries = [(book_url, identifiers['edelweiss'])] ('Referer', 'https://www.edelweiss.plus/'),
('X-Requested-With', 'XMLHttpRequest'),
('Cache-Control', 'no-cache'),
('Pragma', 'no-cache'),
]
if 'edelweiss' in identifiers:
items = [identifiers['edelweiss']]
else: else:
entries = []
query = self.create_query(log, title=title, authors=authors, query = self.create_query(log, title=title, authors=authors,
identifiers=identifiers) identifiers=identifiers)
if not query: if not query:
@ -254,66 +277,41 @@ class Edelweiss(Source):
return return
log('Using query URL:', query) log('Using query URL:', query)
try: try:
raw = br.open_novisit(query, timeout=timeout).read() raw = br.open(query, timeout=timeout).read().decode('utf-8')
except Exception as e: except Exception as e:
log.exception('Failed to make identify query: %r'%query) log.exception('Failed to make identify query: %r'%query)
return as_unicode(e) return as_unicode(e)
items = re.search('window[.]items\s*=\s*(.+?);', raw)
if items is None:
log.error('Failed to get list of matching items')
log.debug('Response text:')
log.debug(raw)
return
items = json.loads(items.group(1))
try: if (not items and identifiers and title and authors and
root = parse_html(raw)
except Exception as e:
log.exception('Failed to parse identify results')
return as_unicode(e)
from css_selectors import Select
select = Select(root)
has_isbn = check_isbn(identifiers.get('isbn', None)) is not None
if not has_isbn:
author_tokens = set(x.lower() for x in self.get_author_tokens(authors, only_first_author=True))
for entry in select('div.listRow div.listRowMain'):
a = entry.xpath('descendant::a[contains(@href, "sku=") and contains(@href, "productDetailPage.aspx")]')
if not a:
continue
href = a[0].get('href')
prefix, qs = href.partition('?')[0::2]
sku = parse_qs(qs).get('sku', None)
if sku and sku[0]:
sku = sku[0]
div = tuple(select('div.sku.attGroup'))
if div:
text = astext(div[0])
isbns = [check_isbn(x.strip()) for x in text.split(',')]
for isbn in isbns:
if isbn:
self.cache_isbn_to_identifier(isbn, sku)
for img in entry.xpath('descendant::img[contains(@src, "/jacket_covers/thumbnail/")]'):
self.cache_identifier_to_cover_url(sku, img.get('src').replace('/thumbnail/', '/flyout/'))
div = tuple(select('div.format.attGroup'))
text = astext(div[0]).lower()
if 'audio' in text or 'mp3' in text: # Audio-book, ignore
continue
if not has_isbn:
# edelweiss returns matches based only on title, so we
# filter by author manually
div = tuple(select('div.contributor.attGroup'))
try:
entry_authors = set(self.get_author_tokens([x.strip() for x in astext(div[0]).lower().split(',')]))
except IndexError:
entry_authors = set()
if not entry_authors.issuperset(author_tokens):
continue
entries.append((self._get_book_url(sku), sku))
if (not entries and identifiers and title and authors and
not abort.is_set()): not abort.is_set()):
return self.identify(log, result_queue, abort, title=title, return self.identify(log, result_queue, abort, title=title,
authors=authors, timeout=timeout) authors=authors, timeout=timeout)
if not entries: if not items:
return return
workers = [Worker(skul, url, i, result_queue, br.clone_browser(), timeout, log, self) workers = []
for i, (url, skul) in enumerate(entries[:5])] items = items[:5]
for i, item in enumerate(get_basic_data(self.browser, log, *items)):
sku = item['sku']
for isbn in item['isbns']:
self.cache_isbn_to_identifier(isbn, sku)
if item['cover']:
self.cache_identifier_to_cover_url(sku, item['cover'])
fmt = item['format'].lower()
if 'audio' in fmt or 'mp3' in fmt:
continue # Audio-book, ignore
workers.append(Worker(item, i, result_queue, br.clone_browser(), timeout, log, self))
if not workers:
return
for w in workers: for w in workers:
w.start() w.start()
@ -390,14 +388,13 @@ if __name__ == '__main__':
# Multiple authors and two part title and no general description # Multiple authors and two part title and no general description
({'identifiers':{'edelweiss':'0321180607'}}, ({'identifiers':{'edelweiss':'0321180607'}},
[title_test( [title_test(
"XQuery from the Experts: A Guide to the W3C XML Query Language" "XQuery From the Experts: A Guide to the W3C XML Query Language"
, exact=True), authors_test([ , exact=True), authors_test([
'Howard Katz', 'Don Chamberlin', 'Denise Draper', 'Mary Fernandez', 'Howard Katz', 'Don Chamberlin', 'Denise Draper', 'Mary Fernandez',
'Michael Kay', 'Jonathan Robie', 'Michael Rys', 'Jerome Simeon', 'Michael Kay', 'Jonathan Robie', 'Michael Rys', 'Jerome Simeon',
'Jim Tivy', 'Philip Wadler']), pubdate_test(2003, 8, 22), 'Jim Tivy', 'Philip Wadler']), pubdate_test(2003, 8, 22),
comments_test('Jérôme Siméon'), lambda mi: bool(mi.comments and 'No title summary' not in mi.comments) comments_test('Jérôme Siméon'), lambda mi: bool(mi.comments and 'No title summary' not in mi.comments)
]), ]),
] ]
start, stop = 0, len(tests) start, stop = 0, len(tests)