diff --git a/src/calibre/ebooks/metadata/sources/edelweiss.py b/src/calibre/ebooks/metadata/sources/edelweiss.py
index 1f47d90a82..2b39688743 100644
--- a/src/calibre/ebooks/metadata/sources/edelweiss.py
+++ b/src/calibre/ebooks/metadata/sources/edelweiss.py
@@ -25,14 +25,8 @@ def clean_html(raw):
 
 def parse_html(raw):
     raw = clean_html(raw)
-    try:
-        from html5_parser import parse
-    except ImportError:
-        # Old versions of calibre
-        import html5lib
-        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
-    else:
-        return parse(raw)
+    from html5_parser import parse
+    return parse(raw)
 
 
 def astext(node):
@@ -43,18 +37,21 @@ def astext(node):
 
 class Worker(Thread):  # {{{
 
-    def __init__(self, sku, url, relevance, result_queue, br, timeout, log, plugin):
+    def __init__(self, basic_data, relevance, result_queue, br, timeout, log, plugin):
         Thread.__init__(self)
         self.daemon = True
-        self.url, self.br, self.log, self.timeout = url, br, log, timeout
-        self.result_queue, self.plugin, self.sku = result_queue, plugin, sku
+        self.basic_data = basic_data
+        self.br, self.log, self.timeout = br, log, timeout
+        self.result_queue, self.plugin, self.sku = result_queue, plugin, self.basic_data['sku']
         self.relevance = relevance
 
     def run(self):
+        url = ('https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/product/two_Enhanced.ascx&'
+               'sku={0}&idPrefix=content_1_{0}&mode=0'.format(self.sku))
         try:
-            raw = self.br.open_novisit(self.url, timeout=self.timeout).read()
+            raw = self.br.open_novisit(url, timeout=self.timeout).read()
         except:
-            self.log.exception('Failed to load details page: %r'%self.url)
+            self.log.exception('Failed to load comments page: %r'%url)
             return
 
         try:
@@ -63,91 +60,46 @@ class Worker(Thread):  # {{{
             self.plugin.clean_downloaded_metadata(mi)
             self.result_queue.put(mi)
         except:
-            self.log.exception('Failed to parse details page: %r'%self.url)
+            self.log.exception('Failed to parse details for sku: %s'%self.sku)
 
     def parse(self, raw):
         from calibre.ebooks.metadata.book.base import Metadata
-        from calibre.utils.date import parse_only_date, UNDEFINED_DATE
-        from css_selectors import Select
+        from calibre.utils.date import UNDEFINED_DATE
         root = parse_html(raw)
-        selector = Select(root)
-        sku = next(selector('div.sku.attGroup'))
-        info = sku.getparent()
-        top = info.getparent().getparent()
-        banner = top.find('div')
-        spans = banner.findall('span')
-        title = ''
-        for i, span in enumerate(spans):
-            if i == 0 or '12pt' in span.get('style', ''):
-                title += astext(span)
-            else:
-                break
-        authors = [re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',')]
-        mi = Metadata(title.strip(), authors)
+        mi = Metadata(self.basic_data['title'], self.basic_data['authors'])
 
         # Identifiers
-        isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
-        for isbn in isbns:
-            if isbn:
-                self.plugin.cache_isbn_to_identifier(isbn, self.sku)
-        isbns = sorted(isbns, key=lambda x:len(x) if x else 0, reverse=True)
-        if isbns and isbns[0]:
-            mi.isbn = isbns[0]
+        if self.basic_data['isbns']:
+            mi.isbn = self.basic_data['isbns'][0]
         mi.set_identifier('edelweiss', self.sku)
 
         # Tags
-        bisac = tuple(selector('div.bisac.attGroup'))
-        if bisac:
-            bisac = astext(bisac[0])
-            mi.tags = [x.strip() for x in bisac.split(',')]
+        if self.basic_data['tags']:
+            mi.tags = self.basic_data['tags']
             mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]
 
         # Publisher
-        pub = tuple(selector('div.supplier.attGroup'))
-        if pub:
-            pub = astext(pub[0])
-            mi.publisher = pub
+        mi.publisher = self.basic_data['publisher']
 
         # Pubdate
-        pub = tuple(selector('div.shipDate.attGroupItem'))
-        if pub:
-            pub = astext(pub[0])
-            parts = pub.partition(':')[0::2]
-            pub = parts[1] or parts[0]
-            try:
-                if ', Ship Date:' in pub:
-                    pub = pub.partition(', Ship Date:')[0]
-                q = parse_only_date(pub, assume_utc=True)
-                if q.year != UNDEFINED_DATE:
-                    mi.pubdate = q
-            except:
-                self.log.exception('Error parsing published date: %r'%pub)
+        if self.basic_data['pubdate'] and self.basic_data['pubdate'].year != UNDEFINED_DATE:
+            mi.pubdate = self.basic_data['pubdate']
+
+        # Rating
+        if self.basic_data['rating']:
+            mi.rating = self.basic_data['rating']
 
         # Comments
-        comm = ''
-        general = tuple(selector('div#pd-general-overview-content'))
-        if general:
-            q = self.render_comments(general[0])
-            if q != '<p>No title summary available.</p>':
-                comm += q
-        general = tuple(selector('div#pd-general-contributor-content'))
-        if general:
-            comm += self.render_comments(general[0])
-        general = tuple(selector('div#pd-general-quotes-content'))
-        if general:
-            comm += self.render_comments(general[0])
-        if comm:
-            mi.comments = comm
-
-        # Cover
-        img = tuple(selector('img.title-image[src]'))
-        if img:
-            href = img[0].get('src').replace('jacket_covers/medium/',
-                'jacket_covers/flyout/')
-            self.plugin.cache_identifier_to_cover_url(self.sku, href)
+        comments = ''
+        for cid in ('summary', 'contributorbio', 'quotes_reviews'):
+            cid = 'desc_{}{}-content'.format(cid, self.sku)
+            div = root.xpath('//*[@id="{}"]'.format(cid))
+            if div:
+                comments += self.render_comments(div[0])
+        if comments:
+            mi.comments = comments
 
         mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
-
         return mi
 
     def render_comments(self, desc):
@@ -171,17 +123,84 @@ class Worker(Thread):  # {{{
 # }}}
 
 
+def get_basic_data(browser, log, *skus):
+    from calibre.utils.date import parse_only_date
+    from mechanize import Request
+    zeroes = ','.join('0' for sku in skus)
+    data = {
+        'skus': ','.join(skus),
+        'drc': zeroes,
+        'startPosition': '0',
+        'sequence': '1',
+        'selected': zeroes,
+        'itemID': '0',
+        'orderID': '0',
+        'mailingID': '',
+        'tContentWidth': '926',
+        'originalOrder': ','.join(str(i) for i in range(len(skus))),
+        'selectedOrderID': '0',
+        'selectedSortColumn': '0',
+        'listType': '1',
+        'resultType': '32',
+        'blockView': '1',
+    }
+    items_data_url = 'https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/listviews/ListView_Title_Multi.ascx'
+    req = Request(items_data_url, data)
+    response = browser.open_novisit(req)
+    raw = response.read()
+    root = parse_html(raw)
+    for item in root.xpath('//div[@data-priority]'):
+        row = item.getparent().getparent()
+        sku = item.get('id').split('-')[-1]
+        isbns = [x.strip() for x in row.xpath('descendant::*[contains(@class, "pev_sku")]/text()')[0].split(',') if check_isbn(x.strip())]
+        isbns.sort(key=len, reverse=True)
+        try:
+            tags = [x.strip() for x in astext(row.xpath('descendant::*[contains(@class, "pev_categories")]')[0]).split('/')]
+        except IndexError:
+            tags = []
+        rating = 0
+        for bar in row.xpath('descendant::*[contains(@class, "bgdColorCommunity")]/@style'):
+            m = re.search(r'width: (\d+)px;.*max-width: (\d+)px', bar)
+            if m is not None:
+                rating = float(m.group(1)) / float(m.group(2))
+                break
+        try:
+            pubdate = parse_only_date(astext(row.xpath('descendant::*[contains(@class, "pev_shipDate")]')[0]
+                ).split(':')[-1].split(u'\xa0')[-1].strip(), assume_utc=True)
+        except Exception:
+            log.exception('Error parsing published date')
+            pubdate = None
+        authors = []
+        for x in [x.strip() for x in row.xpath('descendant::*[contains(@class, "pev_contributor")]/@title')]:
+            authors.extend(a.strip() for a in x.split(','))
+        entry = {
+            'sku': sku,
+            'cover': row.xpath('descendant::img/@src')[0].split('?')[0],
+            'publisher': astext(row.xpath('descendant::*[contains(@class, "headerPublisher")]')[0]),
+            'title': astext(row.xpath('descendant::*[@id="title_{}"]'.format(sku))[0]),
+            'authors': authors,
+            'isbns': isbns,
+            'tags': tags,
+            'pubdate': pubdate,
+            'format': ' '.join(row.xpath('descendant::*[contains(@class, "pev_format")]/text()')).strip(),
+            'rating': rating,
+        }
+        if entry['cover'].startswith('/'):
+            entry['cover'] = None
+        yield entry
+
+
 class Edelweiss(Source):
 
     name = 'Edelweiss'
-    version = (1, 0, 0)
-    minimum_calibre_version = (2, 80, 0)
+    version = (2, 0, 0)
+    minimum_calibre_version = (3, 6, 0)
     description = _('Downloads metadata and covers from Edelweiss - A catalog updated by book publishers')
 
     capabilities = frozenset(['identify', 'cover'])
     touched_fields = frozenset([
         'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
-        'identifier:isbn', 'identifier:edelweiss'])
+        'identifier:isbn', 'identifier:edelweiss', 'rating'])
     supports_gzip_transfer_encoding = True
     has_html_comments = True
 
@@ -189,11 +208,11 @@ class Edelweiss(Source):
     def user_agent(self):
         # Pass in an index to random_user_agent() to test with a particular
         # user agent
-        return random_user_agent()
+        return random_user_agent(allow_ie=False)
 
     def _get_book_url(self, sku):
         if sku:
-            return 'https://edelweiss.abovethetreeline.com/ProductDetailPage.aspx?sku=%s'%sku
+            return 'https://www.edelweiss.plus/#sku={}&page=1'.format(sku)
 
     def get_book_url(self, identifiers):  # {{{
         sku = identifiers.get('edelweiss', None)
@@ -213,10 +232,9 @@ class Edelweiss(Source):
 
     def create_query(self, log, title=None, authors=None, identifiers={}):
         from urllib import urlencode
-        BASE_URL = 'https://edelweiss.abovethetreeline.com/Browse.aspx?source=catalog&rg=4187&group=browse&pg=0&'
-        params = {
-            'browseType':'title', 'startIndex':0, 'savecook':1, 'sord':20, 'secSord':20, 'tertSord':20,
-        }
+        import time
+        BASE_URL = ('https://www.edelweiss.plus/GetTreelineControl.aspx?'
+                    'controlName=/uc/listviews/controls/ListView_data.ascx&itemID=0&resultType=32&dashboardType=8&itemType=1&dataType=products&keywordSearch&')
         keywords = []
         isbn = check_isbn(identifiers.get('isbn', None))
         if isbn is not None:
@@ -225,28 +243,33 @@
             title_tokens = list(self.get_title_tokens(title))
             if title_tokens:
                 keywords.extend(title_tokens)
-            # Searching with author names does not work on edelweiss
-            # author_tokens = self.get_author_tokens(authors,
-            #         only_first_author=True)
-            # if author_tokens:
-            #     keywords.extend(author_tokens)
+            author_tokens = self.get_author_tokens(authors, only_first_author=True)
+            if author_tokens:
+                keywords.extend(author_tokens)
         if not keywords:
             return None
-        params['bsk'] = (' '.join(keywords)).encode('utf-8')
+        params = {
+            'q': (' '.join(keywords)).encode('utf-8'),
+            '_': str(int(time.time()))
+        }
         return BASE_URL+urlencode(params)
     # }}}
 
     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
             identifiers={}, timeout=30):
-        from urlparse import parse_qs
+        import json
 
-        book_url = self._get_book_url(identifiers.get('edelweiss', None))
         br = self.browser
-        if book_url:
-            entries = [(book_url, identifiers['edelweiss'])]
+        br.addheaders = [
+            ('Referer', 'https://www.edelweiss.plus/'),
+            ('X-Requested-With', 'XMLHttpRequest'),
+            ('Cache-Control', 'no-cache'),
+            ('Pragma', 'no-cache'),
+        ]
+        if 'edelweiss' in identifiers:
+            items = [identifiers['edelweiss']]
         else:
-            entries = []
             query = self.create_query(log, title=title, authors=authors,
                     identifiers=identifiers)
             if not query:
@@ -254,66 +277,41 @@
                 return
             log('Using query URL:', query)
             try:
-                raw = br.open_novisit(query, timeout=timeout).read()
+                raw = br.open(query, timeout=timeout).read().decode('utf-8')
             except Exception as e:
                 log.exception('Failed to make identify query: %r'%query)
                 return as_unicode(e)
+            items = re.search(r'window[.]items\s*=\s*(.+?);', raw)
+            if items is None:
+                log.error('Failed to get list of matching items')
+                log.debug('Response text:')
+                log.debug(raw)
+                return
+            items = json.loads(items.group(1))
 
-        try:
-            root = parse_html(raw)
-        except Exception as e:
-            log.exception('Failed to parse identify results')
-            return as_unicode(e)
-        from css_selectors import Select
-        select = Select(root)
-        has_isbn = check_isbn(identifiers.get('isbn', None)) is not None
-        if not has_isbn:
-            author_tokens = set(x.lower() for x in self.get_author_tokens(authors, only_first_author=True))
-        for entry in select('div.listRow div.listRowMain'):
-            a = entry.xpath('descendant::a[contains(@href, "sku=") and contains(@href, "productDetailPage.aspx")]')
-            if not a:
-                continue
-            href = a[0].get('href')
-            prefix, qs = href.partition('?')[0::2]
-            sku = parse_qs(qs).get('sku', None)
-            if sku and sku[0]:
-                sku = sku[0]
-                div = tuple(select('div.sku.attGroup'))
-                if div:
-                    text = astext(div[0])
-                    isbns = [check_isbn(x.strip()) for x in text.split(',')]
-                    for isbn in isbns:
-                        if isbn:
-                            self.cache_isbn_to_identifier(isbn, sku)
-                for img in entry.xpath('descendant::img[contains(@src, "/jacket_covers/thumbnail/")]'):
-                    self.cache_identifier_to_cover_url(sku, img.get('src').replace('/thumbnail/', '/flyout/'))
-
-                div = tuple(select('div.format.attGroup'))
-                text = astext(div[0]).lower()
-                if 'audio' in text or 'mp3' in text:  # Audio-book, ignore
-                    continue
-                if not has_isbn:
-                    # edelweiss returns matches based only on title, so we
-                    # filter by author manually
-                    div = tuple(select('div.contributor.attGroup'))
-                    try:
-                        entry_authors = set(self.get_author_tokens([x.strip() for x in astext(div[0]).lower().split(',')]))
-                    except IndexError:
-                        entry_authors = set()
-                    if not entry_authors.issuperset(author_tokens):
-                        continue
-                entries.append((self._get_book_url(sku), sku))
-
-        if (not entries and identifiers and title and authors and
+        if (not items and identifiers and title and authors and
                 not abort.is_set()):
             return self.identify(log, result_queue, abort, title=title,
                     authors=authors, timeout=timeout)
 
-        if not entries:
+        if not items:
             return
 
-        workers = [Worker(skul, url, i, result_queue, br.clone_browser(), timeout, log, self)
-                   for i, (url, skul) in enumerate(entries[:5])]
+        workers = []
+        items = items[:5]
+        for i, item in enumerate(get_basic_data(self.browser, log, *items)):
+            sku = item['sku']
+            for isbn in item['isbns']:
+                self.cache_isbn_to_identifier(isbn, sku)
+            if item['cover']:
+                self.cache_identifier_to_cover_url(sku, item['cover'])
+            fmt = item['format'].lower()
+            if 'audio' in fmt or 'mp3' in fmt:
+                continue  # Audio-book, ignore
+            workers.append(Worker(item, i, result_queue, br.clone_browser(), timeout, log, self))
+
+        if not workers:
+            return
 
         for w in workers:
             w.start()
@@ -390,14 +388,13 @@ if __name__ == '__main__':
 
         # Multiple authors and two part title and no general description
         ({'identifiers':{'edelweiss':'0321180607'}}, [title_test(
-            "XQuery from the Experts: A Guide to the W3C XML Query Language"
+            "XQuery From the Experts: A Guide to the W3C XML Query Language"
             , exact=True), authors_test([
            'Howard Katz', 'Don Chamberlin', 'Denise Draper', 'Mary Fernandez',
            'Michael Kay', 'Jonathan Robie', 'Michael Rys', 'Jerome Simeon',
            'Jim Tivy', 'Philip Wadler']), pubdate_test(2003, 8, 22),
            comments_test('Jérôme Siméon'), lambda mi: bool(mi.comments and 'No title summary' not in mi.comments)
         ]),
-
     ]
     start, stop = 0, len(tests)
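
A quick way to exercise the ported plugin end to end is calibre's built-in
harness for metadata source plugins, which this file's __main__ block (the
tests list touched by the final hunk) already wires up. Assuming a calibre
development checkout with network access:

    calibre-debug -e src/calibre/ebooks/metadata/sources/edelweiss.py

That is roughly equivalent to the sketch below; test_identify_plugin and
title_test are the helpers from calibre.ebooks.metadata.sources.test, and the
SKU is the one used in the test list above:

    # Hedged sketch: run one identify() test against the live edelweiss.plus site
    from calibre.ebooks.metadata.sources.test import test_identify_plugin, title_test
    test_identify_plugin(Edelweiss.name, [
        ({'identifiers': {'edelweiss': '0321180607'}},
         [title_test('XQuery From the Experts: A Guide to the W3C XML Query Language', exact=True)]),
    ])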