# src/calibre/ebooks/metadata/amazon.py
#
# Social metadata plugin for Amazon that scrapes the public web pages and
# does not rely on AWS.

import re
import sys

from lxml import html

from calibre import browser
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode


def find_asin(br, isbn):
    '''
    Search amazon.com for ``isbn`` and return the ASIN of the first hit.

    :param br: browser object; only ``br.open_novisit(url).read()`` is used
    :param isbn: ISBN string, used verbatim as the search keywords
    :return: the ``name`` attribute of the first element on the results page
             that has class ``asinReviewsSummary`` and a ``name`` attribute,
             or ``None`` when the page contains no such element
    '''
    q = 'http://www.amazon.com/s?field-keywords='+isbn
    raw = br.open_novisit(q).read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
            resolve_entities=True)[0]
    root = html.fromstring(raw)
    for elem in root.xpath('//*[@class="asinReviewsSummary" and @name]'):
        return elem.get('name')
    return None


def get_social_metadata(title, authors, publisher, isbn):
    '''
    Build a :class:`Metadata` object for a book and try to fill in its
    rating and comments from Amazon.

    :param title: passed through to ``Metadata``
    :param authors: passed through to ``Metadata``
    :param publisher: accepted for interface compatibility; not used
    :param isbn: ISBN of the book; when empty or rejected by ``check_isbn``
                 no lookup is attempted
    :return: the ``Metadata`` object, whether or not the lookup succeeded
    '''
    mi = Metadata(title, authors)
    if not isbn:
        return mi
    isbn = check_isbn(isbn)
    if not isbn:
        return mi
    br = browser()
    if len(isbn) == 13:
        # A 13 digit ISBN has to be resolved to an ASIN via a search
        try:
            asin = find_asin(br, isbn)
        except Exception:
            # Best effort: report the failure and carry on without an ASIN
            import traceback
            traceback.print_exc()
            asin = None
    else:
        # Any other validated ISBN is used directly as the ASIN
        asin = isbn
    if asin:
        if get_metadata(br, asin, mi):
            return mi
    # TODO: Use xisbn to search over all isbns
    return mi


def get_metadata(br, asin, mi):
    '''
    Fetch the Amazon page for ``asin`` and copy rating and description
    into ``mi``.

    :param br: browser object; only ``br.open_novisit(url).read()`` is used
    :param asin: identifier appended to ``http://amzn.com/``
    :param mi: metadata object; ``mi.rating`` and ``mi.comments`` are set
               when the corresponding data is found on the page
    :return: ``False`` if the page is recognised as a "404" page,
             ``True`` otherwise
    '''
    q = 'http://amzn.com/'+asin
    raw = br.open_novisit(q).read()
    # NOTE(review): marker string reproduced as found; confirm it still
    # matches the not-found page served by the site.
    if '404 - ' in raw:
        return False
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
            resolve_entities=True)[0]
    root = html.fromstring(raw)

    ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
    if ratings:
        pat = re.compile(r'([0-9.]+) out of (\d+) stars')
        for elem in ratings[0].xpath('descendant::*[@title]'):
            m = pat.match(elem.get('title'))
            if m is None:
                continue
            try:
                # Normalise "X out of N stars" to a 0-5 scale
                rating = float(m.group(1))/float(m.group(2)) * 5
            except (ValueError, ZeroDivisionError):
                # e.g. "1.2.3 out of 5" or "4 out of 0": try the next title
                continue
            mi.rating = rating
            break

    desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
    if desc:
        desc = desc[0]
        # Drop "see all" links, spacer elements and anything with an href
        for c in desc.xpath('descendant::*[@class="seeAll" or'
                ' @class="emptyClear" or @href]'):
            c.getparent().remove(c)
        desc = html.tostring(desc, method='html', encoding=unicode).strip()
        desc = re.sub(r' class=[^>]+>', '>', desc)
        desc = re.sub('\n+', '\n', desc)
        desc = re.sub(' +', ' ', desc)
        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        mi.comments = desc

    # The page was found and processed; without this the function returned
    # None and the caller could never tell success from failure.
    return True


def main(args=sys.argv):
    '''
    Manual smoke test: look up a few known books and print the results,
    separated by blank lines. Performs live network requests.

    :param args: command line arguments; not used
    :return: 0
    '''
    samples = (
        ('Swan Thieves', '9780316065795'),
        ('Star Trek: Destiny: Mere Mortals', '9781416551720'),
        ('The Great Gatsby', '0743273567'),
    )
    for i, (title, isbn) in enumerate(samples):
        if i:
            print('')
        # Single-argument print() behaves identically on Python 2
        print(get_social_metadata(title, None, None, isbn))
    return 0
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__   = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import json
import re
import threading


class xISBN(object):
    '''
    Query the xisbn.worldcat.org ``getEditions`` web service for the
    editions related to an ISBN and cache the answers in memory.

    All cache access goes through ``self.lock``. A successful lookup is
    stored once and indexed under the queried ISBN and under every ISBN
    listed in the returned records.
    '''

    QUERY = 'http://xisbn.worldcat.org/webservices/xid/isbn/%s?method=getEditions&format=json&fl=form,year,lang,ed'

    def __init__(self):
        self.lock = threading.RLock()
        # List of result sets; each result set is a list of records
        self._data = []
        # Maps an ISBN to the index of its result set in self._data
        self._map = {}

        # Created on first use, see the ``br`` property
        self._br = None
        self.isbn_pat = re.compile(r'[^0-9X]', re.IGNORECASE)

    @property
    def br(self):
        '''
        Browser used for the web requests. It is created on first access so
        that importing this module (which instantiates ``xisbn``) does not
        construct a browser as a side effect.
        '''
        if self._br is None:
            from calibre import browser
            self._br = browser()
        return self._br

    @br.setter
    def br(self, val):
        self._br = val

    def purify(self, isbn):
        '''
        Return ``isbn`` upper-cased with every character other than the
        digits and ``X`` removed.
        '''
        return self.isbn_pat.sub('', isbn.upper())

    def fetch_data(self, isbn):
        '''
        Ask the web service for the editions of ``isbn``.

        :return: list of the records from the response's ``list`` entry whose
                 ``form`` contains at least one of the accepted codes; an
                 empty list when the response ``stat`` is not ``'ok'``
        '''
        url = self.QUERY%isbn
        data = json.loads(self.br.open_novisit(url).read())
        if data.get('stat') != 'ok':
            return []
        # Only get books, not audio/video
        wanted = ('BA', 'BC', 'BB', 'DA')
        return [rec for rec in data.get('list', [])
                if any(form in wanted for form in rec.get('form', []))]

    def get_data(self, isbn):
        '''
        Return the (cached) list of edition records for ``isbn``.

        If the lookup fails, the traceback is printed and an empty list is
        returned. The failure is not cached, so a later call will try
        again instead of reporting "no editions" for the rest of the
        process lifetime.
        '''
        isbn = self.purify(isbn)
        with self.lock:
            if isbn not in self._map:
                try:
                    data = self.fetch_data(isbn)
                except Exception:
                    import traceback
                    traceback.print_exc()
                    return []
                id_ = len(self._data)
                self._data.append(data)
                for rec in data:
                    for i in rec.get('isbn', []):
                        self._map[i] = id_
                self._map[isbn] = id_
            return self._data[self._map[isbn]]

    def get_associated_isbns(self, isbn):
        '''
        Return the set of all ISBNs listed in the edition records of
        ``isbn`` (empty when nothing is known).
        '''
        ans = set()
        for rec in self.get_data(isbn):
            ans.update(rec.get('isbn', []))
        return ans


xisbn = xISBN()

if __name__ == '__main__':
    import sys
    isbn = sys.argv[-1]
    # Single-argument print() behaves identically on Python 2
    print(xisbn.get_data(isbn))
    print('')
    print(xisbn.get_associated_isbns(isbn))