New social metadata plugin for Amazon that does not rely on AWS

Kovid Goyal 2010-10-27 16:58:56 -06:00
parent 238f9391ea
commit 620102102e
2 changed files with 156 additions and 71 deletions


@@ -8,88 +8,93 @@ Fetch metadata using Amazon AWS
 '''
 import sys, re
-from lxml import etree
+from lxml import html
 from calibre import browser
-from calibre.utils.date import parse_date, utcnow
-from calibre.ebooks.metadata import MetaInformation, string_to_authors
+from calibre.ebooks.metadata import check_isbn
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.chardet import xml_to_unicode
 
-AWS_NS = 'http://webservices.amazon.com/AWSECommerceService/2005-10-05'
-
-def AWS(tag):
-    return '{%s}%s'%(AWS_NS, tag)
-
-class ISBNNotFound(ValueError):
-    pass
-
-def check_for_errors(root, isbn):
-    err = root.find('.//'+AWS('Error'))
-    if err is not None:
-        text = etree.tostring(err, method='text', pretty_print=True,
-                encoding=unicode)
-        if 'AWS.InvalidParameterValue'+isbn in text:
-            raise ISBNNotFound(isbn)
-        raise Exception('Failed to get metadata with error: '\
-                + text)
+def find_asin(br, isbn):
+    q = 'http://www.amazon.com/s?field-keywords='+isbn
+    raw = br.open_novisit(q).read()
+    raw = xml_to_unicode(raw, strip_encoding_pats=True,
+            resolve_entities=True)[0]
+    root = html.fromstring(raw)
+    revs = root.xpath('//*[@class="asinReviewsSummary" and @name]')
+    revs = [x.get('name') for x in revs]
+    if revs:
+        return revs[0]
 
 def get_social_metadata(title, authors, publisher, isbn):
-    mi = MetaInformation(title, authors)
-    if isbn:
-        br = browser()
-        response_xml = br.open('http://status.calibre-ebook.com/aws/metadata/'+isbn).read()
-        root = etree.fromstring(response_xml)
-        try:
-            check_for_errors(root, isbn)
-        except ISBNNotFound:
-            return mi
-        mi.title = root.findtext('.//'+AWS('Title'))
-        authors = [x.text for x in root.findall('.//'+AWS('Author'))]
-        if authors:
-            mi.authors = []
-            for x in authors:
-                mi.authors.extend(string_to_authors(x))
-        mi.publisher = root.findtext('.//'+AWS('Publisher'))
-        try:
-            d = root.findtext('.//'+AWS('PublicationDate'))
-            if d:
-                default = utcnow().replace(day=15)
-                d = parse_date(d[0].text, assume_utc=True, default=default)
-                mi.pubdate = d
-        except:
-            pass
-        try:
-            rating = float(root.findtext('.//'+AWS('AverageRating')))
-            num_of_reviews = int(root.findtext('.//'+AWS('TotalReviews')))
-            if num_of_reviews > 4 and rating > 0 and rating < 5:
-                mi.rating = rating
-        except:
-            pass
-        tags = [x.text for x in root.findall('.//%s/%s'%(AWS('Subjects'),
-            AWS('Subject')))]
-        if tags:
-            mi.tags = []
-            for x in tags:
-                mi.tags.extend([y.strip() for y in x.split('/')])
-            mi.tags = [x.replace(',', ';') for x in mi.tags]
-        comments = root.find('.//%s/%s'%(AWS('EditorialReview'),
-            AWS('Content')))
-        if comments is not None:
-            mi.comments = etree.tostring(comments,
-                method='text', encoding=unicode)
-            mi.comments = re.sub('<([pP]|DIV)>', '\n\n', mi.comments)
-            mi.comments = re.sub('</?[iI]>', '*', mi.comments)
-            mi.comments = re.sub('</?[bB]>', '**', mi.comments)
-            mi.comments = re.sub('<BR>', '\n\n', mi.comments)
-            mi.comments = re.sub('<[^>]+>', '', mi.comments)
-            mi.comments = mi.comments.strip()
-            mi.comments = _('EDITORIAL REVIEW')+':\n\n'+mi.comments
-
+    mi = Metadata(title, authors)
+    if not isbn:
+        return mi
+    isbn = check_isbn(isbn)
+    if not isbn:
+        return mi
+    br = browser()
+    if len(isbn) == 13:
+        try:
+            asin = find_asin(br, isbn)
+        except:
+            import traceback
+            traceback.print_exc()
+            asin = None
+    else:
+        asin = isbn
+    if asin:
+        if get_metadata(br, asin, mi):
+            return mi
+    # TODO: Use xisbn to search over all isbns
     return mi
 
+def get_metadata(br, asin, mi):
+    q = 'http://amzn.com/'+asin
+    raw = br.open_novisit(q).read()
+    if '<title>404 - ' in raw:
+        return False
+    raw = xml_to_unicode(raw, strip_encoding_pats=True,
+            resolve_entities=True)[0]
+    root = html.fromstring(raw)
+    ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
+    if ratings:
+        pat = re.compile(r'([0-9.]+) out of (\d+) stars')
+        r = ratings[0]
+        for elem in r.xpath('descendant::*[@title]'):
+            t = elem.get('title')
+            m = pat.match(t)
+            if m is not None:
+                try:
+                    mi.rating = float(m.group(1))/float(m.group(2)) * 5
+                    break
+                except:
+                    pass
+    desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
+    if desc:
+        desc = desc[0]
+        for c in desc.xpath('descendant::*[@class="seeAll" or'
+                ' @class="emptyClear" or @href]'):
+            c.getparent().remove(c)
+        desc = html.tostring(desc, method='html', encoding=unicode).strip()
+        desc = re.sub(r' class=[^>]+>', '>', desc)
+        desc = re.sub('\n+', '\n', desc)
+        desc = re.sub(' +', ' ', desc)
+        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
+        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
+        mi.comments = desc
+
 def main(args=sys.argv):
-    print get_social_metadata(None, None, None, '9781416551720')
+    print get_social_metadata('Swan Thieves', None, None, '9780316065795')
+    print
+    print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720')
+    print
+    print get_social_metadata('The Great Gatsby', None, None, '0743273567')
     return 0
 
 if __name__ == '__main__':
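
As a quick illustration of the rating scraping above: the plugin matches Amazon's "N out of M stars" review-summary title and rescales it to calibre's 0-5 range. A minimal sketch of that step in isolation, reusing the pattern and arithmetic from get_metadata(); the sample title string is an assumption about what the @title attribute contains:

import re

# Same regex and arithmetic as get_metadata() in the diff above; the input
# string is a hypothetical example of an Amazon review-summary title.
pat = re.compile(r'([0-9.]+) out of (\d+) stars')
m = pat.match('4.5 out of 5 stars')
if m is not None:
    rating = float(m.group(1)) / float(m.group(2)) * 5  # rescale to 0-5
    print rating  # prints 4.5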


@@ -0,0 +1,80 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import threading, re, json

from calibre import browser

class xISBN(object):

    QUERY = 'http://xisbn.worldcat.org/webservices/xid/isbn/%s?method=getEditions&format=json&fl=form,year,lang,ed'

    def __init__(self):
        self.lock = threading.RLock()
        self._data = []
        self._map = {}
        self.br = browser()
        self.isbn_pat = re.compile(r'[^0-9X]', re.IGNORECASE)

    def purify(self, isbn):
        return self.isbn_pat.sub('', isbn.upper())

    def fetch_data(self, isbn):
        url = self.QUERY%isbn
        data = self.br.open_novisit(url).read()
        data = json.loads(data)
        if data.get('stat', None) != 'ok':
            return []
        data = data.get('list', [])
        ans = []
        for rec in data:
            forms = rec.get('form', [])
            # Only get books, not audio/video
            forms = [x for x in forms if x in ('BA', 'BC', 'BB', 'DA')]
            if forms:
                ans.append(rec)
        return ans

    def get_data(self, isbn):
        isbn = self.purify(isbn)
        with self.lock:
            if isbn not in self._map:
                try:
                    data = self.fetch_data(isbn)
                except:
                    import traceback
                    traceback.print_exc()
                    data = []
                id_ = len(self._data)
                self._data.append(data)
                for rec in data:
                    for i in rec.get('isbn', []):
                        self._map[i] = id_
                self._map[isbn] = id_
            return self._data[self._map[isbn]]

    def get_associated_isbns(self, isbn):
        data = self.get_data(isbn)
        ans = set([])
        for rec in data:
            for i in rec.get('isbn', []):
                ans.add(i)
        return ans

xisbn = xISBN()

if __name__ == '__main__':
    import sys
    isbn = sys.argv[-1]
    print xisbn.get_data(isbn)
    print
    print xisbn.get_associated_isbns(isbn)
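
The TODO left in get_social_metadata() above ("Use xisbn to search over all isbns") hints at how the two files are meant to work together. A rough sketch of one way that fallback could look, assuming it lives in the Amazon plugin itself (so find_asin() and get_metadata() from the first file are in scope) and that this module is importable as calibre.ebooks.metadata.xisbn; the import path and the helper name are assumptions, not part of this commit:

from calibre import browser
from calibre.ebooks.metadata.xisbn import xisbn  # import path is an assumption

def search_all_isbns(isbn, mi):
    # Hypothetical fallback: try every edition ISBN that xISBN associates
    # with the supplied one, reusing find_asin()/get_metadata() from the
    # Amazon plugin (assumed to be in scope here).
    br = browser()
    for alt in xisbn.get_associated_isbns(isbn):
        if alt == isbn:
            continue
        try:
            # 13-digit ISBNs need an ASIN lookup; 10-digit ones double as ASINs
            asin = find_asin(br, alt) if len(alt) == 13 else alt
        except:
            continue
        if asin and get_metadata(br, asin, mi):
            return True
    return False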