mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
New social metadata plugin for Amazon that does not rely on AWS
This commit is contained in:
parent
238f9391ea
commit
620102102e
@ -8,88 +8,93 @@ Fetch metadata using Amazon AWS
|
|||||||
'''
|
'''
|
||||||
import sys, re
|
import sys, re
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import html
|
||||||
|
|
||||||
from calibre import browser
|
from calibre import browser
|
||||||
from calibre.utils.date import parse_date, utcnow
|
from calibre.ebooks.metadata import check_isbn
|
||||||
from calibre.ebooks.metadata import MetaInformation, string_to_authors
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
|
||||||
AWS_NS = 'http://webservices.amazon.com/AWSECommerceService/2005-10-05'
|
def find_asin(br, isbn):
    '''
    Search Amazon for the ASIN corresponding to an ISBN.

    Amazon search result pages carry the ASIN in the ``name`` attribute
    of elements with class ``asinReviewsSummary``.

    :param br: a browser object as returned by calibre.browser()
    :param isbn: ISBN to search for, as a string
    :return: the first matching ASIN as a string, or None when the
             results page contains no review-summary element
    '''
    q = 'http://www.amazon.com/s?field-keywords='+isbn
    raw = br.open_novisit(q).read()
    # Strip embedded encoding declarations so lxml does not choke
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
            resolve_entities=True)[0]
    root = html.fromstring(raw)
    revs = root.xpath('//*[@class="asinReviewsSummary" and @name]')
    revs = [x.get('name') for x in revs]
    if revs:
        return revs[0]
|
||||||
|
|
||||||
def AWS(tag):
    '''Qualify *tag* with the AWS ECommerce service XML namespace.'''
    return '{%s}%s'%(AWS_NS, tag)
|
|
||||||
|
|
||||||
class ISBNNotFound(ValueError):
    '''Raised when the metadata service reports the ISBN as unknown.'''
    pass
|
|
||||||
|
|
||||||
def check_for_errors(root, isbn):
    '''
    Inspect an AWS response tree for an Error element.

    :param root: parsed lxml element tree of the AWS response
    :param isbn: the ISBN that was queried, used to distinguish a
                 not-found condition from other failures
    :raises ISBNNotFound: when AWS reports the ISBN as an invalid
                          parameter value
    :raises Exception: for any other error reported by AWS
    '''
    err = root.find('.//'+AWS('Error'))
    if err is not None:
        text = etree.tostring(err, method='text', pretty_print=True,
                encoding=unicode)
        if 'AWS.InvalidParameterValue'+isbn in text:
            raise ISBNNotFound(isbn)
        raise Exception('Failed to get metadata with error: '\
                + text)
|
||||||
def get_social_metadata(title, authors, publisher, isbn):
    '''
    Fetch social metadata (rating, comments) for a book by scraping its
    Amazon product page, without relying on the AWS API.

    :param title: book title, used to seed the returned metadata
    :param authors: list of author names, used to seed the metadata
    :param publisher: unused, kept for interface compatibility
    :param isbn: the ISBN to look up; may be None
    :return: a Metadata object. When no valid ISBN is supplied, or the
             Amazon page cannot be fetched, it contains only the
             passed-in title/authors.
    '''
    mi = Metadata(title, authors)
    if not isbn:
        return mi
    isbn = check_isbn(isbn)
    if not isbn:
        return mi
    br = browser()
    if len(isbn) == 13:
        # An ISBN-13 is not a valid ASIN; search Amazon for the ASIN
        try:
            asin = find_asin(br, isbn)
        except Exception:
            # Best effort: log and fall through with no ASIN
            import traceback
            traceback.print_exc()
            asin = None
    else:
        # A ten digit ISBN doubles as the ASIN
        asin = isbn
    if asin:
        if get_metadata(br, asin, mi):
            return mi
    # TODO: Use xisbn to search over all isbns
    return mi
|
||||||
|
|
||||||
|
def get_metadata(br, asin, mi):
    '''
    Scrape rating and description for *asin* from its Amazon product
    page into the Metadata object *mi* (modified in place).

    :param br: a browser object as returned by calibre.browser()
    :param asin: Amazon product identifier
    :param mi: Metadata object to fill in
    :return: False when the product page does not exist, True when the
             page was fetched and processed.
    '''
    q = 'http://amzn.com/'+asin
    raw = br.open_novisit(q).read()
    if '<title>404 - ' in raw:
        return False
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
            resolve_entities=True)[0]
    root = html.fromstring(raw)
    ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
    if ratings:
        pat = re.compile(r'([0-9.]+) out of (\d+) stars')
        r = ratings[0]
        for elem in r.xpath('descendant::*[@title]'):
            t = elem.get('title')
            m = pat.match(t)
            if m is not None:
                try:
                    # Normalize "x out of y stars" to a rating out of 5
                    mi.rating = float(m.group(1))/float(m.group(2)) * 5
                    break
                except Exception:
                    pass
    desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
    if desc:
        desc = desc[0]
        # Remove navigation cruft and hyperlinks from the description
        for c in desc.xpath('descendant::*[@class="seeAll" or'
                ' @class="emptyClear" or @href]'):
            c.getparent().remove(c)
        desc = html.tostring(desc, method='html', encoding=unicode).strip()
        # Strip class attributes, collapse whitespace, drop boilerplate
        desc = re.sub(r' class=[^>]+>', '>', desc)
        desc = re.sub('\n+', '\n', desc)
        desc = re.sub(' +', ' ', desc)
        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        mi.comments = desc
    # BUGFIX: previously fell off the end returning None, so the
    # caller's truth-test on the return value could never succeed
    return True
|
||||||
|
|
||||||
|
|
||||||
def main(args=sys.argv):
|
def main(args=sys.argv):
|
||||||
print get_social_metadata(None, None, None, '9781416551720')
|
print get_social_metadata('Swan Thieves', None, None, '9780316065795')
|
||||||
|
print
|
||||||
|
return 0
|
||||||
|
print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720')
|
||||||
|
print
|
||||||
|
print get_social_metadata('The Great Gatsby', None, None, '0743273567')
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
80
src/calibre/ebooks/metadata/xisbn.py
Normal file
80
src/calibre/ebooks/metadata/xisbn.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import threading, re, json
|
||||||
|
|
||||||
|
from calibre import browser
|
||||||
|
|
||||||
|
class xISBN(object):
    '''
    Thread-safe client for the OCLC xISBN web service.

    Maps an ISBN to the records of all other editions of the same work.
    Results are fetched lazily and cached in memory for the life of the
    process; every ISBN belonging to a fetched result set is indexed so
    that subsequent lookups by any associated ISBN hit the cache.
    '''

    # fl= selects the fields returned for each edition record
    QUERY = 'http://xisbn.worldcat.org/webservices/xid/isbn/%s?method=getEditions&format=json&fl=form,year,lang,ed'

    def __init__(self):
        self.lock = threading.RLock()
        self._data = []  # list of cached result sets
        self._map = {}   # purified isbn -> index into self._data

        self.br = browser()
        # Matches every character that is not a digit or X (ISBN-10 checksum)
        self.isbn_pat = re.compile(r'[^0-9X]', re.IGNORECASE)

    def purify(self, isbn):
        '''Strip separators, keeping only digits and an uppercased X.'''
        return self.isbn_pat.sub('', isbn.upper())

    def fetch_data(self, isbn):
        '''
        Query the xISBN service for editions of *isbn*.

        :return: list of edition records that correspond to books; an
                 empty list when the service reports an error.
        '''
        url = self.QUERY%isbn
        data = self.br.open_novisit(url).read()
        data = json.loads(data)
        if data.get('stat', None) != 'ok':
            return []
        data = data.get('list', [])
        ans = []
        for rec in data:
            forms = rec.get('form', [])
            # Only get books, not audio/video
            forms = [x for x in forms if x in ('BA', 'BC', 'BB', 'DA')]
            if forms:
                ans.append(rec)
        return ans

    def get_data(self, isbn):
        '''
        Return the cached edition records for *isbn*, fetching them on
        first use. Network failures are logged and cached as an empty
        result set so they are not retried.
        '''
        isbn = self.purify(isbn)
        with self.lock:
            if isbn not in self._map:
                try:
                    data = self.fetch_data(isbn)
                except Exception:
                    import traceback
                    traceback.print_exc()
                    data = []
                id_ = len(self._data)
                self._data.append(data)
                # Index every associated ISBN so any edition's ISBN
                # resolves to the same cached result set
                for rec in data:
                    for i in rec.get('isbn', []):
                        self._map[i] = id_
                self._map[isbn] = id_
            return self._data[self._map[isbn]]

    def get_associated_isbns(self, isbn):
        '''Return the set of all ISBNs of editions associated with *isbn*.'''
        data = self.get_data(isbn)
        ans = set([])
        for rec in data:
            for i in rec.get('isbn', []):
                ans.add(i)
        return ans
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by all users of this module
xisbn = xISBN()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
import sys
|
||||||
|
isbn = sys.argv[-1]
|
||||||
|
print xisbn.get_data(isbn)
|
||||||
|
print
|
||||||
|
print xisbn.get_associated_isbns(isbn)
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user