Kovid Goyal 2011-03-16 17:42:42 -06:00
parent 68f63e807a
commit cc14a6a657
4 changed files with 134 additions and 43 deletions

View File

@@ -633,10 +633,6 @@ class Metadata(object):
             fmt('Publisher', self.publisher)
         if getattr(self, 'book_producer', False):
             fmt('Book Producer', self.book_producer)
-        if self.comments:
-            fmt('Comments', self.comments)
-        if self.isbn:
-            fmt('ISBN', self.isbn)
         if self.tags:
             fmt('Tags', u', '.join([unicode(t) for t in self.tags]))
         if self.series:
@@ -654,6 +650,9 @@ class Metadata(object):
         if self.identifiers:
             fmt('Identifiers', u', '.join(['%s:%s'%(k, v) for k, v in
                 self.identifiers.iteritems()]))
+        if self.comments:
+            fmt('Comments', self.comments)
         for key in self.custom_field_keys():
             val = self.get(key, None)
             if val:
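
The `fmt` helper here appends aligned name/value lines to the plain-text representation, so this hunk is purely about ordering: Comments now prints after Identifiers, and the standalone ISBN line is gone because ISBNs already appear in the identifiers map. A minimal sketch of the pattern; `fmt` and the sample values below are invented stand-ins, not calibre code:

    # Sketch only: invented values illustrating the accumulate-and-join
    # pattern used by Metadata's text representation.
    ans = []

    def fmt(name, value):
        ans.append(u'%-20s: %s' % (name, value))

    identifiers = {'amazon': 'B000XXXXXX', 'isbn': '9780316014922'}
    comments = u'A short synopsis.'

    if identifiers:
        fmt('Identifiers', u', '.join(u'%s:%s' % (k, v)
            for k, v in identifiers.items()))
    if comments:
        fmt('Comments', comments)   # now emitted after Identifiers

    print(u'\n'.join(ans))
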

View File

@@ -20,6 +20,7 @@ from calibre.utils.cleantext import clean_ascii_chars
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.library.comments import sanitize_comments_html
+from calibre.utils.date import parse_date

 class Worker(Thread):
@@ -28,10 +29,12 @@ class Worker(Thread):
     '''

     def __init__(self, url, result_queue, browser, log, timeout=20):
+        Thread.__init__(self)
+        self.daemon = True
         self.url, self.result_queue = url, result_queue
         self.log, self.timeout = log, timeout
         self.browser = browser.clone_browser()
-        self.cover_url = self.amazon_id = None
+        self.cover_url = self.amazon_id = self.isbn = None

     def run(self):
         try:
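
Initialising `Thread` first and flagging the worker as a daemon keeps a hung Amazon request from blocking interpreter shutdown; results travel back through the shared queue rather than through return values. A minimal, self-contained sketch of that shape (the URL and queue payload are invented):

    from threading import Thread
    try:
        from queue import Queue      # Python 3
    except ImportError:
        from Queue import Queue      # Python 2, as in calibre at the time

    class DemoWorker(Thread):
        '''Fetch and parse one result page, pushing results onto a queue.'''

        def __init__(self, url, result_queue):
            Thread.__init__(self)    # base-class init before any other setup
            self.daemon = True       # never block interpreter exit
            self.url, self.result_queue = url, result_queue

        def run(self):
            # Stand-in for download + parse; the real worker fills a
            # Metadata object instead of a dict.
            self.result_queue.put({'url': self.url, 'title': 'parsed title'})

    rq = Queue()
    w = DemoWorker('http://www.amazon.com/dp/B000XXXXXX', rq)
    w.start()
    w.join(timeout=5)
    print(rq.get_nowait())
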
@@ -111,7 +114,7 @@
             self.amazon_id = asin

         try:
-            mi.rating = self.parse_ratings(root)
+            mi.rating = self.parse_rating(root)
         except:
             self.log.exception('Error parsing ratings for url: %r'%self.url)
@@ -125,6 +128,37 @@
         except:
             self.log.exception('Error parsing cover for url: %r'%self.url)

+        pd = root.xpath('//h2[text()="Product Details"]/../div[@class="content"]')
+        if pd:
+            pd = pd[0]
+
+            try:
+                isbn = self.parse_isbn(pd)
+                if isbn:
+                    self.isbn = mi.isbn = isbn
+            except:
+                self.log.exception('Error parsing ISBN for url: %r'%self.url)
+
+            try:
+                mi.publisher = self.parse_publisher(pd)
+            except:
+                self.log.exception('Error parsing publisher for url: %r'%self.url)
+
+            try:
+                mi.pubdate = self.parse_pubdate(pd)
+            except:
+                self.log.exception('Error parsing publish date for url: %r'%self.url)
+
+            try:
+                lang = self.parse_language(pd)
+                if lang:
+                    mi.language = lang
+            except:
+                self.log.exception('Error parsing language for url: %r'%self.url)
+        else:
+            self.log.warning('Failed to find product description for url: %r'%self.url)
+
         self.result_queue.put(mi)

     def parse_asin(self, root):
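
The whole Product Details block hangs off one lxml XPath idiom: find the `<h2>Product Details</h2>` heading, step up to its parent, and take the sibling `div.content`. A self-contained sketch against an invented fragment (real Amazon pages are messier):

    from lxml import html

    raw = '''<div>
      <h2>Product Details</h2>
      <div class="content">
        <ul><li><b>Publisher:</b> Example Press (May 5, 2008)</li>
            <li><b>Language:</b> English</li></ul>
      </div>
    </div>'''

    root = html.fromstring(raw)
    pd = root.xpath('//h2[text()="Product Details"]/../div[@class="content"]')
    if pd:
        print(pd[0].text_content().strip())   # the details list as plain text
    else:
        print('no Product Details block on this page')
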
@@ -140,27 +174,23 @@
                 method='text').strip()
         else:
             title = tostring(tdiv, encoding=unicode, method='text').strip()
-        return re.sub(r'[([].*[)]]', '', title).strip()
+        return re.sub(r'[(\[].*[)\]]', '', title).strip()

     def parse_authors(self, root):
-        bdiv = root.xpath('//div[@class="buying"]')[0]
-        aname = bdiv.xpath('descendant::span[@class="contributorNameTrigger"]')
+        aname = root.xpath('//span[@class="contributorNameTrigger"]')
         authors = [tostring(x, encoding=unicode, method='text').strip() for x
                 in aname]
         return authors

-    def parse_ratings(self, root):
-        ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
+    def parse_rating(self, root):
+        ratings = root.xpath('//div[@class="jumpBar"]/descendant::span[@class="asinReviewsSummary"]')
         pat = re.compile(r'([0-9.]+) out of (\d+) stars')
         if ratings:
             for elem in ratings[0].xpath('descendant::*[@title]'):
-                t = elem.get('title')
+                t = elem.get('title').strip()
                 m = pat.match(t)
                 if m is not None:
-                    try:
-                        return float(m.group(1))/float(m.group(2)) * 5
-                    except:
-                        pass
+                    return float(m.group(1))/float(m.group(2)) * 5

     def parse_comments(self, root):
         desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
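
Dropping the inner try/except in parse_rating is reasonable because a title that matches the pattern virtually always yields parseable numbers; the expression converts Amazon's "X out of Y stars" onto calibre's 5-point scale. A quick check of the regex and arithmetic on invented samples:

    import re

    pat = re.compile(r'([0-9.]+) out of (\d+) stars')
    for t in ('4.5 out of 5 stars', '3 out of 5 stars', 'See all 12 reviews'):
        m = pat.match(t)
        if m is not None:
            print(t, '->', float(m.group(1)) / float(m.group(2)) * 5)
    # 4.5 out of 5 stars -> 4.5
    # 3 out of 5 stars -> 3.0
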
@@ -193,6 +223,37 @@
             bn = sparts[0] + sparts[-1]
         return ('/'.join(parts[:-1]))+'/'+bn

+    def parse_isbn(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "ISBN")]')):
+            if x.tail:
+                ans = check_isbn(x.tail.strip())
+                if ans:
+                    return ans
+
+    def parse_publisher(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "Publisher:")]')):
+            if x.tail:
+                ans = x.tail.partition(';')[0]
+                return ans.partition('(')[0].strip()
+
+    def parse_pubdate(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "Publisher:")]')):
+            if x.tail:
+                ans = x.tail
+                date = ans.partition('(')[-1].replace(')', '').strip()
+                return parse_date(date, assume_utc=True)
+
+    def parse_language(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "Language:")]')):
+            if x.tail:
+                ans = x.tail.strip()
+                if ans == 'English':
+                    return 'en'
+

 class Amazon(Source):
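
All four new helpers read the same way: the label ("ISBN", "Publisher:", "Language:") sits inside a bold element, and the value is the text that *follows* that element, which lxml exposes as `.tail`, not `.text`. A sketch of that detail on an invented fragment (check_isbn and parse_date are existing calibre utilities; plain string handling stands in for them here):

    from lxml import html

    li = html.fromstring(
        '<li><b>Publisher:</b> Example Press; 1st ed. (May 5, 2008)</li>')
    b = li.xpath('descendant::*[starts-with(text(), "Publisher:")]')[0]

    print(repr(b.text))    # 'Publisher:' -- the label itself
    print(repr(b.tail))    # ' Example Press; 1st ed. (May 5, 2008)'

    publisher = b.tail.partition(';')[0].partition('(')[0].strip()
    date_str = b.tail.partition('(')[-1].replace(')', '').strip()
    print(publisher)       # Example Press
    print(date_str)        # May 5, 2008 -- what parse_date(date, assume_utc=True) receives
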
@@ -200,7 +261,8 @@ class Amazon(Source):

     description = _('Downloads metadata from Amazon')

     capabilities = frozenset(['identify'])
-    touched_fields = frozenset(['title', 'authors', 'isbn', 'pubdate', 'comments'])
+    touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
+        'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])

     AMAZON_DOMAINS = {
         'com': _('US'),
@@ -254,6 +316,10 @@ class Amazon(Source):

     def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=20):
+        '''
+        Note this method will retry without identifiers automatically if no
+        match is found with identifiers.
+        '''
         query = self.create_query(log, title=title, authors=authors,
                 identifiers=identifiers)
         if query is None:
@@ -281,37 +347,45 @@ class Amazon(Source):
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
                 resolve_entities=True)[0]

-        if '<title>404 - ' in raw:
-            log.error('No matches found for query: %r'%query)
-            return
-
-        try:
-            root = soupparser.fromstring(clean_ascii_chars(raw))
-        except:
-            msg = 'Failed to parse amazon page for query: %r'%query
-            log.exception(msg)
-            return msg
-
-        errmsg = root.xpath('//*[@id="errorMessage"]')
-        if errmsg:
-            msg = tostring(errmsg, method='text', encoding=unicode).strip()
-            log.error(msg)
-            # The error is almost always a not found error
-            return
-
         matches = []
-        for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
-            for a in div.xpath(r'descendant::a[@class="title" and @href]'):
-                title = tostring(a, method='text', encoding=unicode).lower()
-                if 'bulk pack' not in title:
-                    matches.append(a.get('href'))
-                break
+        found = '<title>404 - ' not in raw
+
+        if found:
+            try:
+                root = soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                msg = 'Failed to parse amazon page for query: %r'%query
+                log.exception(msg)
+                return msg
+
+            errmsg = root.xpath('//*[@id="errorMessage"]')
+            if errmsg:
+                msg = tostring(errmsg, method='text', encoding=unicode).strip()
+                log.error(msg)
+                # The error is almost always a not found error
+                found = False
+
+        if found:
+            for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
+                for a in div.xpath(r'descendant::a[@class="title" and @href]'):
+                    title = tostring(a, method='text', encoding=unicode).lower()
+                    if 'bulk pack' not in title:
+                        matches.append(a.get('href'))
+                    break

         # Keep only the top 5 matches as the matches are sorted by relevance by
         # Amazon so lower matches are not likely to be very relevant
         matches = matches[:5]

+        if abort.is_set():
+            return
+
         if not matches:
+            if identifiers and title and authors:
+                self.log('No matches found with identifiers, retrying using only'
+                        ' title and authors')
+                return self.identify(log, result_queue, abort, title=title,
+                        authors=authors, timeout=timeout)
             log.error('No matches found with query: %r'%query)
             return
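
The retry branch gives identifier-based lookups a graceful fallback: if an ASIN/ISBN query returns nothing, identify() calls itself once more with only title and authors, so a stale identifier degrades to an ordinary search instead of a hard miss. The shape of that fallback, reduced to a sketch (names are placeholders, not the calibre API):

    def identify(search, title=None, authors=None, identifiers={}):
        matches = search(title, authors, identifiers)
        if not matches and identifiers and title and authors:
            # Identifier search failed; retry once without identifiers.
            # The recursion terminates because identifiers is omitted here.
            return identify(search, title=title, authors=authors)
        return matches
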
@@ -333,6 +407,14 @@ class Amazon(Source):
             if not a_worker_is_alive:
                 break

+        for w in workers:
+            if w.amazon_id:
+                if w.isbn:
+                    self.cache_isbn_to_identifier(w.isbn, w.amazon_id)
+                if w.cover_url:
+                    self.cache_identifier_to_cover_url(w.amazon_id,
+                            w.cover_url)
+
         return None

View File

@@ -35,6 +35,7 @@ class Source(Plugin):
     def __init__(self, *args, **kwargs):
         Plugin.__init__(self, *args, **kwargs)
         self._isbn_to_identifier_cache = {}
+        self._identifier_to_cover_url_cache = {}
         self.cache_lock = threading.RLock()
         self._config_obj = None
         self._browser = None
@@ -68,6 +69,14 @@ class Source(Plugin):
         with self.cache_lock:
             return self._isbn_to_identifier_cache.get(isbn, None)

+    def cache_identifier_to_cover_url(self, id_, url):
+        with self.cache_lock:
+            self._identifier_to_cover_url_cache[id_] = url
+
+    def cached_identifier_to_cover_url(self, id_):
+        with self.cache_lock:
+            return self._identifier_to_cover_url_cache.get(id_, None)
+
     def get_author_tokens(self, authors, only_first_author=True):
         '''
         Take a list of authors and return a list of tokens useful for an
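
These two methods deliberately mirror the ISBN-to-identifier cache just above: a setter/getter pair serialised on the plugin's shared RLock, so Worker threads can record cover URLs that a later cover-download step reads back. A hedged usage sketch; the class, identifier, and URL below are invented stand-ins:

    from threading import RLock

    class DemoCache(object):
        '''Stand-in for the Source plugin's cover-url cache.'''
        def __init__(self):
            self._identifier_to_cover_url_cache = {}
            self.cache_lock = RLock()

        def cache_identifier_to_cover_url(self, id_, url):
            with self.cache_lock:
                self._identifier_to_cover_url_cache[id_] = url

        def cached_identifier_to_cover_url(self, id_):
            with self.cache_lock:
                return self._identifier_to_cover_url_cache.get(id_, None)

    c = DemoCache()
    c.cache_identifier_to_cover_url('B000XXXXXX', 'http://example.com/cover.jpg')
    print(c.cached_identifier_to_cover_url('B000XXXXXX'))  # the cached URL
    print(c.cached_identifier_to_cover_url('B000YYYYYY'))  # None: never cached
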

View File

@@ -145,8 +145,9 @@ class GoogleBooks(Source):

     description = _('Downloads metadata from Google Books')

     capabilities = frozenset(['identify'])
-    touched_fields = frozenset(['title', 'authors', 'isbn', 'tags', 'pubdate',
-        'comments', 'publisher', 'author_sort']) # language currently disabled
+    touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
+        'comments', 'publisher', 'author_sort', 'identifier:isbn',
+        'identifier:google']) # language currently disabled

     def create_query(self, log, title=None, authors=None, identifiers={}):
         BASE_URL = 'http://books.google.com/books/feeds/volumes?'