diff --git a/src/calibre/ebooks/metadata/book/base.py b/src/calibre/ebooks/metadata/book/base.py
index 6818126699..db0c278340 100644
--- a/src/calibre/ebooks/metadata/book/base.py
+++ b/src/calibre/ebooks/metadata/book/base.py
@@ -633,10 +633,6 @@ class Metadata(object):
             fmt('Publisher', self.publisher)
         if getattr(self, 'book_producer', False):
             fmt('Book Producer', self.book_producer)
-        if self.comments:
-            fmt('Comments', self.comments)
-        if self.isbn:
-            fmt('ISBN', self.isbn)
         if self.tags:
             fmt('Tags', u', '.join([unicode(t) for t in self.tags]))
         if self.series:
@@ -654,6 +650,9 @@
         if self.identifiers:
             fmt('Identifiers', u', '.join(['%s:%s'%(k, v) for k, v in
                 self.identifiers.iteritems()]))
+        if self.comments:
+            fmt('Comments', self.comments)
+
         for key in self.custom_field_keys():
             val = self.get(key, None)
             if val:
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 33ea24c421..30b95950ea 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -20,6 +20,7 @@ from calibre.utils.cleantext import clean_ascii_chars
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.library.comments import sanitize_comments_html
+from calibre.utils.date import parse_date
 
 class Worker(Thread):
 
@@ -28,10 +29,12 @@
     '''
 
     def __init__(self, url, result_queue, browser, log, timeout=20):
+        Thread.__init__(self)
+        self.daemon = True
         self.url, self.result_queue = url, result_queue
         self.log, self.timeout = log, timeout
         self.browser = browser.clone_browser()
-        self.cover_url = self.amazon_id = None
+        self.cover_url = self.amazon_id = self.isbn = None
 
     def run(self):
         try:
@@ -111,7 +114,7 @@
             self.amazon_id = asin
 
         try:
-            mi.rating = self.parse_ratings(root)
+            mi.rating = self.parse_rating(root)
         except:
             self.log.exception('Error parsing ratings for url: %r'%self.url)
 
@@ -125,6 +128,37 @@
         except:
             self.log.exception('Error parsing cover for url: %r'%self.url)
 
+        pd = root.xpath('//h2[text()="Product Details"]/../div[@class="content"]')
+        if pd:
+            pd = pd[0]
+
+            try:
+                isbn = self.parse_isbn(pd)
+                if isbn:
+                    self.isbn = mi.isbn = isbn
+            except:
+                self.log.exception('Error parsing ISBN for url: %r'%self.url)
+
+            try:
+                mi.publisher = self.parse_publisher(pd)
+            except:
+                self.log.exception('Error parsing publisher for url: %r'%self.url)
+
+            try:
+                mi.pubdate = self.parse_pubdate(pd)
+            except:
+                self.log.exception('Error parsing publish date for url: %r'%self.url)
+
+            try:
+                lang = self.parse_language(pd)
+                if lang:
+                    mi.language = lang
+            except:
+                self.log.exception('Error parsing language for url: %r'%self.url)
+
+        else:
+            self.log.warning('Failed to find product description for url: %r'%self.url)
+
         self.result_queue.put(mi)
 
     def parse_asin(self, root):
@@ -140,27 +174,23 @@
                     method='text').strip()
         else:
             title = tostring(tdiv, encoding=unicode, method='text').strip()
-        return re.sub(r'[([].*[)]]', '', title).strip()
+        return re.sub(r'[(\[].*[)\]]', '', title).strip()
 
     def parse_authors(self, root):
-        bdiv = root.xpath('//div[@class="buying"]')[0]
-        aname = bdiv.xpath('descendant::span[@class="contributorNameTrigger"]')
+        aname = root.xpath('//span[@class="contributorNameTrigger"]')
         authors = [tostring(x, encoding=unicode, method='text').strip()
                 for x in aname]
         return authors
 
-    def parse_ratings(self, root):
-        ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
+    def parse_rating(self, root):
+        ratings = root.xpath('//div[@class="jumpBar"]/descendant::span[@class="asinReviewsSummary"]')
         pat = re.compile(r'([0-9.]+) out of (\d+) stars')
         if ratings:
             for elem in ratings[0].xpath('descendant::*[@title]'):
-                t = elem.get('title')
+                t = elem.get('title').strip()
                 m = pat.match(t)
                 if m is not None:
-                    try:
-                        return float(m.group(1))/float(m.group(2)) * 5
-                    except:
-                        pass
+                    return float(m.group(1))/float(m.group(2)) * 5
 
     def parse_comments(self, root):
         desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
@@ -193,6 +223,37 @@
                     bn = sparts[0] + sparts[-1]
                     return ('/'.join(parts[:-1]))+'/'+bn
 
+    def parse_isbn(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "ISBN")]')):
+            if x.tail:
+                ans = check_isbn(x.tail.strip())
+                if ans:
+                    return ans
+
+    def parse_publisher(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "Publisher:")]')):
+            if x.tail:
+                ans = x.tail.partition(';')[0]
+                return ans.partition('(')[0].strip()
+
+    def parse_pubdate(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "Publisher:")]')):
+            if x.tail:
+                ans = x.tail
+                date = ans.partition('(')[-1].replace(')', '').strip()
+                return parse_date(date, assume_utc=True)
+
+    def parse_language(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "Language:")]')):
+            if x.tail:
+                ans = x.tail.strip()
+                if ans == 'English':
+                    return 'en'
+
 
 class Amazon(Source):
 
@@ -200,7 +261,8 @@
     description = _('Downloads metadata from Amazon')
 
     capabilities = frozenset(['identify'])
-    touched_fields = frozenset(['title', 'authors', 'isbn', 'pubdate', 'comments'])
+    touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
+        'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])
 
     AMAZON_DOMAINS = {
         'com': _('US'),
@@ -254,6 +316,10 @@
 
     def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=20):
+        '''
+        Note this method will retry without identifiers automatically if no
+        match is found with identifiers.
+        '''
         query = self.create_query(log, title=title, authors=authors,
                 identifiers=identifiers)
         if query is None:
@@ -281,37 +347,45 @@
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
                 resolve_entities=True)[0]
 
-        if '404 - ' in raw:
-            log.error('No matches found for query: %r'%query)
-            return
-
-        try:
-            root = soupparser.fromstring(clean_ascii_chars(raw))
-        except:
-            msg = 'Failed to parse amazon page for query: %r'%query
-            log.exception(msg)
-            return msg
-
-        errmsg = root.xpath('//*[@id="errorMessage"]')
-        if errmsg:
-            msg = tostring(errmsg, method='text', encoding=unicode).strip()
-            log.error(msg)
-            # The error is almost always a not found error
-            return
-
         matches = []
-        for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
-            for a in div.xpath(r'descendant::a[@class="title" and @href]'):
-                title = tostring(a, method='text', encoding=unicode).lower()
-                if 'bulk pack' not in title:
-                    matches.append(a.get('href'))
-                break
+        found = '<title>404 - ' not in raw
+
+        if found:
+            try:
+                root = soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                msg = 'Failed to parse amazon page for query: %r'%query
+                log.exception(msg)
+                return msg
+
+            errmsg = root.xpath('//*[@id="errorMessage"]')
+            if errmsg:
+                msg = tostring(errmsg, method='text', encoding=unicode).strip()
+                log.error(msg)
+                # The error is almost always a not found error
+                found = False
+
+        if found:
+            for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
+                for a in div.xpath(r'descendant::a[@class="title" and @href]'):
+                    title = tostring(a, method='text', encoding=unicode).lower()
+                    if 'bulk pack' not in title:
+                        matches.append(a.get('href'))
+                    break
 
         # Keep only the top 5 matches as the matches are sorted by relevance by
         # Amazon so lower matches are not likely to be very relevant
         matches = matches[:5]
 
+        if abort.is_set():
+            return
+
         if not matches:
+            if identifiers and title and authors:
+                log('No matches found with identifiers, retrying using only'
+                        ' title and authors')
+                return self.identify(log, result_queue, abort, title=title,
+                        authors=authors, timeout=timeout)
             log.error('No matches found with query: %r'%query)
             return
 
@@ -333,6 +407,14 @@
             if not a_worker_is_alive:
                 break
 
+        for w in workers:
+            if w.amazon_id:
+                if w.isbn:
+                    self.cache_isbn_to_identifier(w.isbn, w.amazon_id)
+                if w.cover_url:
+                    self.cache_identifier_to_cover_url(w.amazon_id,
+                            w.cover_url)
+
         return None
 
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 523d012cd5..3c320d14b6 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -35,6 +35,7 @@ class Source(Plugin):
     def __init__(self, *args, **kwargs):
         Plugin.__init__(self, *args, **kwargs)
         self._isbn_to_identifier_cache = {}
+        self._identifier_to_cover_url_cache = {}
         self.cache_lock = threading.RLock()
         self._config_obj = None
         self._browser = None
@@ -68,6 +69,14 @@
         with self.cache_lock:
             return self._isbn_to_identifier_cache.get(isbn, None)
 
+    def cache_identifier_to_cover_url(self, id_, url):
+        with self.cache_lock:
+            self._identifier_to_cover_url_cache[id_] = url
+
+    def cached_identifier_to_cover_url(self, id_):
+        with self.cache_lock:
+            return self._identifier_to_cover_url_cache.get(id_, None)
+
     def get_author_tokens(self, authors, only_first_author=True):
         '''
         Take a list of authors and return a list of tokens useful for an
diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py
index 923062379e..8a7fc8e540 100644
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -145,8 +145,9 @@
     description = _('Downloads metadata from Google Books')
 
     capabilities = frozenset(['identify'])
-    touched_fields = frozenset(['title', 'authors', 'isbn', 'tags', 'pubdate',
-        'comments', 'publisher', 'author_sort']) # language currently disabled
+    touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
+        'comments', 'publisher', 'author_sort', 'identifier:isbn',
+        'identifier:google']) # language currently disabled
 
     def create_query(self, log, title=None, authors=None, identifiers={}):
         BASE_URL = 'http://books.google.com/books/feeds/volumes?'
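
A minimal standalone sketch of the thread-safe cover-URL cache that the sources/base.py hunk adds to Source, and of the way Amazon.identify() populates it from its finished workers. The DemoSource class, the sample ASIN and the URL below are hypothetical illustrations; only the two cache methods and the RLock-guarded dict mirror the patch:

    import threading

    class DemoSource(object):
        # Same pattern as the Source plugin above: a single reentrant
        # lock guards every identifier cache on the instance.
        def __init__(self):
            self._identifier_to_cover_url_cache = {}
            self.cache_lock = threading.RLock()

        def cache_identifier_to_cover_url(self, id_, url):
            # Record which cover image belongs to an identifier, e.g.
            # an amazon id harvested from a finished Worker.
            with self.cache_lock:
                self._identifier_to_cover_url_cache[id_] = url

        def cached_identifier_to_cover_url(self, id_):
            # Returns None on a cache miss, mirroring dict.get().
            with self.cache_lock:
                return self._identifier_to_cover_url_cache.get(id_, None)

    # Hypothetical usage, analogous to the loop at the end of
    # Amazon.identify() that walks its workers once they stop:
    source = DemoSource()
    source.cache_identifier_to_cover_url('B0000000AA', 'http://example.com/cover.jpg')
    print(source.cached_identifier_to_cover_url('B0000000AA'))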