mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
parent
68f63e807a
commit
cc14a6a657
@ -633,10 +633,6 @@ class Metadata(object):
|
||||
fmt('Publisher', self.publisher)
|
||||
if getattr(self, 'book_producer', False):
|
||||
fmt('Book Producer', self.book_producer)
|
||||
if self.comments:
|
||||
fmt('Comments', self.comments)
|
||||
if self.isbn:
|
||||
fmt('ISBN', self.isbn)
|
||||
if self.tags:
|
||||
fmt('Tags', u', '.join([unicode(t) for t in self.tags]))
|
||||
if self.series:
|
||||
@ -654,6 +650,9 @@ class Metadata(object):
|
||||
if self.identifiers:
|
||||
fmt('Identifiers', u', '.join(['%s:%s'%(k, v) for k, v in
|
||||
self.identifiers.iteritems()]))
|
||||
if self.comments:
|
||||
fmt('Comments', self.comments)
|
||||
|
||||
for key in self.custom_field_keys():
|
||||
val = self.get(key, None)
|
||||
if val:
|
||||
|
@ -20,6 +20,7 @@ from calibre.utils.cleantext import clean_ascii_chars
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.metadata.book.base import Metadata
|
||||
from calibre.library.comments import sanitize_comments_html
|
||||
from calibre.utils.date import parse_date
|
||||
|
||||
class Worker(Thread):
|
||||
|
||||
@ -28,10 +29,12 @@ class Worker(Thread):
|
||||
'''
|
||||
|
||||
def __init__(self, url, result_queue, browser, log, timeout=20):
|
||||
Thread.__init__(self)
|
||||
self.daemon = True
|
||||
self.url, self.result_queue = url, result_queue
|
||||
self.log, self.timeout = log, timeout
|
||||
self.browser = browser.clone_browser()
|
||||
self.cover_url = self.amazon_id = None
|
||||
self.cover_url = self.amazon_id = self.isbn = None
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
@ -111,7 +114,7 @@ class Worker(Thread):
|
||||
self.amazon_id = asin
|
||||
|
||||
try:
|
||||
mi.rating = self.parse_ratings(root)
|
||||
mi.rating = self.parse_rating(root)
|
||||
except:
|
||||
self.log.exception('Error parsing ratings for url: %r'%self.url)
|
||||
|
||||
@ -125,6 +128,37 @@ class Worker(Thread):
|
||||
except:
|
||||
self.log.exception('Error parsing cover for url: %r'%self.url)
|
||||
|
||||
pd = root.xpath('//h2[text()="Product Details"]/../div[@class="content"]')
|
||||
if pd:
|
||||
pd = pd[0]
|
||||
|
||||
try:
|
||||
isbn = self.parse_isbn(pd)
|
||||
if isbn:
|
||||
self.isbn = mi.isbn = isbn
|
||||
except:
|
||||
self.log.exception('Error parsing ISBN for url: %r'%self.url)
|
||||
|
||||
try:
|
||||
mi.publisher = self.parse_publisher(pd)
|
||||
except:
|
||||
self.log.exception('Error parsing publisher for url: %r'%self.url)
|
||||
|
||||
try:
|
||||
mi.pubdate = self.parse_pubdate(pd)
|
||||
except:
|
||||
self.log.exception('Error parsing publish date for url: %r'%self.url)
|
||||
|
||||
try:
|
||||
lang = self.parse_language(pd)
|
||||
if lang:
|
||||
mi.language = lang
|
||||
except:
|
||||
self.log.exception('Error parsing language for url: %r'%self.url)
|
||||
|
||||
else:
|
||||
self.log.warning('Failed to find product description for url: %r'%self.url)
|
||||
|
||||
self.result_queue.put(mi)
|
||||
|
||||
def parse_asin(self, root):
|
||||
@ -140,27 +174,23 @@ class Worker(Thread):
|
||||
method='text').strip()
|
||||
else:
|
||||
title = tostring(tdiv, encoding=unicode, method='text').strip()
|
||||
return re.sub(r'[([].*[)]]', '', title).strip()
|
||||
return re.sub(r'[(\[].*[)\]]', '', title).strip()
|
||||
|
||||
def parse_authors(self, root):
|
||||
bdiv = root.xpath('//div[@class="buying"]')[0]
|
||||
aname = bdiv.xpath('descendant::span[@class="contributorNameTrigger"]')
|
||||
aname = root.xpath('//span[@class="contributorNameTrigger"]')
|
||||
authors = [tostring(x, encoding=unicode, method='text').strip() for x
|
||||
in aname]
|
||||
return authors
|
||||
|
||||
def parse_ratings(self, root):
|
||||
ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
|
||||
def parse_rating(self, root):
|
||||
ratings = root.xpath('//div[@class="jumpBar"]/descendant::span[@class="asinReviewsSummary"]')
|
||||
pat = re.compile(r'([0-9.]+) out of (\d+) stars')
|
||||
if ratings:
|
||||
for elem in ratings[0].xpath('descendant::*[@title]'):
|
||||
t = elem.get('title')
|
||||
t = elem.get('title').strip()
|
||||
m = pat.match(t)
|
||||
if m is not None:
|
||||
try:
|
||||
return float(m.group(1))/float(m.group(2)) * 5
|
||||
except:
|
||||
pass
|
||||
return float(m.group(1))/float(m.group(2)) * 5
|
||||
|
||||
def parse_comments(self, root):
|
||||
desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
|
||||
@ -193,6 +223,37 @@ class Worker(Thread):
|
||||
bn = sparts[0] + sparts[-1]
|
||||
return ('/'.join(parts[:-1]))+'/'+bn
|
||||
|
||||
def parse_isbn(self, pd):
|
||||
for x in reversed(pd.xpath(
|
||||
'descendant::*[starts-with(text(), "ISBN")]')):
|
||||
if x.tail:
|
||||
ans = check_isbn(x.tail.strip())
|
||||
if ans:
|
||||
return ans
|
||||
|
||||
def parse_publisher(self, pd):
|
||||
for x in reversed(pd.xpath(
|
||||
'descendant::*[starts-with(text(), "Publisher:")]')):
|
||||
if x.tail:
|
||||
ans = x.tail.partition(';')[0]
|
||||
return ans.partition('(')[0].strip()
|
||||
|
||||
def parse_pubdate(self, pd):
|
||||
for x in reversed(pd.xpath(
|
||||
'descendant::*[starts-with(text(), "Publisher:")]')):
|
||||
if x.tail:
|
||||
ans = x.tail
|
||||
date = ans.partition('(')[-1].replace(')', '').strip()
|
||||
return parse_date(date, assume_utc=True)
|
||||
|
||||
def parse_language(self, pd):
|
||||
for x in reversed(pd.xpath(
|
||||
'descendant::*[starts-with(text(), "Language:")]')):
|
||||
if x.tail:
|
||||
ans = x.tail.strip()
|
||||
if ans == 'English':
|
||||
return 'en'
|
||||
|
||||
|
||||
class Amazon(Source):
|
||||
|
||||
@ -200,7 +261,8 @@ class Amazon(Source):
|
||||
description = _('Downloads metadata from Amazon')
|
||||
|
||||
capabilities = frozenset(['identify'])
|
||||
touched_fields = frozenset(['title', 'authors', 'isbn', 'pubdate', 'comments'])
|
||||
touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
|
||||
'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])
|
||||
|
||||
AMAZON_DOMAINS = {
|
||||
'com': _('US'),
|
||||
@ -254,6 +316,10 @@ class Amazon(Source):
|
||||
|
||||
def identify(self, log, result_queue, abort, title=None, authors=None,
|
||||
identifiers={}, timeout=20):
|
||||
'''
|
||||
Note this method will retry without identifiers automatically if no
|
||||
match is found with identifiers.
|
||||
'''
|
||||
query = self.create_query(log, title=title, authors=authors,
|
||||
identifiers=identifiers)
|
||||
if query is None:
|
||||
@ -281,37 +347,45 @@ class Amazon(Source):
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||
resolve_entities=True)[0]
|
||||
|
||||
if '<title>404 - ' in raw:
|
||||
log.error('No matches found for query: %r'%query)
|
||||
return
|
||||
|
||||
try:
|
||||
root = soupparser.fromstring(clean_ascii_chars(raw))
|
||||
except:
|
||||
msg = 'Failed to parse amazon page for query: %r'%query
|
||||
log.exception(msg)
|
||||
return msg
|
||||
|
||||
errmsg = root.xpath('//*[@id="errorMessage"]')
|
||||
if errmsg:
|
||||
msg = tostring(errmsg, method='text', encoding=unicode).strip()
|
||||
log.error(msg)
|
||||
# The error is almost always a not found error
|
||||
return
|
||||
|
||||
matches = []
|
||||
for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
|
||||
for a in div.xpath(r'descendant::a[@class="title" and @href]'):
|
||||
title = tostring(a, method='text', encoding=unicode).lower()
|
||||
if 'bulk pack' not in title:
|
||||
matches.append(a.get('href'))
|
||||
break
|
||||
found = '<title>404 - ' not in raw
|
||||
|
||||
if found:
|
||||
try:
|
||||
root = soupparser.fromstring(clean_ascii_chars(raw))
|
||||
except:
|
||||
msg = 'Failed to parse amazon page for query: %r'%query
|
||||
log.exception(msg)
|
||||
return msg
|
||||
|
||||
errmsg = root.xpath('//*[@id="errorMessage"]')
|
||||
if errmsg:
|
||||
msg = tostring(errmsg, method='text', encoding=unicode).strip()
|
||||
log.error(msg)
|
||||
# The error is almost always a not found error
|
||||
found = False
|
||||
|
||||
if found:
|
||||
for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
|
||||
for a in div.xpath(r'descendant::a[@class="title" and @href]'):
|
||||
title = tostring(a, method='text', encoding=unicode).lower()
|
||||
if 'bulk pack' not in title:
|
||||
matches.append(a.get('href'))
|
||||
break
|
||||
|
||||
# Keep only the top 5 matches as the matches are sorted by relevance by
|
||||
# Amazon so lower matches are not likely to be very relevant
|
||||
matches = matches[:5]
|
||||
|
||||
if abort.is_set():
|
||||
return
|
||||
|
||||
if not matches:
|
||||
if identifiers and title and authors:
|
||||
self.log('No matches found with identifiers, retrying using only'
|
||||
' title and authors')
|
||||
return self.identify(log, result_queue, abort, title=title,
|
||||
authors=authors, timeout=timeout)
|
||||
log.error('No matches found with query: %r'%query)
|
||||
return
|
||||
|
||||
@ -333,6 +407,14 @@ class Amazon(Source):
|
||||
if not a_worker_is_alive:
|
||||
break
|
||||
|
||||
for w in workers:
|
||||
if w.amazon_id:
|
||||
if w.isbn:
|
||||
self.cache_isbn_to_identifier(w.isbn, w.amazon_id)
|
||||
if w.cover_url:
|
||||
self.cache_identifier_to_cover_url(w.amazon_id,
|
||||
w.cover_url)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
@ -35,6 +35,7 @@ class Source(Plugin):
|
||||
def __init__(self, *args, **kwargs):
|
||||
Plugin.__init__(self, *args, **kwargs)
|
||||
self._isbn_to_identifier_cache = {}
|
||||
self._identifier_to_cover_url_cache = {}
|
||||
self.cache_lock = threading.RLock()
|
||||
self._config_obj = None
|
||||
self._browser = None
|
||||
@ -68,6 +69,14 @@ class Source(Plugin):
|
||||
with self.cache_lock:
|
||||
return self._isbn_to_identifier_cache.get(isbn, None)
|
||||
|
||||
def cache_identifier_to_cover_url(self, id_, url):
|
||||
with self.cache_lock:
|
||||
self._identifier_to_cover_url_cache[id_] = url
|
||||
|
||||
def cached_identifier_to_cover_url(self, id_):
|
||||
with self.cache_lock:
|
||||
return self._identifier_to_cover_url_cache.get(id_, None)
|
||||
|
||||
def get_author_tokens(self, authors, only_first_author=True):
|
||||
'''
|
||||
Take a list of authors and return a list of tokens useful for an
|
||||
|
@ -145,8 +145,9 @@ class GoogleBooks(Source):
|
||||
description = _('Downloads metadata from Google Books')
|
||||
|
||||
capabilities = frozenset(['identify'])
|
||||
touched_fields = frozenset(['title', 'authors', 'isbn', 'tags', 'pubdate',
|
||||
'comments', 'publisher', 'author_sort']) # language currently disabled
|
||||
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
|
||||
'comments', 'publisher', 'author_sort', 'identifier:isbn',
|
||||
'identifier:google']) # language currently disabled
|
||||
|
||||
def create_query(self, log, title=None, authors=None, identifiers={}):
|
||||
BASE_URL = 'http://books.google.com/books/feeds/volumes?'
|
||||
|
Loading…
x
Reference in New Issue
Block a user