Mirror of https://github.com/kovidgoyal/calibre.git

Commit cc14a6a657 (parent 68f63e807a)
diff --git a/src/calibre/ebooks/metadata/book/base.py b/src/calibre/ebooks/metadata/book/base.py
--- a/src/calibre/ebooks/metadata/book/base.py
+++ b/src/calibre/ebooks/metadata/book/base.py
@@ -633,10 +633,6 @@ class Metadata(object):
             fmt('Publisher', self.publisher)
         if getattr(self, 'book_producer', False):
             fmt('Book Producer', self.book_producer)
-        if self.comments:
-            fmt('Comments', self.comments)
-        if self.isbn:
-            fmt('ISBN', self.isbn)
         if self.tags:
             fmt('Tags', u', '.join([unicode(t) for t in self.tags]))
         if self.series:
@@ -654,6 +650,9 @@ class Metadata(object):
         if self.identifiers:
             fmt('Identifiers', u', '.join(['%s:%s'%(k, v) for k, v in
                 self.identifiers.iteritems()]))
+        if self.comments:
+            fmt('Comments', self.comments)
+
         for key in self.custom_field_keys():
             val = self.get(key, None)
             if val:
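Taken together, these two hunks move the Comments line so it prints after
Identifiers in the Metadata text dump, and drop the standalone ISBN line
(the ISBN now surfaces through the identifiers map). A minimal re-creation
of the identifier formatting, with items() standing in for the diff's
Python 2 iteritems() and made-up identifier values:

    identifiers = {'isbn': '9780545010221', 'amazon': 'B003JTHWKU'}
    print(u', '.join(['%s:%s' % (k, v) for k, v in identifiers.items()]))
    # prints e.g.: isbn:9780545010221, amazon:B003JTHWKU (dict order varies)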
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -20,6 +20,7 @@ from calibre.utils.cleantext import clean_ascii_chars
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.library.comments import sanitize_comments_html
+from calibre.utils.date import parse_date
 
 class Worker(Thread):
 
@@ -28,10 +29,12 @@ class Worker(Thread):
     '''
 
     def __init__(self, url, result_queue, browser, log, timeout=20):
+        Thread.__init__(self)
+        self.daemon = True
         self.url, self.result_queue = url, result_queue
         self.log, self.timeout = log, timeout
         self.browser = browser.clone_browser()
-        self.cover_url = self.amazon_id = None
+        self.cover_url = self.amazon_id = self.isbn = None
 
     def run(self):
         try:
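The constructor now calls Thread.__init__ and marks the worker as a daemon,
so stray workers cannot keep the process alive, and it grows an isbn slot
alongside cover_url and amazon_id. A self-contained sketch of this worker
pattern, with the calibre-specific browser cloning and page parsing replaced
by a dummy fetch and a hypothetical example URL:

    try:
        from queue import Queue    # Python 3
    except ImportError:
        from Queue import Queue    # Python 2, as in the diff's era
    from threading import Thread

    class Worker(Thread):

        def __init__(self, url, result_queue, timeout=20):
            Thread.__init__(self)
            self.daemon = True  # do not block interpreter exit
            self.url, self.result_queue = url, result_queue
            self.timeout = timeout
            self.cover_url = self.amazon_id = self.isbn = None

        def run(self):
            # Stand-in for the real page fetch and parse.
            self.result_queue.put('fetched: %s' % self.url)

    q = Queue()
    w = Worker('http://www.amazon.com/dp/0316038377', q)
    w.start()
    w.join()
    print(q.get())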
@@ -111,7 +114,7 @@ class Worker(Thread):
         self.amazon_id = asin
 
         try:
-            mi.rating = self.parse_ratings(root)
+            mi.rating = self.parse_rating(root)
         except:
             self.log.exception('Error parsing ratings for url: %r'%self.url)
 
@@ -125,6 +128,37 @@ class Worker(Thread):
         except:
             self.log.exception('Error parsing cover for url: %r'%self.url)
 
+        pd = root.xpath('//h2[text()="Product Details"]/../div[@class="content"]')
+        if pd:
+            pd = pd[0]
+
+            try:
+                isbn = self.parse_isbn(pd)
+                if isbn:
+                    self.isbn = mi.isbn = isbn
+            except:
+                self.log.exception('Error parsing ISBN for url: %r'%self.url)
+
+            try:
+                mi.publisher = self.parse_publisher(pd)
+            except:
+                self.log.exception('Error parsing publisher for url: %r'%self.url)
+
+            try:
+                mi.pubdate = self.parse_pubdate(pd)
+            except:
+                self.log.exception('Error parsing publish date for url: %r'%self.url)
+
+            try:
+                lang = self.parse_language(pd)
+                if lang:
+                    mi.language = lang
+            except:
+                self.log.exception('Error parsing language for url: %r'%self.url)
+
+        else:
+            self.log.warning('Failed to find product description for url: %r'%self.url)
+
         self.result_queue.put(mi)
 
     def parse_asin(self, root):
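The new Product Details block drives four small helpers (added in a later
hunk) that all share one lxml idiom: locate the label element, such as
<b>Publisher:</b>, then read the value from its tail, the text that follows
the element's closing tag. A hedged, self-contained illustration using
made-up markup in the style of Amazon's product-details list:

    from lxml import html

    pd = html.fromstring(
        '<div class="content"><ul>'
        '<li><b>Publisher:</b> Penguin; 1st edition (May 4, 2010)</li>'
        '<li><b>Language:</b> English</li>'
        '</ul></div>')

    for x in reversed(pd.xpath('descendant::*[starts-with(text(), "Publisher:")]')):
        if x.tail:
            ans = x.tail.partition(';')[0]
            print(ans.partition('(')[0].strip())  # -> Penguin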
@@ -140,27 +174,23 @@ class Worker(Thread):
                     method='text').strip()
         else:
             title = tostring(tdiv, encoding=unicode, method='text').strip()
-        return re.sub(r'[([].*[)]]', '', title).strip()
+        return re.sub(r'[(\[].*[)\]]', '', title).strip()
 
     def parse_authors(self, root):
-        bdiv = root.xpath('//div[@class="buying"]')[0]
-        aname = bdiv.xpath('descendant::span[@class="contributorNameTrigger"]')
+        aname = root.xpath('//span[@class="contributorNameTrigger"]')
         authors = [tostring(x, encoding=unicode, method='text').strip() for x
                 in aname]
         return authors
 
-    def parse_ratings(self, root):
-        ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
+    def parse_rating(self, root):
+        ratings = root.xpath('//div[@class="jumpBar"]/descendant::span[@class="asinReviewsSummary"]')
         pat = re.compile(r'([0-9.]+) out of (\d+) stars')
         if ratings:
             for elem in ratings[0].xpath('descendant::*[@title]'):
-                t = elem.get('title')
+                t = elem.get('title').strip()
                 m = pat.match(t)
                 if m is not None:
-                    try:
-                        return float(m.group(1))/float(m.group(2)) * 5
-                    except:
-                        pass
+                    return float(m.group(1))/float(m.group(2)) * 5
 
     def parse_comments(self, root):
         desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
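Two fixes here are easy to miss. The parse_title regex change is a pure
escaping fix: the old closing [)]] parsed as the class [)] plus a literal ],
so it only matched titles with a stray bracket, while the new [)\]] matches
either closer. And parse_rating (renamed to match the call site fixed
earlier) now reads the jumpBar summary and trusts the title attribute enough
to drop the inner try/except. The conversion itself, re-created with a
representative title string:

    import re

    pat = re.compile(r'([0-9.]+) out of (\d+) stars')
    m = pat.match('4.2 out of 5 stars')
    if m is not None:
        print(float(m.group(1)) / float(m.group(2)) * 5)  # -> 4.2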
@@ -193,6 +223,37 @@ class Worker(Thread):
             bn = sparts[0] + sparts[-1]
         return ('/'.join(parts[:-1]))+'/'+bn
 
+    def parse_isbn(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "ISBN")]')):
+            if x.tail:
+                ans = check_isbn(x.tail.strip())
+                if ans:
+                    return ans
+
+    def parse_publisher(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "Publisher:")]')):
+            if x.tail:
+                ans = x.tail.partition(';')[0]
+                return ans.partition('(')[0].strip()
+
+    def parse_pubdate(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "Publisher:")]')):
+            if x.tail:
+                ans = x.tail
+                date = ans.partition('(')[-1].replace(')', '').strip()
+                return parse_date(date, assume_utc=True)
+
+    def parse_language(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "Language:")]')):
+            if x.tail:
+                ans = x.tail.strip()
+                if ans == 'English':
+                    return 'en'
+
 
 class Amazon(Source):
 
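Each helper returns None implicitly when nothing matches, which the calling
try/except blocks tolerate. parse_pubdate relies on string partitioning to
isolate the parenthesised date before handing it to calibre's parse_date;
the slicing works like this (standalone, with a made-up tail string):

    ans = ' Penguin; 1st edition (May 4, 2010)'
    date = ans.partition('(')[-1].replace(')', '').strip()
    print(date)  # -> May 4, 2010; parse_date then builds a datetime from it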
@@ -200,7 +261,8 @@ class Amazon(Source):
     description = _('Downloads metadata from Amazon')
 
     capabilities = frozenset(['identify'])
-    touched_fields = frozenset(['title', 'authors', 'isbn', 'pubdate', 'comments'])
+    touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
+        'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])
 
     AMAZON_DOMAINS = {
             'com': _('US'),
@@ -254,6 +316,10 @@ class Amazon(Source):
 
     def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=20):
+        '''
+        Note this method will retry without identifiers automatically if no
+        match is found with identifiers.
+        '''
         query = self.create_query(log, title=title, authors=authors,
                 identifiers=identifiers)
         if query is None:
@@ -281,37 +347,45 @@ class Amazon(Source):
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
                 resolve_entities=True)[0]
 
-        if '<title>404 - ' in raw:
-            log.error('No matches found for query: %r'%query)
-            return
-
-        try:
-            root = soupparser.fromstring(clean_ascii_chars(raw))
-        except:
-            msg = 'Failed to parse amazon page for query: %r'%query
-            log.exception(msg)
-            return msg
-
-        errmsg = root.xpath('//*[@id="errorMessage"]')
-        if errmsg:
-            msg = tostring(errmsg, method='text', encoding=unicode).strip()
-            log.error(msg)
-            # The error is almost always a not found error
-            return
-
         matches = []
-        for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
-            for a in div.xpath(r'descendant::a[@class="title" and @href]'):
-                title = tostring(a, method='text', encoding=unicode).lower()
-                if 'bulk pack' not in title:
-                    matches.append(a.get('href'))
-                break
+        found = '<title>404 - ' not in raw
+
+        if found:
+            try:
+                root = soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                msg = 'Failed to parse amazon page for query: %r'%query
+                log.exception(msg)
+                return msg
+
+            errmsg = root.xpath('//*[@id="errorMessage"]')
+            if errmsg:
+                msg = tostring(errmsg, method='text', encoding=unicode).strip()
+                log.error(msg)
+                # The error is almost always a not found error
+                found = False
+
+        if found:
+            for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
+                for a in div.xpath(r'descendant::a[@class="title" and @href]'):
+                    title = tostring(a, method='text', encoding=unicode).lower()
+                    if 'bulk pack' not in title:
+                        matches.append(a.get('href'))
+                    break
 
         # Keep only the top 5 matches as the matches are sorted by relevance by
         # Amazon so lower matches are not likely to be very relevant
         matches = matches[:5]
 
+        if abort.is_set():
+            return
+
         if not matches:
+            if identifiers and title and authors:
+                self.log('No matches found with identifiers, retrying using only'
+                        ' title and authors')
+                return self.identify(log, result_queue, abort, title=title,
+                        authors=authors, timeout=timeout)
             log.error('No matches found with query: %r'%query)
             return
 
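The reworked flow replaces three early returns with a found flag, so a 404
page or an Amazon error message falls through to the shared no-matches
handling, which can now retry without identifiers. A simplified, hypothetical
sketch of that retry rule (run_query is a stub, not calibre code, and the
URL is invented):

    def run_query(title, authors, identifiers):
        # Stub: pretend identifier-based lookups miss, title lookups hit.
        return [] if identifiers else ['http://www.amazon.com/dp/0316038377']

    def identify(log, title=None, authors=None, identifiers={}):
        matches = run_query(title, authors, identifiers)
        if not matches and identifiers and title and authors:
            log('No matches found with identifiers, retrying using only'
                ' title and authors')
            return identify(log, title=title, authors=authors)
        return matches

    print(identify(print, title='The Name of the Wind',
                   authors=['Patrick Rothfuss'],
                   identifiers={'isbn': '9780756404741'}))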
@@ -333,6 +407,14 @@ class Amazon(Source):
             if not a_worker_is_alive:
                 break
 
+        for w in workers:
+            if w.amazon_id:
+                if w.isbn:
+                    self.cache_isbn_to_identifier(w.isbn, w.amazon_id)
+                if w.cover_url:
+                    self.cache_identifier_to_cover_url(w.amazon_id,
+                            w.cover_url)
+
         return None
 
 
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -35,6 +35,7 @@ class Source(Plugin):
     def __init__(self, *args, **kwargs):
         Plugin.__init__(self, *args, **kwargs)
         self._isbn_to_identifier_cache = {}
+        self._identifier_to_cover_url_cache = {}
         self.cache_lock = threading.RLock()
         self._config_obj = None
         self._browser = None
@@ -68,6 +69,14 @@ class Source(Plugin):
         with self.cache_lock:
             return self._isbn_to_identifier_cache.get(isbn, None)
 
+    def cache_identifier_to_cover_url(self, id_, url):
+        with self.cache_lock:
+            self._identifier_to_cover_url_cache[id_] = url
+
+    def cached_identifier_to_cover_url(self, id_):
+        with self.cache_lock:
+            return self._identifier_to_cover_url_cache.get(id_, None)
+
     def get_author_tokens(self, authors, only_first_author=True):
         '''
         Take a list of authors and return a list of tokens useful for an
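These two methods mirror the existing ISBN cache: every access goes through
cache_lock, an RLock shared by all the per-source caches, which is what the
worker-results loop in amazon.py relies on. A self-contained sketch of the
pattern with the Source base class reduced to a stub and a made-up cover URL:

    import threading

    class Source(object):

        def __init__(self):
            self._identifier_to_cover_url_cache = {}
            self.cache_lock = threading.RLock()

        def cache_identifier_to_cover_url(self, id_, url):
            with self.cache_lock:
                self._identifier_to_cover_url_cache[id_] = url

        def cached_identifier_to_cover_url(self, id_):
            with self.cache_lock:
                return self._identifier_to_cover_url_cache.get(id_, None)

    s = Source()
    s.cache_identifier_to_cover_url('B0031COCHM', 'http://example.com/cover.jpg')
    print(s.cached_identifier_to_cover_url('B0031COCHM'))  # -> the cached URL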
diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -145,8 +145,9 @@ class GoogleBooks(Source):
     description = _('Downloads metadata from Google Books')
 
     capabilities = frozenset(['identify'])
-    touched_fields = frozenset(['title', 'authors', 'isbn', 'tags', 'pubdate',
-        'comments', 'publisher', 'author_sort']) # language currently disabled
+    touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
+        'comments', 'publisher', 'author_sort', 'identifier:isbn',
+        'identifier:google']) # language currently disabled
 
     def create_query(self, log, title=None, authors=None, identifiers={}):
         BASE_URL = 'http://books.google.com/books/feeds/volumes?'