mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Amazon metadata download: Ignore Spanish edition entries when searching for a book on amazon.com
This commit is contained in:
parent
9fb122cd4b
commit
949b3c04f9
@ -21,7 +21,7 @@ from calibre.ebooks.metadata.book.base import Metadata
|
|||||||
from calibre.utils.date import parse_only_date
|
from calibre.utils.date import parse_only_date
|
||||||
from calibre.utils.localization import canonicalize_lang
|
from calibre.utils.localization import canonicalize_lang
|
||||||
|
|
||||||
class Worker(Thread): # Get details {{{
|
class Worker(Thread): # Get details {{{
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Get book details from amazons book page in a separate thread
|
Get book details from amazons book page in a separate thread
|
||||||
@ -43,12 +43,12 @@ class Worker(Thread): # Get details {{{
|
|||||||
|
|
||||||
months = {
|
months = {
|
||||||
'de': {
|
'de': {
|
||||||
1 : ['jän'],
|
1: ['jän'],
|
||||||
2 : ['februar'],
|
2: ['februar'],
|
||||||
3 : ['märz'],
|
3: ['märz'],
|
||||||
5 : ['mai'],
|
5: ['mai'],
|
||||||
6 : ['juni'],
|
6: ['juni'],
|
||||||
7 : ['juli'],
|
7: ['juli'],
|
||||||
10: ['okt'],
|
10: ['okt'],
|
||||||
12: ['dez']
|
12: ['dez']
|
||||||
},
|
},
|
||||||
@ -276,7 +276,6 @@ class Worker(Thread): # Get details {{{
|
|||||||
self.log.exception('Error parsing authors for url: %r'%self.url)
|
self.log.exception('Error parsing authors for url: %r'%self.url)
|
||||||
authors = []
|
authors = []
|
||||||
|
|
||||||
|
|
||||||
if not title or not authors or not asin:
|
if not title or not authors or not asin:
|
||||||
self.log.error('Could not find title/authors/asin for %r'%self.url)
|
self.log.error('Could not find title/authors/asin for %r'%self.url)
|
||||||
self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title,
|
self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title,
|
||||||
@ -431,7 +430,6 @@ class Worker(Thread): # Get details {{{
|
|||||||
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
|
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
|
||||||
return sanitize_comments_html(desc)
|
return sanitize_comments_html(desc)
|
||||||
|
|
||||||
|
|
||||||
def parse_comments(self, root):
|
def parse_comments(self, root):
|
||||||
ans = ''
|
ans = ''
|
||||||
desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
|
desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
|
||||||
@ -528,13 +526,13 @@ class Amazon(Source):
|
|||||||
|
|
||||||
AMAZON_DOMAINS = {
|
AMAZON_DOMAINS = {
|
||||||
'com': _('US'),
|
'com': _('US'),
|
||||||
'fr' : _('France'),
|
'fr': _('France'),
|
||||||
'de' : _('Germany'),
|
'de': _('Germany'),
|
||||||
'uk' : _('UK'),
|
'uk': _('UK'),
|
||||||
'it' : _('Italy'),
|
'it': _('Italy'),
|
||||||
'jp' : _('Japan'),
|
'jp': _('Japan'),
|
||||||
'es' : _('Spain'),
|
'es': _('Spain'),
|
||||||
'br' : _('Brazil'),
|
'br': _('Brazil'),
|
||||||
}
|
}
|
||||||
|
|
||||||
options = (
|
options = (
|
||||||
@ -592,7 +590,7 @@ class Amazon(Source):
|
|||||||
return domain, val
|
return domain, val
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
def get_book_url(self, identifiers): # {{{
|
def get_book_url(self, identifiers): # {{{
|
||||||
domain, asin = self.get_domain_and_asin(identifiers)
|
domain, asin = self.get_domain_and_asin(identifiers)
|
||||||
if domain and asin:
|
if domain and asin:
|
||||||
url = None
|
url = None
|
||||||
@ -637,8 +635,7 @@ class Amazon(Source):
|
|||||||
mi.tags = list(map(fixcase, mi.tags))
|
mi.tags = list(map(fixcase, mi.tags))
|
||||||
mi.isbn = check_isbn(mi.isbn)
|
mi.isbn = check_isbn(mi.isbn)
|
||||||
|
|
||||||
|
def create_query(self, log, title=None, authors=None, identifiers={}, # {{{
|
||||||
def create_query(self, log, title=None, authors=None, identifiers={}, # {{{
|
|
||||||
domain=None):
|
domain=None):
|
||||||
if domain is None:
|
if domain is None:
|
||||||
domain = self.domain
|
domain = self.domain
|
||||||
@ -648,8 +645,8 @@ class Amazon(Source):
|
|||||||
domain = idomain
|
domain = idomain
|
||||||
|
|
||||||
# See the amazon detailed search page to get all options
|
# See the amazon detailed search page to get all options
|
||||||
q = { 'search-alias' : 'aps',
|
q = {'search-alias': 'aps',
|
||||||
'unfiltered' : '1',
|
'unfiltered': '1',
|
||||||
}
|
}
|
||||||
|
|
||||||
if domain == 'com':
|
if domain == 'com':
|
||||||
@ -704,7 +701,7 @@ class Amazon(Source):
|
|||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def get_cached_cover_url(self, identifiers): # {{{
|
def get_cached_cover_url(self, identifiers): # {{{
|
||||||
url = None
|
url = None
|
||||||
domain, asin = self.get_domain_and_asin(identifiers)
|
domain, asin = self.get_domain_and_asin(identifiers)
|
||||||
if asin is None:
|
if asin is None:
|
||||||
@ -717,14 +714,17 @@ class Amazon(Source):
|
|||||||
return url
|
return url
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def parse_results_page(self, root): # {{{
|
def parse_results_page(self, root): # {{{
|
||||||
from lxml.html import tostring
|
from lxml.html import tostring
|
||||||
|
|
||||||
matches = []
|
matches = []
|
||||||
|
|
||||||
def title_ok(title):
|
def title_ok(title):
|
||||||
title = title.lower()
|
title = title.lower()
|
||||||
for x in ('bulk pack', '[audiobook]', '[audio cd]'):
|
bad = ['bulk pack', '[audiobook]', '[audio cd]']
|
||||||
|
if self.domain == 'com':
|
||||||
|
bad.append('(spanish edition)')
|
||||||
|
for x in bad:
|
||||||
if x in title:
|
if x in title:
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
@ -751,13 +751,12 @@ class Amazon(Source):
|
|||||||
matches.append(a.get('href'))
|
matches.append(a.get('href'))
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
# Keep only the top 5 matches as the matches are sorted by relevance by
|
# Keep only the top 5 matches as the matches are sorted by relevance by
|
||||||
# Amazon so lower matches are not likely to be very relevant
|
# Amazon so lower matches are not likely to be very relevant
|
||||||
return matches[:5]
|
return matches[:5]
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
|
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
|
||||||
identifiers={}, timeout=30):
|
identifiers={}, timeout=30):
|
||||||
'''
|
'''
|
||||||
Note this method will retry without identifiers automatically if no
|
Note this method will retry without identifiers automatically if no
|
||||||
@ -795,7 +794,6 @@ class Amazon(Source):
|
|||||||
log.exception(msg)
|
log.exception(msg)
|
||||||
return as_unicode(msg)
|
return as_unicode(msg)
|
||||||
|
|
||||||
|
|
||||||
raw = clean_ascii_chars(xml_to_unicode(raw,
|
raw = clean_ascii_chars(xml_to_unicode(raw,
|
||||||
strip_encoding_pats=True, resolve_entities=True)[0])
|
strip_encoding_pats=True, resolve_entities=True)[0])
|
||||||
|
|
||||||
@ -825,7 +823,6 @@ class Amazon(Source):
|
|||||||
# The error is almost always a not found error
|
# The error is almost always a not found error
|
||||||
found = False
|
found = False
|
||||||
|
|
||||||
|
|
||||||
if found:
|
if found:
|
||||||
matches = self.parse_results_page(root)
|
matches = self.parse_results_page(root)
|
||||||
|
|
||||||
@ -863,7 +860,7 @@ class Amazon(Source):
|
|||||||
return None
|
return None
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def download_cover(self, log, result_queue, abort, # {{{
|
def download_cover(self, log, result_queue, abort, # {{{
|
||||||
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
|
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
|
||||||
cached_url = self.get_cached_cover_url(identifiers)
|
cached_url = self.get_cached_cover_url(identifiers)
|
||||||
if cached_url is None:
|
if cached_url is None:
|
||||||
@ -900,39 +897,44 @@ class Amazon(Source):
|
|||||||
log.exception('Failed to download cover from:', cached_url)
|
log.exception('Failed to download cover from:', cached_url)
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
if __name__ == '__main__': # tests {{{
|
if __name__ == '__main__': # tests {{{
|
||||||
# To run these test use: calibre-debug -e
|
# To run these test use: calibre-debug -e
|
||||||
# src/calibre/ebooks/metadata/sources/amazon.py
|
# src/calibre/ebooks/metadata/sources/amazon.py
|
||||||
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
||||||
isbn_test, title_test, authors_test, comments_test, series_test)
|
isbn_test, title_test, authors_test, comments_test, series_test)
|
||||||
com_tests = [ # {{{
|
com_tests = [ # {{{
|
||||||
|
|
||||||
( # + in title and uses id="main-image" for cover
|
( # Has a spanish edition
|
||||||
|
{'title':'11/22/63'},
|
||||||
|
[title_test('11/22/63: A Novel', exact=True), authors_test(['Stephen King']),]
|
||||||
|
),
|
||||||
|
|
||||||
|
( # + in title and uses id="main-image" for cover
|
||||||
{'title':'C++ Concurrency in Action'},
|
{'title':'C++ Concurrency in Action'},
|
||||||
[title_test('C++ Concurrency in Action: Practical Multithreading',
|
[title_test('C++ Concurrency in Action: Practical Multithreading',
|
||||||
exact=True),
|
exact=True),
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
|
|
||||||
( # Series
|
( # Series
|
||||||
{'identifiers':{'amazon':'0756407117'}},
|
{'identifiers':{'amazon':'0756407117'}},
|
||||||
[title_test(
|
[title_test(
|
||||||
"Throne of the Crescent Moon"
|
"Throne of the Crescent Moon",
|
||||||
, exact=True), series_test('Crescent Moon Kingdoms', 1),
|
exact=True), series_test('Crescent Moon Kingdoms', 1),
|
||||||
comments_test('Makhslood'),
|
comments_test('Makhslood'),
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
|
|
||||||
( # Different comments markup, using Book Description section
|
( # Different comments markup, using Book Description section
|
||||||
{'identifiers':{'amazon':'0982514506'}},
|
{'identifiers':{'amazon':'0982514506'}},
|
||||||
[title_test(
|
[title_test(
|
||||||
"Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy"
|
"Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy",
|
||||||
, exact=True),
|
exact=True),
|
||||||
comments_test('Jelena'), comments_test('Leslie'),
|
comments_test('Jelena'), comments_test('Leslie'),
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
|
|
||||||
( # # in title
|
( # # in title
|
||||||
{'title':'Expert C# 2008 Business Objects',
|
{'title':'Expert C# 2008 Business Objects',
|
||||||
'authors':['Lhotka']},
|
'authors':['Lhotka']},
|
||||||
[title_test('Expert C# 2008 Business Objects', exact=True),
|
[title_test('Expert C# 2008 Business Objects', exact=True),
|
||||||
@ -948,13 +950,13 @@ if __name__ == '__main__': # tests {{{
|
|||||||
|
|
||||||
),
|
),
|
||||||
|
|
||||||
( # Sophisticated comment formatting
|
( # Sophisticated comment formatting
|
||||||
{'identifiers':{'isbn': '9781416580829'}},
|
{'identifiers':{'isbn': '9781416580829'}},
|
||||||
[title_test('Angels & Demons - Movie Tie-In: A Novel',
|
[title_test('Angels & Demons - Movie Tie-In: A Novel',
|
||||||
exact=True), authors_test(['Dan Brown'])]
|
exact=True), authors_test(['Dan Brown'])]
|
||||||
),
|
),
|
||||||
|
|
||||||
( # No specific problems
|
( # No specific problems
|
||||||
{'identifiers':{'isbn': '0743273567'}},
|
{'identifiers':{'isbn': '0743273567'}},
|
||||||
[title_test('The great gatsby', exact=True),
|
[title_test('The great gatsby', exact=True),
|
||||||
authors_test(['F. Scott Fitzgerald'])]
|
authors_test(['F. Scott Fitzgerald'])]
|
||||||
@ -967,9 +969,9 @@ if __name__ == '__main__': # tests {{{
|
|||||||
|
|
||||||
),
|
),
|
||||||
|
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
de_tests = [ # {{{
|
de_tests = [ # {{{
|
||||||
(
|
(
|
||||||
{'identifiers':{'isbn': '3548283519'}},
|
{'identifiers':{'isbn': '3548283519'}},
|
||||||
[title_test('Wer Wind Sät: Der Fünfte Fall Für Bodenstein Und Kirchhoff',
|
[title_test('Wer Wind Sät: Der Fünfte Fall Für Bodenstein Und Kirchhoff',
|
||||||
@ -977,9 +979,9 @@ if __name__ == '__main__': # tests {{{
|
|||||||
]
|
]
|
||||||
|
|
||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
it_tests = [ # {{{
|
it_tests = [ # {{{
|
||||||
(
|
(
|
||||||
{'identifiers':{'isbn': '8838922195'}},
|
{'identifiers':{'isbn': '8838922195'}},
|
||||||
[title_test('La briscola in cinque',
|
[title_test('La briscola in cinque',
|
||||||
@ -987,9 +989,9 @@ if __name__ == '__main__': # tests {{{
|
|||||||
]
|
]
|
||||||
|
|
||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
fr_tests = [ # {{{
|
fr_tests = [ # {{{
|
||||||
(
|
(
|
||||||
{'identifiers':{'isbn': '2221116798'}},
|
{'identifiers':{'isbn': '2221116798'}},
|
||||||
[title_test('L\'étrange voyage de Monsieur Daldry',
|
[title_test('L\'étrange voyage de Monsieur Daldry',
|
||||||
@ -997,9 +999,9 @@ if __name__ == '__main__': # tests {{{
|
|||||||
]
|
]
|
||||||
|
|
||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
es_tests = [ # {{{
|
es_tests = [ # {{{
|
||||||
(
|
(
|
||||||
{'identifiers':{'isbn': '8483460831'}},
|
{'identifiers':{'isbn': '8483460831'}},
|
||||||
[title_test('Tiempos Interesantes',
|
[title_test('Tiempos Interesantes',
|
||||||
@ -1007,28 +1009,28 @@ if __name__ == '__main__': # tests {{{
|
|||||||
]
|
]
|
||||||
|
|
||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
jp_tests = [ # {{{
|
jp_tests = [ # {{{
|
||||||
( # Adult filtering test
|
( # Adult filtering test
|
||||||
{'identifiers':{'isbn':'4799500066'}},
|
{'identifiers':{'isbn':'4799500066'}},
|
||||||
[title_test(u'Bitch Trap'),]
|
[title_test(u'Bitch Trap'),]
|
||||||
),
|
),
|
||||||
|
|
||||||
( # isbn -> title, authors
|
( # isbn -> title, authors
|
||||||
{'identifiers':{'isbn': '9784101302720' }},
|
{'identifiers':{'isbn': '9784101302720'}},
|
||||||
[title_test(u'精霊の守り人',
|
[title_test(u'精霊の守り人',
|
||||||
exact=True), authors_test([u'上橋 菜穂子'])
|
exact=True), authors_test([u'上橋 菜穂子'])
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
( # title, authors -> isbn (will use Shift_JIS encoding in query.)
|
( # title, authors -> isbn (will use Shift_JIS encoding in query.)
|
||||||
{'title': u'考えない練習',
|
{'title': u'考えない練習',
|
||||||
'authors': [u'小池 龍之介']},
|
'authors': [u'小池 龍之介']},
|
||||||
[isbn_test('9784093881067'), ]
|
[isbn_test('9784093881067'), ]
|
||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
br_tests = [ # {{{
|
br_tests = [ # {{{
|
||||||
(
|
(
|
||||||
{'title':'Guerra dos Tronos'},
|
{'title':'Guerra dos Tronos'},
|
||||||
[title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo',
|
[title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo',
|
||||||
@ -1036,7 +1038,7 @@ if __name__ == '__main__': # tests {{{
|
|||||||
]
|
]
|
||||||
|
|
||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
def do_test(domain, start=0, stop=None):
|
def do_test(domain, start=0, stop=None):
|
||||||
tests = globals().get(domain+'_tests')
|
tests = globals().get(domain+'_tests')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user