Amazon metadata download: Ignore Spanish edition entries when searching for a book on amazon.com

This commit is contained in:
Kovid Goyal 2013-04-09 17:07:24 +05:30
parent 9fb122cd4b
commit 949b3c04f9

View File

@ -43,12 +43,12 @@ class Worker(Thread): # Get details {{{
months = { months = {
'de': { 'de': {
1 : ['jän'], 1: ['jän'],
2 : ['februar'], 2: ['februar'],
3 : ['märz'], 3: ['märz'],
5 : ['mai'], 5: ['mai'],
6 : ['juni'], 6: ['juni'],
7 : ['juli'], 7: ['juli'],
10: ['okt'], 10: ['okt'],
12: ['dez'] 12: ['dez']
}, },
@ -276,7 +276,6 @@ class Worker(Thread): # Get details {{{
self.log.exception('Error parsing authors for url: %r'%self.url) self.log.exception('Error parsing authors for url: %r'%self.url)
authors = [] authors = []
if not title or not authors or not asin: if not title or not authors or not asin:
self.log.error('Could not find title/authors/asin for %r'%self.url) self.log.error('Could not find title/authors/asin for %r'%self.url)
self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title, self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title,
@ -431,7 +430,6 @@ class Worker(Thread): # Get details {{{
desc = re.sub(r'(?s)<!--.*?-->', '', desc) desc = re.sub(r'(?s)<!--.*?-->', '', desc)
return sanitize_comments_html(desc) return sanitize_comments_html(desc)
def parse_comments(self, root): def parse_comments(self, root):
ans = '' ans = ''
desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]') desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
@ -528,13 +526,13 @@ class Amazon(Source):
AMAZON_DOMAINS = { AMAZON_DOMAINS = {
'com': _('US'), 'com': _('US'),
'fr' : _('France'), 'fr': _('France'),
'de' : _('Germany'), 'de': _('Germany'),
'uk' : _('UK'), 'uk': _('UK'),
'it' : _('Italy'), 'it': _('Italy'),
'jp' : _('Japan'), 'jp': _('Japan'),
'es' : _('Spain'), 'es': _('Spain'),
'br' : _('Brazil'), 'br': _('Brazil'),
} }
options = ( options = (
@ -637,7 +635,6 @@ class Amazon(Source):
mi.tags = list(map(fixcase, mi.tags)) mi.tags = list(map(fixcase, mi.tags))
mi.isbn = check_isbn(mi.isbn) mi.isbn = check_isbn(mi.isbn)
def create_query(self, log, title=None, authors=None, identifiers={}, # {{{ def create_query(self, log, title=None, authors=None, identifiers={}, # {{{
domain=None): domain=None):
if domain is None: if domain is None:
@ -648,8 +645,8 @@ class Amazon(Source):
domain = idomain domain = idomain
# See the amazon detailed search page to get all options # See the amazon detailed search page to get all options
q = { 'search-alias' : 'aps', q = {'search-alias': 'aps',
'unfiltered' : '1', 'unfiltered': '1',
} }
if domain == 'com': if domain == 'com':
@ -724,7 +721,10 @@ class Amazon(Source):
def title_ok(title): def title_ok(title):
title = title.lower() title = title.lower()
for x in ('bulk pack', '[audiobook]', '[audio cd]'): bad = ['bulk pack', '[audiobook]', '[audio cd]']
if self.domain == 'com':
bad.append('(spanish edition)')
for x in bad:
if x in title: if x in title:
return False return False
return True return True
@ -751,7 +751,6 @@ class Amazon(Source):
matches.append(a.get('href')) matches.append(a.get('href'))
break break
# Keep only the top 5 matches as the matches are sorted by relevance by # Keep only the top 5 matches as the matches are sorted by relevance by
# Amazon so lower matches are not likely to be very relevant # Amazon so lower matches are not likely to be very relevant
return matches[:5] return matches[:5]
@ -795,7 +794,6 @@ class Amazon(Source):
log.exception(msg) log.exception(msg)
return as_unicode(msg) return as_unicode(msg)
raw = clean_ascii_chars(xml_to_unicode(raw, raw = clean_ascii_chars(xml_to_unicode(raw,
strip_encoding_pats=True, resolve_entities=True)[0]) strip_encoding_pats=True, resolve_entities=True)[0])
@ -825,7 +823,6 @@ class Amazon(Source):
# The error is almost always a not found error # The error is almost always a not found error
found = False found = False
if found: if found:
matches = self.parse_results_page(root) matches = self.parse_results_page(root)
@ -907,6 +904,11 @@ if __name__ == '__main__': # tests {{{
isbn_test, title_test, authors_test, comments_test, series_test) isbn_test, title_test, authors_test, comments_test, series_test)
com_tests = [ # {{{ com_tests = [ # {{{
( # Has a spanish edition
{'title':'11/22/63'},
[title_test('11/22/63: A Novel', exact=True), authors_test(['Stephen King']),]
),
( # + in title and uses id="main-image" for cover ( # + in title and uses id="main-image" for cover
{'title':'C++ Concurrency in Action'}, {'title':'C++ Concurrency in Action'},
[title_test('C++ Concurrency in Action: Practical Multithreading', [title_test('C++ Concurrency in Action: Practical Multithreading',
@ -917,8 +919,8 @@ if __name__ == '__main__': # tests {{{
( # Series ( # Series
{'identifiers':{'amazon':'0756407117'}}, {'identifiers':{'amazon':'0756407117'}},
[title_test( [title_test(
"Throne of the Crescent Moon" "Throne of the Crescent Moon",
, exact=True), series_test('Crescent Moon Kingdoms', 1), exact=True), series_test('Crescent Moon Kingdoms', 1),
comments_test('Makhslood'), comments_test('Makhslood'),
] ]
), ),
@ -926,8 +928,8 @@ if __name__ == '__main__': # tests {{{
( # Different comments markup, using Book Description section ( # Different comments markup, using Book Description section
{'identifiers':{'amazon':'0982514506'}}, {'identifiers':{'amazon':'0982514506'}},
[title_test( [title_test(
"Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy" "Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy",
, exact=True), exact=True),
comments_test('Jelena'), comments_test('Leslie'), comments_test('Jelena'), comments_test('Leslie'),
] ]
), ),
@ -1016,7 +1018,7 @@ if __name__ == '__main__': # tests {{{
), ),
( # isbn -> title, authors ( # isbn -> title, authors
{'identifiers':{'isbn': '9784101302720' }}, {'identifiers':{'isbn': '9784101302720'}},
[title_test(u'精霊の守り人', [title_test(u'精霊の守り人',
exact=True), authors_test([u'上橋 菜穂子']) exact=True), authors_test([u'上橋 菜穂子'])
] ]