Amazon metadata download: Try to scrape series information from the Amazon details page. Note that currently very few books have series information available. Often the page for the hardcover edition will list the series, but the page for the Kindle edition will not. In such cases calibre may or may not find the series, depending on which page it uses.

This commit is contained in:
Kovid Goyal 2012-02-18 14:04:41 +05:30
parent 60c97aec90
commit 45e7e3f507

View File

@ -156,6 +156,16 @@ class Worker(Thread): # Get details {{{
for name in names: for name in names:
self.lang_map[name] = code self.lang_map[name] = code
self.series_pat = re.compile(
r'''
\|\s* # Prefix
(Series)\s*:\s* # Series declaration
(?P<series>.+?)\s+ # The series name
\((Book)\s* # Book declaration
(?P<index>[0-9.]+) # Series index
\s*\)
''', re.X)
def delocalize_datestr(self, raw): def delocalize_datestr(self, raw):
if not self.months: if not self.months:
return raw return raw
@ -265,6 +275,15 @@ class Worker(Thread): # Get details {{{
except: except:
self.log.exception('Error parsing comments for url: %r'%self.url) self.log.exception('Error parsing comments for url: %r'%self.url)
try:
series, series_index = self.parse_series(root)
if series:
mi.series, mi.series_index = series, series_index
elif self.testing:
mi.series, mi.series_index = 'Dummy series for testing', 1
except:
self.log.exception('Error parsing series for url: %r'%self.url)
try: try:
self.cover_url = self.parse_cover(root) self.cover_url = self.parse_cover(root)
except: except:
@ -398,6 +417,20 @@ class Worker(Thread): # Get details {{{
ans += self._render_comments(desc[0]) ans += self._render_comments(desc[0])
return ans return ans
def parse_series(self, root):
    '''
    Extract series information from a parsed details page.

    Looks up the "buying" div inside the "ps-content" div of ``root``,
    renders the first hit as text, collapses every run of whitespace to a
    single space and searches the result with ``self.series_pat``.

    :param root: Parsed page; must provide an ``xpath()`` method.
    :return: ``(series_name, series_index)`` with the index converted to a
             float, or ``(None, None)`` when the div is absent, the pattern
             does not match, or the matched series name is empty.

    The ``float()`` conversion of the index is not guarded here, so a
    conversion error propagates to the caller.
    '''
    not_found = (None, None)

    buying = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
    if not buying:
        return not_found

    # Only the first matching div is examined.
    text = self.tostring(buying[0], method='text', encoding=unicode)
    text = re.sub(r'\s+', ' ', text)

    found = self.series_pat.search(text)
    if found is None:
        return not_found

    name = found.group('series')
    index = float(found.group('index'))
    if not name:
        return not_found
    return (name, index)
def parse_cover(self, root): def parse_cover(self, root):
imgs = root.xpath('//img[@id="prodImage" and @src]') imgs = root.xpath('//img[@id="prodImage" and @src]')
if imgs: if imgs:
@ -457,7 +490,7 @@ class Amazon(Source):
capabilities = frozenset(['identify', 'cover']) capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'identifier:amazon', touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate', 'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate',
'languages']) 'languages', 'series'])
has_html_comments = True has_html_comments = True
supports_gzip_transfer_encoding = True supports_gzip_transfer_encoding = True
@ -685,13 +718,15 @@ class Amazon(Source):
from lxml.html import tostring from lxml.html import tostring
import html5lib import html5lib
testing = getattr(self, 'running_a_test', False)
query, domain = self.create_query(log, title=title, authors=authors, query, domain = self.create_query(log, title=title, authors=authors,
identifiers=identifiers) identifiers=identifiers)
if query is None: if query is None:
log.error('Insufficient metadata to construct query') log.error('Insufficient metadata to construct query')
return return
br = self.browser br = self.browser
if getattr(self, 'running_a_test', False): if testing:
print ('Using user agent for amazon: %s'%self.user_agent) print ('Using user agent for amazon: %s'%self.user_agent)
try: try:
raw = br.open_novisit(query, timeout=timeout).read().strip() raw = br.open_novisit(query, timeout=timeout).read().strip()
@ -714,7 +749,7 @@ class Amazon(Source):
raw = clean_ascii_chars(xml_to_unicode(raw, raw = clean_ascii_chars(xml_to_unicode(raw,
strip_encoding_pats=True, resolve_entities=True)[0]) strip_encoding_pats=True, resolve_entities=True)[0])
if getattr(self, 'running_a_test', False): if testing:
import tempfile import tempfile
with tempfile.NamedTemporaryFile(prefix='amazon_results_', with tempfile.NamedTemporaryFile(prefix='amazon_results_',
suffix='.html', delete=False) as f: suffix='.html', delete=False) as f:
@ -757,8 +792,7 @@ class Amazon(Source):
return return
workers = [Worker(url, result_queue, br, log, i, domain, self, workers = [Worker(url, result_queue, br, log, i, domain, self,
testing=getattr(self, 'running_a_test', False)) for i, url in testing=testing) for i, url in enumerate(matches)]
enumerate(matches)]
for w in workers: for w in workers:
w.start() w.start()
@ -820,9 +854,18 @@ if __name__ == '__main__': # tests {{{
# To run these tests use: calibre-debug -e # To run these tests use: calibre-debug -e
# src/calibre/ebooks/metadata/sources/amazon.py # src/calibre/ebooks/metadata/sources/amazon.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin, from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
isbn_test, title_test, authors_test, comments_test) isbn_test, title_test, authors_test, comments_test, series_test)
com_tests = [ # {{{ com_tests = [ # {{{
( # Series
{'identifiers':{'amazon':'0756407117'}},
[title_test(
"Throne of the Crescent Moon"
, exact=True), series_test('Crescent Moon Kingdoms', 1),
comments_test('Makhslood'),
]
),
( # Different comments markup, using Book Description section ( # Different comments markup, using Book Description section
{'identifiers':{'amazon':'0982514506'}}, {'identifiers':{'amazon':'0982514506'}},
[title_test( [title_test(