mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
Amazon metadata download: Get series information from the amazon.com page, when available
This commit is contained in:
parent
84c97d3cfa
commit
cbfb38ffae
@ -549,6 +549,35 @@ class Worker(Thread): # Get details {{{
|
||||
|
||||
def parse_series(self, root):
|
||||
ans = (None, None)
|
||||
|
||||
# This is found on the paperback/hardback pages for books on amazon.com
|
||||
series = root.xpath('//div[@data-feature-name="seriesTitle"]')
|
||||
if series:
|
||||
series = series[0]
|
||||
spans = series.xpath('./span')
|
||||
if spans:
|
||||
raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip()
|
||||
m = re.search('\s+([0-9.]+)$', raw.strip())
|
||||
if m is not None:
|
||||
series_index = float(m.group(1))
|
||||
s = series.xpath('./a[@id="series-page-link"]')
|
||||
if s:
|
||||
series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip()
|
||||
if series:
|
||||
ans = (series, series_index)
|
||||
# This is found on Kindle edition pages on amazon.com
|
||||
if ans == (None, None):
|
||||
for span in root.xpath('//div[@id="aboutEbooksSection"]//li/span'):
|
||||
text = (span.text or '').strip()
|
||||
m = re.match('Book\s+([0-9.]+)', text)
|
||||
if m is not None:
|
||||
series_index = float(m.group(1))
|
||||
a = span.xpath('./a[@href]')
|
||||
if a:
|
||||
series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip()
|
||||
if series:
|
||||
ans = (series, series_index)
|
||||
if ans == (None, None):
|
||||
desc = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
|
||||
if desc:
|
||||
raw = self.tostring(desc[0], method='text', encoding=unicode)
|
||||
@ -558,6 +587,9 @@ class Worker(Thread): # Get details {{{
|
||||
s, i = match.group('series'), float(match.group('index'))
|
||||
if s:
|
||||
ans = (s, i)
|
||||
if ans[0]:
|
||||
ans = (re.sub(r'\s+Series$', '', ans[0]).strip(), ans[1])
|
||||
ans = (re.sub(r'\(.+?\s+Series\)$', '', ans[0]).strip(), ans[1])
|
||||
return ans
|
||||
|
||||
def parse_tags(self, root):
|
||||
@ -838,6 +870,10 @@ class Amazon(Source):
|
||||
(mi.is_null('language') and self.domain in {'com', 'uk'})
|
||||
)
|
||||
if mi.title and docase:
|
||||
# Remove series information from title
|
||||
m = re.search(r'\S+\s+(\(.+?\s+Book\s+\d+\))$', mi.title)
|
||||
if m is not None:
|
||||
mi.title = mi.title.replace(m.group(1), '').strip()
|
||||
mi.title = fixcase(mi.title)
|
||||
mi.authors = fixauthors(mi.authors)
|
||||
if mi.tags and docase:
|
||||
@ -1155,9 +1191,19 @@ class Amazon(Source):
|
||||
if __name__ == '__main__': # tests {{{
|
||||
# To run these tests use: calibre-debug src/calibre/ebooks/metadata/sources/amazon.py
|
||||
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
||||
isbn_test, title_test, authors_test, comments_test)
|
||||
isbn_test, title_test, authors_test, comments_test, series_test)
|
||||
com_tests = [ # {{{
|
||||
|
||||
( # Paperback with series
|
||||
{'identifiers':{'amazon':'1423146786'}},
|
||||
[title_test('The Heroes of Olympus, Book Five The Blood of Olympus', exact=True), series_test('Heroes of Olympus', 5)]
|
||||
),
|
||||
|
||||
( # Kindle edition with series
|
||||
{'identifiers':{'amazon':'B0085UEQDO'}},
|
||||
[title_test('Three Parts Dead', exact=True), series_test('Craft Sequence', 1)]
|
||||
),
|
||||
|
||||
( # A kindle edition that does not appear in the search results when searching by ASIN
|
||||
{'identifiers':{'amazon':'B004JHY6OG'}},
|
||||
[title_test('The Heroes: A First Law Novel', exact=True)]
|
||||
|
Loading…
x
Reference in New Issue
Block a user