mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
Amazon metadata download: Get series information from the amazon.com page, when available
This commit is contained in:
parent
84c97d3cfa
commit
cbfb38ffae
@ -549,15 +549,47 @@ class Worker(Thread): # Get details {{{
|
|||||||
|
|
||||||
def parse_series(self, root):
|
def parse_series(self, root):
|
||||||
ans = (None, None)
|
ans = (None, None)
|
||||||
desc = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
|
|
||||||
if desc:
|
# This is found on the paperback/hardback pages for books on amazon.com
|
||||||
raw = self.tostring(desc[0], method='text', encoding=unicode)
|
series = root.xpath('//div[@data-feature-name="seriesTitle"]')
|
||||||
raw = re.sub(r'\s+', ' ', raw)
|
if series:
|
||||||
match = self.series_pat.search(raw)
|
series = series[0]
|
||||||
if match is not None:
|
spans = series.xpath('./span')
|
||||||
s, i = match.group('series'), float(match.group('index'))
|
if spans:
|
||||||
if s:
|
raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip()
|
||||||
ans = (s, i)
|
m = re.search('\s+([0-9.]+)$', raw.strip())
|
||||||
|
if m is not None:
|
||||||
|
series_index = float(m.group(1))
|
||||||
|
s = series.xpath('./a[@id="series-page-link"]')
|
||||||
|
if s:
|
||||||
|
series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip()
|
||||||
|
if series:
|
||||||
|
ans = (series, series_index)
|
||||||
|
# This is found on Kindle edition pages on amazon.com
|
||||||
|
if ans == (None, None):
|
||||||
|
for span in root.xpath('//div[@id="aboutEbooksSection"]//li/span'):
|
||||||
|
text = (span.text or '').strip()
|
||||||
|
m = re.match('Book\s+([0-9.]+)', text)
|
||||||
|
if m is not None:
|
||||||
|
series_index = float(m.group(1))
|
||||||
|
a = span.xpath('./a[@href]')
|
||||||
|
if a:
|
||||||
|
series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip()
|
||||||
|
if series:
|
||||||
|
ans = (series, series_index)
|
||||||
|
if ans == (None, None):
|
||||||
|
desc = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
|
||||||
|
if desc:
|
||||||
|
raw = self.tostring(desc[0], method='text', encoding=unicode)
|
||||||
|
raw = re.sub(r'\s+', ' ', raw)
|
||||||
|
match = self.series_pat.search(raw)
|
||||||
|
if match is not None:
|
||||||
|
s, i = match.group('series'), float(match.group('index'))
|
||||||
|
if s:
|
||||||
|
ans = (s, i)
|
||||||
|
if ans[0]:
|
||||||
|
ans = (re.sub(r'\s+Series$', '', ans[0]).strip(), ans[1])
|
||||||
|
ans = (re.sub(r'\(.+?\s+Series\)$', '', ans[0]).strip(), ans[1])
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
def parse_tags(self, root):
|
def parse_tags(self, root):
|
||||||
@ -838,6 +870,10 @@ class Amazon(Source):
|
|||||||
(mi.is_null('language') and self.domain in {'com', 'uk'})
|
(mi.is_null('language') and self.domain in {'com', 'uk'})
|
||||||
)
|
)
|
||||||
if mi.title and docase:
|
if mi.title and docase:
|
||||||
|
# Remove series information from title
|
||||||
|
m = re.search(r'\S+\s+(\(.+?\s+Book\s+\d+\))$', mi.title)
|
||||||
|
if m is not None:
|
||||||
|
mi.title = mi.title.replace(m.group(1), '').strip()
|
||||||
mi.title = fixcase(mi.title)
|
mi.title = fixcase(mi.title)
|
||||||
mi.authors = fixauthors(mi.authors)
|
mi.authors = fixauthors(mi.authors)
|
||||||
if mi.tags and docase:
|
if mi.tags and docase:
|
||||||
@ -1155,9 +1191,19 @@ class Amazon(Source):
|
|||||||
if __name__ == '__main__': # tests {{{
|
if __name__ == '__main__': # tests {{{
|
||||||
# To run these tests, use: calibre-debug src/calibre/ebooks/metadata/sources/amazon.py
|
# To run these tests, use: calibre-debug src/calibre/ebooks/metadata/sources/amazon.py
|
||||||
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
||||||
isbn_test, title_test, authors_test, comments_test)
|
isbn_test, title_test, authors_test, comments_test, series_test)
|
||||||
com_tests = [ # {{{
|
com_tests = [ # {{{
|
||||||
|
|
||||||
|
( # Paperback with series
|
||||||
|
{'identifiers':{'amazon':'1423146786'}},
|
||||||
|
[title_test('The Heroes of Olympus, Book Five The Blood of Olympus', exact=True), series_test('Heroes of Olympus', 5)]
|
||||||
|
),
|
||||||
|
|
||||||
|
( # Kindle edition with series
|
||||||
|
{'identifiers':{'amazon':'B0085UEQDO'}},
|
||||||
|
[title_test('Three Parts Dead', exact=True), series_test('Craft Sequence', 1)]
|
||||||
|
),
|
||||||
|
|
||||||
( # A kindle edition that does not appear in the search results when searching by ASIN
|
( # A kindle edition that does not appear in the search results when searching by ASIN
|
||||||
{'identifiers':{'amazon':'B004JHY6OG'}},
|
{'identifiers':{'amazon':'B004JHY6OG'}},
|
||||||
[title_test('The Heroes: A First Law Novel', exact=True)]
|
[title_test('The Heroes: A First Law Novel', exact=True)]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user