mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Amazon metadata download plugin: Fix links being stripped from comments. Also fix ratings/ISBN not being parsed from Kindle edition pages. Fixes #782012 (amazon metadata errors)
This commit is contained in:
parent
a070127d1d
commit
1dd0706eae
@ -202,6 +202,8 @@ class Worker(Thread): # Get details {{{
|
|||||||
|
|
||||||
def parse_rating(self, root):
|
def parse_rating(self, root):
|
||||||
ratings = root.xpath('//div[@class="jumpBar"]/descendant::span[@class="asinReviewsSummary"]')
|
ratings = root.xpath('//div[@class="jumpBar"]/descendant::span[@class="asinReviewsSummary"]')
|
||||||
|
if not ratings:
|
||||||
|
ratings = root.xpath('//div[@class="buying"]/descendant::span[@class="asinReviewsSummary"]')
|
||||||
pat = re.compile(r'([0-9.]+) out of (\d+) stars')
|
pat = re.compile(r'([0-9.]+) out of (\d+) stars')
|
||||||
if ratings:
|
if ratings:
|
||||||
for elem in ratings[0].xpath('descendant::*[@title]'):
|
for elem in ratings[0].xpath('descendant::*[@title]'):
|
||||||
@ -215,9 +217,13 @@ class Worker(Thread): # Get details {{{
|
|||||||
if desc:
|
if desc:
|
||||||
desc = desc[0]
|
desc = desc[0]
|
||||||
for c in desc.xpath('descendant::*[@class="seeAll" or'
|
for c in desc.xpath('descendant::*[@class="seeAll" or'
|
||||||
' @class="emptyClear" or @href]'):
|
' @class="emptyClear"]'):
|
||||||
c.getparent().remove(c)
|
c.getparent().remove(c)
|
||||||
|
for a in desc.xpath('descendant::a[@href]'):
|
||||||
|
del a.attrib['href']
|
||||||
|
a.tag = 'span'
|
||||||
desc = tostring(desc, method='html', encoding=unicode).strip()
|
desc = tostring(desc, method='html', encoding=unicode).strip()
|
||||||
|
|
||||||
# Encoding bug in Amazon data U+fffd (replacement char)
|
# Encoding bug in Amazon data U+fffd (replacement char)
|
||||||
# in some examples it is present in place of '
|
# in some examples it is present in place of '
|
||||||
desc = desc.replace('\ufffd', "'")
|
desc = desc.replace('\ufffd', "'")
|
||||||
@ -246,8 +252,12 @@ class Worker(Thread): # Get details {{{
|
|||||||
return ('/'.join(parts[:-1]))+'/'+bn
|
return ('/'.join(parts[:-1]))+'/'+bn
|
||||||
|
|
||||||
def parse_isbn(self, pd):
|
def parse_isbn(self, pd):
|
||||||
for x in reversed(pd.xpath(
|
items = pd.xpath(
|
||||||
'descendant::*[starts-with(text(), "ISBN")]')):
|
'descendant::*[starts-with(text(), "ISBN")]')
|
||||||
|
if not items:
|
||||||
|
items = pd.xpath(
|
||||||
|
'descendant::b[contains(text(), "ISBN:")]')
|
||||||
|
for x in reversed(items):
|
||||||
if x.tail:
|
if x.tail:
|
||||||
ans = check_isbn(x.tail.strip())
|
ans = check_isbn(x.tail.strip())
|
||||||
if ans:
|
if ans:
|
||||||
@ -519,8 +529,17 @@ if __name__ == '__main__': # tests {{{
|
|||||||
test_identify_plugin(Amazon.name,
|
test_identify_plugin(Amazon.name,
|
||||||
[
|
[
|
||||||
|
|
||||||
( # An e-book ISBN not on Amazon, one of the authors is
|
( # Description has links
|
||||||
# unknown to Amazon, so no popup wrapper
|
{'identifiers':{'isbn': '9780671578275'}},
|
||||||
|
[title_test('A Civil Campaign: A Comedy of Biology and Manners',
|
||||||
|
exact=True), authors_test(['Lois McMaster Bujold'])
|
||||||
|
]
|
||||||
|
|
||||||
|
),
|
||||||
|
|
||||||
|
( # An e-book ISBN not on Amazon, the title/author search matches
|
||||||
|
# the Kindle edition, which has different markup for ratings and
|
||||||
|
# isbn
|
||||||
{'identifiers':{'isbn': '9780307459671'},
|
{'identifiers':{'isbn': '9780307459671'},
|
||||||
'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
|
'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
|
||||||
[title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us',
|
[title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user