mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Amazon plugin: Work around broken encoding. Detect and use the search results page served to mobile user agents
This commit is contained in:
parent
0d924d81ef
commit
6e98d78dd7
@ -218,6 +218,9 @@ class Worker(Thread): # Get details {{{
|
|||||||
' @class="emptyClear" or @href]'):
|
' @class="emptyClear" or @href]'):
|
||||||
c.getparent().remove(c)
|
c.getparent().remove(c)
|
||||||
desc = tostring(desc, method='html', encoding=unicode).strip()
|
desc = tostring(desc, method='html', encoding=unicode).strip()
|
||||||
|
# Encoding bug in Amazon data: U+FFFD (the replacement character)
|
||||||
|
# sometimes appears in place of an apostrophe (')
|
||||||
|
desc = desc.replace('\ufffd', "'")
|
||||||
# remove all attributes from tags
|
# remove all attributes from tags
|
||||||
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
|
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
|
||||||
# Collapse whitespace
|
# Collapse whitespace
|
||||||
@ -410,6 +413,18 @@ class Amazon(Source):
|
|||||||
if 'bulk pack' not in title:
|
if 'bulk pack' not in title:
|
||||||
matches.append(a.get('href'))
|
matches.append(a.get('href'))
|
||||||
break
|
break
|
||||||
|
if not matches:
|
||||||
|
# This can happen for some user agents that Amazon thinks are
|
||||||
|
# mobile/less capable
|
||||||
|
log('Trying alternate results page markup')
|
||||||
|
for td in root.xpath(
|
||||||
|
r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'):
|
||||||
|
for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'):
|
||||||
|
title = tostring(a, method='text', encoding=unicode).lower()
|
||||||
|
if 'bulk pack' not in title:
|
||||||
|
matches.append(a.get('href'))
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
# Keep only the top 5 matches as the matches are sorted by relevance by
|
# Keep only the top 5 matches as the matches are sorted by relevance by
|
||||||
# Amazon so lower matches are not likely to be very relevant
|
# Amazon so lower matches are not likely to be very relevant
|
||||||
|
Loading…
x
Reference in New Issue
Block a user