mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
Amazon metadata download: Handle a change in the amazon website that prevented review metadata from being downloaded
This commit is contained in:
parent
592fbc1490
commit
e8266bea14
@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import socket, time, re
|
import socket, time, re
|
||||||
from threading import Thread
|
from threading import Thread
|
||||||
from Queue import Queue, Empty
|
from Queue import Queue, Empty
|
||||||
|
from urllib import unquote
|
||||||
|
|
||||||
|
|
||||||
from calibre import as_unicode, random_user_agent
|
from calibre import as_unicode, random_user_agent
|
||||||
@ -332,7 +333,7 @@ class Worker(Thread): # Get details {{{
|
|||||||
self.log.exception('Error parsing ratings for url: %r'%self.url)
|
self.log.exception('Error parsing ratings for url: %r'%self.url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mi.comments = self.parse_comments(root)
|
mi.comments = self.parse_comments(root, raw)
|
||||||
except:
|
except:
|
||||||
self.log.exception('Error parsing comments for url: %r'%self.url)
|
self.log.exception('Error parsing comments for url: %r'%self.url)
|
||||||
|
|
||||||
@ -502,7 +503,7 @@ class Worker(Thread): # Get details {{{
|
|||||||
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
|
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
|
||||||
return sanitize_comments_html(desc)
|
return sanitize_comments_html(desc)
|
||||||
|
|
||||||
def parse_comments(self, root):
|
def parse_comments(self, root, raw):
|
||||||
ans = ''
|
ans = ''
|
||||||
ns = tuple(self.selector('#bookDescription_feature_div noscript'))
|
ns = tuple(self.selector('#bookDescription_feature_div noscript'))
|
||||||
if ns:
|
if ns:
|
||||||
@ -522,6 +523,21 @@ class Worker(Thread): # Get details {{{
|
|||||||
desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
|
desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
|
||||||
if desc:
|
if desc:
|
||||||
ans += self._render_comments(desc[0])
|
ans += self._render_comments(desc[0])
|
||||||
|
else:
|
||||||
|
# Idiot chickens from amazon strike again. This data is now stored
|
||||||
|
# in a JS variable inside a script tag URL encoded.
|
||||||
|
m = re.search(b'var\s+iframeContent\s*=\s*"([^"]+)"', raw)
|
||||||
|
if m is not None:
|
||||||
|
try:
|
||||||
|
text = unquote(m.group(1)).decode('utf-8')
|
||||||
|
nr = html5lib.parse(text, treebuilder='lxml', namespaceHTMLElements=False)
|
||||||
|
desc = nr.xpath('//div[@id="productDescription"]/*[@class="content"]')
|
||||||
|
if desc:
|
||||||
|
ans += self._render_comments(desc[0])
|
||||||
|
except Exception:
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
def parse_series(self, root):
|
def parse_series(self, root):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user