Amazon metadata download: Handle a change in the Amazon website that prevented review metadata from being downloaded

This commit is contained in:
Kovid Goyal 2016-01-09 18:01:24 +05:30
parent 592fbc1490
commit e8266bea14

View File

@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
import socket, time, re import socket, time, re
from threading import Thread from threading import Thread
from Queue import Queue, Empty from Queue import Queue, Empty
from urllib import unquote
from calibre import as_unicode, random_user_agent from calibre import as_unicode, random_user_agent
@ -332,7 +333,7 @@ class Worker(Thread): # Get details {{{
self.log.exception('Error parsing ratings for url: %r'%self.url) self.log.exception('Error parsing ratings for url: %r'%self.url)
try: try:
mi.comments = self.parse_comments(root) mi.comments = self.parse_comments(root, raw)
except: except:
self.log.exception('Error parsing comments for url: %r'%self.url) self.log.exception('Error parsing comments for url: %r'%self.url)
@ -502,7 +503,7 @@ class Worker(Thread): # Get details {{{
desc = re.sub(r'(?s)<!--.*?-->', '', desc) desc = re.sub(r'(?s)<!--.*?-->', '', desc)
return sanitize_comments_html(desc) return sanitize_comments_html(desc)
def parse_comments(self, root): def parse_comments(self, root, raw):
ans = '' ans = ''
ns = tuple(self.selector('#bookDescription_feature_div noscript')) ns = tuple(self.selector('#bookDescription_feature_div noscript'))
if ns: if ns:
@ -522,6 +523,21 @@ class Worker(Thread): # Get details {{{
desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]') desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
if desc: if desc:
ans += self._render_comments(desc[0]) ans += self._render_comments(desc[0])
else:
# Idiot chickens from Amazon strike again. This data is now stored,
# URL-encoded, in a JS variable inside a script tag.
m = re.search(b'var\s+iframeContent\s*=\s*"([^"]+)"', raw)
if m is not None:
try:
text = unquote(m.group(1)).decode('utf-8')
nr = html5lib.parse(text, treebuilder='lxml', namespaceHTMLElements=False)
desc = nr.xpath('//div[@id="productDescription"]/*[@class="content"]')
if desc:
ans += self._render_comments(desc[0])
except Exception:
import traceback
traceback.print_exc()
return ans return ans
def parse_series(self, root): def parse_series(self, root):