This commit is contained in:
Kovid Goyal 2024-01-03 19:31:09 +05:30
commit bd3115c1dd
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -426,6 +426,7 @@ class Worker(Thread): # Get details {{{
def parse_details(self, raw, root): def parse_details(self, raw, root):
asin = parse_asin(root, self.log, self.url) asin = parse_asin(root, self.log, self.url)
self.log('ORIO asin:', asin)
if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'): if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'):
raise CaptchaError( raise CaptchaError(
'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.') 'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
@ -442,12 +443,14 @@ class Worker(Thread): # Get details {{{
except: except:
self.log.exception('Error parsing title for url: %r' % self.url) self.log.exception('Error parsing title for url: %r' % self.url)
title = None title = None
self.log('ORIO title:', title)
try: try:
authors = self.parse_authors(root) authors = self.parse_authors(root)
except: except:
self.log.exception('Error parsing authors for url: %r' % self.url) self.log.exception('Error parsing authors for url: %r' % self.url)
authors = [] authors = []
self.log('ORIO authors:', authors)
if not title or not authors or not asin: if not title or not authors or not asin:
self.log.error( self.log.error(
@ -465,11 +468,13 @@ class Worker(Thread): # Get details {{{
mi.rating = self.parse_rating(root) mi.rating = self.parse_rating(root)
except: except:
self.log.exception('Error parsing ratings for url: %r' % self.url) self.log.exception('Error parsing ratings for url: %r' % self.url)
self.log('ORIO rating:', mi.rating)
try: try:
mi.comments = self.parse_comments(root, raw) mi.comments = self.parse_comments(root, raw)
except: except:
self.log.exception('Error parsing comments for url: %r' % self.url) self.log.exception('Error parsing comments for url: %r' % self.url)
self.log('ORIO comments:', mi.comments)
try: try:
series, series_index = self.parse_series(root) series, series_index = self.parse_series(root)
@ -479,11 +484,13 @@ class Worker(Thread): # Get details {{{
mi.series, mi.series_index = 'Dummy series for testing', 1 mi.series, mi.series_index = 'Dummy series for testing', 1
except: except:
self.log.exception('Error parsing series for url: %r' % self.url) self.log.exception('Error parsing series for url: %r' % self.url)
self.log('ORIO series, series_index:', series, series_index)
try: try:
mi.tags = self.parse_tags(root) mi.tags = self.parse_tags(root)
except: except:
self.log.exception('Error parsing tags for url: %r' % self.url) self.log.exception('Error parsing tags for url: %r' % self.url)
self.log('ORIO tags:', mi.tags)
try: try:
self.cover_url = self.parse_cover(root, raw) self.cover_url = self.parse_cover(root, raw)
@ -492,14 +499,17 @@ class Worker(Thread): # Get details {{{
if self.cover_url_processor is not None and self.cover_url and self.cover_url.startswith('/'): if self.cover_url_processor is not None and self.cover_url and self.cover_url.startswith('/'):
self.cover_url = self.cover_url_processor(self.cover_url) self.cover_url = self.cover_url_processor(self.cover_url)
mi.has_cover = bool(self.cover_url) mi.has_cover = bool(self.cover_url)
self.log('ORIO cover_url:', self.cover_url)
detail_bullets = root.xpath('//*[@data-feature-name="detailBullets"]') detail_bullets = root.xpath('//*[@data-feature-name="detailBullets"]')
non_hero = tuple(self.selector( non_hero = tuple(self.selector(
'div#bookDetails_container_div div#nonHeroSection')) or tuple(self.selector( 'div#bookDetails_container_div div#nonHeroSection')) or tuple(self.selector(
'#productDetails_techSpec_sections')) '#productDetails_techSpec_sections'))
if detail_bullets: if detail_bullets:
self.log('ORIO parse_detail_bullets')
self.parse_detail_bullets(root, mi, detail_bullets[0]) self.parse_detail_bullets(root, mi, detail_bullets[0])
elif non_hero: elif non_hero:
self.log('ORIO parse_new_details')
try: try:
self.parse_new_details(root, mi, non_hero[0]) self.parse_new_details(root, mi, non_hero[0])
except: except:
@ -509,6 +519,7 @@ class Worker(Thread): # Get details {{{
else: else:
pd = root.xpath(self.pd_xpath) pd = root.xpath(self.pd_xpath)
if pd: if pd:
self.log('ORIO Dettagli prodotto')
pd = pd[0] pd = pd[0]
try: try:
@ -518,18 +529,21 @@ class Worker(Thread): # Get details {{{
except: except:
self.log.exception( self.log.exception(
'Error parsing ISBN for url: %r' % self.url) 'Error parsing ISBN for url: %r' % self.url)
self.log('ORIO isbn:', mi.isbn)
try: try:
mi.publisher = self.parse_publisher(pd) mi.publisher = self.parse_publisher(pd)
except: except:
self.log.exception( self.log.exception(
'Error parsing publisher for url: %r' % self.url) 'Error parsing publisher for url: %r' % self.url)
self.log('ORIO publisher:', mi.publisher)
try: try:
mi.pubdate = self.parse_pubdate(pd) mi.pubdate = self.parse_pubdate(pd)
except: except:
self.log.exception( self.log.exception(
'Error parsing publish date for url: %r' % self.url) 'Error parsing publish date for url: %r' % self.url)
self.log('ORIO pubdate:', mi.pubdate)
try: try:
lang = self.parse_language(pd) lang = self.parse_language(pd)
@ -538,6 +552,7 @@ class Worker(Thread): # Get details {{{
except: except:
self.log.exception( self.log.exception(
'Error parsing language for url: %r' % self.url) 'Error parsing language for url: %r' % self.url)
self.log('ORIO language:', mi.language)
else: else:
self.log.warning( self.log.warning(
@ -721,16 +736,21 @@ class Worker(Thread): # Get details {{{
except ImportError: except ImportError:
from urllib import unquote from urllib import unquote
ans = '' ans = ''
ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionOverviewContent_feature_div')) # ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionOverviewContent_feature_div'))
# ovr = tuple(self.selector('#drengr_DesktopTabbedDescriptionOverviewContent_feature_div'))
ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionOverviewContent_feature_div')) or tuple(self.selector('#drengr_DesktopTabbedDescriptionOverviewContent_feature_div'))
if ovr: if ovr:
ovr = ovr[0] ovr = ovr[0]
ovr.tag = 'div' ovr.tag = 'div'
ans = self._render_comments(ovr) ans = self._render_comments(ovr)
ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionEditorialsContent_feature_div')) # ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionEditorialsContent_feature_div'))
# ovr = tuple(self.selector('#drengr_DesktopTabbedDescriptionEditorialsContent_feature_div'))
ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionEditorialsContent_feature_div')) or tuple(self.selector('#drengr_DesktopTabbedDescriptionEditorialsContent_feature_div'))
if ovr: if ovr:
ovr = ovr[0] ovr = ovr[0]
ovr.tag = 'div' ovr.tag = 'div'
ans += self._render_comments(ovr) ans += self._render_comments(ovr)
self.log('ORIO comments ovr:', ans)
else: else:
ns = tuple(self.selector('#bookDescription_feature_div noscript')) ns = tuple(self.selector('#bookDescription_feature_div noscript'))
if ns: if ns:
@ -744,19 +764,23 @@ class Worker(Thread): # Get details {{{
else: else:
ns.tag = 'div' ns.tag = 'div'
ans = self._render_comments(ns) ans = self._render_comments(ns)
self.log('ORIO comments not ovr - ns:', ans)
else: else:
desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]') desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
if desc: if desc:
ans = self._render_comments(desc[0]) ans = self._render_comments(desc[0])
self.log('ORIO comments desc:', ans)
else: else:
ns = tuple(self.selector('#bookDescription_feature_div .a-expander-content')) ns = tuple(self.selector('#bookDescription_feature_div .a-expander-content'))
if ns: if ns:
ans = self._render_comments(ns[0]) ans = self._render_comments(ns[0])
self.log('ORIO comments not ovr - else:', ans)
desc = root.xpath( desc = root.xpath(
'//div[@id="productDescription"]/*[@class="content"]') '//div[@id="productDescription"]/*[@class="content"]')
if desc: if desc:
ans += self._render_comments(desc[0]) ans += self._render_comments(desc[0])
self.log('ORIO comments 2 desc:', ans)
else: else:
# Idiot chickens from amazon strike again. This data is now stored # Idiot chickens from amazon strike again. This data is now stored
# in a JS variable inside a script tag URL encoded. # in a JS variable inside a script tag URL encoded.
@ -769,6 +793,7 @@ class Worker(Thread): # Get details {{{
'//div[@id="productDescription"]/*[@class="content"]') '//div[@id="productDescription"]/*[@class="content"]')
if desc: if desc:
ans += self._render_comments(desc[0]) ans += self._render_comments(desc[0])
self.log('ORIO comments 2 m_desc:', ans)
except Exception as e: except Exception as e:
self.log.warn( self.log.warn(
'Parsing of obfuscated product description failed with error: %s' % as_unicode(e)) 'Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
@ -776,6 +801,7 @@ class Worker(Thread): # Get details {{{
desc = root.xpath('//div[@id="productDescription_fullView"]') desc = root.xpath('//div[@id="productDescription_fullView"]')
if desc: if desc:
ans += self._render_comments(desc[0]) ans += self._render_comments(desc[0])
self.log('ORIO comments 2 else - desc:', ans)
return ans return ans
@ -877,14 +903,20 @@ class Worker(Thread): # Get details {{{
# Look for the image URL in javascript, using the first image in the # Look for the image URL in javascript, using the first image in the
# image gallery as the cover # image gallery as the cover
import json import json
imgpat = re.compile(r"""'imageGalleryData'\s*:\s*(\[\s*{.+])""") # imgpat = re.compile(r"""'imageGalleryData'\s*:\s*(\[\s*{.+])""")
imgpat = re.compile(r'"hiRes":"(.+?)","thumb"')
for script in root.xpath('//script'): for script in root.xpath('//script'):
# self.log('ORIO parse_cover - script_text:', script.text)
m = imgpat.search(script.text or '') m = imgpat.search(script.text or '')
if m is not None: if m is not None:
# self.log('ORIO parse_cover - script_text:', m.group(1))
return m.group(1)
'''
try: try:
return json.loads(m.group(1))[0]['mainUrl'] return json.loads(m.group(1))[0]['mainUrl']
except Exception: except Exception:
continue continue
'''
def clean_img_src(src): def clean_img_src(src):
parts = src.split('/') parts = src.split('/')
@ -902,6 +934,7 @@ class Worker(Thread): # Get details {{{
src = m.group(1) src = m.group(1)
url = clean_img_src(src) url = clean_img_src(src)
if url: if url:
# self.log('ORIO parse_cover - script url:', url)
return url return url
imgs = root.xpath( imgs = root.xpath(
@ -926,6 +959,8 @@ class Worker(Thread): # Get details {{{
if width > mwidth: if width > mwidth:
mwidth = width mwidth = width
url = iurl url = iurl
# self.log('ORIO parse_cover - not img_url:', url)
return url return url
except Exception: except Exception:
pass pass
@ -942,6 +977,7 @@ class Worker(Thread): # Get details {{{
self.log('Found image: %s' % src) self.log('Found image: %s' % src)
url = clean_img_src(src) url = clean_img_src(src)
if url: if url:
# self.log('ORIO parse_cover - img_url:', url)
return url return url
def parse_detail_bullets(self, root, mi, container): def parse_detail_bullets(self, root, mi, container):
@ -962,6 +998,7 @@ class Worker(Thread): # Get details {{{
name = self.totext(c1, only_printable=True).strip().strip(':').strip() name = self.totext(c1, only_printable=True).strip().strip(':').strip()
val = self.totext(c2).strip() val = self.totext(c2).strip()
val = val.replace('\u200e', '').replace('\u200f', '') val = val.replace('\u200e', '').replace('\u200f', '')
self.log('ORIO parse_detail_cells:', name, val)
if not val: if not val:
return return
if name in self.language_names: if name in self.language_names:
@ -1459,6 +1496,7 @@ class Amazon(Source):
matches = [] matches = []
query, domain = self.create_query(log, title=title, authors=authors, query, domain = self.create_query(log, title=title, authors=authors,
identifiers=identifiers) identifiers=identifiers)
time.sleep(1)
try: try:
raw = br.open_novisit(query, timeout=timeout).read().strip() raw = br.open_novisit(query, timeout=timeout).read().strip()
except Exception as e: except Exception as e: