From 6e9bb955d007718e3b010398f15fd48591147549 Mon Sep 17 00:00:00 2001 From: os-strus <56672814+os-strus@users.noreply.github.com> Date: Wed, 3 Jan 2024 12:45:30 +0100 Subject: [PATCH] Update amazon.py 1.- Amazon changed the way it manages book images. I fixed this on lines 880-888 (original file) - 906-920 (new file). 2.- Amazon also changed the
name for the book comments. I fixed this on lines 724 and 729 (original file) - 741 and 748 (new file). All other changes in the new file are logging for debug info, which can be deleted --- src/calibre/ebooks/metadata/sources/amazon.py | 46 +++++++++++++++++-- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index 8d34ee3d88..88d7c7e7f0 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -426,6 +426,7 @@ class Worker(Thread): # Get details {{{ def parse_details(self, raw, root): asin = parse_asin(root, self.log, self.url) + self.log('ORIO asin:', asin) if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'): raise CaptchaError( 'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.') @@ -442,12 +443,14 @@ class Worker(Thread): # Get details {{{ except: self.log.exception('Error parsing title for url: %r' % self.url) title = None + self.log('ORIO title:', title) try: authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r' % self.url) authors = [] + self.log('ORIO authors:', authors) if not title or not authors or not asin: self.log.error( @@ -465,11 +468,13 @@ class Worker(Thread): # Get details {{{ mi.rating = self.parse_rating(root) except: self.log.exception('Error parsing ratings for url: %r' % self.url) + self.log('ORIO rating:', mi.rating) try: mi.comments = self.parse_comments(root, raw) except: self.log.exception('Error parsing comments for url: %r' % self.url) + self.log('ORIO comments:', mi.comments) try: series, series_index = self.parse_series(root) @@ -479,11 +484,13 @@ class Worker(Thread): # Get details {{{ mi.series, mi.series_index = 'Dummy series for testing', 1 except: self.log.exception('Error parsing series for url: %r' % self.url) + self.log('ORIO series, 
series_index:', series, series_index) try: mi.tags = self.parse_tags(root) except: self.log.exception('Error parsing tags for url: %r' % self.url) + self.log('ORIO tags:', mi.tags) try: self.cover_url = self.parse_cover(root, raw) @@ -492,14 +499,17 @@ class Worker(Thread): # Get details {{{ if self.cover_url_processor is not None and self.cover_url and self.cover_url.startswith('/'): self.cover_url = self.cover_url_processor(self.cover_url) mi.has_cover = bool(self.cover_url) + self.log('ORIO cover_url:', self.cover_url) detail_bullets = root.xpath('//*[@data-feature-name="detailBullets"]') non_hero = tuple(self.selector( 'div#bookDetails_container_div div#nonHeroSection')) or tuple(self.selector( '#productDetails_techSpec_sections')) if detail_bullets: + self.log('ORIO parse_detail_bullets') self.parse_detail_bullets(root, mi, detail_bullets[0]) elif non_hero: + self.log('ORIO parse_new_details') try: self.parse_new_details(root, mi, non_hero[0]) except: @@ -509,6 +519,7 @@ class Worker(Thread): # Get details {{{ else: pd = root.xpath(self.pd_xpath) if pd: + self.log('ORIO Dettagli prodotto') pd = pd[0] try: @@ -518,18 +529,21 @@ class Worker(Thread): # Get details {{{ except: self.log.exception( 'Error parsing ISBN for url: %r' % self.url) + self.log('ORIO isbn:', mi.isbn) try: mi.publisher = self.parse_publisher(pd) except: self.log.exception( 'Error parsing publisher for url: %r' % self.url) + self.log('ORIO publisher:', mi.publisher) try: mi.pubdate = self.parse_pubdate(pd) except: self.log.exception( 'Error parsing publish date for url: %r' % self.url) + self.log('ORIO pubdate:', mi.pubdate) try: lang = self.parse_language(pd) @@ -538,6 +552,7 @@ class Worker(Thread): # Get details {{{ except: self.log.exception( 'Error parsing language for url: %r' % self.url) + self.log('ORIO language:', mi.language) else: self.log.warning( @@ -721,16 +736,21 @@ class Worker(Thread): # Get details {{{ except ImportError: from urllib import unquote ans = '' - ovr = 
tuple(self.selector('#drengr_MobileTabbedDescriptionOverviewContent_feature_div')) + # ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionOverviewContent_feature_div')) + # ovr = tuple(self.selector('#drengr_DesktopTabbedDescriptionOverviewContent_feature_div')) + ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionOverviewContent_feature_div')) or tuple(self.selector('#drengr_DesktopTabbedDescriptionOverviewContent_feature_div')) if ovr: ovr = ovr[0] ovr.tag = 'div' ans = self._render_comments(ovr) - ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionEditorialsContent_feature_div')) + # ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionEditorialsContent_feature_div')) + # ovr = tuple(self.selector('#drengr_DesktopTabbedDescriptionEditorialsContent_feature_div')) + ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionEditorialsContent_feature_div')) or tuple(self.selector('#drengr_DesktopTabbedDescriptionEditorialsContent_feature_div')) if ovr: ovr = ovr[0] ovr.tag = 'div' ans += self._render_comments(ovr) + self.log('ORIO comments ovr:', ans) else: ns = tuple(self.selector('#bookDescription_feature_div noscript')) if ns: @@ -744,19 +764,23 @@ class Worker(Thread): # Get details {{{ else: ns.tag = 'div' ans = self._render_comments(ns) + self.log('ORIO comments not ovr - ns:', ans) else: desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]') if desc: ans = self._render_comments(desc[0]) + self.log('ORIO comments desc:', ans) else: ns = tuple(self.selector('#bookDescription_feature_div .a-expander-content')) if ns: ans = self._render_comments(ns[0]) + self.log('ORIO comments not ovr - else:', ans) desc = root.xpath( '//div[@id="productDescription"]/*[@class="content"]') if desc: ans += self._render_comments(desc[0]) + self.log('ORIO comments 2 desc:', ans) else: # Idiot chickens from amazon strike again. This data is now stored # in a JS variable inside a script tag URL encoded. 
@@ -769,6 +793,7 @@ class Worker(Thread): # Get details {{{ '//div[@id="productDescription"]/*[@class="content"]') if desc: ans += self._render_comments(desc[0]) + self.log('ORIO comments 2 m_desc:', ans) except Exception as e: self.log.warn( 'Parsing of obfuscated product description failed with error: %s' % as_unicode(e)) @@ -776,6 +801,7 @@ class Worker(Thread): # Get details {{{ desc = root.xpath('//div[@id="productDescription_fullView"]') if desc: ans += self._render_comments(desc[0]) + self.log('ORIO comments 2 else - desc:', ans) return ans @@ -877,14 +903,20 @@ class Worker(Thread): # Get details {{{ # Look for the image URL in javascript, using the first image in the # image gallery as the cover import json - imgpat = re.compile(r"""'imageGalleryData'\s*:\s*(\[\s*{.+])""") + # imgpat = re.compile(r"""'imageGalleryData'\s*:\s*(\[\s*{.+])""") + imgpat = re.compile(r'"hiRes":"(.+?)","thumb"') for script in root.xpath('//script'): + # self.log('ORIO parse_cover - script_text:', script.text) m = imgpat.search(script.text or '') if m is not None: + # self.log('ORIO parse_cover - script_text:', m.group(1)) + return m.group(1) + ''' try: return json.loads(m.group(1))[0]['mainUrl'] except Exception: - continue + continue + ''' def clean_img_src(src): parts = src.split('/') @@ -902,6 +934,7 @@ class Worker(Thread): # Get details {{{ src = m.group(1) url = clean_img_src(src) if url: + # self.log('ORIO parse_cover - script url:', url) return url imgs = root.xpath( @@ -926,6 +959,8 @@ class Worker(Thread): # Get details {{{ if width > mwidth: mwidth = width url = iurl + + # self.log('ORIO parse_cover - not img_url:', url) return url except Exception: pass @@ -942,6 +977,7 @@ class Worker(Thread): # Get details {{{ self.log('Found image: %s' % src) url = clean_img_src(src) if url: + # self.log('ORIO parse_cover - img_url:', url) return url def parse_detail_bullets(self, root, mi, container): @@ -962,6 +998,7 @@ class Worker(Thread): # Get details {{{ name = 
self.totext(c1, only_printable=True).strip().strip(':').strip() val = self.totext(c2).strip() val = val.replace('\u200e', '').replace('\u200f', '') + self.log('ORIO parse_detail_cells:', name, val) if not val: return if name in self.language_names: @@ -1459,6 +1496,7 @@ class Amazon(Source): matches = [] query, domain = self.create_query(log, title=title, authors=authors, identifiers=identifiers) + time.sleep(1) try: raw = br.open_novisit(query, timeout=timeout).read().strip() except Exception as e: