Update Granta

Kovid Goyal 2021-04-18 13:09:21 +05:30
parent a57ea59adb
commit a4b6b79829


@@ -17,6 +17,12 @@ force_issue_download = None
 # Adapted from https://gist.github.com/FlyingTopHat/7cfdd5434ec704916174
 
 
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+
+
 def plus_with_unknown_component(first_comp, second_comp, result):
     if result is None:
         return first_comp + second_comp
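
Note (not part of the commit): the classes() helper added here appears in many calibre recipes; it builds a BeautifulSoup attribute filter that accepts any tag whose class attribute shares at least one name with the space-separated list. A minimal sketch of its use, assuming BeautifulSoup 4 and the helper above in scope:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div class="article-header wide">x</div>', 'html.parser')
    # Matches: the div carries 'article-header', one of the requested classes.
    print(soup.find(**classes('article-header article-content')))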
@@ -152,26 +158,6 @@ def text2num(s):
 ##################################################################
 
 
-##################################################################
-# Utilities
-def absurl(url):
-    if url.startswith('/'):
-        url = 'https://www.granta.com' + url
-    return url
-
-
-def stripstyle(tag):
-    if tag is not None:
-        del tag['style']
-
-
-def get_innermost_string(tag):
-    while hasattr(tag, 'contents') and len(tag.contents) > 0 and tag.contents[0] is not None:
-        tag = tag.contents[0]
-    return str(tag).strip()
-
-
-##################################################################
 class Granta(BasicNewsRecipe):
 
     title = u'Granta'
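
Note (not part of the commit): the removed helpers are superseded elsewhere in this change: stripstyle() by the new remove_attributes = ['style'] declaration, get_innermost_string() by calibre's built-in self.tag_to_string(), and absurl() becomes unnecessary because the rewritten parse_index() uses the hrefs as served (presumably already absolute on the redesigned site).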
@@ -180,17 +166,17 @@ class Granta(BasicNewsRecipe):
     __author__ = 'Gary Arnold'
-    needs_subscription = True
+    needs_subscription = 'optional'
 
     keep_only_tags = [
-        dict(name='div', attrs={'class': 'article-feature-image-container'}),
-        dict(name='div', attrs={'class': 'col-xs-12 col-sm-12 col-md-12 article-header'}),
-        dict(name='div', attrs={'class': 'carousel-inner'}),
-        dict(name='div', attrs={'class': 'article-content'}),
+        classes(
+            'article-header article-content article-feature-image-standard-container article-feature-image-full-width-container'
+        ),
     ]
 
-    preprocess_regexps = [(re.compile(r'<head>.*?</head>', re.DOTALL), lambda
-        m:'<head></head>')]
+    remove_tags = [
+        classes('social-share-container'),
+    ]
+    remove_attributes = ['style']
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
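
Note (not part of the commit): needs_subscription = 'optional' tells calibre that login credentials are optional rather than required, so the recipe still runs for readers without a Granta account. Each classes(...) entry expands to an attribute filter; classes('social-share-container'), for instance, behaves roughly like:

    dict(attrs={'class': lambda x: x and 'social-share-container' in x.split()})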
@@ -219,26 +205,10 @@
         return br
 
     def preprocess_html(self, soup):
-        articleHeader = soup.find(
-            'div', attrs={'class': 'article-feature-image-container'})
-        if articleHeader is None:
-            articleHeader = soup.find(
-                'div', attrs={'class': lambda x: x and 'article-header' in x.split()})
-        if articleHeader is not None:
-            image = articleHeader.find(
-                'div', attrs={'class': 'article-feature-image'})
-            if image is not None and image.attrs is not None:
-                style = dict(image.attrs)['style']
-                if style is not None:
-                    m = re.search(r'url\(([^\)]*)\)', style)
-                    if m.group(1) is not None:
-                        stripstyle(image)
-                        image.name = 'img'
-                        image['src'] = m.group(1)
-            stripstyle(articleHeader.find('h1'))
-            stripstyle(articleHeader.find('h2'))
+        for div in soup.findAll(attrs={'data-background': True}):
+            img = soup.new_tag('img')
+            img['src'] = div['data-background']
+            div.append(img)
         return soup
 
     def parse_index(self):
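
Note (not part of the commit): the new preprocess_html handles the site's lazy-loaded imagery: the image URL sits in a data-background attribute that the page's JavaScript would apply as a CSS background, and since the e-book converter never runs JavaScript, the recipe appends a real img tag pointing at that URL. A standalone sketch of the transformation, assuming BeautifulSoup 4 and a made-up URL:

    from bs4 import BeautifulSoup

    html = '<div class="hero" data-background="https://example.com/cover.jpg"></div>'
    soup = BeautifulSoup(html, 'html.parser')
    for div in soup.findAll(attrs={'data-background': True}):
        img = soup.new_tag('img')            # create a real <img> element
        img['src'] = div['data-background']  # point it at the stored URL
        div.append(img)                      # so the converter can download it
    print(soup)
    # <div class="hero" data-background="https://example.com/cover.jpg">
    #   <img src="https://example.com/cover.jpg"/></div>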
@@ -246,51 +216,38 @@
             soup = self.index_to_soup('https://granta.com/')
 
             # Get latest issue
-            issueInfo = soup.find(
-                'div', attrs={'class': lambda x: x and 'dnd_container__heading' in x.split()})
-            issueAnchor = issueInfo.find('a')
-            issueTitle = issueAnchor.contents[0]
+            issueInfo = soup.find(**classes('featured_product__image'))
+            issueAnchor = issueInfo.findParent('a', href=True)
             issueLink = issueAnchor.get('href')
         else:
             issueLink = force_issue_download
-            issueTitle = ''
 
+        self.log('Fetching issue:', issueLink)
         soup = self.index_to_soup(issueLink)
+        # open('/t/raw.html', 'w').write(str(soup))
 
         # Find cover
-        cover = soup.find('div', attrs={'class': 'product-img-container'})
+        cover = soup.find(**classes('single-issue__cover-image'))
         if cover is not None:
-            img = cover.find('img', src=True)
-            self.cover_url = absurl(img['src'])
+            self.cover_url = cover['data-background']
             self.log.info('Found cover at:', self.cover_url)
 
-        # Find TOC
-        tocs = soup.findAll('div', attrs={'class': 'product-article'})
-        articles = []
-        for toc in tocs:
-            if (self.username and self.password) or (toc.find('img') is None):
-                # Either user is logged in or the article is unlocked
-                h1 = toc.find('h1')
-                h2 = toc.find('h2')
-                if h1.find('a') is not None and len(h1.find('a').contents) > 0 and h1.find('a').contents[0] is not None:
-                    title = get_innermost_string(h1.find('a').contents[0])
-                elif len(h1.contents) > 0 and h1.contents[0] is not None:
-                    title = get_innermost_string(h1.contents[0])
-                else:
-                    title = ''
-                if h2.find('a') is not None and len(h2.find('a').contents) > 0 and h2.find('a').contents[0] is not None:
-                    author = get_innermost_string(h2.find('a').contents[0])
-                    title = title + u' (%s)' % author
-                elif len(h2.contents) > 0 and h2.contents[0] is not None:
-                    author = get_innermost_string(h2.contents[0])
-                    title = title + u' (%s)' % author
-                else:
-                    author = ''
-                url = absurl(h1.find('a', href=True)['href'])
-                self.log.info('Found article:', title)
-                self.log.info('\t', url)
-                articles.append({'title': title, 'url': url,
-                                 'date': '', 'description': ''})
-
-        return [(issueTitle, articles)]
+        sections = {}
+        for item in soup.findAll(**classes('single-contributor_related-row_container')):
+            h6 = item.find('h6')
+            section = self.tag_to_string(h6.find('a')).strip()
+            sections.setdefault(section, [])
+            h1 = item.find('h1')
+            title = self.tag_to_string(h1).strip()
+            url = h1.findParent('a')['href']
+            author = self.tag_to_string(item.findAll('h3')[-1]).strip()
+            desc = ''
+            for p in item.findAll('p'):
+                desc += self.tag_to_string(p)
+            sections[section].append({
+                'title': title, 'url': url, 'description': 'by ' + author + '. ' + desc})
+            self.log.info('Found article:', title)
+            self.log.info('\t', url)
+        return [(sec, sections[sec]) for sec in sections]