From df0e052c08275d8c5cd6229613223d4ae8f3de2b Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 21 Mar 2019 20:33:52 +0530
Subject: [PATCH] Update 1843

---
 recipes/1843.recipe | 64 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/recipes/1843.recipe b/recipes/1843.recipe
index bbff61247c..88dedbc825 100644
--- a/recipes/1843.recipe
+++ b/recipes/1843.recipe
@@ -21,11 +21,69 @@ class E1843(BasicNewsRecipe):
     remove_javascript = True
     oldest_article = 365
     encoding = 'utf-8'
-    feeds = [
-        'https://www.1843magazine.com/rss/content',
-    ]
+
+    # feeds = [
+    #     'https://www.1843magazine.com/rss/content',
+    # ]
 
     keep_only_tags = [
         dict(name='h1', attrs={'class': lambda x: x and 'title' in x.split()}),
         classes('field-name-field-rubric-summary article-header__overlay-main-image meta-info__author article__body'),
     ]
+
+    def parse_index(self):
+        '''
+        Build the article index from the current print edition page.
+
+        Follows the "Print edition" link on the home page, sets the date
+        suffix from the cover heading and the cover from the cover image,
+        then groups article teasers under the section header before them.
+
+        :return: list of (section title, articles) tuples where articles
+            is a list of dicts with title, url and description keys.
+        :raises ValueError: if the print edition link cannot be found.
+        '''
+        soup = self.index_to_soup('https://www.1843magazine.com')
+        link = soup.find(text='Print edition')
+        edition = link.parent if link is not None else None
+        if edition is None or not edition.get('href'):
+            raise ValueError('Could not find the Print edition link')
+        soup = self.index_to_soup(edition['href'])
+        h1 = soup.find(**classes('cover-image__main'))
+        self.timefmt = ' [%s]' % self.tag_to_string(h1)
+        # The cover is optional: a missing image must not abort the download
+        cover = soup.find(**classes('cover-image__image'))
+        img = cover.find('img') if cover is not None else None
+        if img is not None and img.get('src'):
+            self.cover_url = img['src']
+
+        ans = []
+        # Teasers seen before any section header go into a generic section
+        current_section, articles = 'Articles', []
+
+        for div in soup.findAll(**classes('field-name-field-header node-article')):
+            # class is a string or a list depending on the BeautifulSoup version
+            css = div.get('class') or ''
+            if isinstance(css, (list, tuple)):
+                css = ' '.join(css)
+            if 'field-header' in css:
+                if current_section and articles:
+                    ans.append((current_section, articles))
+                current_section = self.tag_to_string(div)
+                self.log(current_section)
+                articles = []
+                continue
+            a = div.find('a', href=True)
+            if a is None:
+                continue  # teaser without a link: nothing to download
+            title = self.tag_to_string(a)
+            url = a['href']
+            self.log('\t', title, ' at ', url)
+            r = div.find(**classes('article-rubric'))
+            desc = self.tag_to_string(r) if r is not None else ''
+            articles.append(
+                {'title': title, 'url': url, 'description': desc})
+
+        if current_section and articles:
+            ans.append((current_section, articles))
+        return ans