Update Reason Magazine

Kovid Goyal 2022-04-08 07:53:29 +05:30
parent 2e2fcaaf28
commit 71f6d8b162
GPG Key ID: 06BC317B515ACE7C


@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 # License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
+from __future__ import unicode_literals
 import json
 from calibre import prepare_string_for_xml
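
The line added here, from __future__ import unicode_literals, only matters when the recipe is run under Python 2, where it turns every bare string literal in the file into unicode instead of bytes; under Python 3 it is a no-op. A minimal illustration (hypothetical snippet, not part of the recipe):

    from __future__ import unicode_literals

    # On Python 2 this prints <type 'unicode'>; without the future import it
    # would print <type 'str'> (a byte string). On Python 3 it always prints
    # <class 'str'>.
    print(type('Reason'))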
@@ -64,7 +65,7 @@ def extract_html(soup):
 class Reason(BasicNewsRecipe):
-    title = 'Reason Magazine'
+    title = 'Reason'
     description = 'Free minds and free markets'
     INDEX = 'https://reason.com/magazine/'
     __author__ = 'Howard Cornett'
@@ -74,8 +75,8 @@ class Reason(BasicNewsRecipe):
     remove_tags = [
         classes(
-            'next-post-link the-tags tag rcom-social tools comments-header-show logo-header'
-            ' navbar navbar-expanded-lg primary content-info sidebar magicSidebar advertisement logo entry-subtitle'
+            'next-post-link the-tags tag rcom-social-tools most-read-container comments-header-show'
+            ' logo-header navbar navbar-expanded-lg primary content-info sidebar magicSidebar advertisement logo entry-subtitle'
         ),
     ]
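
Both the old and the new class strings are passed to calibre's classes() helper, which turns a space-separated list of CSS class names into a findAll()-style attrs matcher that fires when a tag shares at least one of the listed classes. A rough sketch of that behaviour, assuming the helper works as in calibre.web.feeds.news (simplified, for illustration only):

    def classes(class_names):
        # Match any tag whose class attribute shares at least one name
        # with the given space-separated list.
        q = frozenset(class_names.split(' '))
        return dict(attrs={
            'class': lambda x: x and frozenset(x.split()).intersection(q)})

Because membership is tested per class name, rewriting 'rcom-social tools' as 'rcom-social-tools' and adding 'most-read-container' changes exactly which page elements get stripped.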
@@ -128,42 +129,57 @@
         self.cover_url = cover['src']
         current_section, current_articles = 'Cover Story', []
         feeds = []
-        for div in soup.findAll('div', **classes('issue-header-right toc-category-list')):
-            for h3 in div.findAll('h3', **classes('toc-department')):
-                if current_articles:
-                    feeds.append((current_section, current_articles))
-                current_articles = []
-                current_section = self.tag_to_string(h3)
-                self.log('\nFound section:', current_section)
-                title = h3.find_next_sibling().a.text
-                url = h3.find_next_sibling().a['href']
-                desc = h3.find_next_sibling().p.text
-                current_articles.append({
-                    'title': title,
-                    'url': url,
-                    'description': desc
-                })
-            for h2 in div.findAll('h2', **classes('toc-department')):
-                if current_articles:
-                    feeds.append((current_section, current_articles))
-                current_articles = []
-                current_section = self.tag_to_string(h2)
-                self.log('\nFound section:', current_section)
-                for article in div.findAll('article', attrs={'class': True}):
-                    h4 = article.find('h4')
-                    if h4.a is not None:
-                        title = h4.a.text
-                        url = h4.a['href']
-                    else:
-                        title = ''
-                        url = ''
-                    desc = h4.find_next_sibling().text
-                    current_articles.append({
-                        'title': title,
-                        'url': url,
-                        'description': desc
-                    })
+        for div in soup.findAll('div', attrs={'class': lambda x: x and set(x.split()).intersection({'issue-header-right', 'toc-category-list'})}):
+            for h3 in div.findAll('h3', attrs={'class': True}):
+                cls = h3['class']
+                if hasattr(cls, 'split'):
+                    cls = cls.split()
+                if 'toc-department' in cls:
+                    if current_articles:
+                        feeds.append((current_section, current_articles))
+                    current_articles = []
+                    current_section = self.tag_to_string(h3)
+                    self.log('\nFound section:', current_section)
+                    title = h3.find_next_sibling().a.text
+                    url = h3.find_next_sibling().a['href']
+                    desc = h3.find_next_sibling().p.text
+                    current_articles.append({
+                        'title': title,
+                        'url': url,
+                        'description': desc
+                    })
+            for h2 in div.findAll('h2', attrs={'class': True}):
+                cls = h2['class']
+                if hasattr(cls, 'split'):
+                    cls = cls.split()
+                if 'toc-department' in cls:
+                    if current_articles:
+                        feeds.append((current_section, current_articles))
+                    current_articles = []
+                    current_section = self.tag_to_string(h2)
+                    self.log('\nFound section:', current_section)
+                    for article in div.findAll('article', attrs={'class': True}):
+                        h4 = article.find('h4')
+                        if h4.a is not None:
+                            title = h4.a.text
+                            url = h4.a['href']
+                        else:
+                            title = ''
+                            url = ''
+                        desc = h4.find_next_sibling().text
+                        current_articles.append({
+                            'title': title,
+                            'url': url,
+                            'description': desc
+                        })
         if current_articles:
             feeds.append((current_section, current_articles))
         return feeds
+
+
+if __name__ == '__main__':
+    import sys
+    from calibre.ebooks.BeautifulSoup import BeautifulSoup
+    print(extract_html(BeautifulSoup(open(sys.argv[-1]).read())))
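
The reworked parse_index() loop above no longer relies on **classes(...) to pick out the department headers; it fetches each tag's class attribute directly and normalises it before testing for 'toc-department'. The hasattr(cls, 'split') check is what makes that normalisation parser-agnostic: some BeautifulSoup variants return class as a single space-separated string, others as a list of names. A small standalone sketch of the same idea, using plain bs4 purely for illustration (the recipe itself uses calibre's bundled BeautifulSoup):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<h3 class="toc-department featured">Culture</h3>', 'html.parser')
    cls = soup.h3['class']          # bs4 gives a list: ['toc-department', 'featured']
    if hasattr(cls, 'split'):       # a plain string has .split(); a list does not
        cls = cls.split()
    print('toc-department' in cls)  # True for both representations

The new if __name__ == '__main__': block at the end is a small test harness: it runs extract_html() over a saved article HTML file given on the command line, so the cleanup logic can be checked without fetching a whole issue.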