Update Psychology Today, The SMithsonian and The New Republic

This commit is contained in:
Kovid Goyal 2012-07-27 01:26:49 +05:30
parent d13e49b401
commit 9d90cfd756
3 changed files with 189 additions and 129 deletions

View File

@ -1,44 +1,79 @@
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1275708473(BasicNewsRecipe):
title = u'Psychology Today'
_author__ = 'rty'
publisher = u'www.psychologytoday.com'
category = u'Psychology'
max_articles_per_feed = 100
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
class PsychologyToday(BasicNewsRecipe):
title = 'Psychology Today'
__author__ = 'Rick Shang'
description = 'This magazine takes information from the latest research in the field of psychology and makes it useful to people in their everyday lives. Its coverage encompasses self-improvement, relationships, the mind-body connection, health, family, the workplace and culture.'
language = 'en'
temp_files = []
articles_are_obfuscated = True
remove_tags = [
dict(name='div', attrs={'class':['print-source_url','field-items','print-footer']}),
dict(name='span', attrs={'class':'print-footnote'}),
]
remove_tags_before = dict(name='h1', attrs={'class':'print-title'})
remove_tags_after = dict(name='div', attrs={'class':['field-items','print-footer']})
category = 'news'
encoding = 'UTF-8'
keep_only_tags = [dict(attrs={'class':['print-title', 'print-submitted', 'print-content', 'print-footer', 'print-source_url', 'print-links']})]
no_javascript = True
no_stylesheets = True
feeds = [(u'Contents', u'http://www.psychologytoday.com/articles/index.rss')]
def get_article_url(self, article):
return article.get('link', None)
def parse_index(self):
articles = []
soup = self.index_to_soup('http://www.psychologytoday.com/magazine')
#Go to the main body
div = soup.find('div',attrs={'id':'content-content'})
#Find cover & date
cover_item = div.find('div', attrs={'class':'collections-header-image'})
cover = cover_item.find('img',src=True)
self.cover_url = cover['src']
date = self.tag_to_string(cover['title'])
self.timefmt = u' [%s]'%date
articles = []
for post in div.findAll('div', attrs={'class':'collections-node-feature-info'}):
title = self.tag_to_string(post.find('h2'))
author_item=post.find('div', attrs={'class':'collection-node-byline'})
author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip())
title = title + u' (%s)'%author
article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href'])
print_page=article_page.find('li', attrs={'class':'print_html first'})
url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
self.log('Found article:', title)
self.log('\t', url)
self.log('\t', desc)
articles.append({'title':title, 'url':url, 'date':'','description':desc})
for post in div.findAll('div', attrs={'class':'collections-node-thumbnail-info'}):
title = self.tag_to_string(post.find('h2'))
author_item=post.find('div', attrs={'class':'collection-node-byline'})
article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href'])
print_page=article_page.find('li', attrs={'class':'print_html first'})
description = post.find('div', attrs={'class':'collection-node-description'})
author = re.sub(r'.*by\s',"",self.tag_to_string(description.nextSibling).strip())
desc = self.tag_to_string(description).strip()
url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
title = title + u' (%s)'%author
self.log('Found article:', title)
self.log('\t', url)
self.log('\t', desc)
articles.append({'title':title, 'url':url, 'date':'','description':desc})
for post in div.findAll('li', attrs={'class':['collection-item-list-odd','collection-item-list-even']}):
title = self.tag_to_string(post.find('h2'))
author_item=post.find('div', attrs={'class':'collection-node-byline'})
author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip())
title = title + u' (%s)'%author
article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href'])
print_page=article_page.find('li', attrs={'class':'print_html first'})
url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
self.log('Found article:', title)
self.log('\t', url)
self.log('\t', desc)
articles.append({'title':title, 'url':url, 'date':'','description':desc})
return [('Current Issue', articles)]
def get_obfuscated_article(self, url):
br = self.get_browser()
br.open(url)
response = br.follow_link(url_regex = r'/print/[0-9]+', nr = 0)
html = response.read()
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name
def get_cover_url(self):
index = 'http://www.psychologytoday.com/magazine/'
soup = self.index_to_soup(index)
for image in soup.findAll('img',{ "class" : "imagefield imagefield-field_magazine_cover" }):
return image['src'] + '.jpg'
return None

View File

@ -1,61 +1,67 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict
class SmithsonianMagazine(BasicNewsRecipe):
title = u'Smithsonian Magazine'
language = 'en'
__author__ = 'Krittika Goyal and TerminalVeracity'
oldest_article = 31#days
max_articles_per_feed = 50
use_embedded_content = False
recursions = 1
cover_url = 'http://sphotos.xx.fbcdn.net/hphotos-snc7/431147_10150602715983253_764313347_n.jpg'
match_regexps = ['&page=[2-9]$']
preprocess_regexps = [
(re.compile(r'for more of Smithsonian\'s coverage on history, science and nature.', re.DOTALL), lambda m: '')
]
extra_css = """
h1{font-size: large; margin: .2em 0}
h2{font-size: medium; margin: .2em 0}
h3{font-size: medium; margin: .2em 0}
#byLine{margin: .2em 0}
.articleImageCaptionwide{font-style: italic}
.wp-caption-text{font-style: italic}
img{display: block}
"""
class Smithsonian(BasicNewsRecipe):
title = 'Smithsonian Magazine'
__author__ = 'Rick Shang'
remove_stylesheets = True
remove_tags_after = dict(name='div', attrs={'class':['post','articlePaginationWrapper']})
remove_tags = [
dict(name='iframe'),
dict(name='div', attrs={'class':['article_sidebar_border','viewMorePhotos','addtoany_share_save_container','meta','social','OUTBRAIN','related-articles-inpage']}),
dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large', 'most-popular-body_large','comment_section','article-related']}),
dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
dict(name='h4', attrs={'id':'related-topics'}),
dict(name='table'),
dict(name='a', attrs={'href':['/subArticleBottomWeb','/subArticleTopWeb','/subArticleTopMag','/subArticleBottomMag']}),
dict(name='a', attrs={'name':'comments_shaded'}),
]
description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.'
language = 'en'
category = 'news'
encoding = 'UTF-8'
keep_only_tags = [dict(attrs={'id':['articleTitle', 'subHead', 'byLine', 'articleImage', 'article-text']})]
remove_tags = [dict(attrs={'class':['related-articles-inpage', 'viewMorePhotos']})]
no_javascript = True
no_stylesheets = True
def parse_index(self):
#Go to the issue
soup0 = self.index_to_soup('http://www.smithsonianmag.com/issue/archive/')
div = soup0.find('div',attrs={'id':'archives'})
issue = div.find('ul',attrs={'class':'clear-both'})
current_issue_url = issue.find('a', href=True)['href']
soup = self.index_to_soup(current_issue_url)
feeds = [
('History and Archeology',
'http://feeds.feedburner.com/smithsonianmag/history-archaeology'),
('People and Places',
'http://feeds.feedburner.com/smithsonianmag/people-places'),
('Science and Nature',
'http://feeds.feedburner.com/smithsonianmag/science-nature'),
('Arts and Culture',
'http://feeds.feedburner.com/smithsonianmag/arts-culture'),
('Travel',
'http://feeds.feedburner.com/smithsonianmag/travel'),
]
#Go to the main body
div = soup.find ('div', attrs={'id':'content-inset'})
#Find date
date = re.sub('.*\:\W*', "", self.tag_to_string(div.find('h2')).strip())
self.timefmt = u' [%s]'%date
#Find cover
self.cover_url = div.find('img',src=True)['src']
feeds = OrderedDict()
section_title = ''
subsection_title = ''
for post in div.findAll('div', attrs={'class':['plainModule', 'departments plainModule']}):
articles = []
prefix = ''
h3=post.find('h3')
if h3 is not None:
section_title = self.tag_to_string(h3)
else:
subsection=post.find('p',attrs={'class':'article-cat'})
link=post.find('a',href=True)
url=link['href']+'?c=y&story=fullstory'
if subsection is not None:
subsection_title = self.tag_to_string(subsection)
prefix = (subsection_title+': ')
description=self.tag_to_string(post('p', limit=2)[1]).strip()
else:
description=self.tag_to_string(post.find('p')).strip()
desc=re.sub('\sBy\s.*', '', description, re.DOTALL)
author=re.sub('.*By\s', '', description, re.DOTALL)
title=prefix + self.tag_to_string(link).strip()+ u' (%s)'%author
articles.append({'title':title, 'url':url, 'description':desc, 'date':''})
if articles:
if section_title not in feeds:
feeds[section_title] = []
feeds[section_title] += articles
ans = [(key, val) for key, val in feeds.iteritems()]
return ans
def preprocess_html(self, soup):
story = soup.find(name='div', attrs={'id':'article-body'})
soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
body = soup.find(name='body')
body.insert(0, story)
return soup

View File

@ -1,45 +1,64 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict
class The_New_Republic(BasicNewsRecipe):
title = 'The New Republic'
__author__ = 'cix3'
class TNR(BasicNewsRecipe):
title = 'The New Republic'
__author__ = 'Rick Shang'
description = 'The New Republic is a journal of opinion with an emphasis on politics and domestic and international affairs. It carries feature articles by staff and contributing editors. The second half of each issue is devoted to book and the arts, theater, motion pictures, music and art.'
language = 'en'
description = 'Intelligent, stimulating and rigorous examination of American politics, foreign policy and culture'
timefmt = ' [%b %d, %Y]'
oldest_article = 7
max_articles_per_feed = 100
category = 'news'
encoding = 'UTF-8'
remove_tags = [dict(attrs={'class':['print-logo','print-site_name','print-hr']})]
no_javascript = True
no_stylesheets = True
remove_tags = [
dict(name='div', attrs={'class':['print-logo', 'print-site_name', 'img-left', 'print-source_url']}),
dict(name='hr', attrs={'class':'print-hr'}), dict(name='img')
]
feeds = [
('Politics', 'http://www.tnr.com/rss/articles/Politics'),
('Books and Arts', 'http://www.tnr.com/rss/articles/Books-and-Arts'),
('Economy', 'http://www.tnr.com/rss/articles/Economy'),
('Environment and Energy', 'http://www.tnr.com/rss/articles/Environment-%2526-Energy'),
('Health Care', 'http://www.tnr.com/rss/articles/Health-Care'),
('Metro Policy', 'http://www.tnr.com/rss/articles/Metro-Policy'),
('World', 'http://www.tnr.com/rss/articles/World'),
('Film', 'http://www.tnr.com/rss/articles/Film'),
('Books', 'http://www.tnr.com/rss/articles/books'),
('The Book', 'http://www.tnr.com/rss/book'),
('Jonathan Chait', 'http://www.tnr.com/rss/blogs/Jonathan-Chait'),
('The Plank', 'http://www.tnr.com/rss/blogs/The-Plank'),
('The Treatment', 'http://www.tnr.com/rss/blogs/The-Treatment'),
('The Spine', 'http://www.tnr.com/rss/blogs/The-Spine'),
('The Vine', 'http://www.tnr.com/rss/blogs/The-Vine'),
('The Avenue', 'http://www.tnr.com/rss/blogs/The-Avenue'),
('William Galston', 'http://www.tnr.com/rss/blogs/William-Galston'),
('Simon Johnson', 'http://www.tnr.com/rss/blogs/Simon-Johnson'),
('Ed Kilgore', 'http://www.tnr.com/rss/blogs/Ed-Kilgore'),
('Damon Linker', 'http://www.tnr.com/rss/blogs/Damon-Linker'),
('John McWhorter', 'http://www.tnr.com/rss/blogs/John-McWhorter')
]
def parse_index(self):
def print_version(self, url):
return url.replace('http://www.tnr.com/', 'http://www.tnr.com/print/')
#Go to the issue
soup0 = self.index_to_soup('http://www.tnr.com/magazine-issues')
issue = soup0.find('div',attrs={'id':'current_issue'})
#Find date
date = self.tag_to_string(issue.find('div',attrs={'class':'date'})).strip()
self.timefmt = u' [%s]'%date
#Go to the main body
current_issue_url = 'http://www.tnr.com' + issue.find('a', href=True)['href']
soup = self.index_to_soup(current_issue_url)
div = soup.find ('div', attrs={'class':'article_detail_body'})
#Find cover
self.cover_url = div.find('img',src=True)['src']
feeds = OrderedDict()
section_title = ''
subsection_title = ''
for post in div.findAll('p'):
articles = []
em=post.find('em')
b=post.find('b')
a=post.find('a',href=True)
if em is not None:
section_title = self.tag_to_string(em).strip()
subsection_title = ''
elif b is not None:
subsection_title=self.tag_to_string(b).strip()
elif a is not None:
prefix = (subsection_title+': ') if subsection_title else ''
url=re.sub('www.tnr.com','www.tnr.com/print', a['href'])
author=re.sub('.*by\s', '', self.tag_to_string(post), re.DOTALL)
title=prefix + self.tag_to_string(a).strip()+ u' (%s)'%author
articles.append({'title':title, 'url':url, 'description':'', 'date':''})
if articles:
if section_title not in feeds:
feeds[section_title] = []
feeds[section_title] += articles
ans = [(key, val) for key, val in feeds.iteritems()]
return ans