Update Psychology Today, The Smithsonian and The New Republic

Kovid Goyal 2012-07-27 01:26:49 +05:30
parent d13e49b401
commit 9d90cfd756
3 changed files with 189 additions and 129 deletions


@@ -1,44 +1,79 @@
import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class PsychologyToday(BasicNewsRecipe):

    title = 'Psychology Today'
    __author__ = 'Rick Shang'

    description = 'This magazine takes information from the latest research in the field of psychology and makes it useful to people in their everyday lives. Its coverage encompasses self-improvement, relationships, the mind-body connection, health, family, the workplace and culture.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [dict(attrs={'class':['print-title', 'print-submitted', 'print-content', 'print-footer', 'print-source_url', 'print-links']})]
    no_javascript = True
    no_stylesheets = True

    def parse_index(self):
        articles = []
        soup = self.index_to_soup('http://www.psychologytoday.com/magazine')

        # Go to the main body
        div = soup.find('div', attrs={'id':'content-content'})

        # Find cover & date
        cover_item = div.find('div', attrs={'class':'collections-header-image'})
        cover = cover_item.find('img', src=True)
        self.cover_url = cover['src']
        date = self.tag_to_string(cover['title'])
        self.timefmt = u' [%s]'%date

        articles = []
        for post in div.findAll('div', attrs={'class':'collections-node-feature-info'}):
            title = self.tag_to_string(post.find('h2'))
            author_item = post.find('div', attrs={'class':'collection-node-byline'})
            author = re.sub(r'.*by\s', "", self.tag_to_string(author_item).strip())
            title = title + u' (%s)'%author
            article_page = self.index_to_soup('http://www.psychologytoday.com' + post.find('a', href=True)['href'])
            print_page = article_page.find('li', attrs={'class':'print_html first'})
            url = 'http://www.psychologytoday.com' + print_page.find('a', href=True)['href']
            desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
            self.log('Found article:', title)
            self.log('\t', url)
            self.log('\t', desc)
            articles.append({'title':title, 'url':url, 'date':'', 'description':desc})

        for post in div.findAll('div', attrs={'class':'collections-node-thumbnail-info'}):
            title = self.tag_to_string(post.find('h2'))
            author_item = post.find('div', attrs={'class':'collection-node-byline'})
            article_page = self.index_to_soup('http://www.psychologytoday.com' + post.find('a', href=True)['href'])
            print_page = article_page.find('li', attrs={'class':'print_html first'})
            description = post.find('div', attrs={'class':'collection-node-description'})
            author = re.sub(r'.*by\s', "", self.tag_to_string(description.nextSibling).strip())
            desc = self.tag_to_string(description).strip()
            url = 'http://www.psychologytoday.com' + print_page.find('a', href=True)['href']
            title = title + u' (%s)'%author
            self.log('Found article:', title)
            self.log('\t', url)
            self.log('\t', desc)
            articles.append({'title':title, 'url':url, 'date':'', 'description':desc})

        for post in div.findAll('li', attrs={'class':['collection-item-list-odd', 'collection-item-list-even']}):
            title = self.tag_to_string(post.find('h2'))
            author_item = post.find('div', attrs={'class':'collection-node-byline'})
            author = re.sub(r'.*by\s', "", self.tag_to_string(author_item).strip())
            title = title + u' (%s)'%author
            article_page = self.index_to_soup('http://www.psychologytoday.com' + post.find('a', href=True)['href'])
            print_page = article_page.find('li', attrs={'class':'print_html first'})
            url = 'http://www.psychologytoday.com' + print_page.find('a', href=True)['href']
            desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
            self.log('Found article:', title)
            self.log('\t', url)
            self.log('\t', desc)
            articles.append({'title':title, 'url':url, 'date':'', 'description':desc})

        return [('Current Issue', articles)]
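
Each of the three loops in the updated Psychology Today recipe extracts the author with the same regex: everything up to and including the word "by" is stripped from the byline text, and the result is appended to the article title in parentheses. A minimal standalone sketch of that step, using a made-up byline string rather than live markup from psychologytoday.com:

import re

# Hypothetical byline text, standing in for what tag_to_string() returns
# for a 'collection-node-byline' div on the magazine index page.
byline = 'Feature by Jane Doe'

# Drop everything up to and including 'by ', keeping only the author name.
author = re.sub(r'.*by\s', '', byline.strip())

# The recipe then decorates the article title with the author.
title = u'Sample Article' + u' (%s)' % author
print(title)  # Sample Article (Jane Doe)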


@@ -1,61 +1,67 @@
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict

class Smithsonian(BasicNewsRecipe):

    title = 'Smithsonian Magazine'
    __author__ = 'Rick Shang'

    description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [dict(attrs={'id':['articleTitle', 'subHead', 'byLine', 'articleImage', 'article-text']})]
    remove_tags = [dict(attrs={'class':['related-articles-inpage', 'viewMorePhotos']})]
    no_javascript = True
    no_stylesheets = True

    def parse_index(self):
        # Go to the issue
        soup0 = self.index_to_soup('http://www.smithsonianmag.com/issue/archive/')
        div = soup0.find('div', attrs={'id':'archives'})
        issue = div.find('ul', attrs={'class':'clear-both'})
        current_issue_url = issue.find('a', href=True)['href']
        soup = self.index_to_soup(current_issue_url)

        # Go to the main body
        div = soup.find('div', attrs={'id':'content-inset'})

        # Find date
        date = re.sub(r'.*\:\W*', "", self.tag_to_string(div.find('h2')).strip())
        self.timefmt = u' [%s]'%date

        # Find cover
        self.cover_url = div.find('img', src=True)['src']

        feeds = OrderedDict()
        section_title = ''
        subsection_title = ''
        for post in div.findAll('div', attrs={'class':['plainModule', 'departments plainModule']}):
            articles = []
            prefix = ''
            h3 = post.find('h3')
            if h3 is not None:
                section_title = self.tag_to_string(h3)
            else:
                subsection = post.find('p', attrs={'class':'article-cat'})
                link = post.find('a', href=True)
                url = link['href'] + '?c=y&story=fullstory'
                if subsection is not None:
                    subsection_title = self.tag_to_string(subsection)
                    prefix = (subsection_title + ': ')
                    description = self.tag_to_string(post('p', limit=2)[1]).strip()
                else:
                    description = self.tag_to_string(post.find('p')).strip()
                desc = re.sub(r'\sBy\s.*', '', description, re.DOTALL)
                author = re.sub(r'.*By\s', '', description, re.DOTALL)
                title = prefix + self.tag_to_string(link).strip() + u' (%s)'%author
                articles.append({'title':title, 'url':url, 'description':desc, 'date':''})

            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        ans = [(key, val) for key, val in feeds.iteritems()]
        return ans
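
Both this Smithsonian recipe and the New Republic recipe below collect articles into an OrderedDict keyed by section title, so sections are emitted in the order they are first seen on the issue page, and then convert the dict into the list of (section, articles) tuples that parse_index() must return. A small sketch of that accumulation pattern, using made-up section names and article entries:

from collections import OrderedDict

feeds = OrderedDict()

# Hypothetical (section, article) pairs in the order they appear on the page.
found = [
    ('Features', {'title': 'A', 'url': 'http://example.com/a', 'description': '', 'date': ''}),
    ('Departments', {'title': 'B', 'url': 'http://example.com/b', 'description': '', 'date': ''}),
    ('Features', {'title': 'C', 'url': 'http://example.com/c', 'description': '', 'date': ''}),
]

for section_title, article in found:
    if section_title not in feeds:
        feeds[section_title] = []
    feeds[section_title].append(article)

# The recipes call feeds.iteritems() under calibre's Python 2 runtime;
# items() is the equivalent spelling here.
ans = [(key, val) for key, val in feeds.items()]
# -> [('Features', [A, C]), ('Departments', [B])]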


@@ -1,45 +1,64 @@
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict

class The_New_Republic(BasicNewsRecipe):

    title = 'The New Republic'
    __author__ = 'Rick Shang'

    description = 'The New Republic is a journal of opinion with an emphasis on politics and domestic and international affairs. It carries feature articles by staff and contributing editors. The second half of each issue is devoted to books and the arts, theater, motion pictures, music and art.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    remove_tags = [dict(attrs={'class':['print-logo', 'print-site_name', 'print-hr']})]
    no_javascript = True
    no_stylesheets = True

    def parse_index(self):
        # Go to the issue
        soup0 = self.index_to_soup('http://www.tnr.com/magazine-issues')
        issue = soup0.find('div', attrs={'id':'current_issue'})

        # Find date
        date = self.tag_to_string(issue.find('div', attrs={'class':'date'})).strip()
        self.timefmt = u' [%s]'%date

        # Go to the main body
        current_issue_url = 'http://www.tnr.com' + issue.find('a', href=True)['href']
        soup = self.index_to_soup(current_issue_url)
        div = soup.find('div', attrs={'class':'article_detail_body'})

        # Find cover
        self.cover_url = div.find('img', src=True)['src']

        feeds = OrderedDict()
        section_title = ''
        subsection_title = ''
        for post in div.findAll('p'):
            articles = []
            em = post.find('em')
            b = post.find('b')
            a = post.find('a', href=True)
            if em is not None:
                section_title = self.tag_to_string(em).strip()
                subsection_title = ''
            elif b is not None:
                subsection_title = self.tag_to_string(b).strip()
            elif a is not None:
                prefix = (subsection_title + ': ') if subsection_title else ''
                url = re.sub('www.tnr.com', 'www.tnr.com/print', a['href'])
                author = re.sub(r'.*by\s', '', self.tag_to_string(post), re.DOTALL)
                title = prefix + self.tag_to_string(a).strip() + u' (%s)'%author
                articles.append({'title':title, 'url':url, 'description':'', 'date':''})

            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        ans = [(key, val) for key, val in feeds.iteritems()]
        return ans
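
Instead of overriding print_version() as the previous version of this recipe did, the updated recipe rewrites each story URL up front so that calibre fetches the print-friendly page directly. A minimal sketch of that rewrite on a made-up article URL:

import re

# Hypothetical article link taken from the issue's table of contents.
href = 'http://www.tnr.com/article/politics/sample-story'

# Insert '/print' after the host so the print version is downloaded.
url = re.sub('www.tnr.com', 'www.tnr.com/print', href)
print(url)  # http://www.tnr.com/print/article/politics/sample-story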