Update The New Republic

This commit is contained in:
Kovid Goyal 2013-05-01 22:56:54 +05:30
parent f013d5e371
commit c504bbab3d

View File

@@ -1,68 +1,63 @@
import re
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe
class TNR(BasicNewsRecipe):

    '''Calibre news recipe for The New Republic (newrepublic.com).

    Logs in with the user's subscription credentials, scrapes the
    current-issue page, and builds a single feed whose article URLs
    point at the printable version of each article.
    '''

    title = 'The New Republic'
    __author__ = 'Krittika Goyal'
    description = '''The New Republic is a journal of opinion with an emphasis
    on politics and domestic and international affairs. It carries feature
    articles by staff and contributing editors. The second half of each issue
    is devoted to book and the arts, theater, motion pictures, music and art.'''

    language = 'en'
    encoding = 'UTF-8'
    # A subscription is required; calibre will prompt for credentials
    # and get_browser() below performs the actual login.
    needs_subscription = True

    # Strip HTML comments and <script> blocks from fetched pages.
    # parse_index() also applies these by hand to the raw index page.
    preprocess_regexps = [
        (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
        (re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
    ]

    def get_browser(self):
        '''Return a mechanize browser logged in to newrepublic.com.

        Raises ValueError when the credentials are rejected, i.e. the
        page returned after submitting the form lacks 'SIGN OUT'.
        '''
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.newrepublic.com/user')
        br.select_form(nr=1)
        # The login form's username control has been seen under two
        # names; try 'user' first and fall back to 'name'.
        # NOTE: narrowed from a bare `except:` so SystemExit and
        # KeyboardInterrupt are no longer swallowed here.
        try:
            br['user'] = self.username
        except Exception:
            br['name'] = self.username
        br['pass'] = self.password
        self.log('Logging in...')
        raw = br.submit().read()
        if 'SIGN OUT' not in raw:
            raise ValueError(
                'Failed to log in to tnr.com, check your username and password')
        self.log('Logged in successfully')
        return br

    def parse_index(self):
        '''Build the article list from the current-issue page.

        Returns a list with a single (feed_title, articles) tuple; each
        article dict points at the printable node page for the article.
        '''
        raw = self.index_to_soup('http://www.newrepublic.com/current-issue', raw=True)
        # raw = self.index_to_soup(open('/t/raw.html').read().decode('utf-8'), raw=True)
        # preprocess_regexps is normally applied to article pages only,
        # so run the same substitutions over the raw index page by hand.
        for pat, sub in self.preprocess_regexps:
            raw = pat.sub(sub, raw)
        soup = self.index_to_soup(raw)
        feed_title = 'The New Republic Magazine Articles'

        articles = []
        for div in soup.findAll('div', attrs={'class':lambda x: x and 'field-item' in x.split()}):
            # Skip author links: take the first anchor whose class is
            # not 'author'.
            a = div.find('a', href=True, attrs={'class':lambda x: x != 'author'})
            if a is not None:
                art_title = self.tag_to_string(a)
                url = a.get('href')
                # Article URLs embed a numeric node id; rewrite to the
                # printable version of that node.
                num = re.search(r'/(\d+)/', url)
                if num is not None:
                    art = num.group(1)
                    url = 'http://www.newrepublic.com/node/%s/print'%art
                    self.log.info('\tFound article:', art_title, 'at', url)
                    article = {'title':art_title, 'url':url, 'description':'', 'date':''}
                    articles.append(article)

        return [(feed_title, articles)]