mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Update The New Republic
This commit is contained in:
parent
f013d5e371
commit
c504bbab3d
@ -1,68 +1,63 @@
|
|||||||
import re
|
import re
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from collections import OrderedDict
|
|
||||||
|
|
||||||
class TNR(BasicNewsRecipe):
|
class TNR(BasicNewsRecipe):
|
||||||
|
|
||||||
title = 'The New Republic'
|
title = 'The New Republic'
|
||||||
__author__ = 'Rick Shang'
|
__author__ = 'Krittika Goyal'
|
||||||
|
|
||||||
description = 'The New Republic is a journal of opinion with an emphasis on politics and domestic and international affairs. It carries feature articles by staff and contributing editors. The second half of each issue is devoted to book and the arts, theater, motion pictures, music and art.'
|
description = '''The New Republic is a journal of opinion with an emphasis
|
||||||
language = 'en'
|
on politics and domestic and international affairs. It carries feature
|
||||||
category = 'news'
|
articles by staff and contributing editors. The second half of each issue
|
||||||
encoding = 'UTF-8'
|
is devoted to book and the arts, theater, motion pictures, music and art.'''
|
||||||
remove_tags = [dict(attrs={'class':['print-logo','print-site_name','print-hr']})]
|
|
||||||
no_javascript = True
|
language = 'en'
|
||||||
no_stylesheets = True
|
encoding = 'UTF-8'
|
||||||
|
needs_subscription = True
|
||||||
|
|
||||||
def parse_index(self):
|
preprocess_regexps = [
|
||||||
|
(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
|
||||||
#Go to the issue
|
(re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
|
||||||
soup0 = self.index_to_soup('http://www.tnr.com/magazine-issues')
|
]
|
||||||
issue = soup0.find('div',attrs={'id':'current_issue'})
|
|
||||||
|
def get_browser(self):
|
||||||
#Find date
|
br = BasicNewsRecipe.get_browser(self)
|
||||||
date = self.tag_to_string(issue.find('div',attrs={'class':'date'})).strip()
|
br.open('http://www.newrepublic.com/user')
|
||||||
self.timefmt = u' [%s]'%date
|
br.select_form(nr=1)
|
||||||
|
try:
|
||||||
#Go to the main body
|
br['user'] = self.username
|
||||||
current_issue_url = 'http://www.tnr.com' + issue.find('a', href=True)['href']
|
except:
|
||||||
soup = self.index_to_soup(current_issue_url)
|
br['name'] = self.username
|
||||||
div = soup.find ('div', attrs={'class':'article_detail_body'})
|
br['pass'] = self.password
|
||||||
|
self.log('Logging in...')
|
||||||
|
raw = br.submit().read()
|
||||||
|
if 'SIGN OUT' not in raw:
|
||||||
#Find cover
|
raise ValueError('Failed to log in to tnr.com, check your username and password')
|
||||||
self.cover_url = div.find('img',src=True)['src']
|
self.log('Logged in successfully')
|
||||||
|
return br
|
||||||
feeds = OrderedDict()
|
|
||||||
section_title = ''
|
def parse_index(self):
|
||||||
subsection_title = ''
|
raw = self.index_to_soup('http://www.newrepublic.com/current-issue', raw=True)
|
||||||
for post in div.findAll('p'):
|
# raw = self.index_to_soup(open('/t/raw.html').read().decode('utf-8'), raw=True)
|
||||||
articles = []
|
for pat, sub in self.preprocess_regexps:
|
||||||
em=post.find('em')
|
raw = pat.sub(sub, raw)
|
||||||
b=post.find('b')
|
soup = self.index_to_soup(raw)
|
||||||
a=post.find('a',href=True)
|
feed_title = 'The New Republic Magazine Articles'
|
||||||
p=post.find('img', src=True)
|
|
||||||
#Find cover
|
articles = []
|
||||||
if p is not None:
|
for div in soup.findAll('div', attrs={'class':lambda x: x and 'field-item' in x.split()}):
|
||||||
self.cover_url = p['src'].strip()
|
a = div.find('a', href=True, attrs={'class':lambda x: x != 'author'})
|
||||||
if em is not None:
|
if a is not None:
|
||||||
section_title = self.tag_to_string(em).strip()
|
art_title = self.tag_to_string(a)
|
||||||
subsection_title = ''
|
url = a.get('href')
|
||||||
elif b is not None:
|
num = re.search(r'/(\d+)/', url)
|
||||||
subsection_title=self.tag_to_string(b).strip()
|
if num is not None:
|
||||||
elif a is not None:
|
art = num.group(1)
|
||||||
prefix = (subsection_title+': ') if subsection_title else ''
|
url = 'http://www.newrepublic.com/node/%s/print'%art
|
||||||
url=re.sub('www.tnr.com','www.tnr.com/print', a['href'])
|
self.log.info('\tFound article:', art_title, 'at', url)
|
||||||
author=re.sub('.*by\s', '', self.tag_to_string(post), re.DOTALL)
|
article = {'title':art_title, 'url':url, 'description':'', 'date':''}
|
||||||
title=prefix + self.tag_to_string(a).strip()+ u' (%s)'%author
|
articles.append(article)
|
||||||
articles.append({'title':title, 'url':url, 'description':'', 'date':''})
|
|
||||||
|
return [(feed_title, articles)]
|
||||||
if articles:
|
|
||||||
if section_title not in feeds:
|
|
||||||
feeds[section_title] = []
|
|
||||||
feeds[section_title] += articles
|
|
||||||
ans = [(key, val) for key, val in feeds.iteritems()]
|
|
||||||
return ans
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user