# Mirror of https://github.com/kovidgoyal/calibre.git
# (synced 2025-09-29 15:31:08 -04:00; 69 lines, 2.6 KiB, plaintext)
import re
|
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
|
from collections import OrderedDict
|
|
|
|
class TNR(BasicNewsRecipe):
    """Recipe that indexes the current issue of The New Republic.

    The current issue is located through the magazine-issues page on
    www.tnr.com. Every article link found in the issue body is rewritten to
    its ``www.tnr.com/print`` URL before being returned by ``parse_index``.
    """

    title = 'The New Republic'
    __author__ = 'Rick Shang'

    description = 'The New Republic is a journal of opinion with an emphasis on politics and domestic and international affairs. It carries feature articles by staff and contributing editors. The second half of each issue is devoted to book and the arts, theater, motion pictures, music and art.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    # Elements carrying these classes are removed from downloaded pages
    # (presumably furniture of the print layout -- confirm against the site).
    remove_tags = [dict(attrs={'class':['print-logo','print-site_name','print-hr']})]
    no_javascript = True
    no_stylesheets = True

    def parse_index(self):
        """Build the section/article index for the current issue.

        Side effects: sets ``self.timefmt`` to the issue date and
        ``self.cover_url`` to the last cover image found (if any).

        Returns:
            A list of ``(section_title, articles)`` tuples in page order,
            where ``articles`` is a list of dicts with the keys ``title``,
            ``url``, ``description`` and ``date``.

        Raises:
            ValueError: if the current-issue block or the issue body cannot
                be located in the fetched pages.
        """
        # Go to the issue
        soup0 = self.index_to_soup('http://www.tnr.com/magazine-issues')
        issue = soup0.find('div', attrs={'id': 'current_issue'})
        if issue is None:
            raise ValueError(
                'Could not find the current issue on the magazine-issues page')

        # Find date
        date = self.tag_to_string(
            issue.find('div', attrs={'class': 'date'})).strip()
        self.timefmt = u' [%s]'%date

        # Go to the main body
        current_issue_url = 'http://www.tnr.com' + issue.find('a', href=True)['href']
        soup = self.index_to_soup(current_issue_url)
        div = soup.find('div', attrs={'class': 'article_detail_body'})
        if div is None:
            raise ValueError(
                'Could not find the article list in %s' % current_issue_url)

        # Find cover. A missing image must not abort the whole download.
        cover = div.find('img', src=True)
        if cover is not None:
            self.cover_url = cover['src'].strip()

        # Everything up to and including the last "by " is dropped to leave
        # the author name. DOTALL lets the prefix span line breaks; the
        # original passed re.DOTALL as re.sub's *count* argument, so the flag
        # was never applied.
        byline_prefix = re.compile(r'.*by\s', re.DOTALL)

        feeds = OrderedDict()
        section_title = ''
        subsection_title = ''
        for post in div.findAll('p'):
            em = post.find('em')
            b = post.find('b')
            a = post.find('a', href=True)
            img = post.find('img', src=True)

            # An image inside a paragraph overrides the cover found above.
            if img is not None:
                self.cover_url = img['src'].strip()

            if em is not None:
                # <em> starts a new section and resets the subsection.
                section_title = self.tag_to_string(em).strip()
                subsection_title = ''
            elif b is not None:
                # <b> starts a subsection, used as a prefix on article titles.
                subsection_title = self.tag_to_string(b).strip()
            elif a is not None:
                prefix = (subsection_title + ': ') if subsection_title else ''
                # Literal replacement: the dots in the host name are not
                # regex wildcards.
                url = a['href'].replace('www.tnr.com', 'www.tnr.com/print')
                author = byline_prefix.sub('', self.tag_to_string(post)).strip()
                title = prefix + self.tag_to_string(a).strip() + u' (%s)'%author
                # Sections are created lazily so empty ones never appear.
                feeds.setdefault(section_title, []).append(
                    {'title': title, 'url': url, 'description': '', 'date': ''})

        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        return list(feeds.items())
|