Update Slate

This commit is contained in:
Kovid Goyal 2013-09-24 22:13:46 +05:30
parent dc18dbd5b0
commit b2dc29019a

View File

@ -7,31 +7,28 @@ __license__ = 'GPL v3'
calibre recipe for slate.com calibre recipe for slate.com
''' '''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
class Slate(BasicNewsRecipe): class Slate(BasicNewsRecipe):
title = 'Slate'
description = 'A general-interest publication offering analysis and commentary about politics, news and culture.' description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
__author__ = 'Kovid Goyal' __author__ = 'Kovid Goyal'
timefmt = '' timefmt = ''
no_stylesheets = True no_stylesheets = True
language = 'en' language = 'en'
title = 'Slate'
INDEX = 'http://slate.com'
encoding = 'utf-8' encoding = 'utf-8'
preprocess_regexps = [ masthead_url = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
(re.compile(r'<!--.*?-->', re.DOTALL), lambda x: ''),
(re.compile(r'^.*?<html', re.DOTALL), lambda x:'<html'),
(re.compile(r'<meta[^>]+?/>', re.DOTALL), lambda x:''),
]
remove_tags = [
{'name':['link', 'script']},
{'class':['share-box-flank', 'sl-crumbs', 'sl-tbar',
'sl-chunky-tbar']},
]
remove_tags_after = [{'class':'sl-art-creds-cntr'}]
keep_only_tags = {'class':'sl-body-wrapper'}
remove_attributes = ['style'] remove_attributes = ['style']
INDEX = 'http://slate.com'
keep_only_tags = [
dict(name='header', attrs={'class':'article-header'}),
dict(name='section', attrs={'class':'content'}),
]
remove_tags = [
dict(id='header_social'),
dict(attrs={'class':['prop-name', 'prop-desc', 'authorbox']}),
]
def print_version(self, url): def print_version(self, url):
return url.replace('.html', '.single.html') return url.replace('.html', '.single.html')
@ -49,48 +46,32 @@ class Slate(BasicNewsRecipe):
('Double X', '/articles/double_x.html'), ('Double X', '/articles/double_x.html'),
): ):
url = self.INDEX + url url = self.INDEX + url
self.log('Found section:', sectitle) self.log('\nFound section:', sectitle)
articles = self.slate_section_articles(self.index_to_soup(url)) articles = self.slate_section_articles(self.index_to_soup(url))
if articles: if articles:
ans.append((sectitle, articles)) ans.append((sectitle, articles))
if self.test and len(ans) > 1:
break
return ans return ans
def slate_section_articles(self, soup): def slate_section_articles(self, soup):
cont = soup.find('div', id='most_read')
seen = set()
ans = [] ans = []
for h4 in cont.findAll('h4'): main = soup.find('article', attrs={'class':'main'})
a = h4.find('a', href=True) for a in main.findAll('a', attrs={'class':'primary'}):
if a is None: continue
url = a['href'] url = a['href']
if url.startswith('/'): if url.endswith('/'):
url = self.INDEX + url continue
if url in seen: continue p = a.parent
seen.add(url) title = p.find(attrs={'class':'hed'})
title = self.tag_to_string(a) if title is None:
parent = h4.parent continue
h3 = parent.find('h3') title = self.tag_to_string(title)
span = p.find(attrs={'class':'byline'})
desc = '' desc = ''
if h3 is not None: if span is not None:
desc = self.tag_to_string(h3) desc = self.tag_to_string(span)
a = parent.find('a', rel='author') self.log('\t' + title)
if a is not None: self.log('\t\t' + url)
a = self.tag_to_string(a) ans.append({'title':title, 'description':desc, 'date':'', 'url':url})
art = {'title':title, 'description':desc, 'date':'', 'url':url}
if a:
art['author'] = a
self.log('\tFound article:', title, ' by ', a)
ans.append(art)
return ans return ans
def get_masthead_url(self):
masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
br = BasicNewsRecipe.get_browser(self)
try:
br.open(masthead)
except:
self.log("\nMasthead unavailable")
masthead = None
return masthead