mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Slate
This commit is contained in:
parent
dc18dbd5b0
commit
b2dc29019a
@ -7,31 +7,28 @@ __license__ = 'GPL v3'
|
|||||||
calibre recipe for slate.com
|
calibre recipe for slate.com
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import re
|
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
|
|
||||||
class Slate(BasicNewsRecipe):
|
class Slate(BasicNewsRecipe):
|
||||||
|
title = 'Slate'
|
||||||
description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
|
description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
|
||||||
__author__ = 'Kovid Goyal'
|
__author__ = 'Kovid Goyal'
|
||||||
timefmt = ''
|
timefmt = ''
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
language = 'en'
|
language = 'en'
|
||||||
title = 'Slate'
|
|
||||||
INDEX = 'http://slate.com'
|
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
preprocess_regexps = [
|
masthead_url = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
|
||||||
(re.compile(r'<!--.*?-->', re.DOTALL), lambda x: ''),
|
|
||||||
(re.compile(r'^.*?<html', re.DOTALL), lambda x:'<html'),
|
|
||||||
(re.compile(r'<meta[^>]+?/>', re.DOTALL), lambda x:''),
|
|
||||||
]
|
|
||||||
remove_tags = [
|
|
||||||
{'name':['link', 'script']},
|
|
||||||
{'class':['share-box-flank', 'sl-crumbs', 'sl-tbar',
|
|
||||||
'sl-chunky-tbar']},
|
|
||||||
]
|
|
||||||
remove_tags_after = [{'class':'sl-art-creds-cntr'}]
|
|
||||||
keep_only_tags = {'class':'sl-body-wrapper'}
|
|
||||||
remove_attributes = ['style']
|
remove_attributes = ['style']
|
||||||
|
INDEX = 'http://slate.com'
|
||||||
|
|
||||||
|
keep_only_tags = [
|
||||||
|
dict(name='header', attrs={'class':'article-header'}),
|
||||||
|
dict(name='section', attrs={'class':'content'}),
|
||||||
|
]
|
||||||
|
remove_tags = [
|
||||||
|
dict(id='header_social'),
|
||||||
|
dict(attrs={'class':['prop-name', 'prop-desc', 'authorbox']}),
|
||||||
|
]
|
||||||
|
|
||||||
def print_version(self, url):
|
def print_version(self, url):
|
||||||
return url.replace('.html', '.single.html')
|
return url.replace('.html', '.single.html')
|
||||||
@ -49,48 +46,32 @@ class Slate(BasicNewsRecipe):
|
|||||||
('Double X', '/articles/double_x.html'),
|
('Double X', '/articles/double_x.html'),
|
||||||
):
|
):
|
||||||
url = self.INDEX + url
|
url = self.INDEX + url
|
||||||
self.log('Found section:', sectitle)
|
self.log('\nFound section:', sectitle)
|
||||||
articles = self.slate_section_articles(self.index_to_soup(url))
|
articles = self.slate_section_articles(self.index_to_soup(url))
|
||||||
if articles:
|
if articles:
|
||||||
ans.append((sectitle, articles))
|
ans.append((sectitle, articles))
|
||||||
|
if self.test and len(ans) > 1:
|
||||||
|
break
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
def slate_section_articles(self, soup):
|
def slate_section_articles(self, soup):
|
||||||
cont = soup.find('div', id='most_read')
|
|
||||||
seen = set()
|
|
||||||
ans = []
|
ans = []
|
||||||
for h4 in cont.findAll('h4'):
|
main = soup.find('article', attrs={'class':'main'})
|
||||||
a = h4.find('a', href=True)
|
for a in main.findAll('a', attrs={'class':'primary'}):
|
||||||
if a is None: continue
|
|
||||||
url = a['href']
|
url = a['href']
|
||||||
if url.startswith('/'):
|
if url.endswith('/'):
|
||||||
url = self.INDEX + url
|
continue
|
||||||
if url in seen: continue
|
p = a.parent
|
||||||
seen.add(url)
|
title = p.find(attrs={'class':'hed'})
|
||||||
title = self.tag_to_string(a)
|
if title is None:
|
||||||
parent = h4.parent
|
continue
|
||||||
h3 = parent.find('h3')
|
title = self.tag_to_string(title)
|
||||||
|
span = p.find(attrs={'class':'byline'})
|
||||||
desc = ''
|
desc = ''
|
||||||
if h3 is not None:
|
if span is not None:
|
||||||
desc = self.tag_to_string(h3)
|
desc = self.tag_to_string(span)
|
||||||
a = parent.find('a', rel='author')
|
self.log('\t' + title)
|
||||||
if a is not None:
|
self.log('\t\t' + url)
|
||||||
a = self.tag_to_string(a)
|
ans.append({'title':title, 'description':desc, 'date':'', 'url':url})
|
||||||
art = {'title':title, 'description':desc, 'date':'', 'url':url}
|
|
||||||
if a:
|
|
||||||
art['author'] = a
|
|
||||||
self.log('\tFound article:', title, ' by ', a)
|
|
||||||
ans.append(art)
|
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
def get_masthead_url(self):
|
|
||||||
masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
|
|
||||||
br = BasicNewsRecipe.get_browser(self)
|
|
||||||
try:
|
|
||||||
br.open(masthead)
|
|
||||||
except:
|
|
||||||
self.log("\nMasthead unavailable")
|
|
||||||
masthead = None
|
|
||||||
return masthead
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user