calibre/recipes/nytimesbook.recipe
Kovid Goyal 567040ee1e Perform PEP8 compliance checks on the entire codebase
Some bits of PEP 8 are turned off via setup.cfg
2016-07-29 21:25:17 +05:30

66 lines
2.4 KiB
Plaintext

from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
    """Build a tag-matching spec for elements carrying any of the given classes.

    Args:
        classes: Whitespace-separated CSS class names to look for.

    Returns:
        A dict of the form ``{'attrs': {'class': matcher}}``. ``matcher`` takes
        a tag's ``class`` attribute value: a falsy value (``None`` or ``''``)
        is returned unchanged, otherwise the result is the frozenset of
        wanted class names present in the value (empty when none match).
    """
    # split() without an argument collapses runs of whitespace, so the wanted
    # set never contains an empty-string token.
    wanted = frozenset(classes.split())

    def matcher(value):
        # Tags without a class attribute hand us a falsy value; pass it back
        # as-is so the match fails.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': matcher}}
class NewYorkTimesBookReview(BasicNewsRecipe):
    """Recipe for the New York Times Sunday Book Review.

    ``parse_index`` scrapes the Book Review index page and produces two
    feeds: 'Features' (headlines from the ranked table of contents) and
    'Latest' (entries from the "latest" panel).
    """

    # Recipe metadata.
    title = u'New York Times Book Review'
    language = 'en'
    description = 'The New York Times Sunday Book Review'
    __author__ = 'Kovid Goyal'

    # Download/cleanup options; their exact effect is defined by
    # BasicNewsRecipe, which is not part of this file.
    no_stylesheets = True
    no_javascript = True
    ignore_duplicate_articles = {'title', 'url'}
    encoding = 'utf-8'

    # Tag specs selecting the parts of an article page to keep.
    keep_only_tags = [
        dict(id=['headline', 'story-meta-footer']),
        dict(itemprop=['associatedMedia', 'articleBody', 'reviewBody']),
        classes('story-body'),
    ]
    # Tag specs selecting promo / sharing elements to remove.
    remove_tags = [
        dict(id=['d-promo-realestate', 'books-update-email-promo']),
        classes('skip-to-text-link story-meta-footer-sharetools story-footer-links'),
    ]

    def parse_index(self):
        """Scrape the Book Review index page and return the feed list.

        Sections of the page that cannot be found are logged and skipped
        rather than raising ``AttributeError`` on a ``None`` lookup result.

        Returns:
            A list of two ``(feed_title, articles)`` tuples, ``'Features'``
            and ``'Latest'``. Each article is a dict with ``title``, ``url``
            and ``description`` keys.
        """
        soup = self.index_to_soup(
            'http://www.nytimes.com/pages/books/review/index.html')
        main_articles, articles = [], []
        feeds = [('Features', main_articles), ('Latest', articles)]

        def add_article(target, title, url, desc):
            # Record one article in the given feed list and log it.
            target.append({'title': title, 'url': url, 'description': desc})
            self.log('Found:', title, 'at', url)
            if desc:
                self.log('\t', desc)

        # Find TOC
        toc = soup.find('div', attrs={'class': 'rank'})
        if toc is None:
            self.log('Could not find the table of contents, skipping Features')
            headlines = []
        else:
            headlines = toc.findAll('h2', attrs={'class': 'headline'})
        for h2 in headlines:
            a = h2.find('a', href=True)
            if a is None:
                continue
            # The summary, when present, is a sibling <p> after the headline.
            p = h2.findNextSibling('p', attrs={'class': 'summary'})
            desc = self.tag_to_string(p) if p else ''
            add_article(main_articles, self.tag_to_string(a), a['href'], desc)

        # Walk the "latest" panel, guarding each step of the lookup chain.
        panel = soup.find(id='latest-panel')
        listing = panel.find('ol') if panel is not None else None
        if listing is None:
            self.log('Could not find the latest articles list, skipping Latest')
            items = []
        else:
            items = listing.findAll('li')
        for li in items:
            a = li.find('a', attrs={'class': 'story-link'}, href=True)
            if a is None:
                continue
            m = a.find(attrs={'class': 'story-meta'})
            if m is None:
                # Without the meta block there is no title to extract.
                continue
            title = self.tag_to_string(m.find('h2'))
            desc = self.tag_to_string(m.find(attrs={'class': 'summary'}))
            add_article(articles, title, a['href'], desc)

        return feeds