#!/usr/bin/env python
# Recovered from a whitespace-collapsed git patch: commit
# d84fd43c5a38f82bc1beef0b1828e85f74c06945 (Kovid Goyal, Wed, 24 Sep 2008
# 18:25:27 -0700), subject "Improved recipe for The NEw York Review of Books".
# The patch creates this module as
# src/calibre/web/feeds/recipes/new_york_review_of_books.py and, in the same
# commit, appends 'new_york_review_of_books' to the `recipes` list in
# src/calibre/web/feeds/recipes/__init__.py (after 'scientific_american').
'''
nybooks.com

Recipe that downloads the current issue of The New York Review of Books.
'''

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2

from lxml import html

from calibre.constants import preferred_encoding
from calibre.web.feeds.news import BasicNewsRecipe

# Table-of-contents page of the current issue; also the base against which
# article links are resolved.
_INDEX_URL = 'http://www.nybooks.com/current-issue'


class NewYorkReviewOfBooks(BasicNewsRecipe):
    '''
    Builds a single "Current Issue" feed from the table of contents at
    http://www.nybooks.com/current-issue.
    '''

    title       = u'New York Review of Books'
    description = u'Book reviews'
    __author__  = 'Kovid Goyal'

    # Tag selectors handed to BasicNewsRecipe; how they are applied to the
    # downloaded articles is defined by the base class, not here.
    remove_tags_before = {'id':'container'}
    remove_tags = [{'class':['noprint', 'ad', 'footer']}, {'id':'right-content'}]

    def parse_index(self):
        '''
        Scrape the current-issue page into a list of feeds.

        The issue date is taken from the first ``<h4 class="date">`` element
        and stored in ``self.timefmt``. The siblings that follow that heading
        are walked until the next ``<h4>``: every ``<p>`` that contains a link
        becomes an article, and every ``<p class="indented">`` is appended (as
        serialized HTML) to the description of the article before it.

        :return: ``[('Current Issue', articles)]``, where each article is a
                 dict with the keys ``title``, ``date``, ``url`` and
                 ``description``.
        :raises IndexError: if the page has no ``<h4 class="date">`` heading.
        '''
        response = self.browser.open(_INDEX_URL)
        try:
            root = html.fromstring(response.read())
        finally:
            # Release the connection even if reading or parsing fails.
            response.close()

        headings = root.xpath('//h4[@class = "date"]')
        if not headings:
            # Same exception type as the former bare ``[0]`` lookup, but with
            # a message that points at the cause.
            raise IndexError(
                'No <h4 class="date"> heading found at ' + _INDEX_URL)
        date = headings[0]

        # text_content() also covers a date wrapped in child markup, in which
        # case ``date.text`` would be None.
        issue_date = date.text_content().strip()
        self.timefmt = ' [' + issue_date.encode(preferred_encoding) + ']'

        articles = []
        for tag in date.itersiblings():
            if tag.tag == 'h4':
                break  # only the siblings up to the next <h4> are scanned
            if tag.tag != 'p':
                continue

            # Token test, so class="indented foo" is recognised as well.
            if 'indented' in (tag.get('class') or '').split():
                # Continuation paragraph for the previous article; there is
                # nothing to attach it to if no article has been seen yet.
                if articles:
                    articles[-1]['description'] += html.tostring(tag)
                continue

            links = tag.xpath('descendant::a[@href]')
            if not links:
                continue  # a paragraph without a link cannot be downloaded

            # Collapse the newlines/indentation that the HTML source leaves
            # between the text nodes.
            raw_title = u''.join(tag.xpath('descendant::text()'))
            articles.append({
                'title': u' '.join(raw_title.split()),
                'date': '',
                # urljoin copes with absolute and relative hrefs alike.
                'url': urljoin(_INDEX_URL, links[0].get('href')),
                'description': '',
            })

        return [('Current Issue', articles)]