From c70adff6bfedbce5043ec0852fc5dca9c68a9ea3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 16 Dec 2009 12:23:39 -0700 Subject: [PATCH] Fix #4226 (London Review of Books fails to load completely. An index and table of contents is the only pages in the feed.) --- resources/recipes/lrb.recipe | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/resources/recipes/lrb.recipe b/resources/recipes/lrb.recipe index 8c248b00f1..0076b3e697 100644 --- a/resources/recipes/lrb.recipe +++ b/resources/recipes/lrb.recipe @@ -1,4 +1,3 @@ -#!/usr/bin/env python __license__ = 'GPL v3' __copyright__ = '2008, Darko Miletic ' @@ -12,30 +11,29 @@ class LondonReviewOfBooks(BasicNewsRecipe): title = u'London Review of Books' __author__ = u'Darko Miletic' description = u'Literary review publishing essay-length book reviews and topical articles on politics, literature, history, philosophy, science and the arts by leading writers and thinkers' + category = 'news, literature, England' + publisher = 'London Review of Books' oldest_article = 7 max_articles_per_feed = 100 - language = 'en_GB' - + language = 'en_GB' no_stylesheets = True use_embedded_content = False - encoding = 'cp1252' + encoding = 'utf-8' + conversion_options = { + 'comments' : description + ,'tags' : category + ,'language' : language + ,'publisher' : publisher + } + + keep_only_tags = [dict(name='div' , attrs={'id' :'main'})] remove_tags = [ - dict(name='div' , attrs={'id' :'otherarticles'}) - ,dict(name='div' , attrs={'class':'pagetools' }) - ,dict(name='div' , attrs={'id' :'mainmenu' }) - ,dict(name='div' , attrs={'id' :'precontent' }) - ,dict(name='div' , attrs={'class':'nocss' }) - ,dict(name='span', attrs={'class':'inlineright' }) + dict(name='div' , attrs={'class':['pagetools','issue-nav-controls','nocss']}) + ,dict(name='div' , attrs={'id' :['mainmenu','precontent','otherarticles'] }) + ,dict(name='span', attrs={'class':['inlineright','article-icons']}) + ,dict(name='ul' , attrs={'class':'article-controls'}) + ,dict(name='p' , attrs={'class':'meta-info' }) ] feeds = [(u'London Review of Books', u'http://www.lrb.co.uk/lrbrss.xml')] - - def print_version(self, url): - main, split, rest = url.rpartition('/') - return main + '/print/' + rest - - def postprocess_html(self, soup, first_fetch): - for t in soup.findAll(['table', 'tr', 'td']): - t.name = 'div' - return soup