From d41cabce250a0dde970709e1dce09401bc341627 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 22 Dec 2009 20:03:58 -0700 Subject: [PATCH] Fix #4269 (The Straits Times feed - error in parser?) --- resources/recipes/straitstimes.recipe | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/resources/recipes/straitstimes.recipe b/resources/recipes/straitstimes.recipe index 64e50e2f60..5faf616774 100644 --- a/resources/recipes/straitstimes.recipe +++ b/resources/recipes/straitstimes.recipe @@ -1,4 +1,3 @@ -#!/usr/bin/env python __license__ = 'GPL v3' __copyright__ = '2009, Darko Miletic ' @@ -6,6 +5,7 @@ __copyright__ = '2009, Darko Miletic ' www.straitstimes.com ''' +import re from calibre.web.feeds.recipes import BasicNewsRecipe class StraitsTimes(BasicNewsRecipe): @@ -29,9 +29,21 @@ class StraitsTimes(BasicNewsRecipe): ,'publisher' : publisher } - remove_tags = [dict(name=['object','link','map'])] + preprocess_regexps = [ + (re.compile( + r'', + re.IGNORECASE|re.DOTALL), + lambda m:''), + (re.compile(r'', re.IGNORECASE|re.DOTALL), + lambda m: ''), + ] + remove_tags = [ + dict(name=['object','link','map']) + ,dict(name='div',attrs={'align':'left'}) + ] - keep_only_tags = [dict(name='div', attrs={'class':['top_headline','story_text']})] + keep_only_tags = [dict(name='div', attrs={'class':'stleft'})] + remove_tags_after=dict(name='div',attrs={'class':'hr_thin'}) feeds = [ (u'Singapore' , u'http://www.straitstimes.com/STI/STIFILES/rss/break_singapore.xml' ) @@ -47,4 +59,3 @@ class StraitsTimes(BasicNewsRecipe): for item in soup.findAll(style=True): del item['style'] return soup -