Fix #4269 (The Straits Times feed - error in parser?)

This commit is contained in:
Kovid Goyal 2009-12-22 20:03:58 -07:00
parent e4164d71d2
commit d41cabce25

View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
@@ -6,6 +5,7 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
www.straitstimes.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class StraitsTimes(BasicNewsRecipe):
@@ -29,9 +29,21 @@ class StraitsTimes(BasicNewsRecipe):
,'publisher' : publisher
}
remove_tags = [dict(name=['object','link','map'])]
preprocess_regexps = [
(re.compile(
r'<meta name="description" content="[^"]+"\s*/?>',
re.IGNORECASE|re.DOTALL),
lambda m:''),
(re.compile(r'<!--.+?-->', re.IGNORECASE|re.DOTALL),
lambda m: ''),
]
remove_tags = [
dict(name=['object','link','map'])
,dict(name='div',attrs={'align':'left'})
]
keep_only_tags = [dict(name='div', attrs={'class':['top_headline','story_text']})]
keep_only_tags = [dict(name='div', attrs={'class':'stleft'})]
remove_tags_after=dict(name='div',attrs={'class':'hr_thin'})
feeds = [
(u'Singapore' , u'http://www.straitstimes.com/STI/STIFILES/rss/break_singapore.xml' )
@@ -47,4 +59,3 @@ class StraitsTimes(BasicNewsRecipe):
for item in soup.findAll(style=True):
del item['style']
return soup