mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #4269 (The Straits Times feed - error in parser?)
This commit is contained in:
parent
e4164d71d2
commit
d41cabce25
@ -1,4 +1,3 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||
@ -6,6 +5,7 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||
www.straitstimes.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class StraitsTimes(BasicNewsRecipe):
|
||||
@ -29,9 +29,21 @@ class StraitsTimes(BasicNewsRecipe):
|
||||
,'publisher' : publisher
|
||||
}
|
||||
|
||||
remove_tags = [dict(name=['object','link','map'])]
|
||||
preprocess_regexps = [
|
||||
(re.compile(
|
||||
r'<meta name="description" content="[^"]+"\s*/?>',
|
||||
re.IGNORECASE|re.DOTALL),
|
||||
lambda m:''),
|
||||
(re.compile(r'<!--.+?-->', re.IGNORECASE|re.DOTALL),
|
||||
lambda m: ''),
|
||||
]
|
||||
remove_tags = [
|
||||
dict(name=['object','link','map'])
|
||||
,dict(name='div',attrs={'align':'left'})
|
||||
]
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':['top_headline','story_text']})]
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'stleft'})]
|
||||
remove_tags_after=dict(name='div',attrs={'class':'hr_thin'})
|
||||
|
||||
feeds = [
|
||||
(u'Singapore' , u'http://www.straitstimes.com/STI/STIFILES/rss/break_singapore.xml' )
|
||||
@ -47,4 +59,3 @@ class StraitsTimes(BasicNewsRecipe):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return soup
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user