mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #4269 (The Straits Times feed - error in parser?)
This commit is contained in:
parent
e4164d71d2
commit
d41cabce25
@ -1,4 +1,3 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
@ -6,6 +5,7 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
|||||||
www.straitstimes.com
|
www.straitstimes.com
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
import re
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
|
|
||||||
class StraitsTimes(BasicNewsRecipe):
|
class StraitsTimes(BasicNewsRecipe):
|
||||||
@ -29,9 +29,21 @@ class StraitsTimes(BasicNewsRecipe):
|
|||||||
,'publisher' : publisher
|
,'publisher' : publisher
|
||||||
}
|
}
|
||||||
|
|
||||||
remove_tags = [dict(name=['object','link','map'])]
|
preprocess_regexps = [
|
||||||
|
(re.compile(
|
||||||
|
r'<meta name="description" content="[^"]+"\s*/?>',
|
||||||
|
re.IGNORECASE|re.DOTALL),
|
||||||
|
lambda m:''),
|
||||||
|
(re.compile(r'<!--.+?-->', re.IGNORECASE|re.DOTALL),
|
||||||
|
lambda m: ''),
|
||||||
|
]
|
||||||
|
remove_tags = [
|
||||||
|
dict(name=['object','link','map'])
|
||||||
|
,dict(name='div',attrs={'align':'left'})
|
||||||
|
]
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'class':['top_headline','story_text']})]
|
keep_only_tags = [dict(name='div', attrs={'class':'stleft'})]
|
||||||
|
remove_tags_after=dict(name='div',attrs={'class':'hr_thin'})
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
(u'Singapore' , u'http://www.straitstimes.com/STI/STIFILES/rss/break_singapore.xml' )
|
(u'Singapore' , u'http://www.straitstimes.com/STI/STIFILES/rss/break_singapore.xml' )
|
||||||
@ -47,4 +59,3 @@ class StraitsTimes(BasicNewsRecipe):
|
|||||||
for item in soup.findAll(style=True):
|
for item in soup.findAll(style=True):
|
||||||
del item['style']
|
del item['style']
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user