Fix Business Week Magazine

This commit is contained in:
Kovid Goyal 2012-05-23 22:56:16 +05:30
parent 4833df56c8
commit cd8236c1d1

View File

@@ -15,6 +15,7 @@ class BusinessWeek(BasicNewsRecipe):
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 200 max_articles_per_feed = 200
no_stylesheets = True no_stylesheets = True
auto_cleanup = True
encoding = 'utf8' encoding = 'utf8'
use_embedded_content = False use_embedded_content = False
language = 'en' language = 'en'
@@ -36,12 +37,12 @@ class BusinessWeek(BasicNewsRecipe):
, 'language' : language , 'language' : language
} }
remove_tags = [ #remove_tags = [
dict(attrs={'class':'inStory'}) #dict(attrs={'class':'inStory'})
,dict(name=['meta','link','iframe','base','embed','object','table','th','tr','td']) #,dict(name=['meta','link','iframe','base','embed','object','table','th','tr','td'])
,dict(attrs={'id':['inset','videoDisplay']}) #,dict(attrs={'id':['inset','videoDisplay']})
] #]
keep_only_tags = [dict(name='div', attrs={'id':['story-body','storyBody']})] #keep_only_tags = [dict(name='div', attrs={'id':['story-body','storyBody']})]
remove_attributes = ['lang'] remove_attributes = ['lang']
match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*'] match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*']
@@ -100,3 +101,4 @@ class BusinessWeek(BasicNewsRecipe):
tstr = alink.string tstr = alink.string
alink.replaceWith(tstr) alink.replaceWith(tstr)
return soup return soup