Fix #7917 (New Scientist recipe update)

This commit is contained in:
Kovid Goyal 2010-12-16 12:13:40 -07:00
parent 5ce7afa6e2
commit 8289d68454

View File

@@ -5,6 +5,7 @@ newscientist.com
''' '''
import re import re
import urllib
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class NewScientist(BasicNewsRecipe): class NewScientist(BasicNewsRecipe):
@@ -24,7 +25,7 @@ class NewScientist(BasicNewsRecipe):
needs_subscription = 'optional' needs_subscription = 'optional'
extra_css = """ extra_css = """
body{font-family: Arial,sans-serif} body{font-family: Arial,sans-serif}
img{margin-bottom: 0.8em} img{margin-bottom: 0.8em; display: block}
.quotebx{font-size: x-large; font-weight: bold; margin-right: 2em; margin-left: 2em} .quotebx{font-size: x-large; font-weight: bold; margin-right: 2em; margin-left: 2em}
""" """
@@ -41,12 +42,14 @@ class NewScientist(BasicNewsRecipe):
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
br.open('http://www.newscientist.com/') br.open('http://www.newscientist.com/')
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
br.open('https://www.newscientist.com/user/login?redirectURL=') br.open('https://www.newscientist.com/user/login')
br.select_form(nr=2) data = urllib.urlencode({ 'source':'form'
br['loginId' ] = self.username ,'redirectURL':''
br['password'] = self.password ,'loginId':self.username
br.submit() ,'password':self.password
})
br.open('https://www.newscientist.com/user/login',data)
return br return br
remove_tags = [ remove_tags = [
@@ -55,21 +58,22 @@ class NewScientist(BasicNewsRecipe):
,dict(name='p' , attrs={'class':['marker','infotext' ]}) ,dict(name='p' , attrs={'class':['marker','infotext' ]})
,dict(name='meta' , attrs={'name' :'description' }) ,dict(name='meta' , attrs={'name' :'description' })
,dict(name='a' , attrs={'rel' :'tag' }) ,dict(name='a' , attrs={'rel' :'tag' })
,dict(name='ul' , attrs={'class':'markerlist' })
,dict(name=['link','base','meta','iframe','object','embed']) ,dict(name=['link','base','meta','iframe','object','embed'])
] ]
remove_tags_after = dict(attrs={'class':['nbpcopy','comments']}) remove_tags_after = dict(attrs={'class':['nbpcopy','comments']})
remove_attributes = ['height','width','lang'] remove_attributes = ['height','width','lang','onclick']
feeds = [ feeds = [
(u'Latest Headlines' , u'http://feeds.newscientist.com/science-news' ) (u'Latest Headlines' , u'http://feeds.newscientist.com/science-news' )
,(u'Magazine' , u'http://www.newscientist.com/feed/magazine' ) ,(u'Magazine' , u'http://feeds.newscientist.com/magazine' )
,(u'Health' , u'http://www.newscientist.com/feed/view?id=2&type=channel' ) ,(u'Health' , u'http://feeds.newscientist.com/health' )
,(u'Life' , u'http://www.newscientist.com/feed/view?id=3&type=channel' ) ,(u'Life' , u'http://feeds.newscientist.com/life' )
,(u'Space' , u'http://www.newscientist.com/feed/view?id=6&type=channel' ) ,(u'Space' , u'http://feeds.newscientist.com/space' )
,(u'Physics and Mathematics' , u'http://www.newscientist.com/feed/view?id=4&type=channel' ) ,(u'Physics and Mathematics' , u'http://feeds.newscientist.com/physics-math' )
,(u'Environment' , u'http://www.newscientist.com/feed/view?id=1&type=channel' ) ,(u'Environment' , u'http://feeds.newscientist.com/environment' )
,(u'Science in Society' , u'http://www.newscientist.com/feed/view?id=5&type=channel' ) ,(u'Science in Society' , u'http://feeds.newscientist.com/science-in-society' )
,(u'Tech' , u'http://www.newscientist.com/feed/view?id=7&type=channel' ) ,(u'Tech' , u'http://feeds.newscientist.com/tech' )
] ]
def get_article_url(self, article): def get_article_url(self, article):
@@ -79,11 +83,21 @@ class NewScientist(BasicNewsRecipe):
return url + '?full=true&print=true' return url + '?full=true&print=true'
def preprocess_html(self, soup): def preprocess_html(self, soup):
if soup.html.has_key('id'):
del soup.html['id']
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(['quote','quotetext']): for item in soup.findAll(['quote','quotetext']):
item.name='p' item.name='p'
for item in soup.findAll(['xref','figref']):
tstr = item.string
item.replaceWith(tstr)
for tg in soup.findAll('a'): for tg in soup.findAll('a'):
if tg.string == 'Home': if tg.string == 'Home':
tg.parent.extract() tg.parent.extract()
return self.adeify_images(soup) else:
return self.adeify_images(soup) if tg.string is not None:
tstr = tg.string
tg.replaceWith(tstr)
return soup