From 8289d684543a54a5a0f1c6446f4ec56e777af5c8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 16 Dec 2010 12:13:40 -0700 Subject: [PATCH] Fix #7917 (New Scientist recipe update) --- resources/recipes/new_scientist.recipe | 52 ++++++++++++++++---------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/resources/recipes/new_scientist.recipe b/resources/recipes/new_scientist.recipe index 02bbbe4d42..434c41f525 100644 --- a/resources/recipes/new_scientist.recipe +++ b/resources/recipes/new_scientist.recipe @@ -5,6 +5,7 @@ newscientist.com ''' import re +import urllib from calibre.web.feeds.news import BasicNewsRecipe class NewScientist(BasicNewsRecipe): @@ -24,7 +25,7 @@ class NewScientist(BasicNewsRecipe): needs_subscription = 'optional' extra_css = """ body{font-family: Arial,sans-serif} - img{margin-bottom: 0.8em} + img{margin-bottom: 0.8em; display: block} .quotebx{font-size: x-large; font-weight: bold; margin-right: 2em; margin-left: 2em} """ @@ -41,12 +42,14 @@ class NewScientist(BasicNewsRecipe): def get_browser(self): br = BasicNewsRecipe.get_browser() br.open('http://www.newscientist.com/') - if self.username is not None and self.password is not None: - br.open('https://www.newscientist.com/user/login?redirectURL=') - br.select_form(nr=2) - br['loginId' ] = self.username - br['password'] = self.password - br.submit() + if self.username is not None and self.password is not None: + br.open('https://www.newscientist.com/user/login') + data = urllib.urlencode({ 'source':'form' + ,'redirectURL':'' + ,'loginId':self.username + ,'password':self.password + }) + br.open('https://www.newscientist.com/user/login',data) return br remove_tags = [ @@ -55,21 +58,22 @@ class NewScientist(BasicNewsRecipe): ,dict(name='p' , attrs={'class':['marker','infotext' ]}) ,dict(name='meta' , attrs={'name' :'description' }) ,dict(name='a' , attrs={'rel' :'tag' }) + ,dict(name='ul' , attrs={'class':'markerlist' }) ,dict(name=['link','base','meta','iframe','object','embed']) ] remove_tags_after = dict(attrs={'class':['nbpcopy','comments']}) - remove_attributes = ['height','width','lang'] + remove_attributes = ['height','width','lang','onclick'] feeds = [ - (u'Latest Headlines' , u'http://feeds.newscientist.com/science-news' ) - ,(u'Magazine' , u'http://www.newscientist.com/feed/magazine' ) - ,(u'Health' , u'http://www.newscientist.com/feed/view?id=2&type=channel' ) - ,(u'Life' , u'http://www.newscientist.com/feed/view?id=3&type=channel' ) - ,(u'Space' , u'http://www.newscientist.com/feed/view?id=6&type=channel' ) - ,(u'Physics and Mathematics' , u'http://www.newscientist.com/feed/view?id=4&type=channel' ) - ,(u'Environment' , u'http://www.newscientist.com/feed/view?id=1&type=channel' ) - ,(u'Science in Society' , u'http://www.newscientist.com/feed/view?id=5&type=channel' ) - ,(u'Tech' , u'http://www.newscientist.com/feed/view?id=7&type=channel' ) + (u'Latest Headlines' , u'http://feeds.newscientist.com/science-news' ) + ,(u'Magazine' , u'http://feeds.newscientist.com/magazine' ) + ,(u'Health' , u'http://feeds.newscientist.com/health' ) + ,(u'Life' , u'http://feeds.newscientist.com/life' ) + ,(u'Space' , u'http://feeds.newscientist.com/space' ) + ,(u'Physics and Mathematics' , u'http://feeds.newscientist.com/physics-math' ) + ,(u'Environment' , u'http://feeds.newscientist.com/environment' ) + ,(u'Science in Society' , u'http://feeds.newscientist.com/science-in-society' ) + ,(u'Tech' , u'http://feeds.newscientist.com/tech' ) ] def get_article_url(self, article): @@ -79,11 +83,21 @@ class NewScientist(BasicNewsRecipe): return url + '?full=true&print=true' def preprocess_html(self, soup): + if soup.html.has_key('id'): + del soup.html['id'] + for item in soup.findAll(style=True): + del item['style'] for item in soup.findAll(['quote','quotetext']): item.name='p' + for item in soup.findAll(['xref','figref']): + tstr = item.string + item.replaceWith(tstr) for tg in soup.findAll('a'): if tg.string == 'Home': tg.parent.extract() - return self.adeify_images(soup) - return self.adeify_images(soup) + else: + if tg.string is not None: + tstr = tg.string + tg.replaceWith(tstr) + return soup