diff --git a/recipes/the_oz.recipe b/recipes/the_oz.recipe index 3909265b30..20cd082de4 100644 --- a/recipes/the_oz.recipe +++ b/recipes/the_oz.recipe @@ -7,11 +7,13 @@ __docformat__ = 'restructuredtext en' http://www.theaustralian.news.com.au/ ''' -from calibre.web.feeds.news import BasicNewsRecipe +from calibre import browser +from calibre.web.feeds.jsnews import JavascriptRecipe +from calibre.web.feeds import feed_from_xml -class DailyTelegraph(BasicNewsRecipe): +class DailyTelegraph(JavascriptRecipe): title = u'The Australian' - __author__ = u'Matthew Briggs and Sujata Raman' + __author__ = u'Kovid Goyal' description = (u'National broadsheet newspaper from down under - colloquially known as The Oz' '. You will need to have a subscription to ' 'http://www.theaustralian.com.au to get full articles.') @@ -23,27 +25,16 @@ class DailyTelegraph(BasicNewsRecipe): remove_javascript = True no_stylesheets = True encoding = 'utf8' + remove_empty_feeds = True + ignore_duplicate_articles = {'url'} - keep_only_tags = [dict(name='div', attrs={'id': 'story'})] - - # remove_tags = [dict(name=['object','link'])] - remove_tags = [dict(name='div', attrs={'class': 'story-info'}), - dict(name='div', attrs={'class': 'story-header-tools'}), - dict(name='div', attrs={'class': 'story-sidebar'}), - dict(name='div', attrs={'class': 'story-footer'}), - dict(name='div', attrs={'id': 'comments'}), - dict(name='div', attrs={'class': 'story-extras story-extras-2'}), - dict(name='div', attrs={'class': 'group item-count-1 story-related'}) - ] - - extra_css = ''' - h1{font-family :Georgia,"Times New Roman",Times,serif; font-size:large; } - #article{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;} - .module-subheader{font-family :Tahoma,Geneva,Arial,Helvetica,sans-serif; color:#666666; font-size: xx-small;} - .intro{ font-family:Trebuchet MS,Trebuchet,Helvetica,sans-serif;font-size: x-small; } - .article-source{font-family:Trebuchet MS,Trebuchet,Helvetica,sans-serif; color:#666666; font-size: xx-small;} - .caption{font-family:Trebuchet MS,Trebuchet,Helvetica,sans-serif; font-size: xx-small;} - ''' + keep_only_tags = ['div#story'] + remove_tags = [ + '.story-info', '.story-header-tools', '.module-controls', '.story-sidebar', + '.story-footer', '#comments', '.story-extras', '.story-related', '.vms-nav', + '.vms-endcard', '.vms-discover', '.share-tools', '.story-comments-link', + '.vms-controls', '.ooyala-player', '.vms-countdown', '.vms-header', '.comments', + ] feeds = [ (u'News', u'http://feeds.news.com.au/public/rss/2.0/aus_news_807.xml'), @@ -63,29 +54,34 @@ class DailyTelegraph(BasicNewsRecipe): (u'Business', u'http://feeds.news.com.au/public/rss/2.0/aus_business_811.xml'), (u'Aviation', u'http://feeds.news.com.au/public/rss/2.0/aus_business_aviation_706.xml'), (u'Commercial Property', u'http://feeds.news.com.au/public/rss/2.0/aus_business_commercial_property_708.xml'), - (u'Mining', u'http://feeds.news.com.au/public/rss/2.0/aus_business_mining_704.xml')] + (u'Mining', u'http://feeds.news.com.au/public/rss/2.0/aus_business_mining_704.xml') + ] - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - if self.username and self.password: - br.open('http://www.theaustralian.com.au') - br.select_form(nr=1) - br['username'] = self.username - br['password'] = self.password - raw = br.submit().read() - if '>log out' not in raw.lower(): - raise ValueError('Failed to log in to www.theaustralian.com.au' - ' are your username and password correct?') - return br + def get_publication_data(self, br): + br = browser() + ans = {} + feeds = ans['index'] = [] + for title, url in self.feeds: + raw = br.open_novisit(url).read() + self.log('Fetching feed: %s' % title) + feed = feed_from_xml(raw, title=title, log=self.log, + oldest_article=self.oldest_article, max_articles_per_feed=self.max_articles_per_feed, get_article_url=self.get_article_url) + if len(feed) > 0: + feeds.append((title, [ + {'title':a.title, 'url':a.url, 'description':a.text_summary} for a in feed.articles])) + return ans + + def do_login(self, browser, username, password): + if username and password: + browser.visit('http://www.theaustralian.com.au/login') + form = browser.select_form('form[action="https://idp.news.com.au/idp/Authn/rest"]') + form['username'] = username + form['password'] = password + browser.submit(submit_control_selector='button[type="submit"]', timeout=60) + if '>Log Out' not in browser.html: + raise ValueError('Failed to log in, check your username and password') def get_article_url(self, article): return article.id - # br = self.get_browser() - # br.open(article.link).read() - # print br.geturl() - - # return br.geturl() - -