From a9e3e679e2da331e393d57caf3ab6735d11e12f4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 20 Apr 2013 09:35:49 +0530
Subject: [PATCH] Fix #1169590 (Updated recipe for Financial Times, UK and US
 edition)

---
 recipes/financial_times_uk.recipe |  6 +--
 recipes/financial_times_us.recipe | 63 +++++++++++++++++++++----------------
 2 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/recipes/financial_times_uk.recipe b/recipes/financial_times_uk.recipe
index 8105a9777f..6aa926a076 100644
--- a/recipes/financial_times_uk.recipe
+++ b/recipes/financial_times_uk.recipe
@@ -1,7 +1,7 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010-2012, Darko Miletic '
+__copyright__ = '2010-2013, Darko Miletic '
 '''
-www.ft.com/uk-edition
+www.ft.com/intl/uk-edition
 '''
 
 import datetime
@@ -29,7 +29,7 @@ class FinancialTimes(BasicNewsRecipe):
     masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg'
     LOGIN = 'https://registration.ft.com/registration/barrier/login'
     LOGIN2 = 'http://media.ft.com/h/subs3.html'
-    INDEX = 'http://www.ft.com/uk-edition'
+    INDEX = 'http://www.ft.com/intl/uk-edition'
     PREFIX = 'http://www.ft.com'
 
     conversion_options = {
diff --git a/recipes/financial_times_us.recipe b/recipes/financial_times_us.recipe
index 3821e5ea0e..7d8eed92f9 100644
--- a/recipes/financial_times_us.recipe
+++ b/recipes/financial_times_us.recipe
@@ -1,20 +1,21 @@
 __license__ = 'GPL v3'
-__copyright__ = '2013, Darko Miletic '
+__copyright__ = '2010-2013, Darko Miletic '
 '''
-http://www.ft.com/intl/us-edition
+www.ft.com/intl/international-edition
 '''
 
 import datetime
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
+from collections import OrderedDict
 
 class FinancialTimes(BasicNewsRecipe):
-    title = 'Financial Times (US) printed edition'
+    title = 'Financial Times (International) printed edition'
     __author__ = 'Darko Miletic'
     description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy."
     publisher = 'The Financial Times Ltd.'
-    category = 'news, finances, politics, UK, World'
+    category = 'news, finances, politics, World'
     oldest_article = 2
     language = 'en'
     max_articles_per_feed = 250
@@ -28,7 +29,7 @@ class FinancialTimes(BasicNewsRecipe):
     masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg'
     LOGIN = 'https://registration.ft.com/registration/barrier/login'
     LOGIN2 = 'http://media.ft.com/h/subs3.html'
-    INDEX = 'http://www.ft.com/intl/us-edition'
+    INDEX = 'http://www.ft.com/intl/international-edition'
     PREFIX = 'http://www.ft.com'
 
     conversion_options = {
@@ -93,7 +94,7 @@ class FinancialTimes(BasicNewsRecipe):
             try:
                 urlverified = self.browser.open_novisit(url).geturl() # resolve redirect.
             except:
-                 continue
+                continue
             title = self.tag_to_string(item)
             date = strftime(self.timefmt)
             articles.append({
@@ -105,29 +106,30 @@ class FinancialTimes(BasicNewsRecipe):
         return articles
 
     def parse_index(self):
-        feeds = []
+        feeds = OrderedDict()
         soup = self.index_to_soup(self.INDEX)
-        dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
-        self.timefmt = ' [%s]'%dates
-        wide = soup.find('div',attrs={'class':'wide'})
-        if not wide:
-            return feeds
-        allsections = wide.findAll(attrs={'class':lambda x: x and 'footwell' in x.split()})
-        if not allsections:
-            return feeds
-        count = 0
-        for item in allsections:
-            count = count + 1
-            if self.test and count > 2:
-                return feeds
-            fitem = item.h3
-            if not fitem:
-                fitem = item.h4
-            ftitle = self.tag_to_string(fitem)
-            self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
-            feedarts = self.get_artlinks(item.ul)
-            feeds.append((ftitle,feedarts))
-        return feeds
+        #dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
+        #self.timefmt = ' [%s]'%dates
+        section_title = 'Untitled'
+
+        for column in soup.findAll('div', attrs = {'class':'feedBoxes clearfix'}):
+            for section in column. findAll('div', attrs = {'class':'feedBox'}):
+                sectiontitle=self.tag_to_string(section.find('h4'))
+                if '...' not in sectiontitle: section_title=sectiontitle
+                for article in section.ul.findAll('li'):
+                    articles = []
+                    title=self.tag_to_string(article.a)
+                    url=article.a['href']
+                    articles.append({'title':title, 'url':url, 'description':'', 'date':''})
+
+                    if articles:
+                        if section_title not in feeds:
+                            feeds[section_title] = []
+                        feeds[section_title] += articles
+
+
+        ans = [(key, val) for key, val in feeds.iteritems()]
+        return ans
 
     def preprocess_html(self, soup):
         items = ['promo-box','promo-title',
@@ -174,9 +176,6 @@ class FinancialTimes(BasicNewsRecipe):
                 count += 1
         tfile = PersistentTemporaryFile('_fa.html')
         tfile.write(html)
-         tfile.close()
+        tfile.close()
         self.temp_files.append(tfile)
         return tfile.name
-
-    def cleanup(self):
-        self.browser.open('https://registration.ft.com/registration/login/logout?location=')
\ No newline at end of file
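
Note on the rewritten parse_index above: it groups scraped articles by section title in an OrderedDict and then flattens the mapping into the list of (section, articles) tuples that calibre recipes return from parse_index(). The following is a minimal standalone sketch of that grouping pattern only; the sample data, URLs and the `scraped` name are invented for illustration, and calibre is not needed to run it.

    from collections import OrderedDict

    # Invented sample of (section title, article title, url) triples standing in
    # for what the recipe scrapes from the 'feedBox' divs on the index page.
    scraped = [
        ('World', 'Story one', 'http://www.ft.com/cms/s/0/example-1.html'),
        ('World', 'Story two', 'http://www.ft.com/cms/s/0/example-2.html'),
        ('Companies', 'Story three', 'http://www.ft.com/cms/s/0/example-3.html'),
    ]

    feeds = OrderedDict()  # keeps sections in the order they are first seen
    for section_title, title, url in scraped:
        feeds.setdefault(section_title, []).append(
            {'title': title, 'url': url, 'description': '', 'date': ''})

    # parse_index() is expected to return a list of (section title, article list) tuples.
    ans = [(key, val) for key, val in feeds.items()]
    print(ans)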