diff --git a/recipes/financial_times.recipe b/recipes/financial_times.recipe
index e750b6f113..91d989a778 100644
--- a/recipes/financial_times.recipe
+++ b/recipes/financial_times.recipe
@@ -1,32 +1,42 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2008, Darko Miletic '
+__copyright__ = '2010-2011, Darko Miletic '
 '''
-ft.com
+www.ft.com
 '''
 
+import datetime
+from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 
-class FinancialTimes(BasicNewsRecipe):
-    title = u'Financial Times'
-    __author__ = 'Darko Miletic and Sujata Raman'
-    description = ('Financial world news. Available after 5AM '
-                   'GMT, daily.')
+class FinancialTimes_rss(BasicNewsRecipe):
+    title = 'Financial Times'
+    __author__ = 'Darko Miletic'
+    description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy."
+    publisher = 'The Financial Times Ltd.'
+    category = 'news, finances, politics, World'
     oldest_article = 2
-    language = 'en'
-
-    max_articles_per_feed = 100
+    language = 'en'
+    max_articles_per_feed = 250
     no_stylesheets = True
     use_embedded_content = False
     needs_subscription = True
-    simultaneous_downloads= 1
-    delay = 1
+    encoding = 'utf8'
+    publication_type = 'newspaper'
+    masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg'
+    LOGIN = 'https://registration.ft.com/registration/barrier/login'
+    INDEX = 'http://www.ft.com'
 
-    LOGIN = 'https://registration.ft.com/registration/barrier/login'
+    conversion_options = {
+        'comment' : description
+        , 'tags' : category
+        , 'publisher' : publisher
+        , 'language' : language
+        , 'linearize_tables' : True
+    }
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
+        br.open(self.INDEX)
         if self.username is not None and self.password is not None:
             br.open(self.LOGIN)
             br.select_form(name='loginForm')
@@ -35,31 +45,73 @@ class FinancialTimes(BasicNewsRecipe):
             br.submit()
         return br
 
-    keep_only_tags = [ dict(name='div', attrs={'id':'cont'}) ]
-    remove_tags_after = dict(name='p', attrs={'class':'copyright'})
+    keep_only_tags = [dict(name='div', attrs={'class':['fullstory fullstoryHeader','fullstory fullstoryBody','ft-story-header','ft-story-body','index-detail']})]
     remove_tags = [
-        dict(name='div', attrs={'id':'floating-con'})
+        dict(name='div', attrs={'id':'floating-con'})
+        ,dict(name=['meta','iframe','base','object','embed','link'])
+        ,dict(attrs={'class':['storyTools','story-package','screen-copy','story-package separator','expandable-image']})
     ]
+    remove_attributes = ['width','height','lang']
 
-    extra_css = '''
-        body{font-family:Arial,Helvetica,sans-serif;}
-        h2(font-size:large;}
-        .ft-story-header(font-size:xx-small;}
-        .ft-story-body(font-size:small;}
-        a{color:#003399;}
+    extra_css = """
+        body{font-family: Georgia,Times,"Times New Roman",serif}
+        h2{font-size:large}
+        .ft-story-header{font-size: x-small}
         .container{font-size:x-small;}
         h3{font-size:x-small;color:#003399;}
-        '''
+        .copyright{font-size: x-small}
+        img{margin-top: 0.8em; display: block}
+        .lastUpdated{font-family: Arial,Helvetica,sans-serif; font-size: x-small}
+        .byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif}
+        """
+
     feeds = [
         (u'UK' , u'http://www.ft.com/rss/home/uk' )
         ,(u'US' , u'http://www.ft.com/rss/home/us' )
-        ,(u'Europe' , u'http://www.ft.com/rss/home/europe' )
         ,(u'Asia' , u'http://www.ft.com/rss/home/asia' )
         ,(u'Middle East', u'http://www.ft.com/rss/home/middleeast')
     ]
 
     def preprocess_html(self, soup):
-        content_type = soup.find('meta', {'http-equiv':'Content-Type'})
-        if content_type:
-            content_type['content'] = 'text/html; charset=utf-8'
+        """Simplify the article markup in ``soup`` and return it.
+
+        Elements named promo-box, promo-title, promo-headline, promo-image,
+        promo-intro, promo-link and subhead become attribute-less divs;
+        inline ``style`` attributes are dropped; links are replaced by their
+        text (links wrapping an image become attribute-less divs instead);
+        every ``img`` without an ``alt`` attribute gets ``alt='image'``.
+        """
+        items = ['promo-box','promo-title',
+                 'promo-headline','promo-image',
+                 'promo-intro','promo-link','subhead']
+        for item in items:
+            for it in soup.findAll(item):
+                it.name = 'div'
+                it.attrs = []
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll('a'):
+            limg = item.find('img')
+            if item.string is not None:
+                text = item.string
+                item.replaceWith(text)
+            elif limg:
+                item.name = 'div'
+                item.attrs = []
+            else:
+                text = self.tag_to_string(item)
+                item.replaceWith(text)
+        for item in soup.findAll('img'):
+            if not item.has_key('alt'):
+                item['alt'] = 'image'
         return soup
+
+    def get_cover_url(self):
+        """Return the URL of the front-page PDF used as the cover.
+
+        The date in the URL is today's date, except on Sundays
+        (ISO weekday 7), when Saturday's date is used instead.
+        """
+        cdate = datetime.date.today()
+        if cdate.isoweekday() == 7:
+            cdate -= datetime.timedelta(days=1)
+        return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_USA.pdf')
diff --git a/recipes/icons/financial_times.png b/recipes/icons/financial_times.png
new file mode 100644
index 0000000000..2a769d9dbb
Binary files /dev/null and b/recipes/icons/financial_times.png differ