From 53b0b57959c1e2680a1a414f0e7a98696d85aed5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 26 Jun 2011 13:31:47 -0600 Subject: [PATCH] Fix #802232 (Updated recipe for Financial times rss) --- recipes/financial_times.recipe | 100 +++++++++++++++++++++--------- recipes/icons/financial_times.png | Bin 0 -> 1470 bytes 2 files changed, 71 insertions(+), 29 deletions(-) create mode 100644 recipes/icons/financial_times.png diff --git a/recipes/financial_times.recipe b/recipes/financial_times.recipe index e750b6f113..91d989a778 100644 --- a/recipes/financial_times.recipe +++ b/recipes/financial_times.recipe @@ -1,32 +1,42 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2010-2011, Darko Miletic ' ''' -ft.com +www.ft.com ''' +import datetime +from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe -class FinancialTimes(BasicNewsRecipe): - title = u'Financial Times' - __author__ = 'Darko Miletic and Sujata Raman' - description = ('Financial world news. Available after 5AM ' - 'GMT, daily.') +class FinancialTimes_rss(BasicNewsRecipe): + title = 'Financial Times' + __author__ = 'Darko Miletic' + description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy." + publisher = 'The Financial Times Ltd.' + category = 'news, finances, politics, World' oldest_article = 2 - language = 'en' - - max_articles_per_feed = 100 + language = 'en' + max_articles_per_feed = 250 no_stylesheets = True use_embedded_content = False needs_subscription = True - simultaneous_downloads= 1 - delay = 1 + encoding = 'utf8' + publication_type = 'newspaper' + masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg' + LOGIN = 'https://registration.ft.com/registration/barrier/login' + INDEX = 'http://www.ft.com' - LOGIN = 'https://registration.ft.com/registration/barrier/login' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'linearize_tables' : True + } def get_browser(self): br = BasicNewsRecipe.get_browser() + br.open(self.INDEX) if self.username is not None and self.password is not None: br.open(self.LOGIN) br.select_form(name='loginForm') @@ -35,31 +45,63 @@ class FinancialTimes(BasicNewsRecipe): br.submit() return br - keep_only_tags = [ dict(name='div', attrs={'id':'cont'}) ] - remove_tags_after = dict(name='p', attrs={'class':'copyright'}) + keep_only_tags = [dict(name='div', attrs={'class':['fullstory fullstoryHeader','fullstory fullstoryBody','ft-story-header','ft-story-body','index-detail']})] remove_tags = [ - dict(name='div', attrs={'id':'floating-con'}) + dict(name='div', attrs={'id':'floating-con'}) + ,dict(name=['meta','iframe','base','object','embed','link']) + ,dict(attrs={'class':['storyTools','story-package','screen-copy','story-package separator','expandable-image']}) ] + remove_attributes = ['width','height','lang'] - extra_css = ''' - body{font-family:Arial,Helvetica,sans-serif;} - h2(font-size:large;} - .ft-story-header(font-size:xx-small;} - .ft-story-body(font-size:small;} - a{color:#003399;} + extra_css = """ + body{font-family: Georgia,Times,"Times New Roman",serif} + h2{font-size:large} + .ft-story-header{font-size: x-small} .container{font-size:x-small;} h3{font-size:x-small;color:#003399;} - ''' + .copyright{font-size: x-small} + img{margin-top: 0.8em; display: block} + .lastUpdated{font-family: Arial,Helvetica,sans-serif; font-size: x-small} + .byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif} + """ + feeds = [ (u'UK' , u'http://www.ft.com/rss/home/uk' ) ,(u'US' , u'http://www.ft.com/rss/home/us' ) - ,(u'Europe' , u'http://www.ft.com/rss/home/europe' ) ,(u'Asia' , u'http://www.ft.com/rss/home/asia' ) ,(u'Middle East', u'http://www.ft.com/rss/home/middleeast') ] def preprocess_html(self, soup): - content_type = soup.find('meta', {'http-equiv':'Content-Type'}) - if content_type: - content_type['content'] = 'text/html; charset=utf-8' + items = ['promo-box','promo-title', + 'promo-headline','promo-image', + 'promo-intro','promo-link','subhead'] + for item in items: + for it in soup.findAll(item): + it.name = 'div' + it.attrs = [] + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' return soup + + def get_cover_url(self): + cdate = datetime.date.today() + if cdate.isoweekday() == 7: + cdate -= datetime.timedelta(days=1) + return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_USA.pdf') + \ No newline at end of file diff --git a/recipes/icons/financial_times.png b/recipes/icons/financial_times.png new file mode 100644 index 0000000000000000000000000000000000000000..2a769d9dbb359c6713d0e27e4660c1d06ae22c54 GIT binary patch literal 1470 zcmV;v1ws0WP)$c3DqG>cElz=c@nAnccUFgmPg2^R@rfp60lW)vScNoq3#uY5B{U zx|w*cmv$W^C^9lKK07%7->6ALKK$CGPDerc$Ch+sQ_7@%|Lw5<+M>~@fuxIVLq9us zXIA{uol;Cg;IxUaka1m8NcPB<(6x%Oly#|(aaK=8TUAQ(#FXv7kY-*`=emyIxr~Bt zT8DC7mwsikl5(ewZ}GyCzo>z=m30ytA}K2_EiW)QH#Yyyn*Ga|_{x{}#+B>6kacBL zd}>(Gvxv^7fXt_dt<#+ajs{iAv|Jb7X(VY6pm+!xj&!~YRCMq#8GBYzXN<=WJk?+Bg?!b}kzme>|km|jU z>b#C|Vp4WwRCi@mcV<|h_K9|I6PXT7>n1S_~67;6etpepYp`a!!)v;qh|mwh@~akLAuNK;p6}n1szS@Ah%+!8c00Ix7tggl+1bSh??zuq z&=a4%o2NDI*^+BK*4$1&1s(irUqng8R28{LHga5!W8zX?u~SNH1nv)vz7^cAQmCh@ef|hmSYYTcM z_I%~uf?a`&^NJmtZkW_~91U8GsbFKhnTvB?|APl_zjVEQC6xM30;ph85M05&S?v-o z=T@Z6-y*bdVcT+{d8(&*n!J*IAey@VIt^Un&p{d0(y>jtlh=Rk@y=TtOh?(7UeBolhmpYvS z2gGh>#AyCF5flVb&}kEP+C$*X#N1$z35(w*Px}Yr+8qCms?)00hzFIz7BS<#e7SI77->e$QF97QNjia)e8gWg0inxg4Q0aKue3ofPo+Y Y06ekPz%t>XKmY&$07*qoM6N<$f*Td~BLDyZ literal 0 HcmV?d00001