This commit is contained in:
Kovid Goyal 2014-03-07 09:05:58 +05:30
parent 6f30914a52
commit d410d11e55

View File

@ -4,7 +4,6 @@ __copyright__ = '2010-2014, Darko Miletic <darko.miletic at gmail.com>'
www.ft.com/intl/uk-edition www.ft.com/intl/uk-edition
''' '''
import datetime
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -13,7 +12,7 @@ from collections import OrderedDict
class FinancialTimes(BasicNewsRecipe): class FinancialTimes(BasicNewsRecipe):
title = 'Financial Times (UK)' title = 'Financial Times (UK)'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy." description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy." # noqa
publisher = 'The Financial Times Ltd.' publisher = 'The Financial Times Ltd.'
category = 'news, finances, politics, UK, World' category = 'news, finances, politics, UK, World'
oldest_article = 2 oldest_article = 2
@ -58,7 +57,7 @@ class FinancialTimes(BasicNewsRecipe):
dict(name='div', attrs={'id':'floating-con'}) dict(name='div', attrs={'id':'floating-con'})
,dict(name=['meta','iframe','base','object','embed','link']) ,dict(name=['meta','iframe','base','object','embed','link'])
,dict(attrs={'class':['storyTools','story-package','screen-copy','story-package separator','expandable-image','promobox']}) ,dict(attrs={'class':['storyTools','story-package','screen-copy','story-package separator','expandable-image','promobox']})
,dict(name='div', attrs={'class':lambda x: x and 'insideArticleRelatedTopics' in x.split()} ) ,dict(name='div', attrs={'class':lambda x: x and 'insideArticleRelatedTopics' in x.split()})
] ]
remove_attributes = ['width','height','lang'] remove_attributes = ['width','height','lang']
@ -80,13 +79,13 @@ class FinancialTimes(BasicNewsRecipe):
for item in elem.findAll('a',href=True): for item in elem.findAll('a',href=True):
count = count + 1 count = count + 1
if self.test and count > 2: if self.test and count > 2:
return articles return articles
rawlink = item['href'] rawlink = item['href']
url = rawlink url = rawlink
if not rawlink.startswith('http://'): if not rawlink.startswith('http://'):
url = self.PREFIX + rawlink url = self.PREFIX + rawlink
try: try:
urlverified = self.browser.open_novisit(url).geturl() # resolve redirect. urlverified = self.browser.open_novisit(url).geturl() # resolve redirect.
except: except:
continue continue
title = self.tag_to_string(item) title = self.tag_to_string(item)
@ -106,10 +105,11 @@ class FinancialTimes(BasicNewsRecipe):
#self.timefmt = ' [%s]'%dates #self.timefmt = ' [%s]'%dates
section_title = 'Untitled' section_title = 'Untitled'
for column in soup.findAll('div', attrs = {'class':'feedBoxes clearfix'}): for column in soup.findAll('div', attrs={'class':'feedBoxes clearfix'}):
for section in column. findAll('div', attrs = {'class':'feedBox'}): for section in column. findAll('div', attrs={'class':'feedBox'}):
sectiontitle=self.tag_to_string(section.find('h4')) sectiontitle=self.tag_to_string(section.find('h4'))
if '...' not in sectiontitle: section_title=sectiontitle if '...' not in sectiontitle:
section_title=sectiontitle
for article in section.ul.findAll('li'): for article in section.ul.findAll('li'):
articles = [] articles = []
title=self.tag_to_string(article.a) title=self.tag_to_string(article.a)
@ -121,7 +121,6 @@ class FinancialTimes(BasicNewsRecipe):
feeds[section_title] = [] feeds[section_title] = []
feeds[section_title] += articles feeds[section_title] += articles
ans = [(key, val) for key, val in feeds.iteritems()] ans = [(key, val) for key, val in feeds.iteritems()]
return ans return ans
@ -138,18 +137,18 @@ class FinancialTimes(BasicNewsRecipe):
for item in soup.findAll('a'): for item in soup.findAll('a'):
limg = item.find('img') limg = item.find('img')
if item.string is not None: if item.string is not None:
str = item.string str = item.string
item.replaceWith(str) item.replaceWith(str)
else: else:
if limg: if limg:
item.name = 'div' item.name = 'div'
item.attrs = [] item.attrs = []
else: else:
str = self.tag_to_string(item) str = self.tag_to_string(item)
item.replaceWith(str) item.replaceWith(str)
for item in soup.findAll('img'): for item in soup.findAll('img'):
if not item.has_key('alt'): if not item.has_key('alt'):
item['alt'] = 'image' item['alt'] = 'image'
return soup return soup
def get_obfuscated_article(self, url): def get_obfuscated_article(self, url):