Fix #1072822 (Updated recipe for Financial Times UK edition)

This commit is contained in:
Kovid Goyal 2012-10-29 23:25:34 +05:30
parent 9c9f0e350c
commit b859653724

View File

@@ -1,5 +1,5 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2010-2012, Darko Miletic <darko.miletic at gmail.com>'
''' '''
www.ft.com/uk-edition www.ft.com/uk-edition
''' '''
@@ -42,18 +42,23 @@ class FinancialTimes(BasicNewsRecipe):
def get_browser(self):
    """Return a browser for fetching FT pages, logged in when credentials exist.

    Always visits the index page first (sets cookies); the login form is
    only submitted when both username and password were supplied.
    """
    browser = BasicNewsRecipe.get_browser()
    browser.open(self.INDEX)
    # Skip the login dance entirely for anonymous downloads.
    if not (self.username is None or self.password is None):
        browser.open(self.LOGIN2)
        browser.select_form(name='loginForm')
        browser['username'] = self.username
        browser['password'] = self.password
        browser.submit()
    return browser
# Containers holding the actual article content; everything outside these
# tags is discarded by the recipe framework.
keep_only_tags = [
    {'name': 'div',  'attrs': {'class': ['fullstory fullstoryHeader', 'ft-story-header']}},
    {'name': 'div',  'attrs': {'class': 'standfirst'}},
    {'name': 'div',  'attrs': {'id': 'storyContent'}},
    {'name': 'div',  'attrs': {'class': ['ft-story-body', 'index-detail']}},
    {'name': 'h2',   'attrs': {'class': 'entry-title'}},
    # Class attribute may hold several tokens; match 'posted-on' among them.
    {'name': 'span', 'attrs': {'class': lambda x: x and 'posted-on' in x.split()}},
    {'name': 'span', 'attrs': {'class': 'author_byline'}},
    {'name': 'div',  'attrs': {'class': 'entry-content'}},
]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'id':'floating-con'}) dict(name='div', attrs={'id':'floating-con'})
@@ -82,21 +87,20 @@ class FinancialTimes(BasicNewsRecipe):
if self.test and count > 2: if self.test and count > 2:
return articles return articles
rawlink = item['href'] rawlink = item['href']
if rawlink.startswith('http://'): url = rawlink
url = rawlink if not rawlink.startswith('http://'):
else: url = self.PREFIX + rawlink
url = self.PREFIX + rawlink
try: try:
urlverified = self.browser.open_novisit(url).geturl() # resolve redirect. urlverified = self.browser.open_novisit(url).geturl() # resolve redirect.
except: except:
continue continue
title = self.tag_to_string(item) title = self.tag_to_string(item)
date = strftime(self.timefmt) date = strftime(self.timefmt)
articles.append({ articles.append({
'title' :title 'title' :title
,'date' :date ,'date' :date
,'url' :urlverified ,'url' :urlverified
,'description':'' ,'description':''
}) })
return articles return articles
@@ -108,21 +112,20 @@ class FinancialTimes(BasicNewsRecipe):
wide = soup.find('div',attrs={'class':'wide'}) wide = soup.find('div',attrs={'class':'wide'})
if not wide: if not wide:
return feeds return feeds
strest = wide.findAll('h3', attrs={'class':'section'}) allsections = wide.findAll(attrs={'class':lambda x: x and 'footwell' in x.split()})
if not strest: if not allsections:
return feeds return feeds
st = wide.findAll('h4',attrs={'class':'section-no-arrow'})
if st:
st.extend(strest)
count = 0 count = 0
for item in st: for item in allsections:
count = count + 1 count = count + 1
if self.test and count > 2: if self.test and count > 2:
return feeds return feeds
ftitle = self.tag_to_string(item) fitem = item.h3
if not fitem:
fitem = item.h4
ftitle = self.tag_to_string(fitem)
self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle)) self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
if item.parent.ul is not None: feedarts = self.get_artlinks(item.ul)
feedarts = self.get_artlinks(item.parent.ul)
feeds.append((ftitle,feedarts)) feeds.append((ftitle,feedarts))
return feeds return feeds
@@ -156,7 +159,7 @@ class FinancialTimes(BasicNewsRecipe):
def get_cover_url(self):
    """Return the URL of today's print front-page PDF.

    There is no Sunday edition, so on Sundays (ISO weekday 7) the
    previous day's (Saturday's) cover is used instead.
    """
    day = datetime.date.today()
    if day.isoweekday() == 7:
        day = day - datetime.timedelta(days=1)
    return day.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_LON.pdf')
def get_obfuscated_article(self, url): def get_obfuscated_article(self, url):
@@ -169,10 +172,11 @@ class FinancialTimes(BasicNewsRecipe):
except: except:
print "Retrying download..." print "Retrying download..."
count += 1 count += 1
self.temp_files.append(PersistentTemporaryFile('_fa.html')) tfile = PersistentTemporaryFile('_fa.html')
self.temp_files[-1].write(html) tfile.write(html)
self.temp_files[-1].close() tfile.close()
return self.temp_files[-1].name self.temp_files.append(tfile)
return tfile.name
def cleanup(self):
    """Log out of the FT account once downloading is finished, so the
    authenticated session is not left open on the server."""
    logout_url = 'https://registration.ft.com/registration/login/logout?location='
    self.browser.open(logout_url)