Fix #831695 (Updated recipe for Financial times UK edition)

This commit is contained in:
Kovid Goyal 2011-08-22 18:18:46 -06:00
parent 161644a752
commit ac30f8edd4

View File

@ -5,6 +5,7 @@ www.ft.com/uk-edition
'''
import datetime
from calibre.ptempfile import PersistentTemporaryFile
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
@ -22,6 +23,8 @@ class FinancialTimes(BasicNewsRecipe):
needs_subscription = True
encoding = 'utf8'
publication_type = 'newspaper'
articles_are_obfuscated = True
temp_files = []
masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg'
LOGIN = 'https://registration.ft.com/registration/barrier/login'
LOGIN2 = 'http://media.ft.com/h/subs3.html'
@ -47,7 +50,12 @@ class FinancialTimes(BasicNewsRecipe):
br.submit()
return br
keep_only_tags = [dict(name='div', attrs={'class':['fullstory fullstoryHeader','fullstory fullstoryBody','ft-story-header','ft-story-body','index-detail']})]
keep_only_tags = [
dict(name='div', attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']})
,dict(name='div', attrs={'class':'standfirst'})
,dict(name='div', attrs={'id' :'storyContent'})
,dict(name='div', attrs={'class':['ft-story-body','index-detail']})
]
remove_tags = [
dict(name='div', attrs={'id':'floating-con'})
,dict(name=['meta','iframe','base','object','embed','link'])
@ -69,18 +77,23 @@ class FinancialTimes(BasicNewsRecipe):
def get_artlinks(self, elem):
articles = []
count = 0
for item in elem.findAll('a',href=True):
count = count + 1
if self.test and count > 2:
return articles
rawlink = item['href']
if rawlink.startswith('http://'):
url = rawlink
else:
url = self.PREFIX + rawlink
urlverified = self.browser.open_novisit(url).geturl() # resolve redirect.
title = self.tag_to_string(item)
date = strftime(self.timefmt)
articles.append({
'title' :title
,'date' :date
,'url' :url
,'url' :urlverified
,'description':''
})
return articles
@ -97,7 +110,11 @@ class FinancialTimes(BasicNewsRecipe):
st = wide.find('h4',attrs={'class':'section-no-arrow'})
if st:
strest.insert(0,st)
count = 0
for item in strest:
count = count + 1
if self.test and count > 2:
return feeds
ftitle = self.tag_to_string(item)
self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
feedarts = self.get_artlinks(item.parent.ul)
@ -136,4 +153,19 @@ class FinancialTimes(BasicNewsRecipe):
if cdate.isoweekday() == 7:
cdate -= datetime.timedelta(days=1)
return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_LON.pdf')
def get_obfuscated_article(self, url):
count = 0
while (count < 10):
try:
response = self.browser.open(url)
html = response.read()
count = 10
except:
print "Retrying download..."
count += 1
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name