Fix #831695 (Updated recipe for Financial Times UK edition)

This commit is contained in:
Kovid Goyal 2011-08-22 18:18:46 -06:00
parent 161644a752
commit ac30f8edd4

View File

@ -5,6 +5,7 @@ www.ft.com/uk-edition
''' '''
import datetime import datetime
from calibre.ptempfile import PersistentTemporaryFile
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -22,6 +23,8 @@ class FinancialTimes(BasicNewsRecipe):
needs_subscription = True needs_subscription = True
encoding = 'utf8' encoding = 'utf8'
publication_type = 'newspaper' publication_type = 'newspaper'
articles_are_obfuscated = True
temp_files = []
masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg' masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg'
LOGIN = 'https://registration.ft.com/registration/barrier/login' LOGIN = 'https://registration.ft.com/registration/barrier/login'
LOGIN2 = 'http://media.ft.com/h/subs3.html' LOGIN2 = 'http://media.ft.com/h/subs3.html'
@ -47,7 +50,12 @@ class FinancialTimes(BasicNewsRecipe):
br.submit() br.submit()
return br return br
keep_only_tags = [dict(name='div', attrs={'class':['fullstory fullstoryHeader','fullstory fullstoryBody','ft-story-header','ft-story-body','index-detail']})] keep_only_tags = [
dict(name='div', attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']})
,dict(name='div', attrs={'class':'standfirst'})
,dict(name='div', attrs={'id' :'storyContent'})
,dict(name='div', attrs={'class':['ft-story-body','index-detail']})
]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'id':'floating-con'}) dict(name='div', attrs={'id':'floating-con'})
,dict(name=['meta','iframe','base','object','embed','link']) ,dict(name=['meta','iframe','base','object','embed','link'])
@ -69,18 +77,23 @@ class FinancialTimes(BasicNewsRecipe):
def get_artlinks(self, elem): def get_artlinks(self, elem):
articles = [] articles = []
count = 0
for item in elem.findAll('a',href=True): for item in elem.findAll('a',href=True):
count = count + 1
if self.test and count > 2:
return articles
rawlink = item['href'] rawlink = item['href']
if rawlink.startswith('http://'): if rawlink.startswith('http://'):
url = rawlink url = rawlink
else: else:
url = self.PREFIX + rawlink url = self.PREFIX + rawlink
urlverified = self.browser.open_novisit(url).geturl() # resolve redirect.
title = self.tag_to_string(item) title = self.tag_to_string(item)
date = strftime(self.timefmt) date = strftime(self.timefmt)
articles.append({ articles.append({
'title' :title 'title' :title
,'date' :date ,'date' :date
,'url' :url ,'url' :urlverified
,'description':'' ,'description':''
}) })
return articles return articles
@ -97,7 +110,11 @@ class FinancialTimes(BasicNewsRecipe):
st = wide.find('h4',attrs={'class':'section-no-arrow'}) st = wide.find('h4',attrs={'class':'section-no-arrow'})
if st: if st:
strest.insert(0,st) strest.insert(0,st)
count = 0
for item in strest: for item in strest:
count = count + 1
if self.test and count > 2:
return feeds
ftitle = self.tag_to_string(item) ftitle = self.tag_to_string(item)
self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle)) self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
feedarts = self.get_artlinks(item.parent.ul) feedarts = self.get_artlinks(item.parent.ul)
@ -137,3 +154,18 @@ class FinancialTimes(BasicNewsRecipe):
cdate -= datetime.timedelta(days=1) cdate -= datetime.timedelta(days=1)
return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_LON.pdf') return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_LON.pdf')
def get_obfuscated_article(self, url):
    """Download ``url`` and store the raw response in a persistent temp file.

    The download is attempted up to 10 times using ``self.browser``.  The
    fetched content is written to a new ``PersistentTemporaryFile`` (suffix
    ``_fa.html``) which is also appended to ``self.temp_files``.

    :param url: address of the article to download.
    :return: the ``name`` attribute of the temporary file holding the
             downloaded content.
    :raises Exception: the error raised by the final download attempt is
             propagated when every attempt fails.  No temporary file is
             created in that case.
    """
    max_attempts = 10
    html = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = self.browser.open(url)
            html = response.read()
            break
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit can still abort the download.
            if attempt == max_attempts:
                # Out of retries: surface the real download error instead
                # of failing later on an unbound ``html``.
                raise
            # Parenthesised single-argument form prints identically on
            # Python 2 and is also valid syntax on Python 3.
            print("Retrying download...")

    tmp = PersistentTemporaryFile('_fa.html')
    # Register the file before writing so it stays tracked in
    # ``self.temp_files`` even if the write below fails.
    self.temp_files.append(tmp)
    try:
        tmp.write(html)
    finally:
        tmp.close()
    return tmp.name