From b859653724be604219b9904ea72e2725ee25573f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 29 Oct 2012 23:25:34 +0530
Subject: [PATCH] Fix #1072822 (Updated recipe for Financial Times UK edition)

---
 recipes/financial_times_uk.recipe | 72 ++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 34 deletions(-)

diff --git a/recipes/financial_times_uk.recipe b/recipes/financial_times_uk.recipe
index e2b69f4987..6af000d990 100644
--- a/recipes/financial_times_uk.recipe
+++ b/recipes/financial_times_uk.recipe
@@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010-2011, Darko Miletic '
+__copyright__ = '2010-2012, Darko Miletic '
 '''
 www.ft.com/uk-edition
 '''
@@ -42,18 +42,23 @@ class FinancialTimes(BasicNewsRecipe):
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         br.open(self.INDEX)
-        br.open(self.LOGIN)
-        br.select_form(name='loginForm')
-        br['username'] = self.username
-        br['password'] = self.password
-        br.submit()
+        if self.username is not None and self.password is not None:
+            br.open(self.LOGIN2)
+            br.select_form(name='loginForm')
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
         return br
 
     keep_only_tags = [
-                        dict(name='div', attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']})
-                        ,dict(name='div', attrs={'class':'standfirst'})
-                        ,dict(name='div', attrs={'id' :'storyContent'})
-                        ,dict(name='div', attrs={'class':['ft-story-body','index-detail']})
+                        dict(name='div' , attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']})
+                        ,dict(name='div' , attrs={'class':'standfirst'})
+                        ,dict(name='div' , attrs={'id'   :'storyContent'})
+                        ,dict(name='div' , attrs={'class':['ft-story-body','index-detail']})
+                        ,dict(name='h2'  , attrs={'class':'entry-title'}                            )
+                        ,dict(name='span', attrs={'class':lambda x: x and 'posted-on' in x.split()} )
+                        ,dict(name='span', attrs={'class':'author_byline'}                          )
+                        ,dict(name='div' , attrs={'class':'entry-content'}                          )
                      ]
     remove_tags = [
                       dict(name='div', attrs={'id':'floating-con'})
@@ -82,21 +87,20 @@ class FinancialTimes(BasicNewsRecipe):
             if self.test and count > 2:
                 return articles
             rawlink = item['href']
-            if rawlink.startswith('http://'):
-                url = rawlink
-            else:
-                url = self.PREFIX + rawlink
+            url = rawlink
+            if not rawlink.startswith('http://'):
+                url = self.PREFIX + rawlink
             try:
                 urlverified = self.browser.open_novisit(url).geturl() # resolve redirect.
             except:
-               continue
+                continue
             title = self.tag_to_string(item)
             date = strftime(self.timefmt)
             articles.append({
-                              'title'      :title
-                             ,'date'       :date
-                             ,'url'        :urlverified
-                             ,'description':''
+                      'title'      :title
+                     ,'date'       :date
+                     ,'url'        :urlverified
+                     ,'description':''
                             })
         return articles
 
@@ -108,21 +112,20 @@ class FinancialTimes(BasicNewsRecipe):
         wide = soup.find('div',attrs={'class':'wide'})
         if not wide:
             return feeds
-        strest = wide.findAll('h3', attrs={'class':'section'})
-        if not strest:
+        allsections = wide.findAll(attrs={'class':lambda x: x and 'footwell' in x.split()})
+        if not allsections:
             return feeds
-        st = wide.findAll('h4',attrs={'class':'section-no-arrow'})
-        if st:
-            st.extend(strest)
         count = 0
-        for item in st:
+        for item in allsections:
             count = count + 1
             if self.test and count > 2:
                 return feeds
-            ftitle = self.tag_to_string(item)
+            fitem = item.h3
+            if not fitem:
+                fitem = item.h4
+            ftitle = self.tag_to_string(fitem)
             self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
-            if item.parent.ul is not None:
-                feedarts = self.get_artlinks(item.parent.ul)
+            feedarts = self.get_artlinks(item.ul)
             feeds.append((ftitle,feedarts))
         return feeds
 
@@ -156,7 +159,7 @@ class FinancialTimes(BasicNewsRecipe):
     def get_cover_url(self):
         cdate = datetime.date.today()
         if cdate.isoweekday() == 7:
-           cdate -= datetime.timedelta(days=1)
+            cdate -= datetime.timedelta(days=1)
         return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_LON.pdf')
 
     def get_obfuscated_article(self, url):
@@ -169,10 +172,11 @@ class FinancialTimes(BasicNewsRecipe):
             except:
                 print "Retrying download..."
             count += 1
-        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
-        self.temp_files[-1].write(html)
-        self.temp_files[-1].close()
-        return self.temp_files[-1].name
+        tfile = PersistentTemporaryFile('_fa.html')
+        tfile.write(html)
+        tfile.close()
+        self.temp_files.append(tfile)
+        return tfile.name
 
     def cleanup(self):
-        self.browser.open('https://registration.ft.com/registration/login/logout?location=')
+        self.browser.open('https://registration.ft.com/registration/login/logout?location=')
\ No newline at end of file
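
Note on the get_cover_url hunk: the recipe builds the front-page PDF URL from today's date and, when today is a Sunday (isoweekday() == 7), steps back one day to fetch Saturday's cover instead. A minimal standalone sketch of that logic follows; ft_cover_url is a hypothetical helper name and the sample date is arbitrary, neither is part of the recipe.

    import datetime

    def ft_cover_url(cdate):
        # Hypothetical helper mirroring the recipe's logic: a Sunday date
        # falls back to Saturday's front-page PDF.
        if cdate.isoweekday() == 7:
            cdate -= datetime.timedelta(days=1)
        return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_LON.pdf')

    # Sunday 28 Oct 2012 resolves to the Saturday 27 Oct cover:
    print(ft_cover_url(datetime.date(2012, 10, 28)))
    # http://specials.ft.com/vtf_pdf/271012_FRONT1_LON.pdf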