Fix #831695 (Updated recipe for Financial Times UK edition)

2025-07-09 03:04:10 -04:00 · 2011-08-22 18:18:46 -06:00 · 2011-08-22 18:18:46 -06:00 · ac30f8edd4
commit ac30f8edd4
parent 161644a752
1 changed files with 34 additions and 2 deletions
--- a/recipes/financial_times_uk.recipe
+++ b/recipes/financial_times_uk.recipe
@ -5,6 +5,7 @@ www.ft.com/uk-edition
 '''

 import datetime
+from calibre.ptempfile import PersistentTemporaryFile
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe

@ -22,6 +23,8 @@ class FinancialTimes(BasicNewsRecipe):
    needs_subscription    = True
    encoding              = 'utf8'
    publication_type      = 'newspaper'
+    articles_are_obfuscated = True
+    temp_files              = []
    masthead_url          = 'http://im.media.ft.com/m/img/masthead_main.jpg'
    LOGIN                 = 'https://registration.ft.com/registration/barrier/login'
    LOGIN2                = 'http://media.ft.com/h/subs3.html'
@ -47,7 +50,12 @@ class FinancialTimes(BasicNewsRecipe):
            br.submit()
        return br

-    keep_only_tags = [dict(name='div', attrs={'class':['fullstory fullstoryHeader','fullstory fullstoryBody','ft-story-header','ft-story-body','index-detail']})]
+    keep_only_tags = [
+                        dict(name='div', attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']})
+                       ,dict(name='div', attrs={'class':'standfirst'})
+                       ,dict(name='div', attrs={'id'   :'storyContent'})
+                       ,dict(name='div', attrs={'class':['ft-story-body','index-detail']})
+                     ]
    remove_tags = [
                      dict(name='div', attrs={'id':'floating-con'})
                     ,dict(name=['meta','iframe','base','object','embed','link'])
@ -69,18 +77,23 @@ class FinancialTimes(BasicNewsRecipe):

    def get_artlinks(self, elem):  # builds one article dict per <a href> found under elem
        articles = []
+        count = 0  # links visited so far
        for item in elem.findAll('a',href=True):
+            count = count + 1
+            if self.test and count > 2:  # test mode: stop once two links have been handled
+               return articles
            rawlink = item['href']
            if rawlink.startswith('http://'):
               url = rawlink
            else:
               url   = self.PREFIX + rawlink  # anything not starting with http:// is joined onto PREFIX
+            urlverified = self.browser.open_novisit(url).geturl() # resolve redirect: one request per link; the resulting URL is what gets stored below
            title = self.tag_to_string(item)
            date = strftime(self.timefmt)
            articles.append({
                              'title'      :title
                             ,'date'       :date
-                             ,'url'        :url
+                             ,'url'        :urlverified
                             ,'description':''
                            })
        return articles
@ -97,7 +110,11 @@ class FinancialTimes(BasicNewsRecipe):
        st = wide.find('h4',attrs={'class':'section-no-arrow'})
        if st:
           strest.insert(0,st)
+        count = 0
        for item in strest:
+            count = count + 1
+            if self.test and count > 2:
+               return feeds
            ftitle   = self.tag_to_string(item)
            self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
            feedarts = self.get_artlinks(item.parent.ul)
@ -136,4 +153,19 @@ class FinancialTimes(BasicNewsRecipe):
        if cdate.isoweekday() == 7:
           cdate -= datetime.timedelta(days=1)
        return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_LON.pdf')
+
+    def get_obfuscated_article(self, url):
+        '''Save ``url`` to a persistent temp file and return its path; tries 10 times, then re-raises the last error.'''
+        for attempt in range(10):
+            try:
+                html = self.browser.open(url).read()
+                break
+            except Exception:  # narrower than a bare except, so Ctrl-C still interrupts
+                if attempt == 9:
+                    raise  # out of attempts: surface the real failure, not a NameError on ``html``
+                print('Retrying download...')
+        self.temp_files.append(PersistentTemporaryFile('_fa.html'))  # stays referenced in temp_files
+        self.temp_files[-1].write(html)
+        self.temp_files[-1].close()
+        return self.temp_files[-1].name