Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Update Financial Times
Fixes #1711934 [Financial times download uk/us is broken](https://bugs.launchpad.net/calibre/+bug/1711934)
commit e25426ad5a
parent 331190c369
The UK-edition recipe (index moved from www.ft.com/uk-edition to www.ft.com/todaysnewspaper/uk):

@@ -5,11 +5,10 @@
 __license__ = 'GPL v3'
 __copyright__ = '2010-2017, Darko Miletic <darko.miletic at gmail.com>'
 '''
-www.ft.com/uk-edition
+www.ft.com/todaysnewspaper/uk
 '''

 from calibre.web.feeds.news import BasicNewsRecipe
-from collections import OrderedDict
 from urllib import unquote

@@ -34,22 +33,26 @@ class FinancialTimes(BasicNewsRecipe):
     encoding = 'utf8'
     publication_type = 'newspaper'
     handle_gzip = True
+    compress_news_images = True
+    scale_news_images_to_device = True
+    ignore_duplicate_articles = {'url'}
     LOGIN = 'https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com%2F'
     LOGOUT = 'https://myaccount.ft.com/logout'
-    INDEX = 'http://www.ft.com/uk-edition'
-    PREFIX = 'http://www.ft.com'
+    INDEX = 'https://www.ft.com/todaysnewspaper/uk'
+    PREFIX = 'https://www.ft.com'

     keep_only_tags = [
-        classes(
-            'article__header--wrapper article__time-byline article__body'
-            'n-content-image barrier-grid__heading article__time-byline topper__headline topper__standfirst')
+        classes('topper__headline topper__standfirst n-content-image--full article__time-byline article__body')
     ]

     remove_tags = [
-        classes('n-content-related-box tour-tip')
+        classes('n-content-related-box tour-tip n-content-recommended n-content-video')
     ]

-    remove_attributes = ['width', 'height', 'lang', 'style']
+    extra_css = '''
+        body {font-family: Georgia,serif;}
+        img {display:block;}
+    '''

     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
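Note: classes() here is the helper exported by calibre.web.feeds.news alongside BasicNewsRecipe. It turns a space-separated list of CSS class names into a BeautifulSoup attrs query that matches any tag carrying at least one of those classes. A minimal sketch of that behavior (an approximation, not calibre's exact source):

    def classes(cls):
        # Build an attrs query: match tags whose class list intersects
        # the given space-separated names.
        q = frozenset(cls.split(' '))
        return dict(attrs={
            'class': lambda x: x and frozenset(x.split()).intersection(q)})

With this, the keep_only_tags entry above selects the headline, standfirst, full-width image, byline and body blocks of the new article markup and discards everything else.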
@@ -81,33 +84,14 @@ class FinancialTimes(BasicNewsRecipe):
         return cover

     def parse_index(self):
-        feeds = OrderedDict()
+        articles = []
         soup = self.index_to_soup(self.INDEX)
-        # dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
-        # self.timefmt = ' [%s]'%dates
-        section_title = 'Untitled'
-
-        for column in soup.findAll('div', attrs={'class': 'feedBoxes clearfix'}):
-            for section in column.findAll('div', attrs={'class': 'feedBox'}):
-                sectiontitle = self.tag_to_string(section.find('h4'))
-                if '...' not in sectiontitle:
-                    section_title = sectiontitle
-                self.log('Found section:', sectiontitle)
-                for article in section.ul.findAll('li'):
-                    articles = []
-                    title = self.tag_to_string(article.a)
-                    url = article.a['href']
-                    articles.append(
-                        {'title': title, 'url': url, 'description': '', 'date': ''})
-                    self.log('\tFound article:', title)
-
-                if articles:
-                    if section_title not in feeds:
-                        feeds[section_title] = []
-                    feeds[section_title] += articles
-
-        ans = [(key, val) for key, val in feeds.iteritems()]
-        return ans
+        for article in soup.findAll('a', href=True, attrs={'data-trackable':'main-link'}):
+            url = self.PREFIX + article['href']
+            title = self.tag_to_string(article)
+            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
+        return [("Articles", articles)]

     def preprocess_html(self, soup):
         for img in soup.findAll('img', srcset=True):
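The rewritten parse_index() is the heart of the fix: the old feedBox/feedBoxes markup presumably no longer appears on ft.com, so the recipe now collects every anchor the today's-paper page marks with data-trackable="main-link" into one flat "Articles" feed (which is also why the OrderedDict import was dropped). A standalone sketch of the same extraction, assuming BeautifulSoup and a saved copy of the index page; extract_articles is an illustrative name, not part of the recipe:

    from bs4 import BeautifulSoup

    PREFIX = 'https://www.ft.com'  # mirrors the recipe's PREFIX

    def extract_articles(page_html):
        soup = BeautifulSoup(page_html, 'html.parser')
        articles = []
        # Headline links on /todaysnewspaper/uk carry
        # data-trackable="main-link"; their hrefs are site-relative.
        for a in soup.find_all('a', href=True,
                               attrs={'data-trackable': 'main-link'}):
            articles.append({'title': a.get_text(strip=True),
                             'url': PREFIX + a['href'],
                             'description': '', 'date': ''})
        # One flat feed replaces the old per-section grouping.
        return [('Articles', articles)]

The international-edition recipe below receives the same rewrite.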
The international-edition recipe (index moved from www.ft.com/international-edition to www.ft.com/todaysnewspaper/international):

@@ -5,11 +5,10 @@
 __license__ = 'GPL v3'
 __copyright__ = '2010-2017, Darko Miletic <darko.miletic at gmail.com>'
 '''
-www.ft.com/international-edition
+www.ft.com/todaysnewspaper/international
 '''

 from calibre.web.feeds.news import BasicNewsRecipe
-from collections import OrderedDict
 from urllib import unquote


@@ -34,20 +33,26 @@ class FinancialTimes(BasicNewsRecipe):
     encoding = 'utf8'
     publication_type = 'newspaper'
     handle_gzip = True
+    compress_news_images = True
+    scale_news_images_to_device = True
+    ignore_duplicate_articles = {'url'}
     LOGIN = 'https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com%2F'
     LOGOUT = 'https://myaccount.ft.com/logout'
-    INDEX = 'http://www.ft.com/international-edition'
-    PREFIX = 'http://www.ft.com'
+    INDEX = 'https://www.ft.com/todaysnewspaper/international'
+    PREFIX = 'https://www.ft.com'

     keep_only_tags = [
-        classes('article__header--wrapper article__time-byline article__body n-content-image barrier-grid__heading')
+        classes('topper__headline topper__standfirst n-content-image--full article__time-byline article__body')
     ]

     remove_tags = [
-        classes('n-content-related-box tour-tip')
+        classes('n-content-related-box tour-tip n-content-recommended n-content-video')
     ]

-    remove_attributes = ['width', 'height', 'lang', 'style']
+    extra_css = '''
+        body {font-family: Georgia,serif;}
+        img {display:block;}
+    '''

     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)

@@ -63,29 +68,14 @@ class FinancialTimes(BasicNewsRecipe):
         return br

     def parse_index(self):
-        feeds = OrderedDict()
+        articles = []
         soup = self.index_to_soup(self.INDEX)
-        section_title = 'Untitled'
-
-        for column in soup.findAll('div', attrs={'class': 'feedBoxes clearfix'}):
-            for section in column.findAll('div', attrs={'class': 'feedBox'}):
-                sectiontitle = self.tag_to_string(section.find('h4'))
-                if '...' not in sectiontitle:
-                    section_title = sectiontitle
-                for article in section.ul.findAll('li'):
-                    articles = []
-                    title = self.tag_to_string(article.a)
-                    url = article.a['href']
-                    articles.append(
-                        {'title': title, 'url': url, 'description': '', 'date': ''})
-
-                if articles:
-                    if section_title not in feeds:
-                        feeds[section_title] = []
-                    feeds[section_title] += articles
-
-        ans = [(key, val) for key, val in feeds.iteritems()]
-        return ans
+        for article in soup.findAll('a', href=True, attrs={'data-trackable':'main-link'}):
+            url = self.PREFIX + article['href']
+            title = self.tag_to_string(article)
+            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
+        return [("Articles", articles)]

     def preprocess_html(self, soup):
         for img in soup.findAll('img', srcset=True):
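For anyone verifying a recipe fix like this locally, calibre's converter can run a recipe file directly; the --test flag fetches only a couple of articles per feed and -vv makes the download verbose (the .recipe filename below is an assumption, since this page does not show file paths):

    ebook-convert financial_times.recipe out.epub --test -vv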