Update Financial Times

Fixes #1711934 [Financial times download uk/us is broken](https://bugs.launchpad.net/calibre/+bug/1711934)
2025-07-09 03:04:10 -04:00 · 2017-08-21 00:31:35 +05:30 · 2017-08-21 00:31:35 +05:30 · e25426ad5a
commit e25426ad5a
parent 331190c369
2 changed files with 36 additions and 62 deletions
--- a/recipes/financial_times_uk.recipe
+++ b/recipes/financial_times_uk.recipe
@@ -5,11 +5,10 @@
 __license__ = 'GPL v3'
 __copyright__ = '2010-2017, Darko Miletic <darko.miletic at gmail.com>'
 '''
-www.ft.com/uk-edition
+www.ft.com/todaysnewspaper/uk
 '''

 from calibre.web.feeds.news import BasicNewsRecipe
-from collections import OrderedDict
 from urllib import unquote


@@ -34,22 +33,26 @@ class FinancialTimes(BasicNewsRecipe):
    encoding = 'utf8'
    publication_type = 'newspaper'
    handle_gzip = True
+    compress_news_images = True
+    scale_news_images_to_device = True
+    ignore_duplicate_articles = {'url'}
    LOGIN = 'https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com%2F'
    LOGOUT = 'https://myaccount.ft.com/logout'
-    INDEX = 'http://www.ft.com/uk-edition'
-    PREFIX = 'http://www.ft.com'
+    INDEX = 'https://www.ft.com/todaysnewspaper/uk'
+    PREFIX = 'https://www.ft.com'

    keep_only_tags = [
-        classes(
-            'article__header--wrapper article__time-byline article__body'
-            'n-content-image barrier-grid__heading article__time-byline topper__headline topper__standfirst')
+        classes('topper__headline topper__standfirst n-content-image--full article__time-byline article__body')
    ]

    remove_tags = [
-        classes('n-content-related-box tour-tip')
+        classes('n-content-related-box tour-tip n-content-recommended n-content-video')
    ]

-    remove_attributes = ['width', 'height', 'lang', 'style']
+    extra_css = '''
+                body {font-family: Georgia,serif;}
+                img {display:block;}
+                '''

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
@@ -81,33 +84,14 @@ class FinancialTimes(BasicNewsRecipe):
        return cover

    def parse_index(self):
-        feeds = OrderedDict()
+        articles = []
        soup = self.index_to_soup(self.INDEX)
-        # dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
-        # self.timefmt = ' [%s]'%dates
-        section_title = 'Untitled'

-        for column in soup.findAll('div', attrs={'class': 'feedBoxes clearfix'}):
-            for section in column.findAll('div', attrs={'class': 'feedBox'}):
-                sectiontitle = self.tag_to_string(section.find('h4'))
-                if '...' not in sectiontitle:
-                    section_title = sectiontitle
-                    self.log('Found section:', sectiontitle)
-                for article in section.ul.findAll('li'):
-                    articles = []
-                    title = self.tag_to_string(article.a)
-                    url = article.a['href']
-                    articles.append(
-                        {'title': title, 'url': url, 'description': '', 'date': ''})
-                    self.log('\tFound article:', title)
-
-                    if articles:
-                        if section_title not in feeds:
-                            feeds[section_title] = []
-                        feeds[section_title] += articles
-
-        ans = [(key, val) for key, val in feeds.iteritems()]
-        return ans
+        for article in soup.findAll('a', href=True, attrs={'data-trackable':'main-link'}):
+            url = self.PREFIX + article['href']
+            title = self.tag_to_string(article)
+            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
+        return [("Articles", articles)]

    def preprocess_html(self, soup):
        for img in soup.findAll('img', srcset=True):
--- a/recipes/financial_times_us.recipe
+++ b/recipes/financial_times_us.recipe
@@ -5,11 +5,10 @@
 __license__ = 'GPL v3'
 __copyright__ = '2010-2017, Darko Miletic <darko.miletic at gmail.com>'
 '''
-www.ft.com/international-edition
+www.ft.com/todaysnewspaper/international
 '''

 from calibre.web.feeds.news import BasicNewsRecipe
-from collections import OrderedDict
 from urllib import unquote


@@ -34,20 +33,26 @@ class FinancialTimes(BasicNewsRecipe):
    encoding = 'utf8'
    publication_type = 'newspaper'
    handle_gzip = True
+    compress_news_images = True
+    scale_news_images_to_device = True
+    ignore_duplicate_articles = {'url'}
    LOGIN = 'https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com%2F'
    LOGOUT = 'https://myaccount.ft.com/logout'
-    INDEX = 'http://www.ft.com/international-edition'
-    PREFIX = 'http://www.ft.com'
+    INDEX = 'https://www.ft.com/todaysnewspaper/international'
+    PREFIX = 'https://www.ft.com'

    keep_only_tags = [
-        classes('article__header--wrapper article__time-byline article__body n-content-image barrier-grid__heading')
+        classes('topper__headline topper__standfirst n-content-image--full article__time-byline article__body')
    ]

    remove_tags = [
-        classes('n-content-related-box tour-tip')
+        classes('n-content-related-box tour-tip n-content-recommended n-content-video')
    ]

-    remove_attributes = ['width', 'height', 'lang', 'style']
+    extra_css = '''
+                body {font-family: Georgia,serif;}
+                img {display:block;}
+                '''

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
@@ -63,29 +68,14 @@ class FinancialTimes(BasicNewsRecipe):
        return br

    def parse_index(self):
-        feeds = OrderedDict()
+        articles = []
        soup = self.index_to_soup(self.INDEX)
-        section_title = 'Untitled'

-        for column in soup.findAll('div', attrs={'class': 'feedBoxes clearfix'}):
-            for section in column.findAll('div', attrs={'class': 'feedBox'}):
-                sectiontitle = self.tag_to_string(section.find('h4'))
-                if '...' not in sectiontitle:
-                    section_title = sectiontitle
-                for article in section.ul.findAll('li'):
-                    articles = []
-                    title = self.tag_to_string(article.a)
-                    url = article.a['href']
-                    articles.append(
-                        {'title': title, 'url': url, 'description': '', 'date': ''})
-
-                    if articles:
-                        if section_title not in feeds:
-                            feeds[section_title] = []
-                        feeds[section_title] += articles
-
-        ans = [(key, val) for key, val in feeds.iteritems()]
-        return ans
+        for article in soup.findAll('a', href=True, attrs={'data-trackable':'main-link'}):
+            url = self.PREFIX + article['href']
+            title = self.tag_to_string(article)
+            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
+        return [("Articles", articles)]

    def preprocess_html(self, soup):
        for img in soup.findAll('img', srcset=True):