Merge branch 'patch-1' of https://github.com/bobbysteel/calibre

2025-11-04 03:27:00 -05:00 · 2018-01-30 09:18:27 +05:30 · 2018-01-30 09:18:27 +05:30 · 50ca78b7a5
commit 50ca78b7a5
parent b4c8d223f4 425ba84334
1 changed files with 0 additions and 113 deletions
--- a/recipes/financial_times_uk.recipe
+++ b/recipes/financial_times_uk.recipe
@ -1,113 +0,0 @@
 #!/usr/bin/env  python2
 # -*- mode: python -*-
 # -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
 __copyright__ = '2010-2017, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.ft.com/todaysnewspaper/uk
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 from urllib import unquote
 def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
 class FinancialTimes(BasicNewsRecipe):
    title = 'Financial Times (UK)'
    __author__ = 'Darko Miletic'
    description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy."  # noqa
    publisher = 'The Financial Times Ltd.'
    category = 'news, finances, politics, UK, World'
    oldest_article = 2
    language = 'en_GB'
    max_articles_per_feed = 250
    no_stylesheets = True
    use_embedded_content = False
    needs_subscription = True
    encoding = 'utf8'
    publication_type = 'newspaper'
    handle_gzip = True
    compress_news_images = True
    scale_news_images_to_device = True
    ignore_duplicate_articles = {'url'}
    LOGIN = 'https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com%2F'
    LOGOUT = 'https://myaccount.ft.com/logout'
    INDEX = 'https://www.ft.com/todaysnewspaper/uk'
    PREFIX = 'https://www.ft.com'
    keep_only_tags = [
        classes('topper__headline topper__standfirst n-content-image--full article__time-byline article__body')
    ]
    remove_tags = [
        classes('n-content-related-box tour-tip n-content-recommended n-content-video'),
        dict(name=['aside'])
    ]
    extra_css = '''
                body {font-family: Georgia,serif;}
                img {display:block;}
                '''
    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.INDEX)
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            br.select_form(name='enter-email-form')
            br['email'] = self.username
            br.submit()
            br.select_form(name='enter-password-form')
            br['password'] = self.password
            br.submit()
        return br
    def get_cover_url(self):
        from datetime import date
        cover = 'http://img.kiosko.net/' + str(date.today().year) + '/' + date.today().strftime('%m') + '/' + date.today().strftime('%d') + '/uk/ft_uk.750.jpg'
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except:
            index = 'http://en.kiosko.net/uk/np/ft_uk.html'
            soup = self.index_to_soup(index)
            for image in soup.findAll('img', src=True):
                if image['src'].endswith('750.jpg'):
                    return image['src']
            self.log("\nCover unavailable")
            cover = None
        return cover
    def parse_index(self):
        articles = []
        soup = self.index_to_soup(self.INDEX)
        totalfeeds = []
        current_section = []
        div = []
        for div in soup.findAll('div', attrs={'data-trackable': 'list'}):
            articles = []
            current_section = self.tag_to_string(div.find('h2'))
            self.log('in section: ', current_section)
            for article in div.findAll('a', href=True, attrs={'data-trackable':'main-link'}):
                url = self.PREFIX + article['href']
                title = self.tag_to_string(article)
                articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
                self.log('title: ', title, ' url: ', url)
            totalfeeds.append((current_section,articles))
        return totalfeeds
    def preprocess_html(self, soup):
        for img in soup.findAll('img', srcset=True):
            src = img['srcset'].split(',')[0].strip()
            src = unquote(src.rpartition('/')[2].partition('?')[0])
            img['src'] = src
        return soup
    def cleanup(self):
        self.browser.open(self.LOGOUT)