calibre/recipes/taipei.recipe

import re

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Return ``findAll``-style keyword arguments matching any listed CSS class.

    ``classes`` is a single string of class names separated by spaces. The
    result is a dict of the form ``{'attrs': {'class': predicate}}``, so it
    can be unpacked with ``**`` into a ``findAll`` call or used directly as a
    tag specification.

    The predicate receives an element's ``class`` attribute value. Falsy
    values (``None``, ``''``) are returned unchanged; otherwise it returns the
    frozenset of wanted names present in the value, which is empty (falsy)
    when none match.
    """
    wanted = frozenset(classes.split(' '))

    def shares_wanted_class(x):
        # Mirror ``x and ...``: a missing/empty attribute is handed back as-is.
        if not x:
            return x
        return frozenset(x.split()) & wanted

    return {'attrs': {'class': shares_wanted_class}}


class TN(BasicNewsRecipe):
    """Recipe for the Taipei Times, built from its per-section RSS feeds.

    Elements with class ``page`` get special handling: their links are marked
    so that numbered links to page 2 and beyond can be followed one level
    deep, and the ``page`` elements themselves are removed from the final
    output.
    """

    title = u'Taipei Times'
    language = 'en_TW'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    use_embedded_content = False

    no_stylesheets = True

    # Follow links one level deep; is_link_wanted() decides which ones.
    recursions = 1

    # 'page' is kept here so its links survive long enough to be followed;
    # postprocess_html() strips it again afterwards.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='h3', attrs={'class': 'a'}),
        classes('main_ipic reporter text page'),
    ]

    feeds = [
        ('Front Page', 'http://www.taipeitimes.com/xml/front.rss'),
        ('Editorials', 'http://www.taipeitimes.com/xml/editorials.rss'),
        ('Taiwan', 'http://www.taipeitimes.com/xml/taiwan.rss'),
        ('Features', 'http://www.taipeitimes.com/xml/feat.rss'),
        ('Business', 'http://www.taipeitimes.com/xml/biz.rss'),
        ('World', 'http://www.taipeitimes.com/xml/world.rss'),
        ('Sports', 'http://www.taipeitimes.com/xml/sport.rss'),
    ]

    def preprocess_html(self, soup, *a):
        """Mark links inside ``page`` elements and make their hrefs absolute.

        Every ``<a href>`` inside an element with class ``page`` gets
        ``data-calibre-follow-link="1"`` so that is_link_wanted() can
        recognise it. Site-relative hrefs (starting with ``/``) are prefixed
        with the site origin. Returns the same ``soup``, modified in place.
        """
        for div in soup.findAll(**classes('page')):
            # Named ``link`` rather than ``a`` to avoid shadowing the ``*a``
            # varargs parameter.
            for link in div.findAll('a', href=True):
                link['data-calibre-follow-link'] = '1'
                if link['href'].startswith('/'):
                    link['href'] = 'http://www.taipeitimes.com' + link['href']
        return soup

    def is_link_wanted(self, url, tag):
        """Return True only for marked, numbered links to page 2 or later.

        A link is followed when all of the following hold:

        * ``tag`` carries the ``data-calibre-follow-link="1"`` marker set by
          preprocess_html();
        * ``url`` ends in ``/<digits>`` and that number is greater than 1;
        * the visible text of ``tag`` starts with a digit.
        """
        # Use .get(): most links on a page are unmarked, and indexing a
        # missing attribute would raise KeyError instead of answering "no".
        if tag.get('data-calibre-follow-link') != '1':
            return False
        page_number = re.search(r'/(\d+)$', url)
        if page_number is None:
            return False
        if re.match(r'\d+', self.tag_to_string(tag)) is None:
            return False
        return int(page_number.group(1)) > 1

    def postprocess_html(self, soup, *a):
        """Remove every element with class ``page`` and return ``soup``."""
        for div in soup.findAll(**classes('page')):
            div.extract()
        return soup

    # def parse_index(self):
    #     return [(
    #         'Articles', [{
    #             'title':
    #             'test',
    #             'url':
    #             'http://www.taipeitimes.com/News/editorials/archives/2019/02/26/2003710411'
    #         }]
    #     )]