# calibre/recipes/india_today.recipe


from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build a tag-matching spec that selects elements by CSS class name.

    ``classes`` is a string of class names separated by single spaces.
    The result is a dict of the form ``{'attrs': {'class': matcher}}``
    where ``matcher`` receives an element's ``class`` attribute value and
    yields something truthy when it shares at least one class name with the
    requested set.
    """
    wanted = frozenset(classes.split(' '))

    def matches(value):
        # A missing or empty class attribute is handed back unchanged
        # (it is falsy, so it never matches).
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': matches}}


def new_tag(soup, name, attrs=()):
    """Create a new element called ``name`` that belongs to ``soup``.

    When ``soup`` provides its own ``new_tag`` factory it is used, with
    ``attrs`` converted to a dict. Otherwise the element is built by calling
    the ``Tag`` constructor directly, passing ``None`` when ``attrs`` is
    empty.
    """
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        # No factory on the soup object: construct the Tag ourselves.
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))


class IndiaToday(BasicNewsRecipe):
    """Recipe that downloads an issue of India Today magazine.

    The article index is read from ``https://www.indiatoday.in/magazine``
    (optionally followed by an edition date) and every article is fetched
    through its ``/amp/`` URL, see :meth:`print_version`.
    """

    title = u'India Today Magazine'
    language = 'en_IN'
    __author__ = 'unkn0wn'
    no_stylesheets = True
    use_embedded_content = False
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    description = (
        'India’s Most Reputed, Credible and Popular news magazine.'
        ' Read the most preferred magazine of 9.5 million Indians to access highly researched and unbiased content.'
    )
    masthead_url = 'https://akm-img-a-in.tosshub.com/sites/all/themes/itg/logo.png'

    # '#sub-d' is the id that preprocess_html() assigns to the first <h2>
    # after turning it into a paragraph.
    extra_css = '''
        #sub-d {font-style:italic; color:#202020;}
        .story__byline {font-size:small; text-align:left;}
        .body_caption, .mos__alt .caption, .caption-drupal-entity {font-size:small; text-align:center;}
        blockquote{color:#404040;}
    '''

    remove_tags = [
        classes('checkout__section sharing align-center-button amp-izooto-sub ads__container inline-story-add amp-ad readmore__box'),
        dict(name=('amp-web-push-widget', 'amp-ad')),
        dict(attrs={'id':'tab-link-wrapper-plugin'}),
        # NOTE(review): presumably the teaser shown to readers without
        # access; preprocess_html() keeps the 'granted' variant instead.
        dict(name='div', attrs={'amp-access':'NOT granted'})
    ]

    def preprocess_raw_html(self, raw_html, url):
        """Return ``raw_html`` with every 'â€”' sequence replaced by '--'.

        NOTE(review): the replaced sequence looks like a mis-decoded em
        dash -- confirm against the pages actually served.
        """
        return raw_html.replace('â€”', '--')

    # The code below treats the 'date' entry as user-supplied only when its
    # value is a str; otherwise the latest issue is used.
    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (DD-MM-YYYY format)',
            'long': 'For example, 22-07-2024'
        }
    }

    def get_cover_url(self):
        """Return the cover image URL for the latest issue, or ``None``.

        The cover is only looked up when no edition date was requested. It
        is taken from the first ``<meta>`` element on the readwhere listing
        page whose ``content`` ends with ``/magazine/300/new``, with every
        ``300`` in that value replaced by ``600``.
        """
        d = self.recipe_specific_options.get('date')
        if d and isinstance(d, str):
            # A specific edition was requested: no cover lookup is done.
            return None
        soup = self.index_to_soup(
            'https://www.readwhere.com/magazine/the-india-today-group/India-Today/1154'
        )
        for citem in soup.findAll(
            'meta', content=lambda s: s and s.endswith('/magazine/300/new')
        ):
            return citem['content'].replace('300', '600')
        return None

    def parse_index(self):
        """Build the list of sections and their articles for the issue.

        Returns a list of ``(section_title, articles)`` pairs where each
        article is a dict with ``title``, ``url`` and ``description`` keys.
        Well-known sections are placed first in a fixed order; all other
        sections follow in the order they appear on the page.
        """
        issue = 'https://www.indiatoday.in/magazine'
        d = self.recipe_specific_options.get('date')
        if d and isinstance(d, str):
            issue = issue + '/' + d
        soup = self.index_to_soup(issue)

        sections = {}

        for tag in soup.findAll('div', attrs={'class': lambda x: x and 'NoCard_story__grid__' in x}):
            sec = tag.find('div', attrs={'class': lambda x: x and 'NoCard_header__nav__' in x})
            section = self.tag_to_string(sec).strip()
            self.log(section)
            sections[section] = []

            for art in tag.findAll('article'):
                title = self.tag_to_string(art.find(attrs={'class':lambda x: x and 'NoCard_articletitle__' in x})).strip()
                link = art.find('a', href=True, title=True)
                if link is None:
                    # Without a link there is nothing to download; skip the
                    # entry instead of aborting the whole index.
                    continue
                url = link['href']
                if url.startswith('/'):
                    # Site-relative link: make it absolute.
                    url = 'https://www.indiatoday.in' + url
                desc = self.tag_to_string(art.find(attrs={'class':lambda x: x and 'NoCard_story__shortcont__' in x})).strip()
                self.log('\t', title, '\n\t', desc, '\n\t\t', url)
                sections[section].append({'title': title, 'url': url, 'description': desc})

        preferred_order = (
            'Editor\'s Note', 'Cover Story', 'The Big Story', 'Upfront',
            'NATION', 'INTERVIEW'
        )

        def sort_key(x):
            # x is a (section_title, articles) pair. Sections missing from
            # preferred_order all get the same large key, so the stable
            # sort keeps them in page order after the preferred ones.
            try:
                return preferred_order.index(x[0])
            except ValueError:
                return 99999999

        return sorted(sections.items(), key=sort_key)

    def preprocess_html(self, soup):
        """Reduce the article page to its headline block and story body.

        A fresh ``<body>`` is built from the title/kicker/byline elements
        plus the ``amp-access="granted"`` div when the page has one, or the
        ``description`` div otherwise. Afterwards ``amp-img`` elements that
        do not contain an ``img`` are renamed to ``img``, the first ``h2``
        becomes a ``<p id="sub-d">`` and ``quotes`` elements become
        blockquotes.
        """
        if soup.find('div', attrs={'amp-access':'granted'}) is not None:
            content_spec = dict(name='div', attrs={'amp-access':'granted'})
        else:
            content_spec = dict(name='div', attrs={'class':'description'})
        keep_only_tags = [
            classes('strytitle strykicker story__byline srtymos'),
            content_spec,
        ]
        body = new_tag(soup, 'body')
        for spec in keep_only_tags:
            for tag in soup.find('body').findAll(**spec):
                # Appending moves the tag out of the old body into the new one.
                body.insert(len(body.contents), tag)
        soup.find('body').replaceWith(body)

        for img in soup.findAll('amp-img'):
            if not img.find('img'):
                img.name = 'img'
        h2 = soup.find('h2')
        if h2:
            h2.name = 'p'
            h2['id'] = 'sub-d'
        for quo in soup.findAll(attrs={'class':'quotes'}):
            quo.name = 'blockquote'
        return soup

    def print_version(self, url):
        """Return the AMP variant of ``url`` ('.in/' becomes '.in/amp/')."""
        return url.replace('.in/','.in/amp/')