calibre/recipes/bar_and_bench.recipe

#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes


class bar(BasicNewsRecipe):
    """Recipe that builds its feeds from links on the Bar and Bench home page.

    ``parse_index`` scans the front page for article links under a fixed list
    of site sections, and ``preprocess_html`` copies lazily-loaded image URLs
    from ``data-src`` into ``src``.
    """

    title = 'Bar and Bench'
    __author__ = 'unkn0wn'
    description = (
        'Bar & Bench is the premier online portal for Indian legal news. News, interviews,'
        ' and columns related to the Supreme Court of India and the High Courts are published.'
        )
    language = 'en_IN'
    masthead_url = 'https://gumlet.assettype.com/barandbench/2019-12/7a743b15-5d5d-44d7-96c2-13616780ed95/brand_2x.png'

    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['height', 'width', 'style']

    ignore_duplicate_articles = {'title'}
    resolve_internal_links = True
    remove_empty_feeds = True

    # Class-name prefixes of the story header, hero image and story body.
    # NOTE(review): the trailing "__" suggests the site appends a generated
    # suffix to these class names -- confirm against the live markup.
    keep_only_tags = [
        prefixed_classes(
            'text-story-m_header-details__ text-story-m_hero-image__ text-story-m_story-content-inner-wrapper__'
        )
    ]

    # Tag list, comment widget and inline SVG icons are dropped.
    remove_tags = [
        prefixed_classes(
            'text-story-m_story-tags__ story-footer-module__metype__'
        ),
        dict(name='svg')
    ]

    def preprocess_html(self, soup):
        """Copy each image's ``data-src`` value into ``src`` and return *soup*.

        Only ``<img>`` tags that carry a ``data-src`` attribute are touched.
        """
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup

    def parse_index(self):
        """Collect article links from the home page, grouped by section.

        Returns a list of ``(section_title, articles)`` tuples, where
        ``articles`` is a list of ``{'title': ..., 'url': ...}`` dicts.
        Sections for which no article link is found are omitted.

        A link is taken as an article of a section when its ``href`` starts
        with ``<index><section>/``. Query strings are stripped from the URL.
        Links back to the section landing page, links without any text
        (e.g. thumbnails wrapping only an image) and repeated links to an
        already collected URL are skipped.
        """
        index = 'https://www.barandbench.com/'
        sections = [
            'news', 'columns', 'interviews', 'law-firms', 'apprentice-lawyer', 'legal-jobs'
        ]
        feeds = []
        soup = self.index_to_soup(index)
        for sec in sections:
            section = sec.capitalize()
            self.log(section)
            section_root = index + sec + '/'
            articles = []
            seen_urls = set()
            # Bind the prefix as a default so the matcher does not depend on
            # the loop variable.
            for a in soup.findAll(
                'a', attrs={'href': lambda x, root=section_root: x and x.startswith(root)}
            ):
                url = a['href'].split('?')[0]
                # The section landing page itself is not an article.
                if url in {section_root, index + sec}:
                    continue
                title = self.tag_to_string(a).strip()
                # Text-less links would become untitled entries; the same
                # story is usually linked again with its headline.
                if not title or url in seen_urls:
                    continue
                seen_urls.add(url)
                self.log('\t', title, '\n\t\t', url)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds