Bar and Bench by unkn0wn

Kovid Goyal 2023-04-15 21:26:28 +05:30
parent c0d533f7ec
commit 5fe521bcb9


@ -0,0 +1,73 @@
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
from calibre.ptempfile import PersistentTemporaryFile


class bar(BasicNewsRecipe):
    title = 'Bar and Bench'
    __author__ = 'unkn0wn'
    description = (
        'Bar & Bench is the premier online portal for Indian legal news. News, interviews,'
        ' and columns related to the Supreme Court of India and the High Courts are published.'
    )
    language = 'en_IN'
    masthead_url = 'https://gumlet.assettype.com/barandbench/2019-12/7a743b15-5d5d-44d7-96c2-13616780ed95/brand_2x.png'
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['height', 'width', 'style']

    # Keep only the article header, hero image and story body.
    keep_only_tags = [
        prefixed_classes(
            'text-story-m_header-details__ text-story-m_hero-image__ text-story-m_story-content-inner-wrapper__'
        )
    ]

    # Drop the story tags, the Metype footer widget and inline SVG icons.
    remove_tags = [
        prefixed_classes(
            'text-story-m_story-tags__ story-footer-module__metype__'
        ),
        dict(name='svg')
    ]

    def preprocess_html(self, soup):
        # Lazy-loaded images keep the real URL in data-src; copy it into src
        # so calibre downloads them.
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup

    ignore_duplicate_articles = {'title'}
    resolve_internal_links = True
    remove_empty_feeds = True

    articles_are_obfuscated = True

    def get_obfuscated_article(self, url):
        br = self.get_browser()
        try:
            br.open(url)
        except Exception as e:
            # If opening the Google News link raises, fall back to the
            # Location header of the error response.
            url = e.hdrs.get('location')
        soup = self.index_to_soup(url)
        # The first link on the landing page is taken as the actual article URL.
        link = soup.find('a', href=True)
        skip_sections = [  # add sections you want to skip
            '/video/', '/videos/', '/media/', 'podcast-'
        ]
        if any(x in link['href'] for x in skip_sections):
            self.log('Aborting Article ', link['href'])
            self.abort_article('skipping video links')
        self.log('Downloading ', link['href'])
        html = br.open(link['href']).read()
        pt = PersistentTemporaryFile('.html')
        pt.write(html)
        pt.close()
        return pt.name

    feeds = []

    # Build one Google News RSS search feed per section (articles from the
    # last 27 hours, restricted to barandbench.com), plus a catch-all feed.
    sections = [
        'news', 'columns', 'interviews', 'law-firms', 'apprentice-lawyer', 'legal-jobs'
    ]
    for sec in sections:
        a = 'https://news.google.com/rss/search?q=when:27h+allinurl:barandbench.com{}&hl=en-IN&gl=IN&ceid=IN:en'
        feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F')))
    feeds.append(('Others', a.format('')))
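
A quick way to try the recipe locally (assuming the file above is saved as bar_and_bench.recipe; the filename is arbitrary) is calibre's ebook-convert tool, which fetches only a couple of articles per feed when --test is given:

    ebook-convert bar_and_bench.recipe .epub --test -vv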