From 1893c41f1a3575500733ff0858b81cc7540d7aeb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 14 Feb 2023 16:21:13 +0530 Subject: [PATCH] Deccan Herald by unkn0wn --- recipes/deccan_herald.recipe | 55 ++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 recipes/deccan_herald.recipe diff --git a/recipes/deccan_herald.recipe b/recipes/deccan_herald.recipe new file mode 100644 index 0000000000..9d3ec3b47b --- /dev/null +++ b/recipes/deccan_herald.recipe @@ -0,0 +1,55 @@ +from calibre.ptempfile import PersistentTemporaryFile +from calibre.web.feeds.news import BasicNewsRecipe, classes + +class herald(BasicNewsRecipe): + title = 'Deccan Herald' + __author__ = 'unkn0wn' + description = 'Deccan Herald is an Indian English language daily newspaper published from the Indian state of Karnataka.' + language = 'en_IN' + no_stylesheets = True + remove_attributes = ['height', 'width', 'style'] + ignore_duplicate_articles = {'url'} + encoding = 'utf-8' + + articles_are_obfuscated = True + + def get_obfuscated_article(self, url): + br = self.get_browser() + try: + br.open(url) + except Exception as e: + url = e.hdrs.get('location') + soup = self.index_to_soup(url) + link = soup.find('a', href=True) + skip_sections =[ # add sections you want to skip + '/sports/', '/video/', '/bengaluru-crime/', '/metrolife/', + '/karnataka-districts/', '/brandspot/', '/entertainment/', + ] + if any(x in link['href'] for x in skip_sections): + self.log('Aborting Article ', link['href']) + self.abort_article('skipping section') + + self.log('Downloading ', link['href']) + html = br.open(link['href']).read() + pt = PersistentTemporaryFile('.html') + pt.write(html) + pt.close() + return pt.name + + keep_only_tags = [ + classes('article-title article-author__name'), + dict(name='div', attrs={'id':'main-content'}) + + ] + + remove_tags = [ + classes( + 'storyShare social-media-icons in_article_video static_text' + ' nl-optin-mobile dk_only article-banner-adver-wrapper wb_holder' + ' field-name-field-tags section-full strip--business' + ) + ] + + feeds = [ + ('DH', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com&hl=en-IN&gl=IN&ceid=IN:en') + ]