import re
from datetime import date, datetime, timedelta

from calibre.utils.date import parse_date
from calibre.web.feeds.news import BasicNewsRecipe, classes


class eenadu_ts(BasicNewsRecipe):
    """Calibre news recipe that scrapes section pages of www.eenadu.net."""

    title = 'ఈనాడు - తెలంగాణ'
    __author__ = 'unkn0wn'
    description = 'THE LARGEST CIRCULATED TELUGU DAILY'
    language = 'te'
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    remove_attributes = ['height', 'width', 'style']
    ignore_duplicate_articles = {'url', 'title'}
    masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png'
    # NOTE(review): the cover points at a fixed 2022-08-08 image path, so it
    # presumably never changes between runs -- confirm this is intended.
    cover_url = 'https://d66zsp32hue2v.cloudfront.net/Eenadu/2022/08/08/GTH/5_01/d5041804_01_mr.jpg'
    encoding = 'utf-8'
    remove_empty_feeds = True
    extra_css = '.pub-t{font-size:small; font-style:italic;}'

    keep_only_tags = [
        dict(name='h1'),
        dict(**classes('pub-t')),
        classes('fullstory text-justify contlist-cont'),
        dict(name='span', attrs={'id': 'PDSAIApbreak'}),
    ]
    remove_tags = [
        dict(name='span', attrs={'style': 'float:left; margin-right:10px;'}),
        dict(
            name='p',
            attrs={
                'style': 'font-size: 18px !important; margin: 0px; margin-top: -15px; text-align: center;flex: 1;'
            }
        ),
        dict(name='aside', attrs={'class': lambda x: x and x.startswith('thumb')}),
        dict(name='br'),
        classes('sshare-c tags andbeyond_ad fnt20 arti more2 offset-tb1 msb-list')
    ]

    def parse_index(self):
        """Build the feed list by fetching every configured section page.

        Returns:
            A list of ``(section_title, articles)`` tuples, where
            ``articles`` is the list produced by ``articles_from_soup``.
            Sections for which no article was found are left out.
        """
        # (section title, path below https://www.eenadu.net/).
        # Commented-out entries are optional sections; uncomment to enable.
        section_list = [
            ('తెలంగాణ తాజా వార్తలు', 'telangana'),
            ('సంపాదకీయం', 'telangana/editorial'),
            ('తెలంగాణ ప్రధానాంశాలు', 'telangana/top-news'),
            ('తెలంగాణ జిల్లా వార్తలు', 'telangana/districts'),
            # ('క్రైమ్', 'crime'),
            ('పాలిటిక్స్', 'politics'),
            ('జాతీయం', 'india'),
            ('బిజినెస్', 'business'),
            ('అంతర్జాతీయం', 'world'),
            ('క్రీడలు', 'sports'),
            # ('సినిమా', 'movies'),
            # ('చదువు', 'education'),
            # ('సుఖీభవ', 'health'),
            # ('ఈ-నాడు', 'technology'),
            # ('మకరందం', 'devotional'),
            # ('ఈ తరం', 'youth'),
            # ('ఆహా', 'recipes'),
            # ('హాయ్ బుజ్జీ', 'kids-stories'),
            # ('స్థిరాస్తి', 'real-estate'),
        ]

        # date.weekday() numbers Monday as 0, so 6 is Sunday: the magazine
        # section is only requested on that day.
        is_sunday = date.today().weekday() == 6
        if is_sunday:
            section_list.append(('సండే మ్యాగజైన్', 'sunday-magazine'))

        feeds = []

        # For each section title, fetch the article urls
        for section_title, section_path in section_list:
            section_url = 'https://www.eenadu.net/' + section_path
            self.log(section_title, section_url)
            soup = self.index_to_soup(section_url)
            articles = self.articles_from_soup(soup)
            if articles:
                feeds.append((section_title, articles))
        return feeds

    def articles_from_soup(self, soup):
        """Collect article entries from a parsed section page.

        Every ``<a href>`` found inside an element carrying one of the
        listing classes is inspected. The anchor's ``<h3>`` text becomes the
        title and the text of its first ``<div>`` (with the title removed)
        becomes the description. Anchors without a non-empty ``<h3>`` title
        are skipped.

        Args:
            soup: Parsed section page, as returned by ``index_to_soup``.

        Returns:
            A list of dicts with the keys ``'title'``, ``'url'`` and
            ``'description'``.
        """
        ans = []
        for link in soup.findAll(
            attrs={
                'class': ['telugu_uni_body', 'thumb-content-more', 'owl-carousel']
            }
        ):
            for a in link.findAll('a', attrs={'href': True}):
                url = a['href']
                if not url.startswith('https'):
                    url = 'https://www.eenadu.net/' + url

                # Best effort: an anchor without a usable <div> simply gets
                # an empty description.
                try:
                    desc = self.tag_to_string(a.find('div')).strip()
                except Exception:
                    desc = ''

                # Reset for every anchor. Otherwise an anchor that has no
                # <h3> would silently reuse the title of the previous anchor
                # (pairing it with the wrong URL), or raise NameError when
                # it is the first anchor on the page.
                title = ''
                for h3 in a.findAll('h3'):
                    title = self.tag_to_string(h3).strip()
                    # The description text contains the headline; drop it.
                    sub = re.escape(title)
                    desc = re.sub(sub, '', desc).strip()

                if not title or not url:
                    continue

                self.log('\t', title, '\n\t', desc, '\n\t\t', url)
                ans.append({'title': title, 'url': url, 'description': desc})
        return ans

    def preprocess_html(self, soup):
        """Filter out stale or non-article pages and fix lazy-loaded images.

        The page is aborted via ``abort_article`` when it has no ``pub-t``
        div, or when the timestamp in that div is more than 1.15 days older
        than the current time. For pages that are kept, every ``<img>`` with
        a ``data-src`` attribute gets that value copied into ``src``.

        Args:
            soup: Parsed article page.

        Returns:
            The same ``soup`` object, modified in place.
        """
        div = soup.find('div', **classes('pub-t'))
        if div:
            # Strip the 'Published : ' / 'Updated : ' label and swap the
            # trailing ' IST' for seconds and microseconds before parsing.
            # NOTE(review): the timezone is discarded and the result is
            # compared with naive local time -- confirm parse_date's
            # timezone handling keeps the cutoff meaningful off IST.
            stamp = self.tag_to_string(div).strip()
            stamp = stamp.replace('Published : ', '').replace('Updated : ', '')
            stamp = stamp.replace(' IST', ':00.000001')
            # Named 'published' so the imported datetime.date is not shadowed.
            published = parse_date(stamp).replace(tzinfo=None)
            today = datetime.now()
            if (today - published) > timedelta(1.15):
                self.abort_article('Skipping old article')
        else:
            self.abort_article('not an article')
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup