diff --git a/recipes/eenadu.recipe b/recipes/eenadu.recipe
index fcd370ff61..d6fe7682b8 100644
--- a/recipes/eenadu.recipe
+++ b/recipes/eenadu.recipe
@@ -1,11 +1,11 @@
 from calibre.web.feeds.news import BasicNewsRecipe, classes
-
 from datetime import date, datetime, timedelta
 from calibre.utils.date import parse_date
+import re


-class eenadu(BasicNewsRecipe):
-    title = 'ఈనాడు'
+class eenadu_ts(BasicNewsRecipe):
+    title = 'ఈనాడు - తెలంగాణ'
     __author__ = 'unkn0wn'
     description = 'THE LARGEST CIRCULATED TELUGU DAILY'
     language = 'te'
@@ -13,66 +13,65 @@ class eenadu(BasicNewsRecipe):
     remove_javascript = True
     no_stylesheets = True
     remove_attributes = ['height', 'width', 'style']
-    ignore_duplicate_articles = {"title", "url"}
+    ignore_duplicate_articles = {'url', 'title'}
     masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png'
-    cover_url = 'https://d66zsp32hue2v.cloudfront.net/Eenadu/2022/06/07/TEL/5_01/9de49f18_01_mr.jpg'
+    cover_url = 'https://d66zsp32hue2v.cloudfront.net/Eenadu/2022/08/08/GTH/5_01/d5041804_01_mr.jpg'
     encoding = 'utf-8'
+    remove_empty_feeds = True
+    extra_css = '.pub-t{font-size:small; font-style:italic;}'

     keep_only_tags = [
         dict(name='h1'),
-        classes('eng-body grey pub-t text-justify contlist-cont'),
+        dict(**classes('pub-t')),
+        classes('fullstory text-justify contlist-cont'),
         dict(name='span', attrs={'id': 'PDSAIApbreak'}),
     ]

-    remove_tags = [classes('sshare-c')]
-
-    def articles_from_soup(self, soup):
-        ans = []
-        for link in soup.findAll(attrs={'class': ['telugu_uni_body', 'thumb-content-more', 'owl-carousel']}):
-            for a in link.findAll('a', attrs={'href': True}):
-                url = a['href']
-                if url.startswith('https') is False:
-                    url = 'https://www.eenadu.net/' + url
-                else:
-                    url = url
-                for h3 in a.findAll('h3'):
-                    title = self.tag_to_string(h3)
-                    if not url or not title:
-                        continue
-                    self.log('\t', title)
-                    self.log('\t\t', url)
-                    ans.append({
-                        'title': title,
-                        'url': url})
-        return ans
+    remove_tags = [
+        dict(name='span', attrs={'style': 'float:left; margin-right:10px;'}),
+        dict(
+            name='p',
+            attrs={
+                'style':
+                'font-size: 18px !important; margin: 0px; margin-top: -15px; text-align: center;flex: 1;'
+            }
+        ),
+        dict(name='aside', attrs={'class': lambda x: x and x.startswith('thumb')}),
+        dict(name='br'),
+        classes('sshare-c tags andbeyond_ad fnt20 arti more2 offset-tb1 msb-list')
+    ]

     def parse_index(self):
-        soup = self.index_to_soup('https://www.eenadu.net/')
-        nav_div = soup.find(id='navbar')
         section_list = [
-            ('సంపాదకీయం', 'https://www.eenadu.net/telangana/editorial'),
+            ('తెలంగాణ తాజా వార్తలు', 'telangana'),
+            ('సంపాదకీయం', 'telangana/editorial'),
+            ('తెలంగాణ ప్రధానాంశాలు', 'telangana/top-news'),
+            ('తెలంగాణ జిల్లా వార్తలు', 'telangana/districts'),
+            # ('క్రైమ్', 'crime'),
+            ('పాలిటిక్స్', 'politics'),
+            ('జాతీయం', 'india'),
+            ('బిజినెస్', 'business'),
+            ('అంతర్జాతీయం', 'world'),
+            ('క్రీడలు', 'sports'),
+            # ('సినిమా', 'movies'),
+            # ('చదువు', 'education'),
+            # ('సుఖీభవ', 'health'),
+            # ('ఈ-నాడు', 'technology'),
+            # ('మకరందం', 'devotional'),
+            # ('ఈ తరం', 'youth'),
+            # ('ఆహా', 'recipes'),
+            # ('హాయ్ బుజ్జీ', 'kids-stories'),
+            # ('స్థిరాస్తి', 'real-estate'),
         ]
-
-        # Finding all the section titles that are acceptable
-        for x in nav_div.findAll(['a']):
-            if self.is_accepted_entry(x):
-                sec = self.tag_to_string(x)
-                link = x['href']
-                if link.endswith('telangana'):
-                    sec = 'తెలంగాణ'
-                if link.endswith('andhra-pradesh'):
-                    sec = 'ఆంధ్రప్రదేశ్'
-                if link.endswith('andhra-pradesh/districts'):
-                    sec = 'ఆంధ్రప్రదేశ్.. ఆసక్తికర జిల్లా వార్తలు'
-                if link.endswith('telangana/districts'):
-                    sec = 'తెలంగాణ.. ఆసక్తికర జిల్లా వార్తలు'
-                section_list.append((sec, link))
+        is_sunday = date.today().weekday() == 6
+        if is_sunday:
+            section_list.append(('సండే మ్యాగజైన్', 'sunday-magazine'))
         feeds = []

         # For each section title, fetch the article urls
         for section in section_list:
             section_title = section[0]
-            section_url = section[1]
+            section_url = 'https://www.eenadu.net/' + section[1]
             self.log(section_title, section_url)
             soup = self.index_to_soup(section_url)
             articles = self.articles_from_soup(soup)
@@ -80,88 +79,48 @@ class eenadu(BasicNewsRecipe):
             feeds.append((section_title, articles))
         return feeds

-    def is_accepted_entry(self, entry):
-        # Those sections in the top nav bar that we will omit
-        # https://www.eenadu.net/nri
-        is_sunday = date.today().weekday() == 6
-        if is_sunday:
-            omit_list = [
-                'net/',
-                'javascript:void(0)',
-                '#',
-                'sports',
-                'movies',
-                'women',
-                'technology',
-                'business',
-                'stories.eenadu.net',
-                'calendar',
-                'devotional',
-                'youth',
-                'recipes',
-                'real-estate',
-                'temples',
-                'kathalu',
-                'viral-videos',
-                'Nri',
-                'videos',
-                'explained',
-                'agriculture',
-                'crime',
-                'health',
-                'photos',
-                'kids-stories',
-                'education',
-            ]
+    def articles_from_soup(self, soup):
+        ans = []
+        for link in soup.findAll(
+            attrs={
+                'class': ['telugu_uni_body', 'thumb-content-more', 'owl-carousel']
+            }
+        ):
+            for a in link.findAll('a', attrs={'href': True}):
+                url = a['href']
+                if url.startswith('https') is False:
+                    url = 'https://www.eenadu.net/' + url

-        else:
-            omit_list = [
-                'net/',
-                'javascript:void(0)',
-                '#',
-                'sports',
-                'movies',
-                'women',
-                'technology',
-                'business',
-                'stories.eenadu.net',
-                'calendar',
-                'devotional',
-                'youth',
-                'recipes',
-                'real-estate',
-                'temples',
-                'kathalu',
-                'viral-videos',
-                'Nri',
-                'videos',
-                'explained',
-                'agriculture',
-                'sunday-magazine',
-                'crime',
-                'health',
-                'photos',
-                'kids-stories',
-                'education',
-            ]
+                try:
+                    desc = self.tag_to_string(a.find('div')).strip()
+                except Exception:
+                    desc = ''

-        is_accepted = True
-        for omit_entry in omit_list:
-            if entry['href'].endswith(omit_entry):
-                is_accepted = False
-                break
-        return is_accepted
+                for h3 in a.findAll('h3'):
+                    title = self.tag_to_string(h3).strip()
+                    sub = re.escape(title)
+                    desc = re.sub(sub, '', desc).strip()
+
+                    if not title or not url:
+                        continue
+
+                    self.log('\t', title, '\n\t', desc, '\n\t\t', url)
+                    ans.append({'title': title, 'url': url, 'description': desc})
+        return ans

     def preprocess_html(self, soup):
         div = soup.find('div', **classes('pub-t'))
         if div:
             date = parse_date(
-                self.tag_to_string(div)
-                .strip().replace('Published : ','').replace('Updated : ','').replace(' IST',':00.000001')
-            ).replace(tzinfo=None)
+                self.tag_to_string(div).strip().replace('Published : ', '').replace(
+                    'Updated : ', ''
+                ).replace(' IST', ':00.000001')
+            ).replace(tzinfo=None)
             today = datetime.now()
-            if (today - date) > timedelta(1.5):
+            if (today - date) > timedelta(1.15):
                 self.abort_article('Skipping old article')
-        else:  # may not be an article.
-            self.abort_article()
+        else:
+            self.abort_article('not an article')
+        for img in soup.findAll('img', attrs={'data-src': True}):
+            img['src'] = img['data-src']
         return soup
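Both recipes now share the same freshness gate in preprocess_html: the 'Published : '/'Updated : ' prefix and the trailing ' IST' are stripped from the byline, the remainder is parsed, and anything older than 1.15 days (about 27.6 hours) is aborted. A minimal standalone sketch of that logic, assuming a hypothetical byline string and using dateutil's parser in place of calibre.utils.date.parse_date:

# Standalone sketch of the preprocess_html freshness gate.
# Assumptions: the byline string below is hypothetical, and dateutil.parser
# stands in for calibre.utils.date.parse_date.
from datetime import datetime, timedelta
from dateutil import parser

raw = 'Updated : 08/08/2022 23:42 IST'  # hypothetical byline from the pub-t div
cleaned = raw.replace('Published : ', '').replace('Updated : ', '').replace(' IST', ':00.000001')
published = parser.parse(cleaned).replace(tzinfo=None)
if datetime.now() - published > timedelta(1.15):  # 1.15 days ~ 27.6 hours
    print('Skipping old article')

The ':00.000001' replacement simply pads the bare 'HH:MM' byline into a full 'HH:MM:SS.ffffff' timestamp before parsing.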
diff --git a/recipes/eenadu_ap.recipe b/recipes/eenadu_ap.recipe
new file mode 100644
index 0000000000..353611cb8e
--- /dev/null
+++ b/recipes/eenadu_ap.recipe
@@ -0,0 +1,126 @@
+from calibre.web.feeds.news import BasicNewsRecipe, classes
+from datetime import date, datetime, timedelta
+from calibre.utils.date import parse_date
+import re
+
+
+class eenadu_ap(BasicNewsRecipe):
+    title = 'ఈనాడు - ఆంధ్రప్రదేశ్'
+    __author__ = 'unkn0wn'
+    description = 'THE LARGEST CIRCULATED TELUGU DAILY'
+    language = 'te'
+    use_embedded_content = False
+    remove_javascript = True
+    no_stylesheets = True
+    remove_attributes = ['height', 'width', 'style']
+    ignore_duplicate_articles = {'url', 'title'}
+    masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png'
+    cover_url = 'https://d66zsp32hue2v.cloudfront.net/Eenadu/2022/08/03/CAN/5_01/bfff5654_01_mr.jpg'
+    encoding = 'utf-8'
+    remove_empty_feeds = True
+    extra_css = '.pub-t{font-size:small; font-style:italic;}'
+
+    keep_only_tags = [
+        dict(name='h1'),
+        dict(**classes('pub-t')),
+        classes('fullstory text-justify contlist-cont'),
+        dict(name='span', attrs={'id': 'PDSAIApbreak'}),
+    ]
+
+    remove_tags = [
+        dict(name='span', attrs={'style': 'float:left; margin-right:10px;'}),
+        dict(
+            name='p',
+            attrs={
+                'style':
+                'font-size: 18px !important; margin: 0px; margin-top: -15px; text-align: center;flex: 1;'
+            }
+        ),
+        dict(name='aside', attrs={'class': lambda x: x and x.startswith('thumb')}),
+        dict(name='br'),
+        classes('sshare-c tags andbeyond_ad fnt20 arti more2 offset-tb1 msb-list')
+    ]
+
+    def parse_index(self):
+        section_list = [
+            ('ఆంధ్రప్రదేశ్ తాజా వార్తలు', 'andhra-pradesh'),
+            ('సంపాదకీయం', 'andhra-pradesh/editorial'),
+            ('ఆంధ్రప్రదేశ్ ప్రధానాంశాలు', 'andhra-pradesh/top-news'),
+            ('ఆంధ్రప్రదేశ్ జిల్లా వార్తలు', 'andhra-pradesh/districts'),
+            # ('క్రైమ్', 'crime'),
+            ('పాలిటిక్స్', 'politics'),
+            ('జాతీయం', 'india'),
+            ('బిజినెస్', 'business'),
+            ('అంతర్జాతీయం', 'world'),
+            ('క్రీడలు', 'sports'),
+            # ('సినిమా', 'movies'),
+            # ('చదువు', 'education'),
+            # ('సుఖీభవ', 'health'),
+            # ('ఈ-నాడు', 'technology'),
+            # ('మకరందం', 'devotional'),
+            # ('ఈ తరం', 'youth'),
+            # ('ఆహా', 'recipes'),
+            # ('హాయ్ బుజ్జీ', 'kids-stories'),
+            # ('స్థిరాస్తి', 'real-estate'),
+        ]
+        is_sunday = date.today().weekday() == 6
+        if is_sunday:
+            section_list.append(('సండే మ్యాగజైన్', 'sunday-magazine'))
+        feeds = []
+
+        # For each section title, fetch the article urls
+        for section in section_list:
+            section_title = section[0]
+            section_url = 'https://www.eenadu.net/' + section[1]
+            self.log(section_title, section_url)
+            soup = self.index_to_soup(section_url)
+            articles = self.articles_from_soup(soup)
+            if articles:
+                feeds.append((section_title, articles))
+        return feeds
+
+    def articles_from_soup(self, soup):
+        ans = []
+        for link in soup.findAll(
+            attrs={
+                'class': ['telugu_uni_body', 'thumb-content-more', 'owl-carousel']
+            }
+        ):
+            for a in link.findAll('a', attrs={'href': True}):
+                url = a['href']
+                if url.startswith('https') is False:
+                    url = 'https://www.eenadu.net/' + url
+
+                try:
+                    desc = self.tag_to_string(a.find('div')).strip()
+                except Exception:
+                    desc = ''
+
+                for h3 in a.findAll('h3'):
+                    title = self.tag_to_string(h3).strip()
+                    sub = re.escape(title)
+                    desc = re.sub(sub, '', desc).strip()
+
+                    if not title or not url:
+                        continue
+
+                    self.log('\t', title, '\n\t', desc, '\n\t\t', url)
+                    ans.append({'title': title, 'url': url, 'description': desc})
+        return ans
+
+    def preprocess_html(self, soup):
+        div = soup.find('div', **classes('pub-t'))
+        if div:
+            date = parse_date(
+                self.tag_to_string(div).strip().replace('Published : ', '').replace(
+                    'Updated : ', ''
+                ).replace(' IST', ':00.000001')
+            ).replace(tzinfo=None)
+            today = datetime.now()
+            if (today - date) > timedelta(1.15):
+                self.abort_article('Skipping old article')
+        else:
+            self.abort_article('not an article')
+        for img in soup.findAll('img', attrs={'data-src': True}):
+            img['src'] = img['data-src']
+        return soup
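Either recipe can be exercised from the command line before it ships; calibre's documented --test flag for recipe input limits the run to a couple of articles per feed (the recipe filename here is the one added in this diff):

ebook-convert eenadu_ap.recipe .epub --test -vv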