diff --git a/recipes/eenadu.recipe b/recipes/eenadu.recipe
index cebec9e5a2..9df538e8cb 100644
--- a/recipes/eenadu.recipe
+++ b/recipes/eenadu.recipe
@@ -1,7 +1,4 @@
-import re
-from datetime import date, datetime, timedelta
-
-from calibre.utils.date import parse_date
+from urllib.parse import quote

 from calibre.web.feeds.news import BasicNewsRecipe, classes

@@ -10,118 +7,116 @@ class eenadu_ts(BasicNewsRecipe):
     __author__ = 'unkn0wn'
     description = 'THE LARGEST CIRCULATED TELUGU DAILY'
     language = 'te'
-    use_embedded_content = False
-    remove_javascript = True
-    no_stylesheets = True
-    remove_attributes = ['height', 'width', 'style']
-    ignore_duplicate_articles = {'url', 'title'}
-    masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png'
-    cover_url = 'https://d66zsp32hue2v.cloudfront.net/Eenadu/2022/08/08/GTH/5_01/d5041804_01_mr.jpg'
     encoding = 'utf-8'
+    no_stylesheets = True
+    remove_javascript = True
+    masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png'
+    remove_attributes = ['style', 'height', 'width']
+    ignore_duplicate_articles = {'url', 'title'}
+    reverse_article_order = True
     remove_empty_feeds = True
-    extra_css = '.pub-t{font-size:small; font-style:italic;}'
+    simultaneous_downloads = 1
+    art_url = ''

-    keep_only_tags = [
-        dict(name='h1'),
-        dict(**classes('pub-t')),
-        classes('fullstory text-justify contlist-cont'),
-        dict(name='span', attrs={'id': 'PDSAIApbreak'}),
-    ]
+    extra_css = '''
+        img {display:block; margin:0 auto;}
+        blockquote, em {color:#202020;}
+        .pub-t{font-size:small; font-style:italic;}
+    '''

-    remove_tags = [
-        dict(name='span', attrs={'style': 'float:left; margin-right:10px;'}),
-        dict(
-            name='p',
-            attrs={
-                'style':
-                'font-size: 18px !important; margin: 0px; margin-top: -15px; text-align: center;flex: 1;'
-            }
-        ),
-        dict(name='aside', attrs={'class': lambda x: x and x.startswith('thumb')}),
-        dict(name='br'),
-        classes('sshare-c tags andbeyond_ad fnt20 arti more2 offset-tb1 msb-list')
-    ]
+    keep_only_tags = [classes('bookWrapper fullstory')]
+    remove_tags = [classes('ext-link offset-tb1 sshare-c')]

-    def parse_index(self):
-        section_list = [
-            ('తెలంగాణ తాజా వార్తలు', 'telangana'),
-            ('సంపాదకీయం', 'telangana/editorial'),
-            ('తెలంగాణ ప్రధానాంశాలు', 'telangana/top-news'),
-            ('తెలంగాణ జిల్లా వార్తలు', 'telangana/districts'),
-            # ('క్రైమ్', 'crime'),
+    articles_are_obfuscated = True
+
+    # Feed entries are Google News redirect pages: fetch the page, pull out
+    # the real eenadu.net link and download that instead.
+    def get_obfuscated_article(self, url):
+        br = self.get_browser()
+        soup = self.index_to_soup(url)
+        link = soup.a['href']
+        skip_sections = [  # add sections you want to skip
+            '/video/', '/videos/', '/multimedia/', 'marathi', 'hindi', 'bangla'
+        ]
+        if any(x in link for x in skip_sections):
+            self.abort_article('skipping video links')
+        self.log('Found ', link)
+        self.art_url = link
+        html = br.open(link).read()
+        return {'data': html, 'url': link}
+
+    resolve_internal_links = True
+    remove_empty_feeds = True
+
+    def get_cover_url(self):
+        import json
+        from datetime import date
+        today = quote(date.today().strftime('%d/%m/%Y'), safe='')
+        raw = self.index_to_soup(
+            'https://epaper.eenadu.net/Home/GetAllpages?editionid=1&editiondate=' + today, raw=True
+        )
+        for cov in json.loads(raw):
+            if cov['NewsProPageTitle'].lower().startswith('front'):
+                return cov['HighResolution']
+
+    feeds = []
+
+    when = '27'  # hours; Google News only returns articles newer than this
+    index = 'https://www.eenadu.net'
+    # Google News RSS search: first {} is the time window in hours, second {}
+    # is the URL-encoded section prefix that results must match.
+    a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=te-IN&gl=IN&ceid=IN:te'
+
+    news = index + '/telugu-news/'
+    news_list = [
+        ('తెలంగాణ ప్రధానాంశాలు', 'ts-top-news'),
+        ('సంపాదకీయం', 'editorial'),
+        ('వ్యాఖ్యానం', 'vyakyanam'),
+        ('హైదరాబాద్ జిల్లా వార్తలు', 'districts/Hyderabad'),
+        ('క్రైమ్', 'crime'),
         ('పాలిటిక్స్', 'politics'),
         ('జాతీయం', 'india'),
         ('బిజినెస్', 'business'),
         ('అంతర్జాతీయం', 'world'),
         ('క్రీడలు', 'sports'),
-            # ('సినిమా', 'movies'),
-            # ('చదువు', 'education'),
-            # ('సుఖీభవ', 'health'),
-            # ('ఈ-నాడు', 'technology'),
-            # ('మకరందం', 'devotional'),
-            # ('ఈ తరం', 'youth'),
-            # ('ఆహా', 'recipes'),
-            # ('హాయ్ బుజ్జీ', 'kids-stories'),
-            # ('స్థిరాస్తి', 'real-estate'),
+        ('సినిమా', 'movies'),
+        ('వసుంధర', 'women'),
+        ('ఈ-నాడు', 'technology'),
+        ('వెబ్ ప్రత్యేకం', 'explained')
     ]
-        is_sunday = date.today().weekday() == 6
-        if is_sunday:
-            section_list.append(('సండే మ్యాగజైన్', 'sunday-magazine'))
-        feeds = []
+    for n in news_list:
+        news_index = news + n[1] + '/'
+        feeds.append((n[0], a.format(when, quote(news_index, safe=''))))
+    feeds.append(('Other News', a.format(when, quote(news, safe=''))))

-        # For each section title, fetch the article urls
-        for section in section_list:
-            section_title = section[0]
-            section_url = 'https://www.eenadu.net/' + section[1]
-            self.log(section_title, section_url)
-            soup = self.index_to_soup(section_url)
-            articles = self.articles_from_soup(soup)
-            if articles:
-                feeds.append((section_title, articles))
-        return feeds

-    def articles_from_soup(self, soup):
-        ans = []
-        for link in soup.findAll(
-            attrs={
-                'class': ['telugu_uni_body', 'thumb-content-more', 'owl-carousel']
-            }
-        ):
-            for a in link.findAll('a', attrs={'href': True}):
-                url = a['href']
-                if url.startswith('https') is False:
-                    url = 'https://www.eenadu.net/' + url
+    art = index + '/telugu-article/'
+    art_list = [
+        ('చదువు', 'education'),
+        ('సుఖీభవ', 'health'),
+        ('ఆహా', 'recipes'),
+        ('హాయ్ బుజ్జీ', 'kids-stories'),
+        ('మకరందం', 'devotional'),
+        ('దేవతార్చన', 'temples'),
+        ('స్థిరాస్తి', 'real-estate'),
+        ('కథామృతం', 'kathalu'),
+        ('సండే మ్యాగజైన్', 'sunday-magazine')
+    ]
+    for x in art_list:
+        art_index = art + x[1] + '/'
+        feeds.append((x[0], a.format(when, quote(art_index, safe=''))))
+    feeds.append(('Other Articles', a.format(when, quote(art, safe=''))))

-                try:
-                    desc = self.tag_to_string(a.find('div')).strip()
-                except Exception:
-                    desc = ''
+    feeds.append(('ఇతరులు', a.format(when, quote(index, safe=''))))
+    feeds.append(('ప్రతిభ', a.format(when, 'https://pratibha.eenadu.net/')))

-                for h3 in a.findAll('h3'):
-                    title = self.tag_to_string(h3).strip()
-                    sub = re.escape(title)
-                    desc = re.sub(sub, '', desc).strip()
+    def populate_article_metadata(self, article, soup, first):
+        article.url = self.art_url
+        article.title = article.title.replace(' - Eenadu', '')
+        desc = soup.find(attrs={'class': 'srtdes'})
+        if desc:
+            article.summary = self.tag_to_string(desc)
+            article.text_summary = article.summary

-                if not title or not url:
-                    continue
-
-                self.log('\t', title, '\n\t', desc, '\n\t\t', url)
-                ans.append({'title': title, 'url': url, 'description': desc})
-        return ans
-
-    def preprocess_html(self, soup):
-        div = soup.find('div', **classes('pub-t'))
-        if div:
-            date = parse_date(
-                self.tag_to_string(div).strip().replace('Published : ', '').replace(
-                    'Updated : ', ''
-                ).replace(' IST', ':00.000001')
-            ).replace(tzinfo=None)
-            today = datetime.now()
-            if (today - date) > timedelta(1.15):
-                self.abort_article('Skipping old article')
-        else:
-            self.abort_article('not an article')
-        for img in soup.findAll('img', attrs={'data-src': True}):
-            img['src'] = img['data-src']
-        return soup
+    def preprocess_raw_html(self, raw, *a):
+        import re
+        # Trim the page to the markup between the full-story delimiter
+        # comments; [^~] matches any character, newlines included.
+        if '<!--Top Full Story Start -->' in raw:
+            body = re.search(r'<!--Top Full Story Start -->([^~]+?)<!--Top Full Story End -->', raw)
+            return '<html><body>' + body.group(1) + '</body></html>'
+        return raw
+        return raw
diff --git a/recipes/eenadu_ap.recipe b/recipes/eenadu_ap.recipe
index 4c7d8d0a2e..43738caf90 100644
--- a/recipes/eenadu_ap.recipe
+++ b/recipes/eenadu_ap.recipe
@@ -1,7 +1,4 @@
-import re
-from datetime import date, datetime, timedelta
-
-from calibre.utils.date import parse_date
+from urllib.parse import quote

 from calibre.web.feeds.news import BasicNewsRecipe, classes

@@ -10,137 +7,116 @@ class eenadu_ap(BasicNewsRecipe):
     __author__ = 'unkn0wn'
     description = 'THE LARGEST CIRCULATED TELUGU DAILY'
     language = 'te'
-    use_embedded_content = False
-    remove_javascript = True
-    no_stylesheets = True
-    remove_attributes = ['height', 'width', 'style']
-    ignore_duplicate_articles = {'url', 'title'}
-    masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png'
-    cover_url = 'https://d66zsp32hue2v.cloudfront.net/Eenadu/2022/08/03/CAN/5_01/bfff5654_01_mr.jpg'
     encoding = 'utf-8'
+    no_stylesheets = True
+    remove_javascript = True
+    masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png'
+    remove_attributes = ['style', 'height', 'width']
+    ignore_duplicate_articles = {'url', 'title'}
+    reverse_article_order = True
     remove_empty_feeds = True
-    extra_css = '.pub-t{font-size:small; font-style:italic;}'
+    simultaneous_downloads = 1
+    art_url = ''

-    keep_only_tags = [
-        dict(name='h1'),
-        dict(**classes('pub-t')),
-        classes('fullstory text-justify contlist-cont'),
-        dict(name='span', attrs={'id': 'PDSAIApbreak'}),
-    ]
+    extra_css = '''
+        img {display:block; margin:0 auto;}
+        blockquote, em {color:#202020;}
+        .pub-t{font-size:small; font-style:italic;}
+    '''

-    remove_tags = [
-        dict(name='span', attrs={'style': 'float:left; margin-right:10px;'}),
-        dict(
-            name='p',
-            attrs={
-                'style':
-                'font-size: 18px !important; margin: 0px; margin-top: -15px; text-align: center;flex: 1;'
-            }
-        ),
-        dict(name='aside', attrs={'class': lambda x: x and x.startswith('thumb')}),
-        dict(name='br'),
-        classes('sshare-c tags andbeyond_ad fnt20 arti more2 offset-tb1 msb-list')
-    ]
+    keep_only_tags = [classes('bookWrapper fullstory')]
+    remove_tags = [classes('ext-link offset-tb1 sshare-c')]
+
+    articles_are_obfuscated = True
+
+    # Feed entries are Google News redirect pages: fetch the page, pull out
+    # the real eenadu.net link and download that instead.
+    def get_obfuscated_article(self, url):
+        br = self.get_browser()
+        soup = self.index_to_soup(url)
+        link = soup.a['href']
+        skip_sections = [  # add sections you want to skip
+            '/video/', '/videos/', '/multimedia/', 'marathi', 'hindi', 'bangla'
+        ]
+        if any(x in link for x in skip_sections):
+            self.abort_article('skipping video links')
+        self.log('Found ', link)
+        self.art_url = link
+        html = br.open(link).read()
+        return {'data': html, 'url': link}
+
+    resolve_internal_links = True
+    remove_empty_feeds = True

     def get_cover_url(self):
+        import json
         from datetime import date
-        cover = 'https://img.kiosko.net/' + str(
-            date.today().year
-        ) + '/' + date.today().strftime('%m') + '/' + date.today(
-        ).strftime('%d') + '/in/eenadu.750.jpg'
-        br = BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)
-        try:
-            br.open(cover)
-        except:
-            index = 'https://es.kiosko.net/in/np/eenadu.html'
-            soup = self.index_to_soup(index)
-            for image in soup.findAll('img', src=True):
-                if image['src'].endswith('750.jpg'):
-                    return 'https:' + image['src']
-            self.log("\nCover unavailable")
-            cover = None
-        return cover
+        today = quote(date.today().strftime('%d/%m/%Y'), safe='')
+        raw = self.index_to_soup(
+            'https://epaper.eenadu.net/Home/GetAllpages?editionid=2&editiondate=' + today, raw=True
+        )
+        for cov in json.loads(raw):
+            if cov['NewsProPageTitle'].lower().startswith('front'):
+                return cov['HighResolution']

-    def parse_index(self):
-        section_list = [
-            ('ఆంధ్రప్రదేశ్ తాజా వార్తలు', 'andhra-pradesh'),
-            ('సంపాదకీయం', 'andhra-pradesh/editorial'),
-            ('ఆంధ్రప్రదేశ్ ప్రధానాంశాలు', 'andhra-pradesh/top-news'),
-            ('ఆంధ్రప్రదేశ్ జిల్లా వార్తలు', 'andhra-pradesh/districts'),
-            # ('క్రైమ్', 'crime'),
+    feeds = []
+
+    when = '27'  # hours; Google News only returns articles newer than this
+    index = 'https://www.eenadu.net'
+    # Google News RSS search: first {} is the time window in hours, second {}
+    # is the URL-encoded section prefix that results must match.
+    a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=te-IN&gl=IN&ceid=IN:te'
+
+    news = index + '/telugu-news/'
+    news_list = [
+        ('ఆంధ్రప్రదేశ్ ప్రధానాంశాలు', 'ap-top-news'),
+        ('సంపాదకీయం', 'editorial'),
+        ('వ్యాఖ్యానం', 'vyakyanam'),
+        ('విశాఖపట్నం జిల్లా వార్తలు', 'districts/Visakhapatnam'),
+        ('క్రైమ్', 'crime'),
         ('పాలిటిక్స్', 'politics'),
         ('జాతీయం', 'india'),
         ('బిజినెస్', 'business'),
         ('అంతర్జాతీయం', 'world'),
         ('క్రీడలు', 'sports'),
-            # ('సినిమా', 'movies'),
-            # ('చదువు', 'education'),
-            # ('సుఖీభవ', 'health'),
-            # ('ఈ-నాడు', 'technology'),
-            # ('మకరందం', 'devotional'),
-            # ('ఈ తరం', 'youth'),
-            # ('ఆహా', 'recipes'),
-            # ('హాయ్ బుజ్జీ', 'kids-stories'),
-            # ('స్థిరాస్తి', 'real-estate'),
+        ('సినిమా', 'movies'),
+        ('వసుంధర', 'women'),
+        ('ఈ-నాడు', 'technology'),
+        ('వెబ్ ప్రత్యేకం', 'explained')
     ]
-        is_sunday = date.today().weekday() == 6
-        if is_sunday:
-            section_list.append(('సండే మ్యాగజైన్', 'sunday-magazine'))
-        feeds = []
+    for n in news_list:
+        news_index = news + n[1] + '/'
+        feeds.append((n[0], a.format(when, quote(news_index, safe=''))))
+    feeds.append(('Other News', a.format(when, quote(news, safe=''))))

-        # For each section title, fetch the article urls
-        for section in section_list:
-            section_title = section[0]
-            section_url = 'https://www.eenadu.net/' + section[1]
-            self.log(section_title, section_url)
-            soup = self.index_to_soup(section_url)
-            articles = self.articles_from_soup(soup)
-            if articles:
-                feeds.append((section_title, articles))
-        return feeds

-    def articles_from_soup(self, soup):
-        ans = []
-        for link in soup.findAll(
-            attrs={
-                'class': ['telugu_uni_body', 'thumb-content-more', 'owl-carousel']
-            }
-        ):
-            for a in link.findAll('a', attrs={'href': True}):
-                url = a['href']
-                if url.startswith('https') is False:
-                    url = 'https://www.eenadu.net/' + url
+    art = index + '/telugu-article/'
+    art_list = [
+        ('చదువు', 'education'),
+        ('సుఖీభవ', 'health'),
+        ('ఆహా', 'recipes'),
+        ('హాయ్ బుజ్జీ', 'kids-stories'),
+        ('మకరందం', 'devotional'),
+        ('దేవతార్చన', 'temples'),
+        ('స్థిరాస్తి', 'real-estate'),
+        ('కథామృతం', 'kathalu'),
+        ('సండే మ్యాగజైన్', 'sunday-magazine')
+    ]
+    for x in art_list:
+        art_index = art + x[1] + '/'
+        feeds.append((x[0], a.format(when, quote(art_index, safe=''))))
+    feeds.append(('Other Articles', a.format(when, quote(art, safe=''))))

-                try:
-                    desc = self.tag_to_string(a.find('div')).strip()
-                except Exception:
-                    desc = ''
+    feeds.append(('ఇతరులు', a.format(when, quote(index, safe=''))))
+    feeds.append(('ప్రతిభ', a.format(when, 'https://pratibha.eenadu.net/')))

-                for h3 in a.findAll('h3'):
-                    title = self.tag_to_string(h3).strip()
-                    sub = re.escape(title)
-                    desc = re.sub(sub, '', desc).strip()
+    def populate_article_metadata(self, article, soup, first):
+        article.url = self.art_url
+        article.title = article.title.replace(' - Eenadu', '')
+        desc = soup.find(attrs={'class': 'srtdes'})
+        if desc:
+            article.summary = self.tag_to_string(desc)
+            article.text_summary = article.summary

-                if not title or not url:
-                    continue
-
-                self.log('\t', title, '\n\t', desc, '\n\t\t', url)
-                ans.append({'title': title, 'url': url, 'description': desc})
-        return ans
-
-    def preprocess_html(self, soup):
-        div = soup.find('div', **classes('pub-t'))
-        if div:
-            date = parse_date(
-                self.tag_to_string(div).strip().replace('Published : ', '').replace(
-                    'Updated : ', ''
-                ).replace(' IST', ':00.000001')
-            ).replace(tzinfo=None)
-            today = datetime.now()
-            if (today - date) > timedelta(1.15):
-                self.abort_article('Skipping old article')
-        else:
-            self.abort_article('not an article')
-        for img in soup.findAll('img', attrs={'data-src': True}):
-            img['src'] = img['data-src']
-        return soup
+    def preprocess_raw_html(self, raw, *a):
+        import re
+        # Trim the page to the markup between the full-story delimiter
+        # comments; [^~] matches any character, newlines included.
+        if '<!--Top Full Story Start -->' in raw:
+            body = re.search(r'<!--Top Full Story Start -->([^~]+?)<!--Top Full Story End -->', raw)
+            return '<html><body>' + body.group(1) + '</body></html>'
+        return raw