From 165f5ccff456b4647ef9935c6d819a0809411404 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 14 Apr 2024 19:27:35 +0530
Subject: [PATCH 1/3] Update NatGeo

---
 recipes/natgeo.recipe    | 4 ++--
 recipes/natgeohis.recipe | 4 ++--
 recipes/natgeomag.recipe | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe
index 69ceeef02c..cdcc896e42 100644
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@@ -54,7 +54,7 @@ def parse_inline(inl):
             yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
         if 'caption' in props:
             yield '<div class="cap">{}{}</div>'.format(
-                props['caption']['text'], ' ' + props['caption']['credit']
+                props['caption'].get('text', ''), ' ' + props['caption'].get('credit', '')
             )
         yield '</p>'
     if inl.get('content', {}).get('name', '') == 'ImageGroup':
@@ -65,7 +65,7 @@ def parse_inline(inl):
                     yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
                 if 'caption' in imgs:
                     yield '<div class="cap">{}{}</div>'.format(
-                        imgs['caption']['text'], ' ' + imgs['caption']['credit']
+                        imgs['caption'].get('text', ''), ' ' + imgs['caption'].get('credit', '')
                     )
                 yield '</p>'

diff --git a/recipes/natgeohis.recipe b/recipes/natgeohis.recipe
index 1009102203..0bf60aa91c 100644
--- a/recipes/natgeohis.recipe
+++ b/recipes/natgeohis.recipe
@@ -53,7 +53,7 @@ def parse_inline(inl):
             yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
         if 'caption' in props:
             yield '<div class="cap">{}{}</div>'.format(
-                props['caption']['text'], ' ' + props['caption']['credit']
+                props['caption'].get('text', ''), ' ' + props['caption'].get('credit', '')
             )
         yield '</p>'
     if inl.get('content', {}).get('name', '') == 'ImageGroup':
@@ -64,7 +64,7 @@ def parse_inline(inl):
                     yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
                 if 'caption' in imgs:
                     yield '<div class="cap">{}{}</div>'.format(
-                        imgs['caption']['text'], ' ' + imgs['caption']['credit']
+                        imgs['caption'].get('text', ''), ' ' + imgs['caption'].get('credit', '')
                     )
                 yield '</p>'

diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe
index 1df5c71618..01b3fd95d3 100644
--- a/recipes/natgeomag.recipe
+++ b/recipes/natgeomag.recipe
@@ -58,7 +58,7 @@ def parse_inline(inl):
             yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
         if 'caption' in props:
             yield '<div class="cap">{}{}</div>'.format(
-                props['caption']['text'], ' ' + props['caption']['credit']
+                props['caption'].get('text', ''), ' ' + props['caption'].get('credit', '')
             )
         yield '</p>'
     if inl.get('content', {}).get('name', '') == 'ImageGroup':
@@ -69,7 +69,7 @@ def parse_inline(inl):
                     yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
                 if 'caption' in imgs:
                     yield '<div class="cap">{}{}</div>'.format(
-                        imgs['caption']['text'], ' ' + imgs['caption']['credit']
+                        imgs['caption'].get('text', ''), ' ' + imgs['caption'].get('credit', '')
                     )
                 yield '</p>'

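Note on the NatGeo change above: caption objects in the article JSON do not always carry both 'text' and 'credit', and plain indexing raised KeyError for such captions, aborting the whole article. dict.get() with an empty-string default renders whatever is present. A standalone sketch of the difference (the caption data is invented for illustration):

    # Why dict.get() with a default is safer than indexing here.
    captions = [
        {'text': 'A lion at dusk', 'credit': 'J. Doe'},
        {'text': 'Serengeti plains'},  # no 'credit' key
        {'credit': 'J. Roe'},          # no 'text' key
    ]
    for cap in captions:
        # cap['text'] would raise KeyError on the last entry;
        # .get() substitutes '' and the caption still renders.
        print('{}{}'.format(cap.get('text', ''), ' ' + cap.get('credit', '')).strip())
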
From 4aa11d3b3e7d5ebc69ec7204718799755a8e2c20 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 14 Apr 2024 19:28:01 +0530
Subject: [PATCH 2/3] update ORFonline

---
 recipes/observer_reach_foundation.recipe | 78 ++++++++++++++++--------
 1 file changed, 51 insertions(+), 27 deletions(-)

diff --git a/recipes/observer_reach_foundation.recipe b/recipes/observer_reach_foundation.recipe
index a2283d154d..f844708d62 100644
--- a/recipes/observer_reach_foundation.recipe
+++ b/recipes/observer_reach_foundation.recipe
@@ -1,8 +1,9 @@
+from urllib.parse import quote
 from calibre.web.feeds.news import BasicNewsRecipe, classes


 class ORF(BasicNewsRecipe):
-    title = u'Observer Research Foundation'
+    title = 'Observer Research Foundation'
     description = (
         'Set up in 1990, ORF seeks to lead and aid policy thinking towards building a strong and prosperous India'
         ' in a fair and equitable world. It helps discover and inform India’s choices, and carries Indian voices '
@@ -10,52 +11,75 @@ class ORF(BasicNewsRecipe):
     )
     language = 'en_IN'
     __author__ = 'unkn0wn'
-    oldest_article = 7.5  # days
-    max_articles_per_feed = 25
     encoding = 'utf-8'
+    no_stylesheets = True
+    remove_javascript = True
     masthead_url = 'https://www.orfonline.org/wp-content/uploads/2015/09/Logo_ORF_JPEG.jpg'
     remove_attributes = ['style', 'height', 'width']
-    ignore_duplicate_articles = {'url'}
+    ignore_duplicate_articles = {'url', 'title'}
+    reverse_article_order = True
+    remove_empty_feeds = True
+    simultaneous_downloads = 1
+    art_url = ''

     extra_css = '''
-        .report-slider {font-size:small; color:#404040;}
+        img {display:block; margin:0 auto;}
+        .report-slider, .author_panel {font-size:small; color:#404040;}
         .report {font-size:small; font-weight:bold;}
         .excert-italic, .recent-block-people {font-style:italic; color:#202020;}
         blockquote, em {color:#202020;}
+        .espert_speak_panel {font-size:small;}
     '''

+    articles_are_obfuscated = True
+
+    def get_obfuscated_article(self, url):
+        br = self.get_browser()
+        soup = self.index_to_soup(url)
+        link = soup.a['href']
+        skip_sections = [  # add sections you want to skip
+            '/video/', '/videos/', '/multimedia/', 'marathi', 'hindi', 'bangla'
+        ]
+        if any(x in link for x in skip_sections):
+            self.abort_article('skipping video links ' + link)
+        self.log('Found ', link)
+        self.art_url = link
+        html = br.open(link).read()
+        return ({ 'data': html, 'url': link })
+
     def get_browser(self):
         return BasicNewsRecipe.get_browser(self, user_agent='common_words/based')

     resolve_internal_links = True
     remove_empty_feeds = True

-    keep_only_tags = [classes('recent-updates-block recent-block-people')]
+    keep_only_tags = [
+        dict(name='h1'),
+        classes('author_panel espert_speak_panel expert_panel_content')
+    ]
     remove_tags = [
         classes(
             'social socialshare comment-area-section telegramhtml post-tag '
-            'research-prev research-next'
+            'research-prev research-next col-md-4 button_group sharethis-p tags'
         )
     ]

-    feeds = [
-        ('Commentaries', 'https://www.orfonline.org/content-type/commentary/feed/'),
-        ('Expert Speak', 'https://www.orfonline.org/expert-speak/feed/'),
-        ('Books and Monographs', 'https://www.orfonline.org/content-type/books/feed/'),
-        ('Event Reports', 'https://www.orfonline.org/content-type/event-reports/feed/'),
-        ('Events', 'https://www.orfonline.org/content-type/events/feed/'),
-        ('Forums', 'https://www.orfonline.org/content-type/forums/feed/'),
-        ('GP-ORF Series', 'https://www.orfonline.org/content-type/gp-orf-series/feed/'),
-        ('Issue Briefs & Special Reports',
-         'https://www.orfonline.org/content-type/issue-brief/feed/'),
-        ('Monitors', 'https://www.orfonline.org/content-type/monitors/feed/'),
-        ('Occasional Papers', 'https://www.orfonline.org/content-type/occasional-paper/feed/'),
-        ('Primer', 'https://www.orfonline.org/content-type/primer/feed/'),
-        ('Series', 'https://www.orfonline.org/content-type/series/feed/'),
-        ('Surveys & Polls', 'https://www.orfonline.org/content-type/surveys-polls/feed/'),
-        ('Young Voices', 'https://www.orfonline.org/content-type/young-voices/feed/'),
-    ]
+    feeds = []

-    def print_version(self, url):
-        if 'marathi' in url or 'hindi' in url or 'bangla' in url:
-            return ''
-        return url
+    when = '170'  # hours; just over 7 days
+    index = 'https://www.orfonline.org'
+
+    sections = [
+        'expert-speak', 'books', 'event-reports', 'events', 'forums', 'gp-orf-series', 'issue-brief', 'monitors',
+        'occasional-paper', 'primer', 'series', 'surveys-polls', 'young-voices', 'research'
+    ]
+    a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=en-IN&gl=IN&ceid=IN:en'
+    for sec in sections:
+        sec_index = index + '/' + sec + '/'
+        feeds.append((sec.capitalize(), a.format(when, quote(sec_index, safe=''))))
+    feeds.append(('Others', a.format(when, quote(index, safe=''))))
+
+    def populate_article_metadata(self, article, soup, first):
+        article.url = self.art_url
+        article.title = article.title.replace(' - Observer Research Foundation', '')

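How the rewritten ORF recipe finds articles: instead of polling the site's own RSS feeds, it builds Google News RSS search queries, filtering by recency (when:<hours>h) and by URL prefix (allinurl:). The prefix must be percent-encoded in full, hence quote(..., safe=''); quote's default of safe='/' would leave slashes unencoded and break the query term. Because Google News items point at news.google.com redirect pages, the recipe sets articles_are_obfuscated and, in get_obfuscated_article, fetches the redirect page, takes its first anchor as the real article URL, and stashes it in art_url so populate_article_metadata can restore it. A minimal sketch of the feed-URL construction (the section path is just an example):

    from urllib.parse import quote

    when = '170'  # hours, just over a week
    index = 'https://www.orfonline.org'
    template = ('https://news.google.com/rss/search?'
                'q=when:{}h+allinurl:{}&hl=en-IN&gl=IN&ceid=IN:en')

    url = template.format(when, quote(index + '/expert-speak/', safe=''))
    print(url)
    # ...q=when:170h+allinurl:https%3A%2F%2Fwww.orfonline.org%2Fexpert-speak%2F&hl=en-IN...
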
From 44596984370617ca0b512bef5ca1f772fcbe3c8b Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 14 Apr 2024 19:29:03 +0530
Subject: [PATCH 3/3] Update eenadu

---
 recipes/eenadu.recipe    | 201 ++++++++++++++++++------------------
 recipes/eenadu_ap.recipe | 214 +++++++++++++++++----------------------
 2 files changed, 193 insertions(+), 222 deletions(-)

diff --git a/recipes/eenadu.recipe b/recipes/eenadu.recipe
index cebec9e5a2..9df538e8cb 100644
--- a/recipes/eenadu.recipe
+++ b/recipes/eenadu.recipe
@@ -1,7 +1,4 @@
-import re
-from datetime import date, datetime, timedelta
-
-from calibre.utils.date import parse_date
+from urllib.parse import quote
 from calibre.web.feeds.news import BasicNewsRecipe, classes


@@ -10,118 +7,116 @@ class eenadu_ts(BasicNewsRecipe):
     __author__ = 'unkn0wn'
     description = 'THE LARGEST CIRCULATED TELUGU DAILY'
     language = 'te'
-    use_embedded_content = False
-    remove_javascript = True
-    no_stylesheets = True
-    remove_attributes = ['height', 'width', 'style']
-    ignore_duplicate_articles = {'url', 'title'}
-    masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png'
-    cover_url = 'https://d66zsp32hue2v.cloudfront.net/Eenadu/2022/08/08/GTH/5_01/d5041804_01_mr.jpg'
     encoding = 'utf-8'
+    no_stylesheets = True
+    remove_javascript = True
+    masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png'
+    remove_attributes = ['style', 'height', 'width']
+    ignore_duplicate_articles = {'url', 'title'}
+    reverse_article_order = True
     remove_empty_feeds = True
-    extra_css = '.pub-t{font-size:small; font-style:italic;}'
+    simultaneous_downloads = 1
+    art_url = ''

-    keep_only_tags = [
-        dict(name='h1'),
-        dict(**classes('pub-t')),
-        classes('fullstory text-justify contlist-cont'),
-        dict(name='span', attrs={'id': 'PDSAIApbreak'}),
-    ]
+    extra_css = '''
+        img {display:block; margin:0 auto;}
+        blockquote, em {color:#202020;}
+        .pub-t{font-size:small; font-style:italic;}
+    '''

-    remove_tags = [
-        dict(name='span', attrs={'style': 'float:left; margin-right:10px;'}),
-        dict(
-            name='p',
-            attrs={
-                'style':
-                'font-size: 18px !important; margin: 0px; margin-top: -15px; text-align: center;flex: 1;'
-            }
-        ),
-        dict(name='aside', attrs={'class': lambda x: x and x.startswith('thumb')}),
-        dict(name='br'),
-        classes('sshare-c tags andbeyond_ad fnt20 arti more2 offset-tb1 msb-list')
-    ]
+    keep_only_tags = [classes('bookWrapper fullstory')]
+    remove_tags = [classes('ext-link offset-tb1 sshare-c')]

-    def parse_index(self):
-        section_list = [
-            ('తెలంగాణ తాజా వార్తలు', 'telangana'),
-            ('సంపాదకీయం', 'telangana/editorial'),
-            ('తెలంగాణ ప్రధానాంశాలు', 'telangana/top-news'),
-            ('తెలంగాణ జిల్లా వార్తలు', 'telangana/districts'),
-            # ('క్రైమ్', 'crime'),
+    articles_are_obfuscated = True
+
+    def get_obfuscated_article(self, url):
+        br = self.get_browser()
+        soup = self.index_to_soup(url)
+        link = soup.a['href']
+        skip_sections = [  # add sections you want to skip
+            '/video/', '/videos/', '/multimedia/', 'marathi', 'hindi', 'bangla'
+        ]
+        if any(x in link for x in skip_sections):
+            self.abort_article('skipping video links')
+        self.log('Found ', link)
+        self.art_url = link
+        html = br.open(link).read()
+        return ({ 'data': html, 'url': link })
+
+    resolve_internal_links = True
+    remove_empty_feeds = True
+
+    def get_cover_url(self):
+        import json
+        from datetime import date
+        today = quote(date.today().strftime('%d/%m/%Y'), safe='')
+        raw = self.index_to_soup(
+            'https://epaper.eenadu.net/Home/GetAllpages?editionid=1&editiondate=' + today, raw=True
+        )
+        for cov in json.loads(raw):
+            if cov['NewsProPageTitle'].lower().startswith('front'):
+                return cov['HighResolution']
+
+    feeds = []
+
+    when = '27'  # hours
+    index = 'https://www.eenadu.net'
+    a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=te-IN&gl=IN&ceid=IN:te'
+
+    news = index + '/telugu-news/'
+    news_list = [
+        ('తెలంగాణ ప్రధానాంశాలు', 'ts-top-news'),
+        ('సంపాదకీయం', 'editorial'),
+        ('వ్యాఖ్యానం', 'vyakyanam'),
+        ('హైదరాబాద్ జిల్లా వార్తలు', 'districts/Hyderabad'),
+        ('క్రైమ్', 'crime'),
         ('పాలిటిక్స్', 'politics'),
         ('జాతీయం', 'india'),
         ('బిజినెస్', 'business'),
         ('అంతర్జాతీయం', 'world'),
         ('క్రీడలు', 'sports'),
-        # ('సినిమా', 'movies'),
-        # ('చదువు', 'education'),
-        # ('సుఖీభవ', 'health'),
-        # ('ఈ-నాడు', 'technology'),
-        # ('మకరందం', 'devotional'),
-        # ('ఈ తరం', 'youth'),
-        # ('ఆహా', 'recipes'),
-        # ('హాయ్ బుజ్జీ', 'kids-stories'),
-        # ('స్థిరాస్తి', 'real-estate'),
+        ('సినిమా', 'movies'),
+        ('వసుంధర', 'women'),
+        ('ఈ-నాడు', 'technology'),
+        ('వెబ్ ప్రత్యేకం', 'explained')
     ]
-        is_sunday = date.today().weekday() == 6
-        if is_sunday:
-            section_list.append(('సండే మ్యాగజైన్', 'sunday-magazine'))
-        feeds = []
+    for n in news_list:
+        news_index = news + n[1] + '/'
+        feeds.append((n[0], a.format(when, quote(news_index, safe=''))))
+    feeds.append(('Other News', a.format(when, quote(news, safe=''))))

-        # For each section title, fetch the article urls
-        for section in section_list:
-            section_title = section[0]
-            section_url = 'https://www.eenadu.net/' + section[1]
-            self.log(section_title, section_url)
-            soup = self.index_to_soup(section_url)
-            articles = self.articles_from_soup(soup)
-            if articles:
-                feeds.append((section_title, articles))
-        return feeds

-    def articles_from_soup(self, soup):
-        ans = []
-        for link in soup.findAll(
-            attrs={
-                'class': ['telugu_uni_body', 'thumb-content-more', 'owl-carousel']
-            }
-        ):
-            for a in link.findAll('a', attrs={'href': True}):
-                url = a['href']
-                if url.startswith('https') is False:
-                    url = 'https://www.eenadu.net/' + url
+    art = index + '/telugu-article/'
+    art_list = [
+        ('చదువు', 'education'),
+        ('సుఖీభవ', 'health'),
+        ('ఆహా', 'recipes'),
+        ('హాయ్ బుజ్జీ', 'kids-stories'),
+        ('మకరందం', 'devotional'),
+        ('దేవతార్చన', 'temples'),
+        ('స్థిరాస్తి', 'real-estate'),
+        ('కథామృతం', 'kathalu'),
+        ('సండే మ్యాగజైన్', 'sunday-magazine')
+    ]
+    for x in art_list:
+        art_index = art + x[1] + '/'
+        feeds.append((x[0], a.format(when, quote(art_index, safe=''))))
+    feeds.append(('Other Articles', a.format(when, quote(art, safe=''))))

-                try:
-                    desc = self.tag_to_string(a.find('div')).strip()
-                except Exception:
-                    desc = ''
+    feeds.append(('ఇతరులు', a.format(when, quote(index, safe=''))))
+    feeds.append(('ప్రతిభ', a.format(when, 'https://pratibha.eenadu.net/')))

-                for h3 in a.findAll('h3'):
-                    title = self.tag_to_string(h3).strip()
-                    sub = re.escape(title)
-                    desc = re.sub(sub, '', desc).strip()
+    def populate_article_metadata(self, article, soup, first):
+        article.url = self.art_url
+        article.title = article.title.replace(' - Eenadu', '')
+        desc = soup.find(attrs={'class':'srtdes'})
+        if desc:
+            article.summary = self.tag_to_string(desc)
+            article.text_summary = article.summary

-                if not title or not url:
-                    continue
-
-                self.log('\t', title, '\n\t', desc, '\n\t\t', url)
-                ans.append({'title': title, 'url': url, 'description': desc})
-        return ans
-
-    def preprocess_html(self, soup):
-        div = soup.find('div', **classes('pub-t'))
-        if div:
-            date = parse_date(
-                self.tag_to_string(div).strip().replace('Published : ', '').replace(
-                    'Updated : ', ''
-                ).replace(' IST', ':00.000001')
-            ).replace(tzinfo=None)
-            today = datetime.now()
-            if (today - date) > timedelta(1.15):
-                self.abort_article('Skipping old article')
-        else:
-            self.abort_article('not an article')
-        for img in soup.findAll('img', attrs={'data-src': True}):
-            img['src'] = img['data-src']
-        return soup
+    def preprocess_raw_html(self, raw, *a):
+        import re
+        if '<!--Top Full Story Start -->' in raw:
+            body = re.search(r'<!--Top Full Story Start -->([^~]+?)<!--Top Full Story End -->', raw)
+            return '<html><body><div>' + body.group(1) + '</div></body></html>'
+        return raw

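Both Eenadu recipes now take the cover from the e-paper backend: GetAllpages returns a JSON array of page descriptors for a given edition and date, and the recipe picks the page whose NewsProPageTitle starts with 'front'. editionid=1 above is the Telangana edition; the Andhra Pradesh recipe below asks for editionid=2. A self-contained sketch (the JSON sample is invented; the field names follow the recipe):

    import json
    from datetime import date
    from urllib.parse import quote

    today = quote(date.today().strftime('%d/%m/%Y'), safe='')  # '/' must be encoded too
    api = 'https://epaper.eenadu.net/Home/GetAllpages?editionid=1&editiondate=' + today

    # Invented stand-in for the endpoint's response:
    pages = json.loads(
        '[{"NewsProPageTitle": "Front Page", "HighResolution": "https://example.com/front.jpg"},'
        ' {"NewsProPageTitle": "Page 2", "HighResolution": "https://example.com/p2.jpg"}]'
    )
    cover = next((p['HighResolution'] for p in pages
                  if p['NewsProPageTitle'].lower().startswith('front')), None)
    print(api)
    print(cover)  # https://example.com/front.jpg
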
diff --git a/recipes/eenadu_ap.recipe b/recipes/eenadu_ap.recipe
index 4c7d8d0a2e..43738caf90 100644
--- a/recipes/eenadu_ap.recipe
+++ b/recipes/eenadu_ap.recipe
@@ -1,7 +1,4 @@
-import re
-from datetime import date, datetime, timedelta
-
-from calibre.utils.date import parse_date
+from urllib.parse import quote
 from calibre.web.feeds.news import BasicNewsRecipe, classes


@@ -10,137 +7,116 @@ class eenadu_ap(BasicNewsRecipe):
     __author__ = 'unkn0wn'
     description = 'THE LARGEST CIRCULATED TELUGU DAILY'
     language = 'te'
-    use_embedded_content = False
-    remove_javascript = True
-    no_stylesheets = True
-    remove_attributes = ['height', 'width', 'style']
-    ignore_duplicate_articles = {'url', 'title'}
-    masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png'
-    cover_url = 'https://d66zsp32hue2v.cloudfront.net/Eenadu/2022/08/03/CAN/5_01/bfff5654_01_mr.jpg'
     encoding = 'utf-8'
+    no_stylesheets = True
+    remove_javascript = True
+    masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png'
+    remove_attributes = ['style', 'height', 'width']
+    ignore_duplicate_articles = {'url', 'title'}
+    reverse_article_order = True
     remove_empty_feeds = True
-    extra_css = '.pub-t{font-size:small; font-style:italic;}'
+    simultaneous_downloads = 1
+    art_url = ''

-    keep_only_tags = [
-        dict(name='h1'),
-        dict(**classes('pub-t')),
-        classes('fullstory text-justify contlist-cont'),
-        dict(name='span', attrs={'id': 'PDSAIApbreak'}),
-    ]
+    extra_css = '''
+        img {display:block; margin:0 auto;}
+        blockquote, em {color:#202020;}
+        .pub-t{font-size:small; font-style:italic;}
+    '''

-    remove_tags = [
-        dict(name='span', attrs={'style': 'float:left; margin-right:10px;'}),
-        dict(
-            name='p',
-            attrs={
-                'style':
-                'font-size: 18px !important; margin: 0px; margin-top: -15px; text-align: center;flex: 1;'
-            }
-        ),
-        dict(name='aside', attrs={'class': lambda x: x and x.startswith('thumb')}),
-        dict(name='br'),
-        classes('sshare-c tags andbeyond_ad fnt20 arti more2 offset-tb1 msb-list')
-    ]
+    keep_only_tags = [classes('bookWrapper fullstory')]
+    remove_tags = [classes('ext-link offset-tb1 sshare-c')]
+
+    articles_are_obfuscated = True
+
+    def get_obfuscated_article(self, url):
+        br = self.get_browser()
+        soup = self.index_to_soup(url)
+        link = soup.a['href']
+        skip_sections = [  # add sections you want to skip
+            '/video/', '/videos/', '/multimedia/', 'marathi', 'hindi', 'bangla'
+        ]
+        if any(x in link for x in skip_sections):
+            self.abort_article('skipping video links')
+        self.log('Found ', link)
+        self.art_url = link
+        html = br.open(link).read()
+        return ({ 'data': html, 'url': link })
+
+    resolve_internal_links = True
+    remove_empty_feeds = True

     def get_cover_url(self):
+        import json
         from datetime import date
-        cover = 'https://img.kiosko.net/' + str(
-            date.today().year
-        ) + '/' + date.today().strftime('%m') + '/' + date.today(
-        ).strftime('%d') + '/in/eenadu.750.jpg'
-        br = BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)
-        try:
-            br.open(cover)
-        except:
-            index = 'https://es.kiosko.net/in/np/eenadu.html'
-            soup = self.index_to_soup(index)
-            for image in soup.findAll('img', src=True):
-                if image['src'].endswith('750.jpg'):
-                    return 'https:' + image['src']
-            self.log("\nCover unavailable")
-            cover = None
-        return cover
+        today = quote(date.today().strftime('%d/%m/%Y'), safe='')
+        raw = self.index_to_soup(
+            'https://epaper.eenadu.net/Home/GetAllpages?editionid=2&editiondate=' + today, raw=True
+        )
+        for cov in json.loads(raw):
+            if cov['NewsProPageTitle'].lower().startswith('front'):
+                return cov['HighResolution']

-    def parse_index(self):
-        section_list = [
-            ('ఆంధ్రప్రదేశ్ తాజా వార్తలు', 'andhra-pradesh'),
-            ('సంపాదకీయం', 'andhra-pradesh/editorial'),
-            ('ఆంధ్రప్రదేశ్ ప్రధానాంశాలు', 'andhra-pradesh/top-news'),
-            ('ఆంధ్రప్రదేశ్ జిల్లా వార్తలు', 'andhra-pradesh/districts'),
-            # ('క్రైమ్', 'crime'),
+    feeds = []
+
+    when = '27'  # hours
+    index = 'https://www.eenadu.net'
+    a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=te-IN&gl=IN&ceid=IN:te'
+
+    news = index + '/telugu-news/'
+    news_list = [
+        ('ఆంధ్రప్రదేశ్ ప్రధానాంశాలు', 'ap-top-news'),
+        ('సంపాదకీయం', 'editorial'),
+        ('వ్యాఖ్యానం', 'vyakyanam'),
+        ('విశాఖపట్నం జిల్లా వార్తలు', 'districts/Visakhapatnam'),
+        ('క్రైమ్', 'crime'),
         ('పాలిటిక్స్', 'politics'),
         ('జాతీయం', 'india'),
         ('బిజినెస్', 'business'),
         ('అంతర్జాతీయం', 'world'),
         ('క్రీడలు', 'sports'),
-        # ('సినిమా', 'movies'),
-        # ('చదువు', 'education'),
-        # ('సుఖీభవ', 'health'),
-        # ('ఈ-నాడు', 'technology'),
-        # ('మకరందం', 'devotional'),
-        # ('ఈ తరం', 'youth'),
-        # ('ఆహా', 'recipes'),
-        # ('హాయ్ బుజ్జీ', 'kids-stories'),
-        # ('స్థిరాస్తి', 'real-estate'),
+        ('సినిమా', 'movies'),
+        ('వసుంధర', 'women'),
+        ('ఈ-నాడు', 'technology'),
+        ('వెబ్ ప్రత్యేకం', 'explained')
     ]
-        is_sunday = date.today().weekday() == 6
-        if is_sunday:
-            section_list.append(('సండే మ్యాగజైన్', 'sunday-magazine'))
-        feeds = []
+    for n in news_list:
+        news_index = news + n[1] + '/'
+        feeds.append((n[0], a.format(when, quote(news_index, safe=''))))
+    feeds.append(('Other News', a.format(when, quote(news, safe=''))))

-        # For each section title, fetch the article urls
-        for section in section_list:
-            section_title = section[0]
-            section_url = 'https://www.eenadu.net/' + section[1]
-            self.log(section_title, section_url)
-            soup = self.index_to_soup(section_url)
-            articles = self.articles_from_soup(soup)
-            if articles:
-                feeds.append((section_title, articles))
-        return feeds

-    def articles_from_soup(self, soup):
-        ans = []
-        for link in soup.findAll(
-            attrs={
-                'class': ['telugu_uni_body', 'thumb-content-more', 'owl-carousel']
-            }
-        ):
-            for a in link.findAll('a', attrs={'href': True}):
-                url = a['href']
-                if url.startswith('https') is False:
-                    url = 'https://www.eenadu.net/' + url
+    art = index + '/telugu-article/'
+    art_list = [
+        ('చదువు', 'education'),
+        ('సుఖీభవ', 'health'),
+        ('ఆహా', 'recipes'),
+        ('హాయ్ బుజ్జీ', 'kids-stories'),
+        ('మకరందం', 'devotional'),
+        ('దేవతార్చన', 'temples'),
+        ('స్థిరాస్తి', 'real-estate'),
+        ('కథామృతం', 'kathalu'),
+        ('సండే మ్యాగజైన్', 'sunday-magazine')
+    ]
+    for x in art_list:
+        art_index = art + x[1] + '/'
+        feeds.append((x[0], a.format(when, quote(art_index, safe=''))))
+    feeds.append(('Other Articles', a.format(when, quote(art, safe=''))))

-                try:
-                    desc = self.tag_to_string(a.find('div')).strip()
-                except Exception:
-                    desc = ''
+    feeds.append(('ఇతరులు', a.format(when, quote(index, safe=''))))
+    feeds.append(('ప్రతిభ', a.format(when, 'https://pratibha.eenadu.net/')))

-                for h3 in a.findAll('h3'):
-                    title = self.tag_to_string(h3).strip()
-                    sub = re.escape(title)
-                    desc = re.sub(sub, '', desc).strip()
+    def populate_article_metadata(self, article, soup, first):
+        article.url = self.art_url
+        article.title = article.title.replace(' - Eenadu', '')
+        desc = soup.find(attrs={'class':'srtdes'})
+        if desc:
+            article.summary = self.tag_to_string(desc)
+            article.text_summary = article.summary

-                if not title or not url:
-                    continue
-
-                self.log('\t', title, '\n\t', desc, '\n\t\t', url)
-                ans.append({'title': title, 'url': url, 'description': desc})
-        return ans
-
-    def preprocess_html(self, soup):
-        div = soup.find('div', **classes('pub-t'))
-        if div:
-            date = parse_date(
-                self.tag_to_string(div).strip().replace('Published : ', '').replace(
-                    'Updated : ', ''
-                ).replace(' IST', ':00.000001')
-            ).replace(tzinfo=None)
-            today = datetime.now()
-            if (today - date) > timedelta(1.15):
-                self.abort_article('Skipping old article')
-        else:
-            self.abort_article('not an article')
-        for img in soup.findAll('img', attrs={'data-src': True}):
-            img['src'] = img['data-src']
-        return soup
+    def preprocess_raw_html(self, raw, *a):
+        import re
+        if '<!--Top Full Story Start -->' in raw:
+            body = re.search(r'<!--Top Full Story Start -->([^~]+?)<!--Top Full Story End -->', raw)
+            return '<html><body><div>' + body.group(1) + '</div></body></html>'
+        return raw
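
The preprocess_raw_html hook shared by both Eenadu recipes trims each downloaded page to the story block before calibre parses it, so keep_only_tags has far less markup to wade through. The character class [^~] is a cheap way to say "any character, including newlines" without re.DOTALL, since a literal ~ is not expected in the markup. A standalone sketch (the sample HTML is invented; the marker comments mirror the recipes above):

    import re

    raw = ('<html><body><nav>menu</nav>'
           '<!--Top Full Story Start --><p>story text</p><!--Top Full Story End -->'
           '<footer>junk</footer></body></html>')

    if '<!--Top Full Story Start -->' in raw:
        body = re.search(r'<!--Top Full Story Start -->([^~]+?)<!--Top Full Story End -->', raw)
        raw = '<html><body><div>' + body.group(1) + '</div></body></html>'
    print(raw)  # only the story block survives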