diff --git a/recipes/bloomberg-business-week.recipe b/recipes/bloomberg-business-week.recipe index 290cfe4ae9..bb3f7c3c37 100644 --- a/recipes/bloomberg-business-week.recipe +++ b/recipes/bloomberg-business-week.recipe @@ -134,27 +134,27 @@ class Bloomberg(BasicNewsRecipe): cat = subhead = lede = auth = caption = '' if 'primaryCategory' in data and data['primaryCategory'] is not None: - cat = '

' + data['primaryCategory'] + '

' + cat = '

' + data['primaryCategory'] + '

' if len(data['abstract']) != 0 and len(data['abstract']) == 2: - subhead = '

' + data['abstract'][0] + '

' + data['abstract'][1] + '

' + subhead = '

' + data['abstract'][0] + '

' + data['abstract'][1] + '

' else: if 'summary' in data: - subhead = '

' + data['summary'] + '

' + subhead = '

' + data['summary'] + '

' if 'byline' in data and data['byline'] is not None: - auth = '
' + data['byline']\ - + ' | ' + data['publishedAt'][:-14] + '
' + auth = '
' + data['byline']\ + + ' | ' + data['publishedAt'][:-14] + '
' if 'ledeImageUrl' in data and data['ledeImageUrl'] is not None: lede = '

'.format(data['ledeImageUrl']) if 'ledeDescription' in data and data['ledeDescription'] is not None: - caption = '' + data['ledeDescription'] + '' + caption = '' + data['ledeDescription'] + '' else: if 'lede' in data and data['lede'] is not None: if 'alt' in data['lede'] and data['lede']['alt'] is not None: - caption = '' + data['lede']['alt'] + '' + caption = '' + data['lede']['alt'] + '' if m: time.sleep(3) diff --git a/recipes/business_standard.recipe b/recipes/business_standard.recipe index 2f3522a970..7f2f0a0dc7 100644 --- a/recipes/business_standard.recipe +++ b/recipes/business_standard.recipe @@ -1,58 +1,107 @@ -__license__ = 'GPL v3' -__copyright__ = '2009-2012, Darko Miletic ' -''' -www.business-standard.com -''' - -from calibre.web.feeds.recipes import BasicNewsRecipe - - -def classes(classes): - q = frozenset(classes.split(' ')) - return dict(attrs={ - 'class': lambda x: x and frozenset(x.split()).intersection(q)}) - +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ptempfile import PersistentTemporaryFile +from html5_parser import parse +from datetime import datetime +import json class BusinessStandard(BasicNewsRecipe): title = 'Business Standard' - __author__ = 'Darko Miletic' + __author__ = 'unkn0wn' description = "India's most respected business daily" - oldest_article = 1 - max_articles_per_feed = 20 - no_stylesheets = True - use_embedded_content = False - encoding = 'utf-8' - publisher = 'Business Standard Limited' - category = 'news, business, money, india, world' language = 'en_IN' - masthead_url = 'https://bsmedia.business-standard.com/include/_mod/site/html5/images/business-standard-logo.png' + no_stylesheets = True + remove_javascript = True + remove_attributes = ['width', 'height', 'style'] + def get_cover_url(self): soup = self.index_to_soup('https://www.magzter.com/IN/Business-Standard-Private-Ltd/Business-Standard/Newspaper/') for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')): return citem['content'] - remove_attributes = ['width', 'height', 'style'] + def get_browser(self): + return BasicNewsRecipe.get_browser(self, user_agent='common_words/based') - keep_only_tags = [ - classes('headline alternativeHeadline full-img article-content__img pubDate'), - dict(name='span', attrs={'class':'p-content'}), - ] - remove_tags = [ - classes('also-read-panel') + ignore_duplicate_articles = {'title', 'url'} + remove_empty_feeds = True + resolve_internal_links = True + simultaneous_downloads = 1 + + extra_css = ''' + .auth, .cat { font-size:small; color:#202020; } + .cap { font-size:small; text-align:center; } + ''' + + art_url = '' + art_desc = '' + + articles_are_obfuscated = True + + def get_obfuscated_article(self, url): + br = self.get_browser() + soup = self.index_to_soup(url) + link = soup.find('a', attrs={'href':lambda x: x and x.startswith('https://www.business-standard.com')}) + skip_sections =[ # add sections you want to skip + '/video/', '/videos/', '/multimedia/', + ] + if any(x in link['href'] for x in skip_sections): + self.abort_article('skipping video links ', link['href']) + self.art_url = link['href'] + self.log('Found ', link['href']) + html = br.open(link['href']).read() + pt = PersistentTemporaryFile('.html') + pt.write(html) + pt.close() + return pt.name + + feeds = [] + + sections = [ + 'india-news', 'economy', 'opinion', 'markets', 'companies', 'industry', 'finance', 'world-news', + # 'politics', 'cricket', 'sports', 'technology', 'book', 'education', 'specials' ] - feeds = [ - (u'Companies', u'https://www.business-standard.com/rss/companies-101.rss'), - (u'Economy and Policy', u'https://www.business-standard.com/rss/economy-policy-102.rss'), - (u'Finance', u'https://www.business-standard.com/rss/finance-103.rss'), - (u'Beyond Business', u'https://www.business-standard.com/rss/beyond-business-104.rss'), - (u'Opinion', 'https://www.business-standard.com/rss/opinion-105.rss'), - (u'Markets', u'https://www.business-standard.com/rss/markets-106.rss'), - (u'Technology', u'https://www.business-standard.com/rss/technology-108.rss'), - (u'Personal Finance', u'https://www.business-standard.com/rss/pf-114.rss'), - (u'International', u'https://www.business-standard.com/rss/international-116.rss'), - # (u'Today\'s Paper', u'https://www.business-standard.com/rss/todays-paper.rss'), - # for todays paper - subscrition required - ] + for sec in sections: + a = 'https://news.google.com/rss/search?q=when:27h+allinurl:business-standard.com{}&hl=en-IN&gl=IN&ceid=IN:en' + feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F'))) + # feeds.append(('Others', a.format(''))) + + def preprocess_raw_html(self, raw, *a): + root = parse(raw) + m = root.xpath('//script[@id="__NEXT_DATA__"]') + + data = json.loads(m[0].text) + data = data['props']['pageProps']['data'] + + title = '

' + data['pageTitle'] + '

' + + cat = subhead = lede = auth = caption = '' + + if 'defaultArticleCat' in data and data['defaultArticleCat'] is not None: + if 'h1_tag' in data['defaultArticleCat'] and data['defaultArticleCat']['h1_tag'] is not None: + cat = '

' + data['defaultArticleCat']['h1_tag'] + '

' + + if 'metaDescription' in data and data['metaDescription'] is not None: + subhead = '

' + data['metaDescription'] + '

' + self.art_desc = data['metaDescription'] + + date = (datetime.fromtimestamp(int(data['publishDate']))).strftime('%b %d, %Y | %I:%M %p') + + if 'multiple_authors_name' in data: + auth = '

' + data['multiple_authors_name'] + ' | ' + data['placeName'] + ' | ' + date + '

' + + if data['featuredImageObj'] and 'url' in data['featuredImageObj']: + lede = '

'.format(data['featuredImageObj']['url']) + if 'alt_text' in data['featuredImageObj']: + caption = '' + data['featuredImageObj']['alt_text'] + '

' + + body = data['htmlContent'] + + return '' + cat + title + subhead + auth + lede + caption + '

' + body + '
' + + def populate_article_metadata(self, article, soup, first): + article.url = self.art_url + article.summary = self.art_desc + article.text_summary = self.art_desc + article.title = article.title.replace(' - Business Standard', '') diff --git a/recipes/tagesspiegel.recipe b/recipes/tagesspiegel.recipe index 004799f8d7..ee0e562c71 100644 --- a/recipes/tagesspiegel.recipe +++ b/recipes/tagesspiegel.recipe @@ -26,8 +26,22 @@ class TagesspiegelRss(BasicNewsRecipe): ignore_duplicate_articles = {'title', 'url'} remove_empty_feeds = True + def get_browser(self): + return BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False) + + def get_cover_url(self): + from datetime import date + cover = 'https://img.kiosko.net/' + date.today().strftime('%Y/%m/%d') + '/de/tagesspiegel.750.jpg' + return cover + keep_only_tags = [ - classes('ts-lead ts-article-body ts-intro ts-title ts-authors') + dict(name = 'header', attrs={'class':'Bo'}), + dict(name = 'div', attrs={'id':'story-elements'}) + ] + + remove_tags = [ + dict(name = 'aside'), + classes('iqd_mainAd Bs') ] feeds = [ @@ -42,5 +56,3 @@ class TagesspiegelRss(BasicNewsRecipe): (u'Wissen', u'http://www.tagesspiegel.de/contentexport/feed/wissen') ] - def get_masthead_url(self): - return 'http://www.tagesspiegel.de/images/tsp_logo/3114/6.png'