diff --git a/recipes/bloomberg.recipe b/recipes/bloomberg.recipe index ac57b37a74..ef8fee5e8c 100644 --- a/recipes/bloomberg.recipe +++ b/recipes/bloomberg.recipe @@ -4,7 +4,6 @@ from calibre.ptempfile import PersistentTemporaryFile import json import re - class Bloomberg(BasicNewsRecipe): title = u'Bloomberg' language = 'en' @@ -12,7 +11,7 @@ class Bloomberg(BasicNewsRecipe): no_stylesheets = True use_embedded_content = False remove_attributes = ['style', 'height', 'width'] - ignore_duplicate_articles = {'url'} + ignore_duplicate_articles = {'url', 'title'} resolve_internal_links = True oldest_article = 2 # days delay = 1.5 @@ -36,6 +35,9 @@ class Bloomberg(BasicNewsRecipe): url = e.hdrs.get('location') soup = self.index_to_soup(url) link = soup.find('a', attrs={'href':lambda x: x and x.startswith('https://www.bloomberg.com')}) + if '/videos/' in link['href']: + self.abort_article('Aborting Video article') + self.log('Found link: ', link['href']) html = br.open(link['href']).read() pt = PersistentTemporaryFile('.html') pt.write(html) @@ -49,7 +51,14 @@ class Bloomberg(BasicNewsRecipe): return br feeds = [ - ('Articles', 'https://news.google.com/rss/search?q=when:24h+allinurl:bloomberg.com&hl=en-US&gl=US&ceid=US:en'), + ('Features', + 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Ffeatures%2F&hl=en-US&gl=US&ceid=US:en'), + ('News', + 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Farticles%2F&hl=en-US&gl=US&ceid=US:en'), + ('Opinion', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fopinion%2F&hl=en-US&gl=US&ceid=US:en'), + ('Newsletters', + 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Fnewsletters%2F&hl=en-US&gl=US&ceid=US:en'), + ('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com&hl=en-US&gl=US&ceid=US:en') ] def preprocess_raw_html(self, raw, *a): @@ -84,12 +93,12 @@ class Bloomberg(BasicNewsRecipe): if 'ledeImageUrl' in data: if data['ledeImageUrl'] is not None: - lede = '

'.format(data['ledeImageUrl'].replace('\\', '')) + lede = '

'.format(data['ledeImageUrl']) if data['ledeDescription'] is not None: caption = '' + data['ledeDescription'] + '' - body = data['body'].replace('\\', '') + body = data['body'] html = '' + cat + title + subhead + auth + lede + caption + '

' + body return html diff --git a/recipes/deccan_herald.recipe b/recipes/deccan_herald.recipe index 9d3ec3b47b..bb8d69e3db 100644 --- a/recipes/deccan_herald.recipe +++ b/recipes/deccan_herald.recipe @@ -8,9 +8,9 @@ class herald(BasicNewsRecipe): language = 'en_IN' no_stylesheets = True remove_attributes = ['height', 'width', 'style'] - ignore_duplicate_articles = {'url'} + ignore_duplicate_articles = {'url', 'title'} encoding = 'utf-8' - + articles_are_obfuscated = True def get_obfuscated_article(self, url): @@ -22,26 +22,26 @@ class herald(BasicNewsRecipe): soup = self.index_to_soup(url) link = soup.find('a', href=True) skip_sections =[ # add sections you want to skip - '/sports/', '/video/', '/bengaluru-crime/', '/metrolife/', + '/video/', '/bengaluru-crime/', '/metrolife/', '/karnataka-districts/', '/brandspot/', '/entertainment/', ] if any(x in link['href'] for x in skip_sections): self.log('Aborting Article ', link['href']) self.abort_article('skipping section') - + self.log('Downloading ', link['href']) html = br.open(link['href']).read() pt = PersistentTemporaryFile('.html') pt.write(html) pt.close() return pt.name - + keep_only_tags = [ classes('article-title article-author__name'), dict(name='div', attrs={'id':'main-content'}) - + ] - + remove_tags = [ classes( 'storyShare social-media-icons in_article_video static_text' @@ -49,7 +49,17 @@ class herald(BasicNewsRecipe): ' field-name-field-tags section-full strip--business' ) ] - + feeds = [ - ('DH', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com&hl=en-IN&gl=IN&ceid=IN:en') - ] + ('Nation', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fnational%2F&hl=en-IN&gl=IN&ceid=IN:en'), + ('Karnataka', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fstate%2F&hl=en-IN&gl=IN&ceid=IN:en'), + ('Opinion', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fopinion%2F&hl=en-IN&gl=IN&ceid=IN:en'), + ('City', + 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fcity%2F&hl=en-IN&gl=IN&ceid=IN:en'), + ('Business', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fbusiness%2F&hl=en-IN&gl=IN&ceid=IN:en'), + ('World', + 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Finternational%2F&hl=en-IN&gl=IN&ceid=IN:en'), + ('Sports', + 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com%2Fsports%2F&hl=en-IN&gl=IN&ceid=IN:en'), + ('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:deccanherald.com&hl=en-IN&gl=IN&ceid=IN:en'), + ] \ No newline at end of file