diff --git a/resources/recipes/biggovernment.recipe b/resources/recipes/biggovernment.recipe index f14b78f1b8..ccb4e64678 100644 --- a/resources/recipes/biggovernment.recipe +++ b/resources/recipes/biggovernment.recipe @@ -1,28 +1,27 @@ -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup - -class BigGovernmentRecipe(BasicNewsRecipe): - __license__ = 'GPL v3' - __author__ = 'kwetal' - language = 'en_US' - version = 1 - - title = u'Big Government' - publisher = u'Andrew Breitbart' - category = u'Political blog' - description = u'Political news from the USA' - - oldest_article = 30 - max_articles_per_feed = 100 - use_embedded_content = True - - feeds = [(u'Big Government', u'http://feeds.feedburner.com/BigGovernment')] - - conversion_options = {'comments': description, 'tags': category, 'language': 'en', - 'publisher': publisher} - - extra_css = ''' - body{font-family:verdana,arial,helvetica,geneva,sans-serif;} - img {float: left; margin-right: 0.5em;} - ''' - +from calibre.web.feeds.news import BasicNewsRecipe + +class BigGovernmentRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'kwetal' + language = 'en_US' + version = 1 + + title = u'Big Government' + publisher = u'Andrew Breitbart' + category = u'Political blog' + description = u'Political news from the USA' + + oldest_article = 30 + max_articles_per_feed = 100 + use_embedded_content = True + + feeds = [(u'Big Government', u'http://feeds.feedburner.com/BigGovernment')] + + conversion_options = {'comments': description, 'tags': category, 'language': 'en', + 'publisher': publisher} + + extra_css = ''' + body{font-family:verdana,arial,helvetica,geneva,sans-serif;} + img {float: left; margin-right: 0.5em;} + ''' + diff --git a/resources/recipes/eluniversalimpresa.recipe b/resources/recipes/eluniversalimpresa.recipe index c7046a31c4..7263a76e2a 100644 --- a/resources/recipes/eluniversalimpresa.recipe +++ b/resources/recipes/eluniversalimpresa.recipe @@ -1,82 +1,82 @@ -from calibre.web.feeds.news import BasicNewsRecipe - -class ElUniversalImpresaRecipe(BasicNewsRecipe): - __license__ = 'GPL v3' - __author__ = 'kwetal' - language = 'es' - version = 1 - - title = u'El Universal (Edici\u00F3n Impresa)' - publisher = u'El Universal' - category = u'News, Mexico' - description = u'News from Mexico' - - remove_empty_feeds = True - remove_javascript = True - - INDEX = 'http://www.eluniversal.com.mx' - - extra_css = ''' - body{font-family:verdana,arial,helvetica,geneva,sans-serif;} - ''' - - conversion_options = {'comments': description, 'tags': category, 'language': 'en', - 'publisher': publisher, 'linearize_tables': True} - - def parse_index(self): - soup = self.index_to_soup('http://www.eluniversal.com.mx/edicion_impresa.html') - index = [] - - table = soup.find('table', attrs = {'width': '500'}) - articles = [] - for td in table.findAll(lambda tag: tag.name == 'td' and tag.has_key('class') and tag['class'] == 'arnegro12'): - a = td.a - a.extract() - title = self.tag_to_string(a) - url = self.INDEX + a['href'] - description = self.tag_to_string(td) - articles.append({'title': title, 'date': None, 'url': url, 'description' : description}) - - index.append(('Primera Plana', articles)) - - for td in table.findAll(lambda tag: tag.name == 'td' and len(tag.attrs) == 0): - articles = [] - feedTitle = None - for a in td.findAll('a'): - if not feedTitle: - feedTitle = self.tag_to_string(a) - continue - - title = self.tag_to_string(a) - - url = self.INDEX + a['href'] - articles.append({'title': title, 'date': None, 'url': url, 'description': ''}) - - index.append((feedTitle, articles)) - - return index - - def print_version(self, url): - if url.find('wcarton') >= 0: - return None - - main, sep, id = url.rpartition('/') - - return main + '/vi_' + id - - def preprocess_html(self, soup): - table = soup.find('table') - table.extract() - - for p in soup.findAll('p'): - if self.tag_to_string(p).strip() == '': - p.extract() - - tag = soup.find('font', attrs = {'color': '#0F046A'}) - if tag: - for attr in ['color', 'face', 'helvetica,', 'sans-serif', 'size']: - if tag.has_key(attr): - del tag[attr] - tag.name = 'h1' - - return soup +from calibre.web.feeds.news import BasicNewsRecipe + +class ElUniversalImpresaRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'kwetal' + language = 'es' + version = 1 + + title = u'El Universal (Edici\u00F3n Impresa)' + publisher = u'El Universal' + category = u'News, Mexico' + description = u'News from Mexico' + + remove_empty_feeds = True + remove_javascript = True + + INDEX = 'http://www.eluniversal.com.mx' + + extra_css = ''' + body{font-family:verdana,arial,helvetica,geneva,sans-serif;} + ''' + + conversion_options = {'comments': description, 'tags': category, 'language': 'en', + 'publisher': publisher, 'linearize_tables': True} + + def parse_index(self): + soup = self.index_to_soup('http://www.eluniversal.com.mx/edicion_impresa.html') + index = [] + + table = soup.find('table', attrs = {'width': '500'}) + articles = [] + for td in table.findAll(lambda tag: tag.name == 'td' and tag.has_key('class') and tag['class'] == 'arnegro12'): + a = td.a + a.extract() + title = self.tag_to_string(a) + url = self.INDEX + a['href'] + description = self.tag_to_string(td) + articles.append({'title': title, 'date': None, 'url': url, 'description' : description}) + + index.append(('Primera Plana', articles)) + + for td in table.findAll(lambda tag: tag.name == 'td' and len(tag.attrs) == 0): + articles = [] + feedTitle = None + for a in td.findAll('a'): + if not feedTitle: + feedTitle = self.tag_to_string(a) + continue + + title = self.tag_to_string(a) + + url = self.INDEX + a['href'] + articles.append({'title': title, 'date': None, 'url': url, 'description': ''}) + + index.append((feedTitle, articles)) + + return index + + def print_version(self, url): + if url.find('wcarton') >= 0: + return None + + main, sep, id = url.rpartition('/') + + return main + '/vi_' + id + + def preprocess_html(self, soup): + table = soup.find('table') + table.extract() + + for p in soup.findAll('p'): + if self.tag_to_string(p).strip() == '': + p.extract() + + tag = soup.find('font', attrs = {'color': '#0F046A'}) + if tag: + for attr in ['color', 'face', 'helvetica,', 'sans-serif', 'size']: + if tag.has_key(attr): + del tag[attr] + tag.name = 'h1' + + return soup diff --git a/resources/recipes/indian_express.recipe b/resources/recipes/indian_express.recipe new file mode 100644 index 0000000000..80a5840517 --- /dev/null +++ b/resources/recipes/indian_express.recipe @@ -0,0 +1,57 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class IndianExpress(BasicNewsRecipe): + title = u'Indian Express' + language = 'en_IN' + __author__ = 'Krittika Goyal' + oldest_article = 1 #days + max_articles_per_feed = 25 + encoding = 'cp1252' + + no_stylesheets = True + remove_tags_before = dict(name='div', attrs={'class':'top_head'}) + #remove_tags_after = dict(name='td', attrs={'class':'newptool1'}) + remove_tags = [ + dict(name='iframe'), + dict(name='div', attrs={'class':['bookmarks_div', 'comment_box', 'bookmarks_div_bot', 'box']}), + dict(name='div', attrs={'id':['footer', 'tab_innerhc', 'discussion', 'google_new']}), + dict(name='a', attrs={'class':'nobdr'}), + #dict(name='span', text=':'), + ] + + feeds = [ +('Front Page', + 'http://syndication.indianexpress.com/rss/33/front-page.xml'), +('Markets', + 'http://syndication.indianexpress.com/rss/793/markets.xml'), +('Editorials', + 'http://syndication.indianexpress.com/rss/35/editorials.xml'), +('Crime', + 'http://syndication.indianexpress.com/rss/801/crime-&-justice.xml'), +('Cricket', + 'http://syndication.indianexpress.com/rss/777/cricket.xml'), +('Health', + 'http://syndication.indianexpress.com/rss/697/health.xml'), +('Asia', + 'http://syndication.indianexpress.com/rss/790/asia.xml'), +('Politics', + 'http://syndication.indianexpress.com/rss/799/politics.xml'), +('Mumbai', + 'http://syndication.indianexpress.com/rss/707/mumbai.xml'), +('Op-Ed', + 'http://syndication.indianexpress.com/rss/36/oped.xml'), +('Economy', + 'http://syndication.indianexpress.com/rss/794/economy.xml'), +('Lifestyle', + 'http://syndication.indianexpress.com/rss/713/lifestyle.xml'), +('Letters to the Editor', + 'http://syndication.indianexpress.com/rss/40/letters-to-editor.xml'), +('Movie Reviews', + 'http://syndication.indianexpress.com/rss/665/movie-reviews.xml'), +('Bollywood', + 'http://syndication.indianexpress.com/rss/887/bollywood.xml'), +] + + def print_version(self, url): + return url+'/0' + diff --git a/resources/recipes/journalofaccountancy.recipe b/resources/recipes/journalofaccountancy.recipe index 51a6ac8d29..5407b5eae3 100644 --- a/resources/recipes/journalofaccountancy.recipe +++ b/resources/recipes/journalofaccountancy.recipe @@ -1,44 +1,45 @@ -from calibre.web.feeds.news import BasicNewsRecipe - -class JournalOfAccountancyRecipe(BasicNewsRecipe): - __license__ = 'GPL v3' - __author__ = 'kwetal' - language = 'en' - version = 1 - - title = u'Journal of Accountancy' - publisher = u'AICPA' - category = u'News, Accountancy' - description = u'Publication of the American Institute of Certified Public Accountants' - - use_embedded_content = False - remove_empty_feeds = True - oldest_article = 30 - max_articles_per_feed = 100 - - no_stylesheets = True - remove_javascript = True - - extra_css = ''' - body{font-family:verdana,arial,helvetica,geneva,sans-serif;} - div#Rubricname {font-size: small; color: #666666; margin-bottom: 1em;} - div#Headline {font-size: x-large; font-weight: bold; margin-bottom: 0.6em} - div#SubHeadline {font-size: medium; font-weight: bold; margin-bottom: 1em} - div#Authorname, div#Date {font-size: x-small; color: #696969;} - ''' - - conversion_options = {'comments': description, 'tags': category, 'language': 'en', - 'publisher': publisher} - - keep_only_tags = [] - keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Rubricname'})) - keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Headline'})) - keep_only_tags.append(dict(name = 'div', attrs = {'id': 'SubHeadline'})) - keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Authorname'})) - keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Date'})) - keep_only_tags.append(dict(name = 'div', attrs = {'id': 'BodyContent'})) - - remove_attributes = ['style'] - - feeds = [] - feeds.append((u'Journal of Accountancy', u'http://feeds2.feedburner.com/JournalOfAccountancy')) + +from calibre.web.feeds.news import BasicNewsRecipe + +class JournalOfAccountancyRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'kwetal' + language = 'en' + version = 1 + + title = u'Journal of Accountancy' + publisher = u'AICPA' + category = u'News, Accountancy' + description = u'Publication of the American Institute of Certified Public Accountants' + + use_embedded_content = False + remove_empty_feeds = True + oldest_article = 30 + max_articles_per_feed = 100 + + no_stylesheets = True + remove_javascript = True + + extra_css = ''' + body{font-family:verdana,arial,helvetica,geneva,sans-serif;} + div#Rubricname {font-size: small; color: #666666; margin-bottom: 1em;} + div#Headline {font-size: x-large; font-weight: bold; margin-bottom: 0.6em} + div#SubHeadline {font-size: medium; font-weight: bold; margin-bottom: 1em} + div#Authorname, div#Date {font-size: x-small; color: #696969;} + ''' + + conversion_options = {'comments': description, 'tags': category, 'language': 'en', + 'publisher': publisher} + + keep_only_tags = [] + keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Rubricname'})) + keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Headline'})) + keep_only_tags.append(dict(name = 'div', attrs = {'id': 'SubHeadline'})) + keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Authorname'})) + keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Date'})) + keep_only_tags.append(dict(name = 'div', attrs = {'id': 'BodyContent'})) + + remove_attributes = ['style'] + + feeds = [] + feeds.append((u'Journal of Accountancy', u'http://feeds2.feedburner.com/JournalOfAccountancy')) diff --git a/resources/recipes/propublica.recipe b/resources/recipes/propublica.recipe index 1e1f0af7a9..5c35fe648e 100644 --- a/resources/recipes/propublica.recipe +++ b/resources/recipes/propublica.recipe @@ -1,60 +1,60 @@ -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup - -class ProPublicaRecipe(BasicNewsRecipe): - __license__ = 'GPL v3' - __author__ = 'kwetal' - language = 'en_US' - version = 1 - - title = u'Pro Publica' - publisher = u'ProPublica.org' - category = u'Political blog' - description = u'Independent investigative journalism in the public interest.' - - oldest_article = 14 - max_articles_per_feed = 100 - use_embedded_content = False - - remove_empty_feeds = True - no_stylesheets = True - remove_javascript = True - - keep_only_tags = [] - keep_only_tags.append(dict(name = 'div', attrs = {'class': 'article'})) - - remove_tags = [] - remove_tags.append(dict(name = 'div', attrs = {'id': 'rollups'})) - remove_tags.append(dict(name = 'div', attrs = {'class': 'follow_info'})) - remove_tags.append(dict(name = 'ul', attrs = {'class': 'long-tools-top'})) - remove_tags.append(dict(name = 'ul', attrs = {'id': 'share-box'})) - remove_tags.append(dict(name = 'div', attrs = {'class': 'tags'})) - remove_tags.append(dict(name = 'ul', attrs = {'class': 'long-tools'})) - remove_tags.append(dict(name = 'ul', attrs = {'id': 'share-box2'})) - remove_tags.append(dict(name = 'p', attrs = {'id': 'original-url'})) - - feeds = [] - feeds.append((u'Top Stories', u'http://feeds.propublica.org/propublica/main')) - feeds.append((u'Stimulus', u'http://feeds.propublica.org/propublica/watchdog/stimulus')) - feeds.append((u'Bailout', u'http://feeds.propublica.org/propublica/watchdog/bailout')) - feeds.append((u'Business', u'http://feeds.propublica.org/propublica/business-money')) - feeds.append((u'Justice', u'http://feeds.propublica.org/propublica/justice-law')) - feeds.append((u'Energy & Environment', u'http://feeds.propublica.org/propublica/energy-environment')) - feeds.append((u'Government & Politics', u'http://feeds.propublica.org/propublica/government-politics')) - feeds.append((u'Health & Science', u'http://feeds.propublica.org/propublica/health-science')) - feeds.append((u'Media & Technology', u'http://feeds.propublica.org/propublica/media-technology')) - feeds.append((u'National Security', u'http://feeds.propublica.org/propublica/national-security')) - #feeds.append((u'', u'')) - - conversion_options = {'comments': description, 'tags': category, 'language': 'en', - 'publisher': publisher} - - extra_css = ''' - body{font-family:verdana,arial,helvetica,geneva,sans-serif;} - img {float: left; margin-right: 0.5em;} - h1 {text-align: left;} - a, a[href] {text-decoration: none; color: blue;} - div.cat {font-size: x-small; color: #666666; margin-bottom: 0.1em;} - div.info {font-size: small; color: #696969;} - ''' - \ No newline at end of file + +from calibre.web.feeds.news import BasicNewsRecipe + +class ProPublicaRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'kwetal' + language = 'en_US' + version = 1 + + title = u'Pro Publica' + publisher = u'ProPublica.org' + category = u'Political blog' + description = u'Independent investigative journalism in the public interest.' + + oldest_article = 14 + max_articles_per_feed = 100 + use_embedded_content = False + + remove_empty_feeds = True + no_stylesheets = True + remove_javascript = True + + keep_only_tags = [] + keep_only_tags.append(dict(name = 'div', attrs = {'class': 'article'})) + + remove_tags = [] + remove_tags.append(dict(name = 'div', attrs = {'id': 'rollups'})) + remove_tags.append(dict(name = 'div', attrs = {'class': 'follow_info'})) + remove_tags.append(dict(name = 'ul', attrs = {'class': 'long-tools-top'})) + remove_tags.append(dict(name = 'ul', attrs = {'id': 'share-box'})) + remove_tags.append(dict(name = 'div', attrs = {'class': 'tags'})) + remove_tags.append(dict(name = 'ul', attrs = {'class': 'long-tools'})) + remove_tags.append(dict(name = 'ul', attrs = {'id': 'share-box2'})) + remove_tags.append(dict(name = 'p', attrs = {'id': 'original-url'})) + + feeds = [] + feeds.append((u'Top Stories', u'http://feeds.propublica.org/propublica/main')) + feeds.append((u'Stimulus', u'http://feeds.propublica.org/propublica/watchdog/stimulus')) + feeds.append((u'Bailout', u'http://feeds.propublica.org/propublica/watchdog/bailout')) + feeds.append((u'Business', u'http://feeds.propublica.org/propublica/business-money')) + feeds.append((u'Justice', u'http://feeds.propublica.org/propublica/justice-law')) + feeds.append((u'Energy & Environment', u'http://feeds.propublica.org/propublica/energy-environment')) + feeds.append((u'Government & Politics', u'http://feeds.propublica.org/propublica/government-politics')) + feeds.append((u'Health & Science', u'http://feeds.propublica.org/propublica/health-science')) + feeds.append((u'Media & Technology', u'http://feeds.propublica.org/propublica/media-technology')) + feeds.append((u'National Security', u'http://feeds.propublica.org/propublica/national-security')) + #feeds.append((u'', u'')) + + conversion_options = {'comments': description, 'tags': category, 'language': 'en', + 'publisher': publisher} + + extra_css = ''' + body{font-family:verdana,arial,helvetica,geneva,sans-serif;} + img {float: left; margin-right: 0.5em;} + h1 {text-align: left;} + a, a[href] {text-decoration: none; color: blue;} + div.cat {font-size: x-small; color: #666666; margin-bottom: 0.1em;} + div.info {font-size: small; color: #696969;} + ''' +