New recipe for The Indian Express by Krittika Goyal

2026-01-06 12:10:18 -05:00 · 2009-12-31 16:21:18 -07:00 · 2009-12-31 16:21:18 -07:00 · 95cf14a3ad
commit 95cf14a3ad
parent 49383b0db5
5 changed files with 271 additions and 214 deletions
--- a/resources/recipes/biggovernment.recipe
+++ b/resources/recipes/biggovernment.recipe
@@ -1,28 +1,27 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-
-class BigGovernmentRecipe(BasicNewsRecipe):
-    __license__  = 'GPL v3'
-    __author__ = 'kwetal'
-    language = 'en_US'
-    version = 1
-
-    title = u'Big Government'
-    publisher = u'Andrew Breitbart'
-    category = u'Political blog'
-    description = u'Political news from the USA'
-
-    oldest_article = 30
-    max_articles_per_feed = 100
-    use_embedded_content = True
-
-    feeds = [(u'Big Government', u'http://feeds.feedburner.com/BigGovernment')]
-
-    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
-                          'publisher': publisher}
-
-    extra_css = '''
-                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
-                img {float: left; margin-right: 0.5em;}
-                '''
-
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class BigGovernmentRecipe(BasicNewsRecipe):  # Feed-based recipe for the "Big Government" blog; it only sets class attributes.
+    __license__  = 'GPL v3'
+    __author__ = 'kwetal'
+    language = 'en_US'
+    version = 1
+
+    title = u'Big Government'
+    publisher = u'Andrew Breitbart'
+    category = u'Political blog'
+    description = u'Political news from the USA'
+
+    oldest_article = 30  # presumably a number of days -- confirm against the BasicNewsRecipe docs
+    max_articles_per_feed = 100
+    use_embedded_content = True  # presumably: take the article text from the feed entries themselves -- confirm against the BasicNewsRecipe docs
+
+    feeds = [(u'Big Government', u'http://feeds.feedburner.com/BigGovernment')]  # (section title, feed URL) pairs
+
+    conversion_options = {'comments': description, 'tags': category, 'language': 'en',  # reuses description/category/publisher defined above, so this must stay below them
+                          'publisher': publisher}
+
+    extra_css = '''
+                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+                img {float: left; margin-right: 0.5em;}
+                '''  # CSS text: sans-serif body font, images floated left with a right margin
+
--- a/resources/recipes/eluniversalimpresa.recipe
+++ b/resources/recipes/eluniversalimpresa.recipe
@@ -1,82 +1,82 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class ElUniversalImpresaRecipe(BasicNewsRecipe):
-    __license__  = 'GPL v3'
-    __author__ = 'kwetal'
-    language = 'es'
-    version = 1
-
-    title = u'El Universal (Edici\u00F3n Impresa)'
-    publisher = u'El Universal'
-    category = u'News, Mexico'
-    description = u'News from Mexico'
-
-    remove_empty_feeds = True
-    remove_javascript = True
-
-    INDEX = 'http://www.eluniversal.com.mx'
-
-    extra_css = '''
-                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
-                '''
-
-    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
-                          'publisher': publisher, 'linearize_tables': True}
-
-    def parse_index(self):
-        soup = self.index_to_soup('http://www.eluniversal.com.mx/edicion_impresa.html')
-        index = []
-
-        table = soup.find('table', attrs = {'width': '500'})
-        articles = []
-        for td in table.findAll(lambda tag: tag.name == 'td' and tag.has_key('class') and tag['class'] == 'arnegro12'):
-            a = td.a
-            a.extract()
-            title = self.tag_to_string(a)
-            url = self.INDEX + a['href']
-            description = self.tag_to_string(td)
-            articles.append({'title': title, 'date': None, 'url': url, 'description' : description})
-
-        index.append(('Primera Plana', articles))
-
-        for td in table.findAll(lambda tag: tag.name == 'td' and len(tag.attrs) == 0):
-            articles = []
-            feedTitle = None
-            for a in td.findAll('a'):
-                if not feedTitle:
-                    feedTitle = self.tag_to_string(a)
-                    continue
-
-                title = self.tag_to_string(a)
-
-                url = self.INDEX + a['href']
-                articles.append({'title': title, 'date': None, 'url': url, 'description': ''})
-
-            index.append((feedTitle, articles))
-
-        return index
-
-    def print_version(self, url):
-        if url.find('wcarton') >= 0:
-            return None
-
-        main, sep, id = url.rpartition('/')
-
-        return main + '/vi_' + id
-
-    def preprocess_html(self, soup):
-        table = soup.find('table')
-        table.extract()
-
-        for p in soup.findAll('p'):
-            if self.tag_to_string(p).strip() == '':
-                p.extract()
-
-        tag = soup.find('font', attrs = {'color': '#0F046A'})
-        if tag:
-            for attr in ['color', 'face', 'helvetica,', 'sans-serif', 'size']:
-                if tag.has_key(attr):
-                    del tag[attr]
-            tag.name = 'h1'
-
-        return soup
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ElUniversalImpresaRecipe(BasicNewsRecipe):  # Recipe for El Universal's print edition; builds its article index by scraping a page (parse_index) instead of listing feeds.
+    __license__  = 'GPL v3'
+    __author__ = 'kwetal'
+    language = 'es'
+    version = 1
+
+    title = u'El Universal (Edici\u00F3n Impresa)'
+    publisher = u'El Universal'
+    category = u'News, Mexico'
+    description = u'News from Mexico'
+
+    remove_empty_feeds = True
+    remove_javascript = True
+
+    INDEX = 'http://www.eluniversal.com.mx'  # site root; prepended to the hrefs collected in parse_index
+
+    extra_css = '''
+                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+                '''  # CSS text: sans-serif body font
+
+    conversion_options = {'comments': description, 'tags': category, 'language': 'en',  # NOTE(review): 'language' is 'en' here while the class attribute language is 'es' -- confirm which is intended
+                          'publisher': publisher, 'linearize_tables': True}
+
+    def parse_index(self):  # Scrape the print-edition index page; returns a list of (section title, [article dict, ...]) tuples.
+        soup = self.index_to_soup('http://www.eluniversal.com.mx/edicion_impresa.html')
+        index = []
+
+        table = soup.find('table', attrs = {'width': '500'})  # NOTE(review): used below without a None check; if no such table is found the findAll calls would fail -- confirm
+        articles = []
+        for td in table.findAll(lambda tag: tag.name == 'td' and tag.has_key('class') and tag['class'] == 'arnegro12'):  # front-page entries: <td class="arnegro12"> cells
+            a = td.a
+            a.extract()  # presumably detaches the link from the cell so the text read from td below excludes the title -- confirm
+            title = self.tag_to_string(a)
+            url = self.INDEX + a['href']
+            description = self.tag_to_string(td)  # local variable; the class attribute 'description' is not modified
+            articles.append({'title': title, 'date': None, 'url': url, 'description' : description})
+
+        index.append(('Primera Plana', articles))
+
+        for td in table.findAll(lambda tag: tag.name == 'td' and len(tag.attrs) == 0):  # remaining sections: one per attribute-less <td> cell
+            articles = []
+            feedTitle = None
+            for a in td.findAll('a'):
+                if not feedTitle:  # the first link in the cell supplies the section title and is not added as an article
+                    feedTitle = self.tag_to_string(a)
+                    continue
+
+                title = self.tag_to_string(a)
+
+                url = self.INDEX + a['href']
+                articles.append({'title': title, 'date': None, 'url': url, 'description': ''})
+
+            index.append((feedTitle, articles))  # appended even when the cell had no links (feedTitle None, articles empty)
+
+        return index
+
+    def print_version(self, url):  # Return url with 'vi_' inserted before its last path segment, or None when url contains 'wcarton'.
+        if url.find('wcarton') >= 0:
+            return None
+
+        main, sep, id = url.rpartition('/')  # 'sep' is unused; 'id' shadows the builtin within this method
+
+        return main + '/vi_' + id
+
+    def preprocess_html(self, soup):  # Modify the parsed page in place (table, blank paragraphs, headline font tag) and return it.
+        table = soup.find('table')  # first table in the document
+        table.extract()  # NOTE(review): no None check -- would fail on a page without any table; confirm pages always contain one
+
+        for p in soup.findAll('p'):
+            if self.tag_to_string(p).strip() == '':  # paragraph text is empty or whitespace only
+                p.extract()
+
+        tag = soup.find('font', attrs = {'color': '#0F046A'})
+        if tag:
+            for attr in ['color', 'face', 'helvetica,', 'sans-serif', 'size']:  # NOTE(review): 'helvetica,' and 'sans-serif' look like pieces of a font-face value, not attribute names; the has_key guard skips them when absent
+                if tag.has_key(attr):
+                    del tag[attr]
+            tag.name = 'h1'  # rename the font element to a heading
+
+        return soup
--- a/resources/recipes/indian_express.recipe
+++ b/resources/recipes/indian_express.recipe
@@ -0,0 +1,57 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class IndianExpress(BasicNewsRecipe):  # Feed-based recipe for The Indian Express: class attributes plus a print_version URL rewrite.
+    title          = u'Indian Express'
+    language       = 'en_IN'
+    __author__     = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    encoding = 'cp1252'
+
+    no_stylesheets = True
+    remove_tags_before = dict(name='div', attrs={'class':'top_head'})  # matcher for <div class="top_head">
+    #remove_tags_after  = dict(name='td', attrs={'class':'newptool1'})
+    remove_tags = [  # matchers made of an element name plus optional class/id values
+       dict(name='iframe'),
+       dict(name='div', attrs={'class':['bookmarks_div', 'comment_box', 'bookmarks_div_bot', 'box']}),
+       dict(name='div', attrs={'id':['footer', 'tab_innerhc', 'discussion', 'google_new']}),
+       dict(name='a', attrs={'class':'nobdr'}),
+       #dict(name='span', text=':'),
+    ]
+
+    feeds          = [  # (section title, feed URL) pairs; the entries sit at column 0, which is legal because they are inside the brackets
+('Front Page',
+ 'http://syndication.indianexpress.com/rss/33/front-page.xml'),
+('Markets',
+ 'http://syndication.indianexpress.com/rss/793/markets.xml'),
+('Editorials',
+ 'http://syndication.indianexpress.com/rss/35/editorials.xml'),
+('Crime',
+ 'http://syndication.indianexpress.com/rss/801/crime-&-justice.xml'),
+('Cricket',
+ 'http://syndication.indianexpress.com/rss/777/cricket.xml'),
+('Health',
+ 'http://syndication.indianexpress.com/rss/697/health.xml'),
+('Asia',
+ 'http://syndication.indianexpress.com/rss/790/asia.xml'),
+('Politics',
+ 'http://syndication.indianexpress.com/rss/799/politics.xml'),
+('Mumbai',
+ 'http://syndication.indianexpress.com/rss/707/mumbai.xml'),
+('Op-Ed',
+ 'http://syndication.indianexpress.com/rss/36/oped.xml'),
+('Economy',
+ 'http://syndication.indianexpress.com/rss/794/economy.xml'),
+('Lifestyle',
+ 'http://syndication.indianexpress.com/rss/713/lifestyle.xml'),
+('Letters to the Editor',
+ 'http://syndication.indianexpress.com/rss/40/letters-to-editor.xml'),
+('Movie Reviews',
+ 'http://syndication.indianexpress.com/rss/665/movie-reviews.xml'),
+('Bollywood',
+ 'http://syndication.indianexpress.com/rss/887/bollywood.xml'),
+]
+
+    def print_version(self, url):  # Return the given article URL with '/0' appended.
+        return url+'/0'
+
--- a/resources/recipes/journalofaccountancy.recipe
+++ b/resources/recipes/journalofaccountancy.recipe
@@ -1,44 +1,45 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class JournalOfAccountancyRecipe(BasicNewsRecipe):
-    __license__  = 'GPL v3'
-    __author__ = 'kwetal'
-    language = 'en'
-    version = 1
-
-    title = u'Journal of Accountancy'
-    publisher = u'AICPA'
-    category = u'News, Accountancy'
-    description = u'Publication of the American Institute of Certified Public Accountants'
-
-    use_embedded_content = False
-    remove_empty_feeds = True
-    oldest_article = 30
-    max_articles_per_feed = 100
-
-    no_stylesheets = True
-    remove_javascript = True
-
-    extra_css = '''
-                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
-                div#Rubricname {font-size: small; color: #666666; margin-bottom: 1em;}
-                div#Headline {font-size: x-large; font-weight: bold; margin-bottom: 0.6em}
-                div#SubHeadline {font-size: medium; font-weight: bold; margin-bottom: 1em}
-                div#Authorname, div#Date {font-size: x-small; color: #696969;}
-                '''
-
-    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
-                          'publisher': publisher}
-
-    keep_only_tags = []
-    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Rubricname'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Headline'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'SubHeadline'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Authorname'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Date'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'BodyContent'}))
-
-    remove_attributes = ['style']
-
-    feeds = []
-    feeds.append((u'Journal of Accountancy', u'http://feeds2.feedburner.com/JournalOfAccountancy'))
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class JournalOfAccountancyRecipe(BasicNewsRecipe):  # Feed-based recipe for the Journal of Accountancy; it only sets class attributes.
+    __license__  = 'GPL v3'
+    __author__ = 'kwetal'
+    language = 'en'
+    version = 1
+
+    title = u'Journal of Accountancy'
+    publisher = u'AICPA'
+    category = u'News, Accountancy'
+    description = u'Publication of the American Institute of Certified Public Accountants'
+
+    use_embedded_content = False
+    remove_empty_feeds = True
+    oldest_article = 30  # presumably a number of days -- confirm against the BasicNewsRecipe docs
+    max_articles_per_feed = 100
+
+    no_stylesheets = True
+    remove_javascript = True
+
+    extra_css = '''
+                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+                div#Rubricname {font-size: small; color: #666666; margin-bottom: 1em;}
+                div#Headline {font-size: x-large; font-weight: bold; margin-bottom: 0.6em}
+                div#SubHeadline {font-size: medium; font-weight: bold; margin-bottom: 1em}
+                div#Authorname, div#Date {font-size: x-small; color: #696969;}
+                '''  # CSS text; its id selectors match the div ids listed in keep_only_tags below (BodyContent has no rule of its own)
+
+    conversion_options = {'comments': description, 'tags': category, 'language': 'en',  # reuses description/category/publisher defined above, so this must stay below them
+                          'publisher': publisher}
+
+    keep_only_tags = []  # filled by the appends below: six <div> matchers selected by id
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Rubricname'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Headline'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'SubHeadline'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Authorname'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Date'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'BodyContent'}))
+
+    remove_attributes = ['style']
+
+    feeds = []  # filled below with a single (section title, feed URL) pair
+    feeds.append((u'Journal of Accountancy', u'http://feeds2.feedburner.com/JournalOfAccountancy'))
--- a/resources/recipes/propublica.recipe
+++ b/resources/recipes/propublica.recipe
@@ -1,60 +1,60 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-
-class ProPublicaRecipe(BasicNewsRecipe):
-    __license__  = 'GPL v3'
-    __author__ = 'kwetal'
-    language = 'en_US'
-    version = 1
-
-    title = u'Pro Publica'
-    publisher = u'ProPublica.org'
-    category = u'Political blog'
-    description = u'Independent investigative journalism in the public interest.'
-
-    oldest_article = 14
-    max_articles_per_feed = 100
-    use_embedded_content = False
-
-    remove_empty_feeds = True
-    no_stylesheets = True
-    remove_javascript = True
-
-    keep_only_tags = []
-    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'article'}))
-
-    remove_tags = []
-    remove_tags.append(dict(name = 'div', attrs = {'id': 'rollups'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class': 'follow_info'}))
-    remove_tags.append(dict(name = 'ul', attrs = {'class': 'long-tools-top'}))
-    remove_tags.append(dict(name = 'ul', attrs = {'id': 'share-box'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class': 'tags'}))
-    remove_tags.append(dict(name = 'ul', attrs = {'class': 'long-tools'}))
-    remove_tags.append(dict(name = 'ul', attrs = {'id': 'share-box2'}))
-    remove_tags.append(dict(name = 'p', attrs = {'id': 'original-url'}))
-
-    feeds = []
-    feeds.append((u'Top Stories', u'http://feeds.propublica.org/propublica/main'))
-    feeds.append((u'Stimulus', u'http://feeds.propublica.org/propublica/watchdog/stimulus'))
-    feeds.append((u'Bailout', u'http://feeds.propublica.org/propublica/watchdog/bailout'))
-    feeds.append((u'Business', u'http://feeds.propublica.org/propublica/business-money'))
-    feeds.append((u'Justice', u'http://feeds.propublica.org/propublica/justice-law'))
-    feeds.append((u'Energy & Environment', u'http://feeds.propublica.org/propublica/energy-environment'))
-    feeds.append((u'Government & Politics', u'http://feeds.propublica.org/propublica/government-politics'))
-    feeds.append((u'Health & Science', u'http://feeds.propublica.org/propublica/health-science'))
-    feeds.append((u'Media & Technology', u'http://feeds.propublica.org/propublica/media-technology'))
-    feeds.append((u'National Security', u'http://feeds.propublica.org/propublica/national-security'))
-    #feeds.append((u'', u''))
-
-    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
-                          'publisher': publisher}
-
-    extra_css = '''
-                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
-                img {float: left; margin-right: 0.5em;}
-                h1 {text-align: left;}
-                a, a[href] {text-decoration: none; color: blue;}
-                div.cat {font-size: x-small; color: #666666; margin-bottom: 0.1em;}
-                div.info {font-size: small; color: #696969;}
-                '''
-    
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ProPublicaRecipe(BasicNewsRecipe):  # Feed-based recipe for ProPublica; it only sets class attributes.
+    __license__  = 'GPL v3'
+    __author__ = 'kwetal'
+    language = 'en_US'
+    version = 1
+
+    title = u'Pro Publica'
+    publisher = u'ProPublica.org'
+    category = u'Political blog'
+    description = u'Independent investigative journalism in the public interest.'
+
+    oldest_article = 14  # presumably a number of days -- confirm against the BasicNewsRecipe docs
+    max_articles_per_feed = 100
+    use_embedded_content = False
+
+    remove_empty_feeds = True
+    no_stylesheets = True
+    remove_javascript = True
+
+    keep_only_tags = []  # filled below with one matcher: <div class="article">
+    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'article'}))
+
+    remove_tags = []  # filled below with matchers made of an element name plus an id or class
+    remove_tags.append(dict(name = 'div', attrs = {'id': 'rollups'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class': 'follow_info'}))
+    remove_tags.append(dict(name = 'ul', attrs = {'class': 'long-tools-top'}))
+    remove_tags.append(dict(name = 'ul', attrs = {'id': 'share-box'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class': 'tags'}))
+    remove_tags.append(dict(name = 'ul', attrs = {'class': 'long-tools'}))
+    remove_tags.append(dict(name = 'ul', attrs = {'id': 'share-box2'}))
+    remove_tags.append(dict(name = 'p', attrs = {'id': 'original-url'}))
+
+    feeds = []  # filled below with (section title, feed URL) pairs
+    feeds.append((u'Top Stories', u'http://feeds.propublica.org/propublica/main'))
+    feeds.append((u'Stimulus', u'http://feeds.propublica.org/propublica/watchdog/stimulus'))
+    feeds.append((u'Bailout', u'http://feeds.propublica.org/propublica/watchdog/bailout'))
+    feeds.append((u'Business', u'http://feeds.propublica.org/propublica/business-money'))
+    feeds.append((u'Justice', u'http://feeds.propublica.org/propublica/justice-law'))
+    feeds.append((u'Energy & Environment', u'http://feeds.propublica.org/propublica/energy-environment'))
+    feeds.append((u'Government & Politics', u'http://feeds.propublica.org/propublica/government-politics'))
+    feeds.append((u'Health & Science', u'http://feeds.propublica.org/propublica/health-science'))
+    feeds.append((u'Media & Technology', u'http://feeds.propublica.org/propublica/media-technology'))
+    feeds.append((u'National Security', u'http://feeds.propublica.org/propublica/national-security'))
+    #feeds.append((u'', u''))
+
+    conversion_options = {'comments': description, 'tags': category, 'language': 'en',  # reuses description/category/publisher defined above, so this must stay below them
+                          'publisher': publisher}
+
+    extra_css = '''
+                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+                img {float: left; margin-right: 0.5em;}
+                h1 {text-align: left;}
+                a, a[href] {text-decoration: none; color: blue;}
+                div.cat {font-size: x-small; color: #666666; margin-bottom: 0.1em;}
+                div.info {font-size: small; color: #696969;}
+                '''  # CSS text: body font, floated images, left-aligned h1, link colours, small grey div.cat / div.info text
+