New recipe for The Indian Express by Krittika Goyal

2025-07-09 03:04:10 -04:00 · 2009-12-31 16:21:18 -07:00 · 2009-12-31 16:21:18 -07:00 · 95cf14a3ad
commit 95cf14a3ad
parent 49383b0db5
5 changed files with 271 additions and 214 deletions
--- a/resources/recipes/biggovernment.recipe
+++ b/resources/recipes/biggovernment.recipe
@ -1,28 +1,27 @@
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
-
+class BigGovernmentRecipe(BasicNewsRecipe):
-class BigGovernmentRecipe(BasicNewsRecipe):
+    __license__  = 'GPL v3'
-    __license__  = 'GPL v3'
+    __author__ = 'kwetal'
-    __author__ = 'kwetal'
+    language = 'en_US'
-    language = 'en_US'
+    version = 1
-    version = 1
+
-
+    title = u'Big Government'
-    title = u'Big Government'
+    publisher = u'Andrew Breitbart'
-    publisher = u'Andrew Breitbart'
+    category = u'Political blog'
-    category = u'Political blog'
+    description = u'Political news from the USA'
-    description = u'Political news from the USA'
+
-
+    oldest_article = 30
-    oldest_article = 30
+    max_articles_per_feed = 100
-    max_articles_per_feed = 100
+    use_embedded_content = True
-    use_embedded_content = True
+
-
+    feeds = [(u'Big Government', u'http://feeds.feedburner.com/BigGovernment')]
-    feeds = [(u'Big Government', u'http://feeds.feedburner.com/BigGovernment')]
+
-
+    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
-    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
+                          'publisher': publisher}
-                          'publisher': publisher}
+
-
+    extra_css = '''
-    extra_css = '''
+                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
-                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+                img {float: left; margin-right: 0.5em;}
-                img {float: left; margin-right: 0.5em;}
+                '''
-                '''
+
--- a/resources/recipes/eluniversalimpresa.recipe
+++ b/resources/recipes/eluniversalimpresa.recipe
@ -1,82 +1,82 @@
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe
-
+
-class ElUniversalImpresaRecipe(BasicNewsRecipe):
+class ElUniversalImpresaRecipe(BasicNewsRecipe):
-    __license__  = 'GPL v3'
+    __license__  = 'GPL v3'
-    __author__ = 'kwetal'
+    __author__ = 'kwetal'
-    language = 'es'
+    language = 'es'
-    version = 1
+    version = 1
-
+
-    title = u'El Universal (Edici\u00F3n Impresa)'
+    title = u'El Universal (Edici\u00F3n Impresa)'
-    publisher = u'El Universal'
+    publisher = u'El Universal'
-    category = u'News, Mexico'
+    category = u'News, Mexico'
-    description = u'News from Mexico'
+    description = u'News from Mexico'
-
+
-    remove_empty_feeds = True
+    remove_empty_feeds = True
-    remove_javascript = True
+    remove_javascript = True
-
+
-    INDEX = 'http://www.eluniversal.com.mx'
+    INDEX = 'http://www.eluniversal.com.mx'
-
+
-    extra_css = '''
+    extra_css = '''
-                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
-                '''
+                '''
-
+
-    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
+    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
-                          'publisher': publisher, 'linearize_tables': True}
+                          'publisher': publisher, 'linearize_tables': True}
-
+
-    def parse_index(self):
+    def parse_index(self):
-        soup = self.index_to_soup('http://www.eluniversal.com.mx/edicion_impresa.html')
+        soup = self.index_to_soup('http://www.eluniversal.com.mx/edicion_impresa.html')
-        index = []
+        index = []
-
+
-        table = soup.find('table', attrs = {'width': '500'})
+        table = soup.find('table', attrs = {'width': '500'})
-        articles = []
+        articles = []
-        for td in table.findAll(lambda tag: tag.name == 'td' and tag.has_key('class') and tag['class'] == 'arnegro12'):
+        for td in table.findAll(lambda tag: tag.name == 'td' and tag.has_key('class') and tag['class'] == 'arnegro12'):
-            a = td.a
+            a = td.a
-            a.extract()
+            a.extract()
-            title = self.tag_to_string(a)
+            title = self.tag_to_string(a)
-            url = self.INDEX + a['href']
+            url = self.INDEX + a['href']
-            description = self.tag_to_string(td)
+            description = self.tag_to_string(td)
-            articles.append({'title': title, 'date': None, 'url': url, 'description' : description})
+            articles.append({'title': title, 'date': None, 'url': url, 'description' : description})
-
+
-        index.append(('Primera Plana', articles))
+        index.append(('Primera Plana', articles))
-
+
-        for td in table.findAll(lambda tag: tag.name == 'td' and len(tag.attrs) == 0):
+        for td in table.findAll(lambda tag: tag.name == 'td' and len(tag.attrs) == 0):
-            articles = []
+            articles = []
-            feedTitle = None
+            feedTitle = None
-            for a in td.findAll('a'):
+            for a in td.findAll('a'):
-                if not feedTitle:
+                if not feedTitle:
-                    feedTitle = self.tag_to_string(a)
+                    feedTitle = self.tag_to_string(a)
-                    continue
+                    continue
-
+
-                title = self.tag_to_string(a)
+                title = self.tag_to_string(a)
-
+
-                url = self.INDEX + a['href']
+                url = self.INDEX + a['href']
-                articles.append({'title': title, 'date': None, 'url': url, 'description': ''})
+                articles.append({'title': title, 'date': None, 'url': url, 'description': ''})
-
+
-            index.append((feedTitle, articles))
+            index.append((feedTitle, articles))
-
+
-        return index
+        return index
-
+
-    def print_version(self, url):
+    def print_version(self, url):
-        if url.find('wcarton') >= 0:
+        if url.find('wcarton') >= 0:
-            return None
+            return None
-
+
-        main, sep, id = url.rpartition('/')
+        main, sep, id = url.rpartition('/')
-
+
-        return main + '/vi_' + id
+        return main + '/vi_' + id
-
+
-    def preprocess_html(self, soup):
+    def preprocess_html(self, soup):
-        table = soup.find('table')
+        table = soup.find('table')
-        table.extract()
+        table.extract()
-
+
-        for p in soup.findAll('p'):
+        for p in soup.findAll('p'):
-            if self.tag_to_string(p).strip() == '':
+            if self.tag_to_string(p).strip() == '':
-                p.extract()
+                p.extract()
-
+
-        tag = soup.find('font', attrs = {'color': '#0F046A'})
+        tag = soup.find('font', attrs = {'color': '#0F046A'})
-        if tag:
+        if tag:
-            for attr in ['color', 'face', 'helvetica,', 'sans-serif', 'size']:
+            for attr in ['color', 'face', 'helvetica,', 'sans-serif', 'size']:
-                if tag.has_key(attr):
+                if tag.has_key(attr):
-                    del tag[attr]
+                    del tag[attr]
-            tag.name = 'h1'
+            tag.name = 'h1'
-
+
-        return soup
+        return soup
--- a/resources/recipes/indian_express.recipe
+++ b/resources/recipes/indian_express.recipe
@ -0,0 +1,57 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 class IndianExpress(BasicNewsRecipe):
     """Recipe that assembles an Indian Express edition from the RSS feeds
     listed in ``feeds``."""

     # Publication metadata.
     title = u'Indian Express'
     __author__ = 'Krittika Goyal'
     language = 'en_IN'

     # Download limits.
     oldest_article = 1  # days
     max_articles_per_feed = 25

     # Page handling options.
     encoding = 'cp1252'
     no_stylesheets = True

     # Page clean-up selectors. The 'top_head' div is the marker handed to
     # remove_tags_before; the entries in remove_tags are stripped outright.
     remove_tags_before = {'name': 'div', 'attrs': {'class': 'top_head'}}
     # Disabled in the original recipe, kept for reference:
     #remove_tags_after  = dict(name='td', attrs={'class':'newptool1'})
     remove_tags = [
         {'name': 'iframe'},
         {
             'name': 'div',
             'attrs': {
                 'class': [
                     'bookmarks_div',
                     'comment_box',
                     'bookmarks_div_bot',
                     'box',
                 ],
             },
         },
         {
             'name': 'div',
             'attrs': {
                 'id': ['footer', 'tab_innerhc', 'discussion', 'google_new'],
             },
         },
         {'name': 'a', 'attrs': {'class': 'nobdr'}},
         # Disabled in the original recipe, kept for reference:
         #dict(name='span', text=':'),
     ]

     # (section title, RSS feed URL) pairs, in the order they are listed.
     feeds = [
         ('Front Page', 'http://syndication.indianexpress.com/rss/33/front-page.xml'),
         ('Markets', 'http://syndication.indianexpress.com/rss/793/markets.xml'),
         ('Editorials', 'http://syndication.indianexpress.com/rss/35/editorials.xml'),
         ('Crime', 'http://syndication.indianexpress.com/rss/801/crime-&-justice.xml'),
         ('Cricket', 'http://syndication.indianexpress.com/rss/777/cricket.xml'),
         ('Health', 'http://syndication.indianexpress.com/rss/697/health.xml'),
         ('Asia', 'http://syndication.indianexpress.com/rss/790/asia.xml'),
         ('Politics', 'http://syndication.indianexpress.com/rss/799/politics.xml'),
         ('Mumbai', 'http://syndication.indianexpress.com/rss/707/mumbai.xml'),
         ('Op-Ed', 'http://syndication.indianexpress.com/rss/36/oped.xml'),
         ('Economy', 'http://syndication.indianexpress.com/rss/794/economy.xml'),
         ('Lifestyle', 'http://syndication.indianexpress.com/rss/713/lifestyle.xml'),
         ('Letters to the Editor', 'http://syndication.indianexpress.com/rss/40/letters-to-editor.xml'),
         ('Movie Reviews', 'http://syndication.indianexpress.com/rss/665/movie-reviews.xml'),
         ('Bollywood', 'http://syndication.indianexpress.com/rss/887/bollywood.xml'),
     ]

     def print_version(self, url):
         """Return ``url`` with ``'/0'`` appended.

         NOTE(review): presumably the ``/0`` suffix selects the site's
         print-friendly layout -- confirm against the live site.
         """
         suffix = '/0'
         return url + suffix
--- a/resources/recipes/journalofaccountancy.recipe
+++ b/resources/recipes/journalofaccountancy.recipe
@ -1,44 +1,45 @@
-from calibre.web.feeds.news import BasicNewsRecipe
+
-
+from calibre.web.feeds.news import BasicNewsRecipe
-class JournalOfAccountancyRecipe(BasicNewsRecipe):
+
-    __license__  = 'GPL v3'
+class JournalOfAccountancyRecipe(BasicNewsRecipe):
-    __author__ = 'kwetal'
+    __license__  = 'GPL v3'
-    language = 'en'
+    __author__ = 'kwetal'
-    version = 1
+    language = 'en'
-
+    version = 1
-    title = u'Journal of Accountancy'
+
-    publisher = u'AICPA'
+    title = u'Journal of Accountancy'
-    category = u'News, Accountancy'
+    publisher = u'AICPA'
-    description = u'Publication of the American Institute of Certified Public Accountants'
+    category = u'News, Accountancy'
-
+    description = u'Publication of the American Institute of Certified Public Accountants'
-    use_embedded_content = False
+
-    remove_empty_feeds = True
+    use_embedded_content = False
-    oldest_article = 30
+    remove_empty_feeds = True
-    max_articles_per_feed = 100
+    oldest_article = 30
-
+    max_articles_per_feed = 100
-    no_stylesheets = True
+
-    remove_javascript = True
+    no_stylesheets = True
-
+    remove_javascript = True
-    extra_css = '''
+
-                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+    extra_css = '''
-                div#Rubricname {font-size: small; color: #666666; margin-bottom: 1em;}
+                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
-                div#Headline {font-size: x-large; font-weight: bold; margin-bottom: 0.6em}
+                div#Rubricname {font-size: small; color: #666666; margin-bottom: 1em;}
-                div#SubHeadline {font-size: medium; font-weight: bold; margin-bottom: 1em}
+                div#Headline {font-size: x-large; font-weight: bold; margin-bottom: 0.6em}
-                div#Authorname, div#Date {font-size: x-small; color: #696969;}
+                div#SubHeadline {font-size: medium; font-weight: bold; margin-bottom: 1em}
-                '''
+                div#Authorname, div#Date {font-size: x-small; color: #696969;}
-
+                '''
-    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
+
-                          'publisher': publisher}
+    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
-
+                          'publisher': publisher}
-    keep_only_tags = []
+
-    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Rubricname'}))
+    keep_only_tags = []
-    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Headline'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Rubricname'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'SubHeadline'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Headline'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Authorname'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'SubHeadline'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Date'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Authorname'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'BodyContent'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'Date'}))
-
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'BodyContent'}))
-    remove_attributes = ['style']
+
-
+    remove_attributes = ['style']
-    feeds = []
+
-    feeds.append((u'Journal of Accountancy', u'http://feeds2.feedburner.com/JournalOfAccountancy'))
+    feeds = []
    feeds.append((u'Journal of Accountancy', u'http://feeds2.feedburner.com/JournalOfAccountancy'))
--- a/resources/recipes/propublica.recipe
+++ b/resources/recipes/propublica.recipe
@ -1,60 +1,60 @@
-from calibre.web.feeds.news import BasicNewsRecipe
+
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.web.feeds.news import BasicNewsRecipe
-
+
-class ProPublicaRecipe(BasicNewsRecipe):
+class ProPublicaRecipe(BasicNewsRecipe):
-    __license__  = 'GPL v3'
+    __license__  = 'GPL v3'
-    __author__ = 'kwetal'
+    __author__ = 'kwetal'
-    language = 'en_US'
+    language = 'en_US'
-    version = 1
+    version = 1
-
+
-    title = u'Pro Publica'
+    title = u'Pro Publica'
-    publisher = u'ProPublica.org'
+    publisher = u'ProPublica.org'
-    category = u'Political blog'
+    category = u'Political blog'
-    description = u'Independent investigative journalism in the public interest.'
+    description = u'Independent investigative journalism in the public interest.'
-
+
-    oldest_article = 14
+    oldest_article = 14
-    max_articles_per_feed = 100
+    max_articles_per_feed = 100
-    use_embedded_content = False
+    use_embedded_content = False
-
+
-    remove_empty_feeds = True
+    remove_empty_feeds = True
-    no_stylesheets = True
+    no_stylesheets = True
-    remove_javascript = True
+    remove_javascript = True
-
+
-    keep_only_tags = []
+    keep_only_tags = []
-    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'article'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'article'}))
-
+
-    remove_tags = []
+    remove_tags = []
-    remove_tags.append(dict(name = 'div', attrs = {'id': 'rollups'}))
+    remove_tags.append(dict(name = 'div', attrs = {'id': 'rollups'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class': 'follow_info'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class': 'follow_info'}))
-    remove_tags.append(dict(name = 'ul', attrs = {'class': 'long-tools-top'}))
+    remove_tags.append(dict(name = 'ul', attrs = {'class': 'long-tools-top'}))
-    remove_tags.append(dict(name = 'ul', attrs = {'id': 'share-box'}))
+    remove_tags.append(dict(name = 'ul', attrs = {'id': 'share-box'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class': 'tags'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class': 'tags'}))
-    remove_tags.append(dict(name = 'ul', attrs = {'class': 'long-tools'}))
+    remove_tags.append(dict(name = 'ul', attrs = {'class': 'long-tools'}))
-    remove_tags.append(dict(name = 'ul', attrs = {'id': 'share-box2'}))
+    remove_tags.append(dict(name = 'ul', attrs = {'id': 'share-box2'}))
-    remove_tags.append(dict(name = 'p', attrs = {'id': 'original-url'}))
+    remove_tags.append(dict(name = 'p', attrs = {'id': 'original-url'}))
-
+
-    feeds = []
+    feeds = []
-    feeds.append((u'Top Stories', u'http://feeds.propublica.org/propublica/main'))
+    feeds.append((u'Top Stories', u'http://feeds.propublica.org/propublica/main'))
-    feeds.append((u'Stimulus', u'http://feeds.propublica.org/propublica/watchdog/stimulus'))
+    feeds.append((u'Stimulus', u'http://feeds.propublica.org/propublica/watchdog/stimulus'))
-    feeds.append((u'Bailout', u'http://feeds.propublica.org/propublica/watchdog/bailout'))
+    feeds.append((u'Bailout', u'http://feeds.propublica.org/propublica/watchdog/bailout'))
-    feeds.append((u'Business', u'http://feeds.propublica.org/propublica/business-money'))
+    feeds.append((u'Business', u'http://feeds.propublica.org/propublica/business-money'))
-    feeds.append((u'Justice', u'http://feeds.propublica.org/propublica/justice-law'))
+    feeds.append((u'Justice', u'http://feeds.propublica.org/propublica/justice-law'))
-    feeds.append((u'Energy & Environment', u'http://feeds.propublica.org/propublica/energy-environment'))
+    feeds.append((u'Energy & Environment', u'http://feeds.propublica.org/propublica/energy-environment'))
-    feeds.append((u'Government & Politics', u'http://feeds.propublica.org/propublica/government-politics'))
+    feeds.append((u'Government & Politics', u'http://feeds.propublica.org/propublica/government-politics'))
-    feeds.append((u'Health & Science', u'http://feeds.propublica.org/propublica/health-science'))
+    feeds.append((u'Health & Science', u'http://feeds.propublica.org/propublica/health-science'))
-    feeds.append((u'Media & Technology', u'http://feeds.propublica.org/propublica/media-technology'))
+    feeds.append((u'Media & Technology', u'http://feeds.propublica.org/propublica/media-technology'))
-    feeds.append((u'National Security', u'http://feeds.propublica.org/propublica/national-security'))
+    feeds.append((u'National Security', u'http://feeds.propublica.org/propublica/national-security'))
-    #feeds.append((u'', u''))
+    #feeds.append((u'', u''))
-
+
-    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
+    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
-                          'publisher': publisher}
+                          'publisher': publisher}
-
+
-    extra_css = '''
+    extra_css = '''
-                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
-                img {float: left; margin-right: 0.5em;}
+                img {float: left; margin-right: 0.5em;}
-                h1 {text-align: left;}
+                h1 {text-align: left;}
-                a, a[href] {text-decoration: none; color: blue;}
+                a, a[href] {text-decoration: none; color: blue;}
-                div.cat {font-size: x-small; color: #666666; margin-bottom: 0.1em;}
+                div.cat {font-size: x-small; color: #666666; margin-bottom: 0.1em;}
-                div.info {font-size: small; color: #696969;}
+                div.info {font-size: small; color: #696969;}
-                '''
+                '''
-    
+