New recipes for Pro Publica, Big Government, El Universal Impresa and Journal of Accountancy by kwetal

2026-02-03 09:33:31 -05:00 · 2009-12-30 10:48:46 -07:00 · 2009-12-30 10:48:46 -07:00 · 18a208a9f4
commit 18a208a9f4
parent 2668df9f93
7 changed files with 214 additions and 0 deletions
--- a/resources/images/news/biggovernment.png
+++ b/resources/images/news/biggovernment.png
--- a/resources/images/news/eluniversal.png
+++ b/resources/images/news/eluniversal.png
--- a/resources/images/news/propublica.png
+++ b/resources/images/news/propublica.png
--- a/resources/recipes/biggovernment.recipe
+++ b/resources/recipes/biggovernment.recipe
@ -0,0 +1,28 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class BigGovernmentRecipe(BasicNewsRecipe):
+    """Recipe for the Big Government blog, built from its FeedBurner feed.
+
+    The recipe is purely declarative: it defines class attributes only
+    and no methods of its own.
+    """
+
+    # Recipe bookkeeping.
+    __license__ = 'GPL v3'
+    __author__ = 'kwetal'
+    version = 1
+    language = 'en_US'
+
+    # Publication metadata; most of it is reused in conversion_options.
+    title = u'Big Government'
+    publisher = u'Andrew Breitbart'
+    category = u'Political blog'
+    description = u'Political news from the USA'
+
+    # Feed source and per-feed limits.
+    feeds = [
+        (u'Big Government', u'http://feeds.feedburner.com/BigGovernment'),
+    ]
+    use_embedded_content = True
+    max_articles_per_feed = 100
+    oldest_article = 30
+
+    conversion_options = {
+        'comments': description,
+        'tags': category,
+        'language': 'en',
+        'publisher': publisher,
+    }
+
+    extra_css = '''
+                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+                img {float: left; margin-right: 0.5em;}
+                '''
+
--- a/resources/recipes/eluniversalimpresa.recipe
+++ b/resources/recipes/eluniversalimpresa.recipe
@ -0,0 +1,82 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ElUniversalImpresaRecipe(BasicNewsRecipe):
+    """Recipe for the print edition of El Universal (Mexico).
+
+    parse_index() scrapes the print-edition listing page into sections,
+    print_version() rewrites each article URL to its 'vi_' variant and
+    preprocess_html() tidies every downloaded page.
+    """
+    __license__ = 'GPL v3'
+    __author__ = 'kwetal'
+    language = 'es'
+    version = 1
+
+    title = u'El Universal (Edici\u00F3n Impresa)'
+    publisher = u'El Universal'
+    category = u'News, Mexico'
+    description = u'News from Mexico'
+
+    remove_empty_feeds = True
+    remove_javascript = True
+
+    # Site root; the hrefs scraped in parse_index() are appended to it.
+    INDEX = 'http://www.eluniversal.com.mx'
+
+    extra_css = '''
+                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+                '''
+
+    # The output language follows the recipe's own language attribute; it
+    # used to be hard-coded to 'en', which mislabelled this Spanish paper.
+    conversion_options = {'comments': description, 'tags': category, 'language': language,
+                          'publisher': publisher, 'linearize_tables': True}
+
+    def parse_index(self):
+        """Build the section index from the print-edition listing page.
+
+        Returns a list of (section title, articles) tuples. Each article
+        is a dict with the keys 'title', 'date', 'url' and 'description'.
+        The first section, 'Primera Plana', comes from the td cells whose
+        class is 'arnegro12'; every attribute-less td then contributes one
+        more section, titled by its first link with non-empty text.
+
+        An empty list is returned when the expected table is not found.
+        """
+        soup = self.index_to_soup('http://www.eluniversal.com.mx/edicion_impresa.html')
+        index = []
+
+        table = soup.find('table', attrs = {'width': '500'})
+        if table is None:
+            # Without the listing table there is nothing to scrape; return
+            # an empty index instead of failing with an AttributeError.
+            return index
+
+        articles = []
+        for td in table.findAll(lambda tag: tag.name == 'td' and tag.get('class') == 'arnegro12'):
+            a = td.a
+            if a is None or not a.get('href'):
+                # Cell without a usable link: nothing to download.
+                continue
+            # Detach the link first so the cell's remaining text can serve
+            # as the description without repeating the title.
+            a.extract()
+            title = self.tag_to_string(a)
+            url = self.INDEX + a['href']
+            description = self.tag_to_string(td)
+            articles.append({'title': title, 'date': None, 'url': url, 'description' : description})
+
+        index.append(('Primera Plana', articles))
+
+        for td in table.findAll(lambda tag: tag.name == 'td' and not tag.attrs):
+            articles = []
+            feedTitle = None
+            for a in td.findAll('a'):
+                if not feedTitle:
+                    # The first link with text names the section.
+                    feedTitle = self.tag_to_string(a)
+                    continue
+
+                if not a.get('href'):
+                    continue
+
+                title = self.tag_to_string(a)
+                url = self.INDEX + a['href']
+                articles.append({'title': title, 'date': None, 'url': url, 'description': ''})
+
+            # A cell that never produced a title cannot have produced any
+            # articles either, so skipping it only drops empty sections.
+            if feedTitle:
+                index.append((feedTitle, articles))
+
+        return index
+
+    def print_version(self, url):
+        """Return the 'vi_' variant of an article URL.
+
+        The last path component gets a 'vi_' prefix. URLs containing
+        'wcarton' yield None instead.
+        """
+        if 'wcarton' in url:
+            return None
+
+        # article_id rather than id, so the builtin id() is not shadowed.
+        main, _sep, article_id = url.rpartition('/')
+
+        return main + '/vi_' + article_id
+
+    def preprocess_html(self, soup):
+        """Tidy a downloaded article page and return the modified soup.
+
+        Removes the first table if there is one, drops paragraphs with no
+        text, and turns the font tag coloured '#0F046A' into an h1.
+        """
+        table = soup.find('table')
+        if table is not None:
+            # Pages without any table used to crash here on None.extract().
+            table.extract()
+
+        for p in soup.findAll('p'):
+            if self.tag_to_string(p).strip() == '':
+                p.extract()
+
+        tag = soup.find('font', attrs = {'color': '#0F046A'})
+        if tag is not None:
+            # NOTE(review): 'helvetica,' and 'sans-serif' look like stray
+            # attribute names produced by parsing an unquoted face=...
+            # value; kept as in the original - TODO confirm against a page.
+            for attr in ['color', 'face', 'helvetica,', 'sans-serif', 'size']:
+                if tag.has_key(attr):
+                    del tag[attr]
+            tag.name = 'h1'
+
+        return soup
--- a/resources/recipes/journalofaccountancy.recipe
+++ b/resources/recipes/journalofaccountancy.recipe
@ -0,0 +1,44 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class JournalOfAccountancyRecipe(BasicNewsRecipe):
+    """Recipe for the Journal of Accountancy, built from its FeedBurner feed.
+
+    The recipe is purely declarative: it defines class attributes only
+    and no methods of its own.
+    """
+
+    # Recipe bookkeeping.
+    __license__ = 'GPL v3'
+    __author__ = 'kwetal'
+    version = 1
+    language = 'en'
+
+    # Publication metadata; most of it is reused in conversion_options.
+    title = u'Journal of Accountancy'
+    publisher = u'AICPA'
+    category = u'News, Accountancy'
+    description = u'Publication of the American Institute of Certified Public Accountants'
+
+    conversion_options = {
+        'comments': description,
+        'tags': category,
+        'language': 'en',
+        'publisher': publisher,
+    }
+
+    # Feed source and per-feed limits.
+    feeds = [
+        (u'Journal of Accountancy', u'http://feeds2.feedburner.com/JournalOfAccountancy'),
+    ]
+    oldest_article = 30
+    max_articles_per_feed = 100
+    use_embedded_content = False
+    remove_empty_feeds = True
+
+    # Page clean-up.
+    no_stylesheets = True
+    remove_javascript = True
+    remove_attributes = ['style']
+
+    # The divs listed here are the same ids that extra_css styles below,
+    # plus the article body.
+    keep_only_tags = [
+        dict(name = 'div', attrs = {'id': 'Rubricname'}),
+        dict(name = 'div', attrs = {'id': 'Headline'}),
+        dict(name = 'div', attrs = {'id': 'SubHeadline'}),
+        dict(name = 'div', attrs = {'id': 'Authorname'}),
+        dict(name = 'div', attrs = {'id': 'Date'}),
+        dict(name = 'div', attrs = {'id': 'BodyContent'}),
+    ]
+
+    extra_css = '''
+                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+                div#Rubricname {font-size: small; color: #666666; margin-bottom: 1em;}
+                div#Headline {font-size: x-large; font-weight: bold; margin-bottom: 0.6em}
+                div#SubHeadline {font-size: medium; font-weight: bold; margin-bottom: 1em}
+                div#Authorname, div#Date {font-size: x-small; color: #696969;}
+                '''
--- a/resources/recipes/propublica.recipe
+++ b/resources/recipes/propublica.recipe
@ -0,0 +1,60 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class ProPublicaRecipe(BasicNewsRecipe):
+    """Recipe for ProPublica, built from the site's topic feeds.
+
+    The recipe is purely declarative: it defines class attributes only
+    and no methods of its own.
+    """
+
+    # Recipe bookkeeping.
+    __license__ = 'GPL v3'
+    __author__ = 'kwetal'
+    version = 1
+    language = 'en_US'
+
+    # Publication metadata; most of it is reused in conversion_options.
+    title = u'Pro Publica'
+    publisher = u'ProPublica.org'
+    category = u'Political blog'
+    description = u'Independent investigative journalism in the public interest.'
+
+    conversion_options = {
+        'comments': description,
+        'tags': category,
+        'language': 'en',
+        'publisher': publisher,
+    }
+
+    # Per-feed limits.
+    oldest_article = 14
+    max_articles_per_feed = 100
+    use_embedded_content = False
+    remove_empty_feeds = True
+
+    # Page clean-up: the one element kept, then the elements removed.
+    no_stylesheets = True
+    remove_javascript = True
+
+    keep_only_tags = [
+        dict(name = 'div', attrs = {'class': 'article'}),
+    ]
+
+    remove_tags = [
+        dict(name = 'div', attrs = {'id': 'rollups'}),
+        dict(name = 'div', attrs = {'class': 'follow_info'}),
+        dict(name = 'ul', attrs = {'class': 'long-tools-top'}),
+        dict(name = 'ul', attrs = {'id': 'share-box'}),
+        dict(name = 'div', attrs = {'class': 'tags'}),
+        dict(name = 'ul', attrs = {'class': 'long-tools'}),
+        dict(name = 'ul', attrs = {'id': 'share-box2'}),
+        dict(name = 'p', attrs = {'id': 'original-url'}),
+    ]
+
+    # (section title, feed URL) pairs, in the order they are listed.
+    feeds = [
+        (u'Top Stories', u'http://feeds.propublica.org/propublica/main'),
+        (u'Stimulus', u'http://feeds.propublica.org/propublica/watchdog/stimulus'),
+        (u'Bailout', u'http://feeds.propublica.org/propublica/watchdog/bailout'),
+        (u'Business', u'http://feeds.propublica.org/propublica/business-money'),
+        (u'Justice', u'http://feeds.propublica.org/propublica/justice-law'),
+        (u'Energy & Environment', u'http://feeds.propublica.org/propublica/energy-environment'),
+        (u'Government & Politics', u'http://feeds.propublica.org/propublica/government-politics'),
+        (u'Health & Science', u'http://feeds.propublica.org/propublica/health-science'),
+        (u'Media & Technology', u'http://feeds.propublica.org/propublica/media-technology'),
+        (u'National Security', u'http://feeds.propublica.org/propublica/national-security'),
+    ]
+
+    extra_css = '''
+                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+                img {float: left; margin-right: 0.5em;}
+                h1 {text-align: left;}
+                a, a[href] {text-decoration: none; color: blue;}
+                div.cat {font-size: x-small; color: #666666; margin-bottom: 0.1em;}
+                div.info {font-size: small; color: #696969;}
+                '''
+