News Observer by Krittika Goyal

2025-12-16 18:15:03 -05:00 · 2010-03-06 22:16:38 -07:00 · 2010-03-06 22:16:38 -07:00 · ce6ef6e01a
commit ce6ef6e01a
parent 83cd0f85b6
2 changed files with 46 additions and 20 deletions
--- a/resources/recipes/newsobs.recipe
+++ b/resources/recipes/newsobs.recipe
@@ -0,0 +1,33 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class NewsAndObserver(BasicNewsRecipe):
+    title          = u'News And Observer'
+    language       = 'en'
+    __author__     = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    #encoding = 'latin1'
+
+    no_stylesheets = True
+    remove_tags_before = dict(name='div', attrs={'id':'story_header'})
+    remove_tags_after  = dict(name='div', attrs={'id':'shirttail'})
+    remove_tags = [
+       dict(name='iframe'),
+       dict(name='div', attrs={'class':['contained_round', 'contained']}),
+       dict(name='div', attrs={'id':['story_tools', 'toolbox', 'shirttail', 'comment_widget', 'stories_widget', 'classifieds_widget', 'most_popular_widget', 'footer']}),
+       #dict(name='ul', attrs={'class':'article-tools'}),
+       dict(name='ul', attrs={'id':'story_tabs'}),
+    ]
+
+
+    feeds = [
+        ('Cover', 'http://www.newsobserver.com/100/index.rss'),
+        ('News', 'http://www.newsobserver.com/102/index.rss'),
+        ('Politics', 'http://www.newsobserver.com/105/index.rss'),
+        ('Business', 'http://www.newsobserver.com/104/index.rss'),
+        ('Sports', 'http://www.newsobserver.com/103/index.rss'),
+        ('College Sports', 'http://www.newsobserver.com/119/index.rss'),
+        ('Lifestyles', 'http://www.newsobserver.com/106/index.rss'),
+        ('Editorials', 'http://www.newsobserver.com/158/index.rss')]
+
+
--- a/resources/recipes/sfbg.recipe
+++ b/resources/recipes/sfbg.recipe
@@ -1,42 +1,35 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup

 class SanFranciscoBayGuardian(BasicNewsRecipe):
    title          = u'San Francisco Bay Guardian'
    language       = 'en'
    __author__     = 'Krittika Goyal'
-    oldest_article = 1 #days
+    oldest_article = 31 #days
    max_articles_per_feed = 25
    #encoding = 'latin1'

    no_stylesheets = True
-    remove_tags_before = dict(name='div', attrs={'id':'story_header'})
-    remove_tags_after  = dict(name='div', attrs={'id':'shirttail'})
+    #remove_tags_before = dict(name='div', attrs={'id':'story_header'})
+    #remove_tags_after  = dict(name='div', attrs={'id':'shirttail'})
    remove_tags = [
       dict(name='iframe'),
       #dict(name='div', attrs={'class':'related-articles'}),
-        dict(name='div', attrs={'id':['story_tools', 'toolbox', 'shirttail', 'comment_widget']}),
+        #dict(name='div', attrs={'id':['story_tools', 'toolbox', 'shirttail', 'comment_widget']}),
       #dict(name='ul', attrs={'class':'article-tools'}),
-       dict(name='ul', attrs={'id':'story_tabs'}),
+       #dict(name='ul', attrs={'id':'story_tabs'}),
    ]


    feeds = [
-        ('Cover', 'http://www.newsobserver.com/100/index.rss'),
-        ('News', 'http://www.newsobserver.com/102/index.rss'),
-        ('Politics', 'http://www.newsobserver.com/105/index.rss'),
-        ('Business', 'http://www.newsobserver.com/104/index.rss'),
-        ('Sports', 'http://www.newsobserver.com/103/index.rss'),
-        ('College Sports', 'http://www.newsobserver.com/119/index.rss'),
-        ('Lifestyles', 'http://www.newsobserver.com/106/index.rss'),
-        ('Editorials', 'http://www.newsobserver.com/158/index.rss')]
+        ('sfbg', 'http://www.sfbg.com/rss.xml'),
+    ]


-    def preprocess_html(self, soup):
-        story = soup.find(name='div', attrs={'id':'story_body'})
+    #def preprocess_html(self, soup):
+        #story = soup.find(name='div', attrs={'id':'story_body'})
        #td = heading.findParent(name='td')
        #td.extract()
-        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
-        body = soup.find(name='body')
-        body.insert(0, story)
-        return soup
+        #soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
+        #body = soup.find(name='body')
+        #body.insert(0, story)
+        #return soup