Update High Country News

2025-07-09 03:04:10 -04:00 · 2013-08-18 21:32:37 +05:30 · 2013-08-18 21:32:37 +05:30 · fdd9b43f51
commit fdd9b43f51
parent c2a572faa7
2 changed files with 61 additions and 59 deletions
--- a/recipes/high_country_blogs.recipe
+++ b/recipes/high_country_blogs.recipe
@ -1,44 +0,0 @@
-# -*- coding: utf-8 -*-
-__license__   = 'GPL v3'
-__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
-
-'''
-Fetch High Country News - Blogs
-'''
-from calibre.web.feeds.news import BasicNewsRecipe
-class HighCountryNewsBlogs(BasicNewsRecipe):
-
-    title                 = u'High Country News - Blogs'
-    description           = u'High Country News - Blogs (RSS Version)'
-    __author__            = 'Armin Geller' # 2012-08-01
-    publisher             = 'High Country News'
-    category              = 'news, politics, Germany'
-    timefmt               = ' [%a, %d %b %Y]'
-    language              = 'en'
-    encoding              = 'UTF-8'
-    publication_type      = 'newspaper'
-    oldest_article        = 7
-    max_articles_per_feed = 100
-    no_stylesheets        = True
-    auto_cleanup          = True
-    remove_javascript     = True
-    use_embedded_content  = False
-    masthead_url          = 'http://www.hcn.org/logo.jpg'
-    cover_source          = 'http://www.hcn.org'
-
-    def get_cover_url(self):
-       cover_source_soup = self.index_to_soup(self.cover_source)
-       preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'})
-       return preview_image_div.div.img['src']
-
-    feeds = [
-              (u'From the Blogs', u'http://feeds.feedburner.com/hcn/FromTheBlogs?format=xml'),
-
-              (u'Heard around the West', u'http://feeds.feedburner.com/hcn/heard?format=xml'),
-              (u'The GOAT Blog', u'http://feeds.feedburner.com/hcn/goat?format=xml'),
-              (u'The Range', u'http://feeds.feedburner.com/hcn/range?format=xml'),
-             ]
-
-    def print_version(self, url):
-          return url
-
--- a/recipes/high_country_news.recipe
+++ b/recipes/high_country_news.recipe
@ -1,6 +1,12 @@
 # -*- coding: utf-8 -*-
+#
+# Written:      2012-01-28
+# Last Edited:  2013-08-18
+# Remark:       Version 1.2
+# Integrates the formerly separate Blogs recipe (high_country_blogs.recipe)
+#
 __license__   = 'GPL v3'
-__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
+__copyright__ = '2013, Armin Geller'

 '''
 Fetch High Country News
@ -9,35 +15,75 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class HighCountryNews(BasicNewsRecipe):

    title                 = u'High Country News'
-    description           = u'News from the American West'
-    __author__            = 'Armin Geller' # 2012-01-31
+    description           = u'High Country News (RSS Version)'
+    __author__            = 'Armin Geller'
    publisher             = 'High Country News'
+    category              = 'news, politics'
    timefmt               = ' [%a, %d %b %Y]'
    language              = 'en'
    encoding              = 'UTF-8'
    publication_type      = 'newspaper'
-    oldest_article        = 7
+    oldest_article        = 14
    max_articles_per_feed = 100
    no_stylesheets        = True
-    auto_cleanup          = True
+    auto_cleanup          = False
    remove_javascript     = True
+    remove_empty_feeds    = True  # 2013-08-18 AGe add
    use_embedded_content  = False
-    masthead_url          = 'http://www.hcn.org/logo.jpg' # 2012-01-31 AGe add
-    cover_source          = 'http://www.hcn.org'          # 2012-01-31 AGe add

-    def get_cover_url(self):                              # 2012-01-31 AGe add
+    masthead_url          = 'http://www.hcn.org/logo.jpg'
+    cover_source          = 'http://www.hcn.org'
+
+    def get_cover_url(self):
+       """Return the ``src`` of the cover image found on the ``cover_source`` page.
+
+       The page at ``self.cover_source`` is loaded with ``index_to_soup``; the
+       element whose ``class`` attribute equals the literal string below is
+       located, and the ``src`` of the first ``<img>`` inside its first
+       ``<div>`` is returned.
+       """
        cover_source_soup = self.index_to_soup(self.cover_source)
+       # NOTE(review): the class value starts with a space and is matched as one
+       # exact string - confirm it still matches the live homepage markup.
        preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'})
+       # NOTE(review): presumably find() yields None when nothing matches, which
+       # would raise AttributeError on the next line - confirm and guard.
        return preview_image_div.div.img['src']

    feeds = [
-              (u'Most recent', u'http://feeds.feedburner.com/hcn/most-recent'),
-              (u'Current Issue', u'http://feeds.feedburner.com/hcn/current-issue'),
+              (u'Most recent', u'http://feeds.feedburner.com/hcn/most-recent?format=xml'),
+              (u'Current Issue', u'http://feeds.feedburner.com/hcn/current-issue?format=xml'),
+
+              (u'From the Blogs', u'http://feeds.feedburner.com/hcn/FromTheBlogs?format=xml'),  # 2013-07-23 AGe add
+              (u'Heard around the West', u'http://feeds.feedburner.com/hcn/heard?format=xml'),  # 2013-07-23 AGe add
+              (u'The GOAT Blog', u'http://feeds.feedburner.com/hcn/goat?format=xml'),          # 2013-07-23 AGe add
+              (u'The Range', u'http://feeds.feedburner.com/hcn/range?format=xml'),             # 2013-07-23 AGe add

              (u'Writers on the Range', u'http://feeds.feedburner.com/hcn/wotr'),
              (u'High Country Views', u'http://feeds.feedburner.com/hcn/HighCountryViews'),
             ]

-    def print_version(self, url):
-          return url + '/print_view'
 # 2013-07-23 AGe: new implementation that no longer relies on print_version

+    # Tag filter: keep only the <div id="content"> element (and its children)
+    # of each downloaded article page.
+    keep_only_tags    = [
+                          dict(name='div', attrs={'id':['content']}),
+                        ]
+
+    # Tag filter: <div> elements with any of these class values are removed
+    # from the article. Each entry is the complete class attribute string.
+    remove_tags = [
+                    dict(name='div', attrs={'class':['documentActions supercedeDocumentActions editorialDocumentActions',
+                                                      'documentActions supercedeDocumentActions editorialDocumentActions editorialFooterDocumentActions',
+                                                      'article-sidebar',
+                                                      'image-viewer-controls nojs',
+                                                      'protectedArticleWrapper',
+                                                      'visualClear',
+                                                     ]})
+                  ]
+
+    INDEX                 = ''
+    def append_page(self, soup, appendtag, position):
+        """Recursively pull the text of follow-on pages into ``appendtag``.
+
+        Looks in ``soup`` for a ``<span class="next">`` pager element. When it
+        holds a link, the linked page is fetched with ``index_to_soup``; its
+        ``<div class="article-text">`` is first extended with any further
+        pages, then detached and inserted into ``appendtag`` at ``position``.
+
+        soup      -- parsed page to inspect for a "next" link
+        appendtag -- tag that receives the next page's article text
+        position  -- child index in ``appendtag`` used for the insertion
+
+        Returns None. Does nothing when there is no usable "next" link or
+        when the linked page has no article-text div.
+        """
+        pager = soup.find('span', attrs={'class': 'next'})
+        # The pager span may be absent, or present without a usable link;
+        # stop quietly instead of failing the whole article download.
+        link = pager.a if pager is not None else None
+        if link is None or not link.get('href'):
+            return
+        # INDEX is prefixed to the href found in the pager link.
+        nexturl = self.INDEX + link['href']
+        soup2 = self.index_to_soup(nexturl)
+        texttag = soup2.find('div', attrs={'class': 'article-text'})
+        if texttag is None:
+            # Continuation page without an article body: nothing to append.
+            return
+        # Recurse first so later pages are already merged into texttag
+        # before it is moved into the current page.
+        self.append_page(soup2, texttag, len(texttag.contents))
+        texttag.extract()
+        appendtag.insert(position, texttag)
+
+    def preprocess_html(self, soup):
+        """Merge follow-on pages into ``soup`` and strip the article pager bar.
+
+        Calls ``append_page`` to insert continuation text into ``soup.body``
+        at child index 3, removes the ``<div>`` whose class is
+        ``listingBar listingBar-article`` when one is found, and returns the
+        result of ``self.adeify_images(soup)``.
+        """
+        self.append_page(soup, soup.body, 3)
+        listing_bar = soup.find(
+            'div', attrs={'class': 'listingBar listingBar-article'})
+        if listing_bar:
+            # The pager bar is not needed once all pages are merged.
+            listing_bar.extract()
+        return self.adeify_images(soup)