Fix #7210 (Download News)

2025-07-09 03:04:10 -04:00 · 2010-10-30 12:15:00 -06:00 · 2010-10-30 12:15:00 -06:00 · 7d7757ab93
commit 7d7757ab93
parent 0bff8a7d5a
2 changed files with 34 additions and 62 deletions
--- a/resources/recipes/nzherald.recipe
+++ b/resources/recipes/nzherald.recipe
@ -1,74 +1,43 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
 import re
 class NewZealandHerald(BasicNewsRecipe):
    title       = 'New Zealand Herald'
-    __author__  = 'Krittika Goyal'
+    __author__  = 'Kovid Goyal'
    description = 'Daily news'
    timefmt = ' [%d %b, %Y]'
    language = 'en_NZ'
    oldest_article = 2.5
-    no_stylesheets = True
+    feeds = [
-    remove_tags_before = dict(name='div', attrs={'class':'contentContainer left eight'})
+            ('Business',
-    remove_tags_after  = dict(name='div', attrs={'class':'callToAction'})
+                'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000003.xml'),
-    remove_tags = [
+            ('World',
-       dict(name='iframe'),
+                'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000002.xml'),
-       dict(name='div', attrs={'class':['sectionHeader', 'tools','callToAction', 'contentContainer right two nopad relatedColumn']}),
+            ('National',
-       #dict(name='div', attrs={'id':['shareContainer']}),
+                'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000001.xml'),
-       #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or #author')"}),
+            ('Entertainment',
-       #dict(name='table', attrs={'cellspacing':'0'}),
+                'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_001501119.xml'),
            ('Travel',
                'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000007.xml'),
            ('Opinion',
                'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000466.xml'),
            ('Life & Style',
                'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000006.xml'),
            ('Technology',
                'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000005.xml'),
            ('Sport',
                'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000004.xml'),
            ('Motoring',
                'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000009.xml'),
            ('Property',
                'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000008.xml'),
    ]
-    def preprocess_html(self, soup):
+    def print_version(self, url):
-        table = soup.find('table')
+        m = re.search(r'objectid=(\d+)', url)
-        if table is not None:
+        if m is None:
-            table.extract()
+            return url
-        return soup
+        return 'http://www.nzherald.co.nz/news/print.cfm?pnum=1&objectid=' + m.group(1)
    # Collect the articles listed on a single section page
    def nz_parse_section(self, url):
            """Scrape one section headline page and return its newest articles.

            The page at ``url`` is fetched with ``self.index_to_soup``. Inside
            the element with class ``col-300 categoryList`` the first
            ``link-list-heading`` element is located, and the links of every
            following ``linkList`` element are collected until the next
            ``link-list-heading`` is reached.

            Returns a list of dicts with the keys ``title``, ``url``,
            ``description`` and ``date`` (the last two are always empty
            strings). An empty list is returned when the expected container
            or heading is not present, rather than failing with an
            AttributeError on ``None``.
            """
            soup = self.index_to_soup(url)
            div = soup.find(attrs={'class':'col-300 categoryList'})
            if div is None:
                self.log('\t\tNo article list found in', url)
                return []
            date = div.find(attrs={'class':'link-list-heading'})
            if date is None:
                self.log('\t\tNo list heading found in', url)
                return []
            current_articles = []
            for x in date.findAllNext(attrs={'class':['linkList', 'link-list-heading']}):
                # The next heading marks the start of an older batch of links.
                if x.get('class') == 'link-list-heading':
                    break
                for li in x.findAll('li'):
                    a = li.find('a', href=True)
                    if a is None:
                        continue
                    title = self.tag_to_string(a)
                    # Kept separate from the ``url`` parameter so the section
                    # address is not overwritten while iterating.
                    article_url = a.get('href', False)
                    if not article_url or not title:
                        continue
                    if article_url.startswith('/'):
                        # Site-relative link: make it absolute.
                        article_url = 'http://www.nzherald.co.nz'+article_url
                    self.log('\t\tFound article:', title)
                    self.log('\t\t\t', article_url)
                    current_articles.append({'title': title, 'url':article_url,
                        'description':'', 'date':''})
            return current_articles
    # Build the list of sections together with their articles
    def parse_index(self):
            """Return ``[(section title, articles), ...]`` for the headline pages.

            Each section page is handed to ``self.nz_parse_section`` in the
            order listed below; sections for which it returns nothing are
            left out of the result.
            """
            sections = (
                ('National',
                 'http://www.nzherald.co.nz/nz/news/headlines.cfm?c_id=1'),
                ('World',
                 'http://www.nzherald.co.nz/world/news/headlines.cfm?c_id=2'),
                ('Politics',
                 'http://www.nzherald.co.nz/politics/news/headlines.cfm?c_id=280'),
                ('Crime',
                 'http://www.nzherald.co.nz/crime/news/headlines.cfm?c_id=30'),
                ('Environment',
                 'http://www.nzherald.co.nz/environment/news/headlines.cfm?c_id=39'),
            )
            collected = []
            for section_name, section_url in sections:
                entries = self.nz_parse_section(section_url)
                if not entries:
                    # Nothing scraped for this section; skip it.
                    continue
                collected.append((section_name, entries))
            return collected
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@ -957,6 +957,8 @@ class BasicNewsRecipe(Recipe):
            self.log.error(_('Could not download cover: %s')%str(err))
            self.log.debug(traceback.format_exc())
        else:
            if not cu:
                return
            cdata = None
            if os.access(cu, os.R_OK):
                cdata = open(cu, 'rb').read()
@ -987,6 +989,7 @@ class BasicNewsRecipe(Recipe):
            self.cover_path = cpath
    def download_cover(self):
        self.cover_path = None
        try:
            self._download_cover()
        except: