Fix #5972 (Houston Chronicle Recipe Fails)

2026-01-04 11:10:20 -05:00 · 2010-06-27 15:54:47 -06:00 · 2010-06-27 15:54:47 -06:00 · f66a9077c6
commit f66a9077c6
parent 951df0948b
1 changed files with 20 additions and 48 deletions
--- a/resources/recipes/houston_chronicle.recipe
+++ b/resources/recipes/houston_chronicle.recipe
@@ -1,12 +1,15 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+import string, pprint
+
 from calibre.web.feeds.news import BasicNewsRecipe

 class HoustonChronicle(BasicNewsRecipe):

    title          = u'The Houston Chronicle'
    description    = 'News from Houston, Texas'
-    __author__	   = 'Kovid Goyal and Sujata Raman'
+    __author__	   = 'Kovid Goyal'
    language       = 'en'
    timefmt        = ' [%a, %d %b, %Y]'
    no_stylesheets = True
@@ -38,54 +41,23 @@ class HoustonChronicle(BasicNewsRecipe):


    def parse_index(self):
-        soup = self.index_to_soup('http://www.chron.com/news/')
-        container = soup.find('table', attrs={'class':'body-columns'})
-
+        categories = ['news', 'sports', 'business', 'entertainment', 'life',
+                'travel']
        feeds = []
-        current_section = 'Top Stories'
-        current_articles = []
-
-        self.log('\tFound section:', current_section)
-
-        for div in container.findAll('div'):
-            if div.get('class', None) == 'module-mast':
-                t = self.tag_to_string(div).replace(u'\xbb', '').strip()
-                if t and 'interactives' not in t:
-                    if current_section and current_articles:
-                        feeds.append((current_section, current_articles))
-                    current_section = t
-                    current_articles = []
-                    self.log('\tFound section:', current_section)
-            elif div.get('storyid', False):
-                a = div.find('a', href=True)
-                if a:
-                    title = self.tag_to_string(a)
-                    url = a.get('href')
-                    if title and url:
-                        if url.startswith('/'):
-                            url = 'http://www.chron.com'+url
-                        self.log('\t\tFound article:', title)
-                        self.log('\t\t\t', url)
-                        current_articles.append({'title':title, 'url':url,
-                            'date':'', 'description':''})
-            elif div.get('class', None) == 'columnbox' and \
-                    'special' in current_section.lower():
-                a = div.find('a')
-                if a:
-                    title = self.tag_to_string(a)
-                    url = a.get('href')
-                    if title and url:
-                        if not url.startswith('/'): continue
-                        url = 'http://www.chron.com'+url
-                        self.log('\t\tFound article:', title)
-                        self.log('\t\t\t', url)
-                        a.extract()
-                        desc = self.tag_to_string(div)
-                        current_articles.append({'title':title, 'url':url,
-                            'date':'', 'description':desc})
-
-        if current_section and current_articles:
-            feeds.append((current_section, current_articles))
+        for cat in categories:
+            articles = []
+            soup = self.index_to_soup('http://www.chron.com/%s/'%cat)
+            for elem in soup.findAll(comptype='story', storyid=True):
+                a = elem.find('a', href=True)
+                if a is None: continue
+                url = a['href']
+                if not url.startswith('http://'):
+                    url = 'http://www.chron.com'+url
+                articles.append({'title':self.tag_to_string(a), 'url':url,
+                    'description':'', 'date':''})
+                pprint.pprint(articles[-1])
+            if articles:
+                feeds.append((string.capwords(cat), articles))
        return feeds