Fix #5972 (Houston Chronicle Recipe Fails)

2025-07-09 03:04:10 -04:00 · 2010-06-27 15:54:47 -06:00 · 2010-06-27 15:54:47 -06:00 · f66a9077c6
commit f66a9077c6
parent 951df0948b
1 changed file with 20 additions and 48 deletions
--- a/resources/recipes/houston_chronicle.recipe
+++ b/resources/recipes/houston_chronicle.recipe
@@ -1,12 +1,15 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 import string, pprint
 from calibre.web.feeds.news import BasicNewsRecipe
 class HoustonChronicle(BasicNewsRecipe):
    title          = u'The Houston Chronicle'
    description    = 'News from Houston, Texas'
-    __author__	   = 'Kovid Goyal and Sujata Raman'
+    __author__	   = 'Kovid Goyal'
    language       = 'en'
    timefmt        = ' [%a, %d %b, %Y]'
    no_stylesheets = True
@@ -38,54 +41,23 @@ class HoustonChronicle(BasicNewsRecipe):
    def parse_index(self):
-        soup = self.index_to_soup('http://www.chron.com/news/')
+        categories = ['news', 'sports', 'business', 'entertainment', 'life',
-        container = soup.find('table', attrs={'class':'body-columns'})
+                'travel']
        feeds = []
-        current_section = 'Top Stories'
+        for cat in categories:
-        current_articles = []
+            articles = []
-
+            soup = self.index_to_soup('http://www.chron.com/%s/'%cat)
-        self.log('\tFound section:', current_section)
+            for elem in soup.findAll(comptype='story', storyid=True):
-
+                a = elem.find('a', href=True)
-        for div in container.findAll('div'):
+                if a is None: continue
-            if div.get('class', None) == 'module-mast':
+                url = a['href']
-                t = self.tag_to_string(div).replace(u'\xbb', '').strip()
+                if not url.startswith('http://'):
-                if t and 'interactives' not in t:
+                    url = 'http://www.chron.com'+url
-                    if current_section and current_articles:
+                articles.append({'title':self.tag_to_string(a), 'url':url,
-                        feeds.append((current_section, current_articles))
+                    'description':'', 'date':''})
-                    current_section = t
+                pprint.pprint(articles[-1])
-                    current_articles = []
+            if articles:
-                    self.log('\tFound section:', current_section)
+                feeds.append((string.capwords(cat), articles))
-            elif div.get('storyid', False):
-                a = div.find('a', href=True)
-                if a:
-                    title = self.tag_to_string(a)
-                    url = a.get('href')
-                    if title and url:
-                        if url.startswith('/'):
-                            url = 'http://www.chron.com'+url
-                        self.log('\t\tFound article:', title)
-                        self.log('\t\t\t', url)
-                        current_articles.append({'title':title, 'url':url,
-                            'date':'', 'description':''})
-            elif div.get('class', None) == 'columnbox' and \
-                    'special' in current_section.lower():
-                a = div.find('a')
-                if a:
-                    title = self.tag_to_string(a)
-                    url = a.get('href')
-                    if title and url:
-                        if not url.startswith('/'): continue
-                        url = 'http://www.chron.com'+url
-                        self.log('\t\tFound article:', title)
-                        self.log('\t\t\t', url)
-                        a.extract()
-                        desc = self.tag_to_string(div)
-                        current_articles.append({'title':title, 'url':url,
-                            'date':'', 'description':desc})
-        if current_section and current_articles:
-            feeds.append((current_section, current_articles))
        return feeds