diff --git a/resources/recipes/houston_chronicle.recipe b/resources/recipes/houston_chronicle.recipe
index 3ec1abbf0f..3390228455 100644
--- a/resources/recipes/houston_chronicle.recipe
+++ b/resources/recipes/houston_chronicle.recipe
@@ -1,12 +1,15 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+import string, pprint
+
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class HoustonChronicle(BasicNewsRecipe):
 
     title = u'The Houston Chronicle'
     description = 'News from Houston, Texas'
-    __author__ = 'Kovid Goyal and Sujata Raman'
+    __author__ = 'Kovid Goyal'
     language = 'en'
     timefmt = ' [%a, %d %b, %Y]'
     no_stylesheets = True
@@ -38,54 +41,23 @@ class HoustonChronicle(BasicNewsRecipe):
 
 
     def parse_index(self):
-        soup = self.index_to_soup('http://www.chron.com/news/')
-        container = soup.find('table', attrs={'class':'body-columns'})
-
+        categories = ['news', 'sports', 'business', 'entertainment', 'life',
+                'travel']
         feeds = []
-        current_section = 'Top Stories'
-        current_articles = []
-
-        self.log('\tFound section:', current_section)
-
-        for div in container.findAll('div'):
-            if div.get('class', None) == 'module-mast':
-                t = self.tag_to_string(div).replace(u'\xbb', '').strip()
-                if t and 'interactives' not in t:
-                    if current_section and current_articles:
-                        feeds.append((current_section, current_articles))
-                    current_section = t
-                    current_articles = []
-                    self.log('\tFound section:', current_section)
-            elif div.get('storyid', False):
-                a = div.find('a', href=True)
-                if a:
-                    title = self.tag_to_string(a)
-                    url = a.get('href')
-                    if title and url:
-                        if url.startswith('/'):
-                            url = 'http://www.chron.com'+url
-                        self.log('\t\tFound article:', title)
-                        self.log('\t\t\t', url)
-                        current_articles.append({'title':title, 'url':url,
-                            'date':'', 'description':''})
-            elif div.get('class', None) == 'columnbox' and \
-                    'special' in current_section.lower():
-                a = div.find('a')
-                if a:
-                    title = self.tag_to_string(a)
-                    url = a.get('href')
-                    if title and url:
-                        if not url.startswith('/'): continue
-                        url = 'http://www.chron.com'+url
-                        self.log('\t\tFound article:', title)
-                        self.log('\t\t\t', url)
-                        a.extract()
-                        desc = self.tag_to_string(div)
-                        current_articles.append({'title':title, 'url':url,
-                            'date':'', 'description':desc})
-
-        if current_section and current_articles:
-            feeds.append((current_section, current_articles))
+        for cat in categories:
+            articles = []
+            soup = self.index_to_soup('http://www.chron.com/%s/'%cat)
+            for elem in soup.findAll(comptype='story', storyid=True):
+                a = elem.find('a', href=True)
+                if a is None: continue
+                url = a['href']
+                if not url.startswith('http://'):
+                    url = 'http://www.chron.com'+url
+                articles.append({'title':self.tag_to_string(a), 'url':url,
+                    'description':'', 'date':''})
+                pprint.pprint(articles[-1])
+            if articles:
+                feeds.append((string.capwords(cat), articles))
 
         return feeds
 