From 2144e84dd20403c659a50e40812146837b78000f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 27 Jul 2011 11:05:26 -0600 Subject: [PATCH] Fix #817094 (The Economist news fetch doesn't work) --- recipes/economist.recipe | 78 +++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 45 deletions(-) diff --git a/recipes/economist.recipe b/recipes/economist.recipe index 22e317ee68..79a247d855 100644 --- a/recipes/economist.recipe +++ b/recipes/economist.recipe @@ -6,10 +6,10 @@ __copyright__ = '2008, Kovid Goyal ' economist.com ''' from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import Tag, NavigableString +from collections import OrderedDict -import string, time, re +import time, re class Economist(BasicNewsRecipe): @@ -67,52 +67,40 @@ class Economist(BasicNewsRecipe): return self.economist_parse_index() def economist_parse_index(self): - soup = BeautifulSoup(self.browser.open(self.INDEX).read(), - convertEntities=BeautifulSoup.HTML_ENTITIES) - index_started = False - feeds = {} - ans = [] - key = None - for tag in soup.findAll(['h1', 'h2']): - text = ''.join(tag.findAll(text=True)) - if tag.name in ('h1', 'h2') and 'Classified ads' in text: - break - if tag.name == 'h1': - if 'The world this week' in text or 'The world this year' in text: - index_started = True - if not index_started: + soup = self.index_to_soup(self.INDEX) + feeds = OrderedDict() + for section in soup.findAll(attrs={'class':'section'}): + h4 = section.find('h4') + if h4 is None: + continue + section_title = self.tag_to_string(h4).strip() + if not section_title: + continue + self.log('Found section: %s'%section_title) + articles = [] + for h5 in section.findAll('h5'): + article_title = self.tag_to_string(h5).strip() + if not article_title: continue - text = string.capwords(text) - if text not in feeds.keys(): - feeds[text] = [] - if text not in ans: - ans.append(text) - key = text - continue - if key is None: - continue - a = tag.find('a', href=True) - if a is not None: - url=a['href'] - id_ = re.search(r'story_id=(\d+)', url).group(1) - url = 'http://www.economist.com/node/%s/print'%id_ - if url.startswith('Printer'): - url = '/'+url - if url.startswith('/'): - url = 'http://www.economist.com' + url - try: - subtitle = tag.previousSibling.contents[0].contents[0] - text = subtitle + ': ' + text - except: - pass - article = dict(title=text, - url = url, - description='', content='', date='') - feeds[key].append(article) + data = h5.findNextSibling(attrs={'class':'article'}) + if data is None: continue + a = data.find('a', href=True) + if a is None: continue + url = a['href'] + if url.startswith('/'): url = 'http://www.economist.com'+url + url += '/print' + article_title += ': %s'%self.tag_to_string(a).strip() + articles.append({'title':article_title, 'url':url, + 'description':'', 'date':''}) + if articles: + feeds[section_title] = articles - ans = [(key, feeds[key]) for key in ans if feeds.has_key(key)] + ans = [(key, val) for key, val in feeds.iteritems()] if not ans: - raise Exception('Could not find any articles. Has your subscription expired?') + raise Exception('Could not find any articles, either the ' + 'economist.com server is having trouble and you should ' + 'try later or the website format has changed and the ' + 'recipe needs to be updated.') return ans def eco_find_image_tables(self, soup):