Fix #817094 (The Economist news fetch doesn't work)

This commit is contained in:
Kovid Goyal 2011-07-27 11:05:26 -06:00
parent d6c7a8b4ab
commit 2144e84dd2

View File

@@ -6,10 +6,10 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
economist.com economist.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag, NavigableString from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from collections import OrderedDict
import string, time, re import time, re
class Economist(BasicNewsRecipe): class Economist(BasicNewsRecipe):
@@ -67,52 +67,40 @@ class Economist(BasicNewsRecipe):
return self.economist_parse_index() return self.economist_parse_index()
def economist_parse_index(self): def economist_parse_index(self):
soup = BeautifulSoup(self.browser.open(self.INDEX).read(), soup = self.index_to_soup(self.INDEX)
convertEntities=BeautifulSoup.HTML_ENTITIES) feeds = OrderedDict()
index_started = False for section in soup.findAll(attrs={'class':'section'}):
feeds = {} h4 = section.find('h4')
ans = [] if h4 is None:
key = None
for tag in soup.findAll(['h1', 'h2']):
text = ''.join(tag.findAll(text=True))
if tag.name in ('h1', 'h2') and 'Classified ads' in text:
break
if tag.name == 'h1':
if 'The world this week' in text or 'The world this year' in text:
index_started = True
if not index_started:
continue continue
text = string.capwords(text) section_title = self.tag_to_string(h4).strip()
if text not in feeds.keys(): if not section_title:
feeds[text] = []
if text not in ans:
ans.append(text)
key = text
continue continue
if key is None: self.log('Found section: %s'%section_title)
articles = []
for h5 in section.findAll('h5'):
article_title = self.tag_to_string(h5).strip()
if not article_title:
continue continue
a = tag.find('a', href=True) data = h5.findNextSibling(attrs={'class':'article'})
if a is not None: if data is None: continue
url=a['href'] a = data.find('a', href=True)
id_ = re.search(r'story_id=(\d+)', url).group(1) if a is None: continue
url = 'http://www.economist.com/node/%s/print'%id_ url = a['href']
if url.startswith('Printer'): if url.startswith('/'): url = 'http://www.economist.com'+url
url = '/'+url url += '/print'
if url.startswith('/'): article_title += ': %s'%self.tag_to_string(a).strip()
url = 'http://www.economist.com' + url articles.append({'title':article_title, 'url':url,
try: 'description':'', 'date':''})
subtitle = tag.previousSibling.contents[0].contents[0] if articles:
text = subtitle + ': ' + text feeds[section_title] = articles
except:
pass
article = dict(title=text,
url = url,
description='', content='', date='')
feeds[key].append(article)
ans = [(key, feeds[key]) for key in ans if feeds.has_key(key)] ans = [(key, val) for key, val in feeds.iteritems()]
if not ans: if not ans:
raise Exception('Could not find any articles. Has your subscription expired?') raise Exception('Could not find any articles, either the '
'economist.com server is having trouble and you should '
'try later or the website format has changed and the '
'recipe needs to be updated.')
return ans return ans
def eco_find_image_tables(self, soup): def eco_find_image_tables(self, soup):