Fix #817094 (The Economist news fetch doesn't work)

This commit is contained in:
Kovid Goyal 2011-07-27 11:05:26 -06:00
parent d6c7a8b4ab
commit 2144e84dd2

View File

@ -6,10 +6,10 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
economist.com
'''
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.BeautifulSoup import Tag, NavigableString
+from collections import OrderedDict
-import string, time, re
+import time, re

 class Economist(BasicNewsRecipe):
@ -67,52 +67,40 @@ class Economist(BasicNewsRecipe):
 return self.economist_parse_index()
def economist_parse_index(self):
    """Parse the print-edition index page into feeds.

    Fetches ``self.INDEX`` and walks every ``div.section`` block: the
    section's ``h4`` supplies the feed title, and each ``h5`` subtitle is
    paired with its following ``div.article`` sibling to obtain the
    article link.

    Returns a list of ``(section_title, articles)`` tuples in on-page
    order, where each article is a dict with ``title``, ``url``,
    ``description`` and ``date`` keys, as expected by
    ``BasicNewsRecipe.parse_index()``.

    Raises ``Exception`` when no articles are found (server trouble or a
    site-layout change).
    """
    soup = self.index_to_soup(self.INDEX)
    # OrderedDict preserves the on-page section order in the result.
    feeds = OrderedDict()
    for section in soup.findAll(attrs={'class':'section'}):
        h4 = section.find('h4')
        if h4 is None:
            continue
        section_title = self.tag_to_string(h4).strip()
        if not section_title:
            continue
        self.log('Found section: %s'%section_title)
        articles = []
        for h5 in section.findAll('h5'):
            article_title = self.tag_to_string(h5).strip()
            if not article_title:
                continue
            # The article link lives in the div.article sibling that
            # follows the h5 subtitle heading.
            data = h5.findNextSibling(attrs={'class':'article'})
            if data is None: continue
            a = data.find('a', href=True)
            if a is None: continue
            url = a['href']
            if url.startswith('/'): url = 'http://www.economist.com'+url
            # Fetch the printer-friendly version of each article.
            url += '/print'
            article_title += ': %s'%self.tag_to_string(a).strip()
            articles.append({'title':article_title, 'url':url,
                'description':'', 'date':''})
        if articles:
            feeds[section_title] = articles
    ans = [(key, val) for key, val in feeds.iteritems()]
    if not ans:
        raise Exception('Could not find any articles, either the '
            'economist.com server is having trouble and you should '
            'try later or the website format has changed and the '
            'recipe needs to be updated.')
    return ans
def eco_find_image_tables(self, soup): def eco_find_image_tables(self, soup):