Fix #892860 (Updated recipe for NIN online)

This commit is contained in:
Kovid Goyal 2011-11-21 08:02:24 +05:30
parent 0d7ca78a5e
commit 408a71c485

View File

@ -80,59 +80,11 @@ class Nin(BasicNewsRecipe):
return self.PREFIX + item.img['src']
return cover_url
def parse_index(self):
    """Build the recipe index by scraping the site's section navigation.

    Every ``a`` tag with class ``lmeninavFont`` on the ``self.INDEX`` page is
    treated as one section; its page is fetched and every ``span`` with
    class ``artTitle`` found there becomes one article.

    Returns a list of ``(section_title, articles)`` tuples, where each
    article is a dict with the keys ``title``, ``date``, ``url`` and
    ``description``. When ``self.test`` is truthy, only the first two
    sections are processed.
    """
    sections = []
    index_page = self.index_to_soup(self.INDEX)
    nav_links = index_page.findAll('a',attrs={'class':'lmeninavFont'})
    for position, nav_link in enumerate(nav_links, 1):
        # Test runs stop after two sections.
        if self.test and position > 2:
            return sections
        section_title = self.tag_to_string(nav_link)
        section_page = self.index_to_soup(self.PREFIX + nav_link['href'])
        self.report_progress(0, _('Fetching feed')+' %s...'%(section_title))
        entries = []
        for title_span in section_page.findAll('span',attrs={'class':'artTitle'}):
            # The title span sits inside the article's link element.
            anchor = title_span.parent
            entry = {
                'url': self.PREFIX + anchor['href'],
                'title': self.tag_to_string(title_span),
            }
            # Detach the link first so the container's remaining text
            # (used as the description) no longer includes the title.
            container = anchor.parent
            anchor.extract()
            entry['description'] = self.tag_to_string(container)
            entry['date'] = strftime(self.timefmt)
            entries.append(entry)
        sections.append((section_title, entries))
    return sections
def index_to_soup(self, url_or_raw, raw=False):
    """Turn a URL or a markup string into a BeautifulSoup tree.

    :param url_or_raw: Either a URL (anything starting with ``scheme://``),
        which is fetched through ``self.browser``, or the markup itself.
    :param raw: When true, return the fetched/given markup unparsed and
        undecoded.
    :return: The raw markup if ``raw`` is true, otherwise a
        ``BeautifulSoup`` object.
    :raises RuntimeError: If a URL was given and the response body is empty.
    """
    if re.match(r'\w+://', url_or_raw):
        # Use the browser's ``open_novisit`` when it has one, else ``open``.
        opener = getattr(self.browser, 'open_novisit', self.browser.open)
        with closing(opener(url_or_raw)) as response:
            markup = response.read()
        if not markup:
            raise RuntimeError('Could not fetch index from %s'%url_or_raw)
    else:
        markup = url_or_raw

    if raw:
        return markup

    # ``self.encoding`` may be a codec name or a callable doing the decoding.
    if not isinstance(markup, unicode) and self.encoding:
        decoder = self.encoding
        markup = decoder(markup) if callable(decoder) else markup.decode(decoder, 'replace')

    # Entity conversion needs a codec name; fall back to cp1252 when the
    # recipe supplies a callable or no encoding at all.
    entity_encoding = self.encoding
    if callable(entity_encoding) or entity_encoding is None:
        entity_encoding = 'cp1252'

    def _resolve_entity(match):
        return entity_to_unicode(match, encoding=entity_encoding)

    def _drop_control_chars(match):
        return ''

    massage_rules = list(BeautifulSoup.MARKUP_MASSAGE)
    massage_rules.append((re.compile(r'&(\S+?);'), _resolve_entity))
    # Strip the control characters \x00-\x08 before parsing.
    massage_rules.append((re.compile(r'[\x00-\x08]+'), _drop_control_chars))
    return BeautifulSoup(markup, markupMassage=massage_rules)
# Feed list for the recipe: (title, feed URL) pairs.
feeds = [
    (u'NIN Online', u'http://www.nin.co.rs/misc/rss.php?feed=RSS2.0'),
]
def get_article_url(self, article):
    """Return the article URL with the ``.co.yu`` domain rewritten to ``.co.rs``.

    The URL is obtained from ``BasicNewsRecipe.get_article_url``. If that
    yields no URL (``None`` or an empty string), the value is returned
    unchanged instead of failing on ``.replace``.
    """
    url = BasicNewsRecipe.get_article_url(self, article)
    if not url:
        # Nothing to rewrite; let the caller deal with the missing link.
        return url
    return url.replace('.co.yu', '.co.rs')
def preprocess_html(self, soup):
for item in soup.findAll(style=True):