From 41b9342a2a1cf4e9013b4c6e83c30de8fa7820d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?=
Date: Sat, 9 Mar 2013 10:17:39 +0100
Subject: [PATCH] fix telepolis_pl and improve swiatkindle

---
# Review notes (free-form area of the patch mail; not part of either diff).
#
# What the patch does:
#   * recipes/swiatkindle.recipe: remove_tags gains a second entry so that
#     <div class="feedflare"> is removed in addition to <ul class="similar-posts">.
#   * recipes/telepolis_pl.recipe:
#       - the single feed URL changes from rss/news.php to rss,2,5,0.html and
#         the commented-out second feed entry is dropped;
#       - keep_only_tags gains <div class="main_tresc">;
#       - append_page() is added: it looks up the element with class 'str'
#         under `appendtag`, walks its <a> children, stops at the anchor whose
#         rendered contents equal 'Następna ›', loads the linked page with
#         self.index_to_soup(), takes that page's element with class
#         'main_tresc' and inserts it at the end of `appendtag`; finally every
#         element with class 'str' is extracted from `appendtag`;
#       - preprocess_html() is added: it calls append_page(soup, soup.body),
#         rewrites every <img> src containing 'm.jpg' by replacing 'm.jpg'
#         with '.jpg', and returns the soup.
#
# NOTE(review): this copy of the patch arrived with line breaks and
#   indentation collapsed. The layout below is a reconstruction. For the
#   telepolis_pl hunk it agrees with the header counts (11 old / 31 new
#   lines), but column alignment and the Python nesting are assumptions.
#   In particular, the four statements from `soup2 = ...` to
#   `appendtag.insert(...)` are assumed to sit inside the `for page` loop,
#   and the trailing `for r` loop at method level. Confirm against the
#   upstream recipe before applying.
# NOTE(review): the string literal passed to re.compile() in the swiatkindle
#   hunk is split over several lines, which is not valid for a single-quoted
#   Python literal. Presumably markup around 'Czytaj dalej:' was stripped
#   when this copy was produced; it is left exactly as received. Verify.
# NOTE(review): the `From:` header carries no address; presumably stripped
#   the same way. Verify.
# NOTE(review): append_page() indexes page['href'] and inserts `pagetext`
#   without checking that the attribute / element exists, and
#   preprocess_html() indexes image['src'] unguarded. Looks like an <a>
#   without href, an <img> without src, or a continuation page lacking
#   'main_tresc' would raise. Consider guarding in a follow-up.
 recipes/swiatkindle.recipe | 3 ++-
 recipes/telepolis_pl.recipe | 24 ++++++++++++++++++++++--
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/recipes/swiatkindle.recipe b/recipes/swiatkindle.recipe
index d8e0e3f403..c589d1b6e1 100644
--- a/recipes/swiatkindle.recipe
+++ b/recipes/swiatkindle.recipe
@@ -19,6 +19,7 @@ class swiatczytnikow(BasicNewsRecipe):

     feeds = [(u'Świat Czytników - wpisy', u'http://swiatczytnikow.pl/feed')]

-    remove_tags = [dict(name = 'ul', attrs = {'class' : 'similar-posts'})]
+    remove_tags = [dict(name = 'ul', attrs = {'class' : 'similar-posts'}),
+                   dict(name = 'div', attrs = {'class' : 'feedflare'})]

     preprocess_regexps = [(re.compile(u'

Czytaj dalej:

'), lambda match: '')]
diff --git a/recipes/telepolis_pl.recipe b/recipes/telepolis_pl.recipe
index 9ea878bc77..1aa7734c2c 100644
--- a/recipes/telepolis_pl.recipe
+++ b/recipes/telepolis_pl.recipe
@@ -16,11 +16,31 @@ class telepolis(BasicNewsRecipe):
     use_embedded_content = False

     feeds = [
-                      (u'Wiadomości', u'http://www.telepolis.pl/rss/news.php')#,
-                      #(u'Artykuły', u'http://www.telepolis.pl/rss/artykuly.php')
+                      (u'Wiadomości', u'http://www.telepolis.pl/rss,2,5,0.html')
                      ]

     keep_only_tags = [
                    dict(name='div', attrs={'class':'flol w510'}),
+                   dict(name='div', attrs={'class':'main_tresc'}),
                    dict(name='div', attrs={'class':'main_tresc_news'})
                      ]
+
+    def append_page(self, soup, appendtag):
+        chpage= appendtag.find(attrs={'class':'str'})
+        if chpage:
+            for page in chpage.findAll('a'):
+                if page.renderContents() == 'Następna ›':
+                    break
+                soup2 = self.index_to_soup(page['href'])
+                pagetext = soup2.find(attrs={'class':'main_tresc'})
+                pos = len(appendtag.contents)
+                appendtag.insert(pos, pagetext)
+        for r in appendtag.findAll(attrs={'class':'str'}):
+            r.extract()
+
+    def preprocess_html(self, soup):
+        self.append_page(soup, soup.body)
+        for image in soup.findAll('img'):
+            if 'm.jpg' in image['src']:
+                image['src'] = image['src'].replace('m.jpg', '.jpg')
+        return soup