From c8fe13195ceffcc4bc3aff8fcf1f68b0b7e2b091 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Mon, 4 Mar 2013 23:44:07 +0100 Subject: [PATCH] partial fix for telepolis_pl --- recipes/telepolis_pl.recipe | 58 ++++++------------------------------- 1 file changed, 9 insertions(+), 49 deletions(-) diff --git a/recipes/telepolis_pl.recipe b/recipes/telepolis_pl.recipe index ff4803697f..06739fe31d 100644 --- a/recipes/telepolis_pl.recipe +++ b/recipes/telepolis_pl.recipe @@ -8,60 +8,20 @@ import re class telepolis(BasicNewsRecipe): title = u'Telepolis.pl' - __author__ = 'Artur Stachecki ' + __author__ = 'Artur Stachecki , Tomasz Długosz ' + language = 'pl' - description = u'Twój telekomunikacyjny serwis informacyjny.\ - Codzienne informacje, testy i artykuły,\ - promocje, baza telefonów oraz centrum rozrywki' - oldest_article = 7 + description = u'Twój telekomunikacyjny serwis informacyjny.' masthead_url = 'http://telepolis.pl/i/telepolis-logo2.gif' - max_articles_per_feed = 100 - simultaneous_downloads = 5 - remove_javascript = True no_stylesheets = True use_embedded_content = False - remove_tags = [] - remove_tags.append(dict(attrs={'alt': 'TELEPOLIS.pl'})) - - preprocess_regexps = [(re.compile(r'<: .*? :>'), - lambda match: ''), - (re.compile(r'Zobacz:.*?', re.DOTALL), - lambda match: ''), - (re.compile(r'<-ankieta.*?>'), - lambda match: ''), - (re.compile(r'\(Q\!\)'), - lambda match: ''), - (re.compile(r'\(plik.*?\)'), - lambda match: ''), - (re.compile(r'', re.DOTALL), - lambda match: '') - ] - - extra_css = '''.tb { font-weight: bold; font-size: 20px;}''' - feeds = [ - (u'Wiadomości', u'http://www.telepolis.pl/rss/news.php'), - (u'Artykuły', u'http://www.telepolis.pl/rss/artykuly.php') + (u'Wiadomości', u'http://www.telepolis.pl/rss/news.php')#, + #(u'Artykuły', u'http://www.telepolis.pl/rss/artykuly.php') ] - def print_version(self, url): - if 'news.php' in url: - print_url = url.replace('news.php', 'news_print.php') - else: - print_url = url.replace('artykuly.php', 'art_print.php') - return print_url - - def preprocess_html(self, soup): - for image in soup.findAll('img'): - if 'm.jpg' in image['src']: - image_big = image['src'] - image_big = image_big.replace('m.jpg', '.jpg') - image['src'] = image_big - logo = soup.find('tr') - logo.extract() - for tag in soup.findAll('tr'): - for strings in ['Wiadomość wydrukowana', 'copyright']: - if strings in self.tag_to_string(tag): - tag.extract() - return self.adeify_images(soup) + keep_only_tags = [ + dict(name='div', attrs={'class':'flol w510'}), + dict(name='div', attrs={'class':'main_tresc_news'}) + ]