From f97f69a207e9302d7a73bc1ec1c90941785e9ba3 Mon Sep 17 00:00:00 2001 From: fenuks Date: Sun, 23 Jun 2013 14:52:01 +0200 Subject: [PATCH 1/3] set of updates for Polish recipes --- recipes/forsal.recipe | 1 + recipes/kosmonauta_pl.recipe | 2 +- recipes/national_geographic_pl.recipe | 35 +++++++++++++++------------ recipes/polter_pl.recipe | 30 +++++++++++++++++++---- recipes/stopklatka.recipe | 16 +++++++----- recipes/tablety_pl.recipe | 4 +-- 6 files changed, 57 insertions(+), 31 deletions(-) diff --git a/recipes/forsal.recipe b/recipes/forsal.recipe index 4ebbb5a2a8..22a1ddcda5 100644 --- a/recipes/forsal.recipe +++ b/recipes/forsal.recipe @@ -9,6 +9,7 @@ class ForsalPL(BasicNewsRecipe): oldest_article = 7 max_articles_per_feed = 100 use_embedded_content = False + remove_empty_feeds = True ignore_duplicate_articles = {'title', 'url'} cover_url = 'http://www.bizneswnieruchomosciach.pl/wp-content/uploads/2010/07/logo_forsal.jpg' no_stylesheets = True diff --git a/recipes/kosmonauta_pl.recipe b/recipes/kosmonauta_pl.recipe index d943739832..e93853bd57 100644 --- a/recipes/kosmonauta_pl.recipe +++ b/recipes/kosmonauta_pl.recipe @@ -8,7 +8,7 @@ class Kosmonauta(BasicNewsRecipe): category = 'astronomy' language = 'pl' cover_url = 'http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg' - extra_css = '.thumbnail {float:left;margin-right:5px;}' + extra_css = '.thumb-left {float:left; margin-right:5px;} .calibre_navbar {clear: both;}' no_stylesheets = True INDEX = 'http://www.kosmonauta.net' oldest_article = 7 diff --git a/recipes/national_geographic_pl.recipe b/recipes/national_geographic_pl.recipe index 07fc0da666..be02ce4ebd 100644 --- a/recipes/national_geographic_pl.recipe +++ b/recipes/national_geographic_pl.recipe @@ -10,8 +10,8 @@ class recipeMagic(BasicNewsRecipe): title = 'National Geographic PL' __author__ = 'Marcin Urban 2011' __modified_by__ = 'fenuks' - description = 'legenda wśród magazynów z historią sięgającą 120 lat' - #cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg' + description = 'Legenda wśród magazynów z historią sięgającą 120 lat' + #cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg' oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True @@ -21,13 +21,14 @@ class recipeMagic(BasicNewsRecipe): publisher = 'G+J Gruner+Jahr Polska' category = 'news, PL,' language = 'pl' + remove_empty_feeds = True publication_type = 'newsportal' extra_css = ''' body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} - h1{text-align: center;} - h2{font-size: medium; font-weight: bold;} - .authordate {font-size: small; color: #696969;} - p.lead {font-weight: bold; text-align: center;} - .fot{font-size: x-small; color: #666666;} ''' + h1{text-align: center;} + h2{font-size: medium; font-weight: bold;} + .authordate {font-size: small; color: #696969;} + p.lead {font-weight: bold; text-align: center;} + .fot{font-size: x-small; color: #666666;} ''' preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')] conversion_options = { 'comments' : description @@ -37,9 +38,9 @@ class recipeMagic(BasicNewsRecipe): ,'linearize_tables': True } - remove_tags = [ - dict(name='div', attrs={'class':'add_inf'}), - dict(name='div', attrs={'class':'add_f'}), + remove_tags = [ + dict(name='div', attrs={'class':'add_inf'}), + dict(name='div', attrs={'class':'add_f'}), ] remove_attributes = ['width','height'] @@ -47,14 +48,16 @@ class recipeMagic(BasicNewsRecipe): def find_articles(self, url): articles = [] - soup=self.index_to_soup(url) - tag=soup.find(attrs={'class':'arl'}) - art=tag.ul.findAll('li') + soup = self.index_to_soup(url) + tag = soup.find(attrs={'class':'arl'}) + if not tag: + return articles + art = tag.ul.findAll('li') for i in art: - title=i.a['title'] - url=i.a['href'] + title = i.a['title'] + url = i.a['href'] #date=soup.find(id='footer').ul.li.string[41:-1] - desc=i.div.p.string + desc = i.div.p.string articles.append({'title' : title, 'url' : url, 'date' : '', diff --git a/recipes/polter_pl.recipe b/recipes/polter_pl.recipe index aea21dca9c..fdebcbc269 100644 --- a/recipes/polter_pl.recipe +++ b/recipes/polter_pl.recipe @@ -2,7 +2,7 @@ __license__ = 'GPL v3' import re from calibre.web.feeds.news import BasicNewsRecipe -class Poltergeist(BasicNewsRecipe): +class Polter(BasicNewsRecipe): title = u'Polter.pl' __author__ = 'fenuks' description = u'Największy polski serwis poświęcony ogólno pojętej fantastyce - grom fabularnym (RPG), książkom, filmowi, komiksowi, grom planszowym, karcianym i bitewnym.' @@ -10,21 +10,26 @@ class Poltergeist(BasicNewsRecipe): #publication_type = '' language = 'pl' #encoding = '' - extra_css = '.image, .floatright {float: right; margin-left: 10px;} .floatleft {float: left; margin-right: 10px;}' + extra_css = '.image, .floatright {float: right; margin-left: 10px;} .floatleft {float: left; margin-right: 10px;} .calibre_navbar {clear: both;} .p_title {font-weight: bold;} .p_image {margin-left: auto; margin-right: auto; display: block;} .italic {font-style: italic;}' cover_url = 'http://static.polter.pl/sub/promo/bpromo2524.jpg' #masthead_url = '' use_embedded_content = False oldest_article = 7 - preprocess_regexps = [(re.compile(ur']*?id="pol_lista"[^>]*?>.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur']*?>wersja do druku', re.DOTALL|re.IGNORECASE), lambda match: '')] + preprocess_regexps = [(re.compile(ur']*?id="pol_lista"[^>]*?>.*', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(ur']*?>wersja do druku', re.DOTALL|re.IGNORECASE), lambda match: ''), + #(re.compile(ur']*">(]*?/>)', re.DOTALL|re.IGNORECASE), lambda match: '/1') + (re.compile(ur'(
[\n\s\r]*){2,}', re.DOTALL|re.IGNORECASE), lambda match: '
'), + (re.compile(ur']*>Zaloguj się aby wyłączyć tę reklamę', re.DOTALL|re.IGNORECASE), lambda match: ''), + ] max_articles_per_feed = 100 no_stylesheets = True remove_empty_feeds = True remove_javascript = True - remove_attributes = ['style', 'font'] + remove_attributes = ['font', 'fieldset', 'onclick'] ignore_duplicate_articles = {'title', 'url'} keep_only_tags = [dict(attrs={'class':'boxcontent'})] - remove_tags = [dict(attrs={'class':'fb-like'}), dict(attrs={'alt':'Wersja do druku'}), dict(id='pol_liczba'), dict(attrs={'scr':'http://static.polter.pl/tplimg/buttons/ceneo_140_40.gif'})] + remove_tags = [dict(attrs={'class':'fb-like'}), dict(attrs={'alt':'Wersja do druku'}), dict(id=['pol_liczba', 'col12AdSenseLight']), dict(attrs={'scr':'http://static.polter.pl/tplimg/buttons/ceneo_140_40.gif'}), dict(name=['g:plusone', 'fb:like'])] remove_tags_after = dict(attrs={'class':'fb-like'}) #remove_tags_before = dict() @@ -35,9 +40,24 @@ class Poltergeist(BasicNewsRecipe): s['class'] = 'floatleft' for s in soup.findAll(attrs={'style':re.compile('float: ?right')}): s['class'] = 'floatright' + for s in soup.findAll(style=True): + if 'bold;' in s['style']: + if s.get('class', ''): + s['class'] = s['class'] + ' p_title' + else: + s['class'] = 'p_title' + if 'italic;' in s['style']: + if s.get('class', ''): + s['class'] = s['class'] + ' italic' + else: + s['class'] = 'italic' + del s['style'] + tag = soup.find(id='twoja_ocena') if tag: tag.parent.extract() for tag in soup.findAll(id='lista_chce_ile'): tag.parent.parent.extract() + for r in soup.findAll(name='a', href=re.compile(r'^http://www.ceneo.pl/')): + r.extract() return soup \ No newline at end of file diff --git a/recipes/stopklatka.recipe b/recipes/stopklatka.recipe index 28c92a2453..1f629b1225 100644 --- a/recipes/stopklatka.recipe +++ b/recipes/stopklatka.recipe @@ -12,6 +12,7 @@ class Stopklatka_pl(BasicNewsRecipe): cover_url = 'http://static1.stopklatka.pl/images/20/19/11501.jpg' use_embedded_content = False oldest_article = 7 + BASEURL = 'http://stopklatka.pl' max_articles_per_feed = 100 no_stylesheets = True remove_empty_feeds = True @@ -20,9 +21,8 @@ class Stopklatka_pl(BasicNewsRecipe): ignore_duplicate_articles = {'title', 'url'} keep_only_tags = [dict(attrs={'class':'asset-full-content default-asset-publisher show-asset-title'})] - remove_tags = [dict(attrs={'class':['metadata-entry metadata-tags', 'print-action', 'asset-flag', 'asset-ratings']}), dict(id='contest')] - #remove_tags_after = dict() - #remove_tags_before = dict() + remove_tags = [dict(attrs={'class':['metadata-entry metadata-tags', 'print-action', 'asset-flag', 'asset-ratings', 'ad-nav']}), dict(id='contest')] + feeds = [(u'Wiadomo\u015bci', u'http://stopklatka.pl/wiadomosci/-/asset_publisher/Hl7x4Ku4GpZj/rss?p_p_cacheability=cacheLevelPage'), (u'Artyku\u0142y', u'http://stopklatka.pl/artykuly/-/asset_publisher/pKhn5s0IxqSc/rss?p_p_cacheability=cacheLevelPage'), (u'Premiery i zapowiedzi', u'http://stopklatka.pl/premiery-i-zapowiedzi?p_p_id=eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_cacheability=cacheLevelPage&p_p_col_id=column-1&p_p_col_pos=1&p_p_col_count=3&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13393201&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13760176&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=15238425&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13470227&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13913324&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=20234402&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13917041&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13905169&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=14253975&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=21586017&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13540662&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=12999052&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=45280408&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=14826890&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13459998&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13070805&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=20209965&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=21741457&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=35577381&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13530138&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13392987%2Cmartwe-zlo&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13760162%2Cuklad-zamkniety&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F15238403%2Cwszyscy-w-naszej-rodzinie&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13470213%2Cdonoma&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13913310%2Ccristiada&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F20234381%2Craj-wiara&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13917027%2Cintruz&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13905155%2Cspring-breakers&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F14253957%2Ckrudowie&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F21586004%2Cswieta-czworca&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13540648%2Ckwartet&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F12999038%2Cimagine&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F45280404%2Cdom-na-kolkach&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F14826876%2Cg-i-joe-odwet&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13459984%2Cnieobliczalni&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13070591%2Csamotny-port-milosc&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F20209952%2Czanim-noc-nas-nie-rozdzieli&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F21741444%2Chemel&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35577377%2Czywie-bielarus-&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13530124%2Cpanaceum'), (u'Recenzje filmowe', u'http://stopklatka.pl/box-office/-/asset_publisher/3yxqotUEiqHJ/rss?p_p_cacheability=cacheLevelPage'), (u'Recenzje', u'http://stopklatka.pl/recenzje/-/asset_publisher/5oZ3s2J3L0tG/rss?p_p_cacheability=cacheLevelPage'), (u'Gwiazdy', u'http://stopklatka.pl/czerwony-dywan/-/asset_publisher/PqN7MDEGWGvh/rss?p_p_cacheability=cacheLevelPage'), (u'Wywiady Stopklatki', u'http://stopklatka.pl/wywiady/-/asset_publisher/uVh3OrZCaLd7/rss?p_p_cacheability=cacheLevelPage'), (u'Prosto z Hollywood', u'http://stopklatka.pl/wywiady-z-hollywood/-/asset_publisher/YsbU0JSoxb9G/rss?p_p_cacheability=cacheLevelPage'), (u'Plotki', u'http://stopklatka.pl/czerwony-dywan/-/asset_publisher/XuF8EGAkVeTa/rss?p_p_cacheability=cacheLevelPage'), (u'Box Office Polska', u'http://stopklatka.pl/box-office?p_p_id=eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_cacheability=cacheLevelPage&p_p_col_id=column-1&p_p_col_pos=1&p_p_col_count=5&_eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ_assetEntryIds=47982267&_eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ_assetEntryIds=46685247&_eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ_assetEntryIds=45280313&_eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47982263%2Cbox-office-weekendowy-polska-15-03-2013-17-03-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F46685243%2Cbox-office-weekendowy-polska-08-03-2013-10-03-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F45280309%2Cbox-office-weekendowy-polska-01-03-2013-03-03-2013'), (u'Box Office USA', u'http://stopklatka.pl/box-office?p_p_id=eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_cacheability=cacheLevelPage&p_p_col_id=column-1&p_p_col_pos=2&p_p_col_count=5&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_assetEntryIds=49047234&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_assetEntryIds=48879358&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_assetEntryIds=47605057&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_assetEntryIds=47809980&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_assetEntryIds=46505246&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F49047230%2Cbox-office-weekendowy-stany-zjednoczone-22-03-2013-24-03-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F48879354%2Cbox-office-weekendowy-stany-zjednoczone-22-03-2013-24-03-2013-estymacja-&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47605053%2Cbox-office-weekendowy-stany-zjednoczone-15-03-2013-17-03-2013-estymacja-&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47809976%2Cbox-office-weekendowy-stany-zjednoczone-15-03-2013-17-03-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F46505242%2Cbox-office-weekendowy-stany-zjednoczone-08-03-2013-10-03-2013'), (u'Relacje', u'http://stopklatka.pl/czerwony-dywan/-/asset_publisher/IkgAkSFxLWV2/rss?p_p_cacheability=cacheLevelPage'), (u'Kalendarium imprez', u'http://stopklatka.pl/kalendarium-imprez?p_p_id=eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_cacheability=cacheLevelPage&p_p_col_id=column-1&p_p_col_pos=1&p_p_col_count=3&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47628974&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47627805&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=45317244&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=48884855&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47629292&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=48884742&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35482058&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47627893&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35482076&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47627838&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=48167620&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35482067&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47811744&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35482049&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47629615&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=45088670&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47628531&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35481950&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35481496&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35482022&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=19323743&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47628034&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47628064&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=45088819&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35482031&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35481415&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35481977&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=19323617&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35481932&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35481995&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47628970%2C4-festiwal-filmow-swiata-trzy-zywioly&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47627801%2Cwielka-podroz-krudow&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F45317240%2C6-przeglad-kina-rosyjskiego-nowe-kino-rosyjskie-&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F48884851%2C2-1-nowy-cykl-spotkan-literatury-z-filmem&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47629288%2C5-festiwal-polskich-filmow-krotkometrazowych-short-waves&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F48884738%2Cmoico-enjoy-movies-przeglad-filmow-klasy-b-we-wroclawiu&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35482054%2C1-ogolnopolski-festiwal-polskiej-animacji-o-pla-2013-&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47627889%2Cviii-festiwal-filmow-afrykanskich-afrykamera-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35482072%2C6-miedzynarodowy-festwial-kina-niezaleznego-off-plus-camera&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47627834%2C11-przeglad-filmow-studenckich-z-lodzkiej-filmowki-lodzia-po-wisle-&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F48167616%2Cweze-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35482063%2Cxiv-festiwal-kina-amatorskiego-i-niezaleznego-kan&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47811740%2Cv-festiwal-muzyki-filmowej-krzysztofa-komedy&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35482045%2Ckonkurs-scenariuszowy-script-pro-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47629611%2C9-miedzynarodowy-festiwal-filmowy-%E2%80%9Ezydowskie-motywy%E2%80%9D&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F45088666%2C15-przeglad-filmowy-cieszyn-kino-na-granicy-&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47628527%2Cdzien-filmowca-filmmaker-s-day&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35481946%2C10-planete-doc-film-festival&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35481492%2C66-miedzynarodowy-festiwal-filmowy-w-cannes&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35482018%2C16-festiwal-filmow-kultowych&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F19323739%2C53-krakowski-festiwal-filmowy&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47628030%2C4-festiwal-filmow-mlodziezowych-18&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47628060%2Cvii-superorbitalny-festiwal-filmow-amatorskich-soffa&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F45088815%2Cxv-miedzynarodowy-festiwal-filmow-przyrodniczych-im-wlodzimierza-puchalskiego&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35482027%2C32-koszalinski-festiwal-debiutow-filmowych-mlodzi-i-film-&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35481411%2C6-miedzynarodowy-festiwal-filmow-animowanych-animator-&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35481973%2C13-miedzynarodowy-festiwal-filmowy-sopot-film-festival&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F19323613%2C13-miedzynarodowy-festiwal-filmowy-t-mobile-nowe-horyzonty&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35481928%2C7-festiwal-filmu-i-sztuki-dwa-brzegi-w-kazimierzu-dolnym-i-janowcu&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35481991%2C3-miedzynarodowy-festiwal-filmu-i-muzyki-transatlantyk'), (u'Konkursy', u'http://stopklatka.pl/konkursy?p_p_id=eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_cacheability=cacheLevelPage&p_p_col_id=column-1&p_p_col_pos=3&p_p_col_count=5&_eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA_assetEntryIds=47091950&_eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA_assetEntryIds=48879762&_eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA_assetEntryIds=48880109&_eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47091941%2Cksiazki-dwie-kobiety-&_eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F48879753%2Cdvd-rozmowy-noca-&_eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F48880000%2Cdvd-milosc-'), (u'Komiks Stopklatki', u'http://stopklatka.pl/komiks/-/asset_publisher/pKhn5s0IxqSc/rss?p_p_cacheability=cacheLevelPage')] def append_page(self, soup, appendtag): @@ -30,7 +30,7 @@ class Stopklatka_pl(BasicNewsRecipe): if tag: while tag: url = tag['href'] - soup2 = self.index_to_soup(url) + soup2 = self.index_to_soup(self.BASEURL+url) tag = soup2.find('a', attrs={'class': 'next'}) pagetext = soup2.find(attrs={'class': 'journal-content-article'}) comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) @@ -40,7 +40,11 @@ class Stopklatka_pl(BasicNewsRecipe): appendtag.insert(pos, pagetext) appendtag.find('a', attrs={'class': 'next'}).extract() - def preprocess_html(self, soup): self.append_page(soup, soup.body) - return soup \ No newline at end of file + return soup + + def get_browser(self): + br = BasicNewsRecipe.get_browser(self) + br.open(self.BASEURL) + return br \ No newline at end of file diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe index c0d8b66c3d..827a86180e 100644 --- a/recipes/tablety_pl.recipe +++ b/recipes/tablety_pl.recipe @@ -14,7 +14,5 @@ class Tablety_pl(BasicNewsRecipe): max_articles_per_feed = 100 preprocess_regexps = [(re.compile(ur'

Przeczytaj także.*?

', re.DOTALL), lambda match: ''), (re.compile(ur'

Przeczytaj koniecznie.*?

', re.DOTALL), lambda match: '')] keep_only_tags = [dict(id='news_block')] - #remove_tags_before=dict(name="h1", attrs={'class':'entry-title'}) - #remove_tags_after=dict(name="footer", attrs={'class':'entry-footer clearfix'}) - remove_tags=[dict(attrs={'class':['comments_icon', 'wp-polls', 'entry-comments']})] + remove_tags=[dict(attrs={'class':['comments_icon', 'wp-polls', 'entry-comments', 'wp-polls-loading', 'ts-fab-wrapper', 'entry-footer']})] feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')] \ No newline at end of file From d7a5118c42fae882420791e773f4c4c35c1af23a Mon Sep 17 00:00:00 2001 From: fenuks Date: Fri, 1 Nov 2013 10:17:31 +0100 Subject: [PATCH 2/3] updated polish recipes --- recipes/benchmark_pl.recipe | 1 + recipes/cdrinfo_pl.recipe | 4 ++-- recipes/computerworld_pl.recipe | 7 ++++--- recipes/dobreprogamy.recipe | 4 ++-- recipes/infra_pl.recipe | 1 + recipes/kdefamily_pl.recipe | 5 ++++- recipes/lomza.recipe | 2 +- recipes/national_geographic_pl.recipe | 2 +- recipes/polter_pl.recipe | 11 ++--------- recipes/wprost_rss.recipe | 26 ++------------------------ recipes/zaufana_trzecia_strona.recipe | 1 + 11 files changed, 21 insertions(+), 43 deletions(-) diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index 6572130389..974053324b 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -14,6 +14,7 @@ class BenchmarkPl(BasicNewsRecipe): max_articles_per_feed = 100 extra_css = 'ul {list-style-type: none;}' no_stylesheets = True + use_embedded_content = False #remove_attributes = ['style'] preprocess_regexps = [(re.compile(ur'

 Zobacz poprzednie Opinie dnia:.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')] diff --git a/recipes/cdrinfo_pl.recipe b/recipes/cdrinfo_pl.recipe index 2a8b3b9a2e..2e75eee622 100644 --- a/recipes/cdrinfo_pl.recipe +++ b/recipes/cdrinfo_pl.recipe @@ -23,8 +23,8 @@ class cdrinfo(BasicNewsRecipe): preprocess_regexps = [(re.compile(u']*?>Uprzejmie prosimy o przestrzeganie netykiety.+?www\.gravatar\.com\.

', re.DOTALL), lambda match: '')] ignore_duplicate_articles = {'title', 'url'} - keep_only_tags = [dict(name='input', attrs={'name':'ref'}), dict(id='text')] - remove_tags = [dict(attrs={'class':['navigation', 'sociable']}), dict(name='hr'), dict(id='respond')] + keep_only_tags = [dict(name='input', attrs={'name':'ref'}), dict(id=['text', 'text2'])] + remove_tags = [dict(attrs={'class':['navigation', 'sociable', 'last6news']}), dict(name='hr'), dict(id='respond')] remove_tags_after = dict(id='artnawigacja') feeds = [(u'Wiadomości', 'http://feeds.feedburner.com/cdrinfo'), (u'Recenzje', 'http://www.cdrinfo.pl/rss/rss_recenzje.php'), (u'Konsole', 'http://konsole.cdrinfo.pl/rss/rss_konsole_news.xml'), diff --git a/recipes/computerworld_pl.recipe b/recipes/computerworld_pl.recipe index 8bf1f55124..250eeb2d84 100644 --- a/recipes/computerworld_pl.recipe +++ b/recipes/computerworld_pl.recipe @@ -13,10 +13,11 @@ class Computerworld_pl(BasicNewsRecipe): oldest_article = 7 max_articles_per_feed = 100 remove_attributes = ['style',] + use_embedded_content = False preprocess_regexps = [(re.compile(u'Zobacz również:', re.IGNORECASE), lambda m: ''), (re.compile(ur'[*]+reklama[*]+', re.IGNORECASE), lambda m: ''),] - keep_only_tags = [dict(id=['szpaltaL', 's2011'])] - remove_tags_after = dict(name='div', attrs={'class':'tresc'}) - remove_tags = [dict(attrs={'class':['nnav', 'rMobi', 'tagi', 'rec']}), dict(name='a', attrs={'target':'_blank'})] + keep_only_tags = [dict(id=['article-default-body'])] + remove_tags = [dict(attrs={'class':['share_tools nocontent', 'rec']}), dict(id=['topComment', 'bottom_tools'])] + feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')] def skip_ad_pages(self, soup): diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe index f37059becf..3b0c1c5f33 100644 --- a/recipes/dobreprogamy.recipe +++ b/recipes/dobreprogamy.recipe @@ -18,8 +18,8 @@ class Dobreprogramy_pl(BasicNewsRecipe): max_articles_per_feed = 100 remove_attrs = ['style', 'width', 'height'] preprocess_regexps = [(re.compile(ur'
Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...
'), lambda match: '') ] - keep_only_tags=[dict(attrs={'class':['news', 'entry single']})] - remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']}), dict(id='komentarze'), dict(name='iframe')] + keep_only_tags = [dict(attrs={'class':['entry single']}), dict(id='phContent_divArticle')] + remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master', 'social nested-grid grid-margin-px15-top clearfix no-mobile', 'page-info text-h4 font-heading grid-margin-px15-top color-annotation clearfix']}), dict(id='komentarze'), dict(name='iframe')] #remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'), ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')] diff --git a/recipes/infra_pl.recipe b/recipes/infra_pl.recipe index 09228c15d5..eb5d5532a4 100644 --- a/recipes/infra_pl.recipe +++ b/recipes/infra_pl.recipe @@ -11,6 +11,7 @@ class INFRA(BasicNewsRecipe): index='http://infra.org.pl' language = 'pl' max_articles_per_feed = 100 + remove_empty_feeds = True remove_attrs = ['style'] no_stylesheets = True keep_only_tags = [dict(id='ja-current-content')] diff --git a/recipes/kdefamily_pl.recipe b/recipes/kdefamily_pl.recipe index 0e03ea75aa..607d545ca9 100644 --- a/recipes/kdefamily_pl.recipe +++ b/recipes/kdefamily_pl.recipe @@ -13,5 +13,8 @@ class KDEFamilyPl(BasicNewsRecipe): preprocess_regexps = [(re.compile(r"Podobne wpisy.*", re.IGNORECASE|re.DOTALL), lambda m: '')] no_stylesheets = True remove_empty_feeds = True - use_embedded_content = True + use_embedded_content = False + keep_only_tags = [dict(attrs={'class':'blog-post'})] + remove_tags = [dict(attrs={'class':['blog-bottom', 'ratings hreview-aggregate']})] + feeds = [(u'Wszystko', u'http://kdefamily.pl/feed/')] \ No newline at end of file diff --git a/recipes/lomza.recipe b/recipes/lomza.recipe index ed5c513430..86b574a844 100644 --- a/recipes/lomza.recipe +++ b/recipes/lomza.recipe @@ -1,7 +1,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class Lomza(BasicNewsRecipe): - title = u'4Lomza' + title = u'4Łomza' __author__ = 'fenuks' description = u'Regionalny portal. Najświeższe informacje z regionu, kulturalne, sportowe. Ogłoszenia, baza biznesu, forum.' cover_url = 'http://www.4lomza.pl/i/logo4lomza_m.jpg' diff --git a/recipes/national_geographic_pl.recipe b/recipes/national_geographic_pl.recipe index be02ce4ebd..882349662e 100644 --- a/recipes/national_geographic_pl.recipe +++ b/recipes/national_geographic_pl.recipe @@ -10,7 +10,7 @@ class recipeMagic(BasicNewsRecipe): title = 'National Geographic PL' __author__ = 'Marcin Urban 2011' __modified_by__ = 'fenuks' - description = 'Legenda wśród magazynów z historią sięgającą 120 lat' + description = u'Legenda wśród magazynów z historią sięgającą 120 lat' #cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg' oldest_article = 7 max_articles_per_feed = 100 diff --git a/recipes/polter_pl.recipe b/recipes/polter_pl.recipe index fdebcbc269..d5ee6114a0 100644 --- a/recipes/polter_pl.recipe +++ b/recipes/polter_pl.recipe @@ -15,12 +15,6 @@ class Polter(BasicNewsRecipe): #masthead_url = '' use_embedded_content = False oldest_article = 7 - preprocess_regexps = [(re.compile(ur']*?id="pol_lista"[^>]*?>.*', re.DOTALL|re.IGNORECASE), lambda match: ''), - (re.compile(ur']*?>wersja do druku', re.DOTALL|re.IGNORECASE), lambda match: ''), - #(re.compile(ur']*">(]*?/>)', re.DOTALL|re.IGNORECASE), lambda match: '/1') - (re.compile(ur'(
[\n\s\r]*){2,}', re.DOTALL|re.IGNORECASE), lambda match: '
'), - (re.compile(ur']*>Zaloguj się aby wyłączyć tę reklamę', re.DOTALL|re.IGNORECASE), lambda match: ''), - ] max_articles_per_feed = 100 no_stylesheets = True remove_empty_feeds = True @@ -29,9 +23,8 @@ class Polter(BasicNewsRecipe): ignore_duplicate_articles = {'title', 'url'} keep_only_tags = [dict(attrs={'class':'boxcontent'})] - remove_tags = [dict(attrs={'class':'fb-like'}), dict(attrs={'alt':'Wersja do druku'}), dict(id=['pol_liczba', 'col12AdSenseLight']), dict(attrs={'scr':'http://static.polter.pl/tplimg/buttons/ceneo_140_40.gif'}), dict(name=['g:plusone', 'fb:like'])] - remove_tags_after = dict(attrs={'class':'fb-like'}) - #remove_tags_before = dict() + remove_tags = [dict(id='komentarze')] + remove_tags_after = dict(id='komentarze') feeds = [(u'Wieści', 'http://polter.pl/wiesci,rss.html'), (u'RPG', 'http://rpg.polter.pl/wiesci,rss.html'), (u'Książki', 'http://ksiazki.polter.pl/wiesci,rss.html'), (u'Film', 'http://film.polter.pl/wiesci,rss.html'), (u'Komiks', 'http://komiks.polter.pl/wiesci,rss.html'), (u'Gry bitewne', 'http://bitewniaki.polter.pl/wiesci,rss.html'), (u'Gry karciane', 'http://karcianki.polter.pl/wiesci,rss.html'), (u'Gry planszowe', 'http://planszowki.polter.pl/wiesci,rss.html'), (u'Gry PC', 'http://gry.polter.pl/wiesci,rss.html'), (u'Gry konsolowe', 'http://konsole.polter.pl/wiesci,rss.html'), (u'Konwenty', 'http://konwenty.polter.pl/wiesci,rss.html'), (u'Blogi', 'http://polter.pl/blogi,rss.html')] diff --git a/recipes/wprost_rss.recipe b/recipes/wprost_rss.recipe index 59c130fc75..7cd9d9ce5c 100644 --- a/recipes/wprost_rss.recipe +++ b/recipes/wprost_rss.recipe @@ -16,33 +16,11 @@ class Wprost(BasicNewsRecipe): no_stylesheets = True language = 'pl' remove_javascript = True - recursions = 0 use_embedded_content = False ignore_duplicate_articles = {'title', 'url'} remove_empty_feeds = True - remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - ''' - keep_only_tags =[] - keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'})) - ''' - - preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), - (re.compile(r'display: block;'), lambda match: ''), - (re.compile(r'\\\<\/table\>'), lambda match: ''), - (re.compile(r'\'), lambda match: ''), - (re.compile(r'\'), lambda match: ''), - (re.compile(r'\
'), lambda match: ''), - (re.compile(r'\