diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index 6572130389..974053324b 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -14,6 +14,7 @@ class BenchmarkPl(BasicNewsRecipe): max_articles_per_feed = 100 extra_css = 'ul {list-style-type: none;}' no_stylesheets = True + use_embedded_content = False #remove_attributes = ['style'] preprocess_regexps = [(re.compile(ur'

 Zobacz poprzednie Opinie dnia:.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')] diff --git a/recipes/cdrinfo_pl.recipe b/recipes/cdrinfo_pl.recipe index 2a8b3b9a2e..2e75eee622 100644 --- a/recipes/cdrinfo_pl.recipe +++ b/recipes/cdrinfo_pl.recipe @@ -23,8 +23,8 @@ class cdrinfo(BasicNewsRecipe): preprocess_regexps = [(re.compile(u']*?>Uprzejmie prosimy o przestrzeganie netykiety.+?www\.gravatar\.com\.

', re.DOTALL), lambda match: '')] ignore_duplicate_articles = {'title', 'url'} - keep_only_tags = [dict(name='input', attrs={'name':'ref'}), dict(id='text')] - remove_tags = [dict(attrs={'class':['navigation', 'sociable']}), dict(name='hr'), dict(id='respond')] + keep_only_tags = [dict(name='input', attrs={'name':'ref'}), dict(id=['text', 'text2'])] + remove_tags = [dict(attrs={'class':['navigation', 'sociable', 'last6news']}), dict(name='hr'), dict(id='respond')] remove_tags_after = dict(id='artnawigacja') feeds = [(u'Wiadomości', 'http://feeds.feedburner.com/cdrinfo'), (u'Recenzje', 'http://www.cdrinfo.pl/rss/rss_recenzje.php'), (u'Konsole', 'http://konsole.cdrinfo.pl/rss/rss_konsole_news.xml'), diff --git a/recipes/computerworld_pl.recipe b/recipes/computerworld_pl.recipe index 8bf1f55124..250eeb2d84 100644 --- a/recipes/computerworld_pl.recipe +++ b/recipes/computerworld_pl.recipe @@ -13,10 +13,11 @@ class Computerworld_pl(BasicNewsRecipe): oldest_article = 7 max_articles_per_feed = 100 remove_attributes = ['style',] + use_embedded_content = False preprocess_regexps = [(re.compile(u'Zobacz również:', re.IGNORECASE), lambda m: ''), (re.compile(ur'[*]+reklama[*]+', re.IGNORECASE), lambda m: ''),] - keep_only_tags = [dict(id=['szpaltaL', 's2011'])] - remove_tags_after = dict(name='div', attrs={'class':'tresc'}) - remove_tags = [dict(attrs={'class':['nnav', 'rMobi', 'tagi', 'rec']}), dict(name='a', attrs={'target':'_blank'})] + keep_only_tags = [dict(id=['article-default-body'])] + remove_tags = [dict(attrs={'class':['share_tools nocontent', 'rec']}), dict(id=['topComment', 'bottom_tools'])] + feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')] def skip_ad_pages(self, soup): diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe index f37059becf..3b0c1c5f33 100644 --- a/recipes/dobreprogamy.recipe +++ b/recipes/dobreprogamy.recipe @@ -18,8 +18,8 @@ class Dobreprogramy_pl(BasicNewsRecipe): max_articles_per_feed = 100 remove_attrs = ['style', 'width', 'height'] preprocess_regexps = [(re.compile(ur'
Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...
'), lambda match: '') ] - keep_only_tags=[dict(attrs={'class':['news', 'entry single']})] - remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']}), dict(id='komentarze'), dict(name='iframe')] + keep_only_tags = [dict(attrs={'class':['entry single']}), dict(id='phContent_divArticle')] + remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master', 'social nested-grid grid-margin-px15-top clearfix no-mobile', 'page-info text-h4 font-heading grid-margin-px15-top color-annotation clearfix']}), dict(id='komentarze'), dict(name='iframe')] #remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'), ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')] diff --git a/recipes/infra_pl.recipe b/recipes/infra_pl.recipe index 09228c15d5..eb5d5532a4 100644 --- a/recipes/infra_pl.recipe +++ b/recipes/infra_pl.recipe @@ -11,6 +11,7 @@ class INFRA(BasicNewsRecipe): index='http://infra.org.pl' language = 'pl' max_articles_per_feed = 100 + remove_empty_feeds = True remove_attrs = ['style'] no_stylesheets = True keep_only_tags = [dict(id='ja-current-content')] diff --git a/recipes/kdefamily_pl.recipe b/recipes/kdefamily_pl.recipe index 0e03ea75aa..607d545ca9 100644 --- a/recipes/kdefamily_pl.recipe +++ b/recipes/kdefamily_pl.recipe @@ -13,5 +13,8 @@ class KDEFamilyPl(BasicNewsRecipe): preprocess_regexps = [(re.compile(r"Podobne wpisy.*", re.IGNORECASE|re.DOTALL), lambda m: '')] no_stylesheets = True remove_empty_feeds = True - use_embedded_content = True + use_embedded_content = False + keep_only_tags = [dict(attrs={'class':'blog-post'})] + remove_tags = [dict(attrs={'class':['blog-bottom', 'ratings hreview-aggregate']})] + feeds = [(u'Wszystko', u'http://kdefamily.pl/feed/')] \ No newline at end of file diff --git a/recipes/lomza.recipe b/recipes/lomza.recipe index ed5c513430..86b574a844 100644 --- a/recipes/lomza.recipe +++ b/recipes/lomza.recipe @@ -1,7 +1,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class Lomza(BasicNewsRecipe): - title = u'4Lomza' + title = u'4Łomza' __author__ = 'fenuks' description = u'Regionalny portal. Najświeższe informacje z regionu, kulturalne, sportowe. Ogłoszenia, baza biznesu, forum.' cover_url = 'http://www.4lomza.pl/i/logo4lomza_m.jpg' diff --git a/recipes/national_geographic_pl.recipe b/recipes/national_geographic_pl.recipe index be02ce4ebd..882349662e 100644 --- a/recipes/national_geographic_pl.recipe +++ b/recipes/national_geographic_pl.recipe @@ -10,7 +10,7 @@ class recipeMagic(BasicNewsRecipe): title = 'National Geographic PL' __author__ = 'Marcin Urban 2011' __modified_by__ = 'fenuks' - description = 'Legenda wśród magazynów z historią sięgającą 120 lat' + description = u'Legenda wśród magazynów z historią sięgającą 120 lat' #cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg' oldest_article = 7 max_articles_per_feed = 100 diff --git a/recipes/polter_pl.recipe b/recipes/polter_pl.recipe index fdebcbc269..d5ee6114a0 100644 --- a/recipes/polter_pl.recipe +++ b/recipes/polter_pl.recipe @@ -15,12 +15,6 @@ class Polter(BasicNewsRecipe): #masthead_url = '' use_embedded_content = False oldest_article = 7 - preprocess_regexps = [(re.compile(ur']*?id="pol_lista"[^>]*?>.*', re.DOTALL|re.IGNORECASE), lambda match: ''), - (re.compile(ur']*?>wersja do druku', re.DOTALL|re.IGNORECASE), lambda match: ''), - #(re.compile(ur']*">(]*?/>)', re.DOTALL|re.IGNORECASE), lambda match: '/1') - (re.compile(ur'(
[\n\s\r]*){2,}', re.DOTALL|re.IGNORECASE), lambda match: '
'), - (re.compile(ur']*>Zaloguj się aby wyłączyć tę reklamę', re.DOTALL|re.IGNORECASE), lambda match: ''), - ] max_articles_per_feed = 100 no_stylesheets = True remove_empty_feeds = True @@ -29,9 +23,8 @@ class Polter(BasicNewsRecipe): ignore_duplicate_articles = {'title', 'url'} keep_only_tags = [dict(attrs={'class':'boxcontent'})] - remove_tags = [dict(attrs={'class':'fb-like'}), dict(attrs={'alt':'Wersja do druku'}), dict(id=['pol_liczba', 'col12AdSenseLight']), dict(attrs={'scr':'http://static.polter.pl/tplimg/buttons/ceneo_140_40.gif'}), dict(name=['g:plusone', 'fb:like'])] - remove_tags_after = dict(attrs={'class':'fb-like'}) - #remove_tags_before = dict() + remove_tags = [dict(id='komentarze')] + remove_tags_after = dict(id='komentarze') feeds = [(u'Wieści', 'http://polter.pl/wiesci,rss.html'), (u'RPG', 'http://rpg.polter.pl/wiesci,rss.html'), (u'Książki', 'http://ksiazki.polter.pl/wiesci,rss.html'), (u'Film', 'http://film.polter.pl/wiesci,rss.html'), (u'Komiks', 'http://komiks.polter.pl/wiesci,rss.html'), (u'Gry bitewne', 'http://bitewniaki.polter.pl/wiesci,rss.html'), (u'Gry karciane', 'http://karcianki.polter.pl/wiesci,rss.html'), (u'Gry planszowe', 'http://planszowki.polter.pl/wiesci,rss.html'), (u'Gry PC', 'http://gry.polter.pl/wiesci,rss.html'), (u'Gry konsolowe', 'http://konsole.polter.pl/wiesci,rss.html'), (u'Konwenty', 'http://konwenty.polter.pl/wiesci,rss.html'), (u'Blogi', 'http://polter.pl/blogi,rss.html')] diff --git a/recipes/wprost_rss.recipe b/recipes/wprost_rss.recipe index 59c130fc75..7cd9d9ce5c 100644 --- a/recipes/wprost_rss.recipe +++ b/recipes/wprost_rss.recipe @@ -16,33 +16,11 @@ class Wprost(BasicNewsRecipe): no_stylesheets = True language = 'pl' remove_javascript = True - recursions = 0 use_embedded_content = False ignore_duplicate_articles = {'title', 'url'} remove_empty_feeds = True - remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - ''' - keep_only_tags =[] - keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'})) - ''' - - preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), - (re.compile(r'display: block;'), lambda match: ''), - (re.compile(r'\\\<\/table\>'), lambda match: ''), - (re.compile(r'\'), lambda match: ''), - (re.compile(r'\'), lambda match: ''), - (re.compile(r'\
'), lambda match: ''), - (re.compile(r'\