diff --git a/recipes/di.recipe b/recipes/di.recipe index 179983e4dd..dad0fdd648 100644 --- a/recipes/di.recipe +++ b/recipes/di.recipe @@ -1,6 +1,6 @@ #!/usr/bin/env python -__license__ = 'GPL v3' +__license__ = 'GPL v3' __author__ = 'Mori' __version__ = 'v. 0.5' ''' @@ -11,56 +11,56 @@ from calibre.web.feeds.news import BasicNewsRecipe import re class DziennikInternautowRecipe(BasicNewsRecipe): - __author__ = 'Mori' - language = 'pl' + __author__ = 'Mori' + language = 'pl' - title = u'Dziennik Internautow' - publisher = u'Dziennik Internaut\u00f3w Sp. z o.o.' - description = u'Internet w \u017cyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\u0144stwo w Sieci, technologia.' + title = u'Dziennik Internautow' + publisher = u'Dziennik Internaut\u00f3w Sp. z o.o.' + description = u'Internet w \u017cyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\u0144stwo w Sieci, technologia.' - max_articles_per_feed = 100 - oldest_article = 7 - cover_url = 'http://di.com.pl/pic/logo_di_norm.gif' - - no_stylesheets = True - remove_javascript = True - encoding = 'utf-8' - - extra_css = ''' - .fotodesc{font-size: 75%;} - .pub_data{font-size: 75%;} - .fotonews{clear: both; padding-top: 10px; padding-bottom: 10px;} - #pub_foto{font-size: 75%; float: left; padding-right: 10px;} - ''' - - feeds = [ - (u'Dziennik Internaut\u00f3w', u'http://feeds.feedburner.com/glowny-di') - ] - - keep_only_tags = [ - dict(name = 'div', attrs = {'id' : 'pub_head'}), - dict(name = 'div', attrs = {'id' : 'pub_content'}) - ] - - remove_tags = [ - dict(name = 'div', attrs = {'class' : 'poradniki_context'}), - dict(name = 'div', attrs = {'class' : 'uniBox'}), - dict(name = 'object', attrs = {}), - dict(name = 'h3', attrs = {}), - dict(attrs={'class':'twitter-share-button'}) - ] - - preprocess_regexps = [ - (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in - [ - (r', ', lambda match: '
'), - (r'http://di.com.pl/pic/photo/mini/', lambda match: 'http://di.com.pl/pic/photo/oryginal/'), - (r'\s*', lambda match: '
'), + (r'
', lambda match: '
'), + (r'http://di.com.pl/pic/photo/mini/', lambda match: 'http://di.com.pl/pic/photo/oryginal/'), + (r'\s*'), lambda match: '')] - + (re.compile(r'align="right"'), lambda match: ''), + (re.compile(r'width=\"*\"'), lambda match: ''), + (re.compile(r'\'), lambda match: '')] + extra_css = '''.contentheading { font-size: 1.4em; font-weight: bold; } - img { display: block; clear: both;} - ''' + img { display: block; clear: both;} + ''' remove_attributes = ['width','height','position','float','padding-left','padding-right','padding','text-align'] feeds = [(u'F1 Ultra', u'http://www.f1ultra.pl/index.php?option=com_rd_rss&id=1&Itemid=245')] diff --git a/recipes/frazpc.recipe b/recipes/frazpc.recipe index b5225e33d5..2c12a58b55 100644 --- a/recipes/frazpc.recipe +++ b/recipes/frazpc.recipe @@ -20,7 +20,7 @@ class FrazPC(BasicNewsRecipe): no_stylesheets = True cover_url='http://www.frazpc.pl/images/logo.png' feeds = [ - (u'Aktualno\u015bci', u'http://www.frazpc.pl/feed/aktualnosci'), + (u'Aktualno\u015bci', u'http://www.frazpc.pl/feed/aktualnosci'), (u'Artyku\u0142y', u'http://www.frazpc.pl/feed/artykuly') ] diff --git a/recipes/interia_fakty.recipe b/recipes/interia_fakty.recipe index 63f95f3382..74cf56b267 100644 --- a/recipes/interia_fakty.recipe +++ b/recipes/interia_fakty.recipe @@ -20,10 +20,10 @@ class InteriaFakty(BasicNewsRecipe): max_articles_per_feed = 100 feeds = [(u'Kraj', u'http://kanaly.rss.interia.pl/kraj.xml'), - (u'\u015awiat', u'http://kanaly.rss.interia.pl/swiat.xml'), - (u'Wiadomo\u015bci dnia', u'http://kanaly.rss.interia.pl/fakty.xml'), - (u'Przegl\u0105d prasy', u'http://kanaly.rss.interia.pl/przeglad_prasy.xml'), - (u'Wywiady', u'http://kanaly.rss.interia.pl/wywiady.xml'), + (u'\u015awiat', u'http://kanaly.rss.interia.pl/swiat.xml'), + (u'Wiadomo\u015bci dnia', u'http://kanaly.rss.interia.pl/fakty.xml'), + (u'Przegl\u0105d prasy', u'http://kanaly.rss.interia.pl/przeglad_prasy.xml'), + (u'Wywiady', u'http://kanaly.rss.interia.pl/wywiady.xml'), (u'Ciekawostki', u'http://kanaly.rss.interia.pl/ciekawostki.xml')] keep_only_tags = [dict(name='div', attrs={'id':'article'})] @@ -35,5 +35,5 @@ class InteriaFakty(BasicNewsRecipe): dict(name='span', attrs={'class':'keywords'})] extra_css = ''' - h2 { font-size: 1.2em; } - ''' + h2 { font-size: 1.2em; } + ''' diff --git a/recipes/interia_sport.recipe b/recipes/interia_sport.recipe index 995dd114a8..dd46b0a4bc 100644 --- a/recipes/interia_sport.recipe +++ b/recipes/interia_sport.recipe @@ -20,13 +20,13 @@ class InteriaSport(BasicNewsRecipe): remove_javascript = True max_articles_per_feed = 100 - feeds = [(u'Wydarzenia sportowe', u'http://kanaly.rss.interia.pl/sport.xml'), - (u'Pi\u0142ka no\u017cna', u'http://kanaly.rss.interia.pl/pilka_nozna.xml'), - (u'Siatk\xf3wka', u'http://kanaly.rss.interia.pl/siatkowka.xml'), - (u'Koszyk\xf3wka', u'http://kanaly.rss.interia.pl/koszykowka.xml'), - (u'NBA', u'http://kanaly.rss.interia.pl/nba.xml'), - (u'Kolarstwo', u'http://kanaly.rss.interia.pl/kolarstwo.xml'), - (u'\u017bu\u017cel', u'http://kanaly.rss.interia.pl/zuzel.xml'), + feeds = [(u'Wydarzenia sportowe', u'http://kanaly.rss.interia.pl/sport.xml'), + (u'Pi\u0142ka no\u017cna', u'http://kanaly.rss.interia.pl/pilka_nozna.xml'), + (u'Siatk\xf3wka', u'http://kanaly.rss.interia.pl/siatkowka.xml'), + (u'Koszyk\xf3wka', u'http://kanaly.rss.interia.pl/koszykowka.xml'), + (u'NBA', u'http://kanaly.rss.interia.pl/nba.xml'), + (u'Kolarstwo', u'http://kanaly.rss.interia.pl/kolarstwo.xml'), + (u'\u017bu\u017cel', u'http://kanaly.rss.interia.pl/zuzel.xml'), (u'Tenis', u'http://kanaly.rss.interia.pl/tenis.xml')] keep_only_tags = [dict(name='div', attrs={'id':'article'})] @@ -63,7 +63,7 @@ class InteriaSport(BasicNewsRecipe): ''' preprocess_regexps = [ - (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ (r'

', lambda match: ''), # FIXME diff --git a/recipes/kopalniawiedzy.recipe b/recipes/kopalniawiedzy.recipe index a7b932f618..619185ed34 100644 --- a/recipes/kopalniawiedzy.recipe +++ b/recipes/kopalniawiedzy.recipe @@ -6,74 +6,74 @@ import re from calibre.web.feeds.recipes import BasicNewsRecipe class KopalniaWiedzy(BasicNewsRecipe): - title = u'Kopalnia Wiedzy' - publisher = u'Kopalnia Wiedzy' - description = u'Ciekawostki ze świata nauki i techniki' - encoding = 'utf-8' - __author__ = 'Attis & Tomasz Długosz' - language = 'pl' - oldest_article = 7 - max_articles_per_feed = 100 - INDEX = u'http://kopalniawiedzy.pl/' - remove_javascript = True - no_stylesheets = True + title = u'Kopalnia Wiedzy' + publisher = u'Kopalnia Wiedzy' + description = u'Ciekawostki ze świata nauki i techniki' + encoding = 'utf-8' + __author__ = 'Attis & Tomasz Długosz' + language = 'pl' + oldest_article = 7 + max_articles_per_feed = 100 + INDEX = u'http://kopalniawiedzy.pl/' + remove_javascript = True + no_stylesheets = True - remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'}}, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}, {'name':'div', 'attrs': {'class':'article-time-and-cat'}}, {'name':'p', 'attrs': {'class':'tags'}}] - remove_tags_after = dict(attrs={'class':'ad-square'}) - keep_only_tags = [dict(name="div", attrs={'class':'article-text text-small'})] - extra_css = '.topimage {margin-top: 30px}' + remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'}}, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}, {'name':'div', 'attrs': {'class':'article-time-and-cat'}}, {'name':'p', 'attrs': {'class':'tags'}}] + remove_tags_after = dict(attrs={'class':'ad-square'}) + keep_only_tags = [dict(name="div", attrs={'class':'article-text text-small'})] + extra_css = '.topimage {margin-top: 30px}' - preprocess_regexps = [ - (re.compile(u''), - lambda match: '' ), - (re.compile(u'

'), - lambda match: '') - ] - - feeds = [ - (u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'), - (u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'), - (u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'), - (u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'), - (u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'), - (u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss') + preprocess_regexps = [ + (re.compile(u''), + lambda match: '' ), + (re.compile(u'

'), + lambda match: '') ] - def is_link_wanted(self, url, tag): - return tag['class'] == 'next' + feeds = [ + (u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'), + (u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'), + (u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'), + (u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'), + (u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'), + (u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss') + ] - def remove_beyond(self, tag, next): - while tag is not None and getattr(tag, 'name', None) != 'body': - after = getattr(tag, next) - while after is not None: - ns = getattr(tag, next) - after.extract() - after = ns - tag = tag.parent + def is_link_wanted(self, url, tag): + return tag['class'] == 'next' - def append_page(self, soup, appendtag, position): - pager = soup.find('a',attrs={'class':'next'}) - if pager: - nexturl = self.INDEX + pager['href'] - soup2 = self.index_to_soup(nexturl) - texttag = soup2.find('div', attrs={'id':'articleContent'}) + def remove_beyond(self, tag, next): + while tag is not None and getattr(tag, 'name', None) != 'body': + after = getattr(tag, next) + while after is not None: + ns = getattr(tag, next) + after.extract() + after = ns + tag = tag.parent - tag = texttag.find(attrs={'class':'pages'}) - self.remove_beyond(tag, 'nextSibling') + def append_page(self, soup, appendtag, position): + pager = soup.find('a',attrs={'class':'next'}) + if pager: + nexturl = self.INDEX + pager['href'] + soup2 = self.index_to_soup(nexturl) + texttag = soup2.find('div', attrs={'id':'articleContent'}) - newpos = len(texttag.contents) - self.append_page(soup2,texttag,newpos) + tag = texttag.find(attrs={'class':'pages'}) + self.remove_beyond(tag, 'nextSibling') - appendtag.insert(position,texttag) + newpos = len(texttag.contents) + self.append_page(soup2,texttag,newpos) + + appendtag.insert(position,texttag) - def preprocess_html(self, soup): - self.append_page(soup, soup.body, 3) + def preprocess_html(self, soup): + self.append_page(soup, soup.body, 3) - for item in soup.findAll('div',attrs={'class':'pages'}): - item.extract() + for item in soup.findAll('div',attrs={'class':'pages'}): + item.extract() - for item in soup.findAll('p', attrs={'class':'wykop'}): - item.extract() + for item in soup.findAll('p', attrs={'class':'wykop'}): + item.extract() - return soup + return soup diff --git a/recipes/korespondent.recipe b/recipes/korespondent.recipe index aa9cf6e828..fff0946593 100644 --- a/recipes/korespondent.recipe +++ b/recipes/korespondent.recipe @@ -24,17 +24,16 @@ class KorespondentPL(BasicNewsRecipe): extra_css = '.naglowek {font-size: small}\n .tytul {font-size: x-large; padding-bottom: 10px; padding-top: 30px} \n .external {font-size: small}' preprocess_regexps = [ - (re.compile(u'' ), - (re.compile(u'

Więcej'), - lambda match:'Więcej' ), - (re.compile(u'target="_blank"'), - lambda match:'target="_blank" class="external"' ), - (re.compile(u'

\nPoczytaj inne teksty w Serwisie wolnorynkowym Korespondent.pl.*', re.DOTALL|re.IGNORECASE), - lambda match: ''), - ] + (re.compile(u'' ), + (re.compile(u'

Więcej'), + lambda match:'Więcej' ), + (re.compile(u'target="_blank"'), + lambda match:'target="_blank" class="external"' ), + (re.compile(u'

\nPoczytaj inne teksty w Serwisie wolnorynkowym Korespondent.pl.*', re.DOTALL|re.IGNORECASE), + lambda match: ''), + ] feeds = [(u'Serwis informacyjny', u'http://korespondent.pl/rss.xml')] - diff --git a/recipes/legeartis.recipe b/recipes/legeartis.recipe index 8365d3639d..1b882c26d7 100644 --- a/recipes/legeartis.recipe +++ b/recipes/legeartis.recipe @@ -1,6 +1,6 @@ #!/usr/bin/env python -__license__ = 'GPL v3' +__license__ = 'GPL v3' __author__ = 'Mori' __version__ = 'v. 0.1' ''' @@ -10,34 +10,34 @@ olgierd.bblog.pl from calibre.web.feeds.news import BasicNewsRecipe class LegeArtisRecipe(BasicNewsRecipe): - __author__ = 'Mori' - language = 'pl' + __author__ = 'Mori' + language = 'pl' - title = u'Lege Artis' - publisher = u'Olgierd Rudak' - description = u'Wszystko, co chcieliby\u015bcie wiedzie\u0107 o prawie, ale wstydzicie si\u0119 zapyta\u0107' + title = u'Lege Artis' + publisher = u'Olgierd Rudak' + description = u'Wszystko, co chcieliby\u015bcie wiedzie\u0107 o prawie, ale wstydzicie si\u0119 zapyta\u0107' - max_articles_per_feed = 100 - - no_stylesheets = True - remove_javascript = True - - extra_css = ''' - img{clear: both;} - ''' - - feeds = [ - (u'Lege Artis', u'http://olgierd.bblog.pl/rss/rss20.xml') - ] - - keep_only_tags = [ - dict(name = 'div', attrs = {'class' : 'post_title'}), - dict(name = 'div', attrs = {'class' : 'post_date'}), - dict(name = 'div', attrs = {'class' : 'post_content'}) - ] - - remove_tags = [ - dict(name = 'div', attrs = {'id' : 'bb_tools'}), - dict(name = 'div', attrs = {'class' : 'post_comments'}), - dict(name = 'object', attrs = {}) - ] + max_articles_per_feed = 100 + + no_stylesheets = True + remove_javascript = True + + extra_css = ''' + img{clear: both;} + ''' + + feeds = [ + (u'Lege Artis', u'http://olgierd.bblog.pl/rss/rss20.xml') + ] + + keep_only_tags = [ + dict(name = 'div', attrs = {'class' : 'post_title'}), + dict(name = 'div', attrs = {'class' : 'post_date'}), + dict(name = 'div', attrs = {'class' : 'post_content'}) + ] + + remove_tags = [ + dict(name = 'div', attrs = {'id' : 'bb_tools'}), + dict(name = 'div', attrs = {'class' : 'post_comments'}), + dict(name = 'object', attrs = {}) + ] diff --git a/recipes/legitymizm.recipe b/recipes/legitymizm.recipe index e54d2beefd..b135cefae6 100644 --- a/recipes/legitymizm.recipe +++ b/recipes/legitymizm.recipe @@ -32,7 +32,7 @@ class Legitymizm(BasicNewsRecipe): #szeroka_kolumna ul.wykaz { list-style-type: none; margin: 0 0 1.2em 0; padding: 0; } #szeroka_kolumna ul.wykaz li.wykaz_2 { font-weight: bold; margin: 0.6em 0 0 0; } #szeroka_kolumna ul.wykaz a { text-decoration: none; } - #szeroka_kolumna ul.wykaz li.wykaz_1, #szeroka_kolumna ul.wykaz li.wykaz_2 ul li { list-style-type: square; color: #898981; text-transform: none; font-weight: normal; padding: 0; } + #szeroka_kolumna ul.wykaz li.wykaz_1, #szeroka_kolumna ul.wykaz li.wykaz_2 ul li { list-style-type: square; color: #898981; text-transform: none; font-weight: normal; padding: 0; } #szeroka_kolumna ul.wykaz li.wykaz_1 { margin: 0 0 0 1.3em; } #szeroka_kolumna ul.wykaz li.wykaz_2 ul { margin: 0; padding: 0 0 0 1.3em; } #szeroka_kolumna h3.autor { background-color: #898981; color: #f9f9f8; margin: -25px 0px 30px 0; text-align: left; padding: 0 0 0 2px; } @@ -47,4 +47,3 @@ class Legitymizm(BasicNewsRecipe): #cytat p.sentencja:first-letter { font-size: 44px; line-height: 33px; margin: 0 2px 0 0; font-style: normal; float: left; display: block; } p.autor { text-transform: uppercase; color: #898981; font-style: normal; text-align: left; } ''' - diff --git a/recipes/rmf24_opinie.recipe b/recipes/rmf24_opinie.recipe index 9e4d336252..0bbe5d03a4 100644 --- a/recipes/rmf24_opinie.recipe +++ b/recipes/rmf24_opinie.recipe @@ -46,7 +46,7 @@ class RMF24_opinie(BasicNewsRecipe): return link preprocess_regexps = [ - (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ (r'

Zdj.cie

', lambda match: ''), (r'embed embed(Left|Right|Center) articleEmbed(Audio|Wideo articleEmbedVideo|ArticleFull|ArticleTitle|ArticleListTitle|AlbumHorizontal)">', lambda match: 'REMOVE">'),