From 88b7fd9e909602cc98d5fc63203d54e1a4643398 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Wed, 29 May 2013 08:02:44 +0530 Subject: [PATCH] Various new and updated Polish news sources --- recipes/alejakomiksu_com.recipe | 37 ++++++++ recipes/fdb_pl.recipe | 49 ++++++++++ recipes/gazeta_pl_krakow.recipe | 147 ++++++++++++++--------------- recipes/gazeta_pl_szczecin.recipe | 86 +++++++++++++---- recipes/gazeta_pl_warszawa.recipe | 137 +++++++++++++-------------- recipes/gazeta_wyborcza.recipe | 138 ++++++++++++--------------- recipes/gosc_niedzielny.recipe | 23 ++--- recipes/icons/alejakomiksu_com.png | Bin 0 -> 575 bytes recipes/icons/linuxportal_pl.png | Bin 0 -> 1430 bytes recipes/icons/picoboard_pl.png | Bin 0 -> 469 bytes recipes/icons/polter_pl.png | Bin 0 -> 766 bytes recipes/icons/racjonalista_pl.png | Bin 850 -> 476 bytes recipes/icons/sekurak_pl.png | Bin 0 -> 956 bytes recipes/icons/tawernarpg_pl.png | Bin 0 -> 1087 bytes recipes/ihned.recipe | 63 +++++-------- recipes/linuxportal_pl.recipe | 62 ++++++++++++ recipes/picoboard_pl.recipe | 33 +++++++ recipes/polter_pl.recipe | 43 +++++++++ recipes/sekurak_pl.recipe | 28 ++++++ recipes/tawernarpg_pl.recipe | 38 ++++++++ 20 files changed, 590 insertions(+), 294 deletions(-) create mode 100644 recipes/alejakomiksu_com.recipe create mode 100644 recipes/fdb_pl.recipe create mode 100644 recipes/icons/alejakomiksu_com.png create mode 100644 recipes/icons/linuxportal_pl.png create mode 100644 recipes/icons/picoboard_pl.png create mode 100644 recipes/icons/polter_pl.png create mode 100644 recipes/icons/sekurak_pl.png create mode 100644 recipes/icons/tawernarpg_pl.png create mode 100644 recipes/linuxportal_pl.recipe create mode 100644 recipes/picoboard_pl.recipe create mode 100644 recipes/polter_pl.recipe create mode 100644 recipes/sekurak_pl.recipe create mode 100644 recipes/tawernarpg_pl.recipe diff --git a/recipes/alejakomiksu_com.recipe b/recipes/alejakomiksu_com.recipe new file mode 100644 index 0000000000..f34fd1183d --- /dev/null +++ b/recipes/alejakomiksu_com.recipe @@ -0,0 +1,37 @@ +__license__ = 'GPL v3' +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class AlejaKomiksu(BasicNewsRecipe): + title = u'Aleja Komiksu' + __author__ = 'fenuks' + description = u'Serwis poświęcony komiksom. Najnowsze wieści, recenzje, artykuły, wywiady, galerie, komiksy online, konkursy, linki, baza komiksów online.' + category = 'comics' + #publication_type = '' + language = 'pl' + #encoding = '' + extra_css = 'ul {list-style-type: none;} .gfx_news {float: right;}' + preprocess_regexps = [(re.compile(ur'((
(Do poczytania)|(Nowości):)|(Komentarze
    )).*', re.DOTALL|re.IGNORECASE), lambda match: '')] + cover_url = 'http://www.alejakomiksu.com/gfx/build/logo.png' + masthead_url = 'http://www.alejakomiksu.com/gfx/build/logo.png' + use_embedded_content = False + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + keep_only_tags = [dict(attrs={'class':'cont_tresc'})] + #remove_tags = [dict()] + #remove_tags_before = dict() + + feeds = [(u'Wiadomości', 'http://www.alejakomiksu.com/rss.php5')] + + def skip_ad_pages(self, soup): + tag = soup.find(attrs={'class':'rodzaj'}) + if tag and tag.a.string.lower().strip() == 'recenzje': + link = soup.find(text=re.compile('recenzuje')) + if link: + return self.index_to_soup(link.parent['href'], raw=True) \ No newline at end of file diff --git a/recipes/fdb_pl.recipe b/recipes/fdb_pl.recipe new file mode 100644 index 0000000000..c7baf84408 --- /dev/null +++ b/recipes/fdb_pl.recipe @@ -0,0 +1,49 @@ +__license__ = 'GPL v3' +from calibre.web.feeds.news import BasicNewsRecipe + +class FDBPl(BasicNewsRecipe): + title = u'Fdb.pl' + __author__ = 'fenuks' + description = u'Wiadomości ze świata filmu, baza danych filmowych, recenzje, zwiastuny, boxoffice.' + category = 'film' + #publication_type = '' + language = 'pl' + #encoding = '' + extra_css = '.options-left > li {display: inline;} em {display: block;}' + cover_url = 'http://fdb.pl/assets/fdb2/logo.png' + #masthead_url = '' + use_embedded_content = False + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + keep_only_tags = [dict(attrs={'class':'news-item news-first'})] + remove_tags = [dict(attrs={'class':['dig dig-first', 'ads clearfix', 'comments']})] + #remove_tags_after = dict() + #remove_tags_before = dict() + feeds = [] + + def parse_index(self): + feeds = [] + feeds.append((u'Wiadomości', self.get_articles('http://fdb.pl/wiadomosci?page={0}', 2))) + return feeds + + def get_articles(self, url, pages=1): + articles = [] + for nr in range(1, pages+1): + soup = self.index_to_soup(url.format(nr)) + for tag in soup.findAll(attrs={'class':'news-item clearfix'}): + node = tag.find('h2') + title = node.a.string + url = 'http://fdb.pl' + node.a['href'] + date = '' + articles.append({'title' : title, + 'url' : url, + 'date' : date, + 'description' : '' + }) + return articles \ No newline at end of file diff --git a/recipes/gazeta_pl_krakow.recipe b/recipes/gazeta_pl_krakow.recipe index 0f7633e4b2..4abc6120b7 100644 --- a/recipes/gazeta_pl_krakow.recipe +++ b/recipes/gazeta_pl_krakow.recipe @@ -8,94 +8,87 @@ krakow.gazeta.pl ''' from calibre.web.feeds.news import BasicNewsRecipe +import re +from calibre.ebooks.BeautifulSoup import Comment class gw_krakow(BasicNewsRecipe): title = u'Gazeta Wyborcza Kraków' __author__ = 'teepel based on GW from fenuks' language = 'pl' - description =u'Wiadomości z Krakowa na portalu Gazeta.pl.' - category='newspaper' + description = u'Wiadomości z Krakowa na portalu Gazeta.pl.' 
+ category = 'newspaper' publication_type = 'newspaper' - masthead_url='http://bi.gazeta.pl/im/5/8528/m8528105.gif' - INDEX='http://krakow.gazeta.pl/' - remove_empty_feeds= True - oldest_article = 1 + # encoding = 'iso-8859-2' + masthead_url = 'http://bi.gazeta.pl/im/5/8528/m8528105.gif' + INDEX = 'http://krakow.gazeta.pl' + cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif' + remove_empty_feeds = True + oldest_article = 3 max_articles_per_feed = 100 - remove_javascript=True - no_stylesheets=True + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} - keep_only_tags =[] - keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'})) - - remove_tags =[] - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})) - remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'})) - remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'})) - remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'})) - remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_buttons'})) - - remove_tags_after = [dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})] + # rules for gazeta.pl + preprocess_regexps = [(re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '')] + keep_only_tags = [dict(id='gazeta_article')] + remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})] + remove_tags_after = dict(id='gazeta_article_body') feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/krakow.xml')] - def skip_ad_pages(self, soup): - tag=soup.find(name='a', attrs={'class':'btn'}) - if tag: - new_soup=self.index_to_soup(tag['href'], raw=True) - return new_soup - - - def append_page(self, soup, appendtag): - loop=False - tag = soup.find('div', attrs={'id':'Str'}) - if appendtag.find('div', attrs={'id':'Str'}): - nexturl=tag.findAll('a') - appendtag.find('div', attrs={'id':'Str'}).extract() - loop=True - if appendtag.find(id='source'): - appendtag.find(id='source').extract() - while loop: - loop=False - for link in nexturl: - if u'następne' in link.string: - url= self.INDEX + link['href'] - soup2 = self.index_to_soup(url) - pagetext = soup2.find(id='artykul') - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - tag = soup2.find('div', attrs={'id':'Str'}) - nexturl=tag.findAll('a') - loop=True - - def gallery_article(self, appendtag): - tag=appendtag.find(id='container_gal') - if tag: - nexturl=appendtag.find(id='gal_btn_next').a['href'] - appendtag.find(id='gal_navi').extract() - while nexturl: - soup2=self.index_to_soup(nexturl) - pagetext=soup2.find(id='container_gal') - 
nexturl=pagetext.find(id='gal_btn_next') - if nexturl: - nexturl=nexturl.a['href'] - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - rem=appendtag.find(id='gal_navi') - if rem: - rem.extract() + def print_version(self, url): + if 'feedsportal.com' in url: + s = url.rpartition('gazeta0Bpl') + u = s[2] + if not s[0]: + u = url.rpartition('wyborcza0Bpl')[2] + u = u.replace('/l/', '/') + u = u.replace('/ia1.htm', '') + u = u.replace('/story01.htm', '') + u = u.replace('0C', '/') + u = u.replace('A', '') + u = u.replace('0E', '-') + u = u.replace('0H', ',') + u = u.replace('0I', '_') + u = u.replace('0B', '.') + u = self.INDEX + u + return u + else: + return url def preprocess_html(self, soup): - self.append_page(soup, soup.body) - if soup.find(id='container_gal'): - self.gallery_article(soup.body) + tag = soup.find(id='Str') + if soup.find(attrs={'class': 'piano_btn_1'}): + return None + elif tag and tag.findAll('a'): + self.append_page(soup, soup.body) return soup + + def append_page(self, soup, appendtag): + tag = soup.find('div', attrs={'id': 'Str'}) + try: + baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content'] + except: + return 1 + link = tag.findAll('a')[-1] + while link: + soup2 = self.index_to_soup(baseurl + link['href']) + link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1] + if not u'następne' in link.string: + link = '' + pagetext = soup2.find(id='artykul') + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag.extract() + + def image_url_processor(self, baseurl, url): + if url.startswith(' '): + return url.strip() + else: + return url + diff --git a/recipes/gazeta_pl_szczecin.recipe b/recipes/gazeta_pl_szczecin.recipe index c0c83fd109..e966ca5eed 100644 --- a/recipes/gazeta_pl_szczecin.recipe +++ b/recipes/gazeta_pl_szczecin.recipe @@ -1,8 +1,8 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai import re -import string from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment class GazetaPlSzczecin(BasicNewsRecipe): title = u'Gazeta Wyborcza Szczecin' @@ -12,24 +12,74 @@ class GazetaPlSzczecin(BasicNewsRecipe): language = 'pl' publisher = 'Agora S.A.' 
category = 'news, szczecin' - oldest_article = 2 + INDEX = 'http://szczecin.gazeta.pl' + cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif' + remove_empty_feeds = True + oldest_article = 3 max_articles_per_feed = 100 - auto_cleanup = True - remove_tags = [ { "name" : "a", "attrs" : { "href" : "http://szczecin.gazeta.pl/szczecin/www.gazeta.pl" }}] - cover_url = "http://bi.gazeta.pl/i/hp/hp2009/logo.gif" + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} + + # rules for gazeta.pl + preprocess_regexps = [(re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '')] + keep_only_tags = [dict(id='gazeta_article')] + remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})] + remove_tags_after = dict(id='gazeta_article_body') feeds = [(u'Wszystkie', u'http://rss.feedsportal.com/c/32739/f/530434/index.rss')] - def get_article_url(self, article): - s = re.search("""/0L(szczecin.*)/story01.htm""", article.link) - s = s.group(1) - replacements = { "0B" : ".", "0C" : "/", "0H" : ",", "0I" : "_"} - for (a, b) in replacements.iteritems(): - s = string.replace(s, a, b) - s = string.replace(s, "0A", "0") - return "http://"+s - def print_version(self, url): - s = re.search("""/(\d*),(\d*),(\d*),.*\.html""", url) - no1 = s.group(2) - no2 = s.group(3) - return """http://szczecin.gazeta.pl/szczecin/2029020,%s,%s.html""" % (no1, no2) + if 'feedsportal.com' in url: + s = url.rpartition('gazeta0Bpl') + u = s[2] + if not s[0]: + u = url.rpartition('wyborcza0Bpl')[2] + u = u.replace('/l/', '/') + u = u.replace('/ia1.htm', '') + u = u.replace('/story01.htm', '') + u = u.replace('0C', '/') + u = u.replace('A', '') + u = u.replace('0E', '-') + u = u.replace('0H', ',') + u = u.replace('0I', '_') + u = u.replace('0B', '.') + u = self.INDEX + u + return u + else: + return url + + def preprocess_html(self, soup): + tag = soup.find(id='Str') + if soup.find(attrs={'class': 'piano_btn_1'}): + return None + elif tag and tag.findAll('a'): + self.append_page(soup, soup.body) + return soup + + def append_page(self, soup, appendtag): + tag = soup.find('div', attrs={'id': 'Str'}) + try: + baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content'] + except: + return 1 + link = tag.findAll('a')[-1] + while link: + soup2 = self.index_to_soup(baseurl + link['href']) + link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1] + if not u'następne' in link.string: + link = '' + pagetext = soup2.find(id='artykul') + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag.extract() + + def image_url_processor(self, baseurl, url): + if url.startswith(' '): + return url.strip() + else: + return url + diff --git a/recipes/gazeta_pl_warszawa.recipe b/recipes/gazeta_pl_warszawa.recipe index 6a37a96885..fcdffc3abd 100644 --- a/recipes/gazeta_pl_warszawa.recipe +++ b/recipes/gazeta_pl_warszawa.recipe @@ -7,7 +7,9 @@ __author__ = 'teepel based on GW from fenuks' warszawa.gazeta.pl ''' +import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment class gw_wawa(BasicNewsRecipe): title = u'Gazeta Wyborcza Warszawa' @@ -17,82 +19,75 @@ class gw_wawa(BasicNewsRecipe): category='newspaper' publication_type = 'newspaper' masthead_url='http://bi.gazeta.pl/im/3/4089/m4089863.gif' - 
INDEX='http://warszawa.gazeta.pl/' - remove_empty_feeds= True - oldest_article = 1 + INDEX = 'http://warszawa.gazeta.pl' + cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif' + remove_empty_feeds = True + oldest_article = 3 max_articles_per_feed = 100 - remove_javascript=True - no_stylesheets=True + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} - keep_only_tags =[] - keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'})) - - remove_tags =[] - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})) - remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'})) - remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'})) - remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'})) - remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'})) + # rules for gazeta.pl + preprocess_regexps = [(re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '')] + keep_only_tags = [dict(id='gazeta_article')] + remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})] + remove_tags_after = dict(id='gazeta_article_body') feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/warszawa.xml')] - def skip_ad_pages(self, soup): - tag=soup.find(name='a', attrs={'class':'btn'}) - if tag: - new_soup=self.index_to_soup(tag['href'], raw=True) - return new_soup - - - def append_page(self, soup, appendtag): - loop=False - tag = soup.find('div', attrs={'id':'Str'}) - if appendtag.find('div', attrs={'id':'Str'}): - nexturl=tag.findAll('a') - appendtag.find('div', attrs={'id':'Str'}).extract() - loop=True - if appendtag.find(id='source'): - appendtag.find(id='source').extract() - while loop: - loop=False - for link in nexturl: - if u'następne' in link.string: - url= self.INDEX + link['href'] - soup2 = self.index_to_soup(url) - pagetext = soup2.find(id='artykul') - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - tag = soup2.find('div', attrs={'id':'Str'}) - nexturl=tag.findAll('a') - loop=True - - def gallery_article(self, appendtag): - tag=appendtag.find(id='container_gal') - if tag: - nexturl=appendtag.find(id='gal_btn_next').a['href'] - appendtag.find(id='gal_navi').extract() - while nexturl: - soup2=self.index_to_soup(nexturl) - pagetext=soup2.find(id='container_gal') - nexturl=pagetext.find(id='gal_btn_next') - if nexturl: - nexturl=nexturl.a['href'] - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - rem=appendtag.find(id='gal_navi') - if rem: - rem.extract() + def print_version(self, url): + if 'feedsportal.com' in url: + s = url.rpartition('gazeta0Bpl') + u = s[2] + if not s[0]: + u = 
url.rpartition('wyborcza0Bpl')[2] + u = u.replace('/l/', '/') + u = u.replace('/ia1.htm', '') + u = u.replace('/story01.htm', '') + u = u.replace('0C', '/') + u = u.replace('A', '') + u = u.replace('0E', '-') + u = u.replace('0H', ',') + u = u.replace('0I', '_') + u = u.replace('0B', '.') + u = self.INDEX + u + return u + else: + return url def preprocess_html(self, soup): - self.append_page(soup, soup.body) - if soup.find(id='container_gal'): - self.gallery_article(soup.body) + tag = soup.find(id='Str') + if soup.find(attrs={'class': 'piano_btn_1'}): + return None + elif tag and tag.findAll('a'): + self.append_page(soup, soup.body) return soup + + def append_page(self, soup, appendtag): + tag = soup.find('div', attrs={'id': 'Str'}) + try: + baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content'] + except: + return 1 + link = tag.findAll('a')[-1] + while link: + soup2 = self.index_to_soup(baseurl + link['href']) + link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1] + if not u'następne' in link.string: + link = '' + pagetext = soup2.find(id='artykul') + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag.extract() + + def image_url_processor(self, baseurl, url): + if url.startswith(' '): + return url.strip() + else: + return url + diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 880aea5bc1..653c776723 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Comment - +import re class Gazeta_Wyborcza(BasicNewsRecipe): title = u'Gazeta Wyborcza' __author__ = 'fenuks, Artur Stachecki' @@ -9,7 +9,7 @@ class Gazeta_Wyborcza(BasicNewsRecipe): description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.' category = 'newspaper' publication_type = 'newspaper' - #encoding = 'iso-8859-2' + # encoding = 'iso-8859-2' masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' INDEX = 'http://wyborcza.pl' remove_empty_feeds = True @@ -19,10 +19,18 @@ class Gazeta_Wyborcza(BasicNewsRecipe): no_stylesheets = True use_embedded_content = False ignore_duplicate_articles = {'title', 'url'} - remove_tags_before = dict(id='k0') - remove_tags_after = dict(id='banP4') - remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})] - feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), + + # rules for gazeta.pl + preprocess_regexps = [(re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '')] + keep_only_tags = [dict(id='gazeta_article')] + remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})] + remove_tags_after = dict(id='gazeta_article_body') + + # rules for wyborcza.biz + preprocess_regexps.append((re.compile(u'(
<br>)?(<br/>)? Czytaj (także|też):.*?\.?<br>
    ', re.DOTALL), lambda m: '')) + + feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), + (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), @@ -39,86 +47,55 @@ class Gazeta_Wyborcza(BasicNewsRecipe): (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'), (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss') - ] - - def skip_ad_pages(self, soup): - tag = soup.find(name='a', attrs={'class': 'btn'}) - if tag: - new_soup = self.index_to_soup(tag['href'], raw=True) - return new_soup - - def append_page(self, soup, appendtag): - loop = False - tag = soup.find('div', attrs={'id': 'Str'}) - if appendtag.find('div', attrs={'id': 'Str'}): - nexturl = tag.findAll('a') - appendtag.find('div', attrs={'id': 'Str'}).extract() - loop = True - if appendtag.find(id='source'): - appendtag.find(id='source').extract() - while loop: - loop = False - for link in nexturl: - if u'następne' in link.string: - url = self.INDEX + link['href'] - soup2 = self.index_to_soup(url) - pagetext = soup2.find(id='artykul') - comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) - for comment in comments: - comment.extract() - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - tag = soup2.find('div', attrs={'id': 'Str'}) - nexturl = tag.findAll('a') - loop = True - - def gallery_article(self, appendtag): - tag = appendtag.find(id='container_gal') - if tag: - nexturl = appendtag.find(id='gal_btn_next').a['href'] - appendtag.find(id='gal_navi').extract() - while nexturl: - soup2 = self.index_to_soup(nexturl) - pagetext = soup2.find(id='container_gal') - nexturl = pagetext.find(id='gal_btn_next') - if nexturl: - nexturl = nexturl.a['href'] - comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) - for comment in comments: - comment.extract() - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - rem = appendtag.find(id='gal_navi') - if rem: - rem.extract() - - def preprocess_html(self, soup): - if soup.find(attrs={'class': 'piano_btn_1'}): - return None - else: - self.append_page(soup, soup.body) - if soup.find(id='container_gal'): - self.gallery_article(soup.body) - return soup + ] def print_version(self, url): - if url.count('rss.feedsportal.com'): - u = url.find('wyborcza0Bpl') - u = 'http://www.wyborcza.pl/' + url[u + 11:] + if 'feedsportal.com' in url: + s = url.rpartition('wyborcza0Bpl') + u = s[2] + if not s[0]: + u = url.rpartition('gazeta0Bpl')[2] + u = u.replace('/l/', '/') + u = u.replace('/ia1.htm', '') + u = u.replace('/story01.htm', '') u = u.replace('0C', '/') u = u.replace('A', '') u = u.replace('0E', '-') u = u.replace('0H', ',') u = u.replace('0I', '_') u = u.replace('0B', '.') - u = u.replace('/1,', '/2029020,') - u = u.replace('/story01.htm', '') - print(u) + u = self.INDEX + u return u - elif 'http://wyborcza.pl/1' in url: - return url.replace('http://wyborcza.pl/1', 'http://wyborcza.pl/2029020') else: - return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') + return url + + def preprocess_html(self, soup): + tag = soup.find(id='Str') + if soup.find(attrs={'class': 'piano_btn_1'}): + return None + 
elif tag and tag.findAll('a'): + self.append_page(soup, soup.body) + return soup + + def append_page(self, soup, appendtag): + tag = soup.find('div', attrs={'id': 'Str'}) + try: + baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content'] + except: + return 1 + link = tag.findAll('a')[-1] + while link: + soup2 = self.index_to_soup(baseurl + link['href']) + link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1] + if not u'następne' in link.string: + link = '' + pagetext = soup2.find(id='artykul') + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag.extract() def get_cover_url(self): soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html') @@ -127,6 +104,9 @@ class Gazeta_Wyborcza(BasicNewsRecipe): self.cover_url = 'http://wyborcza.pl' + soup.img['src'] return getattr(self, 'cover_url', self.cover_url) - '''def image_url_processor(self, baseurl, url): - print "@@@@@@@@", url - return url.replace('http://wyborcza.pl/ ', '')''' + def image_url_processor(self, baseurl, url): + if url.startswith(' '): + return url.strip() + else: + return url + diff --git a/recipes/gosc_niedzielny.recipe b/recipes/gosc_niedzielny.recipe index 65e6e1704c..ba2280c2a5 100644 --- a/recipes/gosc_niedzielny.recipe +++ b/recipes/gosc_niedzielny.recipe @@ -13,7 +13,7 @@ class GN(BasicNewsRecipe): __author__ = 'Piotr Kontek, Tomasz Długosz' title = u'Gość Niedzielny' - description = 'Ogólnopolski tygodnik katolicki' + description = 'Ogólnopolski tygodnik katolicki - fragmenty artykułów z poprzedniego numeru' encoding = 'utf-8' no_stylesheets = True language = 'pl' @@ -33,7 +33,7 @@ class GN(BasicNewsRecipe): a = soup.find('div',attrs={'class':'release-wp-b'}).find('a') articles = [ {'title' : self.tag_to_string(a), - 'url' : 'http://www.gosc.pl' + a['href'].replace('/doc/','/doc_pr/') + 'url' : 'http://www.gosc.pl' + a['href'] }] feeds.append((u'Wstępniak',articles)) #kategorie @@ -71,12 +71,11 @@ class GN(BasicNewsRecipe): def preprocess_html(self, soup): self.append_page(soup, soup.body) - ''' - for image_div in soup.findAll(attrs={'class':'doc_image'}): - link = - if 'm.jpg' in image['src']: - image['src'] = image['src'].replace('m.jpg', '.jpg') - ''' + return soup + + def postprocess_html(self, soup, first_fetch): + for r in soup.findAll(attrs={'class':'pgr'}): + r.extract() return soup keep_only_tags = [ @@ -85,12 +84,14 @@ class GN(BasicNewsRecipe): remove_tags = [ dict(name='p', attrs={'class':['r tr', 'l l-2', 'wykop']}), - dict(name='div', attrs={'class':['doc_actions', 'pgr', 'fr1_cl']}), - dict(name='div', attrs={'id':'vote'}) + dict(name='div', attrs={'class':['doc_actions', 'cf', 'fr1_cl']}), + dict(name='div', attrs={'id':'vote'}), + dict(name='a', attrs={'class':'img_enlarge'}) ] extra_css = ''' h1 {font-size:150%} - div#doc_image {font-style:italic; font-size:70%} p.limiter {font-size:150%; font-weight: bold} + span.cm-i-a {text-transform:uppercase;} + span.cm-i-p {font-style:italic; font-size:70%} ''' diff --git a/recipes/icons/alejakomiksu_com.png b/recipes/icons/alejakomiksu_com.png new file mode 100644 index 0000000000000000000000000000000000000000..c564af10ffae2f5ae8a5ea47181e87f2ab54d017 GIT binary patch literal 575 zcmV-F0>J%=P)mjc>= zrUyy^CEk6L{{LUPAYVd8LM$ysa?d`gpMRu*4g+fVgy|%p?LfKjf23#6m8`9iJbhN` z)d%U%zo6d0q5)|9&;QcLPD=q@weNs5&`I|mNx%I9R0?z$HVr@iNsNx&FLOO@#B 
zmjp6ie~^xkl{k1z>ftj`$U^j>YxwbLEHjSqH8#MLT>&7@y}nRCr^>)W)o9W6jziLml6@z z(-PXaz71E>PzUOiofHh!6M5OWm1HG= zs^z4_WyD2fCB=Na+_7r_B1>~;88Jbi@j#>?B`(a*?`&^&_Us9`0s!GqpoSeSD9-=@ N002ovPDHLkV1g~(6@>r* literal 0 HcmV?d00001 diff --git a/recipes/icons/linuxportal_pl.png b/recipes/icons/linuxportal_pl.png new file mode 100644 index 0000000000000000000000000000000000000000..daa92d05d8ec101978b2ae07b7377f53f6e8d2c1 GIT binary patch literal 1430 zcmZ`&YgANK6#nJ`k_u`MQQnVX6l8|EGehcN2|6UBLW)porGlubsYGZ=rK6(22n;jK zd*o4B2@*b1OH0vwgbzdlABjaS*D7_Hn2p5D+G_WAa{6?3v? z1P%=u3LsFEq0T0DB=-ae$a}nG#5`hzE7CI4fCKtrbe17bU5Hc)2B>p;~((SXRx21o`WV8HkAV7~-RoQN@7 zATmH^hun!I2NX6WS)g4kYtU7+r41506phsggGdjV z6$%Gpjnu4quh;AI(7ksa5->TGLMB%jW#X90=?W(lR>YZRE^hJAUgqg~qwL)%1j0rJiUYceYnu9WAOXn9KM1l=*wOmwWL{t@rOeFf0dT9?q z5@J#V;U~7`=nX_yb8O)iP!xjnH=+#jMu>K<`l#(;zl#vMJ)V1i4is$ICx|t0grQY! zWEfI5BSc54oWh*WHY7W^k(fYaw{Xqnx^IHi zLZ-B#sChJiYsJ}F^B@rNh2oOd?kgwGcU|r2IU*1QmAIbj$=rKDvmJgxjb57dvNZeT zHr?r8>LD?jW&VE9nfQnVSWgSh_FUY?vTn2XG|RM~W7xsU)(b5(ee$2DouO+FKVc5t zXHE~Y9{u4#)@8oSob6y~uW8Q{hP`EU(~UKR(Y<5;BX;0aGt1^rB8hc%e+;mmn`Lh1 z&e#Aa+gq}`<789SLwAudLH3~VZ#H1MvTO$XPtbYp>5=u<#AM`PL%w;w5?cOAD4Q zU-9M;v|G`4_#*z;|`gqf3?Ut?EO0j)MS$TzSXXU1< zYJH8tSPPTcVzt>F&Ruo&4UM0C`WbeA{>86f?rCb?+wxUw-@g4q960#(p~FYM`S$3s z@4i2NqU|J3{c!rs+27Wm`?2r#n)dS-$e#~fywq{I^GerG-92RNx=s{Mo>{XYocQC4 z)8`i#14Ba#1H&(%P{RubhEf9thF1v;3|2E37{m+a>ZCt)8ZRVMoK&3!~K;Zv+8995P6^u#V?k)@+tg;?JPM@cXV~E7%-u~Ns zhYWa}FFS6&p~`)C>h1sa>QmX6?tPftpAxz4#C|W$()@ahw70$oH+}AW_d)U8tI`}B zL;tW(G7&4*DTSn!@~pTyfk~(>L2WPF0hdhHSL+ULeVWr&vF&VCoJ6o=Ijdcx7b&V`S3@xop46RH}wGE7{3=H(YWL-zmkei>9nO2EgL)^+s Q*+2~pp00i_>zopr07qfFKmY&$ literal 0 HcmV?d00001 diff --git a/recipes/icons/polter_pl.png b/recipes/icons/polter_pl.png new file mode 100644 index 0000000000000000000000000000000000000000..581c0dc41928b9a222f0489f9d6c12e247790aab GIT binary patch literal 766 zcmZ`$Z%7ky82*_vmS$aGb()eRlT91k`6t}4aMw(C<#4BwUahu%&RoaHbsK0insO|) zu~05Ztxw{4Hmszo0JxtEfSm(ihdC3vU;KpM(B6KWl}wo`6PzF%i%}c|BgIi-RT^ zG!g6r(mf6tGr?LJr^{(`kX$!Ks3b_55J7}^k%-pH*jCaz&9bd*;6uRu3<`yOJRr&? zv|i?$V|~+pXSdDSPdR&O35Iemx^IML%+%NfUKU&qLIcq^0^K7J2mo~j_f9d;NR4?I$r)6pkg!&gswX&yjdM|=90V^B zF(w%J7;t+S&Ov$SSZI)WCm4|!B{jU5Kah?eWh`a+`3eu)XmSd?R_MyQYvcEWvBy*7jttP_X~7-1B~b* z8y&x9Wi2bA*6nzbK-4DEAJO3E11gE}d^&Qo|ElZ44TVCXQK<*DN#aq@c;L`Rev`0? zWZH|2ThZ=L$-Bp!lP|WS(GwV&@qRPo>A=w7wa!>HT1Vq3y}Gh86zU3X8Dr5Bkc4J_ zix0bxZfLI$+w;~ArlV`;+?Kwyv>&DZ_?Xq-?%1i6Z^y)W;?Df|_UbEp*0C?ySxwS$ zq~T=dKJ%UDy%Vm)hXc(mg2lof%~CJEGik2;V|Y2${Rjh}X0z69uNQmxrl&3Cb(RLb z#UO|G3_O4W3@?;o#nOV3T3jYCDwY?X&BqGl7`8MJn)%0Iy4zrEZ2iAsbY7pr8-S`@ Kt(+iq9e)ADGa>>2 literal 0 HcmV?d00001 diff --git a/recipes/icons/racjonalista_pl.png b/recipes/icons/racjonalista_pl.png index 8f4d3c6c81be9d1db995bd3987b854e950b321b1..b5e82fff2a0f8188b2131b29bb762427d7dfbae4 100644 GIT binary patch delta 434 zcmcb_c89sXGr-TCmrII^fq{Y7)59eQNDF{42Mdtg`pf(kkYY)9^mSxl*x1kgCy^D% zXDkkKcVbv~PUa<$qmb+o+;K zw#~X2^z_GV_2pduZ7OxUm+7lcd1!yT{(Ar$8(UjTD9?Xcr$i$kfm1UZ8xMZ`aqcP) z5TvKvkoYri!ORV&Qy%V5(_lUvK;W--)6jH$Npat&+h=2CTzG*T5vi$jHjr*vh~RqQlfWig{wN1~6(oUHx3vIVCg! E0R4BLF#rGn delta 810 zcmV+_1J(T81JVXAiBL{Q4GJ0x0000DNk~Le0000m0000m2nGNE09OL}hX4Qo0drDE zLIAGL9O;oE6A}qw_ zhQVZzg)znu5raX*AY%-}f-D$hj3F|H7(~Qi5Q9k;7Q|p$5)ly*5raVtvNU9fK}@nB zLqzP~_#RL3d9U+2<;nkZg%^D9-E+@5-@V^G@92pCxC7h)?f`dyJHQ>_4sZv!1OK%H z)9Dn3f5Rbsettr;*%Yqvcnq?NE*6WBN~Itkk3%>dhNq_|NT<^h61`pzGMNlKJUoES zW&?x409vgU6bc2tC%KeLCHQRFP`i5dU$<(h0$mvJV&=t z{PFQIfpK+pC7%cy0d#rU-MhQHwS?Z^-$A3%e-I*MTS|7joi_R6UMLjW=;q7I3xVNw zyEl&8i^%8a=d}VJA0L@W32wn0Pfku? 
zo9eN0{Pn%n3f$b>tQFAj_W_rTSS5)7lgR`hA0JHO_d=fq&P5A2kl?Vak&5m;e9(07*qoM6N<$f)45bDP46hOx7_4S6Fo+k-*%fF5)Yl*2 z6XN>+|No0u9v(P&?#!8Mw;sIPvgN?;17{B&y|QcHnL7_&UBCURyLTFpz5BqqhtJ+U zdiLe*w{OQ!U0=U(@3Iw}FJ67Pa@E#L*B(83{^jwDug_k8KY8}%wjGC`zy0>^^Uu|5 zw#{9%cHX>I+cfRYoWJqp`8%MG3@ zx^e5l`?c$K00npMKKlIi*PZ=yFW-E&e#7n!8+T7(5no(XyKw2+lP51sW)_>LZwD0V z@(Jtgo7%%I)4(WL!ywRF+0@z6cXIN)I%eVSoQj<6()Q+_{rk`E+I0d*?%Q{I&z_Tq z4_`cY@&2(>*Y7=k`|#Q4^Ox>kzy0Fg;}3TpzB_yV&Za5zH*Y;~{KU1bJC4kpv1o;% z^JWIw-7E@w6AJe4JF$7|{w32Fuaq}l!yvtzL4Fs5+(vVc+mGHa7gXEAAiIr0ZYhK0 zB4&xz%QsGD5S_{(HitocK7+&}28k&Qq7xZJ8yEyz7=*hRM0y!S`WQrN7zC?;ZU&0* z$W${3v@;5~n>t?laqJQ>&H_t<{DK*URoU9^OBETjKKb`M#YBloWYfRtf^VO#EmlkY z?%Db+ao!JK$uECz{}h;V)mrksp7N)9_CnuWEw4XHOipWUPVou1-MS*eWc2)|)bkAj zwhrp~JtihV!x@vj-CYogS<#~t;h?8uW1|!l z=V+)E)M5sgbQ95uU)+A?d$D-;>4L#*U}9POpMYJ3{HCb zpFewAU&AETz?gkza$rbIP*hY{TISINUb7p^_AJ`8YS*$8V$XDT-`e%7?Uh?K`;3Og zlZ%6kySud~h2ObjYqK|?_W9e#-C7Tx%QL7j?)t0xf0O&~6rfL3OI#yLQW8s2t&)pU zffR$0fuXUkfvK*MMTn7sm63^+skyd+k(GggFLUTA6b-rgDVb@N=o*ZzjG!8>xc?Ue PYGCkm^>bP0l+XkKy0)_i literal 0 HcmV?d00001 diff --git a/recipes/icons/tawernarpg_pl.png b/recipes/icons/tawernarpg_pl.png new file mode 100644 index 0000000000000000000000000000000000000000..d4dfb17fe9beb6923fd4c912a2244a211d0e20ec GIT binary patch literal 1087 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!63?wyl`GbKJOS+@4BLl<6e(pbstUx|vage(c z!@6@aFM%9|WRD45bDP46hOx7_4S6Fo+k-*%fHRz`!IN z;1l8sRPgHF<|}9CKD)W@?Sn1PZ*BVc;?$Gdn@{iUdjI0$>j%4D-P``;`s$A_&U}1+ z^!%ZTpI=}6^6u{Ll|?V^?gKL9#d&}xSgP~;mlEgkM`cbxcJS3t?!>4`1t(D zhi8YL-`e>2#`zK#_darub!Fv`oYdm&yQU` zHudR^HP3Ia|M=|in}_@Ntu5QUu;|#1<|o(JubG*BaDC0${k?AH?vAks#}(?}>g&bPiSDRnA$~2W_gm{>jWP}~swrunA8B1rhW&3Nyx~U~Pi4_KGd0VU16vjvRSbEtT z`dBGs1?uMeDaJcWnke%+>hQ-qNu|5XIAP7X_-i>+>5c@rGInN85>| zxJsqENrzgA#yLv38wv+nN(NhshgwVdTS~hdiFz9e-#aB#28>jTk|4ie1_1{LgZuB- z-#;IqknlcW|9ydi^9tt`>fhgYc%QI;zruTg{{0E>_piUdzTP09{{DT3{`dFm6&xJm z9iK%4&EPEXh%5%u9$@TtGTRQwcWNHH?lHwD}Eke`g2ZiUEVvnd2#pbn3@zA731yyHyl{-V8R81M8OS07e92ISn*=U zjU5$+3jQ%liiVn&s;0WO%E3Y>rk|QLZQ|K;C(oYN*U*{7%ymLsTSIH=lxb7LL&8F} zn|T5PLxb0^T)R3yBPZ(`gY?Rqmu_9Vck$-cyO(dX7tRR#-_ki*&3~pzopr07t#Dp#T5? 
literal 0 HcmV?d00001 diff --git a/recipes/ihned.recipe b/recipes/ihned.recipe index a74f9e5649..69e78b0983 100644 --- a/recipes/ihned.recipe +++ b/recipes/ihned.recipe @@ -4,10 +4,9 @@ from calibre.web.feeds.recipes import BasicNewsRecipe class IHNed(BasicNewsRecipe): - stahnout_vsechny = True - #True = stahuje vsechny z homepage - #False = stahuje pouze dnesni clanky (ze dne, kdy je skript spusten) + # True = stahuje vsechny z homepage + # False = stahuje pouze dnesni clanky (ze dne, kdy je skript spusten) title = 'iHNed' __author__ = 'Karel Bílek' @@ -28,38 +27,33 @@ class IHNed(BasicNewsRecipe): 'linearize_tables' : True, } - - def preprocess_html(self, soup): def makeurl(wat): - return "http://ihned.cz"+wat; + return "http://ihned.cz"+wat for h1 in soup.findAll('h1'): - a = h1.find('a') - if a: - string = a.string - if string: - soup.a.replaceWith(string) + a = h1.find('a') + if a: + string = a.string + if string: + soup.a.replaceWith(string) for a in soup.findAll('a', href=True) : cil = str(a['href']) - if cil.startswith("/") or cil.startswith("index"): + if cil.startswith("/") or cil.startswith("index"): a['href'] = makeurl(cil) return soup - def parse_index(self): def makeurl(wat): - if wat.startswith("/") or wat.startswith("index"): - return "http://ihned.cz"+wat; + if wat.startswith("/") or wat.startswith("index"): + return "http://ihned.cz"+wat else: return wat - - articles = {} #vysledek, asi - key = None #soucasna sekce - ans = [] #vsechny sekce + articles = {} # vysledek, asi + ans = [] # vsechny sekce articles["Hlavní"] = [] ans.append("Hlavní") @@ -70,12 +64,11 @@ class IHNed(BasicNewsRecipe): articles[name] = [] ans.append(name) - soup = self.index_to_soup(url) otvirak = soup.find(True, attrs={'class':['otv']}) if otvirak: - #the code is copypasted here because I don't know python. simple as that. + # the code is copypasted here because I don't know python. simple as that. 
a = otvirak.find('a', href=True) title = self.tag_to_string(a, use_alt=True).strip() txt = otvirak.find(True, attrs={'class':['txt']}) @@ -98,13 +91,13 @@ class IHNed(BasicNewsRecipe): a = ow.find('a', href=True) title = self.tag_to_string(a, use_alt=True).strip() description='' - prx = ow.find(True, attrs={'class':['prx']}); + prx = ow.find(True, attrs={'class':['prx']}) if prx: description = str(prx.string) - nfo = ow.find(True, attrs={'class':['nfo']}); + nfo = ow.find(True, attrs={'class':['nfo']}) pubdate = '' if nfo: - dtime = time.localtime(); + dtime = time.localtime() day = dtime[2] month = dtime[1] @@ -119,11 +112,6 @@ class IHNed(BasicNewsRecipe): description=description, content='')) - - - - - soup = self.index_to_soup('http://ihned.cz/') otvirak = soup.find(True, attrs={'class':['otv']}) if otvirak: @@ -150,7 +138,7 @@ class IHNed(BasicNewsRecipe): a = otv2.find('a', attrs={'class':['tit2']}, href=True) title = self.tag_to_string(a, use_alt=True).strip() description='' - span = otv2.find('span'); + span = otv2.find('span') if span: match = re.match(r'\s*([^<]*)\s*]*?id="pol_lista"[^>]*?>.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur']*?>wersja do druku', re.DOTALL|re.IGNORECASE), lambda match: '')] + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + keep_only_tags = [dict(attrs={'class':'boxcontent'})] + remove_tags = [dict(attrs={'class':'fb-like'}), dict(attrs={'alt':'Wersja do druku'}), dict(id='pol_liczba'), dict(attrs={'scr':'http://static.polter.pl/tplimg/buttons/ceneo_140_40.gif'})] + remove_tags_after = dict(attrs={'class':'fb-like'}) + #remove_tags_before = dict() + + feeds = [(u'Wieści', 'http://polter.pl/wiesci,rss.html'), (u'RPG', 'http://rpg.polter.pl/wiesci,rss.html'), (u'Książki', 'http://ksiazki.polter.pl/wiesci,rss.html'), (u'Film', 'http://film.polter.pl/wiesci,rss.html'), (u'Komiks', 'http://komiks.polter.pl/wiesci,rss.html'), (u'Gry bitewne', 'http://bitewniaki.polter.pl/wiesci,rss.html'), (u'Gry karciane', 'http://karcianki.polter.pl/wiesci,rss.html'), (u'Gry planszowe', 'http://planszowki.polter.pl/wiesci,rss.html'), (u'Gry PC', 'http://gry.polter.pl/wiesci,rss.html'), (u'Gry konsolowe', 'http://konsole.polter.pl/wiesci,rss.html'), (u'Konwenty', 'http://konwenty.polter.pl/wiesci,rss.html'), (u'Blogi', 'http://polter.pl/blogi,rss.html')] + + def preprocess_html(self, soup): + for s in soup.findAll(attrs={'style':re.compile('float: ?left')}): + s['class'] = 'floatleft' + for s in soup.findAll(attrs={'style':re.compile('float: ?right')}): + s['class'] = 'floatright' + tag = soup.find(id='twoja_ocena') + if tag: + tag.parent.extract() + for tag in soup.findAll(id='lista_chce_ile'): + tag.parent.parent.extract() + return soup \ No newline at end of file diff --git a/recipes/sekurak_pl.recipe b/recipes/sekurak_pl.recipe new file mode 100644 index 0000000000..dd689798c7 --- /dev/null +++ b/recipes/sekurak_pl.recipe @@ -0,0 +1,28 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Sekurak(BasicNewsRecipe): + title = u'Sekurak' + __author__ = 'fenuks' + description = u'Wiadomości z dziedziny bezpieczeństwa' + category = 'it, security' + #publication_type = '' + language = 'pl' + #encoding = '' + #extra_css = '' + cover_url = 'http://www.securitum.pl/aktualnosci/sekurak.pl/image' + masthead_url = '' + use_embedded_content = False + oldest_article = 7 + max_articles_per_feed = 100 + 
no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + keep_only_tags = [dict(id='articleContent')] + #remove_tags = [] + #remove_tags_after = dict() + #remove_tags_before = dict() + + feeds = [(u'Wpisy', u'http://feeds.feedburner.com/sekurak')] diff --git a/recipes/tawernarpg_pl.recipe b/recipes/tawernarpg_pl.recipe new file mode 100644 index 0000000000..bda30c98d7 --- /dev/null +++ b/recipes/tawernarpg_pl.recipe @@ -0,0 +1,38 @@ +__license__ = 'GPL v3' +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class TawernaRPG(BasicNewsRecipe): + title = u'Tawerna RPG' + __author__ = 'fenuks' + description = u'Tawerna RPG to ogólnopolski serwis zajmujący się fantastyką i grami fantastycznymi. Znajdziesz u nas zarówno gry fabularne, karciane, planszowe i komputerowe, a także recenzje, opowiadania i sporą dawkę humoru.' + category = 'fantasy, rpg, board games' + #publication_type = '' + language = 'pl' + #encoding = '' + extra_css = '.slajd {list-style-type: none; padding-left: 0px; margin-left: 0px;} .lewanc {float: left; margin-right: 5px;} .srodek {display: block; margin-left: auto; margin-right: auto;}' + cover_url = 'http://www.tawerna.rpg.pl/img/logo.png' + #masthead_url = '' + preprocess_regexps = [(re.compile(ur'
Dodaj komentarz
    .*', re.DOTALL|re.IGNORECASE), lambda match: '')] + use_embedded_content = False + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + keep_only_tags = [dict(id='site')] + remove_tags = [dict(id=['player', 'komentarz'])] + remove_tags_after = dict(id='komentarz') + #remove_tags_before = dict() + + feeds = [(u'Artykuły', 'http://www.tawerna.rpg.pl/css/rss.rss')] + + def preprocess_html(self, soup): + for r in soup.findAll(attrs={'class':'powi'}): + r.parent.extract() + for c in soup.findAll(name=['li', 'ol', 'ul']): + c.name = 'div' + return soup \ No newline at end of file
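
Note on the shared feedsportal handling: the four Gazeta recipes touched by
this patch (Kraków, Szczecin, Warszawa and the main Gazeta Wyborcza) now
carry identical print_version() logic that unwraps rss.feedsportal.com
redirect URLs. The sketch below is a standalone restatement of that decoding
for reviewers; the function name and the default index argument are our own
illustration, not part of the patch itself.

    def decode_feedsportal(url, index='http://wyborcza.pl'):
        # Non-feedsportal links pass through untouched, as in the recipes.
        if 'feedsportal.com' not in url:
            return url
        # The article path is everything after the encoded domain marker.
        head, _, tail = url.rpartition('wyborcza0Bpl')
        if not head:
            tail = url.rpartition('gazeta0Bpl')[2]
        # Strip the redirector scaffolding, then expand the escaped
        # characters in the same order the recipes use.
        tail = tail.replace('/l/', '/')
        tail = tail.replace('/ia1.htm', '').replace('/story01.htm', '')
        for code, plain in (('0C', '/'), ('A', ''), ('0E', '-'),
                            ('0H', ','), ('0I', '_'), ('0B', '.')):
            tail = tail.replace(code, plain)
        return index + tail

Feeding it one of the rss.feedsportal.com links from the feeds above should
yield the direct wyborcza.pl or gazeta.pl article URL that the recipes fetch.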