diff --git a/recipes/alejakomiksu_com.recipe b/recipes/alejakomiksu_com.recipe new file mode 100644 index 0000000000..f34fd1183d --- /dev/null +++ b/recipes/alejakomiksu_com.recipe @@ -0,0 +1,37 @@ +__license__ = 'GPL v3' +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class AlejaKomiksu(BasicNewsRecipe): + title = u'Aleja Komiksu' + __author__ = 'fenuks' + description = u'Serwis poświęcony komiksom. Najnowsze wieści, recenzje, artykuły, wywiady, galerie, komiksy online, konkursy, linki, baza komiksów online.' + category = 'comics' + #publication_type = '' + language = 'pl' + #encoding = '' + extra_css = 'ul {list-style-type: none;} .gfx_news {float: right;}' + preprocess_regexps = [(re.compile(ur'((
<li>(Do poczytania)|(Nowości):</li>)|(<h3>Komentarze</h3>)).*', re.DOTALL|re.IGNORECASE), lambda match: '')] + cover_url = 'http://www.alejakomiksu.com/gfx/build/logo.png' + masthead_url = 'http://www.alejakomiksu.com/gfx/build/logo.png' + use_embedded_content = False + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + keep_only_tags = [dict(attrs={'class':'cont_tresc'})] + #remove_tags = [dict()] + #remove_tags_before = dict() + + feeds = [(u'Wiadomości', 'http://www.alejakomiksu.com/rss.php5')] + + def skip_ad_pages(self, soup): + tag = soup.find(attrs={'class':'rodzaj'}) + if tag and tag.a.string.lower().strip() == 'recenzje': + link = soup.find(text=re.compile('recenzuje')) + if link: + return self.index_to_soup(link.parent['href'], raw=True) \ No newline at end of file diff --git a/recipes/fdb_pl.recipe b/recipes/fdb_pl.recipe new file mode 100644 index 0000000000..c7baf84408 --- /dev/null +++ b/recipes/fdb_pl.recipe @@ -0,0 +1,49 @@ +__license__ = 'GPL v3' +from calibre.web.feeds.news import BasicNewsRecipe + +class FDBPl(BasicNewsRecipe): + title = u'Fdb.pl' + __author__ = 'fenuks' + description = u'Wiadomości ze świata filmu, baza danych filmowych, recenzje, zwiastuny, boxoffice.' + category = 'film' + #publication_type = '' + language = 'pl' + #encoding = '' + extra_css = '.options-left > li {display: inline;} em {display: block;}' + cover_url = 'http://fdb.pl/assets/fdb2/logo.png' + #masthead_url = '' + use_embedded_content = False + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + keep_only_tags = [dict(attrs={'class':'news-item news-first'})] + remove_tags = [dict(attrs={'class':['dig dig-first', 'ads clearfix', 'comments']})] + #remove_tags_after = dict() + #remove_tags_before = dict() + feeds = [] + + def parse_index(self): + feeds = [] + feeds.append((u'Wiadomości', self.get_articles('http://fdb.pl/wiadomosci?page={0}', 2))) + return feeds + + def get_articles(self, url, pages=1): + articles = [] + for nr in range(1, pages+1): + soup = self.index_to_soup(url.format(nr)) + for tag in soup.findAll(attrs={'class':'news-item clearfix'}): + node = tag.find('h2') + title = node.a.string + article_url = 'http://fdb.pl' + node.a['href'] # do not reuse the name 'url' here: it would overwrite the page template and break url.format(nr) for the next index page + date = '' + articles.append({'title' : title, + 'url' : article_url, + 'date' : date, + 'description' : '' + }) + return articles \ No newline at end of file diff --git a/recipes/gazeta_pl_krakow.recipe b/recipes/gazeta_pl_krakow.recipe index 0f7633e4b2..4abc6120b7 100644 --- a/recipes/gazeta_pl_krakow.recipe +++ b/recipes/gazeta_pl_krakow.recipe @@ -8,94 +8,87 @@ krakow.gazeta.pl ''' from calibre.web.feeds.news import BasicNewsRecipe +import re +from calibre.ebooks.BeautifulSoup import Comment class gw_krakow(BasicNewsRecipe): title = u'Gazeta Wyborcza Kraków' __author__ = 'teepel based on GW from fenuks' language = 'pl' - description =u'Wiadomości z Krakowa na portalu Gazeta.pl.' - category='newspaper' + description = u'Wiadomości z Krakowa na portalu Gazeta.pl.'
+ category = 'newspaper' publication_type = 'newspaper' - masthead_url='http://bi.gazeta.pl/im/5/8528/m8528105.gif' - INDEX='http://krakow.gazeta.pl/' - remove_empty_feeds= True - oldest_article = 1 + # encoding = 'iso-8859-2' + masthead_url = 'http://bi.gazeta.pl/im/5/8528/m8528105.gif' + INDEX = 'http://krakow.gazeta.pl' + cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif' + remove_empty_feeds = True + oldest_article = 3 max_articles_per_feed = 100 - remove_javascript=True - no_stylesheets=True + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} - keep_only_tags =[] - keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'})) - - remove_tags =[] - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})) - remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'})) - remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'})) - remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'})) - remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_buttons'})) - - remove_tags_after = [dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})] + # rules for gazeta.pl + preprocess_regexps = [(re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '')] + keep_only_tags = [dict(id='gazeta_article')] + remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})] + remove_tags_after = dict(id='gazeta_article_body') feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/krakow.xml')] - def skip_ad_pages(self, soup): - tag=soup.find(name='a', attrs={'class':'btn'}) - if tag: - new_soup=self.index_to_soup(tag['href'], raw=True) - return new_soup - - - def append_page(self, soup, appendtag): - loop=False - tag = soup.find('div', attrs={'id':'Str'}) - if appendtag.find('div', attrs={'id':'Str'}): - nexturl=tag.findAll('a') - appendtag.find('div', attrs={'id':'Str'}).extract() - loop=True - if appendtag.find(id='source'): - appendtag.find(id='source').extract() - while loop: - loop=False - for link in nexturl: - if u'następne' in link.string: - url= self.INDEX + link['href'] - soup2 = self.index_to_soup(url) - pagetext = soup2.find(id='artykul') - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - tag = soup2.find('div', attrs={'id':'Str'}) - nexturl=tag.findAll('a') - loop=True - - def gallery_article(self, appendtag): - tag=appendtag.find(id='container_gal') - if tag: - nexturl=appendtag.find(id='gal_btn_next').a['href'] - appendtag.find(id='gal_navi').extract() - while nexturl: - soup2=self.index_to_soup(nexturl) - pagetext=soup2.find(id='container_gal') - 
nexturl=pagetext.find(id='gal_btn_next') - if nexturl: - nexturl=nexturl.a['href'] - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - rem=appendtag.find(id='gal_navi') - if rem: - rem.extract() + def print_version(self, url): + if 'feedsportal.com' in url: + s = url.rpartition('gazeta0Bpl') + u = s[2] + if not s[0]: + u = url.rpartition('wyborcza0Bpl')[2] + u = u.replace('/l/', '/') + u = u.replace('/ia1.htm', '') + u = u.replace('/story01.htm', '') + u = u.replace('0C', '/') + u = u.replace('A', '') + u = u.replace('0E', '-') + u = u.replace('0H', ',') + u = u.replace('0I', '_') + u = u.replace('0B', '.') + u = self.INDEX + u + return u + else: + return url def preprocess_html(self, soup): - self.append_page(soup, soup.body) - if soup.find(id='container_gal'): - self.gallery_article(soup.body) + tag = soup.find(id='Str') + if soup.find(attrs={'class': 'piano_btn_1'}): + return None + elif tag and tag.findAll('a'): + self.append_page(soup, soup.body) return soup + + def append_page(self, soup, appendtag): + tag = soup.find('div', attrs={'id': 'Str'}) + try: + baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content'] + except: + return 1 + link = tag.findAll('a')[-1] + while link: + soup2 = self.index_to_soup(baseurl + link['href']) + link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1] + if not u'następne' in link.string: + link = '' + pagetext = soup2.find(id='artykul') + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag.extract() + + def image_url_processor(self, baseurl, url): + if url.startswith(' '): + return url.strip() + else: + return url + diff --git a/recipes/gazeta_pl_szczecin.recipe b/recipes/gazeta_pl_szczecin.recipe index c0c83fd109..e966ca5eed 100644 --- a/recipes/gazeta_pl_szczecin.recipe +++ b/recipes/gazeta_pl_szczecin.recipe @@ -1,8 +1,8 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai import re -import string from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment class GazetaPlSzczecin(BasicNewsRecipe): title = u'Gazeta Wyborcza Szczecin' @@ -12,24 +12,74 @@ class GazetaPlSzczecin(BasicNewsRecipe): language = 'pl' publisher = 'Agora S.A.' 
category = 'news, szczecin' - oldest_article = 2 + INDEX = 'http://szczecin.gazeta.pl' + cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif' + remove_empty_feeds = True + oldest_article = 3 max_articles_per_feed = 100 - auto_cleanup = True - remove_tags = [ { "name" : "a", "attrs" : { "href" : "http://szczecin.gazeta.pl/szczecin/www.gazeta.pl" }}] - cover_url = "http://bi.gazeta.pl/i/hp/hp2009/logo.gif" + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} + + # rules for gazeta.pl + preprocess_regexps = [(re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '')] + keep_only_tags = [dict(id='gazeta_article')] + remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})] + remove_tags_after = dict(id='gazeta_article_body') feeds = [(u'Wszystkie', u'http://rss.feedsportal.com/c/32739/f/530434/index.rss')] - def get_article_url(self, article): - s = re.search("""/0L(szczecin.*)/story01.htm""", article.link) - s = s.group(1) - replacements = { "0B" : ".", "0C" : "/", "0H" : ",", "0I" : "_"} - for (a, b) in replacements.iteritems(): - s = string.replace(s, a, b) - s = string.replace(s, "0A", "0") - return "http://"+s - def print_version(self, url): - s = re.search("""/(\d*),(\d*),(\d*),.*\.html""", url) - no1 = s.group(2) - no2 = s.group(3) - return """http://szczecin.gazeta.pl/szczecin/2029020,%s,%s.html""" % (no1, no2) + if 'feedsportal.com' in url: + s = url.rpartition('gazeta0Bpl') + u = s[2] + if not s[0]: + u = url.rpartition('wyborcza0Bpl')[2] + u = u.replace('/l/', '/') + u = u.replace('/ia1.htm', '') + u = u.replace('/story01.htm', '') + u = u.replace('0C', '/') + u = u.replace('A', '') + u = u.replace('0E', '-') + u = u.replace('0H', ',') + u = u.replace('0I', '_') + u = u.replace('0B', '.') + u = self.INDEX + u + return u + else: + return url + + def preprocess_html(self, soup): + tag = soup.find(id='Str') + if soup.find(attrs={'class': 'piano_btn_1'}): + return None + elif tag and tag.findAll('a'): + self.append_page(soup, soup.body) + return soup + + def append_page(self, soup, appendtag): + tag = soup.find('div', attrs={'id': 'Str'}) + try: + baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content'] + except: + return 1 + link = tag.findAll('a')[-1] + while link: + soup2 = self.index_to_soup(baseurl + link['href']) + link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1] + if not u'następne' in link.string: + link = '' + pagetext = soup2.find(id='artykul') + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag.extract() + + def image_url_processor(self, baseurl, url): + if url.startswith(' '): + return url.strip() + else: + return url + diff --git a/recipes/gazeta_pl_warszawa.recipe b/recipes/gazeta_pl_warszawa.recipe index 6a37a96885..fcdffc3abd 100644 --- a/recipes/gazeta_pl_warszawa.recipe +++ b/recipes/gazeta_pl_warszawa.recipe @@ -7,7 +7,9 @@ __author__ = 'teepel based on GW from fenuks' warszawa.gazeta.pl ''' +import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment class gw_wawa(BasicNewsRecipe): title = u'Gazeta Wyborcza Warszawa' @@ -17,82 +19,75 @@ class gw_wawa(BasicNewsRecipe): category='newspaper' publication_type = 'newspaper' masthead_url='http://bi.gazeta.pl/im/3/4089/m4089863.gif' - 
INDEX='http://warszawa.gazeta.pl/' - remove_empty_feeds= True - oldest_article = 1 + INDEX = 'http://warszawa.gazeta.pl' + cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif' + remove_empty_feeds = True + oldest_article = 3 max_articles_per_feed = 100 - remove_javascript=True - no_stylesheets=True + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} - keep_only_tags =[] - keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'})) - - remove_tags =[] - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})) - remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'})) - remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'})) - remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'})) - remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'})) + # rules for gazeta.pl + preprocess_regexps = [(re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '')] + keep_only_tags = [dict(id='gazeta_article')] + remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})] + remove_tags_after = dict(id='gazeta_article_body') feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/warszawa.xml')] - def skip_ad_pages(self, soup): - tag=soup.find(name='a', attrs={'class':'btn'}) - if tag: - new_soup=self.index_to_soup(tag['href'], raw=True) - return new_soup - - - def append_page(self, soup, appendtag): - loop=False - tag = soup.find('div', attrs={'id':'Str'}) - if appendtag.find('div', attrs={'id':'Str'}): - nexturl=tag.findAll('a') - appendtag.find('div', attrs={'id':'Str'}).extract() - loop=True - if appendtag.find(id='source'): - appendtag.find(id='source').extract() - while loop: - loop=False - for link in nexturl: - if u'następne' in link.string: - url= self.INDEX + link['href'] - soup2 = self.index_to_soup(url) - pagetext = soup2.find(id='artykul') - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - tag = soup2.find('div', attrs={'id':'Str'}) - nexturl=tag.findAll('a') - loop=True - - def gallery_article(self, appendtag): - tag=appendtag.find(id='container_gal') - if tag: - nexturl=appendtag.find(id='gal_btn_next').a['href'] - appendtag.find(id='gal_navi').extract() - while nexturl: - soup2=self.index_to_soup(nexturl) - pagetext=soup2.find(id='container_gal') - nexturl=pagetext.find(id='gal_btn_next') - if nexturl: - nexturl=nexturl.a['href'] - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - rem=appendtag.find(id='gal_navi') - if rem: - rem.extract() + def print_version(self, url): + if 'feedsportal.com' in url: + s = url.rpartition('gazeta0Bpl') + u = s[2] + if not s[0]: + u = 
url.rpartition('wyborcza0Bpl')[2] + u = u.replace('/l/', '/') + u = u.replace('/ia1.htm', '') + u = u.replace('/story01.htm', '') + u = u.replace('0C', '/') + u = u.replace('A', '') + u = u.replace('0E', '-') + u = u.replace('0H', ',') + u = u.replace('0I', '_') + u = u.replace('0B', '.') + u = self.INDEX + u + return u + else: + return url def preprocess_html(self, soup): - self.append_page(soup, soup.body) - if soup.find(id='container_gal'): - self.gallery_article(soup.body) + tag = soup.find(id='Str') + if soup.find(attrs={'class': 'piano_btn_1'}): + return None + elif tag and tag.findAll('a'): + self.append_page(soup, soup.body) return soup + + def append_page(self, soup, appendtag): + tag = soup.find('div', attrs={'id': 'Str'}) + try: + baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content'] + except: + return 1 + link = tag.findAll('a')[-1] + while link: + soup2 = self.index_to_soup(baseurl + link['href']) + link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1] + if not u'następne' in link.string: + link = '' + pagetext = soup2.find(id='artykul') + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag.extract() + + def image_url_processor(self, baseurl, url): + if url.startswith(' '): + return url.strip() + else: + return url + diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 880aea5bc1..653c776723 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Comment - +import re class Gazeta_Wyborcza(BasicNewsRecipe): title = u'Gazeta Wyborcza' __author__ = 'fenuks, Artur Stachecki' @@ -9,7 +9,7 @@ class Gazeta_Wyborcza(BasicNewsRecipe): description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.' category = 'newspaper' publication_type = 'newspaper' - #encoding = 'iso-8859-2' + # encoding = 'iso-8859-2' masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' INDEX = 'http://wyborcza.pl' remove_empty_feeds = True @@ -19,10 +19,18 @@ class Gazeta_Wyborcza(BasicNewsRecipe): no_stylesheets = True use_embedded_content = False ignore_duplicate_articles = {'title', 'url'} - remove_tags_before = dict(id='k0') - remove_tags_after = dict(id='banP4') - remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})] - feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), + + # rules for gazeta.pl + preprocess_regexps = [(re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '')] + keep_only_tags = [dict(id='gazeta_article')] + remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})] + remove_tags_after = dict(id='gazeta_article_body') + + # rules for wyborcza.biz + preprocess_regexps.append((re.compile(u'(
<br>)?(<br>)? Czytaj (także|też):.*?\.?<br>
    ', re.DOTALL), lambda m: '')) + + feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), + (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), @@ -39,86 +47,55 @@ class Gazeta_Wyborcza(BasicNewsRecipe): (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'), (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss') - ] - - def skip_ad_pages(self, soup): - tag = soup.find(name='a', attrs={'class': 'btn'}) - if tag: - new_soup = self.index_to_soup(tag['href'], raw=True) - return new_soup - - def append_page(self, soup, appendtag): - loop = False - tag = soup.find('div', attrs={'id': 'Str'}) - if appendtag.find('div', attrs={'id': 'Str'}): - nexturl = tag.findAll('a') - appendtag.find('div', attrs={'id': 'Str'}).extract() - loop = True - if appendtag.find(id='source'): - appendtag.find(id='source').extract() - while loop: - loop = False - for link in nexturl: - if u'następne' in link.string: - url = self.INDEX + link['href'] - soup2 = self.index_to_soup(url) - pagetext = soup2.find(id='artykul') - comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) - for comment in comments: - comment.extract() - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - tag = soup2.find('div', attrs={'id': 'Str'}) - nexturl = tag.findAll('a') - loop = True - - def gallery_article(self, appendtag): - tag = appendtag.find(id='container_gal') - if tag: - nexturl = appendtag.find(id='gal_btn_next').a['href'] - appendtag.find(id='gal_navi').extract() - while nexturl: - soup2 = self.index_to_soup(nexturl) - pagetext = soup2.find(id='container_gal') - nexturl = pagetext.find(id='gal_btn_next') - if nexturl: - nexturl = nexturl.a['href'] - comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) - for comment in comments: - comment.extract() - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - rem = appendtag.find(id='gal_navi') - if rem: - rem.extract() - - def preprocess_html(self, soup): - if soup.find(attrs={'class': 'piano_btn_1'}): - return None - else: - self.append_page(soup, soup.body) - if soup.find(id='container_gal'): - self.gallery_article(soup.body) - return soup + ] def print_version(self, url): - if url.count('rss.feedsportal.com'): - u = url.find('wyborcza0Bpl') - u = 'http://www.wyborcza.pl/' + url[u + 11:] + if 'feedsportal.com' in url: + s = url.rpartition('wyborcza0Bpl') + u = s[2] + if not s[0]: + u = url.rpartition('gazeta0Bpl')[2] + u = u.replace('/l/', '/') + u = u.replace('/ia1.htm', '') + u = u.replace('/story01.htm', '') u = u.replace('0C', '/') u = u.replace('A', '') u = u.replace('0E', '-') u = u.replace('0H', ',') u = u.replace('0I', '_') u = u.replace('0B', '.') - u = u.replace('/1,', '/2029020,') - u = u.replace('/story01.htm', '') - print(u) + u = self.INDEX + u return u - elif 'http://wyborcza.pl/1' in url: - return url.replace('http://wyborcza.pl/1', 'http://wyborcza.pl/2029020') else: - return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') + return url + + def preprocess_html(self, soup): + tag = soup.find(id='Str') + if soup.find(attrs={'class': 'piano_btn_1'}): + return None + 
elif tag and tag.findAll('a'): + self.append_page(soup, soup.body) + return soup + + def append_page(self, soup, appendtag): + tag = soup.find('div', attrs={'id': 'Str'}) + try: + baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content'] + except: + return 1 + link = tag.findAll('a')[-1] + while link: + soup2 = self.index_to_soup(baseurl + link['href']) + link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1] + if not u'następne' in link.string: + link = '' + pagetext = soup2.find(id='artykul') + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag.extract() def get_cover_url(self): soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html') @@ -127,6 +104,9 @@ class Gazeta_Wyborcza(BasicNewsRecipe): self.cover_url = 'http://wyborcza.pl' + soup.img['src'] return getattr(self, 'cover_url', self.cover_url) - '''def image_url_processor(self, baseurl, url): - print "@@@@@@@@", url - return url.replace('http://wyborcza.pl/ ', '')''' + def image_url_processor(self, baseurl, url): + if url.startswith(' '): + return url.strip() + else: + return url + diff --git a/recipes/gosc_niedzielny.recipe b/recipes/gosc_niedzielny.recipe index 65e6e1704c..ba2280c2a5 100644 --- a/recipes/gosc_niedzielny.recipe +++ b/recipes/gosc_niedzielny.recipe @@ -13,7 +13,7 @@ class GN(BasicNewsRecipe): __author__ = 'Piotr Kontek, Tomasz Długosz' title = u'Gość Niedzielny' - description = 'Ogólnopolski tygodnik katolicki' + description = 'Ogólnopolski tygodnik katolicki - fragmenty artykułów z poprzedniego numeru' encoding = 'utf-8' no_stylesheets = True language = 'pl' @@ -33,7 +33,7 @@ class GN(BasicNewsRecipe): a = soup.find('div',attrs={'class':'release-wp-b'}).find('a') articles = [ {'title' : self.tag_to_string(a), - 'url' : 'http://www.gosc.pl' + a['href'].replace('/doc/','/doc_pr/') + 'url' : 'http://www.gosc.pl' + a['href'] }] feeds.append((u'Wstępniak',articles)) #kategorie @@ -71,12 +71,11 @@ class GN(BasicNewsRecipe): def preprocess_html(self, soup): self.append_page(soup, soup.body) - ''' - for image_div in soup.findAll(attrs={'class':'doc_image'}): - link = - if 'm.jpg' in image['src']: - image['src'] = image['src'].replace('m.jpg', '.jpg') - ''' + return soup + + def postprocess_html(self, soup, first_fetch): + for r in soup.findAll(attrs={'class':'pgr'}): + r.extract() return soup keep_only_tags = [ @@ -85,12 +84,14 @@ class GN(BasicNewsRecipe): remove_tags = [ dict(name='p', attrs={'class':['r tr', 'l l-2', 'wykop']}), - dict(name='div', attrs={'class':['doc_actions', 'pgr', 'fr1_cl']}), - dict(name='div', attrs={'id':'vote'}) + dict(name='div', attrs={'class':['doc_actions', 'cf', 'fr1_cl']}), + dict(name='div', attrs={'id':'vote'}), + dict(name='a', attrs={'class':'img_enlarge'}) ] extra_css = ''' h1 {font-size:150%} - div#doc_image {font-style:italic; font-size:70%} p.limiter {font-size:150%; font-weight: bold} + span.cm-i-a {text-transform:uppercase;} + span.cm-i-p {font-style:italic; font-size:70%} ''' diff --git a/recipes/icons/alejakomiksu_com.png b/recipes/icons/alejakomiksu_com.png new file mode 100644 index 0000000000..c564af10ff Binary files /dev/null and b/recipes/icons/alejakomiksu_com.png differ diff --git a/recipes/icons/linuxportal_pl.png b/recipes/icons/linuxportal_pl.png new file mode 100644 index 0000000000..daa92d05d8 Binary files /dev/null and b/recipes/icons/linuxportal_pl.png differ diff --git 
a/recipes/icons/picoboard_pl.png b/recipes/icons/picoboard_pl.png new file mode 100644 index 0000000000..1c1943242f Binary files /dev/null and b/recipes/icons/picoboard_pl.png differ diff --git a/recipes/icons/polter_pl.png b/recipes/icons/polter_pl.png new file mode 100644 index 0000000000..581c0dc419 Binary files /dev/null and b/recipes/icons/polter_pl.png differ diff --git a/recipes/icons/racjonalista_pl.png b/recipes/icons/racjonalista_pl.png index 8f4d3c6c81..b5e82fff2a 100644 Binary files a/recipes/icons/racjonalista_pl.png and b/recipes/icons/racjonalista_pl.png differ diff --git a/recipes/icons/sekurak_pl.png b/recipes/icons/sekurak_pl.png new file mode 100644 index 0000000000..37ce651fb4 Binary files /dev/null and b/recipes/icons/sekurak_pl.png differ diff --git a/recipes/icons/tawernarpg_pl.png b/recipes/icons/tawernarpg_pl.png new file mode 100644 index 0000000000..d4dfb17fe9 Binary files /dev/null and b/recipes/icons/tawernarpg_pl.png differ diff --git a/recipes/ihned.recipe b/recipes/ihned.recipe index a74f9e5649..69e78b0983 100644 --- a/recipes/ihned.recipe +++ b/recipes/ihned.recipe @@ -4,10 +4,9 @@ from calibre.web.feeds.recipes import BasicNewsRecipe class IHNed(BasicNewsRecipe): - stahnout_vsechny = True - #True = stahuje vsechny z homepage - #False = stahuje pouze dnesni clanky (ze dne, kdy je skript spusten) + # True = stahuje vsechny z homepage + # False = stahuje pouze dnesni clanky (ze dne, kdy je skript spusten) title = 'iHNed' __author__ = 'Karel Bílek' @@ -28,38 +27,33 @@ class IHNed(BasicNewsRecipe): 'linearize_tables' : True, } - - def preprocess_html(self, soup): def makeurl(wat): - return "http://ihned.cz"+wat; + return "http://ihned.cz"+wat for h1 in soup.findAll('h1'): - a = h1.find('a') - if a: - string = a.string - if string: - soup.a.replaceWith(string) + a = h1.find('a') + if a: + string = a.string + if string: + soup.a.replaceWith(string) for a in soup.findAll('a', href=True) : cil = str(a['href']) - if cil.startswith("/") or cil.startswith("index"): + if cil.startswith("/") or cil.startswith("index"): a['href'] = makeurl(cil) return soup - def parse_index(self): def makeurl(wat): - if wat.startswith("/") or wat.startswith("index"): - return "http://ihned.cz"+wat; + if wat.startswith("/") or wat.startswith("index"): + return "http://ihned.cz"+wat else: return wat - - articles = {} #vysledek, asi - key = None #soucasna sekce - ans = [] #vsechny sekce + articles = {} # vysledek, asi + ans = [] # vsechny sekce articles["Hlavní"] = [] ans.append("Hlavní") @@ -70,12 +64,11 @@ class IHNed(BasicNewsRecipe): articles[name] = [] ans.append(name) - soup = self.index_to_soup(url) otvirak = soup.find(True, attrs={'class':['otv']}) if otvirak: - #the code is copypasted here because I don't know python. simple as that. + # the code is copypasted here because I don't know python. simple as that. 
a = otvirak.find('a', href=True) title = self.tag_to_string(a, use_alt=True).strip() txt = otvirak.find(True, attrs={'class':['txt']}) @@ -98,13 +91,13 @@ class IHNed(BasicNewsRecipe): a = ow.find('a', href=True) title = self.tag_to_string(a, use_alt=True).strip() description='' - prx = ow.find(True, attrs={'class':['prx']}); + prx = ow.find(True, attrs={'class':['prx']}) if prx: description = str(prx.string) - nfo = ow.find(True, attrs={'class':['nfo']}); + nfo = ow.find(True, attrs={'class':['nfo']}) pubdate = '' if nfo: - dtime = time.localtime(); + dtime = time.localtime() day = dtime[2] month = dtime[1] @@ -119,11 +112,6 @@ class IHNed(BasicNewsRecipe): description=description, content='')) - - - - - soup = self.index_to_soup('http://ihned.cz/') otvirak = soup.find(True, attrs={'class':['otv']}) if otvirak: @@ -150,7 +138,7 @@ class IHNed(BasicNewsRecipe): a = otv2.find('a', attrs={'class':['tit2']}, href=True) title = self.tag_to_string(a, use_alt=True).strip() description='' - span = otv2.find('span'); + span = otv2.find('span') if span: match = re.match(r'\s*([^<]*)\s*<', str(span)) […] + preprocess_regexps = [(re.compile(ur'<div[^>]*?id="pol_lista"[^>]*?>.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'<a[^>]*?>wersja do druku</a>', re.DOTALL|re.IGNORECASE), lambda match: '')] + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + keep_only_tags = [dict(attrs={'class':'boxcontent'})] + remove_tags = [dict(attrs={'class':'fb-like'}), dict(attrs={'alt':'Wersja do druku'}), dict(id='pol_liczba'), dict(attrs={'src':'http://static.polter.pl/tplimg/buttons/ceneo_140_40.gif'})] + remove_tags_after = dict(attrs={'class':'fb-like'}) + #remove_tags_before = dict() + + feeds = [(u'Wieści', 'http://polter.pl/wiesci,rss.html'), (u'RPG', 'http://rpg.polter.pl/wiesci,rss.html'), (u'Książki', 'http://ksiazki.polter.pl/wiesci,rss.html'), (u'Film', 'http://film.polter.pl/wiesci,rss.html'), (u'Komiks', 'http://komiks.polter.pl/wiesci,rss.html'), (u'Gry bitewne', 'http://bitewniaki.polter.pl/wiesci,rss.html'), (u'Gry karciane', 'http://karcianki.polter.pl/wiesci,rss.html'), (u'Gry planszowe', 'http://planszowki.polter.pl/wiesci,rss.html'), (u'Gry PC', 'http://gry.polter.pl/wiesci,rss.html'), (u'Gry konsolowe', 'http://konsole.polter.pl/wiesci,rss.html'), (u'Konwenty', 'http://konwenty.polter.pl/wiesci,rss.html'), (u'Blogi', 'http://polter.pl/blogi,rss.html')] + + def preprocess_html(self, soup): + for s in soup.findAll(attrs={'style':re.compile('float: ?left')}): + s['class'] = 'floatleft' + for s in soup.findAll(attrs={'style':re.compile('float: ?right')}): + s['class'] = 'floatright' + tag = soup.find(id='twoja_ocena') + if tag: + tag.parent.extract() + for tag in soup.findAll(id='lista_chce_ile'): + tag.parent.parent.extract() + return soup \ No newline at end of file diff --git a/recipes/sekurak_pl.recipe b/recipes/sekurak_pl.recipe new file mode 100644 index 0000000000..dd689798c7 --- /dev/null +++ b/recipes/sekurak_pl.recipe @@ -0,0 +1,28 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Sekurak(BasicNewsRecipe): + title = u'Sekurak' + __author__ = 'fenuks' + description = u'Wiadomości z dziedziny bezpieczeństwa' + category = 'it, security' + #publication_type = '' + language = 'pl' + #encoding = '' + #extra_css = '' + cover_url = 'http://www.securitum.pl/aktualnosci/sekurak.pl/image' + masthead_url = '' + use_embedded_content = False + oldest_article = 7 + max_articles_per_feed = 100 +
no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + keep_only_tags = [dict(id='articleContent')] + #remove_tags = [] + #remove_tags_after = dict() + #remove_tags_before = dict() + + feeds = [(u'Wpisy', u'http://feeds.feedburner.com/sekurak')] diff --git a/recipes/tawernarpg_pl.recipe b/recipes/tawernarpg_pl.recipe new file mode 100644 index 0000000000..bda30c98d7 --- /dev/null +++ b/recipes/tawernarpg_pl.recipe @@ -0,0 +1,38 @@ +__license__ = 'GPL v3' +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class TawernaRPG(BasicNewsRecipe): + title = u'Tawerna RPG' + __author__ = 'fenuks' + description = u'Tawerna RPG to ogólnopolski serwis zajmujący się fantastyką i grami fantastycznymi. Znajdziesz u nas zarówno gry fabularne, karciane, planszowe i komputerowe, a także recenzje, opowiadania i sporą dawkę humoru.' + category = 'fantasy, rpg, board games' + #publication_type = '' + language = 'pl' + #encoding = '' + extra_css = '.slajd {list-style-type: none; padding-left: 0px; margin-left: 0px;} .lewanc {float: left; margin-right: 5px;} .srodek {display: block; margin-left: auto; margin-right: auto;}' + cover_url = 'http://www.tawerna.rpg.pl/img/logo.png' + #masthead_url = '' + preprocess_regexps = [(re.compile(ur'
<h3>Dodaj komentarz</h3>
    .*', re.DOTALL|re.IGNORECASE), lambda match: '')] + use_embedded_content = False + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + keep_only_tags = [dict(id='site')] + remove_tags = [dict(id=['player', 'komentarz'])] + remove_tags_after = dict(id='komentarz') + #remove_tags_before = dict() + + feeds = [(u'Artykuły', 'http://www.tawerna.rpg.pl/css/rss.rss')] + + def preprocess_html(self, soup): + for r in soup.findAll(attrs={'class':'powi'}): + r.parent.extract() + for c in soup.findAll(name=['li', 'ol', 'ul']): + c.name = 'div' + return soup \ No newline at end of file
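
Note: the print_version hooks repeated in the gazeta.pl recipes above all undo the same feedsportal URL escaping. For reference, a minimal standalone sketch of that decoding; the function name decode_feedsportal, the sample URL, and the article numbers below are made up for illustration and are not part of the patch:

    def decode_feedsportal(url, index='http://krakow.gazeta.pl'):
        # Feedsportal escapes the target URL inside its redirect link:
        # '0C' stands for '/', '0B' for '.', '0E' for '-', '0H' for ',',
        # '0I' for '_', and stray 'A' characters are padding. The recipes
        # take the part after the site's escaped domain, strip the
        # '/story01.htm' redirect suffix, and reverse the substitutions.
        if 'feedsportal.com' not in url:
            return url
        s = url.rpartition('gazeta0Bpl')
        u = s[2]
        if not s[0]:
            u = url.rpartition('wyborcza0Bpl')[2]
        for src, dst in (('/l/', '/'), ('/ia1.htm', ''), ('/story01.htm', ''),
                         ('0C', '/'), ('A', ''), ('0E', '-'), ('0H', ','),
                         ('0I', '_'), ('0B', '.')):
            u = u.replace(src, dst)
        return index + u

    # Hypothetical escaped link, for illustration only:
    # decode_feedsportal('http://rss.feedsportal.com/c/32739/f/530434/l/0Lkrakow0Bgazeta0Bpl0Ckrakow0C10H444250H131393530HTytul0Bhtml/story01.htm')
    # -> 'http://krakow.gazeta.pl/krakow/1,44425,13139353,Tytul.html'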