diff --git a/recipes/gazeta_pl_krakow.recipe b/recipes/gazeta_pl_krakow.recipe new file mode 100644 index 0000000000..0f35e536f6 --- /dev/null +++ b/recipes/gazeta_pl_krakow.recipe @@ -0,0 +1,103 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = 'teepel based on GW from fenuks' + +''' +krakow.gazeta.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class gw_krakow(BasicNewsRecipe): + title = u'Gazeta.pl Kraków' + __author__ = 'teepel based on GW from fenuks' + language = 'pl' + description =u'Wiadomości z Krakowa na portalu Gazeta.pl.' + category='newspaper' + publication_type = 'newspaper' + masthead_url='http://bi.gazeta.pl/im/5/8528/m8528105.gif' + INDEX='http://krakow.gazeta.pl/' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'})) + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})) + remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'})) + remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'})) + remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'})) + remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_buttons'})) + + remove_tags_after = [dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})] + + feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/krakow.xml')] + + def skip_ad_pages(self, soup): + tag=soup.find(name='a', attrs={'class':'btn'}) + if tag: + new_soup=self.index_to_soup(tag['href'], raw=True) + return new_soup + + + def append_page(self, soup, appendtag): + loop=False + tag = soup.find('div', attrs={'id':'Str'}) + if appendtag.find('div', attrs={'id':'Str'}): + nexturl=tag.findAll('a') + appendtag.find('div', attrs={'id':'Str'}).extract() + loop=True + if appendtag.find(id='source'): + appendtag.find(id='source').extract() + while loop: + loop=False + for link in nexturl: + if u'następne' in link.string: + url= self.INDEX + link['href'] + soup2 = self.index_to_soup(url) + pagetext = soup2.find(id='artykul') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag = soup2.find('div', attrs={'id':'Str'}) + nexturl=tag.findAll('a') + loop=True + + def gallery_article(self, appendtag): + tag=appendtag.find(id='container_gal') + if tag: + nexturl=appendtag.find(id='gal_btn_next').a['href'] + appendtag.find(id='gal_navi').extract() + while nexturl: + soup2=self.index_to_soup(nexturl) + pagetext=soup2.find(id='container_gal') + nexturl=pagetext.find(id='gal_btn_next') + if nexturl: + nexturl=nexturl.a['href'] + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + rem=appendtag.find(id='gal_navi') + if rem: + rem.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + if soup.find(id='container_gal'): + self.gallery_article(soup.body) + return soup + diff --git a/recipes/gazeta_pl_warszawa.recipe b/recipes/gazeta_pl_warszawa.recipe new file mode 100644 index 0000000000..7a43931db4 --- /dev/null +++ b/recipes/gazeta_pl_warszawa.recipe @@ -0,0 +1,100 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'teepel based on GW from fenuks' + +''' +warszawa.gazeta.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class gw_wawa(BasicNewsRecipe): + title = u'Gazeta.pl Warszawa' + __author__ = 'teepel based on GW from fenuks' + language = 'pl' + description ='Wiadomości z Warszawy na portalu Gazeta.pl.' + category='newspaper' + publication_type = 'newspaper' + masthead_url='http://bi.gazeta.pl/im/3/4089/m4089863.gif' + INDEX='http://warszawa.gazeta.pl/' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'})) + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})) + remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'})) + remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'})) + remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'})) + remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'})) + + feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/warszawa.xml')] + + def skip_ad_pages(self, soup): + tag=soup.find(name='a', attrs={'class':'btn'}) + if tag: + new_soup=self.index_to_soup(tag['href'], raw=True) + return new_soup + + + def append_page(self, soup, appendtag): + loop=False + tag = soup.find('div', attrs={'id':'Str'}) + if appendtag.find('div', attrs={'id':'Str'}): + nexturl=tag.findAll('a') + appendtag.find('div', attrs={'id':'Str'}).extract() + loop=True + if appendtag.find(id='source'): + appendtag.find(id='source').extract() + while loop: + loop=False + for link in nexturl: + if u'następne' in link.string: + url= self.INDEX + link['href'] + soup2 = self.index_to_soup(url) + pagetext = soup2.find(id='artykul') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag = soup2.find('div', attrs={'id':'Str'}) + nexturl=tag.findAll('a') + loop=True + + def gallery_article(self, appendtag): + tag=appendtag.find(id='container_gal') + if tag: + nexturl=appendtag.find(id='gal_btn_next').a['href'] + appendtag.find(id='gal_navi').extract() + while nexturl: + soup2=self.index_to_soup(nexturl) + pagetext=soup2.find(id='container_gal') + nexturl=pagetext.find(id='gal_btn_next') + if nexturl: + nexturl=nexturl.a['href'] + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + rem=appendtag.find(id='gal_navi') + if rem: + rem.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + if soup.find(id='container_gal'): + self.gallery_article(soup.body) + return soup + diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 3d416e444f..633b80444a 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class Gazeta_Wyborcza(BasicNewsRecipe): - title = u'Gazeta Wyborcza' + title = u'Gazeta.pl' __author__ = 'fenuks, Artur Stachecki' language = 'pl' description = 'news from gazeta.pl' diff --git a/recipes/icons/gazeta_pl_krakow.png b/recipes/icons/gazeta_pl_krakow.png new file mode 100644 index 0000000000..119afbba3a Binary files /dev/null and b/recipes/icons/gazeta_pl_krakow.png differ diff --git a/recipes/icons/gazeta_pl_szczecin.png b/recipes/icons/gazeta_pl_szczecin.png new file mode 100644 index 0000000000..119afbba3a Binary files /dev/null and b/recipes/icons/gazeta_pl_szczecin.png differ diff --git a/recipes/icons/gazeta_pl_warszawa.png b/recipes/icons/gazeta_pl_warszawa.png new file mode 100644 index 0000000000..119afbba3a Binary files /dev/null and b/recipes/icons/gazeta_pl_warszawa.png differ diff --git a/recipes/icons/gazeta_wyborcza.png b/recipes/icons/gazeta_wyborcza.png index 9e480cc41d..119afbba3a 100644 Binary files a/recipes/icons/gazeta_wyborcza.png and b/recipes/icons/gazeta_wyborcza.png differ