diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 5c034b10ab..3d416e444f 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -1,104 +1,107 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe + class Gazeta_Wyborcza(BasicNewsRecipe): - title = u'Gazeta Wyborcza' - __author__ = 'fenuks' - language = 'pl' - description ='news from gazeta.pl' - category='newspaper' + title = u'Gazeta Wyborcza' + __author__ = 'fenuks, Artur Stachecki' + language = 'pl' + description = 'news from gazeta.pl' + category = 'newspaper' publication_type = 'newspaper' - masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' - INDEX='http://wyborcza.pl' - remove_empty_feeds= True + masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' + INDEX = 'http://wyborcza.pl' + remove_empty_feeds = True oldest_article = 3 max_articles_per_feed = 100 - remove_javascript=True - no_stylesheets=True - ignore_duplicate_articles = {'title', 'url'} - keep_only_tags = dict(id=['gazeta_article', 'article']) - remove_tags_after = dict(id='gazeta_article_share') - remove_tags = [dict(attrs={'class':['artReadMore', 'gazeta_article_related_new', 'txt_upl']}), dict(id=['gazeta_article_likes', 'gazeta_article_tools', 'rel', 'gazeta_article_tags', 'gazeta_article_share', 'gazeta_article_brand', 'gazeta_article_miniatures'])] - - feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), - (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), - (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), - (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), - (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), - (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), - (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), - #(u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), - (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), - (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), - (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), - (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), - (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), - (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), - (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), - (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss') - ] + remove_javascript = True + no_stylesheets = True + remove_tags_before = dict(id='k0') + remove_tags_after = dict(id='banP4') + remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})] + feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), + (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), + (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), + (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), + (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'), (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss') + ] def skip_ad_pages(self, soup): - tag=soup.find(name='a', attrs={'class':'btn'}) - if tag: - new_soup=self.index_to_soup(tag['href'], raw=True) + tag = soup.find(name='a', attrs={'class': 'btn'}) + if tag: + new_soup = self.index_to_soup(tag['href'], raw=True) return new_soup - def append_page(self, soup, appendtag): - loop=False - tag = soup.find('div', attrs={'id':'Str'}) - if appendtag.find('div', attrs={'id':'Str'}): - nexturl=tag.findAll('a') - appendtag.find('div', attrs={'id':'Str'}).extract() - loop=True + loop = False + tag = soup.find('div', attrs={'id': 'Str'}) + if appendtag.find('div', attrs={'id': 'Str'}): + nexturl = tag.findAll('a') + appendtag.find('div', attrs={'id': 'Str'}).extract() + loop = True if appendtag.find(id='source'): appendtag.find(id='source').extract() while loop: - loop=False + loop = False for link in nexturl: if u'następne' in link.string: - url= self.INDEX + link['href'] + url = self.INDEX + link['href'] soup2 = self.index_to_soup(url) pagetext = soup2.find(id='artykul') pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - tag = soup2.find('div', attrs={'id':'Str'}) - nexturl=tag.findAll('a') - loop=True + tag = soup2.find('div', attrs={'id': 'Str'}) + nexturl = tag.findAll('a') + loop = True def gallery_article(self, appendtag): - tag=appendtag.find(id='container_gal') + tag = appendtag.find(id='container_gal') if tag: - nexturl=appendtag.find(id='gal_btn_next').a['href'] + nexturl = appendtag.find(id='gal_btn_next').a['href'] appendtag.find(id='gal_navi').extract() while nexturl: - soup2=self.index_to_soup(nexturl) - pagetext=soup2.find(id='container_gal') - nexturl=pagetext.find(id='gal_btn_next') + soup2 = self.index_to_soup(nexturl) + pagetext = soup2.find(id='container_gal') + nexturl = pagetext.find(id='gal_btn_next') if nexturl: - nexturl=nexturl.a['href'] + nexturl = nexturl.a['href'] pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - rem=appendtag.find(id='gal_navi') + rem = appendtag.find(id='gal_navi') if rem: rem.extract() def preprocess_html(self, soup): - self.append_page(soup, soup.body) - if soup.find(id='container_gal'): - self.gallery_article(soup.body) - return soup + if soup.find(attrs={'class': 'piano_btn_1'}): + return None + else: + self.append_page(soup, soup.body) + if soup.find(id='container_gal'): + self.gallery_article(soup.body) + return soup def print_version(self, url): - if 'http://wyborcza.biz/biznes/' not in url: - return url + if url.count('rss.feedsportal.com'): + u = url.find('wyborcza0Bpl') + u = 'http://www.wyborcza.pl/' + url[u + 11:] + u = u.replace('0C', '/') + u = u.replace('A', '') + u = u.replace('0E', '-') + u = u.replace('0H', ',') + u = u.replace('0I', '_') + u = u.replace('0B', '.') + u = u.replace('/1,', '/2029020,') + u = u.replace('/story01.htm', '') + print(u) + return u + elif 'http://wyborcza.pl/1' in url: + return url.replace('http://wyborcza.pl/1', 'http://wyborcza.pl/2029020') else: - return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') + return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') def get_cover_url(self): soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html') - cover=soup.find(id='GWmini2') - soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href']) - self.cover_url='http://wyborcza.pl' + soup.img['src'] + cover = soup.find(id='GWmini2') + soup = self.index_to_soup('http://wyborcza.pl/' + cover.contents[3].a['href']) + self.cover_url = 'http://wyborcza.pl' + soup.img['src'] return getattr(self, 'cover_url', self.cover_url)