align to kalibrator - gazeta_wyborcza

This commit is contained in:
Tomasz Długosz 2012-10-26 21:29:22 +02:00
parent 027c020b0a
commit 39fc5eb4ac

View File

@ -1,9 +1,10 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class Gazeta_Wyborcza(BasicNewsRecipe):
title = u'Gazeta Wyborcza'
__author__ = 'fenuks'
__author__ = 'fenuks, Artur Stachecki'
language = 'pl'
description = 'news from gazeta.pl'
category = 'newspaper'
@ -15,27 +16,14 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'}
keep_only_tags = dict(id=['gazeta_article', 'article'])
remove_tags_after = dict(id='gazeta_article_share')
remove_tags = [dict(attrs={'class':['artReadMore', 'gazeta_article_related_new', 'txt_upl']}), dict(id=['gazeta_article_likes', 'gazeta_article_tools', 'rel', 'gazeta_article_tags', 'gazeta_article_share', 'gazeta_article_brand', 'gazeta_article_miniatures'])]
remove_tags_before = dict(id='k0')
remove_tags_after = dict(id='banP4')
remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})]
feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'),
(u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'),
(u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'),
(u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'),
(u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'),
(u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'),
(u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'),
#(u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'),
(u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'),
(u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'),
(u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'),
(u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'),
(u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'),
(u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'),
(u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'),
(u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss')
(u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'), (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss')
]
def skip_ad_pages(self, soup):
@ -44,7 +32,6 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
new_soup = self.index_to_soup(tag['href'], raw=True)
return new_soup
def append_page(self, soup, appendtag):
loop = False
tag = soup.find('div', attrs={'id': 'Str'})
@ -85,14 +72,30 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
rem.extract()
def preprocess_html(self, soup):
if soup.find(attrs={'class': 'piano_btn_1'}):
return None
else:
self.append_page(soup, soup.body)
if soup.find(id='container_gal'):
self.gallery_article(soup.body)
return soup
def print_version(self, url):
if 'http://wyborcza.biz/biznes/' not in url:
return url
if url.count('rss.feedsportal.com'):
u = url.find('wyborcza0Bpl')
u = 'http://www.wyborcza.pl/' + url[u + 11:]
u = u.replace('0C', '/')
u = u.replace('A', '')
u = u.replace('0E', '-')
u = u.replace('0H', ',')
u = u.replace('0I', '_')
u = u.replace('0B', '.')
u = u.replace('/1,', '/2029020,')
u = u.replace('/story01.htm', '')
print(u)
return u
elif 'http://wyborcza.pl/1' in url:
return url.replace('http://wyborcza.pl/1', 'http://wyborcza.pl/2029020')
else:
return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020')