Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00.
align to kalibrator - gazeta_wyborcza
This commit is contained in:
parent
027c020b0a
commit
39fc5eb4ac
@ -1,104 +1,107 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
|
||||||
class Gazeta_Wyborcza(BasicNewsRecipe):
    """Calibre recipe for the Polish daily Gazeta Wyborcza (wyborcza.pl)."""
    title = u'Gazeta Wyborcza'
    __author__ = 'fenuks, Artur Stachecki'
    language = 'pl'
    description = 'news from gazeta.pl'
    category = 'newspaper'
    publication_type = 'newspaper'
    masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
    INDEX = 'http://wyborcza.pl'
    remove_empty_feeds = True
    oldest_article = 3
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    # Trim everything outside the article body on print pages.
    remove_tags_before = dict(id='k0')
    remove_tags_after = dict(id='banP4')
    remove_tags = [dict(name='div', attrs={'class': 'rel_box'}),
                   dict(attrs={'class': ['date', 'zdjP', 'zdjM', 'pollCont',
                                         'rel_video', 'brand', 'txt_upl']}),
                   dict(name='div', attrs={'id': 'footer'})]

    feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'),
             (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'),
             (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'),
             (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'),
             (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'),
             (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'),
             (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'),
             (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'),
             (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'),
             (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'),
             (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'),
             (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'),
             (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'),
             (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'),
             (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'),
             (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'),
             (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'),
             (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss')
             ]

    def skip_ad_pages(self, soup):
        # Interstitial ad pages carry an <a class="btn"> link to the real
        # article; follow it and hand the raw page back to calibre.
        tag = soup.find(name='a', attrs={'class': 'btn'})
        if tag:
            new_soup = self.index_to_soup(tag['href'], raw=True)
            return new_soup

    def append_page(self, soup, appendtag):
        """Inline all follow-on pages of a multi-page article into appendtag."""
        loop = False
        tag = soup.find('div', attrs={'id': 'Str'})  # pagination bar
        if appendtag.find('div', attrs={'id': 'Str'}):
            nexturl = tag.findAll('a')
            appendtag.find('div', attrs={'id': 'Str'}).extract()
            loop = True
        if appendtag.find(id='source'):
            appendtag.find(id='source').extract()
        while loop:
            loop = False
            for link in nexturl:
                # u'następne' == "next" in the pagination links
                if u'nast\u0119pne' in link.string:
                    url = self.INDEX + link['href']
                    soup2 = self.index_to_soup(url)
                    pagetext = soup2.find(id='artykul')
                    pos = len(appendtag.contents)
                    appendtag.insert(pos, pagetext)
                    tag = soup2.find('div', attrs={'id': 'Str'})
                    nexturl = tag.findAll('a')
                    loop = True

    def gallery_article(self, appendtag):
        """Walk a photo-gallery article, appending every slide to appendtag."""
        tag = appendtag.find(id='container_gal')
        if tag:
            nexturl = appendtag.find(id='gal_btn_next').a['href']
            appendtag.find(id='gal_navi').extract()
            while nexturl:
                soup2 = self.index_to_soup(nexturl)
                pagetext = soup2.find(id='container_gal')
                nexturl = pagetext.find(id='gal_btn_next')
                if nexturl:
                    nexturl = nexturl.a['href']
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
                # drop the gallery navigation chrome that came with the slide
                rem = appendtag.find(id='gal_navi')
                if rem:
                    rem.extract()

    def preprocess_html(self, soup):
        # Paywalled articles (piano button present) are dropped entirely.
        if soup.find(attrs={'class': 'piano_btn_1'}):
            return None
        else:
            self.append_page(soup, soup.body)
            if soup.find(id='container_gal'):
                self.gallery_article(soup.body)
            return soup

    def print_version(self, url):
        """Map a feed URL to the print-friendly article URL."""
        if url.count('rss.feedsportal.com'):
            # feedsportal encodes the target URL with 0B/0C/0E/0H/0I escapes;
            # decode it and switch to the print view (/2029020,).
            u = url.find('wyborcza0Bpl')
            u = 'http://www.wyborcza.pl/' + url[u + 11:]
            u = u.replace('0C', '/')
            u = u.replace('A', '')
            u = u.replace('0E', '-')
            u = u.replace('0H', ',')
            u = u.replace('0I', '_')
            u = u.replace('0B', '.')
            u = u.replace('/1,', '/2029020,')
            u = u.replace('/story01.htm', '')
            print(u)
            return u
        elif 'http://wyborcza.pl/1' in url:
            return url.replace('http://wyborcza.pl/1', 'http://wyborcza.pl/2029020')
        else:
            return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020')

    def get_cover_url(self):
        # The miniature front-page image (id='GWmini2') links to the full cover.
        soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html')
        cover = soup.find(id='GWmini2')
        soup = self.index_to_soup('http://wyborcza.pl/' + cover.contents[3].a['href'])
        self.cover_url = 'http://wyborcza.pl' + soup.img['src']
        return getattr(self, 'cover_url', self.cover_url)
|
Loading…
x
Reference in New Issue
Block a user