Various updated Polish recipes

This commit is contained in:
Kovid Goyal 2012-11-07 17:27:11 +05:30
commit a6acd8c1c9
7 changed files with 165 additions and 126 deletions

View File

@ -35,3 +35,7 @@ nbproject/
.settings/
*.DS_Store
calibre_plugins/
recipes/.git
recipes/.gitignore
recipes/README
recipes/katalog_egazeciarz.recipe

View File

@ -2,7 +2,9 @@ import re
from calibre.web.feeds.news import BasicNewsRecipe
class FocusRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = u'intromatyk <intromatyk@gmail.com>'
language = 'pl'
@ -12,10 +14,10 @@ class FocusRecipe(BasicNewsRecipe):
publisher = u'Gruner + Jahr Polska'
category = u'News'
description = u'Newspaper'
category='magazine'
cover_url=''
remove_empty_feeds= True
no_stylesheets=True
category = 'magazine'
cover_url = ''
remove_empty_feeds = True
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100000
recursions = 0
@ -27,15 +29,15 @@ class FocusRecipe(BasicNewsRecipe):
simultaneous_downloads = 5
r = re.compile('.*(?P<url>http:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')
keep_only_tags =[]
keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'cll'}))
remove_tags =[]
remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulm noprint'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'txb'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'h2'}))
remove_tags.append(dict(name = 'ul', attrs = {'class' : 'txu'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulc'}))
keep_only_tags = []
keep_only_tags.append(dict(name='div', attrs={'id': 'cll'}))
remove_tags = []
remove_tags.append(dict(name='div', attrs={'class': 'ulm noprint'}))
remove_tags.append(dict(name='div', attrs={'class': 'txb'}))
remove_tags.append(dict(name='div', attrs={'class': 'h2'}))
remove_tags.append(dict(name='ul', attrs={'class': 'txu'}))
remove_tags.append(dict(name='div', attrs={'class': 'ulc'}))
extra_css = '''
body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
@ -44,18 +46,17 @@ class FocusRecipe(BasicNewsRecipe):
p.lead {font-weight: bold; text-align: left;}
.authordate {font-size: small; color: #696969;}
.fot{font-size: x-small; color: #666666;}
'''
'''
feeds = [
('Nauka', 'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
('Historia', 'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
('Cywilizacja', 'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
('Sport', 'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
('Technika', 'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
('Przyroda', 'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
('Technologie', 'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
]
feeds = [
('Nauka', 'http://www.focus.pl/nauka/rss/'),
('Historia', 'http://www.focus.pl/historia/rss/'),
('Cywilizacja', 'http://www.focus.pl/cywilizacja/rss/'),
('Sport', 'http://www.focus.pl/sport/rss/'),
('Technika', 'http://www.focus.pl/technika/rss/'),
('Przyroda', 'http://www.focus.pl/przyroda/rss/'),
('Technologie', 'http://www.focus.pl/gadzety/rss/')
]
def skip_ad_pages(self, soup):
if ('advertisement' in soup.find('title').string.lower()):
@ -65,20 +66,20 @@ class FocusRecipe(BasicNewsRecipe):
return None
def get_cover_url(self):
soup=self.index_to_soup('http://www.focus.pl/magazyn/')
tag=soup.find(name='div', attrs={'class':'clr fl'})
soup = self.index_to_soup('http://www.focus.pl/magazyn/')
tag = soup.find(name='div', attrs={'class': 'clr fl'})
if tag:
self.cover_url='http://www.focus.pl/' + tag.a['href']
self.cover_url = 'http://www.focus.pl/' + tag.a['href']
return getattr(self, 'cover_url', self.cover_url)
def print_version(self, url):
if url.count ('focus.pl.feedsportal.com'):
if url.count('focus.pl.feedsportal.com'):
u = url.find('focus0Bpl')
u = 'http://www.focus.pl/' + url[u + 11:]
u = u.replace('0C', '/')
u = u.replace('A', '')
u = u.replace ('0E','-')
u = u.replace('0E', '-')
u = u.replace('/nc/1//story01.htm', '/do-druku/1')
else:
u = url.replace('/nc/1','/do-druku/1')
return u
else:
u = url.replace('/nc/1', '/do-druku/1')
return u

View File

@ -1,104 +1,107 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class Gazeta_Wyborcza(BasicNewsRecipe):
title = u'Gazeta Wyborcza'
__author__ = 'fenuks'
language = 'pl'
description ='news from gazeta.pl'
category='newspaper'
title = u'Gazeta Wyborcza'
__author__ = 'fenuks, Artur Stachecki'
language = 'pl'
description = 'news from gazeta.pl'
category = 'newspaper'
publication_type = 'newspaper'
masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
INDEX='http://wyborcza.pl'
remove_empty_feeds= True
masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
INDEX = 'http://wyborcza.pl'
remove_empty_feeds = True
oldest_article = 3
max_articles_per_feed = 100
remove_javascript=True
no_stylesheets=True
ignore_duplicate_articles = {'title', 'url'}
keep_only_tags = dict(id=['gazeta_article', 'article'])
remove_tags_after = dict(id='gazeta_article_share')
remove_tags = [dict(attrs={'class':['artReadMore', 'gazeta_article_related_new', 'txt_upl']}), dict(id=['gazeta_article_likes', 'gazeta_article_tools', 'rel', 'gazeta_article_tags', 'gazeta_article_share', 'gazeta_article_brand', 'gazeta_article_miniatures'])]
feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'),
(u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'),
(u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'),
(u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'),
(u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'),
(u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'),
(u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'),
#(u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'),
(u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'),
(u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'),
(u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'),
(u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'),
(u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'),
(u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'),
(u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'),
(u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss')
]
remove_javascript = True
no_stylesheets = True
remove_tags_before = dict(id='k0')
remove_tags_after = dict(id='banP4')
remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})]
feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'),
(u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'),
(u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'),
(u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'),
(u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'), (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss')
]
def skip_ad_pages(self, soup):
tag=soup.find(name='a', attrs={'class':'btn'})
if tag:
new_soup=self.index_to_soup(tag['href'], raw=True)
tag = soup.find(name='a', attrs={'class': 'btn'})
if tag:
new_soup = self.index_to_soup(tag['href'], raw=True)
return new_soup
def append_page(self, soup, appendtag):
loop=False
tag = soup.find('div', attrs={'id':'Str'})
if appendtag.find('div', attrs={'id':'Str'}):
nexturl=tag.findAll('a')
appendtag.find('div', attrs={'id':'Str'}).extract()
loop=True
loop = False
tag = soup.find('div', attrs={'id': 'Str'})
if appendtag.find('div', attrs={'id': 'Str'}):
nexturl = tag.findAll('a')
appendtag.find('div', attrs={'id': 'Str'}).extract()
loop = True
if appendtag.find(id='source'):
appendtag.find(id='source').extract()
while loop:
loop=False
loop = False
for link in nexturl:
if u'następne' in link.string:
url= self.INDEX + link['href']
url = self.INDEX + link['href']
soup2 = self.index_to_soup(url)
pagetext = soup2.find(id='artykul')
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
tag = soup2.find('div', attrs={'id':'Str'})
nexturl=tag.findAll('a')
loop=True
tag = soup2.find('div', attrs={'id': 'Str'})
nexturl = tag.findAll('a')
loop = True
def gallery_article(self, appendtag):
tag=appendtag.find(id='container_gal')
tag = appendtag.find(id='container_gal')
if tag:
nexturl=appendtag.find(id='gal_btn_next').a['href']
nexturl = appendtag.find(id='gal_btn_next').a['href']
appendtag.find(id='gal_navi').extract()
while nexturl:
soup2=self.index_to_soup(nexturl)
pagetext=soup2.find(id='container_gal')
nexturl=pagetext.find(id='gal_btn_next')
soup2 = self.index_to_soup(nexturl)
pagetext = soup2.find(id='container_gal')
nexturl = pagetext.find(id='gal_btn_next')
if nexturl:
nexturl=nexturl.a['href']
nexturl = nexturl.a['href']
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
rem=appendtag.find(id='gal_navi')
rem = appendtag.find(id='gal_navi')
if rem:
rem.extract()
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
if soup.find(id='container_gal'):
self.gallery_article(soup.body)
return soup
if soup.find(attrs={'class': 'piano_btn_1'}):
return None
else:
self.append_page(soup, soup.body)
if soup.find(id='container_gal'):
self.gallery_article(soup.body)
return soup
def print_version(self, url):
if 'http://wyborcza.biz/biznes/' not in url:
return url
if url.count('rss.feedsportal.com'):
u = url.find('wyborcza0Bpl')
u = 'http://www.wyborcza.pl/' + url[u + 11:]
u = u.replace('0C', '/')
u = u.replace('A', '')
u = u.replace('0E', '-')
u = u.replace('0H', ',')
u = u.replace('0I', '_')
u = u.replace('0B', '.')
u = u.replace('/1,', '/2029020,')
u = u.replace('/story01.htm', '')
print(u)
return u
elif 'http://wyborcza.pl/1' in url:
return url.replace('http://wyborcza.pl/1', 'http://wyborcza.pl/2029020')
else:
return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020')
return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020')
def get_cover_url(self):
soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html')
cover=soup.find(id='GWmini2')
soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href'])
self.cover_url='http://wyborcza.pl' + soup.img['src']
cover = soup.find(id='GWmini2')
soup = self.index_to_soup('http://wyborcza.pl/' + cover.contents[3].a['href'])
self.cover_url = 'http://wyborcza.pl' + soup.img['src']
return getattr(self, 'cover_url', self.cover_url)

View File

@ -4,7 +4,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class FocusRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = u'intromatyk <intromatyk@gmail.com>'
__author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
language = 'pl'
version = 1

View File

@ -34,16 +34,20 @@ class RzeczpospolitaRecipe(BasicNewsRecipe):
keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'story'}))
remove_tags =[]
remove_tags.append(dict(name = 'div', attrs = {'class' : 'articleLeftBox'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'socialNewTools'}))
remove_tags.append(dict(name = 'div', attrs = {'id' : 'socialTools'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'articleToolBoxTop'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'clr'}))
remove_tags.append(dict(name = 'div', attrs = {'id' : 'recommendations'}))
remove_tags.append(dict(name = 'div', attrs = {'id' : 'editorPicks'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'editorPicks'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'editorPicks editorPicksFirst'}))
remove_tags.append(dict(name = 'div', attrs = {'id' : 'articleCopyrightText'}))
remove_tags.append(dict(name = 'div', attrs = {'id' : 'articleCopyrightButton'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'articleToolBoxBottom'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'more'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'addRecommendation'}))
remove_tags.append(dict(name = 'h3', attrs = {'id' : 'tags'}))
extra_css = '''
body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
@ -67,3 +71,4 @@ class RzeczpospolitaRecipe(BasicNewsRecipe):
return start + '/' + index + '?print=tak'

View File

@ -1,34 +1,55 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image
class tvn24(BasicNewsRecipe):
title = u'TVN24'
oldest_article = 7
max_articles_per_feed = 100
__author__ = 'fenuks'
__author__ = 'fenuks, Artur Stachecki'
description = u'Sport, Biznes, Gospodarka, Informacje, Wiadomości Zawsze aktualne wiadomości z Polski i ze świata'
category = 'news'
language = 'pl'
#masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
cover_url= 'http://www.userlogos.org/files/logos/Struna/TVN24.jpg'
extra_css = 'ul {list-style:none;} \
li {list-style:none; float: left; margin: 0 0.15em;} \
h2 {font-size: medium} \
.date60m {float: left; margin: 0 10px 0 5px;}'
masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
cover_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
ignore_duplicate_articles = {'title', 'url'}
keep_only_tags=[dict(name='h1', attrs={'class':['size30 mt10 pb10', 'size38 mt10 pb15']}), dict(name='figure', attrs={'class':'articleMainPhoto articleMainPhotoWide'}), dict(name='article', attrs={'class':['mb20', 'mb20 textArticleDefault']}), dict(name='ul', attrs={'class':'newsItem'})]
remove_tags = [dict(name='aside', attrs={'class':['innerArticleModule onRight cols externalContent', 'innerArticleModule center']}), dict(name='div', attrs={'class':['thumbsGallery', 'articleTools', 'article right rd7', 'heading', 'quizContent']}), dict(name='a', attrs={'class':'watchMaterial text'}), dict(name='section', attrs={'class':['quiz toCenter', 'quiz toRight']})]
feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'),
(u'Polska', u'www.tvn24.pl/polska.xml'), (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')]
keep_only_tags=[
# dict(name='h1', attrs={'class':'size38 mt20 pb20'}),
dict(name='div', attrs={'class':'mainContainer'}),
# dict(name='p'),
# dict(attrs={'class':['size18 mt10 mb15', 'bold topicSize1', 'fromUsers content', 'textArticleDefault']})
]
remove_tags=[
dict(attrs={'class':['commentsInfo', 'textSize', 'related newsNews align-right', 'box', 'watchMaterial text', 'related galleryGallery align-center', 'advert block-alignment-right', 'userActions', 'socialBookmarks', 'im yourArticle fl', 'dynamicButton addComment fl', 'innerArticleModule onRight cols externalContent', 'thumbsGallery', 'relatedObject customBlockquote align-right', 'lead', 'mainRightColumn', 'articleDateContainer borderGreyBottom', 'socialMediaContainer onRight loaded', 'quizContent', 'twitter', 'facebook', 'googlePlus', 'share', 'voteResult', 'reportTitleBar bgBlue_v4 mb15', 'innerVideoModule center']}),
dict(name='article', attrs={'class':['singleArtPhotoCenter', 'singleArtPhotoRight', 'singleArtPhotoLeft']}),
dict(name='section', attrs={'id':['forum', 'innerArticle', 'quiz toCenter', 'mb20']}),
dict(name='div', attrs={'class':'socialMediaContainer big p20 mb20 borderGrey loaded'})
]
remove_tags_after=[dict(name='li', attrs={'class':'share'})]
feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'), ]
#(u'Polska', u'www.tvn24.pl/polska.xml'), (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
tag = soup.find(name='ul', attrs={'class':'newsItem'})
if tag:
tag.name='div'
tag.li.name='div'
return soup
def preprocess_html(self, soup):
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup
def postprocess_html(self, soup, first):
#process all the images
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
if img < 0:
raise RuntimeError('Out of memory')
img.type = "GrayscaleType"
img.save(iurl)
return soup

View File

@ -3,6 +3,8 @@
__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'
__copyright__ = 'Modified 2011, Mariusz Wolek <mariusz_dot_wolek @ gmail dot com>'
__copyright__ = 'Modified 2012, Artur Stachecki <artur.stachecki@gmail.com>'
from calibre.web.feeds.news import BasicNewsRecipe
import re
@ -11,7 +13,7 @@ class Wprost(BasicNewsRecipe):
EDITION = 0
FIND_LAST_FULL_ISSUE = True
EXCLUDE_LOCKED = True
ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif'
ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png'
title = u'Wprost'
__author__ = 'matek09'
@ -20,6 +22,7 @@ class Wprost(BasicNewsRecipe):
no_stylesheets = True
language = 'pl'
remove_javascript = True
recursions = 0
remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
@ -35,13 +38,15 @@ class Wprost(BasicNewsRecipe):
(re.compile(r'\<td\>\<tr\>\<\/table\>'), lambda match: ''),
(re.compile(r'\<table .*?\>'), lambda match: ''),
(re.compile(r'\<tr>'), lambda match: ''),
(re.compile(r'\<td .*?\>'), lambda match: '')]
(re.compile(r'\<td .*?\>'), lambda match: ''),
(re.compile(r'\<div id="footer"\>.*?\</footer\>'), lambda match: '')]
remove_tags =[]
remove_tags.append(dict(name = 'div', attrs = {'class' : 'def element-date'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'def silver'}))
remove_tags.append(dict(name = 'div', attrs = {'id' : 'content-main-column-right'}))
extra_css = '''
.div-header {font-size: x-small; font-weight: bold}
'''
@ -59,27 +64,26 @@ class Wprost(BasicNewsRecipe):
a = 0
if self.FIND_LAST_FULL_ISSUE:
ico_blocked = soup.findAll('img', attrs={'src' : self.ICO_BLOCKED})
a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile(r'Spis *', re.IGNORECASE | re.DOTALL)})
else:
a = soup.find('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
a = soup.find('a', attrs={'title' : re.compile(r'Spis *', re.IGNORECASE | re.DOTALL)})
self.EDITION = a['href'].replace('/tygodnik/?I=', '')
self.cover_url = a.img['src']
self.EDITION_SHORT = a['href'].replace('/tygodnik/?I=15', '')
self.cover_url = a.img['src']
def parse_index(self):
self.find_last_issue()
soup = self.index_to_soup('http://www.wprost.pl/tygodnik/?I=' + self.EDITION)
feeds = []
for main_block in soup.findAll(attrs={'class':'main-block-s3 s3-head head-red3'}):
for main_block in soup.findAll(attrs={'id': 'content-main-column-element-content'}):
articles = list(self.find_articles(main_block))
if len(articles) > 0:
section = self.tag_to_string(main_block)
section = self.tag_to_string(main_block.find('h3'))
feeds.append((section, articles))
return feeds
def find_articles(self, main_block):
for a in main_block.findAllNext( attrs={'style':['','padding-top: 15px;']}):
for a in main_block.findAll('a'):
if a.name in "td":
break
if self.EXCLUDE_LOCKED & self.is_blocked(a):
@ -91,3 +95,4 @@ class Wprost(BasicNewsRecipe):
'description' : ''
}