Updated various Polish recipes

This commit is contained in:
Kovid Goyal 2012-03-21 08:52:11 +05:30
parent 2b60b652fa
commit 632ae65855
12 changed files with 103 additions and 85 deletions

View File

@ -6,6 +6,7 @@ class Android_com_pl(BasicNewsRecipe):
description = 'Android.com.pl - biggest polish Android site' description = 'Android.com.pl - biggest polish Android site'
category = 'Android, mobile' category = 'Android, mobile'
language = 'pl' language = 'pl'
use_embedded_content=True
cover_url =u'http://upload.wikimedia.org/wikipedia/commons/thumb/d/d7/Android_robot.svg/220px-Android_robot.svg.png' cover_url =u'http://upload.wikimedia.org/wikipedia/commons/thumb/d/d7/Android_robot.svg/220px-Android_robot.svg.png'
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100

View File

@ -1,4 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class CGM(BasicNewsRecipe): class CGM(BasicNewsRecipe):
title = u'CGM' title = u'CGM'
@ -17,9 +18,9 @@ class CGM(BasicNewsRecipe):
remove_tags_before=dict(id='mainContent') remove_tags_before=dict(id='mainContent')
remove_tags_after=dict(name='div', attrs={'class':'fbContainer'}) remove_tags_after=dict(name='div', attrs={'class':'fbContainer'})
remove_tags=[dict(name='div', attrs={'class':'fbContainer'}), remove_tags=[dict(name='div', attrs={'class':'fbContainer'}),
dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}), dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}),
dict(id=['movieShare', 'container'])] dict(id=['movieShare', 'container'])]
feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'), (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'), feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'), (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'),
(u'Recenzje', u'http://www.cgm.pl/rss,1,news.xml')] (u'Recenzje', u'http://www.cgm.pl/rss,1,news.xml')]
@ -33,10 +34,12 @@ class CGM(BasicNewsRecipe):
img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')] img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')]
gallery.contents[1].name='img' gallery.contents[1].name='img'
gallery.contents[1]['src']=img gallery.contents[1]['src']=img
pos = len(gallery.contents)
gallery.insert(pos, BeautifulSoup('<br />'))
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
ad=soup.findAll('a') ad=soup.findAll('a')
for r in ad: for r in ad:
if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']: if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']:
r.extract() r.extract()
return soup return soup

View File

@ -1,4 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Elektroda(BasicNewsRecipe): class Elektroda(BasicNewsRecipe):
title = u'Elektroda' title = u'Elektroda'
@ -13,3 +14,18 @@ class Elektroda(BasicNewsRecipe):
remove_tags_after=dict(name='td', attrs={'class':'spaceRow'}) remove_tags_after=dict(name='td', attrs={'class':'spaceRow'})
remove_tags=[dict(name='a', attrs={'href':'#top'})] remove_tags=[dict(name='a', attrs={'href':'#top'})]
feeds = [(u'Elektroda', u'http://www.elektroda.pl/rtvforum/rss.php')] feeds = [(u'Elektroda', u'http://www.elektroda.pl/rtvforum/rss.php')]
def preprocess_html(self, soup):
    """Append a ``<br />`` to the end of the first ``span.postbody`` element.

    The soup is returned unchanged when no such span is found.
    """
    post_body = soup.find('span', attrs={'class':'postbody'})
    if not post_body:
        return soup
    # Insert at index len(contents), i.e. after the last existing child.
    post_body.insert(len(post_body.contents), BeautifulSoup('<br />'))
    return soup
def parse_feeds(self):
    """Parse the feeds, then trim the leading ``... :: `` part of each title.

    Everything up to and including the first ``"::"`` plus the one character
    that follows it is removed from each article title. Titles that do not
    contain ``"::"`` are left untouched.

    Returns the list of feeds produced by ``BasicNewsRecipe.parse_feeds``.
    """
    feeds = BasicNewsRecipe.parse_feeds(self)
    for feed in feeds:
        for article in feed.articles:
            marker = article.title.find("::")
            # find() returns -1 when the marker is absent; slicing at
            # -1 + 3 == 2 would silently drop the first two characters.
            if marker != -1:
                article.title = article.title[marker + 3:]
    return feeds

View File

@ -13,7 +13,7 @@ class Filmweb_pl(BasicNewsRecipe):
remove_empty_feeds=True remove_empty_feeds=True
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})] remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})]
keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})] keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'), feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'),
(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'), (u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
(u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'), (u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'),

View File

@ -9,12 +9,12 @@ class Gram_pl(BasicNewsRecipe):
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets= True no_stylesheets= True
extra_css = 'h2 {font-style: italic; font-size:20px;}' extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png' cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])] remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])]
keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})] keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})]
feeds = [(u'gram.pl - informacje', u'http://www.gram.pl/feed_news.asp'), feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
(u'gram.pl - publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')] (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')]
def parse_feeds (self): def parse_feeds (self):
feeds = BasicNewsRecipe.parse_feeds(self) feeds = BasicNewsRecipe.parse_feeds(self)
@ -23,3 +23,33 @@ class Gram_pl(BasicNewsRecipe):
if 'REKLAMA SKLEP' in article.title.upper() or u'ARTYKUŁ:' in article.title.upper(): if 'REKLAMA SKLEP' in article.title.upper() or u'ARTYKUŁ:' in article.title.upper():
feed.articles.remove(article) feed.articles.remove(article)
return feeds return feeds
def append_page(self, soup, appendtag):
    """Follow the ``a.cpn`` "next page" links and append each page to *appendtag*.

    For every link found, the target page is fetched from www.gram.pl, its
    element with class ``main`` is stripped of its first ``h1``, first ``h2``
    and all ``script`` tags, and the result is appended to *appendtag*.
    Any ``id='pgbox'`` element in *appendtag* is removed along the way.

    soup      -- the article soup (not used directly; kept for the interface)
    appendtag -- the tag that receives the content of the following pages
    """
    visited = set()
    nexturl = appendtag.find('a', attrs={'class':'cpn'})
    while nexturl:
        href = nexturl.get('href')
        # Stop on a link without a target or one already followed; otherwise
        # a link that survives the pgbox removal would loop forever.
        if not href or href in visited:
            break
        visited.add(href)
        soup2 = self.index_to_soup('http://www.gram.pl'+ href)
        r = appendtag.find(id='pgbox')
        if r:
            r.extract()
        pagetext = soup2.find(attrs={'class':'main'})
        if pagetext is None:
            # The fetched page has no 'main' element: nothing to append.
            break
        # Drop the repeated heading and sub-heading of the follow-up page.
        for heading in ('h1', 'h2'):
            r = pagetext.find(heading)
            if r:
                r.extract()
        for r in pagetext.findAll('script'):
            r.extract()
        pos = len(appendtag.contents)
        appendtag.insert(pos, pagetext)
        # The appended page may carry its own "next page" link.
        nexturl = appendtag.find('a', attrs={'class':'cpn'})
    r = appendtag.find(id='pgbox')
    if r:
        r.extract()
def preprocess_html(self, soup):
    """Append the follow-up pages to the body and float every ``div.picbox`` left.

    Returns the same soup object after modification.
    """
    self.append_page(soup, soup.body)
    for picbox in soup.findAll(name='div', attrs={'class':'picbox'}):
        picbox['style'] = 'float: left;'
    return soup

View File

@ -7,12 +7,12 @@ class naczytniki(BasicNewsRecipe):
cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png' cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png'
language = 'pl' language = 'pl'
description ='everything about e-readers' description ='everything about e-readers'
category='readers' category='e-readers'
no_stylesheets=True no_stylesheets=True
use_embedded_content=False
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'<p><br><b>Zobacz także:</b></p>.*?</body>', re.DOTALL), lambda match: '</body>') ] preprocess_regexps = [(re.compile(ur'<p><br><b>Zobacz także:</b></p>.*?</body>', re.DOTALL), lambda match: '</body>') ]
remove_tags_after= dict(name='div', attrs={'class':'sociable'})
keep_only_tags=[dict(name='div', attrs={'class':'post'})] keep_only_tags=[dict(name='div', attrs={'class':'post'})]
remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})] remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})]
feeds = [(u'Wpisy', u'http://naczytniki.pl/?feed=rss2')] feeds = [(u'Wpisy', u'http://naczytniki.pl/?feed=rss2')]

View File

@ -17,21 +17,8 @@ class Overclock_pl(BasicNewsRecipe):
remove_tags=[dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})] remove_tags=[dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})]
feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'), (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')] feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'), (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')]
def print_version(self, url):
def append_page(self, soup, appendtag): if 'articles/show' in url:
tag=soup.find(id='navigation') return url.replace('show', 'showall')
if tag: else:
nexturl=tag.findAll('option') return url
tag.extract()
for nextpage in nexturl[2:]:
soup2 = self.index_to_soup(nextpage['value'])
pagetext = soup2.find(id='content')
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
rem=appendtag.find(attrs={'alt':'Pierwsza'})
if rem:
rem.parent.extract()
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup

View File

@ -10,5 +10,7 @@ class palmtop_pl(BasicNewsRecipe):
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content=True
#remove_tags_before=dict(name='h2')
#remove_tags_after=dict(attrs={'class':'entry clearfix'})
feeds = [(u'Newsy', u'http://palmtop.pl/feed/atom/')] feeds = [(u'Newsy', u'http://palmtop.pl/feed/atom/')]

View File

@ -1,31 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class PC_Arena(BasicNewsRecipe): class PC_Arena(BasicNewsRecipe):
title = u'PCArena' title = u'PCArena'
oldest_article = 18300 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
__author__ = 'fenuks' __author__ = 'fenuks'
description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.' description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
category = 'IT' category = 'IT'
language = 'pl' language = 'pl'
masthead_url='http://pcarena.pl/public/design/frontend/images/logo.gif' masthead_url='http://pcarena.pl/pcarena/img/logo.png'
cover_url= 'http://pcarena.pl/public/design/frontend/images/logo.gif' cover_url= 'http://pcarena.pl/pcarena/img/logo.png'
no_stylesheets = True no_stylesheets = True
keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})] remove_empty_feeds=True
remove_tags=[dict(attrs={'class':'pages'})] #keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})]
feeds = [(u'Newsy', u'http://pcarena.pl/misc/rss/news'), (u'Artyku\u0142y', u'http://pcarena.pl/misc/rss/articles')] #remove_tags=[dict(attrs={'class':'pages'})]
feeds = [(u'Aktualności', u'http://pcarena.pl/aktualnosci/feeds.rss'), (u'Testy', u'http://pcarena.pl/testy/feeds.rss'), (u'Software', u'http://pcarena.pl/oprogramowanie/feeds.rss'), (u'Poradniki', u'http://pcarena.pl/poradniki/feeds.rss'), (u'Mobile', u'http://pcarena.pl/mobile/feeds.rss')]
def print_version(self, url):
return url.replace('show', 'print')
def append_page(self, soup, appendtag): def image_url_processor(self, baseurl, url):
tag=soup.find(name='div', attrs={'class':'pagNum'}) if 'http' not in url:
if tag: return 'http://pcarena.pl' + url
nexturl=tag.findAll('a') else:
tag.extract() return url
for nextpage in nexturl[1:]:
nextpage= 'http://pcarena.pl' + nextpage['href']
soup2 = self.index_to_soup(nextpage)
pagetext = soup2.find(attrs={'class':'artBody'})
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup

View File

@ -10,32 +10,11 @@ class PC_Centre(BasicNewsRecipe):
masthead_url= 'http://pccentre.pl/views/images/logo.gif' masthead_url= 'http://pccentre.pl/views/images/logo.gif'
cover_url= 'http://pccentre.pl/views/images/logo.gif' cover_url= 'http://pccentre.pl/views/images/logo.gif'
no_stylesheets = True no_stylesheets = True
keep_only_tags= [dict(id='content')] remove_empty_feeds = True
remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')] #keep_only_tags= [dict(id='content')]
feeds = [(u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')] #remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
remove_tags=[dict(attrs={'class':'logo_print'})]
feeds = [(u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')]
def print_version(self, url):
def append_page(self, soup, appendtag): return url.replace('show', 'print')
tag=soup.find(name='div', attrs={'class':'pages'})
if tag:
nexturl=tag.findAll('a')
tag.extract()
for nextpage in nexturl[:-1]:
nextpage= 'http://pccentre.pl' + nextpage['href']
soup2 = self.index_to_soup(nextpage)
pagetext = soup2.find(id='content')
rem=pagetext.findAll(attrs={'class':['subtitle', 'content_info', 'list_of_content', 'pages', 'social2', 'pcc_acc', 'pcc_acc_na']})
for r in rem:
r.extract()
rem=pagetext.findAll(id='comments')
for r in rem:
r.extract()
rem=pagetext.findAll('h1')
for r in rem:
r.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup

View File

@ -8,10 +8,11 @@ class Tablety_pl(BasicNewsRecipe):
cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
category = 'IT' category = 'IT'
language = 'pl' language = 'pl'
use_embedded_content=True
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')] preprocess_regexps = [(re.compile(ur'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')]
remove_tags_before=dict(name="h1", attrs={'class':'entry-title'}) #remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'}) #remove_tags_after=dict(name="footer", attrs={'class':'entry-footer clearfix'})
remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})] #remove_tags=[dict(name='footer', attrs={'class':'entry-footer clearfix'}), dict(name='div', attrs={'class':'entry-comment-counter'})]
feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')] feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]

View File

@ -1,5 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
import re
class AdvancedUserRecipe1312886443(BasicNewsRecipe): class AdvancedUserRecipe1312886443(BasicNewsRecipe):
title = u'WNP' title = u'WNP'
@ -8,10 +8,11 @@ class AdvancedUserRecipe1312886443(BasicNewsRecipe):
description = u'Wirtualny Nowy Przemysł' description = u'Wirtualny Nowy Przemysł'
category = 'economy' category = 'economy'
language = 'pl' language = 'pl'
preprocess_regexps = [(re.compile(ur'Czytaj też:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Czytaj więcej:.*?</a>', re.DOTALL), lambda match: '')]
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets= True no_stylesheets= True
keep_only_tags = dict(name='div', attrs={'id':'contentText'}) remove_tags=[dict(attrs={'class':'printF'})]
feeds = [(u'Wiadomości gospodarcze', u'http://www.wnp.pl/rss/serwis_rss.xml'), feeds = [(u'Wiadomości gospodarcze', u'http://www.wnp.pl/rss/serwis_rss.xml'),
(u'Serwis Energetyka - Gaz', u'http://www.wnp.pl/rss/serwis_rss_1.xml'), (u'Serwis Energetyka - Gaz', u'http://www.wnp.pl/rss/serwis_rss_1.xml'),
(u'Serwis Nafta - Chemia', u'http://www.wnp.pl/rss/serwis_rss_2.xml'), (u'Serwis Nafta - Chemia', u'http://www.wnp.pl/rss/serwis_rss_2.xml'),
@ -19,3 +20,7 @@ class AdvancedUserRecipe1312886443(BasicNewsRecipe):
(u'Serwis Górnictwo', u'http://www.wnp.pl/rss/serwis_rss_4.xml'), (u'Serwis Górnictwo', u'http://www.wnp.pl/rss/serwis_rss_4.xml'),
(u'Serwis Logistyka', u'http://www.wnp.pl/rss/serwis_rss_5.xml'), (u'Serwis Logistyka', u'http://www.wnp.pl/rss/serwis_rss_5.xml'),
(u'Serwis IT', u'http://www.wnp.pl/rss/serwis_rss_6.xml')] (u'Serwis IT', u'http://www.wnp.pl/rss/serwis_rss_6.xml')]
def print_version(self, url):
    """Return the print-view URL for an article URL.

    The part of *url* after its first comma is appended to
    ``http://wnp.pl/drukuj/``. A URL without a comma is returned unchanged,
    because there is nothing to build a print address from.
    """
    comma = url.find(',')
    if comma == -1:
        # Without this guard the whole URL would be glued onto the prefix.
        return url
    return 'http://wnp.pl/drukuj/' + url[comma + 1:]