Various updated Polish recipes

This commit is contained in:
Kovid Goyal 2012-07-27 01:09:49 +05:30
parent 8822ef28f9
commit 3afc065c2a
5 changed files with 30 additions and 21 deletions

View File

@ -1,6 +1,6 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Benchmark_pl(BasicNewsRecipe):
class BenchmarkPl(BasicNewsRecipe):
title = u'Benchmark.pl'
__author__ = 'fenuks'
description = u'benchmark.pl -IT site'
@ -14,7 +14,7 @@ class Benchmark_pl(BasicNewsRecipe):
preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')]
keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})]
remove_tags_after=dict(name='div', attrs={'class':'body'})
remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
INDEX= 'http://www.benchmark.pl'
feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
(u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')]

View File

@ -1,6 +1,7 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Filmweb_pl(BasicNewsRecipe):
import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class FilmWebPl(BasicNewsRecipe):
title = u'FilmWeb'
__author__ = 'fenuks'
description = 'FilmWeb - biggest polish movie site'
@ -12,8 +13,9 @@ class Filmweb_pl(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets= True
remove_empty_feeds=True
preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')]
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})]
remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})]
keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'),
(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
@ -31,18 +33,22 @@ class Filmweb_pl(BasicNewsRecipe):
(u'News / Kino polskie', u'http://www.filmweb.pl/feed/news/category/polish.cinema'),
(u'News / Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'),
(u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'),
(u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest')]
(u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest')
]
def skip_ad_pages(self, soup):
def skip_ad_pages(self, soup):
skip_tag = soup.find('a', attrs={'class':'welcomeScreenButton'})
if skip_tag is not None:
self.log.warn('skip_tag')
self.log.warn(skip_tag)
return self.index_to_soup(skip_tag['href'], raw=True)
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup
for i in soup.findAll('a', attrs={'class':'fn'}):
i.insert(len(i), BeautifulSoup('<br />'))
for i in soup.findAll('sup'):
if not i.string or i.string.startswith('(kliknij'):
i.extract()
return soup

View File

@ -1,6 +1,6 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
class Gry_online_pl(BasicNewsRecipe):
class GryOnlinePl(BasicNewsRecipe):
title = u'Gry-Online.pl'
__author__ = 'fenuks'
description = 'Gry-Online.pl - computer games'
@ -21,17 +21,18 @@ class Gry_online_pl(BasicNewsRecipe):
tag = appendtag.find('div', attrs={'class':'n5p'})
if tag:
nexturls=tag.findAll('a')
for nexturl in nexturls[1:]:
try:
soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href'])
except:
soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href'])
url_part = soup.find('link', attrs={'rel':'canonical'})['href']
url_part = url_part[25:].rpartition('?')[0]
for nexturl in nexturls[1:-1]:
soup2 = self.index_to_soup('http://www.gry-online.pl/' + url_part + nexturl['href'])
pagetext = soup2.find(attrs={'class':'gc660'})
for r in pagetext.findAll(name='header'):
r.extract()
for r in pagetext.findAll(attrs={'itemprop':'description'}):
r.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}):
for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}):
r.extract()

View File

@ -1,3 +1,4 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class NaTemat(BasicNewsRecipe):
@ -8,8 +9,9 @@ class NaTemat(BasicNewsRecipe):
description = u'informacje, komentarze, opinie'
category = 'news'
language = 'pl'
preprocess_regexps = [(re.compile(ur'Czytaj też\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(ur'Zobacz też\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj więcej\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj również\:.*?</a>', re.IGNORECASE), lambda m: '')]
cover_url= 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png'
no_stylesheets = True
keep_only_tags= [dict(id='main')]
remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related']})]
remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related', 'user-header', 'links']}), dict(name='img', attrs={'class':'indent'})]
feeds = [(u'Artyku\u0142y', u'http://natemat.pl/rss/wszystkie')]

View File

@ -1,7 +1,7 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class AdvancedUserRecipe1312886443(BasicNewsRecipe):
class WNP(BasicNewsRecipe):
title = u'WNP'
cover_url= 'http://k.wnp.pl/images/wnpLogo.gif'
__author__ = 'fenuks'
@ -12,7 +12,7 @@ class AdvancedUserRecipe1312886443(BasicNewsRecipe):
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
remove_tags=[dict(attrs={'class':'printF'})]
remove_tags=[dict(attrs={'class':['printF', 'border3B2 clearfix', 'articleMenu clearfix']})]
feeds = [(u'Wiadomości gospodarcze', u'http://www.wnp.pl/rss/serwis_rss.xml'),
(u'Serwis Energetyka - Gaz', u'http://www.wnp.pl/rss/serwis_rss_1.xml'),
(u'Serwis Nafta - Chemia', u'http://www.wnp.pl/rss/serwis_rss_2.xml'),