Update various Polish news sources

This commit is contained in:
Kovid Goyal 2013-03-22 08:18:54 +05:30
commit 6ce4e61d4f
13 changed files with 53 additions and 35 deletions

View File

@ -10,15 +10,15 @@ class Adventure_zone(BasicNewsRecipe):
oldest_article = 20 oldest_article = 20
max_articles_per_feed = 100 max_articles_per_feed = 100
cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png' cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png'
index='http://www.adventure-zone.info/fusion/' index = 'http://www.adventure-zone.info/fusion/'
use_embedded_content = False use_embedded_content = False
preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: ''), preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: ''),
(re.compile(r'</?table.*?>'), lambda match: ''), (re.compile(r'</?table.*?>'), lambda match: ''),
(re.compile(r'</?tbody.*?>'), lambda match: '')] (re.compile(r'</?tbody.*?>'), lambda match: '')]
remove_tags_before= dict(name='td', attrs={'class':'main-bg'}) remove_tags_before = dict(name='td', attrs={'class':'main-bg'})
remove_tags= [dict(name='img', attrs={'alt':'Drukuj'})] remove_tags = [dict(name='img', attrs={'alt':'Drukuj'})]
remove_tags_after= dict(id='comments') remove_tags_after = dict(id='comments')
extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }' extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; } img.news-category {float: left; margin-right: 5px;}'
feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')] feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]
'''def get_cover_url(self): '''def get_cover_url(self):
@ -67,4 +67,3 @@ class Adventure_zone(BasicNewsRecipe):
a['href']=self.index + a['href'] a['href']=self.index + a['href']
return soup return soup

View File

@ -18,3 +18,10 @@ class Astroflesz(BasicNewsRecipe):
remove_tags_after = dict(name='div', attrs={'class':'itemLinks'}) remove_tags_after = dict(name='div', attrs={'class':'itemLinks'})
remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})] remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})]
feeds = [(u'Wszystkie', u'http://astroflesz.pl/?format=feed')] feeds = [(u'Wszystkie', u'http://astroflesz.pl/?format=feed')]
def postprocess_html(self, soup, first_fetch):
t = soup.find(attrs={'class':'itemIntroText'})
if t:
for i in t.findAll('img'):
i['style'] = 'float: left; margin-right: 5px;'
return soup

View File

@ -11,7 +11,8 @@ class Ciekawostki_Historyczne(BasicNewsRecipe):
masthead_url = 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' masthead_url = 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
cover_url = 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' cover_url = 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
max_articles_per_feed = 100 max_articles_per_feed = 100
oldest_article = 140000 extra_css = 'img.alignleft {float:left; margin-right:5px;} .alignright {float:right; margin-left:5px;}'
oldest_article = 12
preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?</fb:like>', re.DOTALL), lambda match: ''), (re.compile(ur'<h2>Zobacz też:</h2>.*?</ol>', re.DOTALL), lambda match: '')] preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?</fb:like>', re.DOTALL), lambda match: ''), (re.compile(ur'<h2>Zobacz też:</h2>.*?</ol>', re.DOTALL), lambda match: '')]
no_stylesheets = True no_stylesheets = True
remove_empty_feeds = True remove_empty_feeds = True

View File

@ -11,6 +11,7 @@ class CoNowegoPl(BasicNewsRecipe):
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
INDEX = 'http://www.conowego.pl/' INDEX = 'http://www.conowego.pl/'
extra_css = '.news-single-img {float:left; margin-right:5px;}'
no_stylesheets = True no_stylesheets = True
remove_empty_feeds = True remove_empty_feeds = True
use_embedded_content = False use_embedded_content = False

View File

@ -12,11 +12,13 @@ class CzasGentlemanow(BasicNewsRecipe):
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
extra_css = '.gallery-item {float:left; margin-right: 10px; max-width: 20%;} .alignright {text-align: right; float:right; margin-left:5px;}\
.wp-caption-text {text-align: left;} img.aligncenter {display: block; margin-left: auto; margin-right: auto;} .alignleft {float: left; margin-right:5px;}'
no_stylesheets = True no_stylesheets = True
remove_empty_feeds = True remove_empty_feeds = True
preprocess_regexps = [(re.compile(u'<h3>Może Cię też zainteresować:</h3>'), lambda m: '')] preprocess_regexps = [(re.compile(u'<h3>Może Cię też zainteresować:</h3>'), lambda m: '')]
use_embedded_content = False use_embedded_content = False
keep_only_tags = [dict(name='div', attrs={'class':'content'})] keep_only_tags = [dict(name='div', attrs={'class':'content'})]
remove_tags = [dict(attrs={'class':'meta_comments'}), dict(id=['comments', 'related_posts_thumbnails'])] remove_tags = [dict(attrs={'class':'meta_comments'}), dict(id=['comments', 'related_posts_thumbnails', 'respond'])]
remove_tags_after = dict(id='comments') remove_tags_after = dict(id='comments')
feeds = [(u'M\u0119ski \u015awiat', u'http://czasgentlemanow.pl/category/meski-swiat/feed/'), (u'Styl', u'http://czasgentlemanow.pl/category/styl/feed/'), (u'Vademecum Gentlemana', u'http://czasgentlemanow.pl/category/vademecum/feed/'), (u'Dom i rodzina', u'http://czasgentlemanow.pl/category/dom-i-rodzina/feed/'), (u'Honor', u'http://czasgentlemanow.pl/category/honor/feed/'), (u'Gad\u017cety Gentlemana', u'http://czasgentlemanow.pl/category/gadzety-gentlemana/feed/')] feeds = [(u'M\u0119ski \u015awiat', u'http://czasgentlemanow.pl/category/meski-swiat/feed/'), (u'Styl', u'http://czasgentlemanow.pl/category/styl/feed/'), (u'Vademecum Gentlemana', u'http://czasgentlemanow.pl/category/vademecum/feed/'), (u'Dom i rodzina', u'http://czasgentlemanow.pl/category/dom-i-rodzina/feed/'), (u'Honor', u'http://czasgentlemanow.pl/category/honor/feed/'), (u'Gad\u017cety Gentlemana', u'http://czasgentlemanow.pl/category/gadzety-gentlemana/feed/')]

View File

@ -16,6 +16,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
extra_css = '.title {font-size:22px;}' extra_css = '.title {font-size:22px;}'
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_attrs = ['style', 'width', 'height']
preprocess_regexps = [(re.compile(ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ] preprocess_regexps = [(re.compile(ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ]
keep_only_tags=[dict(attrs={'class':['news', 'entry single']})] keep_only_tags=[dict(attrs={'class':['news', 'entry single']})]
remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']}), dict(id='komentarze'), dict(name='iframe')] remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']}), dict(id='komentarze'), dict(name='iframe')]
@ -28,4 +29,11 @@ class Dobreprogramy_pl(BasicNewsRecipe):
for a in soup('a'): for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href'] a['href']=self.index + a['href']
for r in soup.findAll('iframe'):
r.parent.extract()
return soup
def postprocess_html(self, soup, first_fetch):
for r in soup.findAll('span', text=''):
if not r.string:
r.extract()
return soup return soup

View File

@ -9,6 +9,7 @@ class Dzieje(BasicNewsRecipe):
category = 'history' category = 'history'
language = 'pl' language = 'pl'
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
extra_css = '.imagecache-default {float:left; margin-right:20px;}'
index = 'http://dzieje.pl' index = 'http://dzieje.pl'
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100

View File

@ -9,7 +9,7 @@ class EkologiaPl(BasicNewsRecipe):
language = 'pl' language = 'pl'
cover_url = 'http://www.ekologia.pl/assets/images/logo/ekologia_pl_223x69.png' cover_url = 'http://www.ekologia.pl/assets/images/logo/ekologia_pl_223x69.png'
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
extra_css = '.title {font-size: 200%;}' extra_css = '.title {font-size: 200%;} .imagePowiazane, .imgCon {float:left; margin-right:5px;}'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True

View File

@ -7,6 +7,7 @@ class FilmOrgPl(BasicNewsRecipe):
description = u"Recenzje, analizy, artykuły, rankingi - wszystko o filmie dla miłośników kina. Opisy efektów specjalnych, wersji reżyserskich, remake'ów, sequeli. No i forum filmowe. Jedne z największych w Polsce." description = u"Recenzje, analizy, artykuły, rankingi - wszystko o filmie dla miłośników kina. Opisy efektów specjalnych, wersji reżyserskich, remake'ów, sequeli. No i forum filmowe. Jedne z największych w Polsce."
category = 'film' category = 'film'
language = 'pl' language = 'pl'
extra_css = '.alignright {float:right; margin-left:5px;} .alignleft {float:left; margin-right:5px;}'
cover_url = 'http://film.org.pl/wp-content/themes/KMF/images/logo_kmf10.png' cover_url = 'http://film.org.pl/wp-content/themes/KMF/images/logo_kmf10.png'
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
oldest_article = 7 oldest_article = 7

View File

@ -10,7 +10,6 @@ class FilmWebPl(BasicNewsRecipe):
category = 'movies' category = 'movies'
language = 'pl' language = 'pl'
index = 'http://www.filmweb.pl' index = 'http://www.filmweb.pl'
#extra_css = '.MarkupPhotoHTML-7 {float:left; margin-right: 10px;}'
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
@ -19,9 +18,9 @@ class FilmWebPl(BasicNewsRecipe):
remove_javascript = True remove_javascript = True
preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), (re.compile(ur'(<br ?/?>\s*?<br ?/?>\s*?)+', re.IGNORECASE), lambda m: '<br />')]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')] preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), (re.compile(ur'(<br ?/?>\s*?<br ?/?>\s*?)+', re.IGNORECASE), lambda m: '<br />')]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')]
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
remove_tags = [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})] #remove_tags = [dict()]
remove_attributes = ['style',] remove_attributes = ['style',]
keep_only_tags = [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})] keep_only_tags = [dict(attrs={'class':['hdr hdr-super', 'newsContent']})]
feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'), feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
(u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'), (u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'),
(u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'), (u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'),
@ -59,11 +58,6 @@ class FilmWebPl(BasicNewsRecipe):
for i in soup.findAll('sup'): for i in soup.findAll('sup'):
if not i.string or i.string.startswith('(kliknij'): if not i.string or i.string.startswith('(kliknij'):
i.extract() i.extract()
tag = soup.find(name='ul', attrs={'class':'inline sep-line'})
if tag:
tag.name = 'div'
for t in tag.findAll('li'):
t.name = 'div'
for r in soup.findAll(id=re.compile('photo-\d+')): for r in soup.findAll(id=re.compile('photo-\d+')):
r.extract() r.extract()
for r in soup.findAll(style=re.compile('float: ?left')): for r in soup.findAll(style=re.compile('float: ?left')):

View File

@ -9,6 +9,7 @@ class Niebezpiecznik_pl(BasicNewsRecipe):
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
remove_empty_feeds = True
cover_url = u'http://userlogos.org/files/logos/Karmody/niebezpiecznik_01.png' cover_url = u'http://userlogos.org/files/logos/Karmody/niebezpiecznik_01.png'
remove_tags = [dict(name='div', attrs={'class':['sociable']}), dict(name='h4'), dict(attrs={'class':'similar-posts'})] remove_tags = [dict(name='div', attrs={'class':['sociable']}), dict(name='h4'), dict(attrs={'class':'similar-posts'})]
keep_only_tags = [dict(name='div', attrs={'class':['title', 'entry']})] keep_only_tags = [dict(name='div', attrs={'class':['title', 'entry']})]

View File

@ -8,6 +8,7 @@ class WirtualneMedia(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
remove_empty_feeds = True remove_empty_feeds = True
__author__ = 'fenuks' __author__ = 'fenuks'
extra_css = '.thumbnail {float:left; max-width:150px; margin-right:5px;}'
description = u'Portal o mediach, reklamie, internecie, PR, telekomunikacji - nr 1 w Polsce - WirtualneMedia.pl - wiadomości z pierwszej ręki.' description = u'Portal o mediach, reklamie, internecie, PR, telekomunikacji - nr 1 w Polsce - WirtualneMedia.pl - wiadomości z pierwszej ręki.'
category = 'internet' category = 'internet'
language = 'pl' language = 'pl'

View File

@ -1,5 +1,6 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class ZTS(BasicNewsRecipe): class ZTS(BasicNewsRecipe):
title = u'Zaufana Trzecia Strona' title = u'Zaufana Trzecia Strona'
__author__ = 'fenuks' __author__ = 'fenuks'
@ -7,6 +8,7 @@ class ZTS(BasicNewsRecipe):
category = 'IT, security' category = 'IT, security'
language = 'pl' language = 'pl'
cover_url = 'http://www.zaufanatrzeciastrona.pl/wp-content/uploads/2012/08/z3s_h100.png' cover_url = 'http://www.zaufanatrzeciastrona.pl/wp-content/uploads/2012/08/z3s_h100.png'
extra_css = '.thumbnail {float: left; margin-right:5px;}'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True