Update various Polish recipes

This commit is contained in:
Kovid Goyal 2012-10-17 19:42:08 +05:30
parent 83f732cced
commit 2767403b94
18 changed files with 70 additions and 41 deletions

View File

@ -3,7 +3,7 @@ import re
class Adventure_zone(BasicNewsRecipe): class Adventure_zone(BasicNewsRecipe):
title = u'Adventure Zone' title = u'Adventure Zone'
__author__ = 'fenuks' __author__ = 'fenuks'
description = 'Adventure zone - adventure games from A to Z' description = u'Adventure zone - adventure games from A to Z'
category = 'games' category = 'games'
language = 'pl' language = 'pl'
no_stylesheets = True no_stylesheets = True
@ -11,7 +11,9 @@ class Adventure_zone(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
index='http://www.adventure-zone.info/fusion/' index='http://www.adventure-zone.info/fusion/'
use_embedded_content=False use_embedded_content=False
preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: '')] preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: ''),
(re.compile(r'\<table .*?\>'), lambda match: ''),
(re.compile(r'\<tbody\>'), lambda match: '')]
remove_tags_before= dict(name='td', attrs={'class':'main-bg'}) remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
remove_tags= [dict(name='img', attrs={'alt':'Drukuj'})] remove_tags= [dict(name='img', attrs={'alt':'Drukuj'})]
remove_tags_after= dict(id='comments') remove_tags_after= dict(id='comments')
@ -52,6 +54,11 @@ class Adventure_zone(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
footer=soup.find(attrs={'class':'news-footer middle-border'}) footer=soup.find(attrs={'class':'news-footer middle-border'})
r = soup.find(name='td', attrs={'class':'capmain'})
if r:
r.name='h1'
for item in soup.findAll(name=['tr', 'td']):
item.name='div'
if footer and len(footer('a'))>=2: if footer and len(footer('a'))>=2:
footer('a')[1].extract() footer('a')[1].extract()
for item in soup.findAll(style=True): for item in soup.findAll(style=True):

View File

@ -12,9 +12,9 @@ class BenchmarkPl(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets=True no_stylesheets=True
preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')] preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')]
keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})] keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']}), dict(id='article')]
remove_tags_after=dict(name='div', attrs={'class':'body'}) remove_tags_after=dict(name='div', attrs={'class':'body'})
remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb', 'footer', 'moreTopics']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
INDEX= 'http://www.benchmark.pl' INDEX= 'http://www.benchmark.pl'
feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
(u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')] (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')]

View File

@ -13,6 +13,7 @@ class Biolog_pl(BasicNewsRecipe):
masthead_url= 'http://www.biolog.pl/naukowy,portal,biolog.png' masthead_url= 'http://www.biolog.pl/naukowy,portal,biolog.png'
cover_url='http://www.biolog.pl/naukowy,portal,biolog.png' cover_url='http://www.biolog.pl/naukowy,portal,biolog.png'
no_stylesheets = True no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'}
#keeps_only_tags=[dict(id='main')] #keeps_only_tags=[dict(id='main')]
remove_tags_before=dict(id='main') remove_tags_before=dict(id='main')
remove_tags_after=dict(name='a', attrs={'name':'komentarze'}) remove_tags_after=dict(name='a', attrs={'name':'komentarze'})

View File

@ -13,11 +13,11 @@ class CGM(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
remove_empty_feeds= True remove_empty_feeds= True
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheers=True no_stylesheets = True
extra_css = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;} h2 {color:black;}' extra_css = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;} h2 {color:black;}'
remove_tags_before=dict(id='mainContent') remove_tags_before=dict(id='mainContent')
remove_tags_after=dict(name='div', attrs={'class':'fbContainer'}) remove_tags_after=dict(name='div', attrs={'class':'fbContainer'})
remove_tags=[dict(name='div', attrs={'class':'fbContainer'}), remove_tags=[dict(name='div', attrs={'class':['fbContainer', 'socials']}),
dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}), dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}),
dict(id=['movieShare', 'container'])] dict(id=['movieShare', 'container'])]
feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'), (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'), feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'), (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'),

View File

@ -19,7 +19,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ] preprocess_regexps = [(re.compile(ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ]
keep_only_tags=[dict(attrs={'class':['news', 'entry single']})] keep_only_tags=[dict(attrs={'class':['news', 'entry single']})]
remove_tags = [dict(name='div', attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']})] remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']}), dict(id='komentarze')]
#remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] #remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'), feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')] ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]

View File

@ -12,9 +12,8 @@ class Dzieje(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_javascript=True remove_javascript=True
no_stylesheets= True no_stylesheets= True
remove_tags_before= dict(name='h1', attrs={'class':'title'}) keep_only_tags = [dict(name='h1', attrs={'class':'title'}), dict(id='content-area')]
remove_tags_after= dict(id='dogory') remove_tags = [dict(attrs={'class':'field field-type-computed field-field-tagi'}), dict(id='dogory')]
remove_tags=[dict(id='dogory')]
feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')] feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]

View File

@ -15,6 +15,7 @@ class Dziennik_pl(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_javascript=True remove_javascript=True
remove_empty_feeds=True remove_empty_feeds=True
ignore_duplicate_articles = {'title', 'url'}
extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}' extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('<p><strong><a href=".*?">&gt;&gt;&gt; CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')] preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('<p><strong><a href=".*?">&gt;&gt;&gt; CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')]
keep_only_tags=[dict(id='article')] keep_only_tags=[dict(id='article')]
@ -59,8 +60,6 @@ class Dziennik_pl(BasicNewsRecipe):
appendtag.find('div', attrs={'class':'article_paginator'}).extract() appendtag.find('div', attrs={'class':'article_paginator'}).extract()
def preprocess_html(self, soup): def preprocess_html(self, soup):
self.append_page(soup, soup.body) self.append_page(soup, soup.body)
return soup return soup

View File

@ -13,12 +13,12 @@ class FilmWebPl(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets= True no_stylesheets= True
remove_empty_feeds=True remove_empty_feeds=True
ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')] preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')]
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})] remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})]
keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})] keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'), feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
(u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'), (u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'),
(u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'), (u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'),
(u'News / Box office', u'http://www.filmweb.pl/feed/news/category/boxoffice'), (u'News / Box office', u'http://www.filmweb.pl/feed/news/category/boxoffice'),
@ -41,7 +41,6 @@ class FilmWebPl(BasicNewsRecipe):
if skip_tag is not None: if skip_tag is not None:
return self.index_to_soup(skip_tag['href'], raw=True) return self.index_to_soup(skip_tag['href'], raw=True)
def preprocess_html(self, soup): def preprocess_html(self, soup):
for a in soup('a'): for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:

View File

@ -15,14 +15,27 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_javascript=True remove_javascript=True
no_stylesheets=True no_stylesheets=True
remove_tags_before=dict(id='k0') ignore_duplicate_articles = {'title', 'url'}
remove_tags_after=dict(id='banP4') keep_only_tags = dict(id=['gazeta_article', 'article'])
remove_tags=[dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})] remove_tags_after = dict(id='gazeta_article_share')
remove_tags = [dict(attrs={'class':['artReadMore', 'gazeta_article_related_new', 'txt_upl']}), dict(id=['gazeta_article_likes', 'gazeta_article_tools', 'rel', 'gazeta_article_tags', 'gazeta_article_share', 'gazeta_article_brand', 'gazeta_article_miniatures'])]
feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'),
(u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'),
(u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'),
(u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'),
(u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'), (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss') (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'),
(u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'),
(u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'),
#(u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'),
(u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'),
(u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'),
(u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'),
(u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'),
(u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'),
(u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'),
(u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'),
(u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss')
] ]
def skip_ad_pages(self, soup): def skip_ad_pages(self, soup):

View File

@ -11,6 +11,8 @@ class Gildia(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_empty_feeds=True remove_empty_feeds=True
no_stylesheets=True no_stylesheets=True
ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(ur'</?sup>'), lambda match: '') ]
remove_tags=[dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})] remove_tags=[dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
keep_only_tags=dict(name='div', attrs={'class':'widetext'}) keep_only_tags=dict(name='div', attrs={'class':'widetext'})
feeds = [(u'Gry', u'http://www.gry.gildia.pl/rss'), (u'Literatura', u'http://www.literatura.gildia.pl/rss'), (u'Film', u'http://www.film.gildia.pl/rss'), (u'Horror', u'http://www.horror.gildia.pl/rss'), (u'Konwenty', u'http://www.konwenty.gildia.pl/rss'), (u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'), (u'Manga i anime', u'http://www.manga.gildia.pl/rss'), (u'Star Wars', u'http://www.starwars.gildia.pl/rss'), (u'Techno', u'http://www.techno.gildia.pl/rss'), (u'Historia', u'http://www.historia.gildia.pl/rss'), (u'Magia', u'http://www.magia.gildia.pl/rss'), (u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'), (u'RPG', u'http://www.rpg.gildia.pl/rss'), (u'LARP', u'http://www.larp.gildia.pl/rss'), (u'Muzyka', u'http://www.muzyka.gildia.pl/rss'), (u'Nauka', u'http://www.nauka.gildia.pl/rss')] feeds = [(u'Gry', u'http://www.gry.gildia.pl/rss'), (u'Literatura', u'http://www.literatura.gildia.pl/rss'), (u'Film', u'http://www.film.gildia.pl/rss'), (u'Horror', u'http://www.horror.gildia.pl/rss'), (u'Konwenty', u'http://www.konwenty.gildia.pl/rss'), (u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'), (u'Manga i anime', u'http://www.manga.gildia.pl/rss'), (u'Star Wars', u'http://www.starwars.gildia.pl/rss'), (u'Techno', u'http://www.techno.gildia.pl/rss'), (u'Historia', u'http://www.historia.gildia.pl/rss'), (u'Magia', u'http://www.magia.gildia.pl/rss'), (u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'), (u'RPG', u'http://www.rpg.gildia.pl/rss'), (u'LARP', u'http://www.larp.gildia.pl/rss'), (u'Muzyka', u'http://www.muzyka.gildia.pl/rss'), (u'Nauka', u'http://www.nauka.gildia.pl/rss')]
@ -18,10 +20,9 @@ class Gildia(BasicNewsRecipe):
def skip_ad_pages(self, soup): def skip_ad_pages(self, soup):
content = soup.find('div', attrs={'class':'news'}) content = soup.find('div', attrs={'class':'news'})
skip_tag= content.findAll(name='a') if 'recenzj' in soup.title.string.lower():
if skip_tag is not None: for link in content.findAll(name='a'):
for link in skip_tag: if 'recenzj' in link['href']:
if 'recenzja' in link['href']:
self.log.warn('odnosnik') self.log.warn('odnosnik')
self.log.warn(link['href']) self.log.warn(link['href'])
return self.index_to_soup(link['href'], raw=True) return self.index_to_soup(link['href'], raw=True)

View File

@ -12,7 +12,7 @@ class Gram_pl(BasicNewsRecipe):
no_stylesheets= True no_stylesheets= True
extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}' extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png' cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info', 'entry-footer clearfix']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button', 'entry-comment-counter', 'snap_nopreview sharing robots-nocontent']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])] remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info', 'entry-footer clearfix']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button', 'entry-comment-counter', 'snap_nopreview sharing robots-nocontent', 'sharedaddy sd-sharing-enabled']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])]
keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']}), dict(name='article')] keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']}), dict(name='article')]
feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'), feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
(u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles'), (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles'),

View File

@ -12,8 +12,8 @@ class GryOnlinePl(BasicNewsRecipe):
cover_url='http://www.gry-online.pl/im/gry-online-logo.png' cover_url='http://www.gry-online.pl/im/gry-online-logo.png'
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets= True no_stylesheets= True
keep_only_tags=[dict(name='div', attrs={'class':'gc660'})] keep_only_tags=[dict(name='div', attrs={'class':['gc660', 'gc660 S013']})]
remove_tags=[dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})] remove_tags=[dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'S013-npb', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})]
feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')] feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')]

View File

@ -15,7 +15,6 @@ class Konflikty(BasicNewsRecipe):
keep_only_tags=[dict(attrs={'class':['title1', 'image']}), dict(id='body')] keep_only_tags=[dict(attrs={'class':['title1', 'image']}), dict(id='body')]
feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'), feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'),
(u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'),
(u'Historia', u'http://www.konflikty.pl/rss_historia_10.xml'), (u'Historia', u'http://www.konflikty.pl/rss_historia_10.xml'),
(u'Militaria', u'http://www.konflikty.pl/rss_militaria_10.xml'), (u'Militaria', u'http://www.konflikty.pl/rss_militaria_10.xml'),
(u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'), (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'),

View File

@ -7,7 +7,7 @@ class Lomza(BasicNewsRecipe):
cover_url = 'http://www.4lomza.pl/i/logo4lomza_m.jpg' cover_url = 'http://www.4lomza.pl/i/logo4lomza_m.jpg'
language = 'pl' language = 'pl'
oldest_article = 15 oldest_article = 15
no_styleseets=True no_stylesheets = True
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_tags=[dict(name='div', attrs={'class':['bxbanner', 'drukuj', 'wyslijznajomemu']})] remove_tags=[dict(name='div', attrs={'class':['bxbanner', 'drukuj', 'wyslijznajomemu']})]
keep_only_tags=[dict(name='div', attrs={'class':'wiadomosc'})] keep_only_tags=[dict(name='div', attrs={'class':'wiadomosc'})]

View File

@ -1,5 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Polska_times(BasicNewsRecipe): class PolskaTimes(BasicNewsRecipe):
title = u'Polska Times' title = u'Polska Times'
__author__ = 'fenuks' __author__ = 'fenuks'
description = u'Internetowe wydanie dziennika ogólnopolskiego Polska The Times. Najświeższe informacje: wydarzenia w kraju i na świecie, reportaże, poradniki, opinie.' description = u'Internetowe wydanie dziennika ogólnopolskiego Polska The Times. Najświeższe informacje: wydarzenia w kraju i na świecie, reportaże, poradniki, opinie.'
@ -10,6 +10,7 @@ class Polska_times(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_emty_feeds= True remove_emty_feeds= True
no_stylesheets = True no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'}
#preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ] #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}) remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})
remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})] remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})]
@ -23,6 +24,7 @@ class Polska_times(BasicNewsRecipe):
nexturl=soup.find('a')['href'] nexturl=soup.find('a')['href']
return self.index_to_soup(nexturl, raw=True) return self.index_to_soup(nexturl, raw=True)
def get_cover_url(self): def get_cover_url(self):
soup = self.index_to_soup('http://www.prasa24.pl/gazeta/metropolia-warszawska/') soup = self.index_to_soup('http://www.prasa24.pl/gazeta/metropolia-warszawska/')
self.cover_url=soup.find(id='pojemnik').img['src'] self.cover_url=soup.find(id='pojemnik').img['src']

View File

@ -11,5 +11,5 @@ class SpidersWeb(BasicNewsRecipe):
no_stylesheers=True no_stylesheers=True
max_articles_per_feed = 100 max_articles_per_feed = 100
keep_only_tags=[dict(id='Post')] keep_only_tags=[dict(id='Post')]
remove_tags=[dict(name='div', attrs={'class':['Comments', 'Shows', 'Post-Tags']})] remove_tags=[dict(name='div', attrs={'class':['Comments', 'Shows', 'Post-Tags']}), dict(id='Author-Column')]
feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')] feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')]

View File

@ -19,7 +19,7 @@ class Swiat_Obrazu(BasicNewsRecipe):
return url + ',drukuj' return url + ',drukuj'
def image_url_processor(self, baseurl, url): def image_url_processor(self, baseurl, url):
if 'http://' not in url or 'https://' not in url: if 'http://' not in url and 'https://' not in url:
return 'http://www.swiatobrazu.pl' + url[5:] return 'http://www.swiatobrazu.pl' + url[5:]
else: else:
return url return url

View File

@ -7,19 +7,28 @@ class tvn24(BasicNewsRecipe):
description = u'Sport, Biznes, Gospodarka, Informacje, Wiadomości Zawsze aktualne wiadomości z Polski i ze świata' description = u'Sport, Biznes, Gospodarka, Informacje, Wiadomości Zawsze aktualne wiadomości z Polski i ze świata'
category = 'news' category = 'news'
language = 'pl' language = 'pl'
masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif' #masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
cover_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif' cover_url= 'http://www.userlogos.org/files/logos/Struna/TVN24.jpg'
extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}' extra_css = 'ul {list-style:none;} \
li {list-style:none; float: left; margin: 0 0.15em;} \
h2 {font-size: medium} \
.date60m {float: left; margin: 0 10px 0 5px;}'
remove_empty_feeds = True remove_empty_feeds = True
remove_javascript = True remove_javascript = True
no_stylesheets = True no_stylesheets = True
keep_only_tags=[dict(name='h1', attrs={'class':'standardHeader1'}), dict(attrs={'class':['date60m rd5', 'imageBackground fl rd7', 'contentFromCMS']}), dict(attrs={'class':'mainLeftColumn'})] use_embedded_content = False
remove_tags=[dict(attrs={'class':['commentsInfo', 'textSize', 'related newsNews align-right', 'box', 'watchMaterial text']})] ignore_duplicate_articles = {'title', 'url'}
#remove_tags_after= dict(attrs={'class':'articleAuthors mb30 mt5 grey_v6'}) keep_only_tags=[dict(name='h1', attrs={'class':['size30 mt10 pb10', 'size38 mt10 pb15']}), dict(name='figure', attrs={'class':'articleMainPhoto articleMainPhotoWide'}), dict(name='article', attrs={'class':['mb20', 'mb20 textArticleDefault']}), dict(name='ul', attrs={'class':'newsItem'})]
feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'), ] remove_tags = [dict(name='aside', attrs={'class':['innerArticleModule onRight cols externalContent', 'innerArticleModule center']}), dict(name='div', attrs={'class':['thumbsGallery', 'articleTools', 'article right rd7', 'heading', 'quizContent']}), dict(name='a', attrs={'class':'watchMaterial text'}), dict(name='section', attrs={'class':['quiz toCenter', 'quiz toRight']})]
#(u'Polska', u'www.tvn24.pl/polska.xml'), (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')]
feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'),
(u'Polska', u'www.tvn24.pl/polska.xml'), (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')]
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
tag = soup.find(name='ul', attrs={'class':'newsItem'})
if tag:
tag.name='div'
tag.li.name='div'
return soup return soup