diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe
index 65f4e3e52d..bb311606ac 100644
--- a/recipes/adventure_zone_pl.recipe
+++ b/recipes/adventure_zone_pl.recipe
@@ -9,6 +9,7 @@ class Adventure_zone(BasicNewsRecipe):
no_stylesheets = True
oldest_article = 20
max_articles_per_feed = 100
+ index='http://www.adventure-zone.info/fusion/'
use_embedded_content=False
preprocess_regexps = [(re.compile(r"
Komentarze | ", re.IGNORECASE), lambda m: '')]
remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
@@ -45,6 +46,19 @@ class Adventure_zone(BasicNewsRecipe):
skip_tag = skip_tag.findAll(name='a')
for r in skip_tag:
if r.strong:
- word=r.strong.string
- if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
- return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
\ No newline at end of file
+ word=r.strong.string
+ if word and (('zapowied' in word.lower()) or ('recenzj' in word.lower()) or ('solucj' in word.lower()) or ('poradnik' in word.lower())):
+ return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
+
+ def preprocess_html(self, soup):
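+ # drop the second footer link, strip inline styles and absolutize relative links against self.index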
+ footer=soup.find(attrs={'class':'news-footer middle-border'})
+ if footer and len(footer('a'))>=2:
+ footer('a')[1].extract()
+ for item in soup.findAll(style=True):
+ del item['style']
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
+
+
\ No newline at end of file
diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe
index cc74cc9128..00eea1be68 100644
--- a/recipes/benchmark_pl.recipe
+++ b/recipes/benchmark_pl.recipe
@@ -68,4 +68,7 @@ class Benchmark_pl(BasicNewsRecipe):
self.image_article(soup, soup.body)
else:
self.append_page(soup, soup.body)
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.INDEX + a['href']
return soup
diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe
index ff46774dc9..4e19fbc6c1 100644
--- a/recipes/cd_action.recipe
+++ b/recipes/cd_action.recipe
@@ -6,6 +6,7 @@ class CD_Action(BasicNewsRecipe):
description = 'cdaction.pl - polish games magazine site'
category = 'games'
language = 'pl'
+ index='http://www.cdaction.pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
@@ -17,4 +18,10 @@ class CD_Action(BasicNewsRecipe):
def get_cover_url(self):
soup = self.index_to_soup('http://www.cdaction.pl/magazyn/')
self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href']
- return getattr(self, 'cover_url', self.cover_url)
\ No newline at end of file
+ return getattr(self, 'cover_url', self.cover_url)
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe
index a27a9b0877..0614cf98ee 100644
--- a/recipes/dobreprogamy.recipe
+++ b/recipes/dobreprogamy.recipe
@@ -11,6 +11,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
description = u'Aktualności i blogi z dobreprogramy.pl'
encoding = 'utf-8'
+ index='http://www.dobreprogramy.pl/'
no_stylesheets = True
language = 'pl'
extra_css = '.title {font-size:22px;}'
@@ -22,3 +23,10 @@ class Dobreprogramy_pl(BasicNewsRecipe):
#remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]
+
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/dzieje_pl.recipe b/recipes/dzieje_pl.recipe
index d80161e71a..4c583e4815 100644
--- a/recipes/dzieje_pl.recipe
+++ b/recipes/dzieje_pl.recipe
@@ -7,6 +7,7 @@ class Dzieje(BasicNewsRecipe):
cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
category = 'history'
language = 'pl'
+ index='http://dzieje.pl'
oldest_article = 8
max_articles_per_feed = 100
remove_javascript=True
@@ -15,3 +16,10 @@ class Dzieje(BasicNewsRecipe):
remove_tags_after= dict(id='dogory')
remove_tags=[dict(id='dogory')]
feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]
+
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/eioba.recipe b/recipes/eioba.recipe
index 14256c5811..1df79d64bd 100644
--- a/recipes/eioba.recipe
+++ b/recipes/eioba.recipe
@@ -21,3 +21,8 @@ class eioba(BasicNewsRecipe):
(u'Rozrywka', u'http://www.eioba.pl/feed/categories/10.xml'),
(u'Rożne', u'http://www.eioba.pl/feed/categories/9.xml')
]
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+ return soup
diff --git a/recipes/emuzica_pl.recipe b/recipes/emuzica_pl.recipe
index 75271c510a..2fbf9ff514 100644
--- a/recipes/emuzica_pl.recipe
+++ b/recipes/emuzica_pl.recipe
@@ -7,6 +7,7 @@ class eMuzyka(BasicNewsRecipe):
description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
category = 'music'
language = 'pl'
+ index='http://www.emuzyka.pl'
cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
no_stylesheets = True
oldest_article = 7
@@ -14,3 +15,9 @@ class eMuzyka(BasicNewsRecipe):
keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
remove_tags=[dict(name='span', attrs={'id':'date'})]
feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe
index 877d4472bc..2a6e00d501 100644
--- a/recipes/film_web.recipe
+++ b/recipes/film_web.recipe
@@ -7,6 +7,7 @@ class Filmweb_pl(BasicNewsRecipe):
cover_url = 'http://userlogos.org/files/logos/crudus/filmweb.png'
category = 'movies'
language = 'pl'
+ index='http://www.filmweb.pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
@@ -39,3 +40,9 @@ class Filmweb_pl(BasicNewsRecipe):
self.log.warn(skip_tag)
return self.index_to_soup(skip_tag['href'], raw=True)
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/gameplay_pl.recipe b/recipes/gameplay_pl.recipe
index f3384263d6..7b0ccb4f55 100644
--- a/recipes/gameplay_pl.recipe
+++ b/recipes/gameplay_pl.recipe
@@ -6,16 +6,24 @@ class Gameplay_pl(BasicNewsRecipe):
description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.'
category = 'games, movies, books, music'
language = 'pl'
+ index='http://gameplay.pl'
masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png'
cover_url= 'http://gameplay.pl/img/gpy_top_logo.png'
max_articles_per_feed = 100
+ remove_javascript= True
no_stylesheets= True
keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})]
- remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})]
+ remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im', 'news_list', 'news_list_autor', 'stop_bot', 'tagi']}), dict(attrs={'usemap':'#map'})]
feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]
def image_url_processor(self, baseurl, url):
if 'http' not in url:
return 'http://gameplay.pl'+ url[2:]
else:
- return url
+ return url
+
+ def preprocess_html(self, soup):
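+ # expand parent-relative ('../') links against the site root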
+ for a in soup('a'):
+ if a.has_key('href') and '../' in a['href']:
+ a['href']=self.index + a['href'][2:]
+ return soup
\ No newline at end of file
diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe
index 042902b5fc..36d3ef4da2 100644
--- a/recipes/gildia_pl.recipe
+++ b/recipes/gildia_pl.recipe
@@ -9,6 +9,7 @@ class Gildia(BasicNewsRecipe):
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
+ remove_empty_feeds=True
no_stylesheets=True
remove_tags=[dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
keep_only_tags=dict(name='div', attrs={'class':'widetext'})
@@ -24,3 +25,16 @@ class Gildia(BasicNewsRecipe):
self.log.warn('odnosnik')
self.log.warn(link['href'])
return self.index_to_soup(link['href'], raw=True)
+
+ def preprocess_html(self, soup):
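+ # absolutize relative links, choosing the gildia.pl subdomain from the link path or the page title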
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ if '/gry/' in a['href']:
+ a['href']='http://www.gry.gildia.pl' + a['href']
+ elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower():
+ a['href']='http://www.literatura.gildia.pl' + a['href']
+ else:
+ a['href']='http://www.gildia.pl' + a['href']
+ return soup
diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe
index 07927796c0..1f8147ba3d 100644
--- a/recipes/gram_pl.recipe
+++ b/recipes/gram_pl.recipe
@@ -7,6 +7,7 @@ class Gram_pl(BasicNewsRecipe):
category = 'games'
language = 'pl'
oldest_article = 8
+ index='http://www.gram.pl'
max_articles_per_feed = 100
no_stylesheets= True
extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
@@ -52,4 +53,7 @@ class Gram_pl(BasicNewsRecipe):
tag=soup.findAll(name='div', attrs={'class':'picbox'})
for t in tag:
t['style']='float: left;'
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
return soup
\ No newline at end of file
diff --git a/recipes/in4_pl.recipe b/recipes/in4_pl.recipe
index 16ad622b46..e385522714 100644
--- a/recipes/in4_pl.recipe
+++ b/recipes/in4_pl.recipe
@@ -8,6 +8,7 @@ class in4(BasicNewsRecipe):
description = u'Serwis Informacyjny - Aktualnosci, recenzje'
category = 'IT'
language = 'pl'
+ index='http://www.in4.pl/'
#cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg'
no_stylesheets = True
remove_empty_feeds = True
@@ -39,6 +40,7 @@ class in4(BasicNewsRecipe):
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
return soup
-
-
diff --git a/recipes/infra_pl.recipe b/recipes/infra_pl.recipe
index 0e035e0980..e021fa0c17 100644
--- a/recipes/infra_pl.recipe
+++ b/recipes/infra_pl.recipe
@@ -8,6 +8,7 @@ class INFRA(BasicNewsRecipe):
description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
category = 'UFO'
+ index='http://infra.org.pl'
language = 'pl'
max_articles_per_feed = 100
no_stylesheers=True
@@ -15,3 +16,11 @@ class INFRA(BasicNewsRecipe):
remove_tags_after=dict(attrs={'class':'pagenav'})
remove_tags=[dict(attrs={'class':'pagenav'})]
feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/konflikty_zbrojne.recipe b/recipes/konflikty_zbrojne.recipe
index 7921e98f48..4211093443 100644
--- a/recipes/konflikty_zbrojne.recipe
+++ b/recipes/konflikty_zbrojne.recipe
@@ -10,6 +10,23 @@ class Konflikty(BasicNewsRecipe):
category='military, history'
oldest_article = 7
max_articles_per_feed = 100
- auto_cleanup = True
+ no_stylesheets = True
+ keep_only_tags=[dict(attrs={'class':['title1', 'image']}), dict(id='body')]
- feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'), (u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'), (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'), (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml')]
+ feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'),
+ (u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'),
+ (u'Historia', u'http://www.konflikty.pl/rss_historia_10.xml'),
+ (u'Militaria', u'http://www.konflikty.pl/rss_militaria_10.xml'),
+ (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'),
+ (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml'),
+ (u'Teksty źródłowe', u'http://www.konflikty.pl/rss_tekstyzrodlowe_10.xml')]
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
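+ # unwrap linked images into plain blocks and append their alt text as a caption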
+ for image in soup.findAll(name='a', attrs={'class':'image'}):
+ if image.img and image.img.has_key('alt'):
+ image.name='div'
+ pos = len(image.contents)
+ image.insert(pos, BeautifulSoup(''+image.img['alt']+''))
+ return soup
\ No newline at end of file
diff --git a/recipes/national_geographic_pl.recipe b/recipes/national_geographic_pl.recipe
index a2f759e878..07fc0da666 100644
--- a/recipes/national_geographic_pl.recipe
+++ b/recipes/national_geographic_pl.recipe
@@ -9,8 +9,9 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
class recipeMagic(BasicNewsRecipe):
title = 'National Geographic PL'
__author__ = 'Marcin Urban 2011'
+ __modified_by__ = 'fenuks'
description = 'legenda wśród magazynów z historią sięgającą 120 lat'
- cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
+ #cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
@@ -42,11 +43,43 @@ class recipeMagic(BasicNewsRecipe):
]
remove_attributes = ['width','height']
+ feeds=[]
- feeds = [
- ('National Geographic PL', 'http://www.national-geographic.pl/rss/'),
- ]
+ def find_articles(self, url):
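+ # collect article title, url and description from a section listing page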
+ articles = []
+ soup=self.index_to_soup(url)
+ tag=soup.find(attrs={'class':'arl'})
+ art=tag.ul.findAll('li')
+ for i in art:
+ title=i.a['title']
+ url=i.a['href']
+ #date=soup.find(id='footer').ul.li.string[41:-1]
+ desc=i.div.p.string
+ articles.append({'title' : title,
+ 'url' : url,
+ 'date' : '',
+ 'description' : desc
+ })
+ return articles
+
+ def parse_index(self):
+ feeds = []
+ feeds.append((u"Aktualności", self.find_articles('http://www.national-geographic.pl/aktualnosci/')))
+ feeds.append((u"Artykuły", self.find_articles('http://www.national-geographic.pl/artykuly/')))
+
+ return feeds
def print_version(self, url):
- return url.replace('artykuly0Cpokaz', 'drukuj-artykul')
+ if 'artykuly' in url:
+ return url.replace('artykuly/pokaz', 'drukuj-artykul')
+ elif 'aktualnosci' in url:
+ return url.replace('aktualnosci/pokaz', 'drukuj-artykul')
+ else:
+ return url
+
+ def get_cover_url(self):
+ soup = self.index_to_soup('http://www.national-geographic.pl/biezace-wydania/')
+ tag=soup.find(attrs={'class':'txt jus'})
+ self.cover_url=tag.img['src']
+ return getattr(self, 'cover_url', self.cover_url)
diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe
index ec556da5fa..0371cb1f58 100644
--- a/recipes/nowa_fantastyka.recipe
+++ b/recipes/nowa_fantastyka.recipe
@@ -81,5 +81,7 @@ class Nowa_Fantastyka(BasicNewsRecipe):
title=soup.find(attrs={'class':'tytul'})
if title:
title['style']='font-size: 20px; font-weight: bold;'
- self.log.warn(soup)
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.INDEX + a['href']
return soup
diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe
index 952db30c3e..56bb601f70 100644
--- a/recipes/pc_arena.recipe
+++ b/recipes/pc_arena.recipe
@@ -7,6 +7,7 @@ class PC_Arena(BasicNewsRecipe):
description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
category = 'IT'
language = 'pl'
+ index='http://pcarena.pl'
masthead_url='http://pcarena.pl/pcarena/img/logo.png'
cover_url= 'http://pcarena.pl/pcarena/img/logo.png'
no_stylesheets = True
@@ -22,4 +23,10 @@ class PC_Arena(BasicNewsRecipe):
if 'http' not in url:
return 'http://pcarena.pl' + url
else:
- return url
\ No newline at end of file
+ return url
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe
index 666cb8aa77..a615763307 100644
--- a/recipes/tanuki.recipe
+++ b/recipes/tanuki.recipe
@@ -34,4 +34,12 @@ class tanuki(BasicNewsRecipe):
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
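+ # absolutize relative links using the tanuki.pl subdomain named in the page title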
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ if 'tanuki-anime' in soup.title.string.lower():
+ a['href']='http://anime.tanuki.pl' + a['href']
+ elif 'tanuki-manga' in soup.title.string.lower():
+ a['href']='http://manga.tanuki.pl' + a['href']
+ elif 'tanuki-czytelnia' in soup.title.string.lower():
+ a['href']='http://czytelnia.tanuki.pl' + a['href']
return soup
\ No newline at end of file
diff --git a/recipes/webhosting_pl.recipe b/recipes/webhosting_pl.recipe
index aeb98477f3..8ebb91c4ba 100644
--- a/recipes/webhosting_pl.recipe
+++ b/recipes/webhosting_pl.recipe
@@ -8,6 +8,7 @@ class webhosting_pl(BasicNewsRecipe):
cover_url='http://webhosting.pl/images/logo.png'
masthead_url='http://webhosting.pl/images/logo.png'
oldest_article = 7
+ index='http://webhosting.pl'
max_articles_per_feed = 100
no_stylesheets = True
remove_empty_feeds = True
@@ -36,4 +37,10 @@ class webhosting_pl(BasicNewsRecipe):
(u'Marketing', u'http://webhosting.pl/feed/rss/n/11535')]
def print_version(self, url):
- return url.replace('webhosting.pl', 'webhosting.pl/print')
\ No newline at end of file
+ return url.replace('webhosting.pl', 'webhosting.pl/print')
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file