diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe
index 65f4e3e52d..bb311606ac 100644
--- a/recipes/adventure_zone_pl.recipe
+++ b/recipes/adventure_zone_pl.recipe
@@ -9,6 +9,7 @@ class Adventure_zone(BasicNewsRecipe):
     no_stylesheets = True
     oldest_article = 20
     max_articles_per_feed = 100
+    index='http://www.adventure-zone.info/fusion/'
     use_embedded_content=False
     preprocess_regexps = [(re.compile(r"Komentarze", re.IGNORECASE), lambda m: '')]
     remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
@@ -45,6 +46,19 @@ class Adventure_zone(BasicNewsRecipe):
         skip_tag = skip_tag.findAll(name='a')
         for r in skip_tag:
             if r.strong:
-                word=r.strong.string
-                if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
-                    return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
\ No newline at end of file
+                word=r.strong.string.lower()
+                if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
+                    return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
+
+    def preprocess_html(self, soup):
+        footer=soup.find(attrs={'class':'news-footer middle-border'})
+        if footer and len(footer('a'))>=2:
+            footer('a')[1].extract()
+        for item in soup.findAll(style=True):
+            del item['style']
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.index + a['href']
+        return soup
+
+
\ No newline at end of file
diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe
index cc74cc9128..00eea1be68 100644
--- a/recipes/benchmark_pl.recipe
+++ b/recipes/benchmark_pl.recipe
@@ -68,4 +68,7 @@ class Benchmark_pl(BasicNewsRecipe):
             self.image_article(soup, soup.body)
         else:
             self.append_page(soup, soup.body)
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.INDEX + a['href']
         return soup
diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe
index ff46774dc9..4e19fbc6c1 100644
--- a/recipes/cd_action.recipe
+++ b/recipes/cd_action.recipe
@@ -6,6 +6,7 @@ class CD_Action(BasicNewsRecipe):
     description = 'cdaction.pl - polish games magazine site'
     category = 'games'
     language = 'pl'
+    index='http://www.cdaction.pl'
     oldest_article = 8
     max_articles_per_feed = 100
     no_stylesheets= True
@@ -17,4 +18,10 @@ class CD_Action(BasicNewsRecipe):
     def get_cover_url(self):
         soup = self.index_to_soup('http://www.cdaction.pl/magazyn/')
         self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href']
-        return getattr(self, 'cover_url', self.cover_url)
\ No newline at end of file
+        return getattr(self, 'cover_url', self.cover_url)
+
+    def preprocess_html(self, soup):
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.index + a['href']
+        return soup
\ No newline at end of file
diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe
index a27a9b0877..0614cf98ee 100644
--- a/recipes/dobreprogamy.recipe
+++ b/recipes/dobreprogamy.recipe
@@ -11,6 +11,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
     cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
     description = u'Aktualności i blogi z dobreprogramy.pl'
     encoding = 'utf-8'
+    index='http://www.dobreprogramy.pl/'
     no_stylesheets = True
     language = 'pl'
     extra_css = '.title {font-size:22px;}'
@@ -22,3 +23,10 @@ class Dobreprogramy_pl(BasicNewsRecipe):
     #remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
     feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
     ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]
+
+
+    def preprocess_html(self, soup):
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.index + a['href']
+        return soup
\ No newline at end of file
diff --git a/recipes/dzieje_pl.recipe b/recipes/dzieje_pl.recipe
index d80161e71a..4c583e4815 100644
--- a/recipes/dzieje_pl.recipe
+++ b/recipes/dzieje_pl.recipe
@@ -7,6 +7,7 @@ class Dzieje(BasicNewsRecipe):
     cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
     category = 'history'
     language = 'pl'
+    index='http://dzieje.pl'
     oldest_article = 8
     max_articles_per_feed = 100
     remove_javascript=True
@@ -15,3 +16,10 @@ class Dzieje(BasicNewsRecipe):
     remove_tags_after= dict(id='dogory')
     remove_tags=[dict(id='dogory')]
     feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]
+
+
+    def preprocess_html(self, soup):
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.index + a['href']
+        return soup
\ No newline at end of file
diff --git a/recipes/eioba.recipe b/recipes/eioba.recipe
index 14256c5811..1df79d64bd 100644
--- a/recipes/eioba.recipe
+++ b/recipes/eioba.recipe
@@ -21,3 +21,8 @@ class eioba(BasicNewsRecipe):
         (u'Rozrywka', u'http://www.eioba.pl/feed/categories/10.xml'),
         (u'Rożne', u'http://www.eioba.pl/feed/categories/9.xml')
     ]
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
diff --git a/recipes/emuzica_pl.recipe b/recipes/emuzica_pl.recipe
index 75271c510a..2fbf9ff514 100644
--- a/recipes/emuzica_pl.recipe
+++ b/recipes/emuzica_pl.recipe
@@ -7,6 +7,7 @@ class eMuzyka(BasicNewsRecipe):
     description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
     category = 'music'
     language = 'pl'
+    index='http://www.emuzyka.pl'
     cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
     no_stylesheets = True
     oldest_article = 7
@@ -14,3 +15,9 @@ class eMuzyka(BasicNewsRecipe):
     keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
     remove_tags=[dict(name='span', attrs={'id':'date'})]
     feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]
+
+    def preprocess_html(self, soup):
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.index + a['href']
+        return soup
\ No newline at end of file
diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe
index 877d4472bc..2a6e00d501 100644
--- a/recipes/film_web.recipe
+++ b/recipes/film_web.recipe
@@ -7,6 +7,7 @@ class Filmweb_pl(BasicNewsRecipe):
     cover_url = 'http://userlogos.org/files/logos/crudus/filmweb.png'
     category = 'movies'
     language = 'pl'
+    index='http://www.filmweb.pl'
     oldest_article = 8
     max_articles_per_feed = 100
     no_stylesheets= True
@@ -39,3 +40,9 @@ class Filmweb_pl(BasicNewsRecipe):
             self.log.warn(skip_tag)
             return self.index_to_soup(skip_tag['href'], raw=True)
 
+
+    def preprocess_html(self, soup):
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.index + a['href']
+        return soup
\ No newline at end of file
diff --git a/recipes/gameplay_pl.recipe b/recipes/gameplay_pl.recipe
index f3384263d6..7b0ccb4f55 100644
--- a/recipes/gameplay_pl.recipe
+++ b/recipes/gameplay_pl.recipe
@@ -6,16 +6,24 @@ class Gameplay_pl(BasicNewsRecipe):
     description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.'
     category = 'games, movies, books, music'
     language = 'pl'
+    index='http://gameplay.pl'
     masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png'
     cover_url= 'http://gameplay.pl/img/gpy_top_logo.png'
     max_articles_per_feed = 100
+    remove_javascript= True
     no_stylesheets= True
     keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})]
-    remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})]
+    remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im', 'news_list', 'news_list_autor', 'stop_bot', 'tagi']}), dict(attrs={'usemap':'#map'})]
     feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]
 
     def image_url_processor(self, baseurl, url):
         if 'http' not in url:
             return 'http://gameplay.pl'+ url[2:]
         else:
-            return url
+            return url
+
+    def preprocess_html(self, soup):
+        for a in soup('a'):
+            if a.has_key('href') and '../' in a['href']:
+                a['href']=self.index + a['href'][2:]
+        return soup
\ No newline at end of file
diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe
index 042902b5fc..36d3ef4da2 100644
--- a/recipes/gildia_pl.recipe
+++ b/recipes/gildia_pl.recipe
@@ -9,6 +9,7 @@ class Gildia(BasicNewsRecipe):
     language = 'pl'
     oldest_article = 8
     max_articles_per_feed = 100
+    remove_empty_feeds=True
     no_stylesheets=True
     remove_tags=[dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
     keep_only_tags=dict(name='div', attrs={'class':'widetext'})
@@ -24,3 +25,16 @@ class Gildia(BasicNewsRecipe):
             self.log.warn('odnosnik')
             self.log.warn(link['href'])
             return self.index_to_soup(link['href'], raw=True)
+
+    def preprocess_html(self, soup):
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                if '/gry/' in a['href']:
+                    a['href']='http://www.gry.gildia.pl' + a['href']
+                elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower():
+                    a['href']='http://www.literatura.gildia.pl' + a['href']
+                elif u'komiks' in soup.title.string.lower():
+                    a['href']='http://www.literatura.gildia.pl' + a['href']
+                else:
+                    a['href']='http://www.gildia.pl' + a['href']
+        return soup
diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe
index 07927796c0..1f8147ba3d 100644
--- a/recipes/gram_pl.recipe
+++ b/recipes/gram_pl.recipe
@@ -7,6 +7,7 @@ class Gram_pl(BasicNewsRecipe):
     category = 'games'
     language = 'pl'
     oldest_article = 8
+    index='http://www.gram.pl'
     max_articles_per_feed = 100
     no_stylesheets= True
     extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
@@ -52,4 +53,7 @@ class Gram_pl(BasicNewsRecipe):
         tag=soup.findAll(name='div', attrs={'class':'picbox'})
         for t in tag:
             t['style']='float: left;'
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.index + a['href']
         return soup
\ No newline at end of file
diff --git a/recipes/in4_pl.recipe b/recipes/in4_pl.recipe
index 16ad622b46..e385522714 100644
--- a/recipes/in4_pl.recipe
+++ b/recipes/in4_pl.recipe
@@ -8,6 +8,7 @@ class in4(BasicNewsRecipe):
     description = u'Serwis Informacyjny - Aktualnosci, recenzje'
     category = 'IT'
     language = 'pl'
+    index='http://www.in4.pl/'
     #cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg'
     no_stylesheets = True
     remove_empty_feeds = True
@@ -39,6 +40,7 @@ class in4(BasicNewsRecipe):
 
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.index + a['href']
         return soup
-
-
diff --git a/recipes/infra_pl.recipe b/recipes/infra_pl.recipe
index 0e035e0980..e021fa0c17 100644
--- a/recipes/infra_pl.recipe
+++ b/recipes/infra_pl.recipe
@@ -8,6 +8,7 @@ class INFRA(BasicNewsRecipe):
     description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
     cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
     category = 'UFO'
+    index='http://infra.org.pl'
     language = 'pl'
     max_articles_per_feed = 100
     no_stylesheers=True
@@ -15,3 +16,11 @@ class INFRA(BasicNewsRecipe):
     remove_tags_after=dict(attrs={'class':'pagenav'})
     remove_tags=[dict(attrs={'class':'pagenav'})]
     feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.index + a['href']
+        return soup
\ No newline at end of file
diff --git a/recipes/konflikty_zbrojne.recipe b/recipes/konflikty_zbrojne.recipe
index 7921e98f48..4211093443 100644
--- a/recipes/konflikty_zbrojne.recipe
+++ b/recipes/konflikty_zbrojne.recipe
@@ -10,6 +10,23 @@ class Konflikty(BasicNewsRecipe):
     category='military, history'
     oldest_article = 7
     max_articles_per_feed = 100
-    auto_cleanup = True
+    no_stylesheets = True
+    keep_only_tags=[dict(attrs={'class':['title1', 'image']}), dict(id='body')]
 
-    feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'), (u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'), (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'), (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml')]
+    feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'),
+             (u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'),
+             (u'Historia', u'http://www.konflikty.pl/rss_historia_10.xml'),
+             (u'Militaria', u'http://www.konflikty.pl/rss_militaria_10.xml'),
+             (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'),
+             (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml'),
+             (u'Teksty źródłowe', u'http://www.konflikty.pl/rss_tekstyzrodlowe_10.xml')]
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for image in soup.findAll(name='a', attrs={'class':'image'}):
+            if image.img and image.img.has_key('alt'):
+                image.name='div'
+                pos = len(image.contents)
+                image.insert(pos, BeautifulSoup(''+image.img['alt']+''))
+        return soup
\ No newline at end of file
diff --git a/recipes/national_geographic_pl.recipe b/recipes/national_geographic_pl.recipe
index a2f759e878..07fc0da666 100644
--- a/recipes/national_geographic_pl.recipe
+++ b/recipes/national_geographic_pl.recipe
@@ -9,8 +9,9 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 class recipeMagic(BasicNewsRecipe):
     title = 'National Geographic PL'
     __author__ = 'Marcin Urban 2011'
+    __modified_by__ = 'fenuks'
     description = 'legenda wśród magazynów z historią sięgającą 120 lat'
-    cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
+    #cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
@@ -42,11 +43,43 @@ class recipeMagic(BasicNewsRecipe):
     ]
 
     remove_attributes = ['width','height']
+    feeds=[]
 
-    feeds = [
-        ('National Geographic PL', 'http://www.national-geographic.pl/rss/'),
-    ]
+    def find_articles(self, url):
+        articles = []
+        soup=self.index_to_soup(url)
+        tag=soup.find(attrs={'class':'arl'})
+        art=tag.ul.findAll('li')
+        for i in art:
+            title=i.a['title']
+            url=i.a['href']
+            #date=soup.find(id='footer').ul.li.string[41:-1]
+            desc=i.div.p.string
+            articles.append({'title' : title,
+                             'url' : url,
+                             'date' : '',
+                             'description' : desc
+                            })
+        return articles
+
+    def parse_index(self):
+        feeds = []
+        feeds.append((u"Aktualności", self.find_articles('http://www.national-geographic.pl/aktualnosci/')))
+        feeds.append((u"Artykuły", self.find_articles('http://www.national-geographic.pl/artykuly/')))
+
+        return feeds
 
     def print_version(self, url):
-        return url.replace('artykuly0Cpokaz', 'drukuj-artykul')
+        if 'artykuly' in url:
+            return url.replace('artykuly/pokaz', 'drukuj-artykul')
+        elif 'aktualnosci' in url:
+            return url.replace('aktualnosci/pokaz', 'drukuj-artykul')
+        else:
+            return url
+
+    def get_cover_url(self):
+        soup = self.index_to_soup('http://www.national-geographic.pl/biezace-wydania/')
+        tag=soup.find(attrs={'class':'txt jus'})
+        self.cover_url=tag.img['src']
+        return getattr(self, 'cover_url', self.cover_url)
 
diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe
index ec556da5fa..0371cb1f58 100644
--- a/recipes/nowa_fantastyka.recipe
+++ b/recipes/nowa_fantastyka.recipe
@@ -81,5 +81,7 @@ class Nowa_Fantastyka(BasicNewsRecipe):
         title=soup.find(attrs={'class':'tytul'})
         if title:
             title['style']='font-size: 20px; font-weight: bold;'
-        self.log.warn(soup)
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.INDEX + a['href']
         return soup
diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe
index 952db30c3e..56bb601f70 100644
--- a/recipes/pc_arena.recipe
+++ b/recipes/pc_arena.recipe
@@ -7,6 +7,7 @@ class PC_Arena(BasicNewsRecipe):
     description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
     category = 'IT'
     language = 'pl'
+    index='http://pcarena.pl'
     masthead_url='http://pcarena.pl/pcarena/img/logo.png'
     cover_url= 'http://pcarena.pl/pcarena/img/logo.png'
     no_stylesheets = True
@@ -22,4 +23,10 @@ class PC_Arena(BasicNewsRecipe):
         if 'http' not in url:
             return 'http://pcarena.pl' + url
         else:
-            return url
\ No newline at end of file
+            return url
+
+    def preprocess_html(self, soup):
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.index + a['href']
+        return soup
\ No newline at end of file
diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe
index 666cb8aa77..a615763307 100644
--- a/recipes/tanuki.recipe
+++ b/recipes/tanuki.recipe
@@ -34,4 +34,12 @@ class tanuki(BasicNewsRecipe):
 
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                if 'tanuki-anime' in soup.title.string.lower():
+                    a['href']='http://anime.tanuki.pl' + a['href']
+                elif 'tanuki-manga' in soup.title.string.lower():
+                    a['href']='http://manga.tanuki.pl' + a['href']
+                elif 'tanuki-czytelnia' in soup.title.string.lower():
+                    a['href']='http://czytelnia.tanuki.pl' + a['href']
         return soup
\ No newline at end of file
diff --git a/recipes/webhosting_pl.recipe b/recipes/webhosting_pl.recipe
index aeb98477f3..8ebb91c4ba 100644
--- a/recipes/webhosting_pl.recipe
+++ b/recipes/webhosting_pl.recipe
@@ -8,6 +8,7 @@ class webhosting_pl(BasicNewsRecipe):
     cover_url='http://webhosting.pl/images/logo.png'
     masthead_url='http://webhosting.pl/images/logo.png'
     oldest_article = 7
+    index='http://webhosting.pl'
     max_articles_per_feed = 100
     no_stylesheets = True
     remove_empty_feeds = True
@@ -36,4 +37,10 @@ class webhosting_pl(BasicNewsRecipe):
     (u'Marketing', u'http://webhosting.pl/feed/rss/n/11535')]
 
     def print_version(self, url):
-        return url.replace('webhosting.pl', 'webhosting.pl/print')
\ No newline at end of file
+        return url.replace('webhosting.pl', 'webhosting.pl/print')
+
+    def preprocess_html(self, soup):
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.index + a['href']
+        return soup
\ No newline at end of file