diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe index 366b1ccf5a..65f4e3e52d 100644 --- a/recipes/adventure_zone_pl.recipe +++ b/recipes/adventure_zone_pl.recipe @@ -1,19 +1,38 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class Adventure_zone(BasicNewsRecipe): title = u'Adventure Zone' __author__ = 'fenuks' description = 'Adventure zone - adventure games from A to Z' category = 'games' language = 'pl' - oldest_article = 15 - max_articles_per_feed = 100 no_stylesheets = True + oldest_article = 20 + max_articles_per_feed = 100 + use_embedded_content=False + preprocess_regexps = [(re.compile(r"Komentarze", re.IGNORECASE), lambda m: '')] remove_tags_before= dict(name='td', attrs={'class':'main-bg'}) - remove_tags_after= dict(name='td', attrs={'class':'main-body middle-border'}) + remove_tags= [dict(name='img', attrs={'alt':'Drukuj'})] + remove_tags_after= dict(id='comments') extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }' feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')] + def parse_feeds (self): + feeds = BasicNewsRecipe.parse_feeds(self) + soup=self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php') + tag=soup.find(name='channel') + titles=[] + for r in tag.findAll(name='image'): + r.extract() + art=tag.findAll(name='item') + for i in art: + titles.append(i.title.string) + for feed in feeds: + for article in feed.articles[:]: + article.title=titles[feed.articles.index(article)] + return feeds + + def get_cover_url(self): soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php') cover=soup.find(id='box_OstatninumerAZ') @@ -22,17 +41,10 @@ class Adventure_zone(BasicNewsRecipe): def skip_ad_pages(self, soup): - skip_tag = soup.body.findAll(name='a') - if skip_tag is not None: - for r in skip_tag: - if 'articles.php?' 
in r['href']: - if r.strong is not None: - word=r.strong.string - if ('zapowied' or 'recenzj') in word: - return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item_id'+r['href'][r['href'].find('_id')+3:], raw=True) - else: - None - - def print_version(self, url): - return url.replace('news.php?readmore', 'print.php?type=N&item_id') - + skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'}) + skip_tag = skip_tag.findAll(name='a') + for r in skip_tag: + if r.strong: + word=r.strong.string + if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)): + return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) \ No newline at end of file diff --git a/recipes/astro_news_pl.recipe b/recipes/astro_news_pl.recipe index e5561fc98d..2808fed6e1 100644 --- a/recipes/astro_news_pl.recipe +++ b/recipes/astro_news_pl.recipe @@ -1,5 +1,4 @@ from calibre.web.feeds.news import BasicNewsRecipe - class AstroNEWS(BasicNewsRecipe): title = u'AstroNEWS' __author__ = 'fenuks' @@ -8,11 +7,16 @@ class AstroNEWS(BasicNewsRecipe): language = 'pl' oldest_article = 8 max_articles_per_feed = 100 - auto_cleanup = True + #extra_css= 'table {text-align: left;}' + no_stylesheets=True cover_url='http://news.astronet.pl/img/logo_news.jpg' - # no_stylesheets= True + remove_tags=[dict(name='hr')] feeds = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')] def print_version(self, url): return url.replace('astronet.pl/', 'astronet.pl/print.cgi?') + def preprocess_html(self, soup): + for item in soup.findAll(align=True): + del item['align'] + return soup diff --git a/recipes/focus_pl.recipe b/recipes/focus_pl.recipe index d63af135bc..7ff61a8a11 100644 --- a/recipes/focus_pl.recipe +++ b/recipes/focus_pl.recipe @@ -12,8 +12,9 @@ class Focus_pl(BasicNewsRecipe): cover_url='' remove_empty_feeds= True no_stylesheets=True - remove_tags_before=dict(name='div', 
attrs={'class':'h2 h2f'}) - remove_tags_after=dict(name='div', attrs={'class':'clear'}) + #remove_tags_before=dict(name='div', attrs={'class':'h2 h2f'}) + #remove_tags_after=dict(name='div', attrs={'class':'clear'}) + keep_only_tags=[dict(name='div', attrs={'class':['h2 h2f', 'news-left', 'news-right']})] feeds = [(u'Wszystkie kategorie', u'http://focus.pl.feedsportal.com/c/32992/f/532692/index.rss'), (u'Nauka', u'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'), (u'Historia', u'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'), @@ -23,35 +24,33 @@ class Focus_pl(BasicNewsRecipe): (u'Przyroda', u'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'), (u'Technologie', u'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'), (u'Warto wiedzieć', u'http://focus.pl.feedsportal.com/c/32992/f/532700/index.rss'), - - - -] + ] def skip_ad_pages(self, soup): - tag=soup.find(name='a') - if tag: - new_soup=self.index_to_soup(tag['href']+ 'do-druku/1/', raw=True) - return new_soup + if 'Advertisement' in soup.title: + tag=soup.find(name='a') + if tag: + new_soup=self.index_to_soup(tag['href']+ 'do-druku/1/', raw=True) + return new_soup def append_page(self, appendtag): - tag=appendtag.find(name='div', attrs={'class':'arrows'}) - if tag: - nexturl='http://www.focus.pl/'+tag.a['href'] - for rem in appendtag.findAll(name='div', attrs={'class':'klik-nav'}): - rem.extract() - while nexturl: - soup2=self.index_to_soup(nexturl) - nexturl=None - pagetext=soup2.find(name='div', attrs={'class':'txt'}) - tag=pagetext.find(name='div', attrs={'class':'arrows'}) - for r in tag.findAll(name='a'): - if u'Następne' in r.string: - nexturl='http://www.focus.pl/'+r['href'] - for rem in pagetext.findAll(name='div', attrs={'class':'klik-nav'}): - rem.extract() - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) + tag=appendtag.find(name='div', attrs={'class':'arrows'}) + if tag: + nexturl='http://www.focus.pl/'+tag.a['href'] + for rem in 
appendtag.findAll(name='div', attrs={'class':'klik-nav'}): + rem.extract() + while nexturl: + soup2=self.index_to_soup(nexturl) + nexturl=None + pagetext=soup2.find(name='div', attrs={'class':'txt'}) + tag=pagetext.find(name='div', attrs={'class':'arrows'}) + for r in tag.findAll(name='a'): + if u'Następne' in r.string: + nexturl='http://www.focus.pl/'+r['href'] + for rem in pagetext.findAll(name='div', attrs={'class':'klik-nav'}): + rem.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) def get_cover_url(self): soup=self.index_to_soup('http://www.focus.pl/magazyn/') diff --git a/recipes/naczytniki.recipe b/recipes/naczytniki.recipe index 374c6dd0cb..e4769d58bc 100644 --- a/recipes/naczytniki.recipe +++ b/recipes/naczytniki.recipe @@ -7,6 +7,7 @@ class naczytniki(BasicNewsRecipe): language = 'pl' description ='everything about e-readers' category='readers' + no_stylesheets=True oldest_article = 7 max_articles_per_feed = 100 remove_tags_after= dict(name='div', attrs={'class':'sociable'}) diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe index d8015105f8..74534f3346 100644 --- a/recipes/nowa_fantastyka.recipe +++ b/recipes/nowa_fantastyka.recipe @@ -1,20 +1,21 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe - class Nowa_Fantastyka(BasicNewsRecipe): title = u'Nowa Fantastyka' oldest_article = 7 __author__ = 'fenuks' language = 'pl' + encoding='latin2' description ='site for fantasy readers' category='fantasy' max_articles_per_feed = 100 INDEX='http://www.fantastyka.pl/' + no_stylesheets=True + needs_subscription = 'optional' remove_tags_before=dict(attrs={'class':'belka1-tlo-md'}) #remove_tags_after=dict(name='span', attrs={'class':'naglowek-oceny'}) remove_tags_after=dict(name='td', attrs={'class':'belka1-bot'}) - remove_tags=[dict(attrs={'class':'avatar2'})] - feeds = [] + remove_tags=[dict(attrs={'class':'avatar2'}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', 
attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'})] def find_articles(self, url): articles = [] @@ -45,3 +46,13 @@ class Nowa_Fantastyka(BasicNewsRecipe): cover=soup.find(name='img', attrs={'class':'okladka'}) self.cover_url=self.INDEX+ cover['src'] return getattr(self, 'cover_url', self.cover_url) + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('http://www.fantastyka.pl/') + br.select_form(nr=0) + br['login'] = self.username + br['pass'] = self.password + br.submit() + return br diff --git a/recipes/spiders_web_pl.recipe b/recipes/spiders_web_pl.recipe index d615f01aa9..4fe7d9c8d6 100644 --- a/recipes/spiders_web_pl.recipe +++ b/recipes/spiders_web_pl.recipe @@ -8,8 +8,8 @@ class SpidersWeb(BasicNewsRecipe): cover_url = 'http://www.spidersweb.pl/wp-content/themes/spiderweb/img/Logo.jpg' category = 'IT, WEB' language = 'pl' + no_stylesheets=True max_articles_per_feed = 100 - remove_tags_before=dict(name="h1", attrs={'class':'Title'}) - remove_tags_after=dict(name="div", attrs={'class':'Text'}) - remove_tags=[dict(name='div', attrs={'class':['Tags', 'CommentCount FloatL', 'Show FloatL']})] + keep_only_tags=[dict(id='Post')] + remove_tags=[dict(name='div', attrs={'class':['Comments', 'Shows', 'Post-Tags']})] feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')]