From 1e5ce66ca36bbc16c479e0da0e801329a22c6387 Mon Sep 17 00:00:00 2001 From: fenuks Date: Mon, 17 Jun 2013 09:45:13 +0200 Subject: [PATCH] various minor fixes --- recipes/ekologia_pl.recipe | 4 ++- recipes/gildia_pl.recipe | 59 ++++++++++++++++++++--------------- recipes/media2.recipe | 36 ++++++++++----------- recipes/nauka_w_polsce.recipe | 2 +- recipes/polter_pl.recipe | 2 +- recipes/ppe_pl.recipe | 46 ++++++++++++--------------- recipes/pure_pc.recipe | 17 ++++++---- 7 files changed, 85 insertions(+), 81 deletions(-) diff --git a/recipes/ekologia_pl.recipe b/recipes/ekologia_pl.recipe index e925ebad6f..c053e6d5bc 100644 --- a/recipes/ekologia_pl.recipe +++ b/recipes/ekologia_pl.recipe @@ -9,13 +9,15 @@ class EkologiaPl(BasicNewsRecipe): language = 'pl' cover_url = 'http://www.ekologia.pl/assets/images/logo/ekologia_pl_223x69.png' ignore_duplicate_articles = {'title', 'url'} - extra_css = '.title {font-size: 200%;} .imagePowiazane, .imgCon {float:left; margin-right:5px;}' + extra_css = '.title {font-size: 200%;} .imagePowiazane {float:left; margin-right:5px; width: 200px;}' oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True remove_empty_feeds = True + remove_javascript = True use_embedded_content = False remove_attrs = ['style'] + keep_only_tags = [dict(attrs={'class':'contentParent'})] remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj', 'widget-social-buttons']})] feeds = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')] diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe index 37c129aaa1..513bbe44d6 100644 --- a/recipes/gildia_pl.recipe +++ b/recipes/gildia_pl.recipe @@ -16,40 +16,47 @@ class Gildia(BasicNewsRecipe): ignore_duplicate_articles = {'title', 'url'} preprocess_regexps = [(re.compile(ur''), lambda match: '') ] ignore_duplicate_articles = {'title', 'url'} - remove_tags = [dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})] - keep_only_tags = dict(name='div', attrs={'class':'widetext'}) - feeds = [(u'Gry', u'http://www.gry.gildia.pl/rss'), (u'Literatura', u'http://www.literatura.gildia.pl/rss'), (u'Film', u'http://www.film.gildia.pl/rss'), (u'Horror', u'http://www.horror.gildia.pl/rss'), (u'Konwenty', u'http://www.konwenty.gildia.pl/rss'), (u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'), (u'Manga i anime', u'http://www.manga.gildia.pl/rss'), (u'Star Wars', u'http://www.starwars.gildia.pl/rss'), (u'Techno', u'http://www.techno.gildia.pl/rss'), (u'Historia', u'http://www.historia.gildia.pl/rss'), (u'Magia', u'http://www.magia.gildia.pl/rss'), (u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'), (u'RPG', u'http://www.rpg.gildia.pl/rss'), (u'LARP', u'http://www.larp.gildia.pl/rss'), (u'Muzyka', u'http://www.muzyka.gildia.pl/rss'), (u'Nauka', u'http://www.nauka.gildia.pl/rss')] - + remove_tags = [dict(name='div', attrs={'class':['backlink', 'im_img', 'addthis_toolbox addthis_default_style', 'banner-bottom']})] + keep_only_tags = [dict(name='div', attrs={'class':'widetext'})] + feeds = [(u'Gry', u'http://www.gry.gildia.pl/rss'), + (u'Literatura', u'http://www.literatura.gildia.pl/rss'), + (u'Film', u'http://www.film.gildia.pl/rss'), + (u'Horror', u'http://www.horror.gildia.pl/rss'), + (u'Konwenty', u'http://www.konwenty.gildia.pl/rss'), + (u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'), + (u'Manga i anime', u'http://www.manga.gildia.pl/rss'), + (u'Star Wars', u'http://www.starwars.gildia.pl/rss'), + (u'Techno', u'http://www.techno.gildia.pl/rss'), + (u'Historia', u'http://www.historia.gildia.pl/rss'), + (u'Magia', u'http://www.magia.gildia.pl/rss'), + (u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'), + (u'RPG', u'http://www.rpg.gildia.pl/rss'), + (u'LARP', u'http://www.larp.gildia.pl/rss'), + (u'Muzyka', u'http://www.muzyka.gildia.pl/rss'), + (u'Nauka', u'http://www.nauka.gildia.pl/rss'), + ] def skip_ad_pages(self, soup): content = soup.find('div', attrs={'class':'news'}) - if 'recenzj' in soup.title.string.lower(): - for link in content.findAll(name='a'): - if 'recenzj' in link['href'] or 'muzyka/plyty' in link['href']: - return self.index_to_soup(link['href'], raw=True) - if 'fragmen' in soup.title.string.lower(): - for link in content.findAll(name='a'): - if 'fragment' in link['href']: - return self.index_to_soup(link['href'], raw=True) - if 'relacj' in soup.title.string.lower(): - for link in content.findAll(name='a'): - if 'relacj' in link['href']: - return self.index_to_soup(link['href'], raw=True) - if 'wywiad' in soup.title.string.lower(): - for link in content.findAll(name='a'): - if 'wywiad' in link['href']: - return self.index_to_soup(link['href'], raw=True) - + words = ('recenzj', 'zapowied','fragmen', 'relacj', 'wywiad', 'nominacj') + for word in words: + if word in soup.title.string.lower(): + for link in content.findAll(name='a'): + if word in link['href'] or (link.string and word in link.string): + return self.index_to_soup(link['href'], raw=True) + for tag in content.findAll(name='a', href=re.compile('/publicystyka/')): + if 'Więcej...' == tag.string: + return self.index_to_soup(tag['href'], raw=True) def preprocess_html(self, soup): for a in soup('a'): if a.has_key('href') and not a['href'].startswith('http'): if '/gry/' in a['href']: - a['href']='http://www.gry.gildia.pl' + a['href'] + a['href'] = 'http://www.gry.gildia.pl' + a['href'] elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower(): - a['href']='http://www.literatura.gildia.pl' + a['href'] + a['href'] = 'http://www.literatura.gildia.pl' + a['href'] elif u'komiks' in soup.title.string.lower(): - a['href']='http://www.literatura.gildia.pl' + a['href'] + a['href'] = 'http://www.literatura.gildia.pl' + a['href'] else: - a['href']='http://www.gildia.pl' + a['href'] - return soup + a['href'] = 'http://www.gildia.pl' + a['href'] + return soup \ No newline at end of file diff --git a/recipes/media2.recipe b/recipes/media2.recipe index 135740a62e..d685a90803 100644 --- a/recipes/media2.recipe +++ b/recipes/media2.recipe @@ -3,33 +3,29 @@ __license__ = 'GPL v3' __copyright__ = 'teepel' -''' -media2.pl -''' - from calibre.web.feeds.news import BasicNewsRecipe class media2_pl(BasicNewsRecipe): title = u'Media2' __author__ = 'teepel ' language = 'pl' - description =u'Media2.pl to jeden z najczęściej odwiedzanych serwisów dla profesjonalistów z branży medialnej, telekomunikacyjnej, public relations oraz nowych technologii.' - masthead_url='http://media2.pl/res/logo/www.png' - remove_empty_feeds= True - oldest_article = 1 + description = u'Media2.pl to jeden z najczęściej odwiedzanych serwisów dla profesjonalistów z branży medialnej, telekomunikacyjnej, public relations oraz nowych technologii.' + masthead_url = 'http://media2.pl/res/logo/www.png' + cover_url = 'http://media2.pl/res/logo/www.png' + remove_empty_feeds = True + oldest_article = 7 max_articles_per_feed = 100 - remove_javascript=True - no_stylesheets=True - simultaneous_downloads = 5 - + remove_javascript = True + no_stylesheets = True + remove_attributes = ['style'] + ignore_duplicate_articles = {'title', 'url'} extra_css = '''.news-lead{font-weight: bold; }''' - keep_only_tags =[] - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'news-item tpl-big'})) + keep_only_tags = [dict(name = 'div', attrs = {'class' : 'news-item tpl-big'})] + remove_tags = [dict(name = 'span', attrs = {'class' : 'news-comments'}), dict(name = 'div', attrs = {'class' : 'item-sidebar'}), dict(name = 'div', attrs = {'class' : 'news-tags'})] - remove_tags =[] - remove_tags.append(dict(name = 'span', attrs = {'class' : 'news-comments'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'item-sidebar'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'news-tags'})) - - feeds = [(u'Media2', u'http://feeds.feedburner.com/media2')] + feeds = [(u'Media2', u'http://feeds.feedburner.com/media2'), (u'Internet', u'http://feeds.feedburner.com/media2/internet'), + (u'Media', 'http://feeds.feedburner.com/media2/media'), (u'Telekomunikacja', 'http://feeds.feedburner.com/media2/telekomunikacja'), + (u'Reklama/PR', 'http://feeds.feedburner.com/media2/reklama-pr'), (u'Technologie', 'http://feeds.feedburner.com/media2/technologie'), + (u'Badania', 'http://feeds.feedburner.com/media2/badania') + ] \ No newline at end of file diff --git a/recipes/nauka_w_polsce.recipe b/recipes/nauka_w_polsce.recipe index 715780d162..2a44aa7e84 100644 --- a/recipes/nauka_w_polsce.recipe +++ b/recipes/nauka_w_polsce.recipe @@ -1,7 +1,7 @@ from calibre.web.feeds.news import BasicNewsRecipe import re class NaukawPolsce(BasicNewsRecipe): - title = u'Nauka w Polsce' + title = u'PAP Nauka w Polsce' __author__ = 'fenuks' description = u'Serwis Nauka w Polsce ma za zadanie popularyzację polskiej nauki. Można na nim znaleźć wiadomości takie jak: osiągnięcia polskich naukowców, wydarzenia na polskich uczelniach, osiągnięcia studentów, konkursy dla badaczy, staże i stypendia naukowe, wydarzenia w polskiej nauce, kalendarium wydarzeń w nauce, materiały wideo o nauce.' category = 'science' diff --git a/recipes/polter_pl.recipe b/recipes/polter_pl.recipe index 1f9cef3be3..aea21dca9c 100644 --- a/recipes/polter_pl.recipe +++ b/recipes/polter_pl.recipe @@ -3,7 +3,7 @@ import re from calibre.web.feeds.news import BasicNewsRecipe class Poltergeist(BasicNewsRecipe): - title = u'Poltergeist' + title = u'Polter.pl' __author__ = 'fenuks' description = u'Największy polski serwis poświęcony ogólno pojętej fantastyce - grom fabularnym (RPG), książkom, filmowi, komiksowi, grom planszowym, karcianym i bitewnym.' category = 'fantasy, books, rpg, games' diff --git a/recipes/ppe_pl.recipe b/recipes/ppe_pl.recipe index 2edc611ad7..597c9ef2d3 100644 --- a/recipes/ppe_pl.recipe +++ b/recipes/ppe_pl.recipe @@ -1,41 +1,35 @@ #!/usr/bin/env python __license__ = 'GPL v3' - +import re from calibre.web.feeds.news import BasicNewsRecipe class ppeRecipe(BasicNewsRecipe): __author__ = u'Artur Stachecki ' language = 'pl' - title = u'ppe.pl' category = u'News' description = u'Portal o konsolach i grach wideo.' - cover_url='' - remove_empty_feeds= True - no_stylesheets=True - oldest_article = 1 - max_articles_per_feed = 100000 - recursions = 0 + extra_css = '.categories > li {list-style: none; display: inline;} .galmini > li {list-style: none; float: left;} .calibre_navbar {clear: both;}' + remove_empty_feeds = True no_stylesheets = True + oldest_article = 7 + max_articles_per_feed = 100 remove_javascript = True - simultaneous_downloads = 2 + remove_empty_feeds = True + remove_attributes = ['style'] + + keep_only_tags = [dict(attrs={'class':'box'})] + remove_tags = [dict(attrs={'class':['voltage-1', 'voltage-2', 'encyklopedia', 'nag', 'related', 'comment_form', 'komentarze-box']})] - keep_only_tags =[] - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'news-heading'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'tresc-poziom'})) + feeds = [ + ('Newsy', 'http://ppe.pl/rss.html'), + ('Recenzje', 'http://ppe.pl/rss-recenzje.html'), + ('Publicystyka', 'http://ppe.pl/rss-publicystyka.html'), + ] - remove_tags =[] - remove_tags.append(dict(name = 'div', attrs = {'class' : 'bateria1'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'bateria2'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'bateria3'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'news-photo'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'fbl'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'info'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'links'})) - - remove_tags.append(dict(name = 'div', attrs = {'style' : 'padding: 4px'})) - - feeds = [ - ('Newsy', 'feed://ppe.pl/rss/rss.xml'), - ] + def get_cover_url(self): + soup = self.index_to_soup('http://www.ppe.pl/psx_extreme.html') + part = soup.find(attrs={'class':'archiwum-foto'})['style'] + part = re.search("'(.+)'", part).group(1).replace('_min', '') + return 'http://www.ppe.pl' + part diff --git a/recipes/pure_pc.recipe b/recipes/pure_pc.recipe index 13d9307a09..167136c90f 100644 --- a/recipes/pure_pc.recipe +++ b/recipes/pure_pc.recipe @@ -1,3 +1,4 @@ +import re from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Comment @@ -11,6 +12,7 @@ class PurePC(BasicNewsRecipe): language = 'pl' masthead_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg' cover_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg' + extra_css = '.wykres_logo {float: left; margin-right: 5px;}' no_stylesheets = True keep_only_tags= [dict(id='content')] remove_tags_after= dict(attrs={'class':'fivestar-widget'}) @@ -19,11 +21,14 @@ class PurePC(BasicNewsRecipe): def append_page(self, soup, appendtag): - nexturl= appendtag.find(attrs={'class':'pager-next'}) - if nexturl: - while nexturl: - soup2 = self.index_to_soup('http://www.purepc.pl'+ nexturl.a['href']) - nexturl=soup2.find(attrs={'class':'pager-next'}) + lasturl = appendtag.find(attrs={'class':'pager-last'}) + if lasturl: + regex = re.search('(.+?2C)(\d+)', lasturl.a['href']) + baseurl = regex.group(1).replace('?page=0%2C', '?page=1%2C') + baseurl = 'http://www.purepc.pl' + baseurl + nr = int(regex.group(2)) + for page_nr in range(1, nr+1): + soup2 = self.index_to_soup(baseurl+str(page_nr)) pagetext = soup2.find(attrs={'class':'article'}) pos = len(appendtag.contents) appendtag.insert(pos, pagetext) @@ -35,4 +40,4 @@ class PurePC(BasicNewsRecipe): def preprocess_html(self, soup): self.append_page(soup, soup.body) - return soup + return soup \ No newline at end of file