From 1e5ce66ca36bbc16c479e0da0e801329a22c6387 Mon Sep 17 00:00:00 2001
From: fenuks <fenuks@gmail.com>
Date: Mon, 17 Jun 2013 09:45:13 +0200
Subject: [PATCH] Polish news recipes: various minor fixes

---
 recipes/ekologia_pl.recipe    |  4 ++-
 recipes/gildia_pl.recipe      | 59 ++++++++++++++++++++---------------
 recipes/media2.recipe         | 36 ++++++++++-----------
 recipes/nauka_w_polsce.recipe |  2 +-
 recipes/polter_pl.recipe      |  2 +-
 recipes/ppe_pl.recipe         | 46 ++++++++++++---------------
 recipes/pure_pc.recipe        | 17 ++++++----
 7 files changed, 85 insertions(+), 81 deletions(-)

diff --git a/recipes/ekologia_pl.recipe b/recipes/ekologia_pl.recipe
index e925ebad6f..c053e6d5bc 100644
--- a/recipes/ekologia_pl.recipe
+++ b/recipes/ekologia_pl.recipe
@@ -9,13 +9,15 @@ class EkologiaPl(BasicNewsRecipe):
     language       = 'pl'
     cover_url = 'http://www.ekologia.pl/assets/images/logo/ekologia_pl_223x69.png'
     ignore_duplicate_articles = {'title', 'url'}
-    extra_css = '.title {font-size: 200%;} .imagePowiazane, .imgCon {float:left; margin-right:5px;}'
+    extra_css = '.title {font-size: 200%;} .imagePowiazane {float:left; margin-right:5px; width: 200px;}'
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
     remove_empty_feeds = True
+    remove_javascript = True
     use_embedded_content = False
-    remove_attrs = ['style']
+    remove_attributes = ['style']  # was misspelled remove_attrs; media2 and ppe in this patch use remove_attributes
+    keep_only_tags = [dict(attrs={'class':'contentParent'})]
     remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj', 'widget-social-buttons']})]
 
     feeds          = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')]
diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe
index 37c129aaa1..513bbe44d6 100644
--- a/recipes/gildia_pl.recipe
+++ b/recipes/gildia_pl.recipe
@@ -16,40 +16,47 @@ class Gildia(BasicNewsRecipe):
     ignore_duplicate_articles = {'title', 'url'}
     preprocess_regexps = [(re.compile(ur'</?sup>'), lambda match: '') ]
     ignore_duplicate_articles = {'title', 'url'}
-    remove_tags = [dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
-    keep_only_tags = dict(name='div', attrs={'class':'widetext'})
-    feeds          = [(u'Gry', u'http://www.gry.gildia.pl/rss'), (u'Literatura', u'http://www.literatura.gildia.pl/rss'), (u'Film', u'http://www.film.gildia.pl/rss'), (u'Horror', u'http://www.horror.gildia.pl/rss'), (u'Konwenty', u'http://www.konwenty.gildia.pl/rss'), (u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'), (u'Manga i anime', u'http://www.manga.gildia.pl/rss'), (u'Star Wars', u'http://www.starwars.gildia.pl/rss'), (u'Techno', u'http://www.techno.gildia.pl/rss'), (u'Historia', u'http://www.historia.gildia.pl/rss'), (u'Magia', u'http://www.magia.gildia.pl/rss'), (u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'), (u'RPG', u'http://www.rpg.gildia.pl/rss'), (u'LARP', u'http://www.larp.gildia.pl/rss'), (u'Muzyka', u'http://www.muzyka.gildia.pl/rss'), (u'Nauka', u'http://www.nauka.gildia.pl/rss')]
-
+    remove_tags = [dict(name='div', attrs={'class':['backlink', 'im_img', 'addthis_toolbox addthis_default_style', 'banner-bottom']})]
+    keep_only_tags = [dict(name='div', attrs={'class':'widetext'})]
+    feeds          = [(u'Gry', u'http://www.gry.gildia.pl/rss'),
+                        (u'Literatura', u'http://www.literatura.gildia.pl/rss'),
+                        (u'Film', u'http://www.film.gildia.pl/rss'),
+                        (u'Horror', u'http://www.horror.gildia.pl/rss'),
+                        (u'Konwenty', u'http://www.konwenty.gildia.pl/rss'),
+                        (u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'),
+                        (u'Manga i anime', u'http://www.manga.gildia.pl/rss'),
+                        (u'Star Wars', u'http://www.starwars.gildia.pl/rss'),
+                        (u'Techno', u'http://www.techno.gildia.pl/rss'),
+                        (u'Historia', u'http://www.historia.gildia.pl/rss'),
+                        (u'Magia', u'http://www.magia.gildia.pl/rss'),
+                        (u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'),
+                        (u'RPG', u'http://www.rpg.gildia.pl/rss'),
+                        (u'LARP', u'http://www.larp.gildia.pl/rss'),
+                        (u'Muzyka', u'http://www.muzyka.gildia.pl/rss'),
+                        (u'Nauka', u'http://www.nauka.gildia.pl/rss'),
+                    ]
 
     def skip_ad_pages(self, soup):
         content = soup.find('div', attrs={'class':'news'})
-        if 'recenzj' in soup.title.string.lower():
-            for link in content.findAll(name='a'):
-                if 'recenzj' in link['href'] or 'muzyka/plyty' in link['href']:
-                    return self.index_to_soup(link['href'], raw=True)
-        if 'fragmen' in soup.title.string.lower():
-            for link in content.findAll(name='a'):
-                if 'fragment' in link['href']:
-                    return self.index_to_soup(link['href'], raw=True)
-        if 'relacj' in soup.title.string.lower():
-            for link in content.findAll(name='a'):
-                if 'relacj' in link['href']:
-                    return self.index_to_soup(link['href'], raw=True)
-        if 'wywiad' in soup.title.string.lower():
-            for link in content.findAll(name='a'):
-                if 'wywiad' in link['href']:
-                    return self.index_to_soup(link['href'], raw=True)
-
+        title = soup.title.string.lower()  # loop-invariant: lower-case the page title once
+        for word in ('recenzj', 'zapowied', 'fragmen', 'relacj', 'wywiad', 'nominacj'):
+            if word in title:
+                for link in content.findAll(name='a', href=True):  # href=True: anchors without href would raise KeyError below
+                    if word in link['href'] or (link.string and word in link.string):
+                        return self.index_to_soup(link['href'], raw=True)  # first matching link wins; its raw page is returned
+        for tag in content.findAll(name='a', href=re.compile('/publicystyka/')):
+            if 'Wi&#281;cej...' == tag.string:  # link text is compared in its HTML-entity form
+                return self.index_to_soup(tag['href'], raw=True)
 
     def preprocess_html(self, soup):
         for a in soup('a'):
             if a.has_key('href') and not a['href'].startswith('http'):
                 if '/gry/' in a['href']:
-                    a['href']='http://www.gry.gildia.pl' + a['href']
+                    a['href'] = 'http://www.gry.gildia.pl' + a['href']
                 elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower():
-                    a['href']='http://www.literatura.gildia.pl' + a['href']
+                    a['href'] = 'http://www.literatura.gildia.pl' + a['href']
                 elif u'komiks' in soup.title.string.lower():
-                    a['href']='http://www.literatura.gildia.pl' + a['href']
+                    a['href'] = 'http://www.literatura.gildia.pl' + a['href']
                 else:
-                    a['href']='http://www.gildia.pl' + a['href']
-        return soup
+                    a['href'] = 'http://www.gildia.pl' + a['href']
+        return soup
\ No newline at end of file
diff --git a/recipes/media2.recipe b/recipes/media2.recipe
index 135740a62e..d685a90803 100644
--- a/recipes/media2.recipe
+++ b/recipes/media2.recipe
@@ -3,33 +3,29 @@
 __license__ = 'GPL v3'
 __copyright__ = 'teepel'
 
-'''
-media2.pl
-'''
-
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class media2_pl(BasicNewsRecipe):
     title = u'Media2'
     __author__ = 'teepel <teepel44@gmail.com>'
     language = 'pl'
-    description =u'Media2.pl to jeden z najczęściej odwiedzanych serwisów dla profesjonalistów z branży medialnej, telekomunikacyjnej, public relations oraz nowych technologii.'
-    masthead_url='http://media2.pl/res/logo/www.png'
-    remove_empty_feeds= True
-    oldest_article = 1
+    description = u'Media2.pl to jeden z najczęściej odwiedzanych serwisów dla profesjonalistów z branży medialnej, telekomunikacyjnej, public relations oraz nowych technologii.'
+    masthead_url = 'http://media2.pl/res/logo/www.png'
+    cover_url = 'http://media2.pl/res/logo/www.png'
+    remove_empty_feeds = True
+    oldest_article = 7
     max_articles_per_feed = 100
-    remove_javascript=True
-    no_stylesheets=True
-    simultaneous_downloads = 5
-
+    remove_javascript = True
+    no_stylesheets = True
+    remove_attributes = ['style']
+    ignore_duplicate_articles = {'title', 'url'}
     extra_css = '''.news-lead{font-weight: bold; }'''
 
-    keep_only_tags =[]
-    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'news-item tpl-big'}))
+    keep_only_tags = [dict(name = 'div', attrs = {'class' : 'news-item tpl-big'})]
+    remove_tags = [dict(name = 'span', attrs = {'class' : 'news-comments'}), dict(name = 'div', attrs = {'class' : 'item-sidebar'}), dict(name = 'div', attrs = {'class' : 'news-tags'})]
 
-    remove_tags =[]
-    remove_tags.append(dict(name = 'span', attrs = {'class' : 'news-comments'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'item-sidebar'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'news-tags'}))
-
-    feeds          = [(u'Media2', u'http://feeds.feedburner.com/media2')]
+    feeds = [(u'Media2', u'http://feeds.feedburner.com/media2'), (u'Internet', u'http://feeds.feedburner.com/media2/internet'),
+            (u'Media', 'http://feeds.feedburner.com/media2/media'), (u'Telekomunikacja', 'http://feeds.feedburner.com/media2/telekomunikacja'),
+            (u'Reklama/PR', 'http://feeds.feedburner.com/media2/reklama-pr'), (u'Technologie', 'http://feeds.feedburner.com/media2/technologie'),
+            (u'Badania', 'http://feeds.feedburner.com/media2/badania')
+            ]
\ No newline at end of file
diff --git a/recipes/nauka_w_polsce.recipe b/recipes/nauka_w_polsce.recipe
index 715780d162..2a44aa7e84 100644
--- a/recipes/nauka_w_polsce.recipe
+++ b/recipes/nauka_w_polsce.recipe
@@ -1,7 +1,7 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
 class NaukawPolsce(BasicNewsRecipe):
-    title = u'Nauka w Polsce'
+    title = u'PAP Nauka w Polsce'
     __author__ = 'fenuks'
     description = u'Serwis Nauka w Polsce ma za zadanie popularyzację polskiej nauki. Można na nim znaleźć wiadomości takie jak: osiągnięcia polskich naukowców, wydarzenia na polskich uczelniach, osiągnięcia studentów, konkursy dla badaczy, staże i stypendia naukowe, wydarzenia w polskiej nauce, kalendarium wydarzeń w nauce, materiały wideo o nauce.'
     category = 'science'
diff --git a/recipes/polter_pl.recipe b/recipes/polter_pl.recipe
index 1f9cef3be3..aea21dca9c 100644
--- a/recipes/polter_pl.recipe
+++ b/recipes/polter_pl.recipe
@@ -3,7 +3,7 @@ import re
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Poltergeist(BasicNewsRecipe):
-    title          = u'Poltergeist'
+    title          = u'Polter.pl'
     __author__        = 'fenuks'
     description   = u'Największy polski serwis poświęcony ogólno pojętej fantastyce - grom fabularnym (RPG), książkom, filmowi, komiksowi, grom planszowym, karcianym i bitewnym.'
     category       = 'fantasy, books, rpg, games'
diff --git a/recipes/ppe_pl.recipe b/recipes/ppe_pl.recipe
index 2edc611ad7..597c9ef2d3 100644
--- a/recipes/ppe_pl.recipe
+++ b/recipes/ppe_pl.recipe
@@ -1,41 +1,35 @@
 #!/usr/bin/env  python
 
 __license__ = 'GPL v3'
-
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class ppeRecipe(BasicNewsRecipe):
     __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
     language = 'pl'
-
     title = u'ppe.pl'
     category = u'News'
     description = u'Portal o konsolach i grach wideo.'
-    cover_url=''
-    remove_empty_feeds= True
-    no_stylesheets=True
-    oldest_article = 1
-    max_articles_per_feed = 100000
-    recursions = 0
+    extra_css = '.categories > li {list-style: none; display: inline;} .galmini > li {list-style: none; float: left;} .calibre_navbar {clear: both;}'
+    remove_empty_feeds = True
     no_stylesheets = True
+    oldest_article = 7
+    max_articles_per_feed = 100
     remove_javascript = True
-    simultaneous_downloads = 2
+    remove_attributes = ['style']
+
+    # keep_only_tags selects elements with class 'box'; remove_tags lists the classes to strip
+    keep_only_tags = [dict(attrs={'class':'box'})]
+    remove_tags = [dict(attrs={'class':['voltage-1', 'voltage-2', 'encyklopedia', 'nag', 'related', 'comment_form', 'komentarze-box']})]
 
-    keep_only_tags =[]
-    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'news-heading'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'tresc-poziom'}))
+    feeds = [
+            ('Newsy', 'http://ppe.pl/rss.html'),
+            ('Recenzje', 'http://ppe.pl/rss-recenzje.html'),
+            ('Publicystyka', 'http://ppe.pl/rss-publicystyka.html'),
+            ]
 
-    remove_tags =[]
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'bateria1'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'bateria2'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'bateria3'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'news-photo'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'fbl'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'info'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'links'}))
-
-    remove_tags.append(dict(name = 'div', attrs = {'style' : 'padding: 4px'}))
-
-    feeds          = [
-                            ('Newsy', 'feed://ppe.pl/rss/rss.xml'),
-                           ]
+    def get_cover_url(self):  # build the cover URL by scraping the psx_extreme page
+        soup = self.index_to_soup('http://www.ppe.pl/psx_extreme.html')
+        part = soup.find(attrs={'class':'archiwum-foto'})['style']  # inline style of the first 'archiwum-foto' element; raises if none is found
+        part = re.search("'(.+)'", part).group(1).replace('_min', '')  # single-quoted text inside the style, '_min' removed (presumably a thumbnail suffix -- confirm)
+        return 'http://www.ppe.pl' + part
diff --git a/recipes/pure_pc.recipe b/recipes/pure_pc.recipe
index 13d9307a09..167136c90f 100644
--- a/recipes/pure_pc.recipe
+++ b/recipes/pure_pc.recipe
@@ -1,3 +1,4 @@
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Comment
 
@@ -11,6 +12,7 @@ class PurePC(BasicNewsRecipe):
     language       = 'pl'
     masthead_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg'
     cover_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg'
+    extra_css = '.wykres_logo {float: left; margin-right: 5px;}'
     no_stylesheets = True
     keep_only_tags= [dict(id='content')]
     remove_tags_after= dict(attrs={'class':'fivestar-widget'})
@@ -19,11 +21,14 @@ class PurePC(BasicNewsRecipe):
 
 
     def append_page(self, soup, appendtag):
-        nexturl= appendtag.find(attrs={'class':'pager-next'})
-        if nexturl:
-            while nexturl:
-                soup2 = self.index_to_soup('http://www.purepc.pl'+ nexturl.a['href'])
-                nexturl=soup2.find(attrs={'class':'pager-next'})
+        lasturl = appendtag.find(attrs={'class':'pager-last'})
+        # raw string keeps \d a regex escape; with no pager link or no match there is nothing to append
+        regex = re.search(r'(.+?2C)(\d+)', lasturl.a['href']) if lasturl else None
+        if regex:
+            baseurl = 'http://www.purepc.pl' + regex.group(1).replace('?page=0%2C', '?page=1%2C')
+            nr = int(regex.group(2))  # number captured from the pager-last link
+            for page_nr in range(1, nr+1):
+                soup2 = self.index_to_soup(baseurl+str(page_nr))
                 pagetext = soup2.find(attrs={'class':'article'})
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
@@ -35,4 +40,4 @@ class PurePC(BasicNewsRecipe):
 
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
-        return soup
+        return soup
\ No newline at end of file