From 632ae65855f1c14b75b74e0812c9a875d19af6c5 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 21 Mar 2012 08:52:11 +0530
Subject: [PATCH] Updated various Polish recipes

---
 recipes/android_com_pl.recipe |  1 +
 recipes/cgm_pl.recipe         | 13 ++++++++-----
 recipes/elektroda_pl.recipe   | 16 ++++++++++++++++
 recipes/film_web.recipe       |  2 +-
 recipes/gram_pl.recipe        | 36 +++++++++++++++++++++++++++++++++---
 recipes/naczytniki.recipe     |  6 +++---
 recipes/overclock_pl.recipe   | 23 +++++------------------
 recipes/palmtop_pl.recipe     |  4 +++-
 recipes/pc_arena.recipe       | 36 +++++++++++++++---------------------
 recipes/pc_centre_pl.recipe   | 35 +++++++----------------------------
 recipes/tablety_pl.recipe     |  7 ++++---
 recipes/wnp.recipe            |  9 +++++++--
 12 files changed, 103 insertions(+), 85 deletions(-)

diff --git a/recipes/android_com_pl.recipe b/recipes/android_com_pl.recipe
index a44d5e560a..c7a4a97d3c 100644
--- a/recipes/android_com_pl.recipe
+++ b/recipes/android_com_pl.recipe
@@ -6,6 +6,7 @@ class Android_com_pl(BasicNewsRecipe):
     description = 'Android.com.pl - biggest polish Android site'
     category = 'Android, mobile'
     language = 'pl'
+    use_embedded_content=True
     cover_url =u'http://upload.wikimedia.org/wikipedia/commons/thumb/d/d7/Android_robot.svg/220px-Android_robot.svg.png'
     oldest_article = 8
     max_articles_per_feed = 100
diff --git a/recipes/cgm_pl.recipe b/recipes/cgm_pl.recipe
index 673a9f940b..4ab4402c3a 100644
--- a/recipes/cgm_pl.recipe
+++ b/recipes/cgm_pl.recipe
@@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
 class CGM(BasicNewsRecipe):
     title = u'CGM'
@@ -17,9 +18,9 @@ class CGM(BasicNewsRecipe):
     remove_tags_before=dict(id='mainContent')
     remove_tags_after=dict(name='div', attrs={'class':'fbContainer'})
     remove_tags=[dict(name='div', attrs={'class':'fbContainer'}),
-        dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}),
-        dict(id=['movieShare', 'container'])]
-    feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'), (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'),
+        dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}),
+        dict(id=['movieShare', 'container'])]
+    feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'), (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'),
             (u'Recenzje', u'http://www.cgm.pl/rss,1,news.xml')]
 
 
@@ -33,10 +34,12 @@ class CGM(BasicNewsRecipe):
                 img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')]
                 gallery.contents[1].name='img'
                 gallery.contents[1]['src']=img
+                pos = len(gallery.contents)
+                gallery.insert(pos, BeautifulSoup('<br/>'))
         for item in soup.findAll(style=True):
             del item['style']
         ad=soup.findAll('a')
         for r in ad:
-            if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']:
+            if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']:
                 r.extract()
-        return soup
\ No newline at end of file
+        return soup
diff --git a/recipes/elektroda_pl.recipe b/recipes/elektroda_pl.recipe
index c2123cb8cf..55858020ad 100644
--- a/recipes/elektroda_pl.recipe
+++ b/recipes/elektroda_pl.recipe
@@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
 class Elektroda(BasicNewsRecipe):
     title = u'Elektroda'
@@ -13,3 +14,18 @@ class Elektroda(BasicNewsRecipe):
     remove_tags_after=dict(name='td', attrs={'class':'spaceRow'})
     remove_tags=[dict(name='a', attrs={'href':'#top'})]
     feeds = [(u'Elektroda', u'http://www.elektroda.pl/rtvforum/rss.php')]
+
+
+    def preprocess_html(self, soup):
+        tag=soup.find('span', attrs={'class':'postbody'})
+        if tag:
+            pos = len(tag.contents)
+            tag.insert(pos, BeautifulSoup('<br/>'))
+        return soup
+
+    def parse_feeds (self):
+        feeds = BasicNewsRecipe.parse_feeds(self)
+        for feed in feeds:
+            for article in feed.articles[:]:
+                article.title=article.title[article.title.find("::")+3:]
+        return feeds
diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe
index 0671deec6c..877d4472bc 100644
--- a/recipes/film_web.recipe
+++ b/recipes/film_web.recipe
@@ -13,7 +13,7 @@ class Filmweb_pl(BasicNewsRecipe):
     remove_empty_feeds=True
     extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
     remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})]
-    keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})]
+    keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
     feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'),
            (u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
            (u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'),
diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe
index c8655dc9cd..07927796c0 100644
--- a/recipes/gram_pl.recipe
+++ b/recipes/gram_pl.recipe
@@ -9,12 +9,12 @@ class Gram_pl(BasicNewsRecipe):
     oldest_article = 8
     max_articles_per_feed = 100
     no_stylesheets= True
-    extra_css = 'h2 {font-style: italic; font-size:20px;}'
+    extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
     cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
     remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])]
     keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})]
-    feeds = [(u'gram.pl - informacje', u'http://www.gram.pl/feed_news.asp'),
-           (u'gram.pl - publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')]
+    feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
+           (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')]
 
     def parse_feeds (self):
         feeds = BasicNewsRecipe.parse_feeds(self)
@@ -23,3 +23,33 @@ class Gram_pl(BasicNewsRecipe):
                 if 'REKLAMA SKLEP' in article.title.upper() or u'ARTYKUŁ:' in article.title.upper():
                     feed.articles.remove(article)
         return feeds
+
+    def append_page(self, soup, appendtag):
+        nexturl = appendtag.find('a', attrs={'class':'cpn'})
+        while nexturl:
+            soup2 = self.index_to_soup('http://www.gram.pl'+ nexturl['href'])
+            r=appendtag.find(id='pgbox')
+            if r:
+                r.extract()
+            pagetext = soup2.find(attrs={'class':'main'})
+            r=pagetext.find('h1')
+            if r:
+                r.extract()
+            r=pagetext.find('h2')
+            if r:
+                r.extract()
+            for r in pagetext.findAll('script'):
+                r.extract()
+            pos = len(appendtag.contents)
+            appendtag.insert(pos, pagetext)
+            nexturl = appendtag.find('a', attrs={'class':'cpn'})
+        r=appendtag.find(id='pgbox')
+        if r:
+            r.extract()
+
+    def preprocess_html(self, soup):
+        self.append_page(soup, soup.body)
+        tag=soup.findAll(name='div', attrs={'class':'picbox'})
+        for t in tag:
+            t['style']='float: left;'
+        return soup
\ No newline at end of file
diff --git a/recipes/naczytniki.recipe b/recipes/naczytniki.recipe
index 2ae6bc391e..3d1a8b6095 100644
--- a/recipes/naczytniki.recipe
+++ b/recipes/naczytniki.recipe
@@ -7,12 +7,12 @@ class naczytniki(BasicNewsRecipe):
     cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png'
     language = 'pl'
     description ='everything about e-readers'
-    category='readers'
+    category='e-readers'
    no_stylesheets=True
+    use_embedded_content=False
     oldest_article = 7
     max_articles_per_feed = 100
     preprocess_regexps = [(re.compile(ur'<p><br><b>Zobacz także:</b></p>.*?</body>', re.DOTALL), lambda match: '</body>') ]
-    remove_tags_after= dict(name='div', attrs={'class':'sociable'})
     keep_only_tags=[dict(name='div', attrs={'class':'post'})]
     remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})]
-    feeds = [(u'Wpisy', u'http://naczytniki.pl/?feed=rss2')]
+    feeds = [(u'Wpisy', u'http://naczytniki.pl/?feed=rss2')]
\ No newline at end of file
diff --git a/recipes/overclock_pl.recipe b/recipes/overclock_pl.recipe
index d7f4c8093d..953dee67eb 100644
--- a/recipes/overclock_pl.recipe
+++ b/recipes/overclock_pl.recipe
@@ -17,21 +17,8 @@ class Overclock_pl(BasicNewsRecipe):
     remove_tags=[dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})]
     feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'), (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')]
 
-
-    def append_page(self, soup, appendtag):
-        tag=soup.find(id='navigation')
-        if tag:
-            nexturl=tag.findAll('option')
-            tag.extract()
-            for nextpage in nexturl[2:]:
-                soup2 = self.index_to_soup(nextpage['value'])
-                pagetext = soup2.find(id='content')
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)
-            rem=appendtag.find(attrs={'alt':'Pierwsza'})
-            if rem:
-                rem.parent.extract()
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        return soup
\ No newline at end of file
+    def print_version(self, url):
+        if 'articles/show' in url:
+            return url.replace('show', 'showall')
+        else:
+            return url
\ No newline at end of file
diff --git a/recipes/palmtop_pl.recipe b/recipes/palmtop_pl.recipe
index ace772e7e7..87da5d0d1c 100644
--- a/recipes/palmtop_pl.recipe
+++ b/recipes/palmtop_pl.recipe
@@ -10,5 +10,7 @@ class palmtop_pl(BasicNewsRecipe):
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
-
+    use_embedded_content=True
+    #remove_tags_before=dict(name='h2')
+    #remove_tags_after=dict(attrs={'class':'entry clearfix'})
     feeds = [(u'Newsy', u'http://palmtop.pl/feed/atom/')]
diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe
index faefeb25c0..952db30c3e 100644
--- a/recipes/pc_arena.recipe
+++ b/recipes/pc_arena.recipe
@@ -1,31 +1,25 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 class PC_Arena(BasicNewsRecipe):
     title = u'PCArena'
-    oldest_article = 18300
+    oldest_article = 7
     max_articles_per_feed = 100
     __author__ = 'fenuks'
     description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
     category = 'IT'
     language = 'pl'
-    masthead_url='http://pcarena.pl/public/design/frontend/images/logo.gif'
-    cover_url= 'http://pcarena.pl/public/design/frontend/images/logo.gif'
+    masthead_url='http://pcarena.pl/pcarena/img/logo.png'
+    cover_url= 'http://pcarena.pl/pcarena/img/logo.png'
     no_stylesheets = True
-    keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})]
-    remove_tags=[dict(attrs={'class':'pages'})]
-    feeds = [(u'Newsy', u'http://pcarena.pl/misc/rss/news'), (u'Artyku\u0142y', u'http://pcarena.pl/misc/rss/articles')]
+    remove_empty_feeds=True
+    #keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})]
+    #remove_tags=[dict(attrs={'class':'pages'})]
+    feeds = [(u'Aktualności', u'http://pcarena.pl/aktualnosci/feeds.rss'), (u'Testy', u'http://pcarena.pl/testy/feeds.rss'), (u'Software', u'http://pcarena.pl/oprogramowanie/feeds.rss'), (u'Poradniki', u'http://pcarena.pl/poradniki/feeds.rss'), (u'Mobile', u'http://pcarena.pl/mobile/feeds.rss')]
+
+    def print_version(self, url):
+        return url.replace('show', 'print')
 
-    def append_page(self, soup, appendtag):
-        tag=soup.find(name='div', attrs={'class':'pagNum'})
-        if tag:
-            nexturl=tag.findAll('a')
-            tag.extract()
-            for nextpage in nexturl[1:]:
-                nextpage= 'http://pcarena.pl' + nextpage['href']
-                soup2 = self.index_to_soup(nextpage)
-                pagetext = soup2.find(attrs={'class':'artBody'})
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        return soup
\ No newline at end of file
+    def image_url_processor(self, baseurl, url):
+        if 'http' not in url:
+            return 'http://pcarena.pl' + url
+        else:
+            return url
\ No newline at end of file
diff --git a/recipes/pc_centre_pl.recipe b/recipes/pc_centre_pl.recipe
index 68a17888ce..f4eccd70a0 100644
--- a/recipes/pc_centre_pl.recipe
+++ b/recipes/pc_centre_pl.recipe
@@ -10,32 +10,11 @@ class PC_Centre(BasicNewsRecipe):
     masthead_url= 'http://pccentre.pl/views/images/logo.gif'
     cover_url= 'http://pccentre.pl/views/images/logo.gif'
     no_stylesheets = True
-    keep_only_tags= [dict(id='content')]
-    remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
-    feeds = [(u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')]
+    remove_empty_feeds = True
+    #keep_only_tags= [dict(id='content')]
+    #remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
+    remove_tags=[dict(attrs={'class':'logo_print'})]
+    feeds = [(u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')]
 
-
-    def append_page(self, soup, appendtag):
-        tag=soup.find(name='div', attrs={'class':'pages'})
-        if tag:
-            nexturl=tag.findAll('a')
-            tag.extract()
-            for nextpage in nexturl[:-1]:
-                nextpage= 'http://pccentre.pl' + nextpage['href']
-                soup2 = self.index_to_soup(nextpage)
-                pagetext = soup2.find(id='content')
-                rem=pagetext.findAll(attrs={'class':['subtitle', 'content_info', 'list_of_content', 'pages', 'social2', 'pcc_acc', 'pcc_acc_na']})
-                for r in rem:
-                    r.extract()
-                rem=pagetext.findAll(id='comments')
-                for r in rem:
-                    r.extract()
-                rem=pagetext.findAll('h1')
-                for r in rem:
-                    r.extract()
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        return soup
\ No newline at end of file
+    def print_version(self, url):
+        return url.replace('show', 'print')
\ No newline at end of file
diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe
index f4c1efa9b8..1c3f46f967 100644
--- a/recipes/tablety_pl.recipe
+++ b/recipes/tablety_pl.recipe
@@ -8,10 +8,11 @@ class Tablety_pl(BasicNewsRecipe):
     cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
     category = 'IT'
     language = 'pl'
+    use_embedded_content=True
     oldest_article = 8
     max_articles_per_feed = 100
     preprocess_regexps = [(re.compile(ur'<p><strong>Przeczytaj także.*?</strong></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><strong>Przeczytaj koniecznie.*?</strong></p>', re.DOTALL), lambda match: '')]
-    remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
-    remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'})
-    remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})]
+    #remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
+    #remove_tags_after=dict(name="footer", attrs={'class':'entry-footer clearfix'})
+    #remove_tags=[dict(name='footer', attrs={'class':'entry-footer clearfix'}), dict(name='div', attrs={'class':'entry-comment-counter'})]
     feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]
diff --git a/recipes/wnp.recipe b/recipes/wnp.recipe
index e53e4cc66b..ee87112437 100644
--- a/recipes/wnp.recipe
+++ b/recipes/wnp.recipe
@@ -1,5 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-
+import re
 
 class AdvancedUserRecipe1312886443(BasicNewsRecipe):
     title = u'WNP'
@@ -8,10 +8,11 @@ class AdvancedUserRecipe1312886443(BasicNewsRecipe):
     description = u'Wirtualny Nowy Przemysł'
     category = 'economy'
     language = 'pl'
+    preprocess_regexps = [(re.compile(ur'Czytaj też:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'Czytaj więcej:.*?', re.DOTALL), lambda match: '')]
     oldest_article = 8
     max_articles_per_feed = 100
     no_stylesheets= True
-    keep_only_tags = dict(name='div', attrs={'id':'contentText'})
+    remove_tags=[dict(attrs={'class':'printF'})]
     feeds = [(u'Wiadomości gospodarcze', u'http://www.wnp.pl/rss/serwis_rss.xml'),
         (u'Serwis Energetyka - Gaz', u'http://www.wnp.pl/rss/serwis_rss_1.xml'),
         (u'Serwis Nafta - Chemia', u'http://www.wnp.pl/rss/serwis_rss_2.xml'),
@@ -19,3 +20,7 @@
         (u'Serwis Górnictwo', u'http://www.wnp.pl/rss/serwis_rss_4.xml'),
         (u'Serwis Logistyka', u'http://www.wnp.pl/rss/serwis_rss_5.xml'),
         (u'Serwis IT', u'http://www.wnp.pl/rss/serwis_rss_6.xml')]
+
+
+    def print_version(self, url):
+        return 'http://wnp.pl/drukuj/' +url[url.find(',')+1:]
\ No newline at end of file