Various updated Polish recipes

2025-07-09 03:04:10 -04:00 · 2012-07-27 01:09:49 +05:30 · 2012-07-27 01:09:49 +05:30 · 3afc065c2a
commit 3afc065c2a
parent 8822ef28f9
5 changed files with 30 additions and 21 deletions
--- a/recipes/benchmark_pl.recipe
+++ b/recipes/benchmark_pl.recipe
@@ -1,6 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
-class Benchmark_pl(BasicNewsRecipe):
+class BenchmarkPl(BasicNewsRecipe):
    title          = u'Benchmark.pl'
    __author__        = 'fenuks'
    description   = u'benchmark.pl -IT site'
@@ -14,7 +14,7 @@ class Benchmark_pl(BasicNewsRecipe):
    preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')]
    keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})]
    remove_tags_after=dict(name='div', attrs={'class':'body'})
-    remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
+    remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
    INDEX= 'http://www.benchmark.pl'
    feeds          = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), 
                          (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')]
--- a/recipes/film_web.recipe
+++ b/recipes/film_web.recipe
@@ -1,6 +1,7 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-
-class Filmweb_pl(BasicNewsRecipe):
+import re
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+class FilmWebPl(BasicNewsRecipe):
    title          = u'FilmWeb'
    __author__        = 'fenuks'
    description   = 'FilmWeb - biggest polish movie site'
@@ -12,8 +13,9 @@ class Filmweb_pl(BasicNewsRecipe):
    max_articles_per_feed = 100
    no_stylesheets= True
    remove_empty_feeds=True
+    preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')]
    extra_css      = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
-    remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})]
+    remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})]
    keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
    feeds          = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'),
                         (u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
@@ -31,18 +33,22 @@ class Filmweb_pl(BasicNewsRecipe):
                         (u'News / Kino polskie', u'http://www.filmweb.pl/feed/news/category/polish.cinema'),
                         (u'News / Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'),
                         (u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'),
-                         (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest')]
+                         (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest')
+                          ]

-    def skip_ad_pages(self, soup):   
+    def skip_ad_pages(self, soup):
        skip_tag = soup.find('a', attrs={'class':'welcomeScreenButton'})
        if skip_tag is not None:
-            self.log.warn('skip_tag')
-            self.log.warn(skip_tag)
            return self.index_to_soup(skip_tag['href'], raw=True)
-        
+

    def preprocess_html(self, soup):
        for a in soup('a'):
            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
                a['href']=self.index + a['href']
-        return soup
+        for i in soup.findAll('a', attrs={'class':'fn'}):
+            i.insert(len(i), BeautifulSoup('<br />'))
+        for i in soup.findAll('sup'):
+            if not i.string or i.string.startswith('(kliknij'):
+                i.extract()
+        return soup
--- a/recipes/gry_online_pl.recipe
+++ b/recipes/gry_online_pl.recipe
@@ -1,6 +1,6 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe

-class Gry_online_pl(BasicNewsRecipe):
+class GryOnlinePl(BasicNewsRecipe):
    title          = u'Gry-Online.pl'
    __author__        = 'fenuks'
    description   = 'Gry-Online.pl - computer games'
@@ -21,17 +21,18 @@ class Gry_online_pl(BasicNewsRecipe):
        tag = appendtag.find('div', attrs={'class':'n5p'})
        if tag:
            nexturls=tag.findAll('a')
-            for nexturl in nexturls[1:]:
-                try:
-                    soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href'])
-                except:
-                    soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href'])
+            url_part = soup.find('link', attrs={'rel':'canonical'})['href']
+            url_part = url_part[25:].rpartition('?')[0]
+            for nexturl in nexturls[1:-1]:
+                soup2 = self.index_to_soup('http://www.gry-online.pl/' + url_part + nexturl['href'])
                pagetext = soup2.find(attrs={'class':'gc660'})
                for r in pagetext.findAll(name='header'):
                    r.extract()
+                for r in pagetext.findAll(attrs={'itemprop':'description'}):
+                    r.extract()
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
-            for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}):
+            for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}):
                r.extract()


--- a/recipes/natemat_pl.recipe
+++ b/recipes/natemat_pl.recipe
@@ -1,3 +1,4 @@
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class NaTemat(BasicNewsRecipe):
@@ -8,8 +9,9 @@ class NaTemat(BasicNewsRecipe):
    description   = u'informacje, komentarze, opinie'
    category       = 'news'
    language       = 'pl'
+    preprocess_regexps = [(re.compile(ur'Czytaj też\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(ur'Zobacz też\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj więcej\:.*?</a>', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj również\:.*?</a>', re.IGNORECASE), lambda m: '')]
    cover_url= 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png'
    no_stylesheets = True
    keep_only_tags= [dict(id='main')]
-    remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related']})]
+    remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related', 'user-header', 'links']}), dict(name='img', attrs={'class':'indent'})]
    feeds          = [(u'Artyku\u0142y', u'http://natemat.pl/rss/wszystkie')]
--- a/recipes/wnp.recipe
+++ b/recipes/wnp.recipe
@@ -1,7 +1,7 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re

-class AdvancedUserRecipe1312886443(BasicNewsRecipe):
+class WNP(BasicNewsRecipe):
    title          = u'WNP'
    cover_url= 'http://k.wnp.pl/images/wnpLogo.gif'
    __author__        = 'fenuks'
@@ -12,7 +12,7 @@ class AdvancedUserRecipe1312886443(BasicNewsRecipe):
    oldest_article = 8
    max_articles_per_feed = 100
    no_stylesheets= True
-    remove_tags=[dict(attrs={'class':'printF'})]
+    remove_tags=[dict(attrs={'class':['printF', 'border3B2 clearfix', 'articleMenu clearfix']})]
    feeds          = [(u'Wiadomości gospodarcze', u'http://www.wnp.pl/rss/serwis_rss.xml'),
                          (u'Serwis Energetyka - Gaz', u'http://www.wnp.pl/rss/serwis_rss_1.xml'),
          (u'Serwis Nafta - Chemia', u'http://www.wnp.pl/rss/serwis_rss_2.xml'),