Various Polish news sources by fenuks

Kovid Goyal 2011-09-21 10:50:11 -06:00
parent 2f82d4a8fc
commit 02f8f08b65
14 changed files with 205 additions and 0 deletions


@@ -0,0 +1,70 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re

class Benchmark_pl(BasicNewsRecipe):
    title = u'Benchmark.pl'
    __author__ = 'fenuks'
    description = u'benchmark.pl - IT site'
    cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif'
    category = 'IT'
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    no_stylesheets = True
    preprocess_regexps = [(re.compile(ur'\bWięcej o .*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
    keep_only_tags = [dict(name='div', attrs={'class':['m_zwykly', 'gallery']})]
    remove_tags_after = dict(name='div', attrs={'class':'body'})
    remove_tags = [dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']})]
    INDEX = 'http://www.benchmark.pl'
    feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
             (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')]

    def append_page(self, soup, appendtag):
        # Follow the 'next' pagination links and append each page's body to the article.
        nexturl = soup.find('span', attrs={'class':'next'})
        while nexturl is not None:
            nexturl = self.INDEX + nexturl.parent['href']
            soup2 = self.index_to_soup(nexturl)
            nexturl = soup2.find('span', attrs={'class':'next'})
            pagetext = soup2.find(name='div', attrs={'class':'body'})
            appendtag.find('div', attrs={'class':'k_ster'}).extract()
            pos = len(appendtag.contents)
            appendtag.insert(pos, pagetext)
        if appendtag.find('div', attrs={'class':'k_ster'}) is not None:
            appendtag.find('div', attrs={'class':'k_ster'}).extract()

    def image_article(self, soup, appendtag):
        # Walk through a photo gallery: turn each preview div into an <img> tag
        # and append every gallery page to the article.
        nexturl = soup.find('div', attrs={'class':'preview'})
        if nexturl is not None:
            nexturl = nexturl.find('a', attrs={'class':'move_next'})
            image = appendtag.find('div', attrs={'class':'preview'}).div['style'][16:]
            image = self.INDEX + image[:image.find("')")]
            appendtag.find(attrs={'class':'preview'}).name = 'img'
            appendtag.find(attrs={'class':'preview'})['src'] = image
            appendtag.find('a', attrs={'class':'move_next'}).extract()
            while nexturl is not None:
                nexturl = self.INDEX + nexturl['href']
                soup2 = self.index_to_soup(nexturl)
                nexturl = soup2.find('a', attrs={'class':'move_next'})
                image = soup2.find('div', attrs={'class':'preview'}).div['style'][16:]
                image = self.INDEX + image[:image.find("')")]
                soup2.find(attrs={'class':'preview'}).name = 'img'
                soup2.find(attrs={'class':'preview'})['src'] = image
                pagetext = soup2.find('div', attrs={'class':'gallery'})
                pagetext.find('div', attrs={'class':'title'}).extract()
                pagetext.find('div', attrs={'class':'thumb'}).extract()
                pagetext.find('div', attrs={'class':'panelOcenaObserwowane'}).extract()
                if nexturl is not None:
                    pagetext.find('a', attrs={'class':'move_next'}).extract()
                pagetext.find('a', attrs={'class':'move_back'}).extract()
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)

    def preprocess_html(self, soup):
        if soup.find('div', attrs={'class':'preview'}) is not None:
            self.image_article(soup, soup.body)
        else:
            self.append_page(soup, soup.body)
        return soup
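
For context on the preprocess_regexps entry used above: each entry is a (compiled pattern, replacement callable) pair that calibre applies to the downloaded page HTML before parsing, essentially a regex substitution. A minimal sketch of the effect, using made-up sample HTML:

    import re

    # Roughly what calibre does with a preprocess_regexps entry:
    # pattern.sub(func, raw_html). The HTML below is invented for illustration.
    html = u'<body><p>Article text.</p>Więcej o sprzęcie ... related links ...</body>'
    pattern = re.compile(u'\\bWięcej o .*</body>', re.DOTALL | re.IGNORECASE)
    print(pattern.sub(lambda match: '</body>', html))
    # -> <body><p>Article text.</p></body>

The effect here is to cut the "Więcej o ..." (read more) block and everything after it from the article body.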

recipes/cgm_pl.recipe  Normal file

@@ -0,0 +1,40 @@
from calibre.web.feeds.news import BasicNewsRecipe

class CGM(BasicNewsRecipe):
    title = u'CGM'
    oldest_article = 7
    __author__ = 'fenuks'
    description = u'Codzienna Gazeta Muzyczna'
    cover_url = 'http://www.krafcy.com/foto/tinymce/Image/cgm%281%29.jpg'
    category = 'music'
    language = 'pl'
    use_embedded_content = False
    max_articles_per_feed = 100
    no_stylesheets = True
    extra_css = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;}'
    remove_tags_before = dict(id='mainContent')
    remove_tags_after = dict(name='div', attrs={'class':'fbContainer'})
    remove_tags = [dict(name='div', attrs={'class':'fbContainer'}),
                   dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}),
                   dict(id=['movieShare', 'container'])]
    feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'),
             (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'),
             (u'Recenzje', u'http://www.cgm.pl/rss,1,news.xml')]

    def preprocess_html(self, soup):
        # Known ad/banner images that should be stripped from the article body
        ad_images = ('/_vault/_article_photos/5841.jpg', '_vault/_article_photos/5807.jpg',
                     'article_photos/5841.jpg', 'article_photos/5825.jpg',
                     '_article_photos/5920.jpg', '_article_photos/5919.jpg',
                     '_article_photos/5918.jpg', '_article_photos/5914.jpg',
                     '_article_photos/5911.jpg', '_article_photos/5923.jpg',
                     '_article_photos/5921.jpg')
        for r in soup.findAll('img'):
            if any(fragment in r['src'] for fragment in ad_images):
                r.extract()
        # Replace the Flash gallery embed with a plain <img> pointing at the gallery photo
        gallery = soup.find('div', attrs={'class':'galleryFlash'})
        if gallery:
            img = gallery.find('embed')
            if img:
                img = img['src'][35:]
                img = 'http://www.cgm.pl/_vault/_gallery/_photo/' + img
                for i in gallery.findAll(name='param'):
                    i.extract()
                gallery.contents[1].name = 'img'
                gallery.contents[1]['src'] = img
        return soup

recipes/dzieje_pl.recipe  Normal file

@@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe

class Dzieje(BasicNewsRecipe):
    title = u'dzieje.pl'
    __author__ = 'fenuks'
    description = 'Dzieje - history of Poland'
    cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
    category = 'history'
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    remove_tags_before = dict(name='h1', attrs={'class':'title'})
    remove_tags_after = dict(id='dogory')
    remove_tags = [dict(id='dogory')]
    feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]


@@ -0,0 +1,13 @@
from calibre.web.feeds.news import BasicNewsRecipe

class GreenLinux(BasicNewsRecipe):
    title = u'GreenLinux.pl'
    __author__ = 'fenuks'
    category = 'IT'
    language = 'pl'
    cover_url = 'http://lh5.ggpht.com/_xd_6Y9kXhEc/S8tjyqlfhfI/AAAAAAAAAYU/zFNTp07ZQko/top.png'
    oldest_article = 15
    max_articles_per_feed = 100
    auto_cleanup = True
    feeds = [(u'Newsy', u'http://feeds.feedburner.com/greenlinux')]


@@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe

class Historia_org_pl(BasicNewsRecipe):
    title = u'Historia.org.pl'
    __author__ = 'fenuks'
    description = u'history site'
    cover_url = 'http://lh3.googleusercontent.com/_QeRQus12wGg/TOvHsZ2GN7I/AAAAAAAAD_o/LY1JZDnq7ro/logo5.jpg'
    category = 'history'
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    feeds = [(u'Artykuły', u'http://www.historia.org.pl/index.php?format=feed&type=rss')]

BIN  Binary file not shown (658 B)

BIN  recipes/icons/cgm_pl.png  Normal file  Binary file not shown (837 B)

BIN  recipes/icons/dzieje_pl.png  Normal file  Binary file not shown (642 B)

BIN  Binary file not shown (648 B)

BIN  Binary file not shown (806 B)

BIN  recipes/icons/lomza.png  Normal file  Binary file not shown (2.0 KiB)

recipes/ksiazka_pl.recipe  Normal file

@@ -0,0 +1,28 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re

class Ksiazka_net_pl(BasicNewsRecipe):
    title = u'ksiazka.net.pl'
    __author__ = 'fenuks'
    description = u'Ksiazka.net.pl - book vortal'
    cover_url = 'http://www.ksiazka.net.pl/fileadmin/templates/ksiazka.net.pl/images/1PortalKsiegarski-logo.jpg'
    category = 'books'
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    no_stylesheets = True
    #extra_css = 'img {float: right;}'
    preprocess_regexps = [(re.compile(ur'Podoba mi się, kupuję:'), lambda match: '<br />')]
    remove_tags_before = dict(name='div', attrs={'class':'m-body'})
    remove_tags_after = dict(name='div', attrs={'class':'m-body-link'})
    remove_tags = [dict(attrs={'class':['mk_library-icon', 'm-body-link', 'tagi']})]
    feeds = [(u'Wiadomości', u'http://www.ksiazka.net.pl/?id=wiadomosci&type=100'),
             (u'Książki', u'http://www.ksiazka.net.pl/?id=ksiazki&type=100'),
             (u'Rynek', u'http://www.ksiazka.net.pl/?id=rynek&type=100')]

    def image_url_processor(self, baseurl, url):
        # Rewrite broken file:// and relative image URLs to absolute URLs on the site.
        if (('file://' in url) and ('www.ksiazka.net.pl/' not in url)):
            return 'http://www.ksiazka.net.pl/' + url[8:]
        elif 'http://' not in url:
            return 'http://www.ksiazka.net.pl/' + url
        else:
            return url
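
A quick illustration of how the image_url_processor above rewrites image URLs; the sample URLs below are hypothetical and only show the three branches:

    # Standalone copy of the URL-fixing logic, exercised with made-up URLs.
    def fix_image_url(url):
        if ('file://' in url) and ('www.ksiazka.net.pl/' not in url):
            return 'http://www.ksiazka.net.pl/' + url[8:]
        elif 'http://' not in url:
            return 'http://www.ksiazka.net.pl/' + url
        return url

    print(fix_image_url('file:///fileadmin/some_cover.jpg'))   # -> http://www.ksiazka.net.pl/fileadmin/some_cover.jpg
    print(fix_image_url('fileadmin/some_cover.jpg'))           # -> http://www.ksiazka.net.pl/fileadmin/some_cover.jpg
    print(fix_image_url('http://example.com/some_cover.jpg'))  # -> unchanged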

recipes/lomza.recipe  Normal file

@@ -0,0 +1,14 @@
from calibre.web.feeds.news import BasicNewsRecipe

class Lomza(BasicNewsRecipe):
    title = u'4Lomza'
    __author__ = 'fenuks'
    description = u'4Łomża - regional site'
    cover_url = 'http://www.4lomza.pl/i/logo4lomza_m.jpg'
    language = 'pl'
    oldest_article = 15
    no_stylesheets = True
    max_articles_per_feed = 100
    remove_tags = [dict(name='div', attrs={'class':['bxbanner', 'drukuj', 'wyslijznajomemu']})]
    keep_only_tags = [dict(name='div', attrs={'class':'wiadomosc'})]
    feeds = [(u'Łomża', u'http://feeds.feedburner.com/4lomza.pl')]

recipes/tablety_pl.recipe  Normal file

@@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe

class Tablety_pl(BasicNewsRecipe):
    title = u'Tablety.pl'
    __author__ = 'fenuks'
    description = u'tablety.pl - latest tablet news'
    cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
    category = 'IT'
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]