Updated Polish news sources
@@ -40,6 +40,7 @@ recipes/.gitignore
 recipes/README.md
 recipes/icon_checker.py
 recipes/readme_updater.py
+recipes/garfield.recipe
 recipes/katalog_egazeciarz.recipe
 recipes/tv_axnscifi.recipe
 recipes/tv_comedycentral.recipe
@@ -63,6 +64,7 @@ recipes/tv_tvppolonia.recipe
 recipes/tv_tvpuls.recipe
 recipes/tv_viasathistory.recipe
 recipes/icons/katalog_egazeciarz.png
+recipes/icons/garfield.png
 recipes/icons/tv_axnscifi.png
 recipes/icons/tv_comedycentral.png
 recipes/icons/tv_discoveryscience.png
@@ -12,12 +12,6 @@ class EsensjaRSS(BasicNewsRecipe):
     language = 'pl'
     encoding = 'utf-8'
     INDEX = 'http://www.esensja.pl'
-    extra_css = '''.t-title {font-size: x-large; font-weight: bold; text-align: left}
-    .t-author {font-size: x-small; text-align: left}
-    .t-title2 {font-size: x-small; font-style: italic; text-align: left}
-    .text {font-size: small; text-align: left}
-    .annot-ref {font-style: italic; text-align: left}
-    '''
     cover_url = ''
     masthead_url = 'http://esensja.pl/img/wrss.gif'
     use_embedded_content = False
recipes/forbes_pl.recipe  (new file, 53 lines)
@@ -0,0 +1,53 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe
import datetime
import re


class forbes_pl(BasicNewsRecipe):
    title = u'Forbes.pl'
    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>'
    language = 'pl'
    description = u'Biznes, finanse, gospodarka, strategie, wiadomości gospodarcze, analizy finansowe i strategiczne.'
    oldest_article = 1
    index = 'http://www.forbes.pl'
    cover_url = 'http://www.forbes.pl/resources/front/images/logo.png'
    max_articles_per_feed = 100
    extra_css = '.Block-Photo {float:left; max-width: 300px; margin-right: 5px;}'
    preprocess_regexps = [(re.compile(ur'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: '')]
    remove_javascript = True
    no_stylesheets = True
    now = datetime.datetime.now()
    yesterday = now - datetime.timedelta(hours=24)
    yesterday = yesterday.strftime("%d.%m.%Y %H:%M:%S")
    pages_count = 4
    keep_only_tags = [dict(attrs={'class':['Block-Node Content-Article ', 'Block-Node Content-Article piano-closed']})]
    remove_tags = [dict(attrs={'class':['Keywords Styled', 'twitter-share-button', 'Block-List-Related Block-List']})]

    feeds = [(u'Wszystkie', 'http://www.forbes.pl/rss')]

    '''def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup

    def append_page(self, soup, appendtag):
        cleanup = False
        nexturl = appendtag.find('a', attrs={'class':'next'})
        if nexturl:
            cleanup = True
        while nexturl:
            soup2 = self.index_to_soup(self.index + nexturl['href'])
            nexturl = soup2.find('a', attrs={'class':'next'})
            pagetext = soup2.findAll(id='article-body-wrapper')
            if not pagetext:
                pagetext = soup2.findAll(attrs={'class':'Article-Entry Styled'})
            for comment in pagetext.findAll(text=lambda text:isinstance(text, Comment)):
                comment.extract()
            pos = len(appendtag.contents)
            appendtag.insert(pos, pagetext)
        if cleanup:
            for r in appendtag.findAll(attrs={'class':'paginator'}):
                r.extract()'''
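Note on the disabled pagination block above: as committed, it could not run even if uncommented, because `Comment` is never imported and `findAll()` returns a result set, which itself has no `findAll()` method. A minimal corrected sketch, assuming the same `article-body-wrapper` / `Article-Entry Styled` markup on forbes.pl:

```python
from calibre.ebooks.BeautifulSoup import Comment

def append_page(self, soup, appendtag):
    # Walk the 'next' links and splice each page's article body
    # onto the end of the first page.
    cleanup = False
    nexturl = appendtag.find('a', attrs={'class': 'next'})
    if nexturl:
        cleanup = True
    while nexturl:
        soup2 = self.index_to_soup(self.index + nexturl['href'])
        nexturl = soup2.find('a', attrs={'class': 'next'})
        fragments = soup2.findAll(id='article-body-wrapper')
        if not fragments:
            fragments = soup2.findAll(attrs={'class': 'Article-Entry Styled'})
        # iterate over the result set; it has no findAll() of its own
        for pagetext in fragments:
            for comment in pagetext.findAll(
                    text=lambda text: isinstance(text, Comment)):
                comment.extract()
            appendtag.insert(len(appendtag.contents), pagetext)
    if cleanup:
        # the paginator is useless once everything is on one page
        for r in appendtag.findAll(attrs={'class': 'paginator'}):
            r.extract()
```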
@@ -10,7 +10,7 @@ krakow.gazeta.pl
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class gw_krakow(BasicNewsRecipe):
-    title = u'Gazeta.pl Kraków'
+    title = u'Gazeta Wyborcza Kraków'
     __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
     language = 'pl'
     description = u'Wiadomości z Krakowa na portalu Gazeta.pl.'
@@ -5,7 +5,7 @@ import string
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class GazetaPlSzczecin(BasicNewsRecipe):
-    title = u'Gazeta.pl Szczecin'
+    title = u'Gazeta Wyborcza Szczecin'
     description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
     __author__ = u'Michał Szkutnik'
     __license__ = u'GPL v3'
@@ -10,7 +10,7 @@ warszawa.gazeta.pl
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class gw_wawa(BasicNewsRecipe):
-    title = u'Gazeta.pl Warszawa'
+    title = u'Gazeta Wyborcza Warszawa'
     __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
     language = 'pl'
     description = 'Wiadomości z Warszawy na portalu Gazeta.pl.'
@@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Comment
 
 class Gazeta_Wyborcza(BasicNewsRecipe):
-    title = u'Gazeta.pl'
+    title = u'Gazeta Wyborcza'
     __author__ = 'fenuks, Artur Stachecki'
     language = 'pl'
     description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
BIN  recipes/icons/forbes_pl.png  (new file, 1.2 KiB)
BIN  modified icons (×4)  Before: 802 B | After: 294 B
BIN  recipes/icons/slashdot.png  (new file, 250 B)
BIN  recipes/icons/sportowefakty.png  (new file, 511 B)
BIN  recipes/icons/wysokie_obcasy.png  (new file, 205 B)
recipes/sportowefakty.recipe  (new file, 70 lines)
@@ -0,0 +1,70 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image


class sportowefakty(BasicNewsRecipe):
    title = u'SportoweFakty'
    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>, Tomasz Długosz <tomek3d@gmail.com>'
    language = 'pl'
    description = u'Najważniejsze informacje sportowe z kraju i ze świata, relacje, komentarze, wywiady, zdjęcia!'
    oldest_article = 1
    masthead_url = 'http://www.sportowefakty.pl/images/logo.png'
    max_articles_per_feed = 100
    simultaneous_downloads = 5
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    ignore_duplicate_articles = {'title', 'url'}

    keep_only_tags = [dict(attrs={'class': 'box-article'})]
    remove_tags = []
    remove_tags.append(dict(attrs={'class': re.compile(r'^newsStream')}))
    remove_tags.append(dict(attrs={'target': '_blank'}))

    feeds = [
        (u'Piłka Nożna', u'http://www.sportowefakty.pl/pilka-nozna/index.rss'),
        (u'Koszykówka', u'http://www.sportowefakty.pl/koszykowka/index.rss'),
        (u'Żużel', u'http://www.sportowefakty.pl/zuzel/index.rss'),
        (u'Siatkówka', u'http://www.sportowefakty.pl/siatkowka/index.rss'),
        (u'Zimowe', u'http://www.sportowefakty.pl/zimowe/index.rss'),
        (u'Hokej', u'http://www.sportowefakty.pl/hokej/index.rss'),
        (u'Moto', u'http://www.sportowefakty.pl/moto/index.rss'),
        (u'Tenis', u'http://www.sportowefakty.pl/tenis/index.rss')
    ]

    def get_article_url(self, article):
        link = article.get('link', None)
        if 'utm_source' in link:
            return link.split('?utm')[0]
        else:
            return link

    def print_version(self, url):
        print_url = url + '/drukuj'
        return print_url

    def preprocess_html(self, soup):
        head = soup.find('h1')
        if 'Fotorelacja' in self.tag_to_string(head):
            return None
        else:
            for alink in soup.findAll('a'):
                if alink.string is not None:
                    tstr = alink.string
                    alink.replaceWith(tstr)
            return soup

    def postprocess_html(self, soup, first):
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            if img < 0:
                raise RuntimeError('Out of memory')
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup
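`get_article_url` in the recipe above strips Google Analytics tracking parameters from feed links, so `ignore_duplicate_articles` can match the same story across feeds; `print_version` then simply appends `/drukuj` to fetch the print view. The rule is easy to check standalone (the sample URL below is made up for illustration):

```python
def strip_tracking(link):
    # same rule as sportowefakty.get_article_url: drop everything from '?utm'
    if 'utm_source' in link:
        return link.split('?utm')[0]
    return link

# hypothetical feed link, for illustration only
url = 'http://www.sportowefakty.pl/pilka-nozna/artykul?utm_source=rss&utm_medium=feed'
print strip_tracking(url)               # http://www.sportowefakty.pl/pilka-nozna/artykul
print strip_tracking(url) + '/drukuj'   # what print_version would download
```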
@@ -1,144 +0,0 @@
#!/usr/bin/env python

from calibre.web.feeds.recipes import BasicNewsRecipe


class GazetaWyborczaDuzyForma(BasicNewsRecipe):
    cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
    title = u"Gazeta Wyborcza Duzy Format"
    __author__ = 'ravcio - rlelusz[at]gmail.com'
    description = u"Articles from Gazeta's website"
    language = 'pl'
    max_articles_per_feed = 50  # you can increase it even up to maybe 600, should still work
    recursions = 0
    encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

    keep_only_tags = [
        dict(name='div', attrs={'id': ['k1']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']}),
        dict(name='div', attrs={'id': ['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']}),
        dict(name='ul', attrs={'id': ['articleToolbar']}),
        dict(name='img', attrs={'class': ['brand']}),
        dict(name='h5', attrs={'class': ['author']}),
        dict(name='h6', attrs={'class': ['date']}),
        dict(name='p', attrs={'class': ['txt_upl']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'id': ['Str']})  # page-number navigator
    ]

    def load_article_links(self, url, count):
        print '--- load_article_links', url, count

        # page with links to articles
        soup = self.index_to_soup(url)

        # table with articles
        list = soup.find('div', attrs={'class': 'GWdalt'})

        # single articles (link, title, ...)
        links = list.findAll('div', attrs={'class': ['GWdaltE']})

        if len(links) < count:
            # load links to more articles...

            # remove new link
            pages_nav = list.find('div', attrs={'class': 'pages'})
            next = pages_nav.find('a', attrs={'class': 'next'})
            if next:
                print 'next=', next['href']
                url = 'http://wyborcza.pl' + next['href']
                # e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'

                older_links = self.load_article_links(url, count - len(links))
                links.extend(older_links)

        return links

    # produce list of articles to download
    def parse_index(self):
        print '--- parse_index'

        max_articles = 8000
        links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)

        ans = []
        key = None
        articles = {}

        key = 'Uncategorized'
        articles[key] = []

        for div_art in links:
            div_date = div_art.find('div', attrs={'class': 'kL'})
            div = div_art.find('div', attrs={'class': 'kR'})

            a = div.find('a', href=True)

            url = a['href']
            title = a.string
            description = ''
            pubdate = div_date.string.rstrip().lstrip()
            summary = div.find('span', attrs={'class': 'lead'})

            desc = summary.find('a', href=True)
            if desc:
                desc.extract()

            description = self.tag_to_string(summary, use_alt=False)
            description = description.rstrip().lstrip()

            feed = key if key is not None else 'Duzy Format'

            if not articles.has_key(feed):
                articles[feed] = []

            if description != '':  # skip picture-only articles
                articles[feed].append(
                    dict(title=title, url=url, date=pubdate,
                         description=description,
                         content=''))

        ans = [(key, articles[key])]
        return ans

    def append_page(self, soup, appendtag, position):
        pager = soup.find('div', attrs={'id': 'Str'})
        if pager:
            # look for an 'a' element containing 'nast' (next); if not found, exit
            list = pager.findAll('a')

            for elem in list:
                if 'nast' in elem.string:
                    nexturl = elem['href']

                    soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)

                    texttag = soup2.find('div', attrs={'id': 'artykul'})

                    newpos = len(texttag.contents)
                    self.append_page(soup2, texttag, newpos)
                    texttag.extract()
                    appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3)

        # finally remove some tags
        pager = soup.find('div', attrs={'id': 'Str'})
        if pager:
            pager.extract()

        pager = soup.find('div', attrs={'class': 'tylko_int'})
        if pager:
            pager.extract()

        return soup
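For reference, the removed recipe built its index by recursing through the listing's 'next' pager until it had collected the requested number of article entries. Condensed to its skeleton (selectors as in the removed code; the method name here is illustrative):

```python
def collect_links(self, url, count):
    # one listing page worth of article entries...
    soup = self.index_to_soup(url)
    listing = soup.find('div', attrs={'class': 'GWdalt'})
    links = listing.findAll('div', attrs={'class': ['GWdaltE']})
    if len(links) < count:
        # ...then recurse into the next page until enough are collected
        nxt = listing.find('div', attrs={'class': 'pages'}).find(
            'a', attrs={'class': 'next'})
        if nxt:
            links.extend(self.collect_links(
                'http://wyborcza.pl' + nxt['href'], count - len(links)))
    return links
```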
recipes/wysokie_obcasy.recipe  (new file, 57 lines)
@@ -0,0 +1,57 @@
#!/usr/bin/env python
__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe

class WysokieObcasyRecipe(BasicNewsRecipe):
    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
    language = 'pl'
    version = 1

    title = u'Wysokie Obcasy'
    publisher = 'Agora SA'
    description = u'Serwis sobotniego dodatku do Gazety Wyborczej'
    category = 'magazine'
    language = 'pl'
    publication_type = 'magazine'
    cover_url = ''
    remove_empty_feeds = True
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 100000
    recursions = 0

    no_stylesheets = True
    remove_javascript = True
    simultaneous_downloads = 5

    keep_only_tags = []
    keep_only_tags.append(dict(name='div', attrs={'id': 'article'}))

    remove_tags = []
    remove_tags.append(dict(name='img'))
    remove_tags.append(dict(name='p', attrs={'class': 'info'}))

    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
        h1 {text-align: left;}
    '''

    feeds = [
        ('Wszystkie Artykuly', 'feed://www.wysokieobcasy.pl/pub/rss/wysokieobcasy.xml'),
    ]

    def print_version(self, url):
        baseURL = 'http://www.wysokieobcasy.pl/wysokie-obcasy'
        segments = url.split(',')
        subPath = '/2029020,'
        articleURL1 = segments[1]
        articleURL2 = segments[2]
        printVerString = articleURL1 + ',' + articleURL2
        s = baseURL + subPath + printVerString + '.html'
        return s

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.wysokieobcasy.pl/wysokie-obcasy/0,0.html')
        self.cover_url = soup.find(attrs={'class': 'holder_cr'}).find('img')['src']
        return getattr(self, 'cover_url', self.cover_url)
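`print_version` above keeps the second and third comma-separated segments of the article URL and grafts them onto the print path under the `/2029020,` prefix. Assuming a typical Gazeta.pl-style link with four comma segments (the ids below are invented for illustration), the rewrite behaves like this:

```python
def print_version(url):
    # same transformation as WysokieObcasyRecipe.print_version
    baseURL = 'http://www.wysokieobcasy.pl/wysokie-obcasy'
    segments = url.split(',')
    return baseURL + '/2029020,' + segments[1] + ',' + segments[2] + '.html'

# hypothetical article URL, for illustration only
url = 'http://www.wysokieobcasy.pl/wysokie-obcasy/1,96856,17305635,Tytul.html'
print print_version(url)
# -> http://www.wysokieobcasy.pl/wysokie-obcasy/2029020,96856,17305635.html
```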