Updated Polish news sources

2025-07-08 18:54:09 -04:00 · 2013-04-05 08:08:00 +05:30 · 2013-04-05 08:08:00 +05:30 · e8e38f4748
commit e8e38f4748
parent 1e97aa0a9e da3c080baa
18 changed files with 186 additions and 154 deletions
--- a/.bzrignore
+++ b/.bzrignore
@ -40,6 +40,7 @@ recipes/.gitignore
 recipes/README.md
 recipes/icon_checker.py
 recipes/readme_updater.py
+recipes/garfield.recipe
 recipes/katalog_egazeciarz.recipe
 recipes/tv_axnscifi.recipe
 recipes/tv_comedycentral.recipe
@ -63,6 +64,7 @@ recipes/tv_tvppolonia.recipe
 recipes/tv_tvpuls.recipe
 recipes/tv_viasathistory.recipe
 recipes/icons/katalog_egazeciarz.png
+recipes/icons/garfield.png
 recipes/icons/tv_axnscifi.png
 recipes/icons/tv_comedycentral.png
 recipes/icons/tv_discoveryscience.png
--- a/recipes/esensja_(rss).recipe
+++ b/recipes/esensja_(rss).recipe
@ -12,12 +12,6 @@ class EsensjaRSS(BasicNewsRecipe):
    language       = 'pl'
    encoding = 'utf-8'
    INDEX = 'http://www.esensja.pl'
-    extra_css = '''.t-title {font-size: x-large; font-weight: bold; text-align: left}
-                    .t-author {font-size: x-small; text-align: left}
-                    .t-title2 {font-size: x-small; font-style: italic; text-align: left}
-                    .text {font-size: small; text-align: left}
-                    .annot-ref {font-style: italic; text-align: left}
-                    '''
    cover_url = ''
    masthead_url = 'http://esensja.pl/img/wrss.gif'
    use_embedded_content = False
--- a/recipes/forbes_pl.recipe
+++ b/recipes/forbes_pl.recipe
@ -0,0 +1,53 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import datetime
+import re
+
+class forbes_pl(BasicNewsRecipe):
+    """Calibre news recipe for Forbes.pl, driven by the site's single RSS feed."""
+
+    title = u'Forbes.pl'
+    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>'
+    language = 'pl'
+    description = u'Biznes, finanse, gospodarka, strategie, wiadomości gospodarcze, analizy finasowe i strategiczne.'
+    oldest_article = 1
+    index = 'http://www.forbes.pl'
+    cover_url = 'http://www.forbes.pl/resources/front/images/logo.png'
+    max_articles_per_feed = 100
+    extra_css = '.Block-Photo {float:left; max-width: 300px; margin-right: 5px;}'
+    # Two clean-up substitutions, both replacing the match with nothing:
+    #  1. whole <p> paragraphs starting with "Czytaj/Zobacz też|także:"
+    #  2. <strong>Zobacz: ...</strong> teasers
+    preprocess_regexps = [
+        (re.compile(ur'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL), lambda match: ''),
+        (re.compile(ur'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: ''),
+    ]
+    remove_javascript = True
+    no_stylesheets = True
+    # NOTE(review): now / yesterday / pages_count / index are not read by
+    # anything in this recipe; they are kept so the class attributes stay
+    # the same. `now` is evaluated once, when the class body is executed.
+    now = datetime.datetime.now()
+    yesterday = now - datetime.timedelta(hours=24)
+    yesterday = yesterday.strftime("%d.%m.%Y %H:%M:%S")
+    pages_count = 4
+    # The trailing space in 'Block-Node Content-Article ' is part of the
+    # matched class value - presumably it mirrors the site markup; verify
+    # before "tidying" it.
+    keep_only_tags = [dict(attrs={'class':['Block-Node Content-Article ', 'Block-Node Content-Article piano-closed']})]
+    remove_tags = [dict(attrs={'class':['Keywords Styled', 'twitter-share-button', 'Block-List-Related Block-List']})]
+
+    feeds = [(u'Wszystkie', 'http://www.forbes.pl/rss')]
+
+    # NOTE(review): this recipe defines no preprocess_html/append_page hook,
+    # so follow-up pages of paginated articles are presumably not merged in.
+    # If pagination support is added, iterate over each tag returned by
+    # findAll() (the result list itself has no findAll) and import Comment
+    # from calibre.ebooks.BeautifulSoup before filtering comment nodes.
--- a/recipes/gazeta_pl_krakow.recipe
+++ b/recipes/gazeta_pl_krakow.recipe
@ -10,7 +10,7 @@ krakow.gazeta.pl
 from calibre.web.feeds.news import BasicNewsRecipe

 class gw_krakow(BasicNewsRecipe):
-    title          = u'Gazeta.pl Kraków'
+    title          = u'Gazeta Wyborcza Kraków'
    __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
    language       = 'pl'
    description =u'Wiadomości z Krakowa na portalu Gazeta.pl.'
--- a/recipes/gazeta_pl_szczecin.recipe
+++ b/recipes/gazeta_pl_szczecin.recipe
@ -5,7 +5,7 @@ import string
 from calibre.web.feeds.news import BasicNewsRecipe

 class GazetaPlSzczecin(BasicNewsRecipe):
-    title          = u'Gazeta.pl Szczecin'
+    title          = u'Gazeta Wyborcza Szczecin'
    description    = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
    __author__     = u'Michał Szkutnik'
    __license__    = u'GPL v3'
--- a/recipes/gazeta_pl_warszawa.recipe
+++ b/recipes/gazeta_pl_warszawa.recipe
@ -10,7 +10,7 @@ warszawa.gazeta.pl
 from calibre.web.feeds.news import BasicNewsRecipe

 class gw_wawa(BasicNewsRecipe):
-    title          = u'Gazeta.pl Warszawa'
+    title          = u'Gazeta Wyborcza Warszawa'
    __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
    language       = 'pl'
    description ='Wiadomości z Warszawy na portalu Gazeta.pl.'
--- a/recipes/gazeta_wyborcza.recipe
+++ b/recipes/gazeta_wyborcza.recipe
@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Comment

 class Gazeta_Wyborcza(BasicNewsRecipe):
-    title = u'Gazeta.pl'
+    title = u'Gazeta Wyborcza'
    __author__ = 'fenuks, Artur Stachecki'
    language = 'pl'
    description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
--- a/recipes/icons/forbes_pl.png
+++ b/recipes/icons/forbes_pl.png
--- a/recipes/icons/gazeta_pl_krakow.png
+++ b/recipes/icons/gazeta_pl_krakow.png
--- a/recipes/icons/gazeta_pl_szczecin.png
+++ b/recipes/icons/gazeta_pl_szczecin.png
--- a/recipes/icons/gazeta_pl_warszawa.png
+++ b/recipes/icons/gazeta_pl_warszawa.png
--- a/recipes/icons/gazeta_wyborcza.png
+++ b/recipes/icons/gazeta_wyborcza.png
--- a/recipes/icons/slashdot.png
+++ b/recipes/icons/slashdot.png
--- a/recipes/icons/sportowefakty.png
+++ b/recipes/icons/sportowefakty.png
--- a/recipes/icons/wysokie_obcasy.png
+++ b/recipes/icons/wysokie_obcasy.png
--- a/recipes/sportowefakty.recipe
+++ b/recipes/sportowefakty.recipe
@ -0,0 +1,70 @@
+#!/usr/bin/env  python
+
+__license__ = 'GPL v3'
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.magick import Image
+
+class sportowefakty(BasicNewsRecipe):
+    """Calibre news recipe for SportoweFakty, one RSS feed per discipline."""
+
+    title          = u'SportoweFakty'
+    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>, Tomasz Długosz <tomek3d@gmail.com>'
+    language       = 'pl'
+    description    = u'Najważniejsze informacje sportowe z kraju i ze świata, relacje, komentarze, wywiady, zdjęcia!'
+    oldest_article = 1
+    masthead_url='http://www.sportowefakty.pl/images/logo.png'
+    max_articles_per_feed = 100
+    simultaneous_downloads = 5
+    use_embedded_content=False
+    remove_javascript=True
+    no_stylesheets=True
+    ignore_duplicate_articles = {'title', 'url'}
+
+    keep_only_tags = [dict(attrs = {'class' : 'box-article'})]
+    # Drop elements whose class starts with "newsStream" and anything that
+    # carries target="_blank".
+    remove_tags = [
+                      dict(attrs = {'class' : re.compile(r'^newsStream')}),
+                      dict(attrs = {'target' : '_blank'}),
+                  ]
+
+    feeds          = [
+                      (u'Piłka Nożna', u'http://www.sportowefakty.pl/pilka-nozna/index.rss'),
+                      (u'Koszykówka', u'http://www.sportowefakty.pl/koszykowka/index.rss'),
+                      (u'Żużel', u'http://www.sportowefakty.pl/zuzel/index.rss'),
+                      (u'Siatkówka', u'http://www.sportowefakty.pl/siatkowka/index.rss'),
+                      (u'Zimowe', u'http://www.sportowefakty.pl/zimowe/index.rss'),
+                      (u'Hokej', u'http://www.sportowefakty.pl/hokej/index.rss'),
+                      (u'Moto', u'http://www.sportowefakty.pl/moto/index.rss'),
+                      (u'Tenis', u'http://www.sportowefakty.pl/tenis/index.rss')
+                     ]
+
+    def get_article_url(self, article):
+        """Return the entry's link with any '?utm...' suffix cut off.
+
+        Returns None when the feed entry has no 'link' value.
+        """
+        link = article.get('link', None)
+        if link is None:
+            # Without this guard the membership test below raises TypeError.
+            return None
+        if 'utm_source' in link:
+            return link.split('?utm')[0]
+        return link
+
+    def print_version(self, url):
+        """Return the print URL: the article URL with '/drukuj' appended."""
+        print_url = url + '/drukuj'
+        return print_url
+
+    def preprocess_html(self, soup):
+        """Replace text-only links with their plain text.
+
+        Returns None instead of the soup when the first <h1> contains
+        'Fotorelacja'.
+        NOTE(review): returning None presumably makes calibre drop the
+        page - confirm against BasicNewsRecipe.
+        """
+        head = soup.find('h1')
+        # A page without any <h1> is treated as a regular article.
+        if head is not None and 'Fotorelacja' in self.tag_to_string(head):
+            return None
+        for alink in soup.findAll('a'):
+            # .string is None for links that wrap more than a single text
+            # node; those are left untouched.
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+        return soup
+
+    def postprocess_html(self, soup, first):
+        """Re-save every <img> that has a src as a grayscale image."""
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+            iurl = tag['src']
+            img = Image()
+            img.open(iurl)
+            # NOTE(review): this compares the Image object itself with 0;
+            # kept unchanged - confirm whether it can ever be true.
+            if img < 0:
+                raise RuntimeError('Out of memory')
+            img.type = "GrayscaleType"
+            # Written back to the same location the image was opened from.
+            img.save(iurl)
+        return soup
--- a/recipes/wyborcza_duzy_format.recipe
+++ b/recipes/wyborcza_duzy_format.recipe
@ -1,144 +0,0 @@
-#!/usr/bin/env  python
-
-from calibre.web.feeds.recipes import BasicNewsRecipe
-
-class GazetaWyborczaDuzyForma(BasicNewsRecipe):
-    cover_url             = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
-    title                 = u"Gazeta Wyborcza Duzy Format"
-    __author__            = 'ravcio - rlelusz[at]gmail.com'
-    description           = u"Articles from Gazeta's website"
-    language              = 'pl'
-    max_articles_per_feed = 50  #you can increade it event up to maybe 600, should still work
-    recursions            = 0
-    encoding              = 'iso-8859-2'
-    no_stylesheets        = True
-    remove_javascript     = True
-    use_embedded_content  = False
-
-
-    keep_only_tags    = [
-            dict(name='div', attrs={'id':['k1']})
-                ]
-
-    remove_tags = [
-            dict(name='div', attrs={'class':['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']})
-            ,dict(name='div', attrs={'id':['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']})
-            ,dict(name='ul', attrs={'id':['articleToolbar']})
-            ,dict(name='img', attrs={'class':['brand']})
-            ,dict(name='h5', attrs={'class':['author']})
-            ,dict(name='h6', attrs={'class':['date']})
-            ,dict(name='p', attrs={'class':['txt_upl']})
-                ]
-
-    remove_tags_after = [
-            dict(name='div', attrs={'id':['Str']})                #nawigator numerow linii
-                ]
-
-    def load_article_links(self, url, count):
-        print '--- load_article_links', url, count
-
-		#page with link to articles
-        soup = self.index_to_soup(url)
-
-		#table with articles
-        list = soup.find('div', attrs={'class':'GWdalt'})
-
-		#single articles (link, title, ...)
-        links = list.findAll('div', attrs={'class':['GWdaltE']})
-
-        if len(links) < count:
-            #load links to more articles...
-
-			#remove new link
-            pages_nav = list.find('div', attrs={'class':'pages'})
-            next = pages_nav.find('a', attrs={'class':'next'})
-            if next:
-                print 'next=', next['href']
-                url = 'http://wyborcza.pl' + next['href']
-                #e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'
-
-                older_links = self.load_article_links(url, count - len(links))
-                links.extend(older_links)
-
-        return links
-
-
-    #produce list of articles to download
-    def parse_index(self):
-        print '--- parse_index'
-
-        max_articles = 8000
-        links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)
-
-        ans = []
-        key = None
-        articles = {}
-
-        key = 'Uncategorized'
-        articles[key] = []
-
-        for div_art in links:
-            div_date = div_art.find('div', attrs={'class':'kL'})
-            div = div_art.find('div', attrs={'class':'kR'})
-
-            a = div.find('a', href=True)
-
-            url = a['href']
-            title = a.string
-            description = ''
-            pubdate = div_date.string.rstrip().lstrip()
-            summary = div.find('span', attrs={'class':'lead'})
-
-            desc = summary.find('a', href=True)
-            if desc:
-                desc.extract()
-
-            description = self.tag_to_string(summary, use_alt=False)
-            description = description.rstrip().lstrip()
-
-            feed = key if key is not None else 'Duzy Format'
-
-            if not articles.has_key(feed):
-                articles[feed] = []
-
-            if description != '':  # skip just pictures atricle
-                articles[feed].append(
-                                   dict(title=title, url=url, date=pubdate,
-                                        description=description,
-                                        content=''))
-
-        ans = [(key, articles[key])]
-        return ans
-
-    def append_page(self, soup, appendtag, position):
-        pager = soup.find('div',attrs={'id':'Str'})
-        if pager:
-			#seek for 'a' element with nast value (if not found exit)
-            list = pager.findAll('a')
-
-            for elem in list:
-                if 'nast' in elem.string:
-                    nexturl = elem['href']
-
-                    soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)
-
-                    texttag = soup2.find('div', attrs={'id':'artykul'})
-
-                    newpos = len(texttag.contents)
-                    self.append_page(soup2,texttag,newpos)
-                    texttag.extract()
-                    appendtag.insert(position,texttag)
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body, 3)
-
-        # finally remove some tags
-        pager = soup.find('div',attrs={'id':'Str'})
-        if pager:
-           pager.extract()
-
-        pager = soup.find('div',attrs={'class':'tylko_int'})
-        if pager:
-           pager.extract()
-
-        return soup
--- a/recipes/wysokie_obcasy.recipe
+++ b/recipes/wysokie_obcasy.recipe
@ -0,0 +1,57 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class WysokieObcasyRecipe(BasicNewsRecipe):
+    """Calibre news recipe for Wysokie Obcasy, read from its RSS feed."""
+
+    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
+    language = 'pl'
+    version = 1
+
+    title = u'Wysokie Obcasy'
+    publisher = 'Agora SA'
+    description = u'Serwis sobotniego dodatku do Gazety Wyborczej'
+    category='magazine'
+    publication_type = 'magazine'
+    # Placeholder; get_cover_url() overwrites it when a cover image is found.
+    cover_url=''
+    remove_empty_feeds= True
+    no_stylesheets=True
+    oldest_article = 7
+    max_articles_per_feed = 100000
+    recursions = 0
+
+    remove_javascript = True
+    simultaneous_downloads = 5
+
+    keep_only_tags = [dict(name = 'div', attrs = {'id' : 'article'})]
+
+    remove_tags = [
+                      dict(name = 'img'),
+                      dict(name = 'p', attrs = {'class' : 'info'}),
+                  ]
+
+    extra_css = '''
+                    body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
+                    h1{text-align: left;}
+                       '''
+
+    feeds          = [
+                            ('Wszystkie Artykuly', 'feed://www.wysokieobcasy.pl/pub/rss/wysokieobcasy.xml'),
+                          ]
+
+    def print_version(self,url):
+        """Build the '/2029020,' print URL from an article URL.
+
+        The URL is split on commas and its second and third pieces are
+        reused. A URL with fewer than three pieces is returned unchanged
+        instead of raising IndexError.
+        """
+        segments = url.split(',')
+        if len(segments) < 3:
+            return url
+        baseURL='http://www.wysokieobcasy.pl/wysokie-obcasy'
+        subPath= '/2029020,'
+        printVerString = segments[1] + ',' + segments[2]
+        return baseURL + subPath + printVerString + '.html'
+
+    def get_cover_url(self):
+        """Return the cover image URL taken from the index page.
+
+        Looks for the first <img> inside the element with class
+        'holder_cr'. If either is missing, or the image has no src, the
+        current cover_url is returned unchanged.
+        """
+        soup = self.index_to_soup('http://www.wysokieobcasy.pl/wysokie-obcasy/0,0.html')
+        holder = soup.find(attrs={'class':'holder_cr'})
+        img = holder.find('img') if holder is not None else None
+        if img is not None and img.get('src'):
+            self.cover_url = img['src']
+        return self.cover_url