new Polish news sources

2025-08-30 23:00:21 -04:00 · 2013-06-17 09:53:53 +02:00 · 2013-06-17 09:53:53 +02:00 · b08854e60a
commit b08854e60a
parent 1e5ce66ca3
2 changed files with 153 additions and 0 deletions
--- a/recipes/cdrinfo_pl.recipe
+++ b/recipes/cdrinfo_pl.recipe
@ -0,0 +1,65 @@
+__license__ = 'GPL v3'
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
+class cdrinfo(BasicNewsRecipe):
+    title          = u'CDRinfo.pl'
+    __author__        = 'fenuks'
+    description   = u'Serwis poświęcony archiwizacji danych. Testy i recenzje nagrywarek. Programy do nagrywania płyt.  Dyski twarde, dyski SSD i serwery sieciowe NAS. Rankingi dyskow twardych, najszybsze dyski twarde, newsy, artykuły, testy, recenzje, porady, oprogramowanie. Zestawienie nagrywarek, najnowsze biosy do nagrywarek, programy dla dysków twardych.'
+    category       = 'it, hardware'
+    #publication_type = ''
+    language       = 'pl'
+    #encoding = ''
+    #extra_css = ''
+    cover_url = 'http://www.cdrinfo.pl/gfx/graph3/top.jpg'
+    #masthead_url = ''
+    use_embedded_content = False
+    oldest_article = 777
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_empty_feeds = True
+    remove_javascript = True
+    remove_attributes = ['style']
+    preprocess_regexps = [(re.compile(u'<p[^>]*?>Uprzejmie prosimy o przestrzeganie netykiety.+?www\.gravatar\.com</a>\.</p>', re.DOTALL), lambda match: '')]
+    ignore_duplicate_articles = {'title', 'url'}
+
+    keep_only_tags = [dict(name='input', attrs={'name':'ref'}), dict(id='text')]
+    remove_tags = [dict(attrs={'class':['navigation', 'sociable']}), dict(name='hr'), dict(id='respond')]
+    remove_tags_after = dict(id='artnawigacja')
+    feeds = [(u'Wiadomości', 'http://feeds.feedburner.com/cdrinfo'), (u'Recenzje', 'http://www.cdrinfo.pl/rss/rss_recenzje.php'),
+            (u'Konsole', 'http://konsole.cdrinfo.pl/rss/rss_konsole_news.xml'),
+            (u'Pliki', 'http://www.cdrinfo.pl/rss/rss_pliki.xml')
+            ]
+
+    def preprocess_html(self, soup):
+        if soup.find(id='artnawigacja'):
+            self.append_page(soup, soup.body)
+        return soup
+        
+    def append_page(self, soup, appendtag):
+        baseurl = 'http://cdrinfo.pl' + soup.find(name='input', attrs={'name':'ref'})['value'] + '/'
+        if baseurl[-2] == '/':
+            baseurl = baseurl[:-1]
+        tag = soup.find(id='artnawigacja')
+        div = tag.find('div', attrs={'align':'right'})
+        while div:
+            counter = 0
+            while counter < 5:
+                try:
+                    soup2 = self.index_to_soup(baseurl+div.a['href'])
+                    break
+                except:
+                    counter += 1
+            tag2 = soup2.find(id='artnawigacja')
+            div = tag2.find('div', attrs={'align':'right'})
+            pagetext = soup2.find(attrs={'class':'art'})
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
+            for r in soup2.findAll(attrs={'class':'star-rating'}):
+                r.extract()
+            for r in soup2.findAll(attrs={'class':'star-rating2'}):
+                r.extract()
+            pos = len(appendtag.contents)
+            appendtag.insert(pos, pagetext)
+        tag.extract()
--- a/recipes/gazeta_pl_bydgoszcz.recipe
+++ b/recipes/gazeta_pl_bydgoszcz.recipe
@ -0,0 +1,88 @@
+#!/usr/bin/env  python
+
+__license__ = 'GPL v3'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
+import re
+class gw_bydgoszcz(BasicNewsRecipe):
+    title          = u'Gazeta Wyborcza Bydgoszcz'
+    __author__ = 'fenuks'
+    language       = 'pl'
+    description = 'Wiadomości z Bydgoszczy na portalu Gazeta.pl.'
+    category = 'newspaper'
+    publication_type = 'newspaper'
+    masthead_url = 'http://bi.gazeta.pl/im/3/4089/m4089863.gif'
+    INDEX = 'http://bydgoszcz.gazeta.pl'
+    cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif'
+    remove_empty_feeds = True
+    oldest_article = 3
+    max_articles_per_feed = 100
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = False
+    ignore_duplicate_articles = {'title', 'url'}
+
+    #rules for gazeta.pl
+    preprocess_regexps = [(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')]
+    keep_only_tags = [dict(id='gazeta_article')]
+    remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})]
+    remove_tags_after = dict(id='gazeta_article_body')
+
+    feeds          = [(u'Wiadomości', u'http://rss.feedsportal.com/c/32739/f/530239/index.rss')]
+
+    def print_version(self, url):
+        if 'feedsportal.com' in url:
+            s = url.rpartition('gazeta0Bpl')
+            u = s[2]
+            if not s[0]:
+                u = url.rpartition('wyborcza0Bpl')[2]
+            u = u.replace('/l/', '/')
+            u = u.replace('/ia1.htm', '')
+            u = u.replace('0Dbo0F1', '')
+            u = u.replace('/story01.htm', '')
+            u = u.replace('0C', '/')
+            u = u.replace('A', '')
+            u = u.replace('0E', '-')
+            u = u.replace('0H', ',')
+            u = u.replace('0I', '_')
+            u = u.replace('0B', '.')
+            u = self.INDEX + u
+            return u
+        else:
+            return url
+
+    def preprocess_html(self, soup):
+        tag = soup.find(id='Str')
+        if soup.find(attrs={'class': 'piano_btn_1'}):
+            return None
+        elif tag and tag.findAll('a'):
+            self.append_page(soup, soup.body)
+        return soup
+        
+    def append_page(self, soup, appendtag):
+        loop = False
+        tag = soup.find('div', attrs={'id': 'Str'})
+        try:
+            baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content']
+        except:
+            return 1
+        link = tag.findAll('a')[-1]
+        while link:
+            soup2 = self.index_to_soup(baseurl + link['href'])
+            link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1]
+            if not u'następne' in link.string:
+                link = ''
+            pagetext = soup2.find(id='artykul')
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
+            pos = len(appendtag.contents)
+            appendtag.insert(pos, pagetext)
+        tag.extract()
+
+    def image_url_processor(self, baseurl, url):
+        if url.startswith(' '):
+            return url.strip()
+        else:
+            return url