new recipes

2025-12-17 18:45:04 -05:00 · 2013-11-01 11:14:21 +01:00 · 2013-11-01 11:14:21 +01:00 · 4c8deb0d5f
commit 4c8deb0d5f
parent d7a5118c42
2 changed files with 129 additions and 0 deletions
--- a/recipes/odkrywcy_pl.recipe
+++ b/recipes/odkrywcy_pl.recipe
@@ -0,0 +1,102 @@
+__license__ = 'GPL v3'
+import re
+import datetime
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
+
+class Odkrywcy(BasicNewsRecipe):
+    title          = u'Odkrywcy.pl'
+    __author__        = 'fenuks'
+    description   = u''
+    #publication_type = ''
+    language       = 'pl'
+    #encoding = ''
+    extra_css = 'img {display: block;}'
+    cover_url = ''
+    #masthead_url = ''
+    INDEX = 'http://odkrywcy.pl'
+    use_embedded_content = False
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_empty_feeds = True
+    remove_javascript = True
+    remove_attributes = ['style', 'font']
+    ignore_duplicate_articles = {'title', 'url'}
+
+    keep_only_tags = [dict(attrs={'class':'content'})]
+    remove_tags = [dict(name='a', attrs={'href':['#opOpinie', '#opinie']}), dict(attrs={'class':['fr', 'clra', 'close', 'wpsocial-fbFanpageBox', 'tagi', 'test']}), dict(id=['rekSrd05', 'moreTopNews']), dict(name='img', attrs={'class':'zr'}), dict(name='img', attrs={'alt':u'Następne'})]
+    remove_tags_after = dict(id='aTxt')
+    #remove_tags_before = dict()
+    feeds = [(u'', '')]
+    
+    def find_articles(self, url):
+        articles = []
+        soup = self.index_to_soup(url)
+        for i in soup.findAll(attrs={'class':'katZj clra'}):
+            tmp = i.find('small')
+            datestring = re.search('dodano: (\d{4}-\d{2}-\d{2})', tmp.string).group(1)
+            d = datetime.datetime.strptime(datestring, "%Y-%m-%d").date()
+            if (datetime.datetime.now().date() - d).days > self.oldest_article:
+                continue
+            tmp = i.find('a')
+            title = tmp.string
+            url = self.INDEX + tmp['href']
+            articles.append({'title' : title,
+                   'url'   : url,
+                   'date'  : '',
+                   'description' : ''
+                    })
+        return articles
+
+    def parse_index(self):
+         feeds = []
+         feeds.append((u'Człowiek', self.find_articles('http://odkrywcy.pl/kat,111396,name,Czlowiek,kategoria.html')))
+         feeds.append((u'Technologie', self.find_articles('http://odkrywcy.pl/kat,111398,name,Technologie,kategoria.html')))
+         feeds.append((u'Ekologia', self.find_articles('http://odkrywcy.pl/kat,111400,name,Ekologia,kategoria.html')))
+         feeds.append((u'Kosmos', self.find_articles('http://odkrywcy.pl/kat,111402,name,Kosmos,kategoria.html')))
+         feeds.append((u'Cywilizacja', self.find_articles('http://odkrywcy.pl/kat,111404,name,Cywilizacja,kategoria.html')))
+         feeds.append((u'Przyroda', self.find_articles('http://odkrywcy.pl/kat,111406,name,Przyroda,kategoria.html')))
+         feeds.append((u'Fizyka i chemia', self.find_articles('http://odkrywcy.pl/kat,111408,name,Fizyka,kategoria.html')))
+         feeds.append((u'Historia', self.find_articles('http://odkrywcy.pl/kat,122994,name,Historia,kategoria.html')))
+         feeds.append((u'Media', self.find_articles('http://odkrywcy.pl/kat,116794,name,Media,media.html')))
+
+         return feeds
+         
+    def append_page(self, soup, appendtag):
+        tag = soup.find('a', attrs={'class': 'btnNext'})
+        urls = []
+        while tag is not None:
+            if tag['href'] in urls:
+                break
+            urls.append(tag['href'])
+            soup2 = self.index_to_soup(self.INDEX + tag['href'])
+            tag = soup2.find(name='a', attrs={'class': 'btnNext'})
+            pagetext = soup2.findAll(attrs={'class':'content'})
+            for container in pagetext:
+                header = container.find(name='h1')
+                if header:
+                    header.extract()
+                for comment in container.findAll(text=lambda text:isinstance(text, Comment)):
+                    comment.extract()
+            for container in pagetext:
+                pos = len(appendtag.contents)
+                appendtag.insert(pos, container)
+        for r in appendtag.findAll(attrs={'class':'galStr'}):
+            r.extract()
+        for r in appendtag.findAll(attrs={'alt':'Następne'}):
+            r.extract()
+        for r in appendtag.findAll(attrs={'alt':'Poprzednie'}):
+            r.extract()
+        for r in appendtag.findAll(attrs={'class':'clra'}):
+            r.extract()
+        for r in appendtag.findAll(attrs={'class':'close'}):
+            r.extract()
+        for r in appendtag.findAll(attrs={'class':'tagi'}):
+            r.extract()
+        for r in appendtag.findAll(attrs={'id':'moreTopNews'}):
+            r.extract()
+
+    def preprocess_html(self, soup):
+        self.append_page(soup, soup.body)
+        return soup
--- a/recipes/znadplanszy_pl.recipe
+++ b/recipes/znadplanszy_pl.recipe
@@ -0,0 +1,27 @@
+__license__ = 'GPL v3'
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ZnadPlanszy(BasicNewsRecipe):
+    title          = u'ZnadPlanszy.pl'
+    __author__        = 'fenuks'
+    description   = u''
+    #publication_type = ''
+    language       = 'pl'
+    #encoding = ''
+    #extra_css = ''
+    cover_url = 'http://znadplanszy.pl/wp-content/uploads/2013/05/logo-znadplanszy.png'
+    #masthead_url = ''
+    use_embedded_content = False
+    oldest_article = 14
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_empty_feeds = True
+    remove_javascript = True
+    remove_attributes = ['style', 'font']
+    ignore_duplicate_articles = {'title', 'url'}
+
+    #keep_only_tags = [dict()]
+    remove_tags = [dict(attrs={'class':'rounded-container'})]
+    remove_tags_after = dict(attrs={'id':'dotEPUBcontent'})
+    remove_tags_before = dict(attrs={'class':'content units nine alpha'})
+    feeds = [(u'Wszystkie', 'http://znadplanszy.pl/full-feed/posts/')]