Various Czech and Hungarian recipes by bubak

2025-07-09 03:04:10 -04:00 · 2012-11-18 23:53:33 +05:30 · 2012-11-18 23:53:33 +05:30 · b7bd073d4a
commit b7bd073d4a
parent 6712594a3e
23 changed files with 972 additions and 0 deletions
--- a/recipes/aktualne.cz.recipe
+++ b/recipes/aktualne.cz.recipe
@ -0,0 +1,69 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import re
+
+class aktualneRecipe(BasicNewsRecipe):
+    """Recipe for aktualne.cz: several news RSS feeds plus the blog export.
+
+    An article URL that shows up in more than one feed is handed out only
+    once, and blog pages are decoded differently from the news pages.
+    """
+    __author__  = 'bubak'
+    title = u'aktualne.cz'
+    publisher = u'Centrum holdings'
+    description = 'aktuálně.cz'
+    oldest_article = 1
+    max_articles_per_feed = 20
+
+    feeds = [
+            (u'Domácí', u'http://aktualne.centrum.cz/feeds/rss/domaci/?photo=0'),
+            (u'Zprávy', u'http://aktualne.centrum.cz/feeds/rss/zpravy/?photo=0'),
+            (u'Praha', u'http://aktualne.centrum.cz/feeds/rss/domaci/regiony/praha/?photo=0'),
+            (u'Ekonomika', u'http://aktualne.centrum.cz/feeds/rss/ekonomika/?photo=0'),
+            (u'Finance', u'http://aktualne.centrum.cz/feeds/rss/finance/?photo=0'),
+            (u'Blogy a názory', u'http://blog.aktualne.centrum.cz/export-all.php')
+            ]
+
+
+    language = 'cs'
+    cover_url = 'http://img.aktualne.centrum.cz/design/akt4/o/l/logo-akt-ciste.png'
+    remove_javascript = True
+    no_stylesheets = True
+
+    remove_attributes = []
+    remove_tags_before  = dict(name='h1', attrs={'class':['titulek-clanku']})
+    filter_regexps = [r'img.aktualne.centrum.cz']
+    remove_tags = [dict(name='div',   attrs={'id':['social-bookmark']}),
+                dict(name='div', attrs={'class':['box1', 'svazane-tagy']}),
+                dict(name='div', attrs={'class':'itemcomment id0'}),
+                dict(name='div', attrs={'class':'hlavicka'}),
+                dict(name='div', attrs={'class':'hlavni-menu'}),
+                dict(name='div', attrs={'class':'top-standard-brand-obal'}),
+                dict(name='div', attrs={'class':'breadcrumb'}),
+                dict(name='div', attrs={'id':'start-standard'}),
+                dict(name='div', attrs={'id':'forum'}),
+                dict(name='span', attrs={'class':'akce'}),
+                dict(name='span', attrs={'class':'odrazka vetsi'}),
+                dict(name='div', attrs={'class':'boxP'}),
+                dict(name='div', attrs={'class':'box2'})]
+    # Each pattern matches from the opening <div> to the end of the document
+    # (DOTALL), so everything after it is replaced by a closing </body>.
+    preprocess_regexps = [
+            (re.compile(r'<div class="(contenttitle"|socialni-site|wiki|facebook-promo|facebook-like-button"|meta-akce).*', re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
+            (re.compile(r'<div class="[^"]*poutak-clanek-trojka".*', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
+
+    keep_only_tags = []
+
+    # URLs already handed out by get_article_url(); the dict is used as a set.
+    visited_urls = {}
+
+    def get_article_url(self, article):
+        """Return the article URL, or None if it is missing or already seen.
+
+        :param article: feed entry passed on to the base implementation.
+        :return: the URL on its first occurrence, otherwise None.
+        """
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if not url:
+            # The base method found no link; concatenating None into the log
+            # messages below would raise TypeError.
+            self.log.debug('Ignoring article without URL')
+            return None
+        if url in self.visited_urls:
+            self.log.debug('Ignoring duplicate: ' + url)
+            return None
+        self.visited_urls[url] = True
+        self.log.debug('Accepting: ' + url)
+        return url
+
+    def encoding(self, source):
+        """Decode a fetched page, picking the codec from its final URL.
+
+        Pages whose ``newurl`` contains 'blog.aktualne' are decoded as UTF-8,
+        all others as ISO-8859-2; undecodable bytes are replaced.
+
+        :param source: fetched data exposing ``newurl`` and ``decode``.
+        :return: the decoded text.
+        """
+        if 'blog.aktualne' in source.newurl:
+            enc = 'utf-8'
+        else:
+            enc = 'iso-8859-2'
+        self.log.debug('Called encoding ' + enc + " " + str(source.newurl))
+        return source.decode(enc, 'replace')
+
--- a/recipes/blesk.recipe
+++ b/recipes/blesk.recipe
@ -0,0 +1,55 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import re
+
+class bleskRecipe(BasicNewsRecipe):
+    """Recipe for the blesk.cz RSS feeds.
+
+    Only the ``articleContent`` container is kept, and an article URL that
+    appears in several feeds is handed out only once.
+    """
+    __author__  = 'bubak'
+    title = u'Blesk'
+    publisher = u''
+    description = 'blesk.cz'
+    oldest_article = 1
+    max_articles_per_feed = 20
+    use_embedded_content = False
+
+    feeds = [
+            (u'Zprávy', u'http://www.blesk.cz/rss/7'),
+            (u'Blesk', u'http://www.blesk.cz/rss/1'),
+            (u'Sex a tabu', u'http://www.blesk.cz/rss/2'),
+            (u'Celebrity', u'http://www.blesk.cz/rss/5'),
+            (u'Cestování', u'http://www.blesk.cz/rss/12')
+            ]
+
+
+    #encoding = 'iso-8859-2'
+    language = 'cs'
+    cover_url = 'http://img.blesk.cz/images/blesk/blesk-logo.png'
+    remove_javascript = True
+    no_stylesheets = True
+    extra_css             = """
+                            """
+
+    remove_attributes = []
+    remove_tags_before  = dict(name='div', attrs={'id':['boxContent']})
+    remove_tags_after  = dict(name='div', attrs={'class':['artAuthors']})
+    remove_tags = [dict(name='div',   attrs={'class':['link_clanek']}),
+                dict(name='div',   attrs={'id':['partHeader']}),
+                dict(name='div',   attrs={'id':['top_bottom_box', 'lista_top']})]
+    # The pattern runs to the end of the document (DOTALL), so everything from
+    # the matching <div> onwards is replaced by a closing </body>.
+    preprocess_regexps = [(re.compile(r'<div class="(textovytip|related)".*', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
+
+    keep_only_tags = [dict(name='div', attrs={'class':'articleContent'})]
+
+    # URLs already handed out by get_article_url(); the dict is used as a set.
+    visited_urls = {}
+
+    def get_article_url(self, article):
+        """Return the article URL, or None if it is missing or already seen.
+
+        :param article: feed entry passed on to the base implementation.
+        :return: the URL on its first occurrence, otherwise None.
+        """
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if not url:
+            # The base method found no link; concatenating None into the log
+            # messages below would raise TypeError.
+            self.log.debug('Ignoring article without URL')
+            return None
+        if url in self.visited_urls:
+            self.log.debug('Ignoring duplicate: ' + url)
+            return None
+        self.visited_urls[url] = True
+        self.log.debug('Accepting: ' + url)
+        return url
+
+
+
+
--- a/recipes/ceska_pozice.recipe
+++ b/recipes/ceska_pozice.recipe
@ -0,0 +1,68 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class ceskaPoziceRecipe(BasicNewsRecipe):
+    """Recipe for ceskapozice.cz that joins multi-page articles.
+
+    An article URL that appears in several feeds is handed out only once,
+    and follow-up pages reached through the bottom pager are appended to the
+    first page.
+    """
+    __author__  = 'bubak'
+    title = u'Česká pozice'
+    description = 'Česká pozice'
+    oldest_article = 2
+    max_articles_per_feed = 20
+
+    feeds = [
+        (u'Všechny články', u'http://www.ceskapozice.cz/rss.xml'),
+        (u'Domov', u'http://www.ceskapozice.cz/taxonomy/term/16/feed'),
+        (u'Chrono', u'http://www.ceskapozice.cz/chrono/feed'),
+        (u'Evropa', u'http://www.ceskapozice.cz/taxonomy/term/17/feed')
+            ]
+
+
+    language = 'cs'
+    cover_url = 'http://www.ceskapozice.cz/sites/default/files/cpozice_logo.png'
+    remove_javascript = True
+    no_stylesheets = True
+    # Prefix for the "next page" hrefs that append_page() follows.
+    domain = u'http://www.ceskapozice.cz'
+    use_embedded_content = False
+
+
+    remove_tags = [dict(name='div',   attrs={'class':['block-ad', 'region region-content-ad']}),
+               dict(name='ul',   attrs={'class':'links'}),
+               dict(name='div',   attrs={'id':['comments', 'back-to-top']}),
+               dict(name='div',   attrs={'class':['next-page']}),
+               dict(name='cite')]
+
+    keep_only_tags = [dict(name='div',   attrs={'id':'content'})]
+
+    # URLs already handed out by get_article_url(); the dict is used as a set.
+    visited_urls = {}
+
+    def get_article_url(self, article):
+        """Return the article URL, or None if it is missing or already seen.
+
+        :param article: feed entry passed on to the base implementation.
+        :return: the URL on its first occurrence, otherwise None.
+        """
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if not url:
+            # The base method found no link; concatenating None into the log
+            # messages below would raise TypeError.
+            self.log.debug('Ignoring article without URL')
+            return None
+        if url in self.visited_urls:
+            self.log.debug('Ignoring duplicate: ' + url)
+            return None
+        self.visited_urls[url] = True
+        self.log.debug('Accepting: ' + url)
+        return url
+
+    def preprocess_html(self, soup):
+        """Append any follow-up pages to the article body and return the soup."""
+        self.append_page(soup, soup.body, 3)
+        return soup
+
+    def append_page(self, soup, appendtag, position):
+        """Recursively insert the text of the following pages into ``appendtag``.
+
+        :param soup: parsed page whose ``paging-bottom`` pager is inspected.
+        :param appendtag: tag that receives the next page's ``main-body`` div.
+        :param position: index in ``appendtag`` at which the div is inserted.
+        """
+        pager = soup.find('div', attrs={'class':'paging-bottom'})
+        if not pager:
+            return
+        nextbutton = pager.find('li', attrs={'class':'pager-next'})
+        if nextbutton:
+            nexturl = self.domain + nextbutton.a['href']
+            soup2 = self.index_to_soup(nexturl)
+            texttag = soup2.find('div', attrs={'class':'main-body'})
+            if texttag is not None:
+                # Strip ads and <cite> blocks from the fetched page.
+                for it in texttag.findAll('div', attrs={'class':'region region-content-ad'}):
+                    it.extract()
+                for it in texttag.findAll('cite'):
+                    it.extract()
+                # Pull in later pages first so they end up after this
+                # page's own content.
+                newpos = len(texttag.contents)
+                self.append_page(soup2, texttag, newpos)
+                texttag.extract()
+                appendtag.insert(position, texttag)
+        # The pager itself is navigation only and is dropped from the output.
+        pager.extract()
+
--- a/recipes/ceske_noviny.recipe
+++ b/recipes/ceske_noviny.recipe
@ -0,0 +1,30 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class ceskenovinyRecipe(BasicNewsRecipe):
+    """Recipe for ceskenoviny.cz; only the 'Domácí' feed is enabled."""
+
+    # Identification
+    title = u'České Noviny'
+    description = 'ceskenoviny.cz'
+    __author__ = 'bubak'
+    language = 'cs'
+    cover_url = 'http://i4.cn.cz/grafika/cn_logo-print.gif'
+
+    # Download limits
+    oldest_article = 1
+    max_articles_per_feed = 20
+
+    # The remaining sections are listed but switched off.
+    feeds = [
+        (u'Domácí', u'http://www.ceskenoviny.cz/sluzby/rss/domov.php'),
+        # (u'Hlavní události', u'http://www.ceskenoviny.cz/sluzby/rss/index.php'),
+        # (u'Přehled zpráv', u'http://www.ceskenoviny.cz/sluzby/rss/zpravy.php'),
+        # (u'Ze světa', u'http://www.ceskenoviny.cz/sluzby/rss/svet.php'),
+        # (u'Kultura', u'http://www.ceskenoviny.cz/sluzby/rss/kultura.php'),
+        # (u'IT', u'http://www.ceskenoviny.cz/sluzby/rss/pocitace.php'),
+    ]
+
+    # Page clean-up: only the div with id 'clnk' is kept.
+    no_stylesheets = True
+    remove_javascript = True
+    remove_attributes = []
+    filter_regexps = [r'img.aktualne.centrum.cz']
+    keep_only_tags = [dict(name='div', attrs={'id': 'clnk'})]
--- a/recipes/cesky_rozhlas_6.recipe
+++ b/recipes/cesky_rozhlas_6.recipe
@ -0,0 +1,26 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class cro6Recipe(BasicNewsRecipe):
+    """Recipe for the Český rozhlas 6 export feed on rozhlas.cz."""
+
+    # Identification
+    title = u'Český rozhlas 6'
+    description = 'Český rozhlas 6'
+    __author__ = 'bubak'
+    language = 'cs'
+    cover_url = 'http://www.rozhlas.cz/img/e5/logo/cro6.png'
+
+    # Download limits
+    oldest_article = 1
+    max_articles_per_feed = 20
+
+    feeds = [
+        (u'Český rozhlas 6', u'http://www.rozhlas.cz/export/cro6/'),
+    ]
+
+    # Page clean-up: keep the div with id 'article', minus the listed elements.
+    no_stylesheets = True
+    remove_javascript = True
+    remove_attributes = []
+    keep_only_tags = [dict(name='div', attrs={'id': 'article'})]
+    remove_tags = [
+        dict(name='div', attrs={'class': ['audio-play-all', 'poradHeaders', 'actions']}),
+        dict(name='p', attrs={'class': ['para-last']}),
+    ]
--- a/recipes/demagog.cz.recipe
+++ b/recipes/demagog.cz.recipe
@ -0,0 +1,39 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import re
+
+class demagogRecipe(BasicNewsRecipe):
+    """Recipe for the demagog.cz 'Aktuality' RSS feed.
+
+    Each page is cut down to the span from the first <h1> to the
+    ``vyrok_text_after`` div, and a rule is inserted at the start of every
+    ``vyrok_suhrn`` block.
+    """
+    __author__  = 'bubak'
+    title = u'Demagog.cz'
+    publisher = u''
+    description = 'demagog.cz'
+    oldest_article = 6
+    max_articles_per_feed = 20
+    use_embedded_content = False
+    remove_empty_feeds = True
+
+    feeds = [
+            (u'Aktuality', u'http://demagog.cz/rss')
+            ]
+
+
+    #encoding = 'iso-8859-2'
+    language = 'cs'
+    cover_url = 'http://demagog.cz/content/images/demagog.cz.png'
+    remove_javascript = True
+    no_stylesheets = True
+    extra_css             = """
+                            .vyrok_suhrn{margin-top:50px; }
+                            .vyrok{margin-bottom:30px; }
+                            """
+
+    remove_tags = [dict(name='a', attrs={'class':'vyrok_odovodnenie_tgl'}),
+                dict(name='img', attrs={'class':'vyrok_fotografia'})]
+    remove_tags_before = dict(name='h1')
+    remove_tags_after = dict(name='div', attrs={'class':'vyrok_text_after'})
+    # A callable replacement is used verbatim, so backreferences such as '\1'
+    # are not expanded; the opening tag is re-emitted from the match object
+    # and followed by the <hr>.
+    preprocess_regexps = [(re.compile(r'(<div class="vyrok_suhrn">)', re.DOTALL|re.IGNORECASE), lambda match: match.group(1) + '<hr>')]
+
+
+
+
--- a/recipes/denik.cz.recipe
+++ b/recipes/denik.cz.recipe
@ -0,0 +1,36 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class ceskyDenikRecipe(BasicNewsRecipe):
+    """Recipe for denik.cz: the national feed plus the Prague regional feed."""
+
+    # Identification
+    title = u'denik.cz'
+    description = u'Český deník'
+    publisher = u''
+    __author__ = 'bubak'
+    language = 'cs'
+    cover_url = 'http://g.denik.cz/images/loga/denik.png'
+    #encoding = 'iso-8859-2'
+
+    # Download limits
+    oldest_article = 1
+    max_articles_per_feed = 20
+    use_embedded_content = False
+    remove_empty_feeds = True
+
+    # The last two sections are listed but switched off.
+    feeds = [
+        (u'Z domova', u'http://www.denik.cz/rss/z_domova.html'),
+        (u'Pražský deník - Moje Praha', u'http://prazsky.denik.cz/rss/zpravy_region.html'),
+        # (u'Zahraničí', u'http://www.denik.cz/rss/ze_sveta.html'),
+        # (u'Kultura', u'http://www.denik.cz/rss/kultura.html'),
+    ]
+
+    # Page clean-up: keep the 'content' div and cut after the author paragraph.
+    no_stylesheets = True
+    remove_javascript = True
+    extra_css = """
+                            """
+    keep_only_tags = [dict(name='div', attrs={'class': 'content'})]
+    remove_tags = []
+    #remove_tags_before = dict(name='h1')
+    remove_tags_after = dict(name='p', attrs={'class': 'clanek-autor'})
+
+
--- a/recipes/denik_referendum.recipe
+++ b/recipes/denik_referendum.recipe
@ -0,0 +1,28 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class denikReferendumRecipe(BasicNewsRecipe):
+    """Recipe for the Deník Referendum FeedBurner feed."""
+
+    # Identification
+    title = u'Den\u00edk Referendum'
+    description = ''
+    publisher = u''
+    __author__ = 'bubak'
+    language = 'cs'
+    #encoding = 'iso-8859-2'
+
+    # Download limits
+    oldest_article = 1
+    max_articles_per_feed = 20
+    use_embedded_content = False
+
+    feeds = [
+        (u'Deník Referendum', u'http://feeds.feedburner.com/DenikReferendum'),
+    ]
+
+    # Page clean-up: keep the 'content' div, cut after the 'text' div and drop
+    # the listed boxes and headings.
+    no_stylesheets = True
+    remove_javascript = True
+    remove_attributes = []
+    keep_only_tags = [dict(name='div', attrs={'id': ['content']})]
+    remove_tags_after = dict(name='div', attrs={'class': ['text']})
+    remove_tags = [
+        dict(name='div', attrs={'class': ['box boxLine', 'box noprint', 'box']}),
+        dict(name='h3', attrs={'class': 'head alt'}),
+    ]
--- a/recipes/ihned.cz.recipe
+++ b/recipes/ihned.cz.recipe
@ -0,0 +1,36 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class ihnedRecipe(BasicNewsRecipe):
+    """Recipe for the ihned.cz RSS feeds (five sections)."""
+
+    # Identification
+    title = u'iHNed.cz'
+    description = 'ihned.cz'
+    publisher = u''
+    __author__ = 'bubak'
+    language = 'cs'
+    cover_url = 'http://rss.ihned.cz/img/0/0_hp09/ihned.cz.gif'
+    #encoding = 'iso-8859-2'
+
+    # Download limits
+    oldest_article = 1
+    max_articles_per_feed = 20
+    use_embedded_content = False
+
+    feeds = [
+        (u'Zprávy', u'http://zpravy.ihned.cz/?m=rss'),
+        (u'Hospodářské noviny', u'http://hn.ihned.cz/?p=500000_rss'),
+        (u'Byznys', u'http://byznys.ihned.cz/?m=rss'),
+        (u'Life', u'http://life.ihned.cz/?m=rss'),
+        (u'Dialog', u'http://dialog.ihned.cz/?m=rss'),
+    ]
+
+    # Page clean-up: keep the span from the 'heading' div to the
+    # 'next-authors' div and drop the listed elements inside it.
+    no_stylesheets = True
+    remove_javascript = True
+    extra_css = """
+                            """
+    remove_attributes = []
+    remove_tags_before = dict(name='div', attrs={'id': ['heading']})
+    remove_tags_after = dict(name='div', attrs={'id': ['next-authors']})
+    remove_tags = [
+        dict(name='ul', attrs={'id': ['comm']}),
+        dict(name='div', attrs={'id': ['r-big']}),
+        dict(name='div', attrs={'class': ['tools tools-top']}),
+    ]
--- a/recipes/insider.recipe
+++ b/recipes/insider.recipe
@ -0,0 +1,59 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+import re
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class insider(BasicNewsRecipe):
+    """Recipe for denikinsider.cz, which requires a subscriber login.
+
+    The article list is scraped from the home page instead of an RSS feed.
+    """
+    __author__  = 'bubak'
+    title      = 'Insider'
+    # ISO 639-1 code for Czech ('cz' is not a language code).
+    language = 'cs'
+
+    remove_tags        = [dict(name='div', attrs={'class':'article-related-content'})
+                 ,dict(name='div', attrs={'class':'calendar'})
+                 ,dict(name='span', attrs={'id':'labelHolder'})
+    ]
+
+    no_stylesheets = True
+    keep_only_tags = [dict(name='div', attrs={'class':['doubleBlock textContentFormat']})]
+
+    # Everything from the text matching 'T.mata:' to the end of the document
+    # is replaced by a closing </body>.
+    preprocess_regexps = [(re.compile(r'T.mata:.*', re.DOTALL|re.IGNORECASE), lambda m: '</body>')]
+    needs_subscription = True
+
+    def get_browser(self):
+        """Return a browser that has logged in with the user's credentials.
+
+        :raises ValueError: if the page returned after submitting the login
+            form does not contain the logout link text.
+        """
+        # The base method is called on the class, so the instance has to be
+        # passed explicitly.
+        br = BasicNewsRecipe.get_browser(self)
+        br.open('http://www.denikinsider.cz/')
+        br.select_form(nr=0)
+        br['login-name'] = self.username
+        br['login-password'] = self.password
+        res = br.submit()
+        raw = res.read()
+        # NOTE(review): `raw` is presumably a byte string; confirm that this
+        # containment test against a unicode literal works for the site's
+        # page encoding.
+        if u'Odhlásit se' not in raw:
+            raise ValueError('Failed to login to insider.cz. '
+                             'Check your username and password.')
+        return br
+
+    def parse_index(self):
+        """Scrape the home page and return a single feed of its articles.
+
+        :return: ``[(self.title, articles)]`` where every article is a dict
+            with 'title', 'url', 'description' and 'date' keys.
+        :raises ValueError: if no article titles are found on the home page.
+        """
+        soup = self.index_to_soup('http://www.denikinsider.cz')
+        titles = soup.findAll('span', attrs={'class':'homepageArticleTitle'})
+        # findAll() returns an empty result rather than None when nothing
+        # matches.
+        if not titles:
+            raise ValueError('Could not find category content')
+
+        articles = []
+        seen_titles = set()
+        for title in titles:
+            if title.string in seen_titles:
+                continue
+            # The title span sits inside the tag carrying the article href.
+            article = title.parent
+            seen_titles.add(title.string)
+            url = article['href']
+            if url.startswith('/'):
+                # url already begins with '/', so the prefix must not end
+                # with one.
+                url = 'http://www.denikinsider.cz' + url
+            self.log('\tFound article:', title, 'at', url)
+            articles.append({'title':title.string, 'url':url, 'description':'',
+                            'date':''})
+        return [(self.title, articles)]
+
+
--- a/recipes/kudy_z_nudy.recipe
+++ b/recipes/kudy_z_nudy.recipe
@ -0,0 +1,32 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class kudyznudyRecipe(BasicNewsRecipe):
+    """Recipe for the kudyznudy.cz 'Praha nejnovější' chart feed."""
+
+    # Identification
+    title = u'Kudy z nudy'
+    description = 'kudyznudy.cz'
+    publisher = u''
+    __author__ = 'bubak'
+    language = 'cs'
+    cover_url = 'http://www.kudyznudy.cz/App_Themes/KzN/Images/Containers/Header/HeaderLogoKZN.png'
+    #encoding = 'iso-8859-2'
+
+    # Download limits
+    oldest_article = 3
+    max_articles_per_feed = 20
+    use_embedded_content = False
+
+    feeds = [
+        (u'Praha nejnovější', u'http://www.kudyznudy.cz/RSS/Charts.aspx?Type=Newest&Lang=cs-CZ&RegionId=1'),
+    ]
+
+    # Page clean-up: keep the span between the two marker divs and drop the
+    # listed containers.
+    no_stylesheets = True
+    remove_javascript = True
+    extra_css = """
+                            """
+    remove_attributes = []
+    keep_only_tags = []
+    remove_tags_before = dict(name='div', attrs={'class': ['C_WholeContentPadding']})
+    remove_tags_after = dict(name='div', attrs={'class': ['SurroundingsContainer']})
+    remove_tags = [
+        dict(name='div', attrs={'class': ['Details', 'buttons', 'SurroundingsContainer', 'breadcrumb']}),
+    ]
--- a/recipes/lidovky.recipe
+++ b/recipes/lidovky.recipe
@ -0,0 +1,40 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import re
+
+class lnRecipe(BasicNewsRecipe):
+    """Recipe for the lidovky.cz RSS exports (six sections)."""
+
+    # Identification
+    title = u'lidovky'
+    description = 'lidovky.cz'
+    publisher = u''
+    __author__ = 'bubak'
+    language = 'cs'
+    cover_url = 'http://g.lidovky.cz/o/lidovky_ln3b/lidovky-logo.png'
+    #encoding = 'iso-8859-2'
+
+    # Download limits
+    oldest_article = 1
+    max_articles_per_feed = 20
+    use_embedded_content = False
+
+    feeds = [
+        (u'Události', u'http://www.lidovky.cz/export/rss.asp?r=ln_domov'),
+        (u'Svět', u'http://www.lidovky.cz/export/rss.asp?r=ln_zahranici'),
+        (u'Byznys', u'http://www.lidovky.cz/export/rss.asp?c=ln_byznys'),
+        (u'Věda', u'http://www.lidovky.cz/export/rss.asp?r=ln_veda'),
+        (u'Názory', u'http://www.lidovky.cz/export/rss.asp?r=ln_nazory'),
+        (u'Relax', u'http://www.lidovky.cz/export/rss.asp?c=ln_relax'),
+    ]
+
+    # Page clean-up: keep the span from the 'content' div to the 'authors' div.
+    no_stylesheets = True
+    remove_javascript = True
+    remove_attributes = []
+    keep_only_tags = []
+    remove_tags_before = dict(name='div', attrs={'id': ['content']})
+    remove_tags_after = dict(name='div', attrs={'class': ['authors']})
+    # Everything from the 'fb-root' div to the end of the document is replaced
+    # by a closing </body>.
+    preprocess_regexps = [
+        (re.compile(r'<div id="(fb-root)".*', re.DOTALL | re.IGNORECASE),
+         lambda m: '</body>'),
+    ]
+
+
+
+
+
--- a/recipes/metropol_tv.recipe
+++ b/recipes/metropol_tv.recipe
@ -0,0 +1,29 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class metropolRecipe(BasicNewsRecipe):
+    """Recipe for the metropol.cz RSS feed."""
+
+    # Identification
+    title = u'Metropol TV'
+    description = 'metropol.cz'
+    publisher = u''
+    __author__ = 'bubak'
+    language = 'cs'
+    cover_url = 'http://www.metropol.cz/public/css/../images/logo/metropoltv.png'
+    #encoding = 'iso-8859-2'
+
+    # Download limits
+    oldest_article = 1
+    max_articles_per_feed = 20
+    use_embedded_content = False
+
+    feeds = [
+        (u'Metropolcv.cz', u'http://www.metropol.cz/rss/'),
+    ]
+
+    # Page clean-up: only the div with id 'art-full' is kept.
+    no_stylesheets = True
+    remove_javascript = True
+    extra_css = """
+                            """
+    remove_attributes = []
+    keep_only_tags = [dict(name='div', attrs={'id': ['art-full']})]
--- a/recipes/nadacni_fond_proti_korupci.recipe
+++ b/recipes/nadacni_fond_proti_korupci.recipe
@ -0,0 +1,30 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class nfpkRecipe(BasicNewsRecipe):
+    """Recipe for the nfpk.cz 'Aktuality' FeedBurner feed."""
+
+    # Identification
+    title = u'Nadační fond proti korupci'
+    description = 'nfpk.cz'
+    publisher = u''
+    __author__ = 'bubak'
+    language = 'cs'
+    cover_url = 'http://www.nfpk.cz/_templates/nfpk/_images/logo.gif'
+    #encoding = 'iso-8859-2'
+
+    # Download limits
+    oldest_article = 7
+    max_articles_per_feed = 20
+    use_embedded_content = False
+    remove_empty_feeds = True
+
+    feeds = [
+        (u'Aktuality', u'http://feeds.feedburner.com/nfpk'),
+    ]
+
+    # Page clean-up: only the div with id 'content' is kept.
+    no_stylesheets = True
+    remove_javascript = True
+    extra_css = """
+                            """
+    remove_attributes = []
+    keep_only_tags = [dict(name='div', attrs={'id': 'content'})]
+
--- a/recipes/nepszabadsag.recipe
+++ b/recipes/nepszabadsag.recipe
@ -0,0 +1,56 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+'''
+Fetch Népszabadság
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class nepszabadsag(BasicNewsRecipe):
+    """Recipe for the nol.hu RSS feeds; only 'Belföld' is enabled."""
+
+    # Identification
+    title = u'N\u00e9pszabads\u00e1g'
+    description = ''
+    __author__ = 'bubak'
+    language = 'hu'
+    cover_url = 'http://nol.hu/_design/image/logo_nol_live.jpg'
+    timefmt = ' [%d %b %Y]'
+    #encoding = 'utf-8'
+
+    # Download limits
+    oldest_article = 2
+    max_articles_per_feed = 20
+    use_embedded_content = False
+    simultaneous_downloads = 5
+    #delay = 1
+    #timeout = 10
+
+    # The remaining sections are listed but switched off.
+    feeds = [
+        (u'Belföld', u'http://nol.hu/feed/belfold.rss'),
+        # (u'Külföld', u'http://nol.hu/feed/kulfold.rss'),
+        # (u'Gazdaság', u'http://nol.hu/feed/gazdasag.rss'),
+        # (u'Kultúra', u'http://nol.hu/feed/kult.rss'),
+    ]
+
+    # Page clean-up: keep the 'article-box' table, trimmed to the span from
+    # the 'd-source' div to the 'tags' div.
+    no_stylesheets = True
+    remove_javascript = True
+    extra_css = '''
+    		'''
+    remove_attributes = []
+    keep_only_tags = [dict(name='table', attrs={'class': 'article-box'})]
+    remove_tags_before = dict(name='div', attrs={'class': ['d-source']})
+    remove_tags_after = dict(name='div', attrs={'class': ['tags']})
+    remove_tags = [
+        dict(name='div', attrs={'class': ['h']}),
+        dict(name='tfoot'),
+    ]
+
+    # NS sends an ad page sometimes but not frequently enough, TBD
+    # NOTE(review): the 'AA' prefix presumably keeps this from being picked up
+    # as the skip_ad_pages hook until it can be tested; confirm.
+    def AAskip_ad_pages(self, soup):
+        """Fetch the page behind the first link of an advertisement page.
+
+        :param soup: parsed page whose <title> is inspected.
+        :return: the linked page decoded as UTF-8 when the title contains
+            'advertisement', otherwise None.
+        """
+        page_title = soup.find('title').string.lower()
+        if 'advertisement' not in page_title:
+            return None
+        href = soup.find('a').get('href')
+        self.log.debug('Skipping to: ' + href)
+        new = self.browser.open(href).read().decode('utf-8', 'ignore')
+        self.log.debug('Finished: ' + href)
+        return new
+
--- a/recipes/neviditelny_pes.recipe
+++ b/recipes/neviditelny_pes.recipe
@ -0,0 +1,32 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class pesRecipe(BasicNewsRecipe):
+    """Recipe for the Neviditelný pes RSS export hosted at lidovky.cz."""
+
+    # Identification
+    title = u'Neviditelný pes'
+    description = u'Neviditelný pes'
+    publisher = u''
+    __author__ = 'bubak'
+    language = 'cs'
+    cover_url = 'http://g.zpravy.cz/o/pes/logo_pes.jpg'
+    #encoding = 'iso-8859-2'
+
+    # Download limits
+    oldest_article = 1
+    max_articles_per_feed = 20
+    use_embedded_content = False
+    remove_empty_feeds = True
+
+    feeds = [
+        (u'Neviditelný pes', u'http://neviditelnypes.lidovky.cz/export/rss.asp?c=pes_neviditelny'),
+    ]
+
+    # Page clean-up: keep the span from the 'art-full' div to the 'authors' div.
+    no_stylesheets = True
+    remove_javascript = True
+    extra_css = """
+                            """
+    remove_tags = []
+    remove_tags_before = dict(name='div', attrs={'id': 'art-full'})
+    remove_tags_after = dict(name='div', attrs={'id': 'authors'})
+
+
--- a/recipes/novinky.cz.recipe
+++ b/recipes/novinky.cz.recipe
@ -0,0 +1,50 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class novinkyRecipe(BasicNewsRecipe):
+    """Recipe for the novinky.cz RSS feeds with de-duplication by final URL."""
+
+    # Identification
+    title = u'novinky.cz'
+    description = 'novinky.cz'
+    publisher = u'seznam.cz'
+    __author__ = 'bubak'
+    language = 'cs'
+    cover_url = 'http://www.novinky.cz/static/images/logo.gif'
+    #encoding = 'utf-8'
+
+    # Download limits
+    oldest_article = 1
+    max_articles_per_feed = 20
+
+    feeds = [
+        (u'Domácí', u'http://www.novinky.cz/rss2/domaci/'),
+        (u'Praha', u'http://www.novinky.cz/rss2/vase-zpravy/praha/'),
+        (u'Ekonomika', u'http://www.novinky.cz/rss2/ekonomika/'),
+        (u'Finance', u'http://www.novinky.cz/rss2/finance/'),
+    ]
+
+    # Page clean-up: keep the span from the 'articleHeader' div to the
+    # 'related' div and drop the listed elements.
+    no_stylesheets = True
+    remove_javascript = True
+    keep_only_tags = []
+    remove_tags_before = dict(name='div', attrs={'class': ['articleHeader']})
+    remove_tags_after = dict(name='div', attrs={'class': 'related'})
+    remove_tags = [
+        dict(name='div', attrs={'id': ['pictureInnerBox']}),
+        dict(name='div', attrs={'id': ['discussionEntry']}),
+        dict(name='span', attrs={'id': ['mynews-hits', 'mynews-author']}),
+        dict(name='div', attrs={'class': ['related']}),
+        dict(name='div', attrs={'id': ['multimediaInfo']}),
+    ]
+
+    # The site serves identical articles under different links that redirect
+    # to one common URL. The author found only this hook to expose the real
+    # (post-redirect) URL, so duplicates are detected here.
+    # Final URLs already decoded; the dict is used as a set.
+    visited_urls = {}
+
+    def encoding(self, source):
+        """Decode a fetched page as UTF-8, or return None for a repeated URL.
+
+        :param source: fetched data exposing ``newurl`` and ``decode``.
+        """
+        url = source.newurl
+        if url not in self.visited_urls:
+            self.visited_urls[url] = True
+            self.log.debug('Accepting: ' + url)
+            return source.decode('utf-8', 'replace')
+        self.log.debug('Ignoring duplicate: ' + url)
+        return None
+
--- a/recipes/parlamentni_listy.recipe
+++ b/recipes/parlamentni_listy.recipe
@ -0,0 +1,38 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import re
+
+class plRecipe(BasicNewsRecipe):
+    """Recipe for the parlamentnilisty.cz RSS export."""
+
+    # Identification
+    title = u'Parlamentn\u00ed Listy'
+    description = ''
+    publisher = u''
+    __author__ = 'bubak'
+    language = 'cs'
+    cover_url = 'http://www.parlamentnilisty.cz/design/listy-logo2.png'
+    #encoding = 'iso-8859-2'
+
+    # Download limits
+    oldest_article = 1
+    max_articles_per_feed = 20
+    use_embedded_content = False
+
+    feeds = [
+        (u'Parlamentní listy.cz', u'http://www.parlamentnilisty.cz/export/rss.aspx'),
+    ]
+
+    # Page clean-up: keep the 'article-detail' div minus the listed boxes.
+    no_stylesheets = True
+    remove_javascript = True
+    remove_attributes = []
+    keep_only_tags = [dict(name='div', attrs={'class': ['article-detail']})]
+    remove_tags = [
+        dict(name='div', attrs={'class': ['articledetailboxin', 'crumbs', 'relatedarticles articledetailbox']}),
+        dict(name='div', attrs={'class': ['socialshare-1 noprint', 'socialshare-2 noprint']}),
+        dict(name='div', attrs={'id': 'widget'}),
+        dict(name='div', attrs={'class': 'article-discussion-box noprint'}),
+    ]
+    # Everything from the <span>/<strong> that starts with 'Ptejte se politik'
+    # to the end of the document is replaced by a closing </body>.
+    preprocess_regexps = [
+        (re.compile(r'<(span|strong)[^>]*>\s*Ptejte se politik.*', re.DOTALL | re.IGNORECASE),
+         lambda m: '</body>'),
+    ]
+
+
+
+
+
+
--- a/recipes/piratska_strana.recipe
+++ b/recipes/piratska_strana.recipe
@ -0,0 +1,40 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class cpsRecipe(BasicNewsRecipe):
+    """Recipe for the pirati.cz article RSS feed."""
+
+    # Identification
+    title = u'Piratská strana'
+    description = ''
+    publisher = u''
+    __author__ = 'bubak'
+    language = 'cs'
+    cover_url = 'http://www.pirati.cz/sites/all/themes/addari-cps/images/headbg.jpg'
+    #encoding = 'iso-8859-2'
+
+    # Download limits
+    oldest_article = 3
+    max_articles_per_feed = 20
+    use_embedded_content = False
+    remove_empty_feeds = True
+
+    feeds = [
+        (u'Články', u'http://www.pirati.cz/rss.xml'),
+    ]
+
+    # Page clean-up: keep the 'postarea' div, trimmed to the span from the
+    # size '+3' <font> tag to the <iframe>, minus the listed divs.
+    no_stylesheets = True
+    remove_javascript = True
+    extra_css = """
+                            """
+    remove_attributes = []
+    keep_only_tags = [dict(name='div', attrs={'id': 'postarea'})]
+    remove_tags_before = dict(name='font', attrs={'size': '+3'})
+    remove_tags_after = [dict(name='iframe')]
+    remove_tags = [
+        dict(name='div', attrs={'class': ['breadcrumb', 'submitted', 'links-readmore']}),
+        dict(name='div', attrs={'id': ['comments']}),
+    ]
+
+    conversion_options = {'linearize_tables': True}
+
+
+
+
+
--- a/recipes/piratske_noviny.recipe
+++ b/recipes/piratske_noviny.recipe
@ -0,0 +1,34 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+# NOTE(review): the class name looks copied from the nfpk.cz recipe; it is
+# kept unchanged here so that nothing referring to it by name breaks.
+class nfpkRecipe(BasicNewsRecipe):
+    """Recipe for the piratskenoviny.cz RSS feed.
+
+    Pages are trimmed to the span from the size '+3' <font> tag to the
+    <iframe>, and tables are linearized during conversion.
+    """
+    __author__  = 'bubak'
+    title = u'Piratské noviny'
+    publisher = u''
+    # Was 'nfpk.cz', left over from the recipe this one was copied from.
+    description = 'piratskenoviny.cz'
+    oldest_article = 2
+    max_articles_per_feed = 20
+    use_embedded_content = False
+    remove_empty_feeds = True
+
+    feeds = [
+            (u'Aktuality', u'http://www.piratskenoviny.cz/run/rss.php')
+            ]
+
+
+    #encoding = 'iso-8859-2'
+    language = 'cs'
+    cover_url = 'http://www.piratskenoviny.cz/imgs/piratske-noviny.gif'
+    remove_javascript = True
+    no_stylesheets = True
+    extra_css             = """
+                            """
+
+    remove_attributes = []
+    remove_tags_before  = dict(name='font', attrs={'size':'+3'})
+    remove_tags_after  = [dict(name='iframe')]
+    conversion_options = {'linearize_tables' : True}
+
+
+
--- a/recipes/pravo.recipe
+++ b/recipes/pravo.recipe
@ -0,0 +1,64 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class pravo(BasicNewsRecipe):
+    """Calibre recipe that scrapes the previous day's issue of Právo.
+
+    There is no RSS feed involved: ``parse_index`` visits every section
+    page listed in ``feeds`` and collects the links carrying the ``nadpis``
+    CSS class. An article that appears in several sections is kept only in
+    the first section where it is seen.
+    """
+
+    __author__  = 'bubak'
+    title      = 'Právo'
+    # Czech is 'cs' in ISO 639-1 ('cz' is the country code, not a language
+    # code); this also matches the other Czech recipes.
+    language = 'cs'
+
+    remove_tags_before = dict(name='div', attrs={'class':'rubrika-ostat'})
+    remove_tags_after = dict(name='td', attrs={'class':'rubrika'})
+    remove_tags        = [
+        dict(name='td', attrs={'width':'273'}),
+        dict(name='td', attrs={'class':'rubrika'}),
+        dict(name='div', attrs={'class':'rubrika-ostat'}),
+    ]
+    extra_css = '.nadpis {font-weight: bold; font-size: 130%;} .medium {text-align: justify;}'
+    cover_url = 'http://pravo.novinky.cz/images/horni_6_logo.gif'
+    cover_margins = (0, 100, '#ffffff')
+    conversion_options = {'linearize_tables' : True}
+
+    no_stylesheets = True
+
+    # our variables
+    # Titles already emitted, used to drop articles repeated across sections.
+    # NOTE: class-level, so it is shared by every instance in the process.
+    seen_titles = set()
+    # only yesterday's articles are online
+    parent_url = 'http://pravo.novinky.cz/minule/'
+    # (section title, section page URL) pairs; these are HTML pages that
+    # parse_page() scrapes, not RSS feeds.
+    feeds = [
+            ('Hlavní stránka', 'http://pravo.novinky.cz/minule/index.php'),
+            ('Zpravodajství', 'http://pravo.novinky.cz/minule/zpravodajstvi.php'),
+            ('Komentáře', 'http://pravo.novinky.cz/minule/komentare.php'),
+            ('Praha a střední Čechy', 'http://pravo.novinky.cz/minule/praha_stredni_cechy.php')
+        ]
+
+
+    def parse_index(self):
+        """Build the index: one ``(section title, articles)`` tuple per
+        entry in ``feeds``, in the same order."""
+        return [self.parse_page(feed) for feed in self.feeds]
+
+    def parse_page(self, feed):
+        """Collect the not-yet-seen articles linked from one section page.
+
+        :param feed: ``(feed_title, url)`` pair taken from ``feeds``.
+        :return: ``(feed_title, articles)`` where ``articles`` is a list of
+                 dicts with the keys ``title``, ``url``, ``description`` and
+                 ``date``. The list may be empty when every article on the
+                 page was already listed under an earlier section.
+        :raises ValueError: if the page contains no article links at all.
+        """
+        # Unpack in the body: tuple parameters in the signature are a
+        # syntax error on Python 3.
+        feed_title, page_url = feed
+
+        soup = self.index_to_soup(page_url)
+        links = soup.findAll('a', attrs={'class':'nadpis'})
+        # findAll() returns an empty list (never None) when nothing matches.
+        if not links:
+            raise ValueError('Could not find any articles on page ' + page_url)
+
+        articles = []
+        for link in links:
+            # tag_to_string() also copes with anchors that wrap nested
+            # markup, where ``link.string`` would be None.
+            title = self.tag_to_string(link)
+            if title in self.seen_titles:
+                continue
+            self.seen_titles.add(title)
+            article_url = link['href']
+            if not article_url.startswith('http'):
+                article_url = self.parent_url + article_url
+            self.log('\tFound article:', title, 'at', article_url)
+            articles.append({'title':title, 'url':article_url,
+                             'description':'', 'date':''})
+        # Return only after the whole page has been processed; returning
+        # from inside the loop limited every section to a single article.
+        return (feed_title, articles)
+
--- a/recipes/respekt.recipe
+++ b/recipes/respekt.recipe
@ -0,0 +1,37 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import re
+
+class respektRecipe(BasicNewsRecipe):
+    """Calibre recipe for Respekt (respekt.ihned.cz), driven by its RSS feeds."""
+
+    # Recipe metadata.
+    __author__ = 'bubak'
+    title = u'Respekt'
+    publisher = u'Respekt'
+    description = 'Respekt'
+    language = 'cs'
+    cover_url = 'http://respekt.ihned.cz/img/R/respekt_logo.png'
+
+    # Feed limits.
+    oldest_article = 1
+    max_articles_per_feed = 20
+
+    feeds = [
+        (u'Všechny články', u'http://respekt.ihned.cz/index.php?p=R00000_rss'),
+        (u'Blogy', u'http://blog.respekt.ihned.cz/?p=Rb00VR_rss'),
+        # Disabled by the original author:
+        # (u'Respekt DJ', u'http://respekt.ihned.cz/index.php?p=R00RDJ_rss'),
+    ]
+
+    # Decode downloaded pages as cp1250.
+    encoding = 'cp1250'
+    no_stylesheets = True
+    remove_javascript = True
+
+    # Raw-HTML rewrites, applied in this order before parsing.
+    preprocess_regexps = [
+        # Cut the page at the paid zone and leave a note in its place.
+        (
+            re.compile(r'<div class="paid-zone".*', re.IGNORECASE | re.DOTALL),
+            lambda m: 'Za zbytek článku je nutno platit. </body>',
+        ),
+        # Drop everything up to and including the "mm-ow" container.
+        (
+            re.compile(r'.*<div class="mm-ow">', re.IGNORECASE | re.DOTALL),
+            lambda m: '<body>',
+        ),
+        # Drop everything from the "col3" column to the end of the page.
+        (
+            re.compile(r'<div class="col3">.*', re.IGNORECASE | re.DOTALL),
+            lambda m: '</body>',
+        ),
+    ]
+
+    # Parsed-HTML clean-up.
+    keep_only_tags = []
+    remove_tags_before = {'name': 'div', 'attrs': {'id': ['detail']}}
+    remove_tags_after = {'name': 'div', 'attrs': {'class': 'd-tools'}}
+    remove_tags = [
+        {'name': 'div', 'attrs': {'class': ['d-tools', 'actions']}},
+    ]
+
+
+
--- a/recipes/tyden.cz.recipe
+++ b/recipes/tyden.cz.recipe
@ -0,0 +1,44 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class tydenRecipe(BasicNewsRecipe):
+    """Calibre recipe for Tyden.cz built from three of its RSS feeds.
+
+    An article URL that shows up in more than one feed is downloaded only
+    once; see ``get_article_url``.
+    """
+
+    __author__  = 'bubak'
+    title = u'Tyden.cz'
+    publisher = u''
+    description = ''
+    oldest_article = 1
+    max_articles_per_feed = 20
+
+    feeds = [
+            (u'Domácí', u'http://www.tyden.cz/rss/rss.php?rubrika_id=6'),
+            (u'Politika', u'http://www.tyden.cz/rss/rss.php?rubrika_id=173'),
+            (u'Kauzy', u'http://www.tyden.cz/rss/rss.php?rubrika_id=340')
+            ]
+
+
+    # Encoding override left disabled by the original author:
+    #encoding = 'iso-8859-2'
+    language = 'cs'
+    cover_url = 'http://www.tyden.cz/img/tyden-logo.png'
+    remove_javascript = True
+    no_stylesheets = True
+    remove_attributes = []
+    # Keep only the part of the page between the breadcrumbs paragraph and
+    # the author paragraph.
+    remove_tags_before  = dict(name='p', attrs={'id':['breadcrumbs']})
+    remove_tags_after  = dict(name='p', attrs={'class':['author']})
+
+    # URLs already accepted; a dict used as a set (only the keys matter).
+    # NOTE: class-level, so it is shared by every instance in the process.
+    visited_urls = {}
+
+    def get_article_url(self, article):
+        """Return the article URL, or None when the article must be skipped.
+
+        An article is skipped when the base implementation finds no URL for
+        it or when the same URL was already accepted earlier.
+
+        :param article: feed entry, passed through to
+                        ``BasicNewsRecipe.get_article_url``.
+        :return: the URL on first sight, otherwise ``None``.
+        """
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if not url:
+            # Nothing to deduplicate; without this guard None would be
+            # recorded as visited and the string concatenation in the log
+            # call below would raise TypeError.
+            return None
+        if url in self.visited_urls:
+            self.log.debug('Ignoring duplicate: ' + url)
+            return None
+        self.visited_urls[url] = True
+        self.log.debug('Accepting: ' + url)
+        return url
+
+
+
+
+
+