Various Czech and Hungarian recipes by bubak

This commit is contained in:
Kovid Goyal 2012-11-18 23:53:33 +05:30
parent 6712594a3e
commit b7bd073d4a
23 changed files with 972 additions and 0 deletions

View File

@ -0,0 +1,69 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
class aktualneRecipe(BasicNewsRecipe):
    """Calibre news recipe for aktualne.cz.

    Fetches the RSS feeds listed in ``feeds``, skips articles whose URL was
    already returned for an earlier feed entry, and decodes each downloaded
    page with a charset chosen from the page URL.
    """
    __author__ = 'bubak'
    title = u'aktualne.cz'
    publisher = u'Centrum holdings'
    description = 'aktuálně.cz'
    oldest_article = 1
    max_articles_per_feed = 20
    feeds = [
        (u'Domácí', u'http://aktualne.centrum.cz/feeds/rss/domaci/?photo=0'),
        (u'Zprávy', u'http://aktualne.centrum.cz/feeds/rss/zpravy/?photo=0'),
        (u'Praha', u'http://aktualne.centrum.cz/feeds/rss/domaci/regiony/praha/?photo=0'),
        (u'Ekonomika', u'http://aktualne.centrum.cz/feeds/rss/ekonomika/?photo=0'),
        (u'Finance', u'http://aktualne.centrum.cz/feeds/rss/finance/?photo=0'),
        (u'Blogy a názory', u'http://blog.aktualne.centrum.cz/export-all.php')
    ]
    language = 'cs'
    cover_url = 'http://img.aktualne.centrum.cz/design/akt4/o/l/logo-akt-ciste.png'
    remove_javascript = True
    no_stylesheets = True
    remove_attributes = []
    remove_tags_before = dict(name='h1', attrs={'class':['titulek-clanku']})
    filter_regexps = [r'img.aktualne.centrum.cz']
    remove_tags = [dict(name='div', attrs={'id':['social-bookmark']}),
                   dict(name='div', attrs={'class':['box1', 'svazane-tagy']}),
                   dict(name='div', attrs={'class':'itemcomment id0'}),
                   dict(name='div', attrs={'class':'hlavicka'}),
                   dict(name='div', attrs={'class':'hlavni-menu'}),
                   dict(name='div', attrs={'class':'top-standard-brand-obal'}),
                   dict(name='div', attrs={'class':'breadcrumb'}),
                   dict(name='div', attrs={'id':'start-standard'}),
                   dict(name='div', attrs={'id':'forum'}),
                   dict(name='span', attrs={'class':'akce'}),
                   dict(name='span', attrs={'class':'odrazka vetsi'}),
                   dict(name='div', attrs={'class':'boxP'}),
                   dict(name='div', attrs={'class':'box2'})]
    # Everything from the first matching <div> to the end of the document is
    # replaced by a closing </body>, cutting off the page footer boxes.
    preprocess_regexps = [
        (re.compile(r'<div class="(contenttitle"|socialni-site|wiki|facebook-promo|facebook-like-button"|meta-akce).*', re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
        (re.compile(r'<div class="[^"]*poutak-clanek-trojka".*', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
    keep_only_tags = []
    # URLs already accepted by get_article_url. This is a class-level dict,
    # so it is shared by every instance of the recipe.
    visited_urls = {}

    def get_article_url(self, article):
        """Return the URL of ``article``, or None if it must be skipped.

        An article is skipped when the base class finds no link for the
        feed entry or when the same URL was already returned earlier.
        """
        url = BasicNewsRecipe.get_article_url(self, article)
        if not url:
            # No link in the feed entry: return before the string
            # concatenations below, which raise TypeError on None.
            self.log.debug('Ignoring article without URL')
            return None
        if url in self.visited_urls:
            self.log.debug('Ignoring duplicate: ' + url)
            return None
        self.visited_urls[url] = True
        self.log.debug('Accepting: ' + url)
        return url

    def encoding(self, source):
        """Decode a downloaded page using a charset chosen from its URL.

        ``source`` must offer ``newurl`` and ``decode``; pages whose URL
        contains ``blog.aktualne`` are decoded as UTF-8, all others as
        ISO-8859-2. Undecodable bytes are replaced.
        NOTE(review): presumably calibre calls this because ``encoding`` is
        callable -- confirm against the BasicNewsRecipe documentation.
        """
        if 'blog.aktualne' in source.newurl:
            enc = 'utf-8'
        else:
            enc = 'iso-8859-2'
        self.log.debug('Called encoding ' + enc + " " + str(source.newurl))
        return source.decode(enc, 'replace')

55
recipes/blesk.recipe Normal file
View File

@ -0,0 +1,55 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
class bleskRecipe(BasicNewsRecipe):
    """Calibre news recipe for blesk.cz.

    Fetches the RSS feeds listed in ``feeds`` and skips articles whose URL
    was already returned for an earlier feed entry.
    """
    __author__ = 'bubak'
    title = u'Blesk'
    publisher = u''
    description = 'blesk.cz'
    oldest_article = 1
    max_articles_per_feed = 20
    use_embedded_content = False
    feeds = [
        (u'Zprávy', u'http://www.blesk.cz/rss/7'),
        (u'Blesk', u'http://www.blesk.cz/rss/1'),
        (u'Sex a tabu', u'http://www.blesk.cz/rss/2'),
        (u'Celebrity', u'http://www.blesk.cz/rss/5'),
        (u'Cestování', u'http://www.blesk.cz/rss/12')
    ]
    #encoding = 'iso-8859-2'
    language = 'cs'
    cover_url = 'http://img.blesk.cz/images/blesk/blesk-logo.png'
    remove_javascript = True
    no_stylesheets = True
    extra_css = "\n"
    remove_attributes = []
    remove_tags_before = dict(name='div', attrs={'id':['boxContent']})
    remove_tags_after = dict(name='div', attrs={'class':['artAuthors']})
    remove_tags = [dict(name='div', attrs={'class':['link_clanek']}),
                   dict(name='div', attrs={'id':['partHeader']}),
                   dict(name='div', attrs={'id':['top_bottom_box', 'lista_top']})]
    # Everything from the first matching <div> to the end of the document is
    # replaced by a closing </body>.
    preprocess_regexps = [(re.compile(r'<div class="(textovytip|related)".*', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
    keep_only_tags = [dict(name='div', attrs={'class':'articleContent'})]
    # URLs already accepted by get_article_url. This is a class-level dict,
    # so it is shared by every instance of the recipe.
    visited_urls = {}

    def get_article_url(self, article):
        """Return the URL of ``article``, or None if it must be skipped.

        An article is skipped when the base class finds no link for the
        feed entry or when the same URL was already returned earlier.
        """
        url = BasicNewsRecipe.get_article_url(self, article)
        if not url:
            # No link in the feed entry: return before the string
            # concatenations below, which raise TypeError on None.
            self.log.debug('Ignoring article without URL')
            return None
        if url in self.visited_urls:
            self.log.debug('Ignoring duplicate: ' + url)
            return None
        self.visited_urls[url] = True
        self.log.debug('Accepting: ' + url)
        return url

View File

@ -0,0 +1,68 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class ceskaPoziceRecipe(BasicNewsRecipe):
    """Calibre news recipe for ceskapozice.cz.

    Besides the usual feed download it follows the "next page" link of
    multi-page articles and splices the continuation pages into the first
    page (see ``append_page``).
    """
    __author__ = 'bubak'
    title = u'Česká pozice'
    description = 'Česká pozice'
    oldest_article = 2
    max_articles_per_feed = 20
    feeds = [
        (u'Všechny články', u'http://www.ceskapozice.cz/rss.xml'),
        (u'Domov', u'http://www.ceskapozice.cz/taxonomy/term/16/feed'),
        (u'Chrono', u'http://www.ceskapozice.cz/chrono/feed'),
        (u'Evropa', u'http://www.ceskapozice.cz/taxonomy/term/17/feed')
    ]
    language = 'cs'
    cover_url = 'http://www.ceskapozice.cz/sites/default/files/cpozice_logo.png'
    remove_javascript = True
    no_stylesheets = True
    # Prefix for the relative "next page" links resolved in append_page.
    domain = u'http://www.ceskapozice.cz'
    use_embedded_content = False
    remove_tags = [dict(name='div', attrs={'class':['block-ad', 'region region-content-ad']}),
                   dict(name='ul', attrs={'class':'links'}),
                   dict(name='div', attrs={'id':['comments', 'back-to-top']}),
                   dict(name='div', attrs={'class':['next-page']}),
                   dict(name='cite')]
    keep_only_tags = [dict(name='div', attrs={'id':'content'})]
    # URLs already accepted by get_article_url. This is a class-level dict,
    # so it is shared by every instance of the recipe.
    visited_urls = {}

    def get_article_url(self, article):
        """Return the URL of ``article``, or None if it must be skipped.

        An article is skipped when the base class finds no link for the
        feed entry or when the same URL was already returned earlier.
        """
        url = BasicNewsRecipe.get_article_url(self, article)
        if not url:
            # No link in the feed entry: return before the string
            # concatenations below, which raise TypeError on None.
            self.log.debug('Ignoring article without URL')
            return None
        if url in self.visited_urls:
            self.log.debug('Ignoring duplicate: ' + url)
            return None
        self.visited_urls[url] = True
        self.log.debug('Accepting: ' + url)
        return url

    def preprocess_html(self, soup):
        """Pull in continuation pages of the article and return ``soup``."""
        self.append_page(soup, soup.body, 3)
        return soup

    def append_page(self, soup, appendtag, position):
        """Insert the text of the following page(s) into ``appendtag``.

        Looks for the bottom pager in ``soup``; if it has a "next" button
        the linked page is downloaded, its ``main-body`` div is cleaned of
        ads and <cite> tags, further pages are appended to it recursively,
        and the div is inserted into ``appendtag`` at ``position``. The
        pager itself is removed from ``soup`` whenever it is present.
        """
        pager = soup.find('div', attrs={'class':'paging-bottom'})
        if not pager:
            return
        nextbutton = pager.find('li', attrs={'class':'pager-next'})
        if nextbutton:
            nexturl = self.domain + nextbutton.a['href']
            soup2 = self.index_to_soup(nexturl)
            texttag = soup2.find('div', attrs={'class':'main-body'})
            if texttag is None:
                # The continuation page has no article body; keep what we
                # already have instead of failing on None below.
                self.log.debug('No main-body found on: ' + nexturl)
            else:
                for it in texttag.findAll('div', attrs={'class':'region region-content-ad'}):
                    it.extract()
                for it in texttag.findAll('cite'):
                    it.extract()
                newpos = len(texttag.contents)
                # Recurse first so later pages end up inside texttag
                # before it is moved into the first page.
                self.append_page(soup2, texttag, newpos)
                texttag.extract()
                appendtag.insert(position, texttag)
        pager.extract()

View File

@ -0,0 +1,30 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class ceskenovinyRecipe(BasicNewsRecipe):
    """Calibre news recipe for ceskenoviny.cz (declarative settings only)."""

    # --- metadata ---
    title = u'České Noviny'
    __author__ = 'bubak'
    description = 'ceskenoviny.cz'
    language = 'cs'
    cover_url = 'http://i4.cn.cz/grafika/cn_logo-print.gif'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 1
    feeds = [
        (u'Domácí', u'http://www.ceskenoviny.cz/sluzby/rss/domov.php'),
        # Further feeds, currently disabled:
        # (u'Hlavní události', u'http://www.ceskenoviny.cz/sluzby/rss/index.php'),
        # (u'Přehled zpráv', u'http://www.ceskenoviny.cz/sluzby/rss/zpravy.php'),
        # (u'Ze světa', u'http://www.ceskenoviny.cz/sluzby/rss/svet.php'),
        # (u'Kultura', u'http://www.ceskenoviny.cz/sluzby/rss/kultura.php'),
        # (u'IT', u'http://www.ceskenoviny.cz/sluzby/rss/pocitace.php'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = []
    filter_regexps = [r'img.aktualne.centrum.cz']
    # Only the article container is kept.
    keep_only_tags = [
        dict(name='div', attrs={'id':'clnk'}),
    ]

View File

@ -0,0 +1,26 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class cro6Recipe(BasicNewsRecipe):
    """Calibre news recipe for the Český rozhlas 6 feed on rozhlas.cz."""

    # --- metadata ---
    title = u'Český rozhlas 6'
    __author__ = 'bubak'
    description = 'Český rozhlas 6'
    language = 'cs'
    cover_url = 'http://www.rozhlas.cz/img/e5/logo/cro6.png'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 1
    feeds = [
        (u'Český rozhlas 6', u'http://www.rozhlas.cz/export/cro6/'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = []
    # Only the article container is kept ...
    keep_only_tags = [
        dict(name='div', attrs={'id':'article'}),
    ]
    # ... and these elements are dropped from it.
    remove_tags = [
        dict(name='div', attrs={'class':['audio-play-all', 'poradHeaders', 'actions']}),
        dict(name='p', attrs={'class':['para-last']}),
    ]

39
recipes/demagog.cz.recipe Normal file
View File

@ -0,0 +1,39 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
class demagogRecipe(BasicNewsRecipe):
    """Calibre news recipe for demagog.cz.

    Keeps the part of each page between the first <h1> and the
    ``vyrok_text_after`` div and puts a horizontal rule at the start of
    every ``vyrok_suhrn`` block.
    """
    __author__ = 'bubak'
    title = u'Demagog.cz'
    publisher = u''
    description = 'demagog.cz'
    oldest_article = 6
    max_articles_per_feed = 20
    use_embedded_content = False
    remove_empty_feeds = True
    feeds = [
        (u'Aktuality', u'http://demagog.cz/rss')
    ]
    #encoding = 'iso-8859-2'
    language = 'cs'
    cover_url = 'http://demagog.cz/content/images/demagog.cz.png'
    remove_javascript = True
    no_stylesheets = True
    extra_css = (
        "\n"
        ".vyrok_suhrn{margin-top:50px; }\n"
        ".vyrok{margin-bottom:30px; }\n"
    )
    remove_tags = [dict(name='a', attrs={'class':'vyrok_odovodnenie_tgl'}),
                   dict(name='img', attrs={'class':'vyrok_fotografia'})]
    remove_tags_before = dict(name='h1')
    remove_tags_after = dict(name='div', attrs={'class':'vyrok_text_after'})
    # The replacement is a function, so its return value is inserted
    # literally: backreferences such as '\1' are NOT expanded (and in a
    # non-raw string '\1' is the control character 0x01). Re-emit the
    # matched opening tag explicitly and append the <hr>.
    preprocess_regexps = [(re.compile(r'(<div class="vyrok_suhrn">)', re.DOTALL|re.IGNORECASE), lambda match: match.group(1) + '<hr>')]

36
recipes/denik.cz.recipe Normal file
View File

@ -0,0 +1,36 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class ceskyDenikRecipe(BasicNewsRecipe):
    """Calibre news recipe for denik.cz (declarative settings only)."""

    # --- metadata ---
    title = u'denik.cz'
    __author__ = 'bubak'
    publisher = u''
    description = u'Český deník'
    language = 'cs'
    cover_url = 'http://g.denik.cz/images/loga/denik.png'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 1
    remove_empty_feeds = True
    use_embedded_content = False
    feeds = [
        (u'Z domova', u'http://www.denik.cz/rss/z_domova.html'),
        (u'Pražský deník - Moje Praha', u'http://prazsky.denik.cz/rss/zpravy_region.html'),
        # Further feeds, currently disabled:
        # (u'Zahraničí', u'http://www.denik.cz/rss/ze_sveta.html'),
        # (u'Kultura', u'http://www.denik.cz/rss/kultura.html'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    extra_css = "\n"
    # Only the content container is kept; everything after the author
    # paragraph is cut off.
    keep_only_tags = [
        dict(name='div', attrs={'class':'content'}),
    ]
    remove_tags_after = dict(name='p', attrs={'class':'clanek-autor'})
    remove_tags = []

View File

@ -0,0 +1,28 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class denikReferendumRecipe(BasicNewsRecipe):
    """Calibre news recipe for the Deník Referendum feedburner feed."""

    # --- metadata ---
    title = u'Den\u00edk Referendum'
    __author__ = 'bubak'
    publisher = u''
    description = ''
    language = 'cs'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 1
    use_embedded_content = False
    feeds = [
        (u'Deník Referendum', u'http://feeds.feedburner.com/DenikReferendum'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = []
    # Only the content container is kept, cut off after the text div.
    keep_only_tags = [
        dict(name='div', attrs={'id':['content']}),
    ]
    remove_tags_after = dict(name='div', attrs={'class':['text']})
    remove_tags = [
        dict(name='div', attrs={'class':['box boxLine', 'box noprint', 'box']}),
        dict(name='h3', attrs={'class':'head alt'}),
    ]

36
recipes/ihned.cz.recipe Normal file
View File

@ -0,0 +1,36 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class ihnedRecipe(BasicNewsRecipe):
    """Calibre news recipe for ihned.cz (declarative settings only)."""

    # --- metadata ---
    title = u'iHNed.cz'
    __author__ = 'bubak'
    publisher = u''
    description = 'ihned.cz'
    language = 'cs'
    cover_url = 'http://rss.ihned.cz/img/0/0_hp09/ihned.cz.gif'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 1
    use_embedded_content = False
    feeds = [
        (u'Zprávy', u'http://zpravy.ihned.cz/?m=rss'),
        (u'Hospodářské noviny', u'http://hn.ihned.cz/?p=500000_rss'),
        (u'Byznys', u'http://byznys.ihned.cz/?m=rss'),
        (u'Life', u'http://life.ihned.cz/?m=rss'),
        (u'Dialog', u'http://dialog.ihned.cz/?m=rss'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    extra_css = "\n"
    remove_attributes = []
    # Keep the span between the heading div and the next-authors div.
    remove_tags_before = dict(name='div', attrs={'id':['heading']})
    remove_tags_after = dict(name='div', attrs={'id':['next-authors']})
    remove_tags = [
        dict(name='ul', attrs={'id':['comm']}),
        dict(name='div', attrs={'id':['r-big']}),
        dict(name='div', attrs={'class':['tools tools-top']}),
    ]

59
recipes/insider.recipe Normal file
View File

@ -0,0 +1,59 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
import re
from calibre.web.feeds.news import BasicNewsRecipe
class insider(BasicNewsRecipe):
    """Calibre news recipe for denikinsider.cz (subscription required).

    Logs in through the first form on the home page and builds the article
    list by scraping the home page instead of using an RSS feed.
    """
    __author__ = 'bubak'
    title = 'Insider'
    # ISO 639-1 code for Czech ('cz' is not a language code).
    language = 'cs'
    remove_tags = [dict(name='div', attrs={'class':'article-related-content'}),
                   dict(name='div', attrs={'class':'calendar'}),
                   dict(name='span', attrs={'id':'labelHolder'})]
    no_stylesheets = True
    keep_only_tags = [dict(name='div', attrs={'class':['doubleBlock textContentFormat']})]
    # Everything from the first "T?mata:" to the end of the document is
    # replaced by a closing </body>.
    preprocess_regexps = [(re.compile(r'T.mata:.*', re.DOTALL|re.IGNORECASE), lambda m: '</body>')]
    needs_subscription = True

    def get_browser(self):
        """Return a browser that is logged in to denikinsider.cz.

        Raises:
            ValueError: if the page returned after submitting the login
                form does not contain the logout link text.
        """
        # get_browser is an instance method; it must receive self.
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.denikinsider.cz/')
        br.select_form(nr=0)
        br['login-name'] = self.username
        br['login-password'] = self.password
        res = br.submit()
        raw = res.read()
        if isinstance(raw, bytes):
            # Compare text with text instead of relying on an implicit
            # bytes/unicode conversion.
            # NOTE(review): assumes the site serves UTF-8 -- confirm.
            raw = raw.decode('utf-8', 'replace')
        if u'Odhlásit se' not in raw:
            raise ValueError('Failed to login to insider.cz. '
                             'Check your username and password.')
        return br

    def parse_index(self):
        """Scrape the home page and return ``[(feed title, articles)]``.

        Each article is a dict with ``title``, ``url``, ``description`` and
        ``date`` keys. Entries with a title that was already seen, or
        without a link, are skipped.

        Raises:
            ValueError: if the home page contains no article titles.
        """
        soup = self.index_to_soup('http://www.denikinsider.cz')
        titles = soup.findAll('span', attrs={'class':'homepageArticleTitle'})
        # findAll returns an empty list, never None, when nothing matches.
        if not titles:
            raise ValueError('Could not find category content')
        articles = []
        seen_titles = set()
        for title in titles:
            text = title.string
            if text in seen_titles:
                continue
            seen_titles.add(text)
            # The title <span> is expected to sit directly inside the link.
            url = title.parent.get('href')
            if not url:
                self.log.debug('Ignoring title without link')
                continue
            if url.startswith('/'):
                # url already starts with a slash; do not add another one.
                url = 'http://www.denikinsider.cz' + url
            self.log('\tFound article:', text, 'at', url)
            articles.append({'title':text, 'url':url, 'description':'',
                             'date':''})
        return [(self.title, articles)]

View File

@ -0,0 +1,32 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class kudyznudyRecipe(BasicNewsRecipe):
    """Calibre news recipe for kudyznudy.cz (declarative settings only)."""

    # --- metadata ---
    title = u'Kudy z nudy'
    __author__ = 'bubak'
    publisher = u''
    description = 'kudyznudy.cz'
    language = 'cs'
    cover_url = 'http://www.kudyznudy.cz/App_Themes/KzN/Images/Containers/Header/HeaderLogoKZN.png'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 3
    use_embedded_content = False
    feeds = [
        (u'Praha nejnovější', u'http://www.kudyznudy.cz/RSS/Charts.aspx?Type=Newest&Lang=cs-CZ&RegionId=1'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    extra_css = "\n"
    remove_attributes = []
    keep_only_tags = []
    # Keep the span between the content padding div and the surroundings
    # container, then drop the listed boxes.
    remove_tags_before = dict(name='div', attrs={'class':['C_WholeContentPadding']})
    remove_tags_after = dict(name='div', attrs={'class':['SurroundingsContainer']})
    remove_tags = [
        dict(name='div', attrs={'class':['Details', 'buttons', 'SurroundingsContainer', 'breadcrumb']}),
    ]

40
recipes/lidovky.recipe Normal file
View File

@ -0,0 +1,40 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
class lnRecipe(BasicNewsRecipe):
    """Calibre news recipe for lidovky.cz (declarative settings only)."""

    # --- metadata ---
    title = u'lidovky'
    __author__ = 'bubak'
    publisher = u''
    description = 'lidovky.cz'
    language = 'cs'
    cover_url = 'http://g.lidovky.cz/o/lidovky_ln3b/lidovky-logo.png'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 1
    use_embedded_content = False
    feeds = [
        (u'Události', u'http://www.lidovky.cz/export/rss.asp?r=ln_domov'),
        (u'Svět', u'http://www.lidovky.cz/export/rss.asp?r=ln_zahranici'),
        (u'Byznys', u'http://www.lidovky.cz/export/rss.asp?c=ln_byznys'),
        (u'Věda', u'http://www.lidovky.cz/export/rss.asp?r=ln_veda'),
        (u'Názory', u'http://www.lidovky.cz/export/rss.asp?r=ln_nazory'),
        (u'Relax', u'http://www.lidovky.cz/export/rss.asp?c=ln_relax'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = []
    keep_only_tags = []
    # Keep the span between the content div and the authors div.
    remove_tags_before = dict(name='div', attrs={'id':['content']})
    remove_tags_after = dict(name='div', attrs={'class':['authors']})
    # Everything from the fb-root div to the end of the document is
    # replaced by a closing </body>.
    preprocess_regexps = [
        (re.compile(r'<div id="(fb-root)".*', re.DOTALL|re.IGNORECASE),
         lambda match: '</body>'),
    ]

View File

@ -0,0 +1,29 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class metropolRecipe(BasicNewsRecipe):
    """Calibre news recipe for metropol.cz (declarative settings only)."""

    # --- metadata ---
    title = u'Metropol TV'
    __author__ = 'bubak'
    publisher = u''
    description = 'metropol.cz'
    language = 'cs'
    cover_url = 'http://www.metropol.cz/public/css/../images/logo/metropoltv.png'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 1
    use_embedded_content = False
    feeds = [
        (u'Metropolcv.cz', u'http://www.metropol.cz/rss/'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    extra_css = "\n"
    remove_attributes = []
    # Only the full-article container is kept.
    keep_only_tags = [
        dict(name='div', attrs={'id':['art-full']}),
    ]

View File

@ -0,0 +1,30 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class nfpkRecipe(BasicNewsRecipe):
    """Calibre news recipe for nfpk.cz (declarative settings only)."""

    # --- metadata ---
    title = u'Nadační fond proti korupci'
    __author__ = 'bubak'
    publisher = u''
    description = 'nfpk.cz'
    language = 'cs'
    cover_url = 'http://www.nfpk.cz/_templates/nfpk/_images/logo.gif'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 7
    remove_empty_feeds = True
    use_embedded_content = False
    feeds = [
        (u'Aktuality', u'http://feeds.feedburner.com/nfpk'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    extra_css = "\n"
    remove_attributes = []
    # Only the content container is kept.
    keep_only_tags = [
        dict(name='div', attrs={'id':'content'}),
    ]

View File

@ -0,0 +1,56 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
'''
Fetch Népszabadság
'''
from calibre.web.feeds.news import BasicNewsRecipe
class nepszabadsag(BasicNewsRecipe):
    """Calibre news recipe for Népszabadság (nol.hu)."""

    # --- metadata ---
    __author__ = 'bubak'
    title = u'N\u00e9pszabads\u00e1g'
    description = ''
    language = 'hu'
    cover_url = 'http://nol.hu/_design/image/logo_nol_live.jpg'
    timefmt = ' [%d %b %Y]'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 2
    simultaneous_downloads = 5
    use_embedded_content = False
    feeds = [
        (u'Belföld', u'http://nol.hu/feed/belfold.rss'),
        # Further feeds, currently disabled:
        # (u'Külföld', u'http://nol.hu/feed/kulfold.rss'),
        # (u'Gazdaság', u'http://nol.hu/feed/gazdasag.rss'),
        # (u'Kultúra', u'http://nol.hu/feed/kult.rss'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    extra_css = '\n'
    remove_attributes = []
    # Only the article table is kept, trimmed to the span between the
    # source div and the tags div.
    keep_only_tags = [
        dict(name='table', attrs={'class':'article-box'}),
    ]
    remove_tags_before = dict(name='div', attrs={'class':['d-source']})
    remove_tags_after = dict(name='div', attrs={'class':['tags']})
    remove_tags = [
        dict(name='div', attrs={'class':['h']}),
        dict(name='tfoot'),
    ]

    # The site occasionally serves an advertisement page, but too rarely
    # to finish this handler (still to be done).
    # NOTE(review): the "AA" prefix presumably keeps this method from
    # being used as the skip_ad_pages hook -- confirm before renaming.
    def AAskip_ad_pages(self, soup):
        """Return the real page text if ``soup`` is an ad page, else None.

        A page counts as an ad page when its <title> contains
        "advertisement"; the target of its first link is then downloaded
        and returned decoded as UTF-8 (undecodable bytes dropped).
        """
        page_title = soup.find('title').string.lower()
        if 'advertisement' not in page_title:
            return None
        href = soup.find('a').get('href')
        self.log.debug('Skipping to: ' + href)
        response = self.browser.open(href)
        new = response.read().decode('utf-8', 'ignore')
        self.log.debug('Finished: ' + href)
        return new

View File

@ -0,0 +1,32 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class pesRecipe(BasicNewsRecipe):
    """Calibre news recipe for Neviditelný pes (neviditelnypes.lidovky.cz)."""

    # --- metadata ---
    title = u'Neviditelný pes'
    __author__ = 'bubak'
    publisher = u''
    description = u'Neviditelný pes'
    language = 'cs'
    cover_url = 'http://g.zpravy.cz/o/pes/logo_pes.jpg'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 1
    remove_empty_feeds = True
    use_embedded_content = False
    feeds = [
        (u'Neviditelný pes', u'http://neviditelnypes.lidovky.cz/export/rss.asp?c=pes_neviditelny'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    extra_css = "\n"
    remove_tags = []
    # Keep the span between the art-full div and the authors div.
    remove_tags_before = dict(name='div', attrs={'id':'art-full'})
    remove_tags_after = dict(name='div', attrs={'id':'authors'})

50
recipes/novinky.cz.recipe Normal file
View File

@ -0,0 +1,50 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class novinkyRecipe(BasicNewsRecipe):
    """Calibre news recipe for novinky.cz.

    Duplicate articles are detected in ``encoding`` by the final URL of the
    downloaded page rather than by the feed link.
    """

    # --- metadata ---
    title = u'novinky.cz'
    __author__ = 'bubak'
    publisher = u'seznam.cz'
    description = 'novinky.cz'
    language = 'cs'
    cover_url = 'http://www.novinky.cz/static/images/logo.gif'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 1
    feeds = [
        (u'Domácí', u'http://www.novinky.cz/rss2/domaci/'),
        (u'Praha', u'http://www.novinky.cz/rss2/vase-zpravy/praha/'),
        (u'Ekonomika', u'http://www.novinky.cz/rss2/ekonomika/'),
        (u'Finance', u'http://www.novinky.cz/rss2/finance/'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    keep_only_tags = []
    # Keep the span between the article header and the related box.
    remove_tags_before = dict(name='div',attrs={'class':['articleHeader']})
    remove_tags_after = dict(name='div',attrs={'class':'related'})
    remove_tags = [
        dict(name='div', attrs={'id':['pictureInnerBox']}),
        dict(name='div', attrs={'id':['discussionEntry']}),
        dict(name='span', attrs={'id':['mynews-hits', 'mynews-author']}),
        dict(name='div', attrs={'class':['related']}),
        dict(name='div', attrs={'id':['multimediaInfo']}),
    ]

    # The site publishes identical articles under different links that
    # redirect to one common URL. This hook is the only API method found
    # that exposes the real (post-redirect) URL, so de-duplication is
    # done here. The dict is class-level and shared by all instances.
    visited_urls = {}

    def encoding(self, source):
        """Decode ``source`` as UTF-8, or return None for a repeated URL.

        ``source`` must offer ``newurl`` and ``decode``. The first page
        seen for a given ``newurl`` is decoded (undecodable bytes are
        replaced); any later page with the same ``newurl`` yields None.
        """
        url = source.newurl
        if url not in self.visited_urls:
            self.visited_urls[url] = True
            self.log.debug('Accepting: ' + url)
            return source.decode('utf-8', 'replace')
        self.log.debug('Ignoring duplicate: ' + url)
        return None

View File

@ -0,0 +1,38 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
class plRecipe(BasicNewsRecipe):
    """Calibre news recipe for parlamentnilisty.cz (declarative settings only)."""

    # --- metadata ---
    title = u'Parlamentn\u00ed Listy'
    __author__ = 'bubak'
    publisher = u''
    description = ''
    language = 'cs'
    cover_url = 'http://www.parlamentnilisty.cz/design/listy-logo2.png'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 1
    use_embedded_content = False
    feeds = [
        (u'Parlamentní listy.cz', u'http://www.parlamentnilisty.cz/export/rss.aspx'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = []
    # Only the article-detail container is kept ...
    keep_only_tags = [
        dict(name='div', attrs={'class':['article-detail']}),
    ]
    # ... and these boxes are dropped from it.
    remove_tags = [
        dict(name='div', attrs={'class':['articledetailboxin','crumbs', 'relatedarticles articledetailbox']}),
        dict(name='div', attrs={'class':['socialshare-1 noprint', 'socialshare-2 noprint']}),
        dict(name='div', attrs={'id':'widget'}),
        dict(name='div', attrs={'class':'article-discussion-box noprint'}),
    ]
    # Everything from the "Ptejte se politik..." span/strong to the end of
    # the document is replaced by a closing </body>.
    preprocess_regexps = [
        (re.compile(r'<(span|strong)[^>]*>\s*Ptejte se politik.*', re.DOTALL|re.IGNORECASE),
         lambda match: '</body>'),
    ]

View File

@ -0,0 +1,40 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class cpsRecipe(BasicNewsRecipe):
    """Calibre news recipe for pirati.cz (declarative settings only)."""

    # --- metadata ---
    title = u'Piratská strana'
    __author__ = 'bubak'
    publisher = u''
    description = ''
    language = 'cs'
    cover_url = 'http://www.pirati.cz/sites/all/themes/addari-cps/images/headbg.jpg'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 3
    remove_empty_feeds = True
    use_embedded_content = False
    feeds = [
        (u'Články', u'http://www.pirati.cz/rss.xml'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    extra_css = "\n"
    remove_attributes = []
    # Only the post area is kept, trimmed to the span between the big
    # <font> heading and the first <iframe>.
    keep_only_tags = [
        dict(name='div', attrs={'id':'postarea'}),
    ]
    remove_tags_before = dict(name='font', attrs={'size':'+3'})
    remove_tags_after = [dict(name='iframe')]
    remove_tags = [
        dict(name='div', attrs={'class':['breadcrumb', 'submitted', 'links-readmore']}),
        dict(name='div', attrs={'id':['comments']}),
    ]

    # --- conversion ---
    conversion_options = {'linearize_tables' : True}

View File

@ -0,0 +1,34 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class nfpkRecipe(BasicNewsRecipe):
    """Calibre news recipe for piratskenoviny.cz.

    NOTE(review): the class name matches the nfpk.cz recipe and looks like
    a copy-paste leftover; it is kept unchanged because it is the block's
    public name.
    """
    __author__ = 'bubak'
    title = u'Piratské noviny'
    publisher = u''
    # Was 'nfpk.cz', copied from another recipe; this one downloads
    # piratskenoviny.cz.
    description = 'piratskenoviny.cz'
    oldest_article = 2
    max_articles_per_feed = 20
    use_embedded_content = False
    remove_empty_feeds = True
    feeds = [
        (u'Aktuality', u'http://www.piratskenoviny.cz/run/rss.php')
    ]
    #encoding = 'iso-8859-2'
    language = 'cs'
    cover_url = 'http://www.piratskenoviny.cz/imgs/piratske-noviny.gif'
    remove_javascript = True
    no_stylesheets = True
    extra_css = "\n"
    remove_attributes = []
    # Keep the span between the big <font> heading and the first <iframe>.
    remove_tags_before = dict(name='font', attrs={'size':'+3'})
    remove_tags_after = [dict(name='iframe')]
    conversion_options = {'linearize_tables' : True}

64
recipes/pravo.recipe Normal file
View File

@ -0,0 +1,64 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe
class pravo(BasicNewsRecipe):
    """Calibre news recipe for Právo (pravo.novinky.cz).

    The article list is scraped from the section pages listed in ``feeds``
    instead of being read from RSS; an article title that already appeared
    on an earlier page is not listed again.
    """
    __author__ = 'bubak'
    title = 'Právo'
    # ISO 639-1 code for Czech ('cz' is not a language code).
    language = 'cs'
    remove_tags_before = dict(name='div', attrs={'class':'rubrika-ostat'})
    remove_tags_after = dict(name='td', attrs={'class':'rubrika'})
    remove_tags = [dict(name='td', attrs={'width':'273'}),
                   dict(name='td', attrs={'class':'rubrika'}),
                   dict(name='div', attrs={'class':'rubrika-ostat'})]
    extra_css = '.nadpis {font-weight: bold; font-size: 130%;} .medium {text-align: justify;}'
    cover_url = 'http://pravo.novinky.cz/images/horni_6_logo.gif'
    cover_margins = (0, 100, '#ffffff')
    conversion_options = {'linearize_tables' : True}
    no_stylesheets = True
    # Titles already listed by parse_page. This is a class-level set, so it
    # is shared by every instance of the recipe.
    seen_titles = set([])
    # Only yesterday's articles are online; relative article links are
    # resolved against this URL.
    parent_url = 'http://pravo.novinky.cz/minule/'
    # (section title, section page URL) pairs scraped by parse_index.
    feeds = [
        ('Hlavní stránka', 'http://pravo.novinky.cz/minule/index.php'),
        ('Zpravodajství', 'http://pravo.novinky.cz/minule/zpravodajstvi.php'),
        ('Komentáře', 'http://pravo.novinky.cz/minule/komentare.php'),
        ('Praha a střední Čechy', 'http://pravo.novinky.cz/minule/praha_stredni_cechy.php')
    ]

    def parse_index(self):
        """Return one ``(section title, articles)`` tuple per ``feeds`` entry."""
        return [self.parse_page(feed) for feed in self.feeds]

    def parse_page(self, feed):
        """Scrape one section page and return ``(feed_title, articles)``.

        Args:
            feed: a ``(feed_title, url)`` pair as stored in ``feeds``.

        Each article is a dict with ``title``, ``url``, ``description`` and
        ``date`` keys. Links without any text and titles already seen on an
        earlier page are skipped.
        """
        # Unpacked here rather than in the signature: tuple parameters are
        # a syntax error on Python 3.
        feed_title, url = feed
        soup = self.index_to_soup(url)
        titles = soup.findAll('a', attrs={'class':'nadpis'})
        # findAll returns an empty list, never None, when nothing matches.
        # The other section pages may still have articles, so only warn.
        if not titles:
            self.log.warn('Could not find any articles on page ' + url)
        articles = []
        for article in titles:
            title = article.string
            if title is None:
                # The link text is split over nested tags; join the pieces
                # instead of failing on None.
                title = ''.join(article.findAll(text=True))
            if not title:
                continue
            if title in self.seen_titles:
                continue
            self.seen_titles.add(title)
            href = article['href']
            if not href.startswith('http'):
                href = self.parent_url + href
            self.log('\tFound article:', title, 'at', href)
            articles.append({'title':title, 'url':href, 'description':'',
                             'date':''})
        return (feed_title, articles)

37
recipes/respekt.recipe Normal file
View File

@ -0,0 +1,37 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
class respektRecipe(BasicNewsRecipe):
    """Calibre news recipe for respekt.ihned.cz (declarative settings only)."""

    # --- metadata ---
    title = u'Respekt'
    __author__ = 'bubak'
    publisher = u'Respekt'
    description = 'Respekt'
    language = 'cs'
    cover_url = 'http://respekt.ihned.cz/img/R/respekt_logo.png'

    # --- what to download ---
    max_articles_per_feed = 20
    oldest_article = 1
    encoding = 'cp1250'
    feeds = [
        (u'Všechny články', u'http://respekt.ihned.cz/index.php?p=R00000_rss'),
        (u'Blogy', u'http://blog.respekt.ihned.cz/?p=Rb00VR_rss'),
        # Further feed, currently disabled:
        # (u'Respekt DJ', u'http://respekt.ihned.cz/index.php?p=R00RDJ_rss'),
    ]

    # --- page clean-up ---
    no_stylesheets = True
    remove_javascript = True
    keep_only_tags = []
    # Keep the span between the detail div and the d-tools div.
    remove_tags_before = dict(name='div',attrs={'id':['detail']})
    remove_tags_after = dict(name='div',attrs={'class':'d-tools'})
    remove_tags = [
        dict(name='div', attrs={'class':['d-tools', 'actions']}),
    ]
    # In order: replace the paid zone and everything after it with a notice
    # plus </body>; replace everything up to the mm-ow div with <body>;
    # replace the col3 div and everything after it with </body>.
    preprocess_regexps = [
        (re.compile(r'<div class="paid-zone".*', re.DOTALL|re.IGNORECASE),
         lambda match: 'Za zbytek článku je nutno platit. </body>'),
        (re.compile(r'.*<div class="mm-ow">', re.DOTALL|re.IGNORECASE),
         lambda match: '<body>'),
        (re.compile(r'<div class="col3">.*', re.DOTALL|re.IGNORECASE),
         lambda match: '</body>'),
    ]

44
recipes/tyden.cz.recipe Normal file
View File

@ -0,0 +1,44 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
class tydenRecipe(BasicNewsRecipe):
    """Calibre news recipe for tyden.cz.

    Fetches the RSS feeds listed in ``feeds`` and skips articles whose URL
    was already returned for an earlier feed entry.
    """
    __author__ = 'bubak'
    title = u'Tyden.cz'
    publisher = u''
    description = ''
    oldest_article = 1
    max_articles_per_feed = 20
    feeds = [
        (u'Domácí', u'http://www.tyden.cz/rss/rss.php?rubrika_id=6'),
        (u'Politika', u'http://www.tyden.cz/rss/rss.php?rubrika_id=173'),
        (u'Kauzy', u'http://www.tyden.cz/rss/rss.php?rubrika_id=340')
    ]
    #encoding = 'iso-8859-2'
    language = 'cs'
    cover_url = 'http://www.tyden.cz/img/tyden-logo.png'
    remove_javascript = True
    no_stylesheets = True
    remove_attributes = []
    # Keep the span between the breadcrumbs and the author paragraph.
    remove_tags_before = dict(name='p', attrs={'id':['breadcrumbs']})
    remove_tags_after = dict(name='p', attrs={'class':['author']})
    # URLs already accepted by get_article_url. This is a class-level dict,
    # so it is shared by every instance of the recipe.
    visited_urls = {}

    def get_article_url(self, article):
        """Return the URL of ``article``, or None if it must be skipped.

        An article is skipped when the base class finds no link for the
        feed entry or when the same URL was already returned earlier.
        """
        url = BasicNewsRecipe.get_article_url(self, article)
        if not url:
            # No link in the feed entry: return before the string
            # concatenations below, which raise TypeError on None.
            self.log.debug('Ignoring article without URL')
            return None
        if url in self.visited_urls:
            self.log.debug('Ignoring duplicate: ' + url)
            return None
        self.visited_urls[url] = True
        self.log.debug('Accepting: ' + url)
        return url