diff --git a/recipes/aktualne.cz.recipe b/recipes/aktualne.cz.recipe new file mode 100644 index 0000000000..cd2dcc5f09 --- /dev/null +++ b/recipes/aktualne.cz.recipe @@ -0,0 +1,69 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class aktualneRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'aktualne.cz' + publisher = u'Centrum holdings' + description = 'aktuálně.cz' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Domácí', u'http://aktualne.centrum.cz/feeds/rss/domaci/?photo=0'), + (u'Zprávy', u'http://aktualne.centrum.cz/feeds/rss/zpravy/?photo=0'), + (u'Praha', u'http://aktualne.centrum.cz/feeds/rss/domaci/regiony/praha/?photo=0'), + (u'Ekonomika', u'http://aktualne.centrum.cz/feeds/rss/ekonomika/?photo=0'), + (u'Finance', u'http://aktualne.centrum.cz/feeds/rss/finance/?photo=0'), + (u'Blogy a názory', u'http://blog.aktualne.centrum.cz/export-all.php') + ] + + + language = 'cs' + cover_url = 'http://img.aktualne.centrum.cz/design/akt4/o/l/logo-akt-ciste.png' + remove_javascript = True + no_stylesheets = True + + remove_attributes = [] + remove_tags_before = dict(name='h1', attrs={'class':['titulek-clanku']}) + filter_regexps = [r'img.aktualne.centrum.cz'] + remove_tags = [dict(name='div', attrs={'id':['social-bookmark']}), + dict(name='div', attrs={'class':['box1', 'svazane-tagy']}), + dict(name='div', attrs={'class':'itemcomment id0'}), + dict(name='div', attrs={'class':'hlavicka'}), + dict(name='div', attrs={'class':'hlavni-menu'}), + dict(name='div', attrs={'class':'top-standard-brand-obal'}), + dict(name='div', attrs={'class':'breadcrumb'}), + dict(name='div', attrs={'id':'start-standard'}), + dict(name='div', attrs={'id':'forum'}), + dict(name='span', attrs={'class':'akce'}), + dict(name='span', attrs={'class':'odrazka vetsi'}), + dict(name='div', attrs={'class':'boxP'}), + dict(name='div', 
attrs={'class':'box2'})] + preprocess_regexps = [ + (re.compile(r'
'), + (re.compile(r'
')] + + keep_only_tags = [] + + visited_urls = {} + def get_article_url(self, article): + url = BasicNewsRecipe.get_article_url(self, article) + if url in self.visited_urls: + self.log.debug('Ignoring duplicate: ' + url) + return None + else: + self.visited_urls[url] = True + self.log.debug('Accepting: ' + url) + return url + + def encoding(self, source): + if source.newurl.find('blog.aktualne') >= 0: + enc = 'utf-8' + else: + enc = 'iso-8859-2' + self.log.debug('Called encoding ' + enc + " " + str(source.newurl)) + return source.decode(enc, 'replace') + diff --git a/recipes/antyweb.recipe b/recipes/antyweb.recipe new file mode 100644 index 0000000000..c2576191dd --- /dev/null +++ b/recipes/antyweb.recipe @@ -0,0 +1,48 @@ + +from calibre.web.feeds.news import BasicNewsRecipe + +class AntywebRecipe(BasicNewsRecipe): + encoding = 'utf-8' + __license__ = 'GPL v3' + __author__ = u'Artur Stachecki ' + language = 'pl' + version = 1 + title = u'Antyweb' + category = u'News' + description = u'Blog o internecie i nowych technologiach' + cover_url='' + remove_empty_feeds= True + auto_cleanup = False + no_stylesheets=True + use_embedded_content = False + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript = True + simultaneous_downloads = 3 + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'h1', attrs = { 'class' : 'mm-article-title'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'mm-article-content'})) + + + remove_tags =[] + remove_tags.append(dict(name = 'h2', attrs = {'class' : 'widgettitle'})) + remove_tags.append(dict(name = 'img', attrs = {'class' : 'alignleft'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'float: right;margin-left:1em;margin-bottom: 0.5em;padding-bottom: 3px; width: 72px;'})) + remove_tags.append(dict(name = 'img', attrs = {'src' : 'http://antyweb.pl/wp-content/uploads/2011/09/HOSTERSI_testy_pasek600x30.gif'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'podwpisowe'})) + + + 
extra_css = ''' + body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} + ''' + + feeds = [ + (u'Artykuly', u'feed://feeds.feedburner.com/Antyweb?format=xml'), + ] + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/bankier_pl.recipe b/recipes/bankier_pl.recipe new file mode 100644 index 0000000000..8a68d844b3 --- /dev/null +++ b/recipes/bankier_pl.recipe @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'teepel ' + +''' +bankier.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class bankier(BasicNewsRecipe): + title = u'Bankier.pl' + __author__ = 'teepel ' + language = 'pl' + description ='Polski portal finansowy. Informacje o: gospodarka, inwestowanie, finanse osobiste, prowadzenie firmy, kursy walut, notowania akcji, fundusze.' + masthead_url='http://www.bankier.pl/gfx/hd-mid-02.gif' + INDEX='http://bankier.pl/' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + simultaneous_downloads = 5 + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'align' : 'left'})) + + remove_tags =[] + remove_tags.append(dict(name = 'table', attrs = {'cellspacing' : '2'})) + remove_tags.append(dict(name = 'div', attrs = {'align' : 'center'})) + remove_tags.append(dict(name = 'img', attrs = {'src' : '/gfx/hd-mid-02.gif'})) + #remove_tags.append(dict(name = 'a', attrs = {'target' : '_blank'})) + #remove_tags.append(dict(name = 'br', attrs = {'clear' : 'all'})) + + feeds = [ + (u'Wiadomości dnia', u'http://feeds.feedburner.com/bankier-wiadomosci-dnia'), + (u'Finanse osobiste', u'http://feeds.feedburner.com/bankier-finanse-osobiste'), + (u'Firma', u'http://feeds.feedburner.com/bankier-firma'), + (u'Giełda', u'http://feeds.feedburner.com/bankier-gielda'), + (u'Rynek walutowy', 
u'http://feeds.feedburner.com/bankier-rynek-walutowy'), + (u'Komunikaty ze spółek', u'http://feeds.feedburner.com/bankier-espi'), + ] + def print_version(self, url): + segment = url.split('.') + urlPart = segment[2] + segments = urlPart.split('-') + urlPart2 = segments[-1] + return 'http://www.bankier.pl/wiadomosci/print.html?article_id=' + urlPart2 + diff --git a/recipes/blesk.recipe b/recipes/blesk.recipe new file mode 100644 index 0000000000..7eff4c42d0 --- /dev/null +++ b/recipes/blesk.recipe @@ -0,0 +1,55 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class bleskRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Blesk' + publisher = u'' + description = 'blesk.cz' + oldest_article = 1 + max_articles_per_feed = 20 + use_embedded_content = False + + feeds = [ + (u'Zprávy', u'http://www.blesk.cz/rss/7'), + (u'Blesk', u'http://www.blesk.cz/rss/1'), + (u'Sex a tabu', u'http://www.blesk.cz/rss/2'), + (u'Celebrity', u'http://www.blesk.cz/rss/5'), + (u'Cestování', u'http://www.blesk.cz/rss/12') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://img.blesk.cz/images/blesk/blesk-logo.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + remove_tags_before = dict(name='div', attrs={'id':['boxContent']}) + remove_tags_after = dict(name='div', attrs={'class':['artAuthors']}) + remove_tags = [dict(name='div', attrs={'class':['link_clanek']}), + dict(name='div', attrs={'id':['partHeader']}), + dict(name='div', attrs={'id':['top_bottom_box', 'lista_top']})] + preprocess_regexps = [(re.compile(r'
')] + + keep_only_tags = [dict(name='div', attrs={'class':'articleContent'})] + + visited_urls = {} + def get_article_url(self, article): + url = BasicNewsRecipe.get_article_url(self, article) + if url in self.visited_urls: + self.log.debug('Ignoring duplicate: ' + url) + return None + else: + self.visited_urls[url] = True + self.log.debug('Accepting: ' + url) + return url + + + + diff --git a/recipes/buchreport.recipe b/recipes/buchreport.recipe new file mode 100644 index 0000000000..5ed34d1ee8 --- /dev/null +++ b/recipes/buchreport.recipe @@ -0,0 +1,45 @@ +from calibre.web.feeds.recipes import BasicNewsRecipe + +'''Calibre recipe to convert the RSS feeds of the Buchreport to an ebook.''' + +class Buchreport(BasicNewsRecipe) : + __author__ = 'a.peter' + __copyright__ = 'a.peter' + __license__ = 'GPL v3' + description = 'Buchreport' + version = 4 + title = u'Buchreport' + timefmt = ' [%d.%m.%Y]' + encoding = 'cp1252' + language = 'de' + + + extra_css = 'body { margin-left: 0.00em; margin-right: 0.00em; } \ + article, articledate, articledescription { text-align: left; } \ + h1 { text-align: left; font-size: 140%; font-weight: bold; } \ + h2 { text-align: left; font-size: 100%; font-weight: bold; font-style: italic; } \ + h3 { text-align: left; font-size: 100%; font-weight: regular; font-style: italic; } \ + h4, h5, h6 { text-align: left; font-size: 100%; font-weight: bold; }' + + oldest_article = 7.0 + no_stylesheets = True + remove_javascript = True + use_embedded_content = False + publication_type = 'newspaper' + + remove_tags_before = dict(name='h2') + remove_tags_after = [ + dict(name='div', attrs={'style':["padding-top:10px;clear:both"]}) + ] + remove_tags = [ + dict(name='div', attrs={'style':["padding-top:10px;clear:both"]}), + dict(name='iframe'), + dict(name='img') + ] + + feeds = [ + (u'Buchreport', u'http://www.buchreport.de/index.php?id=5&type=100') + ] + + def get_masthead_url(self): + return 
'http://www.buchreport.de/fileadmin/template/img/buchreport_logo.jpg' diff --git a/recipes/business_standard.recipe b/recipes/business_standard.recipe index badca48733..a61c32aa42 100644 --- a/recipes/business_standard.recipe +++ b/recipes/business_standard.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2009-2010, Darko Miletic ' +__copyright__ = '2009-2012, Darko Miletic ' ''' www.business-standard.com ''' @@ -14,10 +14,12 @@ class BusinessStandard(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False + auto_cleanup = False encoding = 'cp1252' publisher = 'Business Standard Limited' category = 'news, business, money, india, world' language = 'en_IN' + masthead_url = 'http://feeds.business-standard.com/images/logo_08.jpg' conversion_options = { 'comments' : description @@ -26,7 +28,7 @@ class BusinessStandard(BasicNewsRecipe): ,'publisher' : publisher ,'linearize_tables': True } - keep_only_tags=[dict(attrs={'class':'TableClas'})] + #keep_only_tags=[dict(name='td', attrs={'class':'TableClas'})] remove_tags = [ dict(name=['object','link','script','iframe','base','meta']) ,dict(attrs={'class':'rightDiv2'}) @@ -45,3 +47,8 @@ class BusinessStandard(BasicNewsRecipe): ,(u'Management & Mktg' , u'http://feeds.business-standard.com/rss/7_0.xml' ) ,(u'Opinion' , u'http://feeds.business-standard.com/rss/5_0.xml' ) ] + + def print_version(self, url): + l, s, tp = url.rpartition('/') + t, k, autono = l.rpartition('/') + return 'http://www.business-standard.com/india/printpage.php?autono=' + autono + '&tp=' + tp diff --git a/recipes/ceska_pozice.recipe b/recipes/ceska_pozice.recipe new file mode 100644 index 0000000000..478f6823b9 --- /dev/null +++ b/recipes/ceska_pozice.recipe @@ -0,0 +1,68 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class ceskaPoziceRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = 
u'Česká pozice' + description = 'Česká pozice' + oldest_article = 2 + max_articles_per_feed = 20 + + feeds = [ + (u'Všechny články', u'http://www.ceskapozice.cz/rss.xml'), + (u'Domov', u'http://www.ceskapozice.cz/taxonomy/term/16/feed'), + (u'Chrono', u'http://www.ceskapozice.cz/chrono/feed'), + (u'Evropa', u'http://www.ceskapozice.cz/taxonomy/term/17/feed') + ] + + + language = 'cs' + cover_url = 'http://www.ceskapozice.cz/sites/default/files/cpozice_logo.png' + remove_javascript = True + no_stylesheets = True + domain = u'http://www.ceskapozice.cz' + use_embedded_content = False + + + remove_tags = [dict(name='div', attrs={'class':['block-ad', 'region region-content-ad']}), + dict(name='ul', attrs={'class':'links'}), + dict(name='div', attrs={'id':['comments', 'back-to-top']}), + dict(name='div', attrs={'class':['next-page', 'region region-content-ad']}), + dict(name='cite')] + + keep_only_tags = [dict(name='div', attrs={'id':'content'})] + + visited_urls = {} + def get_article_url(self, article): + url = BasicNewsRecipe.get_article_url(self, article) + if url in self.visited_urls: + self.log.debug('Ignoring duplicate: ' + url) + return None + else: + self.visited_urls[url] = True + self.log.debug('Accepting: ' + url) + return url + + def preprocess_html(self, soup): + self.append_page(soup, soup.body, 3) + return soup + + def append_page(self, soup, appendtag, position): + pager = soup.find('div', attrs={'class':'paging-bottom'}) + if pager: + nextbutton = pager.find('li', attrs={'class':'pager-next'}) + if nextbutton: + nexturl = self.domain + nextbutton.a['href'] + soup2 = self.index_to_soup(nexturl) + texttag = soup2.find('div', attrs={'class':'main-body'}) + for it in texttag.findAll('div', attrs={'class':'region region-content-ad'}): + it.extract() + for it in texttag.findAll('cite'): + it.extract() + newpos = len(texttag.contents) + self.append_page(soup2, texttag, newpos) + texttag.extract() + appendtag.insert(position, texttag) + pager.extract() + diff 
--git a/recipes/ceske_noviny.recipe b/recipes/ceske_noviny.recipe new file mode 100644 index 0000000000..10dd16689d --- /dev/null +++ b/recipes/ceske_noviny.recipe @@ -0,0 +1,30 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class ceskenovinyRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'České Noviny' + description = 'ceskenoviny.cz' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Domácí', u'http://www.ceskenoviny.cz/sluzby/rss/domov.php') + #,(u'Hlavní události', u'http://www.ceskenoviny.cz/sluzby/rss/index.php') + #,(u'Přehled zpráv', u'http://www.ceskenoviny.cz/sluzby/rss/zpravy.php') + #,(u'Ze světa', u'http://www.ceskenoviny.cz/sluzby/rss/svet.php') + #,(u'Kultura', u'http://www.ceskenoviny.cz/sluzby/rss/kultura.php') + #,(u'IT', u'http://www.ceskenoviny.cz/sluzby/rss/pocitace.php') + ] + + + language = 'cs' + cover_url = 'http://i4.cn.cz/grafika/cn_logo-print.gif' + remove_javascript = True + no_stylesheets = True + + remove_attributes = [] + filter_regexps = [r'img.aktualne.centrum.cz'] + + keep_only_tags = [dict(name='div', attrs={'id':'clnk'})] diff --git a/recipes/cesky_rozhlas_6.recipe b/recipes/cesky_rozhlas_6.recipe new file mode 100644 index 0000000000..eca32af02c --- /dev/null +++ b/recipes/cesky_rozhlas_6.recipe @@ -0,0 +1,26 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class cro6Recipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Český rozhlas 6' + description = 'Český rozhlas 6' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Český rozhlas 6', u'http://www.rozhlas.cz/export/cro6/') + ] + + + language = 'cs' + cover_url = 'http://www.rozhlas.cz/img/e5/logo/cro6.png' + remove_javascript = True + no_stylesheets = True + + remove_attributes = [] + remove_tags = 
[dict(name='div', attrs={'class':['audio-play-all', 'poradHeaders', 'actions']}), + dict(name='p', attrs={'class':['para-last']})] + + keep_only_tags = [dict(name='div', attrs={'id':'article'})] diff --git a/recipes/demagog.cz.recipe b/recipes/demagog.cz.recipe new file mode 100644 index 0000000000..7d89af41bd --- /dev/null +++ b/recipes/demagog.cz.recipe @@ -0,0 +1,39 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class demagogRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Demagog.cz' + publisher = u'' + description = 'demagog.cz' + oldest_article = 6 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Aktuality', u'http://demagog.cz/rss') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://demagog.cz/content/images/demagog.cz.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + .vyrok_suhrn{margin-top:50px; } + .vyrok{margin-bottom:30px; } + """ + + remove_tags = [dict(name='a', attrs={'class':'vyrok_odovodnenie_tgl'}), + dict(name='img', attrs={'class':'vyrok_fotografia'})] + remove_tags_before = dict(name='h1') + remove_tags_after = dict(name='div', attrs={'class':'vyrok_text_after'}) + preprocess_regexps = [(re.compile(r'(
)', re.DOTALL|re.IGNORECASE), lambda match: '\1
')] + + + + diff --git a/recipes/denik.cz.recipe b/recipes/denik.cz.recipe new file mode 100644 index 0000000000..2ccf8caa40 --- /dev/null +++ b/recipes/denik.cz.recipe @@ -0,0 +1,36 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class ceskyDenikRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'denik.cz' + publisher = u'' + description = u'Český deník' + oldest_article = 1 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Z domova', u'http://www.denik.cz/rss/z_domova.html') + ,(u'Pražský deník - Moje Praha', u'http://prazsky.denik.cz/rss/zpravy_region.html') + #,(u'Zahraničí', u'http://www.denik.cz/rss/ze_sveta.html') + #,(u'Kultura', u'http://www.denik.cz/rss/kultura.html') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://g.denik.cz/images/loga/denik.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_tags = [] + keep_only_tags = [dict(name='div', attrs={'class':'content'})] + #remove_tags_before = dict(name='h1') + remove_tags_after = dict(name='p', attrs={'class':'clanek-autor'}) + + diff --git a/recipes/denik_referendum.recipe b/recipes/denik_referendum.recipe new file mode 100644 index 0000000000..e04871d067 --- /dev/null +++ b/recipes/denik_referendum.recipe @@ -0,0 +1,28 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class denikReferendumRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Den\u00edk Referendum' + publisher = u'' + description = '' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Deník Referendum', u'http://feeds.feedburner.com/DenikReferendum') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + remove_javascript = True + no_stylesheets = True + use_embedded_content = 
False + remove_attributes = [] + remove_tags_after = dict(name='div', attrs={'class':['text']}) + remove_tags = [dict(name='div', attrs={'class':['box boxLine', 'box noprint', 'box']}), + dict(name='h3', attrs={'class':'head alt'})] + + keep_only_tags = [dict(name='div', attrs={'id':['content']})] diff --git a/recipes/editoriali.recipe b/recipes/editoriali.recipe index 1b0c558df4..c5596bd743 100644 --- a/recipes/editoriali.recipe +++ b/recipes/editoriali.recipe @@ -7,6 +7,7 @@ class AdvancedUserRecipe1332847053(BasicNewsRecipe): title = u'Editoriali' __author__ = 'faber1971' description = 'Leading articles on Italy by the best Italian editorials' + language = 'it' oldest_article = 1 max_articles_per_feed = 100 diff --git a/recipes/f1_ultra.recipe b/recipes/f1_ultra.recipe new file mode 100644 index 0000000000..ada82542fc --- /dev/null +++ b/recipes/f1_ultra.recipe @@ -0,0 +1,35 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class f1ultra(BasicNewsRecipe): + title = u'Formuła 1 - F1 ultra' + __license__ = 'GPL v3' + __author__ = 'MrStefan , Artur Stachecki ' + language = 'pl' + description =u'Formuła 1, Robert Kubica, F3, GP2 oraz inne serie wyścigowe.' 
+ masthead_url='http://www.f1ultra.pl/templates/f1ultra/images/logo.gif' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[(dict(name = 'div', attrs = {'id' : 'main'}))] + remove_tags_after =[dict(attrs = {'style' : 'margin-top:5px;margin-bottom:5px;display: inline;'})] + remove_tags =[(dict(attrs = {'class' : ['buttonheading', 'avPlayerContainer', 'createdate']}))] + remove_tags.append(dict(attrs = {'title' : ['PDF', 'Drukuj', 'Email']})) + remove_tags.append(dict(name = 'form', attrs = {'method' : 'post'})) + remove_tags.append(dict(name = 'hr', attrs = {'size' : '2'})) + + preprocess_regexps = [(re.compile(r'align="left"'), lambda match: ''), + (re.compile(r'align="right"'), lambda match: ''), + (re.compile(r'width=\"*\"'), lambda match: ''), + (re.compile(r'\'), lambda match: '')] + + + extra_css = '''.contentheading { font-size: 1.4em; font-weight: bold; } + img { display: block; clear: both;} + ''' + remove_attributes = ['width','height','position','float','padding-left','padding-right','padding','text-align'] + + feeds = [(u'F1 Ultra', u'http://www.f1ultra.pl/index.php?option=com_rd_rss&id=1&Itemid=245')] diff --git a/recipes/foreign_policy.recipe b/recipes/foreign_policy.recipe index 893d055a05..4ddecf842f 100644 --- a/recipes/foreign_policy.recipe +++ b/recipes/foreign_policy.recipe @@ -8,6 +8,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1349086293(BasicNewsRecipe): title = u'Foreign Policy' + language = 'en' __author__ = 'Darko Miletic' description = 'International News' publisher = 'Washingtonpost.Newsweek Interactive, LLC' diff --git a/recipes/gazeta_pl_krakow.recipe b/recipes/gazeta_pl_krakow.recipe index 0f35e536f6..59188a5d6a 100644 --- a/recipes/gazeta_pl_krakow.recipe +++ b/recipes/gazeta_pl_krakow.recipe @@ -8,7 +8,6 @@ krakow.gazeta.pl ''' from calibre.web.feeds.news import BasicNewsRecipe -import re class 
gw_krakow(BasicNewsRecipe): title = u'Gazeta.pl Kraków' @@ -46,7 +45,7 @@ class gw_krakow(BasicNewsRecipe): remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_buttons'})) remove_tags_after = [dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})] - + feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/krakow.xml')] def skip_ad_pages(self, soup): diff --git a/recipes/gazeta_pl_warszawa.recipe b/recipes/gazeta_pl_warszawa.recipe index 7a43931db4..2d95bcc06f 100644 --- a/recipes/gazeta_pl_warszawa.recipe +++ b/recipes/gazeta_pl_warszawa.recipe @@ -8,7 +8,6 @@ warszawa.gazeta.pl ''' from calibre.web.feeds.news import BasicNewsRecipe -import re class gw_wawa(BasicNewsRecipe): title = u'Gazeta.pl Warszawa' @@ -43,7 +42,7 @@ class gw_wawa(BasicNewsRecipe): remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'})) remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'})) - + feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/warszawa.xml')] def skip_ad_pages(self, soup): diff --git a/recipes/icons/antyweb.png b/recipes/icons/antyweb.png new file mode 100644 index 0000000000..8ca9870f60 Binary files /dev/null and b/recipes/icons/antyweb.png differ diff --git a/recipes/icons/bankier_pl.png b/recipes/icons/bankier_pl.png new file mode 100644 index 0000000000..c26f006a57 Binary files /dev/null and b/recipes/icons/bankier_pl.png differ diff --git a/recipes/icons/business_standard.png b/recipes/icons/business_standard.png index 1edff420c0..f4c04e566a 100644 Binary files a/recipes/icons/business_standard.png and b/recipes/icons/business_standard.png differ diff --git a/recipes/icons/f1_ultra.png b/recipes/icons/f1_ultra.png new file mode 100644 index 0000000000..f45a94f53a Binary files /dev/null and b/recipes/icons/f1_ultra.png differ diff --git a/recipes/icons/myapple_pl.png b/recipes/icons/myapple_pl.png new 
file mode 100644 index 0000000000..a68cf4e7ef Binary files /dev/null and b/recipes/icons/myapple_pl.png differ diff --git a/recipes/icons/telepolis_pl.png b/recipes/icons/telepolis_pl.png new file mode 100644 index 0000000000..0b94658d94 Binary files /dev/null and b/recipes/icons/telepolis_pl.png differ diff --git a/recipes/ihned.cz.recipe b/recipes/ihned.cz.recipe new file mode 100644 index 0000000000..a35be06dd1 --- /dev/null +++ b/recipes/ihned.cz.recipe @@ -0,0 +1,36 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class ihnedRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'iHNed.cz' + publisher = u'' + description = 'ihned.cz' + oldest_article = 1 + max_articles_per_feed = 20 + use_embedded_content = False + + feeds = [ + (u'Zprávy', u'http://zpravy.ihned.cz/?m=rss'), + (u'Hospodářské noviny', u'http://hn.ihned.cz/?p=500000_rss'), + (u'Byznys', u'http://byznys.ihned.cz/?m=rss'), + (u'Life', u'http://life.ihned.cz/?m=rss'), + (u'Dialog', u'http://dialog.ihned.cz/?m=rss') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://rss.ihned.cz/img/0/0_hp09/ihned.cz.gif' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + remove_tags_before = dict(name='div', attrs={'id':['heading']}) + remove_tags_after = dict(name='div', attrs={'id':['next-authors']}) + remove_tags = [dict(name='ul', attrs={'id':['comm']}), + dict(name='div', attrs={'id':['r-big']}), + dict(name='div', attrs={'class':['tools tools-top']})] diff --git a/recipes/insider.recipe b/recipes/insider.recipe new file mode 100644 index 0000000000..faaf00a14a --- /dev/null +++ b/recipes/insider.recipe @@ -0,0 +1,59 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +import re + +from calibre.web.feeds.news import BasicNewsRecipe + +class insider(BasicNewsRecipe): + __author__ = 
'bubak' + title = 'Insider' + language = 'cs' + + remove_tags = [dict(name='div', attrs={'class':'article-related-content'}) + ,dict(name='div', attrs={'class':'calendar'}) + ,dict(name='span', attrs={'id':'labelHolder'}) + ] + + no_stylesheets = True + keep_only_tags = [dict(name='div', attrs={'class':['doubleBlock textContentFormat']})] + + preprocess_regexps = [(re.compile(r'T.mata:.*', re.DOTALL|re.IGNORECASE), lambda m: '')] + needs_subscription = True + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + br.open('http://www.denikinsider.cz/') + br.select_form(nr=0) + br['login-name'] = self.username + br['login-password'] = self.password + res = br.submit() + raw = res.read() + if u'Odhlásit se' not in raw: + raise ValueError('Failed to login to insider.cz' + 'Check your username and password.') + return br + + def parse_index(self): + articles = [] + + soup = self.index_to_soup('http://www.denikinsider.cz') + titles = soup.findAll('span', attrs={'class':'homepageArticleTitle'}) + if titles is None: + raise ValueError('Could not find category content') + + articles = [] + seen_titles = set([]) + for title in titles: + if title.string in seen_titles: + continue + article = title.parent + seen_titles.add(title.string) + url = article['href'] + if url.startswith('/'): + url = 'http://www.denikinsider.cz/'+url + self.log('\tFound article:', title, 'at', url) + articles.append({'title':title.string, 'url':url, 'description':'', + 'date':''}) + return [(self.title, articles)] + + diff --git a/recipes/kudy_z_nudy.recipe b/recipes/kudy_z_nudy.recipe new file mode 100644 index 0000000000..d7c0d9ecf9 --- /dev/null +++ b/recipes/kudy_z_nudy.recipe @@ -0,0 +1,32 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class kudyznudyRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Kudy z nudy' + publisher = u'' + description = 'kudyznudy.cz' + 
oldest_article = 3 + max_articles_per_feed = 20 + use_embedded_content = False + + feeds = [ + (u'Praha nejnovější', u'http://www.kudyznudy.cz/RSS/Charts.aspx?Type=Newest&Lang=cs-CZ&RegionId=1') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.kudyznudy.cz/App_Themes/KzN/Images/Containers/Header/HeaderLogoKZN.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + remove_tags_before = dict(name='div', attrs={'class':['C_WholeContentPadding']}) + remove_tags_after = dict(name='div', attrs={'class':['SurroundingsContainer']}) + remove_tags = [dict(name='div', attrs={'class':['Details', 'buttons', 'SurroundingsContainer', 'breadcrumb']})] + + keep_only_tags = [] diff --git a/recipes/lidovky.recipe b/recipes/lidovky.recipe new file mode 100644 index 0000000000..8e4754829b --- /dev/null +++ b/recipes/lidovky.recipe @@ -0,0 +1,40 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class lnRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'lidovky' + publisher = u'' + description = 'lidovky.cz' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Události', u'http://www.lidovky.cz/export/rss.asp?r=ln_domov'), + (u'Svět', u'http://www.lidovky.cz/export/rss.asp?r=ln_zahranici'), + (u'Byznys', u'http://www.lidovky.cz/export/rss.asp?c=ln_byznys'), + (u'Věda', u'http://www.lidovky.cz/export/rss.asp?r=ln_veda'), + (u'Názory', u'http://www.lidovky.cz/export/rss.asp?r=ln_nazory'), + (u'Relax', u'http://www.lidovky.cz/export/rss.asp?c=ln_relax') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://g.lidovky.cz/o/lidovky_ln3b/lidovky-logo.png' + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + remove_attributes = [] + remove_tags_before = dict(name='div', attrs={'id':['content']}) + remove_tags_after = 
dict(name='div', attrs={'class':['authors']}) + preprocess_regexps = [(re.compile(r'
')] + + keep_only_tags = [] + + + + + diff --git a/recipes/metropol_tv.recipe b/recipes/metropol_tv.recipe new file mode 100644 index 0000000000..56f393c96a --- /dev/null +++ b/recipes/metropol_tv.recipe @@ -0,0 +1,29 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class metropolRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Metropol TV' + publisher = u'' + description = 'metropol.cz' + oldest_article = 1 + max_articles_per_feed = 20 + use_embedded_content = False + + feeds = [ + (u'Metropolcv.cz', u'http://www.metropol.cz/rss/') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.metropol.cz/public/css/../images/logo/metropoltv.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + + keep_only_tags = [dict(name='div', attrs={'id':['art-full']})] diff --git a/recipes/myapple_pl.recipe b/recipes/myapple_pl.recipe new file mode 100644 index 0000000000..df5708a325 --- /dev/null +++ b/recipes/myapple_pl.recipe @@ -0,0 +1,49 @@ + +from calibre.web.feeds.news import BasicNewsRecipe + +class MyAppleRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = u'Artur Stachecki ' + language = 'pl' + version = 1 + + title = u'MyApple.pl' + category = u'News' + description = u' Największy w Polsce serwis zajmujący się tematyką związaną z Apple i wszelkimi produktami tej firmy.' 
+ cover_url='' + remove_empty_feeds= True + no_stylesheets=True + oldest_article = 7 + max_articles_per_feed = 100000 + recursions = 0 + + no_stylesheets = True + remove_javascript = True + simultaneous_downloads = 3 + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'article_content'})) + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'class' : 'article_author_date_comment_container'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'fullwidth'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'cmslinks'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'googleads-468'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'comments'})) + + + extra_css = ''' + body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} + td.contentheading{font-size: large; font-weight: bold;} + ''' + + feeds = [ + ('News', 'feed://myapple.pl/external.php?do=rss&type=newcontent&sectionid=1&days=120&count=10'), + ] + + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/nadacni_fond_proti_korupci.recipe b/recipes/nadacni_fond_proti_korupci.recipe new file mode 100644 index 0000000000..2a8a69283c --- /dev/null +++ b/recipes/nadacni_fond_proti_korupci.recipe @@ -0,0 +1,30 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class nfpkRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Nadační fond proti korupci' + publisher = u'' + description = 'nfpk.cz' + oldest_article = 7 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Aktuality', u'http://feeds.feedburner.com/nfpk') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.nfpk.cz/_templates/nfpk/_images/logo.gif' + 
class nepszabadsag(BasicNewsRecipe):
    """Calibre news recipe for Népszabadság (nol.hu).

    Only the "Belföld" feed is enabled; the other section feeds are kept
    below as comments so they can be switched on again.
    """

    title = u'N\u00e9pszabads\u00e1g'
    description = ''
    __author__ = 'bubak'
    use_embedded_content = False
    timefmt = ' [%d %b %Y]'
    oldest_article = 2
    max_articles_per_feed = 20
    no_stylesheets = True
    language = 'hu'
    #delay = 1
    #timeout = 10
    simultaneous_downloads = 5

    #encoding = 'utf-8'
    remove_javascript = True
    cover_url = 'http://nol.hu/_design/image/logo_nol_live.jpg'

    feeds = [
        (u'Belföld', u'http://nol.hu/feed/belfold.rss')
        #,(u'Külföld', u'http://nol.hu/feed/kulfold.rss')
        #,(u'Gazdaság', u'http://nol.hu/feed/gazdasag.rss')
        #,(u'Kultúra', u'http://nol.hu/feed/kult.rss')
    ]

    extra_css = '''
    '''

    remove_attributes = []
    remove_tags_before = dict(name='div', attrs={'class': ['d-source']})
    remove_tags_after = dict(name='div', attrs={'class': ['tags']})
    remove_tags = [dict(name='div', attrs={'class': ['h']}),
                   dict(name='tfoot')]

    keep_only_tags = [dict(name='table', attrs={'class': 'article-box'})]

    # NS sends an ad page sometimes but not frequently enough, TBD
    # NOTE(review): the "AA" prefix presumably keeps this from being picked
    # up as the skip_ad_pages hook until it is finished - confirm.
    def AAskip_ad_pages(self, soup):
        """Follow the first link of an interstitial advertisement page.

        :param soup: parsed page that was downloaded for an article.
        :return: the decoded content of the page behind the first ``<a>``
            when the page title contains "advertisement"; ``None`` when the
            page is not an ad page or has no usable title or link.
        """
        title_tag = soup.find('title')
        # .string is None when <title> is missing or has nested markup;
        # treat such pages as ordinary articles instead of raising.
        title_text = title_tag.string if title_tag is not None else None
        if not title_text or 'advertisement' not in title_text.lower():
            return None

        link = soup.find('a')
        href = link.get('href') if link is not None else None
        if not href:
            # An ad page without a link gives us nowhere to skip to.
            return None

        self.log.debug('Skipping to: ' + href)
        new = self.browser.open(href).read().decode('utf-8', 'ignore')
        self.log.debug('Finished: ' + href)
        return new
class pesRecipe(BasicNewsRecipe):
    """Calibre news recipe for Neviditelný pes (neviditelnypes.lidovky.cz)."""

    # Identification
    title = u'Neviditelný pes'
    description = u'Neviditelný pes'
    publisher = u''
    __author__ = 'bubak'
    language = 'cs'
    cover_url = 'http://g.zpravy.cz/o/pes/logo_pes.jpg'

    # Feed selection
    feeds = [
        (u'Neviditelný pes', u'http://neviditelnypes.lidovky.cz/export/rss.asp?c=pes_neviditelny'),
    ]
    oldest_article = 1
    max_articles_per_feed = 20
    remove_empty_feeds = True
    use_embedded_content = False

    # Page clean-up (an 'iso-8859-2' encoding override was left disabled)
    #encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    extra_css = """
    """

    # Keep what lies between the article container and the authors box.
    remove_tags_before = {'name': 'div', 'attrs': {'id': 'art-full'}}
    remove_tags_after = {'name': 'div', 'attrs': {'id': 'authors'}}
    remove_tags = []
def encoding(self, source):
    """Decode a downloaded page, dropping pages whose final URL was seen.

    Per the original author's note, this site serves identical articles
    under different links that redirect to one common URL, and this hook
    was the only place found where that real URL is available.

    :param source: downloaded page data exposing ``newurl`` (the URL
        after redirects) and ``decode``.
    :return: the page decoded as UTF-8 (invalid bytes replaced) the first
        time a URL is seen, ``None`` for every later occurrence.
    """
    real_url = source.newurl
    seen = self.visited_urls

    if real_url not in seen:
        seen[real_url] = True
        self.log.debug('Accepting: ' + real_url)
        return source.decode('utf-8', 'replace')

    self.log.debug('Ignoring duplicate: ' + real_url)
    return None
# NOTE(review): the class name matches the one used by the Nadační fond
# proti korupci recipe; it is kept unchanged here, but looks like a
# copy-paste leftover - confirm before renaming.
class nfpkRecipe(BasicNewsRecipe):
    """Calibre news recipe for Piratské noviny (piratskenoviny.cz)."""

    __author__ = 'bubak'
    title = u'Piratské noviny'
    publisher = u''
    # Was 'nfpk.cz', copied from another recipe; every URL in this recipe
    # points at piratskenoviny.cz.
    description = 'piratskenoviny.cz'
    oldest_article = 2
    max_articles_per_feed = 20
    use_embedded_content = False
    remove_empty_feeds = True

    feeds = [
        (u'Aktuality', u'http://www.piratskenoviny.cz/run/rss.php')
    ]

    #encoding = 'iso-8859-2'
    language = 'cs'
    cover_url = 'http://www.piratskenoviny.cz/imgs/piratske-noviny.gif'
    remove_javascript = True
    no_stylesheets = True
    extra_css = """
    """

    remove_attributes = []
    # Keep the content between the large-font headline and the first iframe.
    remove_tags_before = dict(name='font', attrs={'size': '+3'})
    remove_tags_after = [dict(name='iframe')]
    conversion_options = {'linearize_tables': True}
def parse_page(self, feed):
    """Collect the not-yet-seen articles linked from one section page.

    :param feed: ``(feed_title, url)`` pair, as stored in ``self.feeds``.
    :return: ``(feed_title, articles)`` where ``articles`` is a list of
        dicts with the keys ``title``, ``url``, ``description`` and
        ``date``. The list is empty when the page has no usable links.

    Titles already present in ``self.seen_titles`` are skipped, so an
    article linked from several section pages is listed only once.
    """
    # Unpack in the body: tuple parameters in the signature are Python 2
    # only syntax. Callers still pass a single (title, url) pair.
    feed_title, url = feed

    soup = self.index_to_soup(url)
    links = soup.findAll('a', attrs={'class': 'nadpis'})
    if not links:
        # findAll returns an empty list, never None; report it and hand
        # back an empty feed, which is what effectively happened before.
        self.log('\tNo articles found on page', url)

    articles = []
    for link in links:
        title = link.string
        href = link.get('href')
        # .string is None for anchors with nested markup and href may be
        # absent; such links cannot yield a usable article entry.
        if title is None or not href:
            continue
        if title in self.seen_titles:
            continue
        self.seen_titles.add(title)

        # Section pages use links relative to the "minule" directory.
        if not href.startswith('http'):
            href = self.parent_url + href
        self.log('\tFound article:', title, 'at', href)
        articles.append({'title': title, 'url': href, 'description': '',
                         'date': ''})
    return (feed_title, articles)
'cp1250' + language = 'cs' + cover_url = 'http://respekt.ihned.cz/img/R/respekt_logo.png' + remove_javascript = True + no_stylesheets = True + + remove_tags = [dict(name='div', attrs={'class':['d-tools', 'actions']})] + remove_tags_before = dict(name='div',attrs={'id':['detail']}) + remove_tags_after = dict(name='div',attrs={'class':'d-tools'}) + preprocess_regexps = [(re.compile(r'