Updated Polish news sources
@@ -40,6 +40,7 @@ recipes/.gitignore
 recipes/README.md
 recipes/icon_checker.py
 recipes/readme_updater.py
+recipes/garfield.recipe
 recipes/katalog_egazeciarz.recipe
 recipes/tv_axnscifi.recipe
 recipes/tv_comedycentral.recipe
@@ -63,6 +64,7 @@ recipes/tv_tvppolonia.recipe
 recipes/tv_tvpuls.recipe
 recipes/tv_viasathistory.recipe
 recipes/icons/katalog_egazeciarz.png
+recipes/icons/garfield.png
 recipes/icons/tv_axnscifi.png
 recipes/icons/tv_comedycentral.png
 recipes/icons/tv_discoveryscience.png
@@ -12,12 +12,6 @@ class EsensjaRSS(BasicNewsRecipe):
     language = 'pl'
     encoding = 'utf-8'
     INDEX = 'http://www.esensja.pl'
-    extra_css = '''.t-title {font-size: x-large; font-weight: bold; text-align: left}
-    .t-author {font-size: x-small; text-align: left}
-    .t-title2 {font-size: x-small; font-style: italic; text-align: left}
-    .text {font-size: small; text-align: left}
-    .annot-ref {font-style: italic; text-align: left}
-    '''
     cover_url = ''
     masthead_url = 'http://esensja.pl/img/wrss.gif'
     use_embedded_content = False
recipes/forbes_pl.recipe  (new file, 53 lines)
@@ -0,0 +1,53 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe
import datetime
import re


class forbes_pl(BasicNewsRecipe):
    title = u'Forbes.pl'
    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>'
    language = 'pl'
    description = u'Biznes, finanse, gospodarka, strategie, wiadomości gospodarcze, analizy finansowe i strategiczne.'
    oldest_article = 1
    index = 'http://www.forbes.pl'
    cover_url = 'http://www.forbes.pl/resources/front/images/logo.png'
    max_articles_per_feed = 100
    extra_css = '.Block-Photo {float:left; max-width: 300px; margin-right: 5px;}'
    preprocess_regexps = [(re.compile(ur'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: '')]
    remove_javascript = True
    no_stylesheets = True
    now = datetime.datetime.now()
    yesterday = now - datetime.timedelta(hours=24)
    yesterday = yesterday.strftime("%d.%m.%Y %H:%M:%S")
    pages_count = 4
    keep_only_tags = [dict(attrs={'class':['Block-Node Content-Article ', 'Block-Node Content-Article piano-closed']})]
    remove_tags = [dict(attrs={'class':['Keywords Styled', 'twitter-share-button', 'Block-List-Related Block-List']})]

    feeds = [(u'Wszystkie', 'http://www.forbes.pl/rss')]

    '''def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup

    def append_page(self, soup, appendtag):
        cleanup = False
        nexturl = appendtag.find('a', attrs={'class':'next'})
        if nexturl:
            cleanup = True
        while nexturl:
            soup2 = self.index_to_soup(self.index + nexturl['href'])
            nexturl = soup2.find('a', attrs={'class':'next'})
            pagetext = soup2.findAll(id='article-body-wrapper')
            if not pagetext:
                pagetext = soup2.findAll(attrs={'class':'Article-Entry Styled'})
            for comment in pagetext.findAll(text=lambda text:isinstance(text, Comment)):
                comment.extract()
            pos = len(appendtag.contents)
            appendtag.insert(pos, pagetext)
        if cleanup:
            for r in appendtag.findAll(attrs={'class':'paginator'}):
                r.extract()'''
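Note on the disabled pagination block above: as committed, it could not run even if uncommented, because `Comment` is never imported and `findAll()` returns a result set, which itself has no `findAll()` method. A minimal corrected sketch, assuming the same `article-body-wrapper` / `Article-Entry Styled` markup on forbes.pl:

```python
from calibre.ebooks.BeautifulSoup import Comment

def append_page(self, soup, appendtag):
    # Walk the 'next' links and splice each page's article body
    # onto the end of the first page.
    cleanup = False
    nexturl = appendtag.find('a', attrs={'class': 'next'})
    if nexturl:
        cleanup = True
    while nexturl:
        soup2 = self.index_to_soup(self.index + nexturl['href'])
        nexturl = soup2.find('a', attrs={'class': 'next'})
        fragments = soup2.findAll(id='article-body-wrapper')
        if not fragments:
            fragments = soup2.findAll(attrs={'class': 'Article-Entry Styled'})
        # iterate over the result set; it has no findAll() of its own
        for pagetext in fragments:
            for comment in pagetext.findAll(
                    text=lambda text: isinstance(text, Comment)):
                comment.extract()
            appendtag.insert(len(appendtag.contents), pagetext)
    if cleanup:
        # the paginator is useless once everything is on one page
        for r in appendtag.findAll(attrs={'class': 'paginator'}):
            r.extract()
```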
@@ -10,7 +10,7 @@ krakow.gazeta.pl
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class gw_krakow(BasicNewsRecipe):
-    title = u'Gazeta.pl Kraków'
+    title = u'Gazeta Wyborcza Kraków'
     __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
     language = 'pl'
     description = u'Wiadomości z Krakowa na portalu Gazeta.pl.'
@@ -5,7 +5,7 @@ import string
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class GazetaPlSzczecin(BasicNewsRecipe):
-    title = u'Gazeta.pl Szczecin'
+    title = u'Gazeta Wyborcza Szczecin'
     description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
     __author__ = u'Michał Szkutnik'
     __license__ = u'GPL v3'
@@ -10,7 +10,7 @@ warszawa.gazeta.pl
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class gw_wawa(BasicNewsRecipe):
-    title = u'Gazeta.pl Warszawa'
+    title = u'Gazeta Wyborcza Warszawa'
     __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
     language = 'pl'
     description = 'Wiadomości z Warszawy na portalu Gazeta.pl.'
@@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Comment
 
 class Gazeta_Wyborcza(BasicNewsRecipe):
-    title = u'Gazeta.pl'
+    title = u'Gazeta Wyborcza'
     __author__ = 'fenuks, Artur Stachecki'
     language = 'pl'
     description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
BIN  recipes/icons/forbes_pl.png  (new file, 1.2 KiB)
BIN  modified icons (×4)  Before: 802 B | After: 294 B
BIN  recipes/icons/slashdot.png  (new file, 250 B)
BIN  recipes/icons/sportowefakty.png  (new file, 511 B)
BIN  recipes/icons/wysokie_obcasy.png  (new file, 205 B)
recipes/sportowefakty.recipe  (new file, 70 lines)
@@ -0,0 +1,70 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image


class sportowefakty(BasicNewsRecipe):
    title = u'SportoweFakty'
    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>, Tomasz Długosz <tomek3d@gmail.com>'
    language = 'pl'
    description = u'Najważniejsze informacje sportowe z kraju i ze świata, relacje, komentarze, wywiady, zdjęcia!'
    oldest_article = 1
    masthead_url = 'http://www.sportowefakty.pl/images/logo.png'
    max_articles_per_feed = 100
    simultaneous_downloads = 5
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    ignore_duplicate_articles = {'title', 'url'}

    keep_only_tags = [dict(attrs={'class': 'box-article'})]
    remove_tags = []
    remove_tags.append(dict(attrs={'class': re.compile(r'^newsStream')}))
    remove_tags.append(dict(attrs={'target': '_blank'}))

    feeds = [
        (u'Piłka Nożna', u'http://www.sportowefakty.pl/pilka-nozna/index.rss'),
        (u'Koszykówka', u'http://www.sportowefakty.pl/koszykowka/index.rss'),
        (u'Żużel', u'http://www.sportowefakty.pl/zuzel/index.rss'),
        (u'Siatkówka', u'http://www.sportowefakty.pl/siatkowka/index.rss'),
        (u'Zimowe', u'http://www.sportowefakty.pl/zimowe/index.rss'),
        (u'Hokej', u'http://www.sportowefakty.pl/hokej/index.rss'),
        (u'Moto', u'http://www.sportowefakty.pl/moto/index.rss'),
        (u'Tenis', u'http://www.sportowefakty.pl/tenis/index.rss')
    ]

    def get_article_url(self, article):
        link = article.get('link', None)
        if 'utm_source' in link:
            return link.split('?utm')[0]
        else:
            return link

    def print_version(self, url):
        print_url = url + '/drukuj'
        return print_url

    def preprocess_html(self, soup):
        head = soup.find('h1')
        if 'Fotorelacja' in self.tag_to_string(head):
            return None
        else:
            for alink in soup.findAll('a'):
                if alink.string is not None:
                    tstr = alink.string
                    alink.replaceWith(tstr)
            return soup

    def postprocess_html(self, soup, first):
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            if img < 0:
                raise RuntimeError('Out of memory')
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup
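`get_article_url` in the recipe above strips Google Analytics tracking parameters from feed links, so `ignore_duplicate_articles` can match the same story across feeds; `print_version` then simply appends `/drukuj` to fetch the print view. The rule is easy to check standalone (the sample URL below is made up for illustration):

```python
def strip_tracking(link):
    # same rule as sportowefakty.get_article_url: drop everything from '?utm'
    if 'utm_source' in link:
        return link.split('?utm')[0]
    return link

# hypothetical feed link, for illustration only
url = 'http://www.sportowefakty.pl/pilka-nozna/artykul?utm_source=rss&utm_medium=feed'
print strip_tracking(url)               # http://www.sportowefakty.pl/pilka-nozna/artykul
print strip_tracking(url) + '/drukuj'   # what print_version would download
```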
@@ -1,144 +0,0 @@
#!/usr/bin/env python

from calibre.web.feeds.recipes import BasicNewsRecipe


class GazetaWyborczaDuzyForma(BasicNewsRecipe):
    cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
    title = u"Gazeta Wyborcza Duzy Format"
    __author__ = 'ravcio - rlelusz[at]gmail.com'
    description = u"Articles from Gazeta's website"
    language = 'pl'
    max_articles_per_feed = 50  # you can increase it even up to maybe 600, should still work
    recursions = 0
    encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

    keep_only_tags = [
        dict(name='div', attrs={'id': ['k1']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']}),
        dict(name='div', attrs={'id': ['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']}),
        dict(name='ul', attrs={'id': ['articleToolbar']}),
        dict(name='img', attrs={'class': ['brand']}),
        dict(name='h5', attrs={'class': ['author']}),
        dict(name='h6', attrs={'class': ['date']}),
        dict(name='p', attrs={'class': ['txt_upl']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'id': ['Str']})  # page-number navigator
    ]

    def load_article_links(self, url, count):
        print '--- load_article_links', url, count

        # page with links to articles
        soup = self.index_to_soup(url)

        # table with articles
        list = soup.find('div', attrs={'class': 'GWdalt'})

        # single articles (link, title, ...)
        links = list.findAll('div', attrs={'class': ['GWdaltE']})

        if len(links) < count:
            # load links to more articles...

            # remove new link
            pages_nav = list.find('div', attrs={'class': 'pages'})
            next = pages_nav.find('a', attrs={'class': 'next'})
            if next:
                print 'next=', next['href']
                url = 'http://wyborcza.pl' + next['href']
                # e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'

                older_links = self.load_article_links(url, count - len(links))
                links.extend(older_links)

        return links

    # produce list of articles to download
    def parse_index(self):
        print '--- parse_index'

        max_articles = 8000
        links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)

        ans = []
        key = None
        articles = {}

        key = 'Uncategorized'
        articles[key] = []

        for div_art in links:
            div_date = div_art.find('div', attrs={'class': 'kL'})
            div = div_art.find('div', attrs={'class': 'kR'})

            a = div.find('a', href=True)

            url = a['href']
            title = a.string
            description = ''
            pubdate = div_date.string.rstrip().lstrip()
            summary = div.find('span', attrs={'class': 'lead'})

            desc = summary.find('a', href=True)
            if desc:
                desc.extract()

            description = self.tag_to_string(summary, use_alt=False)
            description = description.rstrip().lstrip()

            feed = key if key is not None else 'Duzy Format'

            if not articles.has_key(feed):
                articles[feed] = []

            if description != '':  # skip picture-only articles
                articles[feed].append(
                    dict(title=title, url=url, date=pubdate,
                         description=description,
                         content=''))

        ans = [(key, articles[key])]
        return ans

    def append_page(self, soup, appendtag, position):
        pager = soup.find('div', attrs={'id': 'Str'})
        if pager:
            # look for an 'a' element containing 'nast' (next); if not found, exit
            list = pager.findAll('a')

            for elem in list:
                if 'nast' in elem.string:
                    nexturl = elem['href']

                    soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)

                    texttag = soup2.find('div', attrs={'id': 'artykul'})

                    newpos = len(texttag.contents)
                    self.append_page(soup2, texttag, newpos)
                    texttag.extract()
                    appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3)

        # finally remove some tags
        pager = soup.find('div', attrs={'id': 'Str'})
        if pager:
            pager.extract()

        pager = soup.find('div', attrs={'class': 'tylko_int'})
        if pager:
            pager.extract()

        return soup
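For reference, the removed recipe built its index by recursing through the listing's 'next' pager until it had collected the requested number of article entries. Condensed to its skeleton (selectors as in the removed code; the method name here is illustrative):

```python
def collect_links(self, url, count):
    # one listing page worth of article entries...
    soup = self.index_to_soup(url)
    listing = soup.find('div', attrs={'class': 'GWdalt'})
    links = listing.findAll('div', attrs={'class': ['GWdaltE']})
    if len(links) < count:
        # ...then recurse into the next page until enough are collected
        nxt = listing.find('div', attrs={'class': 'pages'}).find(
            'a', attrs={'class': 'next'})
        if nxt:
            links.extend(self.collect_links(
                'http://wyborcza.pl' + nxt['href'], count - len(links)))
    return links
```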
recipes/wysokie_obcasy.recipe  (new file, 57 lines)
@@ -0,0 +1,57 @@
#!/usr/bin/env python
__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe

class WysokieObcasyRecipe(BasicNewsRecipe):
    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
    language = 'pl'
    version = 1

    title = u'Wysokie Obcasy'
    publisher = 'Agora SA'
    description = u'Serwis sobotniego dodatku do Gazety Wyborczej'
    category = 'magazine'
    language = 'pl'
    publication_type = 'magazine'
    cover_url = ''
    remove_empty_feeds = True
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 100000
    recursions = 0

    no_stylesheets = True
    remove_javascript = True
    simultaneous_downloads = 5

    keep_only_tags = []
    keep_only_tags.append(dict(name='div', attrs={'id': 'article'}))

    remove_tags = []
    remove_tags.append(dict(name='img'))
    remove_tags.append(dict(name='p', attrs={'class': 'info'}))

    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
        h1 {text-align: left;}
    '''

    feeds = [
        ('Wszystkie Artykuly', 'feed://www.wysokieobcasy.pl/pub/rss/wysokieobcasy.xml'),
    ]

    def print_version(self, url):
        baseURL = 'http://www.wysokieobcasy.pl/wysokie-obcasy'
        segments = url.split(',')
        subPath = '/2029020,'
        articleURL1 = segments[1]
        articleURL2 = segments[2]
        printVerString = articleURL1 + ',' + articleURL2
        s = baseURL + subPath + printVerString + '.html'
        return s

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.wysokieobcasy.pl/wysokie-obcasy/0,0.html')
        self.cover_url = soup.find(attrs={'class': 'holder_cr'}).find('img')['src']
        return getattr(self, 'cover_url', self.cover_url)
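`print_version` above keeps the second and third comma-separated segments of the article URL and grafts them onto the print path under the `/2029020,` prefix. Assuming a typical Gazeta.pl-style link with four comma segments (the ids below are invented for illustration), the rewrite behaves like this:

```python
def print_version(url):
    # same transformation as WysokieObcasyRecipe.print_version
    baseURL = 'http://www.wysokieobcasy.pl/wysokie-obcasy'
    segments = url.split(',')
    return baseURL + '/2029020,' + segments[1] + ',' + segments[2] + '.html'

# hypothetical article URL, for illustration only
url = 'http://www.wysokieobcasy.pl/wysokie-obcasy/1,96856,17305635,Tytul.html'
print print_version(url)
# -> http://www.wysokieobcasy.pl/wysokie-obcasy/2029020,96856,17305635.html
```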