Various new and updated Polish news sources
recipes/alejakomiksu_com.recipe  (new file, 37 lines)
@@ -0,0 +1,37 @@
__license__ = 'GPL v3'
import re
from calibre.web.feeds.news import BasicNewsRecipe


class AlejaKomiksu(BasicNewsRecipe):
    title = u'Aleja Komiksu'
    __author__ = 'fenuks'
    description = u'Serwis poświęcony komiksom. Najnowsze wieści, recenzje, artykuły, wywiady, galerie, komiksy online, konkursy, linki, baza komiksów online.'
    category = 'comics'
    #publication_type = ''
    language = 'pl'
    #encoding = ''
    extra_css = 'ul {list-style-type: none;} .gfx_news {float: right;}'
    preprocess_regexps = [(re.compile(ur'((<li class="no_img_b">(Do poczytania)|(Nowości):</li>)|(<p class="head2">Komentarze</p>)).*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
    cover_url = 'http://www.alejakomiksu.com/gfx/build/logo.png'
    masthead_url = 'http://www.alejakomiksu.com/gfx/build/logo.png'
    use_embedded_content = False
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}

    keep_only_tags = [dict(attrs={'class':'cont_tresc'})]
    #remove_tags = [dict()]
    #remove_tags_before = dict()

    feeds = [(u'Wiadomości', 'http://www.alejakomiksu.com/rss.php5')]

    def skip_ad_pages(self, soup):
        tag = soup.find(attrs={'class':'rodzaj'})
        if tag and tag.a.string.lower().strip() == 'recenzje':
            link = soup.find(text=re.compile('recenzuje'))
            if link:
                return self.index_to_soup(link.parent['href'], raw=True)

recipes/fdb_pl.recipe  (new file, 49 lines)
@@ -0,0 +1,49 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe


class FDBPl(BasicNewsRecipe):
    title = u'Fdb.pl'
    __author__ = 'fenuks'
    description = u'Wiadomości ze świata filmu, baza danych filmowych, recenzje, zwiastuny, boxoffice.'
    category = 'film'
    #publication_type = ''
    language = 'pl'
    #encoding = ''
    extra_css = '.options-left > li {display: inline;} em {display: block;}'
    cover_url = 'http://fdb.pl/assets/fdb2/logo.png'
    #masthead_url = ''
    use_embedded_content = False
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}

    keep_only_tags = [dict(attrs={'class':'news-item news-first'})]
    remove_tags = [dict(attrs={'class':['dig dig-first', 'ads clearfix', 'comments']})]
    #remove_tags_after = dict()
    #remove_tags_before = dict()
    feeds = []

    def parse_index(self):
        feeds = []
        feeds.append((u'Wiadomości', self.get_articles('http://fdb.pl/wiadomosci?page={0}', 2)))
        return feeds

    def get_articles(self, url, pages=1):
        articles = []
        for nr in range(1, pages+1):
            soup = self.index_to_soup(url.format(nr))
            for tag in soup.findAll(attrs={'class':'news-item clearfix'}):
                node = tag.find('h2')
                title = node.a.string
                url = 'http://fdb.pl' + node.a['href']
                date = ''
                articles.append({'title' : title,
                                 'url' : url,
                                 'date' : date,
                                 'description' : ''
                                 })
        return articles

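Side note on the index builder above (a sketch, not part of the diff): the feed URL passed to get_articles carries a {0} placeholder that url.format(nr) fills with the page number on each pass, so pages=2 fetches two listing pages.

    # Illustration only: how the paginated index URL is expanded in get_articles above.
    # The template and page count come straight from the recipe; this loop just prints them.
    url_template = 'http://fdb.pl/wiadomosci?page={0}'
    for nr in range(1, 3):          # pages=2 in the recipe's parse_index call
        print(url_template.format(nr))
    # -> http://fdb.pl/wiadomosci?page=1
    # -> http://fdb.pl/wiadomosci?page=2
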
@@ -8,94 +8,87 @@ krakow.gazeta.pl
 '''

 from calibre.web.feeds.news import BasicNewsRecipe
+import re
+from calibre.ebooks.BeautifulSoup import Comment


 class gw_krakow(BasicNewsRecipe):
     title = u'Gazeta Wyborcza Kraków'
     __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
     language = 'pl'
-    description =u'Wiadomości z Krakowa na portalu Gazeta.pl.'
-    category='newspaper'
+    description = u'Wiadomości z Krakowa na portalu Gazeta.pl.'
+    category = 'newspaper'
     publication_type = 'newspaper'
-    masthead_url='http://bi.gazeta.pl/im/5/8528/m8528105.gif'
-    INDEX='http://krakow.gazeta.pl/'
-    remove_empty_feeds= True
-    oldest_article = 1
+    # encoding = 'iso-8859-2'
+    masthead_url = 'http://bi.gazeta.pl/im/5/8528/m8528105.gif'
+    INDEX = 'http://krakow.gazeta.pl'
+    cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif'
+    remove_empty_feeds = True
+    oldest_article = 3
     max_articles_per_feed = 100
-    remove_javascript=True
-    no_stylesheets=True
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = False
+    ignore_duplicate_articles = {'title', 'url'}

-    keep_only_tags =[]
-    keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'}))
-    remove_tags =[]
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'}))
-    remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'}))
-    remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'}))
-    remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'}))
-    remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_buttons'}))
-    remove_tags_after = [dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})]
+    # rules for gazeta.pl
+    preprocess_regexps = [(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')]
+    keep_only_tags = [dict(id='gazeta_article')]
+    remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})]
+    remove_tags_after = dict(id='gazeta_article_body')

     feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/krakow.xml')]

-    def skip_ad_pages(self, soup):
-        tag=soup.find(name='a', attrs={'class':'btn'})
-        if tag:
-            new_soup=self.index_to_soup(tag['href'], raw=True)
-            return new_soup
-
-    def append_page(self, soup, appendtag):
-        loop=False
-        tag = soup.find('div', attrs={'id':'Str'})
-        if appendtag.find('div', attrs={'id':'Str'}):
-            nexturl=tag.findAll('a')
-            appendtag.find('div', attrs={'id':'Str'}).extract()
-            loop=True
-        if appendtag.find(id='source'):
-            appendtag.find(id='source').extract()
-        while loop:
-            loop=False
-            for link in nexturl:
-                if u'następne' in link.string:
-                    url= self.INDEX + link['href']
-                    soup2 = self.index_to_soup(url)
-                    pagetext = soup2.find(id='artykul')
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                    tag = soup2.find('div', attrs={'id':'Str'})
-                    nexturl=tag.findAll('a')
-                    loop=True
-
-    def gallery_article(self, appendtag):
-        tag=appendtag.find(id='container_gal')
-        if tag:
-            nexturl=appendtag.find(id='gal_btn_next').a['href']
-            appendtag.find(id='gal_navi').extract()
-            while nexturl:
-                soup2=self.index_to_soup(nexturl)
-                pagetext=soup2.find(id='container_gal')
-                nexturl=pagetext.find(id='gal_btn_next')
-                if nexturl:
-                    nexturl=nexturl.a['href']
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)
-                rem=appendtag.find(id='gal_navi')
-                if rem:
-                    rem.extract()
+    def print_version(self, url):
+        if 'feedsportal.com' in url:
+            s = url.rpartition('gazeta0Bpl')
+            u = s[2]
+            if not s[0]:
+                u = url.rpartition('wyborcza0Bpl')[2]
+            u = u.replace('/l/', '/')
+            u = u.replace('/ia1.htm', '')
+            u = u.replace('/story01.htm', '')
+            u = u.replace('0C', '/')
+            u = u.replace('A', '')
+            u = u.replace('0E', '-')
+            u = u.replace('0H', ',')
+            u = u.replace('0I', '_')
+            u = u.replace('0B', '.')
+            u = self.INDEX + u
+            return u
+        else:
+            return url

     def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        if soup.find(id='container_gal'):
-            self.gallery_article(soup.body)
+        tag = soup.find(id='Str')
+        if soup.find(attrs={'class': 'piano_btn_1'}):
+            return None
+        elif tag and tag.findAll('a'):
+            self.append_page(soup, soup.body)
         return soup
+
+    def append_page(self, soup, appendtag):
+        tag = soup.find('div', attrs={'id': 'Str'})
+        try:
+            baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content']
+        except:
+            return 1
+        link = tag.findAll('a')[-1]
+        while link:
+            soup2 = self.index_to_soup(baseurl + link['href'])
+            link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1]
+            if not u'następne' in link.string:
+                link = ''
+            pagetext = soup2.find(id='artykul')
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
+            pos = len(appendtag.contents)
+            appendtag.insert(pos, pagetext)
+        tag.extract()
+
+    def image_url_processor(self, baseurl, url):
+        if url.startswith(' '):
+            return url.strip()
+        else:
+            return url

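Side note on the new print_version above (a sketch, not part of the diff): feedsportal redirect links escape URL characters, and the replace() chain undoes that escaping before the decoded path is prefixed with self.INDEX. The helper below mirrors the same chain; the sample path is a made-up example, not a real feed item.

    # Sketch only; order of replacements mirrors print_version in the recipes above.
    def decode_feedsportal(path):
        path = path.replace('/l/', '/').replace('/ia1.htm', '').replace('/story01.htm', '')
        for enc, dec in (('0C', '/'), ('A', ''), ('0E', '-'),
                         ('0H', ','), ('0I', '_'), ('0B', '.')):
            path = path.replace(enc, dec)
        return path

    # Hypothetical encoded tail, for illustration only.
    print(decode_feedsportal('/l/krakow0C1,44425,173340A770Bhtml/story01.htm'))
    # -> /krakow/1,44425,17334077.html   (the recipe then prepends self.INDEX)
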
@@ -1,8 +1,8 @@
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

 import re
-import string
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment


 class GazetaPlSzczecin(BasicNewsRecipe):
     title = u'Gazeta Wyborcza Szczecin'
@@ -12,24 +12,74 @@ class GazetaPlSzczecin(BasicNewsRecipe):
     language = 'pl'
     publisher = 'Agora S.A.'
     category = 'news, szczecin'
-    oldest_article = 2
+    INDEX = 'http://szczecin.gazeta.pl'
+    cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif'
+    remove_empty_feeds = True
+    oldest_article = 3
     max_articles_per_feed = 100
-    auto_cleanup = True
-    remove_tags = [ { "name" : "a", "attrs" : { "href" : "http://szczecin.gazeta.pl/szczecin/www.gazeta.pl" }}]
-    cover_url = "http://bi.gazeta.pl/i/hp/hp2009/logo.gif"
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = False
+    ignore_duplicate_articles = {'title', 'url'}
+
+    # rules for gazeta.pl
+    preprocess_regexps = [(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')]
+    keep_only_tags = [dict(id='gazeta_article')]
+    remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})]
+    remove_tags_after = dict(id='gazeta_article_body')
+
     feeds = [(u'Wszystkie', u'http://rss.feedsportal.com/c/32739/f/530434/index.rss')]

-    def get_article_url(self, article):
-        s = re.search("""/0L(szczecin.*)/story01.htm""", article.link)
-        s = s.group(1)
-        replacements = { "0B" : ".", "0C" : "/", "0H" : ",", "0I" : "_"}
-        for (a, b) in replacements.iteritems():
-            s = string.replace(s, a, b)
-        s = string.replace(s, "0A", "0")
-        return "http://"+s
-
     def print_version(self, url):
-        s = re.search("""/(\d*),(\d*),(\d*),.*\.html""", url)
-        no1 = s.group(2)
-        no2 = s.group(3)
-        return """http://szczecin.gazeta.pl/szczecin/2029020,%s,%s.html""" % (no1, no2)
+        if 'feedsportal.com' in url:
+            s = url.rpartition('gazeta0Bpl')
+            u = s[2]
+            if not s[0]:
+                u = url.rpartition('wyborcza0Bpl')[2]
+            u = u.replace('/l/', '/')
+            u = u.replace('/ia1.htm', '')
+            u = u.replace('/story01.htm', '')
+            u = u.replace('0C', '/')
+            u = u.replace('A', '')
+            u = u.replace('0E', '-')
+            u = u.replace('0H', ',')
+            u = u.replace('0I', '_')
+            u = u.replace('0B', '.')
+            u = self.INDEX + u
+            return u
+        else:
+            return url
+
+    def preprocess_html(self, soup):
+        tag = soup.find(id='Str')
+        if soup.find(attrs={'class': 'piano_btn_1'}):
+            return None
+        elif tag and tag.findAll('a'):
+            self.append_page(soup, soup.body)
+        return soup
+
+    def append_page(self, soup, appendtag):
+        tag = soup.find('div', attrs={'id': 'Str'})
+        try:
+            baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content']
+        except:
+            return 1
+        link = tag.findAll('a')[-1]
+        while link:
+            soup2 = self.index_to_soup(baseurl + link['href'])
+            link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1]
+            if not u'następne' in link.string:
+                link = ''
+            pagetext = soup2.find(id='artykul')
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
+            pos = len(appendtag.contents)
+            appendtag.insert(pos, pagetext)
+        tag.extract()
+
+    def image_url_processor(self, baseurl, url):
+        if url.startswith(' '):
+            return url.strip()
+        else:
+            return url

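Side note on append_page in these rewritten Gazeta recipes (a sketch, not part of the diff): each follow-up page's 'artykul' div is appended to the article body after its HTML comments are stripped. A minimal standalone version of that comment-stripping step, run on an inline snippet instead of a downloaded page, and assuming the BeautifulSoup wrapper bundled with calibre:

    # Sketch only: strip HTML comments from a parsed fragment the same way
    # append_page does before inserting pagetext into the article body.
    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment

    pagetext = BeautifulSoup('<div id="artykul">Treść strony 2<!-- reklama --></div>')
    for comment in pagetext.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    print(pagetext)   # the <!-- reklama --> comment is gone
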
@@ -7,7 +7,9 @@ __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
 warszawa.gazeta.pl
 '''

+import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment


 class gw_wawa(BasicNewsRecipe):
     title = u'Gazeta Wyborcza Warszawa'
@@ -17,82 +19,75 @@ class gw_wawa(BasicNewsRecipe):
     category='newspaper'
     publication_type = 'newspaper'
     masthead_url='http://bi.gazeta.pl/im/3/4089/m4089863.gif'
-    INDEX='http://warszawa.gazeta.pl/'
-    remove_empty_feeds= True
-    oldest_article = 1
+    INDEX = 'http://warszawa.gazeta.pl'
+    cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif'
+    remove_empty_feeds = True
+    oldest_article = 3
     max_articles_per_feed = 100
-    remove_javascript=True
-    no_stylesheets=True
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = False
+    ignore_duplicate_articles = {'title', 'url'}

-    keep_only_tags =[]
-    keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'}))
-    remove_tags =[]
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'}))
-    remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'}))
-    remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'}))
-    remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'}))
-    remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'}))
-    remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'}))
+    # rules for gazeta.pl
+    preprocess_regexps = [(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')]
+    keep_only_tags = [dict(id='gazeta_article')]
+    remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})]
+    remove_tags_after = dict(id='gazeta_article_body')

     feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/warszawa.xml')]

-    def skip_ad_pages(self, soup):
-        tag=soup.find(name='a', attrs={'class':'btn'})
-        if tag:
-            new_soup=self.index_to_soup(tag['href'], raw=True)
-            return new_soup
-
-    def append_page(self, soup, appendtag):
-        loop=False
-        tag = soup.find('div', attrs={'id':'Str'})
-        if appendtag.find('div', attrs={'id':'Str'}):
-            nexturl=tag.findAll('a')
-            appendtag.find('div', attrs={'id':'Str'}).extract()
-            loop=True
-        if appendtag.find(id='source'):
-            appendtag.find(id='source').extract()
-        while loop:
-            loop=False
-            for link in nexturl:
-                if u'następne' in link.string:
-                    url= self.INDEX + link['href']
-                    soup2 = self.index_to_soup(url)
-                    pagetext = soup2.find(id='artykul')
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                    tag = soup2.find('div', attrs={'id':'Str'})
-                    nexturl=tag.findAll('a')
-                    loop=True
-
-    def gallery_article(self, appendtag):
-        tag=appendtag.find(id='container_gal')
-        if tag:
-            nexturl=appendtag.find(id='gal_btn_next').a['href']
-            appendtag.find(id='gal_navi').extract()
-            while nexturl:
-                soup2=self.index_to_soup(nexturl)
-                pagetext=soup2.find(id='container_gal')
-                nexturl=pagetext.find(id='gal_btn_next')
-                if nexturl:
-                    nexturl=nexturl.a['href']
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)
-                rem=appendtag.find(id='gal_navi')
-                if rem:
-                    rem.extract()
+    def print_version(self, url):
+        if 'feedsportal.com' in url:
+            s = url.rpartition('gazeta0Bpl')
+            u = s[2]
+            if not s[0]:
+                u = url.rpartition('wyborcza0Bpl')[2]
+            u = u.replace('/l/', '/')
+            u = u.replace('/ia1.htm', '')
+            u = u.replace('/story01.htm', '')
+            u = u.replace('0C', '/')
+            u = u.replace('A', '')
+            u = u.replace('0E', '-')
+            u = u.replace('0H', ',')
+            u = u.replace('0I', '_')
+            u = u.replace('0B', '.')
+            u = self.INDEX + u
+            return u
+        else:
+            return url

     def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        if soup.find(id='container_gal'):
-            self.gallery_article(soup.body)
+        tag = soup.find(id='Str')
+        if soup.find(attrs={'class': 'piano_btn_1'}):
+            return None
+        elif tag and tag.findAll('a'):
+            self.append_page(soup, soup.body)
         return soup
+
+    def append_page(self, soup, appendtag):
+        tag = soup.find('div', attrs={'id': 'Str'})
+        try:
+            baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content']
+        except:
+            return 1
+        link = tag.findAll('a')[-1]
+        while link:
+            soup2 = self.index_to_soup(baseurl + link['href'])
+            link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1]
+            if not u'następne' in link.string:
+                link = ''
+            pagetext = soup2.find(id='artykul')
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
+            pos = len(appendtag.contents)
+            appendtag.insert(pos, pagetext)
+        tag.extract()
+
+    def image_url_processor(self, baseurl, url):
+        if url.startswith(' '):
+            return url.strip()
+        else:
+            return url

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Comment
+import re

 class Gazeta_Wyborcza(BasicNewsRecipe):
     title = u'Gazeta Wyborcza'
     __author__ = 'fenuks, Artur Stachecki'
@@ -9,7 +9,7 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
     description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
     category = 'newspaper'
     publication_type = 'newspaper'
-    #encoding = 'iso-8859-2'
+    # encoding = 'iso-8859-2'
     masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
     INDEX = 'http://wyborcza.pl'
     remove_empty_feeds = True
@@ -19,10 +19,18 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
     no_stylesheets = True
     use_embedded_content = False
     ignore_duplicate_articles = {'title', 'url'}
-    remove_tags_before = dict(id='k0')
-    remove_tags_after = dict(id='banP4')
-    remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})]
-    feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'),
+
+    # rules for gazeta.pl
+    preprocess_regexps = [(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')]
+    keep_only_tags = [dict(id='gazeta_article')]
+    remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})]
+    remove_tags_after = dict(id='gazeta_article_body')
+
+    # rules for wyborcza.biz
+    preprocess_regexps.append((re.compile(u'(<br>)?(<br>)? Czytaj (także|też):.*?</a>\.?<br>', re.DOTALL), lambda m: ''))
+
+    feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'),
+             (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'),
              (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'),
              (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'),
              (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'),
@@ -39,86 +47,55 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
             (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'),
             (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'),
             (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss')
             ]

-    def skip_ad_pages(self, soup):
-        tag = soup.find(name='a', attrs={'class': 'btn'})
-        if tag:
-            new_soup = self.index_to_soup(tag['href'], raw=True)
-            return new_soup
-
-    def append_page(self, soup, appendtag):
-        loop = False
-        tag = soup.find('div', attrs={'id': 'Str'})
-        if appendtag.find('div', attrs={'id': 'Str'}):
-            nexturl = tag.findAll('a')
-            appendtag.find('div', attrs={'id': 'Str'}).extract()
-            loop = True
-        if appendtag.find(id='source'):
-            appendtag.find(id='source').extract()
-        while loop:
-            loop = False
-            for link in nexturl:
-                if u'następne' in link.string:
-                    url = self.INDEX + link['href']
-                    soup2 = self.index_to_soup(url)
-                    pagetext = soup2.find(id='artykul')
-                    comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
-                    for comment in comments:
-                        comment.extract()
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                    tag = soup2.find('div', attrs={'id': 'Str'})
-                    nexturl = tag.findAll('a')
-                    loop = True
-
-    def gallery_article(self, appendtag):
-        tag = appendtag.find(id='container_gal')
-        if tag:
-            nexturl = appendtag.find(id='gal_btn_next').a['href']
-            appendtag.find(id='gal_navi').extract()
-            while nexturl:
-                soup2 = self.index_to_soup(nexturl)
-                pagetext = soup2.find(id='container_gal')
-                nexturl = pagetext.find(id='gal_btn_next')
-                if nexturl:
-                    nexturl = nexturl.a['href']
-                comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
-                for comment in comments:
-                    comment.extract()
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)
-                rem = appendtag.find(id='gal_navi')
-                if rem:
-                    rem.extract()
-
-    def preprocess_html(self, soup):
-        if soup.find(attrs={'class': 'piano_btn_1'}):
-            return None
-        else:
-            self.append_page(soup, soup.body)
-            if soup.find(id='container_gal'):
-                self.gallery_article(soup.body)
-            return soup
-
     def print_version(self, url):
-        if url.count('rss.feedsportal.com'):
-            u = url.find('wyborcza0Bpl')
-            u = 'http://www.wyborcza.pl/' + url[u + 11:]
+        if 'feedsportal.com' in url:
+            s = url.rpartition('wyborcza0Bpl')
+            u = s[2]
+            if not s[0]:
+                u = url.rpartition('gazeta0Bpl')[2]
+            u = u.replace('/l/', '/')
+            u = u.replace('/ia1.htm', '')
+            u = u.replace('/story01.htm', '')
             u = u.replace('0C', '/')
             u = u.replace('A', '')
             u = u.replace('0E', '-')
             u = u.replace('0H', ',')
             u = u.replace('0I', '_')
             u = u.replace('0B', '.')
-            u = u.replace('/1,', '/2029020,')
-            u = u.replace('/story01.htm', '')
-            print(u)
+            u = self.INDEX + u
             return u
-        elif 'http://wyborcza.pl/1' in url:
-            return url.replace('http://wyborcza.pl/1', 'http://wyborcza.pl/2029020')
         else:
-            return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020')
+            return url
+
+    def preprocess_html(self, soup):
+        tag = soup.find(id='Str')
+        if soup.find(attrs={'class': 'piano_btn_1'}):
+            return None
+        elif tag and tag.findAll('a'):
+            self.append_page(soup, soup.body)
+        return soup
+
+    def append_page(self, soup, appendtag):
+        tag = soup.find('div', attrs={'id': 'Str'})
+        try:
+            baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content']
+        except:
+            return 1
+        link = tag.findAll('a')[-1]
+        while link:
+            soup2 = self.index_to_soup(baseurl + link['href'])
+            link = soup2.find('div', attrs={'id': 'Str'}).findAll('a')[-1]
+            if not u'następne' in link.string:
+                link = ''
+            pagetext = soup2.find(id='artykul')
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
+            pos = len(appendtag.contents)
+            appendtag.insert(pos, pagetext)
+        tag.extract()

     def get_cover_url(self):
         soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html')
@@ -127,6 +104,9 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
         self.cover_url = 'http://wyborcza.pl' + soup.img['src']
         return getattr(self, 'cover_url', self.cover_url)

-    '''def image_url_processor(self, baseurl, url):
-        print "@@@@@@@@", url
-        return url.replace('http://wyborcza.pl/ ', '')'''
+    def image_url_processor(self, baseurl, url):
+        if url.startswith(' '):
+            return url.strip()
+        else:
+            return url

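Side note on the rpartition fallback above (a sketch, not part of the diff): str.rpartition returns an empty head when its marker is absent, which is exactly what the `if not s[0]:` check uses to fall back to the other domain marker. The URL below is a made-up feedsportal-style example for illustration only.

    # Hypothetical encoded URL, illustrating the fallback in print_version above.
    url = 'http://rss.feedsportal.com/c/32739/f/530266/wyborcza0Bpl0C10H75398,171234560Bhtml/story01.htm'
    s = url.rpartition('gazeta0Bpl')
    print(s[0] == '')      # True: 'gazeta0Bpl' is absent, so the whole URL sits in s[2]
    u = url.rpartition('wyborcza0Bpl')[2]
    print(u)               # '0C10H75398,171234560Bhtml/story01.htm'
    # print_version then strips '/story01.htm', decodes the 0X escapes and prefixes self.INDEX.
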
@@ -13,7 +13,7 @@ class GN(BasicNewsRecipe):

     __author__ = 'Piotr Kontek, Tomasz Długosz'
     title = u'Gość Niedzielny'
-    description = 'Ogólnopolski tygodnik katolicki'
+    description = 'Ogólnopolski tygodnik katolicki - fragmenty artykułów z poprzedniego numeru'
     encoding = 'utf-8'
     no_stylesheets = True
     language = 'pl'
@@ -33,7 +33,7 @@ class GN(BasicNewsRecipe):
         a = soup.find('div',attrs={'class':'release-wp-b'}).find('a')
         articles = [
             {'title' : self.tag_to_string(a),
-             'url' : 'http://www.gosc.pl' + a['href'].replace('/doc/','/doc_pr/')
+             'url' : 'http://www.gosc.pl' + a['href']
             }]
         feeds.append((u'Wstępniak',articles))
         #kategorie
@@ -71,12 +71,11 @@ class GN(BasicNewsRecipe):

     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
-        '''
-        for image_div in soup.findAll(attrs={'class':'doc_image'}):
-            link =
-            if 'm.jpg' in image['src']:
-                image['src'] = image['src'].replace('m.jpg', '.jpg')
-        '''
+        return soup
+
+    def postprocess_html(self, soup, first_fetch):
+        for r in soup.findAll(attrs={'class':'pgr'}):
+            r.extract()
         return soup

     keep_only_tags = [
@@ -85,12 +84,14 @@ class GN(BasicNewsRecipe):

     remove_tags = [
         dict(name='p', attrs={'class':['r tr', 'l l-2', 'wykop']}),
-        dict(name='div', attrs={'class':['doc_actions', 'pgr', 'fr1_cl']}),
-        dict(name='div', attrs={'id':'vote'})
+        dict(name='div', attrs={'class':['doc_actions', 'cf', 'fr1_cl']}),
+        dict(name='div', attrs={'id':'vote'}),
+        dict(name='a', attrs={'class':'img_enlarge'})
     ]

     extra_css = '''
         h1 {font-size:150%}
-        div#doc_image {font-style:italic; font-size:70%}
         p.limiter {font-size:150%; font-weight: bold}
+        span.cm-i-a {text-transform:uppercase;}
+        span.cm-i-p {font-style:italic; font-size:70%}
         '''

recipes/icons/alejakomiksu_com.png
Normal file
After Width: | Height: | Size: 575 B |
BIN
recipes/icons/linuxportal_pl.png
Normal file
After Width: | Height: | Size: 1.4 KiB |
BIN
recipes/icons/picoboard_pl.png
Normal file
After Width: | Height: | Size: 469 B |
BIN
recipes/icons/polter_pl.png
Normal file
After Width: | Height: | Size: 766 B |
Before Width: | Height: | Size: 850 B After Width: | Height: | Size: 476 B |
BIN
recipes/icons/sekurak_pl.png
Normal file
After Width: | Height: | Size: 956 B |
BIN
recipes/icons/tawernarpg_pl.png
Normal file
After Width: | Height: | Size: 1.1 KiB |
@@ -4,10 +4,9 @@ from calibre.web.feeds.recipes import BasicNewsRecipe

 class IHNed(BasicNewsRecipe):
-
     stahnout_vsechny = True
-    #True = stahuje vsechny z homepage
-    #False = stahuje pouze dnesni clanky (ze dne, kdy je skript spusten)
+    # True = stahuje vsechny z homepage
+    # False = stahuje pouze dnesni clanky (ze dne, kdy je skript spusten)

     title = 'iHNed'
     __author__ = 'Karel Bílek'
@@ -28,38 +27,33 @@ class IHNed(BasicNewsRecipe):
         'linearize_tables' : True,
     }

-
-
     def preprocess_html(self, soup):

         def makeurl(wat):
-            return "http://ihned.cz"+wat;
+            return "http://ihned.cz"+wat

         for h1 in soup.findAll('h1'):
             a = h1.find('a')
             if a:
                 string = a.string
                 if string:
                     soup.a.replaceWith(string)
         for a in soup.findAll('a', href=True) :
             cil = str(a['href'])
             if cil.startswith("/") or cil.startswith("index"):
                 a['href'] = makeurl(cil)
         return soup

-
     def parse_index(self):

         def makeurl(wat):
             if wat.startswith("/") or wat.startswith("index"):
-                return "http://ihned.cz"+wat;
+                return "http://ihned.cz"+wat
             else:
                 return wat

-        articles = {} #vysledek, asi
-        key = None #soucasna sekce
-        ans = [] #vsechny sekce
+        articles = {} # vysledek, asi
+        ans = [] # vsechny sekce

         articles["Hlavní"] = []
         ans.append("Hlavní")
@@ -70,12 +64,11 @@ class IHNed(BasicNewsRecipe):
             articles[name] = []
             ans.append(name)

             soup = self.index_to_soup(url)
             otvirak = soup.find(True, attrs={'class':['otv']})
             if otvirak:
-                #the code is copypasted here because I don't know python. simple as that.
+                # the code is copypasted here because I don't know python. simple as that.
                 a = otvirak.find('a', href=True)
                 title = self.tag_to_string(a, use_alt=True).strip()
                 txt = otvirak.find(True, attrs={'class':['txt']})
@@ -98,13 +91,13 @@ class IHNed(BasicNewsRecipe):
                 a = ow.find('a', href=True)
                 title = self.tag_to_string(a, use_alt=True).strip()
                 description=''
-                prx = ow.find(True, attrs={'class':['prx']});
+                prx = ow.find(True, attrs={'class':['prx']})
                 if prx:
                     description = str(prx.string)
-                nfo = ow.find(True, attrs={'class':['nfo']});
+                nfo = ow.find(True, attrs={'class':['nfo']})
                 pubdate = ''
                 if nfo:
-                    dtime = time.localtime();
+                    dtime = time.localtime()
                     day = dtime[2]
                     month = dtime[1]

@@ -119,11 +112,6 @@ class IHNed(BasicNewsRecipe):
                     description=description,
                     content=''))

-
-
-
-
-
         soup = self.index_to_soup('http://ihned.cz/')
         otvirak = soup.find(True, attrs={'class':['otv']})
         if otvirak:
@@ -150,7 +138,7 @@ class IHNed(BasicNewsRecipe):
             a = otv2.find('a', attrs={'class':['tit2']}, href=True)
             title = self.tag_to_string(a, use_alt=True).strip()
             description=''
-            span = otv2.find('span');
+            span = otv2.find('span')
             if span:
                 match = re.match(r'<span>\s*([^<]*)\s*<a', str(span), re.L)
                 if match:
@@ -163,20 +151,19 @@ class IHNed(BasicNewsRecipe):
                     content=''))
                 was[title]=1

         parse_subpage("http://komentare.ihned.cz/", "Komentáře")
         parse_subpage("http://domaci.ihned.cz", "Domácí")
         parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
-        parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí");
-        parse_subpage("http://finweb.ihned.cz/", "Finance");
-        parse_subpage("http://digiweb.ihned.cz/", "DigiWeb");
+        parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí")
+        parse_subpage("http://finweb.ihned.cz/", "Finance")
+        parse_subpage("http://digiweb.ihned.cz/", "DigiWeb")
         parse_subpage("http://kultura.ihned.cz/", "Kultura")
-        parse_subpage("http://sport.ihned.cz/", "Sport");
+        parse_subpage("http://sport.ihned.cz/", "Sport")

-        #seradi kategorie
+        # seradi kategorie
         ans = self.sort_index_by(ans, {'Hlavni':1, 'Domácí':2, 'Ekonomika':5, 'Zahraničí':3, 'Finance':6, 'DigiWeb':7, 'Kultura':8, 'Sport':9, 'Komentáře':4})

-        #vrati, ale pouze, kdyz je v kategoriich...
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        # vrati, ale pouze, kdyz je v kategoriich...
+        ans = [(key, articles[key]) for key in ans if key in articles]
         return ans

recipes/linuxportal_pl.recipe  (new file, 62 lines)
@@ -0,0 +1,62 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe


class LinuxPortal(BasicNewsRecipe):
    title = u'LinuxPortal'
    __author__ = 'fenuks'
    description = u'Na LinuxPortal.pl znajdziesz wiadomości o systemie Linux, open source oraz Androidzie.'
    category = 'it'
    #publication_type = ''
    language = 'pl'
    #encoding = ''
    #extra_css = ''
    cover_url = 'http://www.linuxportal.pl/templates/css/loga/Linuxportal.gif'
    masthead_url = 'http://www.linuxportal.pl/templates/css/loga/Linuxportal.gif'
    use_embedded_content = False
    oldest_article = 7
    max_articles_per_feed = 20
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}
    auto_cleanup = True
    #keep_only_tags = [dict()]
    #remove_tags = [dict()]
    #remove_tags_after = dict()
    #remove_tags_before = dict()

    def parse_index(self):
        feeds = []
        feeds.append((u'Wszystkie wiadomości', self.get_articles('http://www.linuxportal.pl/news/wszystkie')))
        return feeds

    def get_articles(self, url):
        articles = []
        blacklist = {'dobreprogramy.pl', 'osworld.pl', 'osnews.pl',}
        nexturl = url
        counter = 0
        skip = False
        while counter < self.max_articles_per_feed:
            soup = self.index_to_soup(nexturl)
            nexturl = soup.find(attrs={'title':'Starsze wyniki'})['href']
            for tag in soup.findAll(attrs={'class':'lista_wizyt_kol_tytul_news'}):
                title = tag.h2.a.string
                url = tag.find(attrs={'class':'linkzrodlo'})['href']
                date = ''
                for item in blacklist:
                    if item in url:
                        counter -= 1
                        skip = True
                        break
                if skip:
                    skip = False
                    continue

                articles.append({'title' : title,
                                 'url' : url,
                                 'date' : date,
                                 'description' : ''
                                 })
                counter += 1
        return articles

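Side note on the blacklist check in get_articles above (a sketch, not part of the recipe): items whose source link points at one of the listed external sites are skipped via an explicit loop and a skip flag. An equivalent, more compact membership test looks like this; the sample URL is hypothetical.

    # Sketch only: same blacklist test as in get_articles above, expressed with any().
    blacklist = {'dobreprogramy.pl', 'osworld.pl', 'osnews.pl'}
    url = 'http://osnews.pl/przykladowy-artykul'   # hypothetical article URL
    if any(domain in url for domain in blacklist):
        print('external source, skipped')
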
recipes/picoboard_pl.recipe  (new file, 33 lines)
@@ -0,0 +1,33 @@
from calibre.web.feeds.news import BasicNewsRecipe

class Pikoboard(BasicNewsRecipe):
    title = u'Pikoboard.pl'
    __author__ = 'fenuks'
    description = u'Portal poświęcony takim urządzeniom jak: Raspberry Pi, XBMC, ODROID-X, BeagleBoard czy CuBox. Systemy operacyjne, modyfikacje oraz obudowy i innego rodzaju dodatki.'
    category = 'IT, open source, Linux, Raspberry Pi'
    language = 'pl'
    cover_url = 'http://picoboard.pl/wp-content/themes/portal/img/logo.jpg'
    extra_css = 'img.alignleft {float: left; margin-right: 5px;}'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    use_embedded_content = False
    keep_only_tags = [dict(id=['dzial', 'posts'])]
    remove_tags = [dict(attrs={'class':'post-comments'})]
    remove_tags_after = dict(attrs={'class':'entry clr'})
    feeds = [(u'Newsy', u'http://picoboard.pl/feed/atom/'), (u'Artyku\u0142y', u'http://picoboard.pl/category/artykuly/feed/')]

    def append_page(self, soup, appendtag):
        tag = appendtag.find(attrs={'id':'paginacja'})
        if tag:
            for nexturl in tag.findAll('a'):
                soup2 = self.index_to_soup(nexturl['href'])
                pagetext = soup2.find(attrs={'class':'entry clr'})
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
            for r in appendtag.findAll(attrs={'id':'paginacja'}):
                r.extract()

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup

recipes/polter_pl.recipe  (new file, 43 lines)
@@ -0,0 +1,43 @@
__license__ = 'GPL v3'
import re
from calibre.web.feeds.news import BasicNewsRecipe


class Poltergeist(BasicNewsRecipe):
    title = u'Poltergeist'
    __author__ = 'fenuks'
    description = u'Największy polski serwis poświęcony ogólno pojętej fantastyce - grom fabularnym (RPG), książkom, filmowi, komiksowi, grom planszowym, karcianym i bitewnym.'
    category = 'fantasy, books, rpg, games'
    #publication_type = ''
    language = 'pl'
    #encoding = ''
    extra_css = '.image, .floatright {float: right; margin-left: 10px;} .floatleft {float: left; margin-right: 10px;}'
    cover_url = 'http://static.polter.pl/sub/promo/bpromo2524.jpg'
    #masthead_url = ''
    use_embedded_content = False
    oldest_article = 7
    preprocess_regexps = [(re.compile(ur'<div[^>]*?id="pol_lista"[^>]*?>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'<a[^>]*?>wersja do druku</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}

    keep_only_tags = [dict(attrs={'class':'boxcontent'})]
    remove_tags = [dict(attrs={'class':'fb-like'}), dict(attrs={'alt':'Wersja do druku'}), dict(id='pol_liczba'), dict(attrs={'scr':'http://static.polter.pl/tplimg/buttons/ceneo_140_40.gif'})]
    remove_tags_after = dict(attrs={'class':'fb-like'})
    #remove_tags_before = dict()

    feeds = [(u'Wieści', 'http://polter.pl/wiesci,rss.html'), (u'RPG', 'http://rpg.polter.pl/wiesci,rss.html'), (u'Książki', 'http://ksiazki.polter.pl/wiesci,rss.html'), (u'Film', 'http://film.polter.pl/wiesci,rss.html'), (u'Komiks', 'http://komiks.polter.pl/wiesci,rss.html'), (u'Gry bitewne', 'http://bitewniaki.polter.pl/wiesci,rss.html'), (u'Gry karciane', 'http://karcianki.polter.pl/wiesci,rss.html'), (u'Gry planszowe', 'http://planszowki.polter.pl/wiesci,rss.html'), (u'Gry PC', 'http://gry.polter.pl/wiesci,rss.html'), (u'Gry konsolowe', 'http://konsole.polter.pl/wiesci,rss.html'), (u'Konwenty', 'http://konwenty.polter.pl/wiesci,rss.html'), (u'Blogi', 'http://polter.pl/blogi,rss.html')]

    def preprocess_html(self, soup):
        for s in soup.findAll(attrs={'style':re.compile('float: ?left')}):
            s['class'] = 'floatleft'
        for s in soup.findAll(attrs={'style':re.compile('float: ?right')}):
            s['class'] = 'floatright'
        tag = soup.find(id='twoja_ocena')
        if tag:
            tag.parent.extract()
        for tag in soup.findAll(id='lista_chce_ile'):
            tag.parent.parent.extract()
        return soup

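Side note on preprocess_html above (a sketch, not part of the recipe): because remove_attributes later drops every style attribute, the inline float styles are first rewritten into the .floatleft/.floatright classes that extra_css defines. A minimal version of that rewrite on an inline snippet, assuming the BeautifulSoup wrapper bundled with calibre:

    # Sketch only: convert an inline float style into a class handled by extra_css,
    # mirroring the loops in preprocess_html above.
    import re
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<img style="float:left" src="okladka.jpg"/>')
    for el in soup.findAll(attrs={'style': re.compile('float: ?left')}):
        el['class'] = 'floatleft'
    print(soup)   # the img now carries class="floatleft"; remove_attributes strips style later
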
recipes/sekurak_pl.recipe  (new file, 28 lines)
@@ -0,0 +1,28 @@
from calibre.web.feeds.news import BasicNewsRecipe


class Sekurak(BasicNewsRecipe):
    title = u'Sekurak'
    __author__ = 'fenuks'
    description = u'Wiadomości z dziedziny bezpieczeństwa'
    category = 'it, security'
    #publication_type = ''
    language = 'pl'
    #encoding = ''
    #extra_css = ''
    cover_url = 'http://www.securitum.pl/aktualnosci/sekurak.pl/image'
    masthead_url = ''
    use_embedded_content = False
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}

    keep_only_tags = [dict(id='articleContent')]
    #remove_tags = []
    #remove_tags_after = dict()
    #remove_tags_before = dict()

    feeds = [(u'Wpisy', u'http://feeds.feedburner.com/sekurak')]

recipes/tawernarpg_pl.recipe  (new file, 38 lines)
@@ -0,0 +1,38 @@
__license__ = 'GPL v3'
import re
from calibre.web.feeds.news import BasicNewsRecipe


class TawernaRPG(BasicNewsRecipe):
    title = u'Tawerna RPG'
    __author__ = 'fenuks'
    description = u'Tawerna RPG to ogólnopolski serwis zajmujący się fantastyką i grami fantastycznymi. Znajdziesz u nas zarówno gry fabularne, karciane, planszowe i komputerowe, a także recenzje, opowiadania i sporą dawkę humoru.'
    category = 'fantasy, rpg, board games'
    #publication_type = ''
    language = 'pl'
    #encoding = ''
    extra_css = '.slajd {list-style-type: none; padding-left: 0px; margin-left: 0px;} .lewanc {float: left; margin-right: 5px;} .srodek {display: block; margin-left: auto; margin-right: auto;}'
    cover_url = 'http://www.tawerna.rpg.pl/img/logo.png'
    #masthead_url = ''
    preprocess_regexps = [(re.compile(ur'<h2>Dodaj komentarz</h2>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
    use_embedded_content = False
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}

    keep_only_tags = [dict(id='site')]
    remove_tags = [dict(id=['player', 'komentarz'])]
    remove_tags_after = dict(id='komentarz')
    #remove_tags_before = dict()

    feeds = [(u'Artykuły', 'http://www.tawerna.rpg.pl/css/rss.rss')]

    def preprocess_html(self, soup):
        for r in soup.findAll(attrs={'class':'powi'}):
            r.parent.extract()
        for c in soup.findAll(name=['li', 'ol', 'ul']):
            c.name = 'div'
        return soup