mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00

new recipes

This commit is contained in:
parent d7a5118c42
commit 4c8deb0d5f
102  recipes/odkrywcy_pl.recipe  Normal file
@@ -0,0 +1,102 @@
__license__ = 'GPL v3'
import re
import datetime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment


class Odkrywcy(BasicNewsRecipe):
    title = u'Odkrywcy.pl'
    __author__ = 'fenuks'
    description = u''
    # publication_type = ''
    language = 'pl'
    # encoding = ''
    extra_css = 'img {display: block;}'
    cover_url = ''
    # masthead_url = ''
    INDEX = 'http://odkrywcy.pl'
    use_embedded_content = False
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}

    keep_only_tags = [dict(attrs={'class': 'content'})]
    remove_tags = [dict(name='a', attrs={'href': ['#opOpinie', '#opinie']}),
                   dict(attrs={'class': ['fr', 'clra', 'close', 'wpsocial-fbFanpageBox', 'tagi', 'test']}),
                   dict(id=['rekSrd05', 'moreTopNews']),
                   dict(name='img', attrs={'class': 'zr'}),
                   dict(name='img', attrs={'alt': u'Następne'})]
    remove_tags_after = dict(id='aTxt')
    # remove_tags_before = dict()
    feeds = [(u'', '')]  # placeholder; the real feed list is built in parse_index()

    def find_articles(self, url):
        # Collect articles from a category listing page, skipping entries older
        # than oldest_article days; the listing shows "dodano: YYYY-MM-DD".
        articles = []
        soup = self.index_to_soup(url)
        for i in soup.findAll(attrs={'class': 'katZj clra'}):
            tmp = i.find('small')
            datestring = re.search(r'dodano: (\d{4}-\d{2}-\d{2})', tmp.string).group(1)
            d = datetime.datetime.strptime(datestring, '%Y-%m-%d').date()
            if (datetime.datetime.now().date() - d).days > self.oldest_article:
                continue
            tmp = i.find('a')
            title = tmp.string
            url = self.INDEX + tmp['href']
            articles.append({'title': title,
                             'url': url,
                             'date': '',
                             'description': ''})
        return articles

    def parse_index(self):
        feeds = []
        feeds.append((u'Człowiek', self.find_articles('http://odkrywcy.pl/kat,111396,name,Czlowiek,kategoria.html')))
        feeds.append((u'Technologie', self.find_articles('http://odkrywcy.pl/kat,111398,name,Technologie,kategoria.html')))
        feeds.append((u'Ekologia', self.find_articles('http://odkrywcy.pl/kat,111400,name,Ekologia,kategoria.html')))
        feeds.append((u'Kosmos', self.find_articles('http://odkrywcy.pl/kat,111402,name,Kosmos,kategoria.html')))
        feeds.append((u'Cywilizacja', self.find_articles('http://odkrywcy.pl/kat,111404,name,Cywilizacja,kategoria.html')))
        feeds.append((u'Przyroda', self.find_articles('http://odkrywcy.pl/kat,111406,name,Przyroda,kategoria.html')))
        feeds.append((u'Fizyka i chemia', self.find_articles('http://odkrywcy.pl/kat,111408,name,Fizyka,kategoria.html')))
        feeds.append((u'Historia', self.find_articles('http://odkrywcy.pl/kat,122994,name,Historia,kategoria.html')))
        feeds.append((u'Media', self.find_articles('http://odkrywcy.pl/kat,116794,name,Media,media.html')))
        return feeds

    def append_page(self, soup, appendtag):
        # Follow the "next page" links of a multi-page article, append every
        # page's content to the first page, then drop pager leftovers.
        tag = soup.find('a', attrs={'class': 'btnNext'})
        urls = []
        while tag is not None:
            if tag['href'] in urls:
                break
            urls.append(tag['href'])
            soup2 = self.index_to_soup(self.INDEX + tag['href'])
            tag = soup2.find(name='a', attrs={'class': 'btnNext'})
            pagetext = soup2.findAll(attrs={'class': 'content'})
            for container in pagetext:
                header = container.find(name='h1')
                if header:
                    header.extract()
                for comment in container.findAll(text=lambda text: isinstance(text, Comment)):
                    comment.extract()
            for container in pagetext:
                pos = len(appendtag.contents)
                appendtag.insert(pos, container)
        for r in appendtag.findAll(attrs={'class': 'galStr'}):
            r.extract()
        for r in appendtag.findAll(attrs={'alt': u'Następne'}):
            r.extract()
        for r in appendtag.findAll(attrs={'alt': u'Poprzednie'}):
            r.extract()
        for r in appendtag.findAll(attrs={'class': 'clra'}):
            r.extract()
        for r in appendtag.findAll(attrs={'class': 'close'}):
            r.extract()
        for r in appendtag.findAll(attrs={'class': 'tagi'}):
            r.extract()
        for r in appendtag.findAll(attrs={'id': 'moreTopNews'}):
            r.extract()

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup
27  recipes/znadplanszy_pl.recipe  Normal file
@@ -0,0 +1,27 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe


class ZnadPlanszy(BasicNewsRecipe):
    title = u'ZnadPlanszy.pl'
    __author__ = 'fenuks'
    description = u''
    # publication_type = ''
    language = 'pl'
    # encoding = ''
    # extra_css = ''
    cover_url = 'http://znadplanszy.pl/wp-content/uploads/2013/05/logo-znadplanszy.png'
    # masthead_url = ''
    use_embedded_content = False
    oldest_article = 14
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}

    # keep_only_tags = [dict()]
    remove_tags = [dict(attrs={'class': 'rounded-container'})]
    remove_tags_after = dict(attrs={'id': 'dotEPUBcontent'})
    remove_tags_before = dict(attrs={'class': 'content units nine alpha'})
    feeds = [(u'Wszystkie', 'http://znadplanszy.pl/full-feed/posts/')]
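
For reference, a minimal standalone sketch of the date cutoff that find_articles() in odkrywcy_pl.recipe applies to listing entries. It uses only the standard library; the sample string is hypothetical, standing in for the "dodano: YYYY-MM-DD" text the recipe reads from each listing's <small> tag.

import re
import datetime

oldest_article = 7  # days, matching the recipe's setting
sample = u'dodano: 2013-06-01'  # hypothetical listing snippet

match = re.search(r'dodano: (\d{4}-\d{2}-\d{2})', sample)
if match is not None:
    published = datetime.datetime.strptime(match.group(1), '%Y-%m-%d').date()
    age = (datetime.date.today() - published).days
    # Entries older than oldest_article days are skipped, like the
    # `continue` branch in find_articles().
    print('skip' if age > oldest_article else 'keep', age, 'days old')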