Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-07 18:24:30 -04:00

commit 2b682e5d94
Merge branch 'master' of https://github.com/t3d/calibre
@@ -1,28 +0,0 @@
#!/usr/bin/env python2

__license__ = 'GPL v3'
__author__ = 'teepel <teepel44@gmail.com>'

'''
dzialzagraniczny.pl
'''

from calibre.web.feeds.news import BasicNewsRecipe


class dzial_zagraniczny(BasicNewsRecipe):
    title = u'Dział Zagraniczny'
    __author__ = 'teepel <teepel44@gmail.com>'
    language = 'pl'
    description = u'Polskiego czytelnika to nie interesuje'
    INDEX = 'http://dzialzagraniczny.pl'
    extra_css = 'img {display: block;}'
    oldest_article = 7
    cover_url = 'https://fbcdn-profile-a.akamaihd.net/hprofile-ak-prn1/c145.5.160.160/559442_415653975115959_2126205128_n.jpg'
    max_articles_per_feed = 100
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = True

    feeds = [(u'Dział zagraniczny', u'http://feeds.feedburner.com/dyndns/UOfz')]
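A recipe this small is pure configuration: use_embedded_content = True tells calibre to build each article from the feed entry itself, so no scraping hooks are needed. For a quick smoke test outside a full calibre run, the source can be compiled into its class; a minimal sketch, assuming calibre is importable and using an illustrative file name:

from calibre.web.feeds.recipes import compile_recipe

with open('dzial_zagraniczny.recipe') as f:
    recipe_class = compile_recipe(f.read())  # returns the class defined above
print(recipe_class.title, recipe_class.language)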
@@ -1,32 +0,0 @@
from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1325420346(BasicNewsRecipe):
    title = u'Homopedia'
    __author__ = 'rainbowwarrior'
    language = 'pl'
    oldest_article = 7
    max_articles_per_feed = 100
    publication_type = 'newspaper'
    masthead_url = 'http://a5.sphotos.ak.fbcdn.net/hphotos-ak-snc6/67335_168352243178437_166186720061656_594975_5800720_n.jpg'
    encoding = 'utf-8'

    def get_cover_url(self):
        return 'http://a7.sphotos.ak.fbcdn.net/hphotos-ak-snc4/65568_166186970061631_166186720061656_580324_7584264_n.jpg'

    feeds = [
        (u'Nowe has\u0142a', u'http://www.homopedia.pl/w/index.php?title=Specjalna:Nowe_strony&feed=atom&hideliu=&hidepatrolled=&hidebots=&hideredirs=1&limit=50&namespace=0'),  # noqa

        (u'Blog', u'http://blog.homopedia.pl/feeds/posts/default')]

    def get_article_url(self, article):
        artl = article.get('link', None)
        rest, sep, article_id = artl.rpartition('/')
        return 'http://www.homopedia.pl/w/index.php?redirect=no&printable=yes&title=' + article_id

    remove_tags = [dict(name='div', attrs={'class': 'noprint'}), dict(name='ul', attrs={'class': 'noprint'}), dict(name='ul', attrs={'id': 'footer-places'}), dict(name='li', attrs={'id': 'footer-info-viewcount'}), dict(name='span', attrs={'class': 'editsection'}), dict(name='div', attrs={'id': 'jump-to-nav'})]  # noqa

    remove_tags_before = dict(name='h2', attrs={'class': 'post-title'})
    remove_tags_after = dict(name='a', attrs={'class': 'timestamp-link'})

    extra_css = 'p{text-indent:1.5em!important;padding:0!important;margin:0!important}'
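The get_article_url() override above uses str.rpartition to peel the page title off the feed link and rebuild a printable MediaWiki URL. A standalone illustration (the input URL is made up):

url = 'http://www.homopedia.pl/wiki/Przykladowe_haslo'
rest, sep, article_id = url.rpartition('/')
# rest = 'http://www.homopedia.pl/wiki', sep = '/', article_id = 'Przykladowe_haslo'
print('http://www.homopedia.pl/w/index.php?redirect=no&printable=yes&title=' + article_id)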
Binary file not shown. Before: 438 B
Binary file not shown. Before: 314 B
Binary file not shown. Before: 739 B
Binary file not shown. Before: 542 B
@@ -1,111 +0,0 @@
__license__ = 'GPL v3'
import re
import datetime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment


class Odkrywcy(BasicNewsRecipe):
    title = u'Odkrywcy.pl'
    __author__ = 'fenuks'
    description = u''
    language = 'pl'
    extra_css = 'img {display: block;}'
    cover_url = ''
    INDEX = 'http://odkrywcy.pl'
    use_embedded_content = False
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}

    keep_only_tags = [dict(attrs={'class': 'content'})]
    remove_tags = [
        dict(name='a', attrs={'href': ['#opOpinie', '#opinie']}), dict(attrs={'class': ['fr', 'clra', 'close', 'wpsocial-fbFanpageBox', 'tagi', 'test']}),
        dict(id=['rekSrd05', 'moreTopNews']), dict(name='img', attrs={'class': 'zr'}), dict(name='img', attrs={'alt': u'Następne'})]
    remove_tags_after = dict(id='aTxt')
    feeds = [(u'', '')]

    def find_articles(self, url):
        articles = []
        soup = self.index_to_soup(url)
        for i in soup.findAll(attrs={'class': 'katZj clra'}):
            tmp = i.find('small')
            datestring = re.search(
                r'dodano: (\d{4}-\d{2}-\d{2})', tmp.string).group(1)
            d = datetime.datetime.strptime(datestring, "%Y-%m-%d").date()
            if (datetime.datetime.now().date() - d).days > self.oldest_article:
                continue
            tmp = i.find('a')
            title = tmp.string
            url = self.INDEX + tmp['href']
            articles.append({'title': title,
                             'url': url,
                             'date': '',
                             'description': ''
                             })
        return articles

    def parse_index(self):
        feeds = []
        feeds.append((u'Człowiek', self.find_articles(
            'http://odkrywcy.pl/kat,111396,name,Czlowiek,kategoria.html')))
        feeds.append((u'Technologie', self.find_articles(
            'http://odkrywcy.pl/kat,111398,name,Technologie,kategoria.html')))
        feeds.append((u'Ekologia', self.find_articles(
            'http://odkrywcy.pl/kat,111400,name,Ekologia,kategoria.html')))
        feeds.append((u'Kosmos', self.find_articles(
            'http://odkrywcy.pl/kat,111402,name,Kosmos,kategoria.html')))
        feeds.append((u'Cywilizacja', self.find_articles(
            'http://odkrywcy.pl/kat,111404,name,Cywilizacja,kategoria.html')))
        feeds.append((u'Przyroda', self.find_articles(
            'http://odkrywcy.pl/kat,111406,name,Przyroda,kategoria.html')))
        feeds.append((u'Fizyka i chemia', self.find_articles(
            'http://odkrywcy.pl/kat,111408,name,Fizyka,kategoria.html')))
        feeds.append((u'Historia', self.find_articles(
            'http://odkrywcy.pl/kat,122994,name,Historia,kategoria.html')))
        feeds.append((u'Media', self.find_articles(
            'http://odkrywcy.pl/kat,116794,name,Media,media.html')))

        return feeds

    def append_page(self, soup, appendtag):
        tag = soup.find('a', attrs={'class': 'btnNext'})
        urls = []
        while tag is not None:
            if tag['href'] in urls:
                break
            urls.append(tag['href'])
            soup2 = self.index_to_soup(self.INDEX + tag['href'])
            tag = soup2.find(name='a', attrs={'class': 'btnNext'})
            pagetext = soup2.findAll(attrs={'class': 'content'})
            for container in pagetext:
                header = container.find(name='h1')
                if header:
                    header.extract()
                for comment in container.findAll(text=lambda text: isinstance(text, Comment)):
                    comment.extract()
            for container in pagetext:
                pos = len(appendtag.contents)
                appendtag.insert(pos, container)
        for r in appendtag.findAll(attrs={'class': 'galStr'}):
            r.extract()
        for r in appendtag.findAll(attrs={'alt': 'Następne'}):
            r.extract()
        for r in appendtag.findAll(attrs={'alt': 'Poprzednie'}):
            r.extract()
        for r in appendtag.findAll(attrs={'class': 'clra'}):
            r.extract()
        for r in appendtag.findAll(attrs={'class': 'close'}):
            r.extract()
        for r in appendtag.findAll(attrs={'class': 'tagi'}):
            r.extract()
        for r in appendtag.findAll(attrs={'id': 'moreTopNews'}):
            r.extract()

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup
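find_articles() above filters category listings by the 'dodano: YYYY-MM-DD' stamp in the markup rather than by feed metadata. The cutoff logic in isolation (the sample string is invented):

import re
import datetime

oldest_article = 7  # days, same meaning as the recipe attribute
sample = 'dodano: 2013-01-15'
datestring = re.search(r'dodano: (\d{4}-\d{2}-\d{2})', sample).group(1)
d = datetime.datetime.strptime(datestring, '%Y-%m-%d').date()
too_old = (datetime.datetime.now().date() - d).days > oldest_article

append_page() then follows the btnNext links to stitch paginated articles together, keeping a list of visited hrefs so a circular 'next' link terminates the loop instead of fetching pages forever.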
@@ -1,29 +0,0 @@
#!/usr/bin/env python2

__license__ = 'GPL v3'
__copyright__ = u'2012, Tomasz Dlugosz <tomek3d@gmail.com>'
'''
rybinski.eu
'''

from calibre.web.feeds.news import BasicNewsRecipe


class Rybinski(BasicNewsRecipe):
    title = u'Rybinski.eu - economy of the XXI century'
    description = u'Blog ekonomiczny dra hab. Krzysztofa Rybi\u0144skiego'
    language = 'pl'
    __author__ = u'Tomasz D\u0142ugosz'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True

    feeds = [(u'wpisy', u'http://www.rybinski.eu/?feed=rss2&lang=pl')]

    keep_only_tags = [dict(name='div', attrs={'class': 'post'})]

    remove_tags = [
        dict(name='div', attrs={'class': 'post-meta-1'}),
        dict(name='div', attrs={'class': 'post-meta-2'}),
        dict(name='div', attrs={'class': 'post-comments'})
    ]
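The keep/remove pattern above is the core of most of these recipes: keep_only_tags reduces the page to div.post, then remove_tags prunes metadata and comments inside it. A rough standalone equivalent, assuming bs4 is available and using made-up markup:

from bs4 import BeautifulSoup

html = ('<div class="post"><h2>Wpis</h2><div class="post-meta-1">data</div>'
        '<p>tekst</p><div class="post-comments">komentarze</div></div>')
soup = BeautifulSoup(html, 'html.parser')
post = soup.find('div', attrs={'class': 'post'})             # keep_only_tags
for cls in ('post-meta-1', 'post-meta-2', 'post-comments'):
    for tag in post.find_all('div', attrs={'class': cls}):   # remove_tags
        tag.decompose()
print(post)  # -> <div class="post"><h2>Wpis</h2><p>tekst</p></div>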
@@ -5,7 +5,6 @@ class RzeczpospolitaRecipe(BasicNewsRecipe):
     __license__ = 'GPL v3'
     __author__ = u'kwetal, Tomasz Dlugosz, adrianf0'
     language = 'pl'
-    version = 2

     title = u'Rzeczpospolita OnLine'
     publisher = u'Presspublica Sp.'
@@ -25,28 +24,15 @@ class RzeczpospolitaRecipe(BasicNewsRecipe):
     feeds.append((u"Prawo", u'http://www.rp.pl/rss/1037'))  # Prawo

     keep_only_tags = []
-    keep_only_tags.append(dict(name='div', attrs={'class': 'article-content'}))
+    keep_only_tags.append(dict(name='h1', attrs={'id': 'article-title'}))
+    keep_only_tags.append(dict(name='img', attrs={'class': 'img-responsive article__image'}))
+    keep_only_tags.append(dict(name='div', attrs={'class': ['article-content', 'article__lead js-voice-read', 'article__content js-voice-read', 'article__image-desc', 'article__image-author']}))

     remove_tags = []
-    remove_tags.append(dict(name='div', attrs={'id': 'article-copyright-box'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'article-footer'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'article-tags'}))
+    remove_tags.append(dict(name='div', attrs={'class': 'related-articles__wrapper'}))
+    remove_tags.append(dict(name='span', attrs={'class': ['article__premium-player', 'ad-label']}))

     extra_css = '''
-        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
-        h1{text-align: left;}
-        h2{font-size: medium; font-weight: bold;}
-        p.lead {font-weight: bold; text-align: left;}
-        .authordate {font-size: small; color: #696969;}
-        .fot{font-size: x-small; color: #666666;}
-    '''
+        div.article__image-desc {font-style:italic; font-size:70%; text-align:right}
+        div.article__image-author {font-size:60%; text-align:right}
+    '''

-    # def skip_ad_pages(self, soup):
-    #     if ('advertisement' in soup.find('title').string.lower()):
-    #         href = soup.find('a').get('href')
-    #         return self.index_to_soup(href, raw=True)
-    #     else:
-    #         return None
-
     def print_version(self, url):
         return url + '?template=printart'
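print_version() is kept unchanged by this commit: rp.pl serves a print-friendly layout behind a query parameter, so the recipe downloads that instead of the full page. For example (the article URL is illustrative):

url = 'http://www.rp.pl/artykul/1158169.html'
print(url + '?template=printart')
# -> http://www.rp.pl/artykul/1158169.html?template=printart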
@@ -17,18 +17,11 @@ class SATKurier(BasicNewsRecipe):
     remove_javascript = True
     no_stylesheets = True

-    keep_only_tags = []
-    keep_only_tags.append(
-        dict(name='div', attrs={'id': ['single_news', 'content']}))
+    keep_only_tags = [dict(name='div', attrs={'id': ['leftNewsContainer', 'content']})]

-    remove_tags = []
-    remove_tags.append(dict(attrs={'id': ['news_info', 'comments']}))
-    remove_tags.append(dict(attrs={'href': '#czytaj'}))
-    remove_tags.append(dict(attrs={'align': 'center'}))
-    remove_tags.append(dict(attrs={'class': [
-        'date', 'category', 'right mini-add-comment', 'socialLinks', 'commentlist']}))
+    remove_tags = [dict(name='div', attrs={'class': ['col-xs-20', 'coverNews', 'btn-group']})]

-    remove_tags_after = [(dict(id='entry'))]
+    remove_tags_after = [dict(name='div', attrs={'class': 'btn-group'})]

     feeds = [(u'Najnowsze wiadomości', u'http://feeds.feedburner.com/satkurierpl?format=xml'),
              (u'Sport w telewizji',
@@ -1,20 +0,0 @@
from calibre.web.feeds.news import BasicNewsRecipe


class SpidersWeb(BasicNewsRecipe):
    title = u"Spider's Web"
    oldest_article = 7
    __author__ = 'fenuks'
    description = u'Autorskie teksty popularnych blogerów, testy sprzętu i aplikacji, oraz wiele więcej.'
    cover_url = 'http://www.spidersweb.pl/wp-content/themes/new_sw/images/spidersweb.png'
    category = 'IT, WEB'
    language = 'pl'
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    max_articles_per_feed = 100
    keep_only_tags = [dict(id='start')]
    remove_tags_after = dict(attrs={'class': 'padding20'})
    remove_tags = [dict(name='div', attrs={
        'class': ['padding border-bottom', 'padding20', 'padding border-top']})]
    feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')]
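A gotcha this file illustrates: recipe options are plain class attributes, so a misspelled option is valid Python that calibre silently ignores rather than rejecting. This recipe shipped with no_stylesheers instead of no_stylesheets (normalized above), which left stylesheet stripping off. A minimal sketch of the failure mode (class names are invented):

class Base(object):
    no_stylesheets = False  # the option calibre actually reads

class Recipe(Base):
    no_stylesheers = True   # typo: defines a new attribute nothing consults

print(Recipe.no_stylesheets)  # -> False; stylesheets would still be downloaded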
@@ -1,75 +0,0 @@
#!/usr/bin/env python2

__license__ = 'GPL v3'
__copyright__ = 'teepel 2012'

'''
sport.pl
'''

from calibre.web.feeds.news import BasicNewsRecipe


class sport_pl(BasicNewsRecipe):
    title = 'Sport.pl'
    __author__ = 'teepel <teepel44@gmail.com>'
    language = 'pl'
    description = u'Największy portal sportowy w Polsce. Wiadomości sportowe z najważniejszych wydarzeń, relacje i wyniki meczów na żywo.'
    masthead_url = 'http://press.gazeta.pl/file/mediakit/154509/c8/sportpl.jpg'
    oldest_article = 1
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    remove_empty_feeds = True
    ignore_duplicate_articles = {'title', 'url'}
    keep_only_tags = []
    keep_only_tags.append(dict(name='div', attrs={'id': 'article'}))

    remove_tags = []
    remove_tags.append(dict(name='a', attrs={'href': 'www.gazeta.pl'}))

    feeds = [
        (u'Wszystkie wiadomości', u'http://rss.gazeta.pl/pub/rss/sport.xml'),
        (u'Piłka nożna',
         u'http://www.sport.pl/pub/rss/sport/pilka_nozna.htm'),
        (u'F1', u'http://www.sport.pl/pub/rss/sportf1.htm'),
        (u'Tenis', u'http://serwisy.gazeta.pl/pub/rss/tenis.htm'),
        (u'Siatkówka', u'http://gazeta.pl.feedsportal.com/c/32739/f/611628/index.rss'),
        (u'Koszykówka', u'http://gazeta.pl.feedsportal.com/c/32739/f/611647/index.rss'),
        (u'Piłka ręczna',
         u'http://gazeta.pl.feedsportal.com/c/32739/f/611635/index.rss'),
        (u'Inne sporty', u'http://gazeta.pl.feedsportal.com/c/32739/f/611649/index.rss'),
    ]

    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            for article in feed.articles[:]:
                if '[ZDJĘCIA]' in article.title:
                    article.title = article.title.replace('[ZDJĘCIA]', '')
                elif '[WIDEO]' in article.title:
                    article.title = article.title.replace('[WIDEO]', '')
        return feeds

    def print_version(self, url):
        if 'feedsportal' in url:
            segment = url.split('/')
            urlPart = segment[-2]
            urlPart = urlPart.replace('0L0Ssport0Bpl0C', '')
            urlPart = urlPart.replace('0C10H', '/')
            urlPart = urlPart.replace('0H', ',')
            urlPart = urlPart.replace('0I', '_')
            urlPart = urlPart.replace('A', '')
            segment1 = urlPart.split('/')
            seg1 = segment1[0]
            seg2 = segment1[1]
            segment2 = seg2.split(',')
            part = segment2[0] + ',' + segment2[1]
            return 'http://www.sport.pl/' + seg1 + '/2029020,' + part + '.html'
        else:
            segment = url.split('/')
            part2 = segment[-2]
            part1 = segment[-1]
            segment2 = part1.split(',')
            part = segment2[1] + ',' + segment2[2]
            return 'http://www.sport.pl/' + part2 + '/2029020,' + part + '.html'
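The feedsportal branch of print_version() above appears to invert feedsportal's URL escaping, where pairs like 0C, 0H and 0I stand for '/', ',' and '_' inside the next-to-last path segment. A worked example with a fabricated encoded segment (real feed URLs differ):

urlPart = '0L0Ssport0Bpl0Cpilka0C10H129512370Htest0Iartykul'
urlPart = urlPart.replace('0L0Ssport0Bpl0C', '')  # strip the encoded site prefix
urlPart = urlPart.replace('0C10H', '/')           # -> 'pilka/12951237...'
urlPart = urlPart.replace('0H', ',')
urlPart = urlPart.replace('0I', '_')
seg1, seg2 = urlPart.split('/')
num, slug = seg2.split(',')[:2]
print('http://www.sport.pl/' + seg1 + '/2029020,' + num + ',' + slug + '.html')
# -> http://www.sport.pl/pilka/2029020,12951237,test_artykul.html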