mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Merge branch 'master' of https://github.com/t3d/calibre
This commit is contained in:
commit
f6fa902490
Binary file not shown.
Before Width: | Height: | Size: 137 B |
Binary file not shown.
Before Width: | Height: | Size: 201 B |
Binary file not shown.
Before Width: | Height: | Size: 472 B |
Binary file not shown.
Before Width: | Height: | Size: 284 B |
@ -1,48 +0,0 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
|
||||
|
||||
class PB_PL(BasicNewsRecipe):
    """Calibre news recipe for Puls Biznesu (pb.pl).

    Articles are collected from the site's Atom feeds. Whenever an article
    URL carries a ``<digits>,<digits>`` identifier, the article is fetched
    through the ``actionprint`` URL instead of the regular page.
    """

    title = u'Puls Biznesu'
    __author__ = 'fenuks'
    language = 'pl'
    description = u'Puls Biznesu - biznes, ekonomia, giełda, inwestycje'
    category = u'newspaper'
    publication_type = u'newspaper'
    encoding = 'utf-8'
    # masthead_url = 'http://www.pb.pl/img/pb.png'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    ignore_duplicate_articles = {'title', 'url'}
    remove_tags_after = dict(name='div', attrs={'class': 'news_content'})
    feeds = [
        (u'Wszystkie', u'http://www.pb.pl/atom'),
        (u'Puls inwestora', u'http://pulsinwestora.pb.pl/atom'),
        (u'Puls Firmy', u'http://firma.pb.pl/atom'),
        (u'PB Weekend', u'http://weekend.pb.pl/atom'),
        (u'Forum MPS', u'http://forummsp.pb.pl/atom'),
        (u'Moto', u'http://moto.pb.pl/atom'),
        (u'Kariera i praca', u'http://kariera.pb.pl/atom'),

        (u'Nieruchomości', u'http://nieruchomosci.pb.pl/atom'),
        (u'Samorządy', u'http://samorzady.pb.pl/atom'),
        (u'Tech', u'http://tech.pb.pl/atom'),
        (u'Energetyka', u'http://energetyka.pb.pl/atom'),
        (u'Retailing', u'http://retailing.pb.pl/atom'),
        (u'Puls medycyny', u'http://pulsmedycyny.pl/atom'),
        (u'Logistyka', u'http://logistyka.pb.pl/atom')]

    def print_version(self, url):
        """Return the ``actionprint`` URL for *url*.

        The first ``<digits>,<digits>`` group found in *url* is used as the
        article id. If there is no such group, *url* is returned unchanged.
        """
        article_id = re.search(r'(?P<id>\d+,\d+)', url)
        if article_id:
            return 'http://www.pb.pl/actionprint/' + article_id.group('id')
        return url

    def get_cover_url(self):
        """Look up the cover image on the archive page and return its URL.

        ``self.cover_url`` is updated only when an ``<img class="cover_picture">``
        with a ``src`` attribute is actually present; otherwise whatever
        ``cover_url`` already holds (or None if it is unset) is returned.
        """
        soup = self.index_to_soup('http://archiwum.pb.pl/')
        cover = soup.find(name='img', attrs={'class': 'cover_picture'})
        # find() yields None when the image is missing; subscripting that
        # would abort the whole download with a TypeError.
        if cover is not None and cover.get('src'):
            self.cover_url = cover['src']
        return getattr(self, 'cover_url', None)
|
@ -11,35 +11,10 @@ class PurePC(BasicNewsRecipe):
|
||||
description = u'Artykuły, aktualności, sprzęt, forum, chłodzenie, modding, urządzenia mobilne - wszystko w jednym miejscu.'
|
||||
category = 'IT'
|
||||
language = 'pl'
|
||||
masthead_url = 'http://www.purepc.pl/themes/new/images/purepc.jpg'
|
||||
cover_url = 'http://www.purepc.pl/themes/new/images/purepc.jpg'
|
||||
extra_css = '.wykres_logo {float: left; margin-right: 5px;}'
|
||||
no_stylesheets = True
|
||||
keep_only_tags = [dict(id='content')]
|
||||
remove_tags_after = dict(attrs={'class': 'fivestar-widget'})
|
||||
remove_tags = [dict(id='navigator'), dict(
|
||||
attrs={'class': ['box-tools', 'fivestar-widget', 'PageMenuList']})]
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'node page0'})]
|
||||
remove_tags = [dict(name='div', attrs={'class':'article-options'})]
|
||||
feeds = [(u'Wiadomo\u015bci', u'http://www.purepc.pl/node/feed')]
|
||||
|
||||
def append_page(self, soup, appendtag):
    """Append the follow-up pages of a multi-page article to *appendtag*.

    The element with class ``pager-last`` supplies the link to the last
    page; its ``href`` is split into a base URL and a page count. Each page
    from 1 up to that count is downloaded with ``self.index_to_soup`` and its
    ``article`` element is inserted at the end of *appendtag*. Pager widgets
    and HTML comments are then removed from *appendtag*.

    :param soup: parsed article page (not used directly here).
    :param appendtag: tag that receives the extra pages and is cleaned up.
    """
    lasturl = appendtag.find(attrs={'class': 'pager-last'})
    if not lasturl:
        return

    # The pager element may lack a link, and the link may not follow the
    # expected "...2C<number>" shape; in either case no pages are fetched.
    link = lasturl.a
    href = link.get('href') if link is not None else None
    regex = re.search(r'(.+?2C)(\d+)', href or '')
    if regex:
        baseurl = regex.group(1).replace('?page=0%2C', '?page=1%2C')
        baseurl = 'http://www.purepc.pl' + baseurl
        nr = int(regex.group(2))
        for page_nr in range(1, nr + 1):
            soup2 = self.index_to_soup(baseurl + str(page_nr))
            pagetext = soup2.find(attrs={'class': 'article'})
            if pagetext is None:
                # Nothing to insert for this page.
                continue
            appendtag.insert(len(appendtag.contents), pagetext)

    # NOTE(review): indentation was lost in this view; the clean-up below is
    # assumed to belong to the multi-page branch - confirm against the file.
    for r in appendtag.findAll(attrs={'class': ['PageMenuList', 'pager', 'fivestar-widget']}):
        r.extract()
    comments = appendtag.findAll(
        text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
|
||||
|
||||
def preprocess_html(self, soup):
    """Run ``append_page`` on the body of *soup* and hand *soup* back."""
    body = soup.body
    self.append_page(soup, body)
    return soup
|
||||
|
@ -1,58 +0,0 @@
|
||||
__copyright__ = '2012, Micha\u0142 <webmaster@racjonalista.pl>'
|
||||
'''
|
||||
Racjonalista.pl
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
|
||||
|
||||
class Racjonalista(BasicNewsRecipe):
    """Calibre news recipe for Racjonalista.pl.

    Articles come from the site's RSS feed and are fetched through the
    ``/t,`` variant of each ``/s,`` URL. The raw HTML is rewritten by
    ``preprocess_regexps`` before parsing.
    """

    __author__ = u'Micha\u0142 <webmaster@racjonalista.pl>'
    publisher = u'Fundacja Wolnej My\u015bli'
    title = u'Racjonalista.pl'
    description = u'Racjonalista.pl'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    oldest_article = 7
    max_articles_per_feed = 20
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    simultaneous_downloads = 2
    timeout = 30
    cover_url = 'http://www.racjonalista.pl/img/uimg/rac.gif'

    feeds = [(u'Racjonalista.pl', u'http://www.racjonalista.pl/rss.php')]

    match_regexps = [r'kk\.php']

    def print_version(self, url):
        """Return *url* with every ``/s,`` replaced by ``/t,``."""
        return url.replace('/s,', '/t,')

    # The CSS ``font`` shorthand requires the size before the family;
    # ``serif large`` is an invalid declaration.
    extra_css = 'h2 {font: large serif} .cytat {text-align: right}'

    remove_attributes = ['target', 'width', 'height']

    # (pattern, replacement) pairs applied to the raw HTML; every pattern is
    # compiled with re.DOTALL.
    preprocess_regexps = [
        (re.compile(i[0], re.DOTALL), i[1]) for i in
        [
            # NOTE(review): the next two patterns showed up as a bare space,
            # which makes the second rule a no-op; presumably they targeted
            # the non-breaking-space entity - confirm. Both the entity and
            # the U+00A0 character are matched; the literal-space paragraph
            # case is kept.
            (r'<p[^>]*>(?:&nbsp;|[ \xa0])</p>', lambda match: ''),
            (r'&nbsp;|\xa0', lambda match: ' '),
            (r'<meta[^>]+>', lambda match: ''),
            (r'<link[^>]+>', lambda match: ''),
            (r'</?center>', lambda match: ''),
            # Author link -> plain bold name.
            (r'<a href="[^"]+" rel=author><b>(?P<a>[^<]+)</b></a>',
             lambda match: '<b>' + match.group('a') + '</b>'),
            # Centered 18px div -> <h2> (styled through extra_css).
            (r'<div align=center style="font-size:18px">(?P<t>[^<]+)</div>',
             lambda match: '<h2>' + match.group('t') + '</h2>'),
            # Strip the layout table wrapped around the article.
            (r'<table align=center width=700 border=0 cellpadding=0 cellspacing=0><tr><td width="100%" bgcolor="#edeceb" height="100%" style="font-size:12px">', lambda match: ''),  # noqa
            (r'</td></tr><tr><td>', lambda match: ''),
            (r'</td></tr></table></body>', lambda match: '</body>'),
            # Unlink superscript anchors and named "p..." anchors, keeping
            # their text.
            (r'<a[^>]+><sup>(?P<p>[^<]+)</sup></a>',
             lambda match: '<sup>' + match.group('p') + '</sup>'),
            (r'<a name=p[^>]+>(?P<a>[^<]+)</a>', lambda match: match.group('a')),
            # Drop the "Orygin..." and "Poka..." links entirely.
            (r'<a href="[^"]+" target=_blank class=linkext>Orygin[^<]+</a>',
             lambda match: ''),
            (r'<a href="[^"]+" class=powiazanie>Poka[^<]+</a>', lambda match: ''),
        ]
    ]
|
@ -1,36 +0,0 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
|
||||
class ResPublicaNowaRecipe(BasicNewsRecipe):
    """Calibre news recipe for Res Publica Nowa (publica.pl feed)."""

    # Recipe identity
    title = u'Res Publica Nowa'
    description = u'Portal kulturalno-społecznego kwartalnika o profilu liberalnym, wydawany przez Fundację Res Publica'
    category = u'News'
    language = 'pl'
    version = 1
    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
    __license__ = 'GPL v3'
    cover_url = ''

    # Download limits
    oldest_article = 7
    max_articles_per_feed = 100000
    recursions = 0
    simultaneous_downloads = 5

    # Clean-up switches
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True

    feeds = [
        ('Artykuly', 'feed://publica.pl/feed'),
    ]

    def preprocess_html(self, soup):
        """Swap each ``<a>`` whose ``.string`` is not None for that string.

        Anchors whose ``.string`` is None are left in place. Returns *soup*.
        """
        for anchor in soup.findAll('a'):
            label = anchor.string
            if label is None:
                continue
            anchor.replaceWith(label)
        return soup
|
@ -26,13 +26,11 @@ class RMF24_ESKN(BasicNewsRecipe):
|
||||
(u'Nauka', u'http://www.rmf24.pl/nauka/feed')]
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class': 'box articleSingle print'})]
|
||||
dict(name='header', attrs={'class': 'article-header'}),
|
||||
dict(name='div', attrs={'class': 'article-container'})]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class': 'toTop'}),
|
||||
dict(name='div', attrs={'class': 'category'}),
|
||||
dict(name='div', attrs={'class': 'REMOVE'}),
|
||||
dict(name='div', attrs={'class': 'embed embedAd'})]
|
||||
remove_tags = [dict(name='div', attrs={'id': 'ReklamaMobile'}),
|
||||
dict(name='img', attrs={'class': 'img-responsive hidden-lg hidden-md hidden-sm'})]
|
||||
|
||||
extra_css = '''
|
||||
h1 { font-size: 1.2em; }
|
||||
@ -42,7 +40,6 @@ class RMF24_ESKN(BasicNewsRecipe):
|
||||
(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
(r'<h2>Zdj.cie</h2>', lambda match: ''),
|
||||
(r'embed embed(Left|Right|Center) articleEmbed(Audio|Wideo articleEmbedVideo|ArticleFull|ArticleTitle|ArticleListTitle|AlbumHorizontal)">', lambda match: 'REMOVE">'), # noqa
|
||||
(r'<a href="http://www.facebook.com/pages/RMF24pl/.*?>RMF24.pl</a> on Facebook</div>',
|
||||
lambda match: '</div>')
|
||||
]
|
||||
|
@ -24,14 +24,11 @@ class RMF24(BasicNewsRecipe):
|
||||
(u'\u015awiat', u'http://www.rmf24.pl/fakty/swiat/feed')]
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class': 'box articleSingle print'})]
|
||||
dict(name='header', attrs={'class': 'article-header'}),
|
||||
dict(name='div', attrs={'class': 'article-container'})]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'id': 'adBox625'}),
|
||||
dict(name='div', attrs={'class': 'toTop'}),
|
||||
dict(name='div', attrs={'class': 'category'}),
|
||||
dict(name='div', attrs={'class': 'REMOVE'}),
|
||||
dict(name='div', attrs={'class': 'embed embedAd'})]
|
||||
remove_tags = [dict(name='div', attrs={'id': 'ReklamaMobile'}),
|
||||
dict(name='img', attrs={'class': 'img-responsive hidden-lg hidden-md hidden-sm'})]
|
||||
|
||||
extra_css = '''
|
||||
h1 { font-size: 1.2em; }
|
||||
@ -40,7 +37,6 @@ class RMF24(BasicNewsRecipe):
|
||||
(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
(r'<h2>Zdj.cie</h2>', lambda match: ''),
|
||||
(r'embed embed(Left|Right|Center) articleEmbed(Audio|Wideo articleEmbedVideo|ArticleFull|ArticleTitle|ArticleListTitle|AlbumHorizontal)">', lambda match: 'REMOVE">'), # noqa
|
||||
(r'<a href="http://www.facebook.com/pages/RMF24pl/.*?>RMF24.pl</a> on Facebook</div>',
|
||||
lambda match: '</div>')
|
||||
]
|
||||
|
@ -19,6 +19,7 @@ class RMF24_opinie(BasicNewsRecipe):
|
||||
__author__ = u'Tomasz D\u0142ugosz'
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
remove_empty_feeds = True
|
||||
|
||||
feeds = [(u'Blogi', u'http://www.rmf24.pl/opinie/blogi/feed'),
|
||||
(u'Kontrwywiad',
|
||||
@ -28,16 +29,10 @@ class RMF24_opinie(BasicNewsRecipe):
|
||||
(u'Komentarze', u'http://www.rmf24.pl/opinie/komentarze/feed')]
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class': 'box articleSingle print'}),
|
||||
dict(name='div', attrs={
|
||||
'class': 'box articleSingle print singleCommentary'}),
|
||||
dict(name='div', attrs={'class': 'box articleSingle print blogSingleEntry'})]
|
||||
dict(name='header', attrs={'class': 'article-header'}),
|
||||
dict(name='div', attrs={'class': 'article-container'})]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class': 'toTop'}),
|
||||
dict(name='div', attrs={'class': 'category'}),
|
||||
dict(name='div', attrs={'class': 'REMOVE'}),
|
||||
dict(name='div', attrs={'class': 'embed embedAd'})]
|
||||
remove_tags = [dict(name='div', attrs={'id': 'ReklamaMobile'})]
|
||||
|
||||
extra_css = '''
|
||||
h1 { font-size: 1.2em; }
|
||||
|
@ -17,26 +17,22 @@ class prawica_recipe(BasicNewsRecipe):
|
||||
description = u'Portal "Rynek Infrastruktury" to źródło informacji o kluczowych elementach polskiej gospodarki: drogach, kolei, lotniskach, portach, telekomunikacji, energetyce, prawie i polityce, wzmocnione eksperckimi komentarzami kluczowych analityków.' # noqa
|
||||
remove_empty_feeds = True
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
max_articles_per_feed = 50
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
||||
feeds = [
|
||||
(u'Drogi', u'http://www.rynekinfrastruktury.pl/rss/41'),
|
||||
(u'Lotniska', u'http://www.rynekinfrastruktury.pl/rss/42'),
|
||||
(u'Kolej', u'http://www.rynekinfrastruktury.pl/rss/37'),
|
||||
(u'Energetyka', u'http://www.rynekinfrastruktury.pl/rss/30'),
|
||||
(u'Telekomunikacja', u'http://www.rynekinfrastruktury.pl/rss/31'),
|
||||
(u'Porty', u'http://www.rynekinfrastruktury.pl/rss/32'),
|
||||
(u'Prawo i polityka', u'http://www.rynekinfrastruktury.pl/rss/47'),
|
||||
(u'Komentarze', u'http://www.rynekinfrastruktury.pl/rss/38'),
|
||||
(u'Drogi', u'http://www.rynekinfrastruktury.pl/rss/drogi.xml'),
|
||||
(u'Kolej', u'http://www.rynekinfrastruktury.pl/rss/kolej.xml'),
|
||||
(u'Energetyka', u'http://www.rynekinfrastruktury.pl/rss/energetyka.xml')
|
||||
# no news in these feeds for the last 4 years:
|
||||
#(u'Porty i lotniska', u'http://www.rynekinfrastruktury.pl/rss/porty-i-lotniska.xml'),
|
||||
#(u'Komentarze', u'http://www.rynekinfrastruktury.pl/rss/komentarze-i-felietony.xml'),
|
||||
]
|
||||
|
||||
keep_only_tags = []
|
||||
keep_only_tags.append(dict(name='div', attrs={'class': 'articleContent'}))
|
||||
keep_only_tags = [
|
||||
dict(name='h1', attrs={'class': 'wiadTit'}),
|
||||
dict(name='div', attrs={'class': ['wiadSzczegol', 'multimediaWiadomosci', 'wiadTresc']})
|
||||
]
|
||||
|
||||
remove_tags = []
|
||||
remove_tags.append(dict(name='span', attrs={'class': 'date'}))
|
||||
|
||||
def print_version(self, url):
    """Rewrite an ``/artykul/`` URL into its ``/artykul/drukuj/`` form.

    URLs that do not contain the article prefix are returned unchanged.
    """
    article_prefix = 'http://www.rynekinfrastruktury.pl/artykul/'
    print_prefix = 'http://www.rynekinfrastruktury.pl/artykul/drukuj/'
    return url.replace(article_prefix, print_prefix)
|
||||
remove_tags = [dict(name='span', attrs={'class': 'kom'})]
|
||||
|
@ -1,40 +0,0 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'teepel <teepel44@gmail.com>'
|
||||
|
||||
'''
|
||||
rynek-kolejowy.pl
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
|
||||
class rynek_kolejowy(BasicNewsRecipe):
    """Calibre news recipe for Rynek Kolejowy (rynek-kolejowy.pl).

    Articles come from the site's RSS feed and are fetched through
    ``drukuj.php``, keyed by the first path segment of the article URL.
    """

    title = u'Rynek Kolejowy'
    __author__ = 'teepel <teepel44@gmail.com>'
    language = 'pl'
    description = u'Rynek Kolejowy - kalendarium wydarzeń branży kolejowej, konferencje, sympozja, targi kolejowe, krajowe i zagraniczne.'
    masthead_url = 'http://p.wnp.pl/images/i/partners/rynek_kolejowy.gif'
    remove_empty_feeds = True
    oldest_article = 1
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True

    keep_only_tags = [dict(name='div', attrs={'id': 'mainContent'})]

    remove_tags = [
        dict(name='div', attrs={'class': 'right no-print'}),
        dict(name='div', attrs={'id': 'font-size'}),
        dict(name='div', attrs={'class': 'no-print'}),
    ]

    extra_css = '''.wiadomosc_title{ font-size: 1.4em; font-weight: bold; }'''

    feeds = [(u'Wiadomości', u'http://www.rynek-kolejowy.pl/rss/rss.php')]

    def print_version(self, url):
        """Return the ``drukuj.php`` URL for the article at *url*.

        The id is the fourth ``/``-separated piece of *url*, i.e. the first
        path segment of an absolute URL. When that piece is missing or
        empty, *url* is returned unchanged rather than raising IndexError
        or producing a link with an empty id.
        """
        segment = url.split('/')
        if len(segment) < 4 or not segment[3]:
            return url
        return 'http://www.rynek-kolejowy.pl/drukuj.php?id=' + segment[3]
|
Loading…
x
Reference in New Issue
Block a user