mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Merge branch 'master' of https://github.com/fenuks/calibre
New and updated Polish news sources.
This commit is contained in:
commit
c56989ea1a
65
recipes/cdrinfo_pl.recipe
Normal file
65
recipes/cdrinfo_pl.recipe
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
__license__ = 'GPL v3'
|
||||||
|
import re
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from calibre.ebooks.BeautifulSoup import Comment
|
||||||
|
class cdrinfo(BasicNewsRecipe):
    """Calibre news recipe for CDRinfo.pl.

    Articles on the site can be split over several pages. ``preprocess_html``
    looks for the pager element (``id='artnawigacja'``) and, when present,
    ``append_page`` downloads the following pages and appends their article
    text to the first page.
    """

    title = u'CDRinfo.pl'
    __author__ = 'fenuks'
    description = u'Serwis poświęcony archiwizacji danych. Testy i recenzje nagrywarek. Programy do nagrywania płyt. Dyski twarde, dyski SSD i serwery sieciowe NAS. Rankingi dyskow twardych, najszybsze dyski twarde, newsy, artykuły, testy, recenzje, porady, oprogramowanie. Zestawienie nagrywarek, najnowsze biosy do nagrywarek, programy dla dysków twardych.'
    category = 'it, hardware'
    language = 'pl'
    cover_url = 'http://www.cdrinfo.pl/gfx/graph3/top.jpg'
    use_embedded_content = False
    oldest_article = 777
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style']
    # Strips the "please follow netiquette ... gravatar.com" notice paragraph.
    # Backslashes are doubled so the pattern does not rely on an invalid
    # string escape; the compiled pattern is unchanged.
    preprocess_regexps = [
        (re.compile(u'<p[^>]*?>Uprzejmie prosimy o przestrzeganie netykiety.+?www\\.gravatar\\.com</a>\\.</p>', re.DOTALL),
         lambda match: ''),
    ]
    ignore_duplicate_articles = {'title', 'url'}

    # The hidden <input name="ref"> is kept because append_page reads the
    # article's base path from its ``value`` attribute.
    keep_only_tags = [dict(name='input', attrs={'name':'ref'}), dict(id='text')]
    remove_tags = [dict(attrs={'class':['navigation', 'sociable']}), dict(name='hr'), dict(id='respond')]
    remove_tags_after = dict(id='artnawigacja')
    feeds = [
        (u'Wiadomości', 'http://feeds.feedburner.com/cdrinfo'),
        (u'Recenzje', 'http://www.cdrinfo.pl/rss/rss_recenzje.php'),
        (u'Konsole', 'http://konsole.cdrinfo.pl/rss/rss_konsole_news.xml'),
        (u'Pliki', 'http://www.cdrinfo.pl/rss/rss_pliki.xml'),
    ]

    def preprocess_html(self, soup):
        """Append follow-up pages when the article has a pager; return soup."""
        if soup.find(id='artnawigacja'):
            self.append_page(soup, soup.body)
        return soup

    def _fetch_page(self, url, attempts=5):
        """Return the parsed page at ``url``, or None if every attempt fails.

        The download is retried up to ``attempts`` times because a single
        failed request should not abort the whole article.
        """
        for _ in range(attempts):
            try:
                return self.index_to_soup(url)
            except Exception:
                # Best-effort retry; KeyboardInterrupt/SystemExit still
                # propagate because only Exception is caught.
                continue
        return None

    def append_page(self, soup, appendtag):
        """Download the remaining pages of an article and append their text.

        :param soup: parsed first page of the article.
        :param appendtag: tag that receives each follow-up page's
            ``class='art'`` element as a new last child.

        Pagination stops when a page has no "next" link, when a page cannot
        be downloaded, or when it lacks the expected article container.
        The pager of the first page is removed at the end.
        """
        ref = soup.find(name='input', attrs={'name':'ref'})
        tag = soup.find(id='artnawigacja')
        if ref is None or tag is None:
            # Nothing to follow without the base path or the pager.
            return

        baseurl = 'http://cdrinfo.pl' + ref['value'] + '/'
        # Avoid a doubled slash when the ref value already ends with one.
        if baseurl[-2] == '/':
            baseurl = baseurl[:-1]

        div = tag.find('div', attrs={'align':'right'})
        while div is not None and div.a is not None:
            soup2 = self._fetch_page(baseurl + div.a['href'])
            if soup2 is None:
                # Give up instead of reusing a stale page, which would
                # append the same content forever.
                break

            # Locate the link to the page after this one before mutating soup2.
            tag2 = soup2.find(id='artnawigacja')
            div = tag2.find('div', attrs={'align':'right'}) if tag2 is not None else None

            pagetext = soup2.find(attrs={'class':'art'})
            if pagetext is None:
                break

            for comment in pagetext.findAll(text=lambda text: isinstance(text, Comment)):
                comment.extract()
            for css_class in ('star-rating', 'star-rating2'):
                for rating in soup2.findAll(attrs={'class': css_class}):
                    rating.extract()

            appendtag.insert(len(appendtag.contents), pagetext)

        tag.extract()
|
@ -9,13 +9,15 @@ class EkologiaPl(BasicNewsRecipe):
|
|||||||
language = 'pl'
|
language = 'pl'
|
||||||
cover_url = 'http://www.ekologia.pl/assets/images/logo/ekologia_pl_223x69.png'
|
cover_url = 'http://www.ekologia.pl/assets/images/logo/ekologia_pl_223x69.png'
|
||||||
ignore_duplicate_articles = {'title', 'url'}
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
extra_css = '.title {font-size: 200%;} .imagePowiazane, .imgCon {float:left; margin-right:5px;}'
|
extra_css = '.title {font-size: 200%;} .imagePowiazane {float:left; margin-right:5px; width: 200px;}'
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
|
remove_javascript = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
remove_attrs = ['style']
|
remove_attrs = ['style']
|
||||||
|
keep_only_tags = [dict(attrs={'class':'contentParent'})]
|
||||||
remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj', 'widget-social-buttons']})]
|
remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj', 'widget-social-buttons']})]
|
||||||
|
|
||||||
feeds = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')]
|
feeds = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')]
|
||||||
|
87
recipes/gazeta_pl_bydgoszcz.recipe
Normal file
87
recipes/gazeta_pl_bydgoszcz.recipe
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from calibre.ebooks.BeautifulSoup import Comment
|
||||||
|
import re
|
||||||
|
class gw_bydgoszcz(BasicNewsRecipe):
    """Calibre news recipe for the Bydgoszcz edition of Gazeta Wyborcza.

    Feed links point at feedsportal.com redirectors; ``print_version``
    rewrites them into direct article URLs under ``INDEX``. Multi-page
    articles are stitched together by ``append_page``.
    """

    title = u'Gazeta Wyborcza Bydgoszcz'
    __author__ = 'fenuks'
    language = 'pl'
    description = 'Wiadomości z Bydgoszczy na portalu Gazeta.pl.'
    category = 'newspaper'
    publication_type = 'newspaper'
    masthead_url = 'http://bi.gazeta.pl/im/3/4089/m4089863.gif'
    INDEX = 'http://bydgoszcz.gazeta.pl'
    cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif'
    remove_empty_feeds = True
    oldest_article = 3
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}

    # Rules for gazeta.pl: everything from the "Czytaj więcej" marker to the
    # end of the document is replaced by a closing </body> tag.
    preprocess_regexps = [(re.compile(u'<b>Czytaj więcej</b>.*', re.DOTALL), lambda m: '</body>')]
    keep_only_tags = [dict(id='gazeta_article')]
    remove_tags = [dict(id=['gazeta_article_tools', 'gazeta_article_miniatures']), dict(attrs={'class':['mod mod_sociallist', 'c0', 'fb', 'voteNeedLogin']})]
    remove_tags_after = dict(id='gazeta_article_body')

    feeds = [(u'Wiadomości', u'http://rss.feedsportal.com/c/32739/f/530239/index.rss')]

    # Substitutions applied, in this exact order, to the path extracted from
    # a feedsportal.com link. Order matters: e.g. 'A' is removed before the
    # remaining '0E'/'0H'/'0I'/'0B' escapes are expanded.
    _FEEDSPORTAL_REPLACEMENTS = (
        ('/l/', '/'),
        ('/ia1.htm', ''),
        ('0Dbo0F1', ''),
        ('/story01.htm', ''),
        ('0C', '/'),
        ('A', ''),
        ('0E', '-'),
        ('0H', ','),
        ('0I', '_'),
        ('0B', '.'),
    )

    def print_version(self, url):
        """Turn a feedsportal.com redirect URL into a direct article URL.

        URLs that do not contain ``feedsportal.com`` are returned unchanged.
        Otherwise the part after the last ``gazeta0Bpl`` (or, failing that,
        ``wyborcza0Bpl``) marker is decoded and prefixed with ``INDEX``.
        """
        if 'feedsportal.com' not in url:
            return url

        head, _sep, path = url.rpartition('gazeta0Bpl')
        if not head:
            path = url.rpartition('wyborcza0Bpl')[2]
        for old, new in self._FEEDSPORTAL_REPLACEMENTS:
            path = path.replace(old, new)
        return self.INDEX + path

    def preprocess_html(self, soup):
        """Skip paywalled articles and merge multi-page ones.

        Returns None when an element with class ``piano_btn_1`` is present;
        otherwise returns ``soup`` (with follow-up pages appended when the
        ``id='Str'`` pager contains links).
        """
        tag = soup.find(id='Str')
        if soup.find(attrs={'class': 'piano_btn_1'}):
            return None
        if tag and tag.findAll('a'):
            self.append_page(soup, soup.body)
        return soup

    def _next_page_link(self, soup):
        """Return the pager's last link if it is a "next" link, else None."""
        pager = soup.find('div', attrs={'id': 'Str'})
        if pager is None:
            return None
        links = pager.findAll('a')
        if not links:
            return None
        last = links[-1]
        # ``string`` is None when the link wraps nested markup.
        if last.string and u'następne' in last.string:
            return last
        return None

    def append_page(self, soup, appendtag):
        """Download the remaining pages of an article and append their text.

        :param soup: parsed first page of the article.
        :param appendtag: tag that receives each follow-up page's
            ``id='artykul'`` element as a new last child.
        :return: 1 when the page has no usable ``og:url`` meta tag (kept for
            backward compatibility), otherwise None.
        """
        tag = soup.find('div', attrs={'id': 'Str'})
        try:
            baseurl = soup.find(name='meta', attrs={'property':'og:url'})['content']
        except (TypeError, KeyError):
            # No meta tag (None is not subscriptable) or no content attribute.
            return 1
        if tag is None:
            return

        links = tag.findAll('a')
        # On the first page the last pager link is followed unconditionally.
        link = links[-1] if links else None
        while link is not None:
            soup2 = self.index_to_soup(baseurl + link['href'])
            link = self._next_page_link(soup2)

            pagetext = soup2.find(id='artykul')
            if pagetext is None:
                break
            for comment in pagetext.findAll(text=lambda text: isinstance(text, Comment)):
                comment.extract()
            appendtag.insert(len(appendtag.contents), pagetext)

        tag.extract()

    def image_url_processor(self, baseurl, url):
        """Strip surrounding whitespace from image URLs that start with a space."""
        if url.startswith(' '):
            return url.strip()
        return url
|
@ -16,40 +16,47 @@ class Gildia(BasicNewsRecipe):
|
|||||||
ignore_duplicate_articles = {'title', 'url'}
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
preprocess_regexps = [(re.compile(ur'</?sup>'), lambda match: '') ]
|
preprocess_regexps = [(re.compile(ur'</?sup>'), lambda match: '') ]
|
||||||
ignore_duplicate_articles = {'title', 'url'}
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
remove_tags = [dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
|
remove_tags = [dict(name='div', attrs={'class':['backlink', 'im_img', 'addthis_toolbox addthis_default_style', 'banner-bottom']})]
|
||||||
keep_only_tags = dict(name='div', attrs={'class':'widetext'})
|
keep_only_tags = [dict(name='div', attrs={'class':'widetext'})]
|
||||||
feeds = [(u'Gry', u'http://www.gry.gildia.pl/rss'), (u'Literatura', u'http://www.literatura.gildia.pl/rss'), (u'Film', u'http://www.film.gildia.pl/rss'), (u'Horror', u'http://www.horror.gildia.pl/rss'), (u'Konwenty', u'http://www.konwenty.gildia.pl/rss'), (u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'), (u'Manga i anime', u'http://www.manga.gildia.pl/rss'), (u'Star Wars', u'http://www.starwars.gildia.pl/rss'), (u'Techno', u'http://www.techno.gildia.pl/rss'), (u'Historia', u'http://www.historia.gildia.pl/rss'), (u'Magia', u'http://www.magia.gildia.pl/rss'), (u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'), (u'RPG', u'http://www.rpg.gildia.pl/rss'), (u'LARP', u'http://www.larp.gildia.pl/rss'), (u'Muzyka', u'http://www.muzyka.gildia.pl/rss'), (u'Nauka', u'http://www.nauka.gildia.pl/rss')]
|
feeds = [(u'Gry', u'http://www.gry.gildia.pl/rss'),
|
||||||
|
(u'Literatura', u'http://www.literatura.gildia.pl/rss'),
|
||||||
|
(u'Film', u'http://www.film.gildia.pl/rss'),
|
||||||
|
(u'Horror', u'http://www.horror.gildia.pl/rss'),
|
||||||
|
(u'Konwenty', u'http://www.konwenty.gildia.pl/rss'),
|
||||||
|
(u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'),
|
||||||
|
(u'Manga i anime', u'http://www.manga.gildia.pl/rss'),
|
||||||
|
(u'Star Wars', u'http://www.starwars.gildia.pl/rss'),
|
||||||
|
(u'Techno', u'http://www.techno.gildia.pl/rss'),
|
||||||
|
(u'Historia', u'http://www.historia.gildia.pl/rss'),
|
||||||
|
(u'Magia', u'http://www.magia.gildia.pl/rss'),
|
||||||
|
(u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'),
|
||||||
|
(u'RPG', u'http://www.rpg.gildia.pl/rss'),
|
||||||
|
(u'LARP', u'http://www.larp.gildia.pl/rss'),
|
||||||
|
(u'Muzyka', u'http://www.muzyka.gildia.pl/rss'),
|
||||||
|
(u'Nauka', u'http://www.nauka.gildia.pl/rss'),
|
||||||
|
]
|
||||||
|
|
||||||
def skip_ad_pages(self, soup):
|
def skip_ad_pages(self, soup):
|
||||||
content = soup.find('div', attrs={'class':'news'})
|
content = soup.find('div', attrs={'class':'news'})
|
||||||
if 'recenzj' in soup.title.string.lower():
|
words = ('recenzj', 'zapowied','fragmen', 'relacj', 'wywiad', 'nominacj')
|
||||||
for link in content.findAll(name='a'):
|
for word in words:
|
||||||
if 'recenzj' in link['href'] or 'muzyka/plyty' in link['href']:
|
if word in soup.title.string.lower():
|
||||||
return self.index_to_soup(link['href'], raw=True)
|
for link in content.findAll(name='a'):
|
||||||
if 'fragmen' in soup.title.string.lower():
|
if word in link['href'] or (link.string and word in link.string):
|
||||||
for link in content.findAll(name='a'):
|
return self.index_to_soup(link['href'], raw=True)
|
||||||
if 'fragment' in link['href']:
|
for tag in content.findAll(name='a', href=re.compile('/publicystyka/')):
|
||||||
return self.index_to_soup(link['href'], raw=True)
|
if 'Więcej...' == tag.string:
|
||||||
if 'relacj' in soup.title.string.lower():
|
return self.index_to_soup(tag['href'], raw=True)
|
||||||
for link in content.findAll(name='a'):
|
|
||||||
if 'relacj' in link['href']:
|
|
||||||
return self.index_to_soup(link['href'], raw=True)
|
|
||||||
if 'wywiad' in soup.title.string.lower():
|
|
||||||
for link in content.findAll(name='a'):
|
|
||||||
if 'wywiad' in link['href']:
|
|
||||||
return self.index_to_soup(link['href'], raw=True)
|
|
||||||
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for a in soup('a'):
|
for a in soup('a'):
|
||||||
if a.has_key('href') and not a['href'].startswith('http'):
|
if a.has_key('href') and not a['href'].startswith('http'):
|
||||||
if '/gry/' in a['href']:
|
if '/gry/' in a['href']:
|
||||||
a['href']='http://www.gry.gildia.pl' + a['href']
|
a['href'] = 'http://www.gry.gildia.pl' + a['href']
|
||||||
elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower():
|
elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower():
|
||||||
a['href']='http://www.literatura.gildia.pl' + a['href']
|
a['href'] = 'http://www.literatura.gildia.pl' + a['href']
|
||||||
elif u'komiks' in soup.title.string.lower():
|
elif u'komiks' in soup.title.string.lower():
|
||||||
a['href']='http://www.literatura.gildia.pl' + a['href']
|
a['href'] = 'http://www.literatura.gildia.pl' + a['href']
|
||||||
else:
|
else:
|
||||||
a['href']='http://www.gildia.pl' + a['href']
|
a['href'] = 'http://www.gildia.pl' + a['href']
|
||||||
return soup
|
return soup
|
BIN
recipes/icons/cdrinfo_pl.png
Normal file
BIN
recipes/icons/cdrinfo_pl.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 909 B |
BIN
recipes/icons/gazeta_pl_bydgoszcz.png
Normal file
BIN
recipes/icons/gazeta_pl_bydgoszcz.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 294 B |
@ -3,33 +3,29 @@
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = 'teepel'
|
__copyright__ = 'teepel'
|
||||||
|
|
||||||
'''
|
|
||||||
media2.pl
|
|
||||||
'''
|
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class media2_pl(BasicNewsRecipe):
|
class media2_pl(BasicNewsRecipe):
|
||||||
title = u'Media2'
|
title = u'Media2'
|
||||||
__author__ = 'teepel <teepel44@gmail.com>'
|
__author__ = 'teepel <teepel44@gmail.com>'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
description =u'Media2.pl to jeden z najczęściej odwiedzanych serwisów dla profesjonalistów z branży medialnej, telekomunikacyjnej, public relations oraz nowych technologii.'
|
description = u'Media2.pl to jeden z najczęściej odwiedzanych serwisów dla profesjonalistów z branży medialnej, telekomunikacyjnej, public relations oraz nowych technologii.'
|
||||||
masthead_url='http://media2.pl/res/logo/www.png'
|
masthead_url = 'http://media2.pl/res/logo/www.png'
|
||||||
remove_empty_feeds= True
|
cover_url = 'http://media2.pl/res/logo/www.png'
|
||||||
oldest_article = 1
|
remove_empty_feeds = True
|
||||||
|
oldest_article = 7
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
remove_javascript=True
|
remove_javascript = True
|
||||||
no_stylesheets=True
|
no_stylesheets = True
|
||||||
simultaneous_downloads = 5
|
remove_attributes = ['style']
|
||||||
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
extra_css = '''.news-lead{font-weight: bold; }'''
|
extra_css = '''.news-lead{font-weight: bold; }'''
|
||||||
|
|
||||||
keep_only_tags =[]
|
keep_only_tags = [dict(name = 'div', attrs = {'class' : 'news-item tpl-big'})]
|
||||||
keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'news-item tpl-big'}))
|
remove_tags = [dict(name = 'span', attrs = {'class' : 'news-comments'}), dict(name = 'div', attrs = {'class' : 'item-sidebar'}), dict(name = 'div', attrs = {'class' : 'news-tags'})]
|
||||||
|
|
||||||
remove_tags =[]
|
feeds = [(u'Media2', u'http://feeds.feedburner.com/media2'), (u'Internet', u'http://feeds.feedburner.com/media2/internet'),
|
||||||
remove_tags.append(dict(name = 'span', attrs = {'class' : 'news-comments'}))
|
(u'Media', 'http://feeds.feedburner.com/media2/media'), (u'Telekomunikacja', 'http://feeds.feedburner.com/media2/telekomunikacja'),
|
||||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'item-sidebar'}))
|
(u'Reklama/PR', 'http://feeds.feedburner.com/media2/reklama-pr'), (u'Technologie', 'http://feeds.feedburner.com/media2/technologie'),
|
||||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'news-tags'}))
|
(u'Badania', 'http://feeds.feedburner.com/media2/badania')
|
||||||
|
]
|
||||||
feeds = [(u'Media2', u'http://feeds.feedburner.com/media2')]
|
|
@ -1,7 +1,7 @@
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
import re
|
import re
|
||||||
class NaukawPolsce(BasicNewsRecipe):
|
class NaukawPolsce(BasicNewsRecipe):
|
||||||
title = u'Nauka w Polsce'
|
title = u'PAP Nauka w Polsce'
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
description = u'Serwis Nauka w Polsce ma za zadanie popularyzację polskiej nauki. Można na nim znaleźć wiadomości takie jak: osiągnięcia polskich naukowców, wydarzenia na polskich uczelniach, osiągnięcia studentów, konkursy dla badaczy, staże i stypendia naukowe, wydarzenia w polskiej nauce, kalendarium wydarzeń w nauce, materiały wideo o nauce.'
|
description = u'Serwis Nauka w Polsce ma za zadanie popularyzację polskiej nauki. Można na nim znaleźć wiadomości takie jak: osiągnięcia polskich naukowców, wydarzenia na polskich uczelniach, osiągnięcia studentów, konkursy dla badaczy, staże i stypendia naukowe, wydarzenia w polskiej nauce, kalendarium wydarzeń w nauce, materiały wideo o nauce.'
|
||||||
category = 'science'
|
category = 'science'
|
||||||
|
@ -3,7 +3,7 @@ import re
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class Poltergeist(BasicNewsRecipe):
|
class Poltergeist(BasicNewsRecipe):
|
||||||
title = u'Poltergeist'
|
title = u'Polter.pl'
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
description = u'Największy polski serwis poświęcony ogólno pojętej fantastyce - grom fabularnym (RPG), książkom, filmowi, komiksowi, grom planszowym, karcianym i bitewnym.'
|
description = u'Największy polski serwis poświęcony ogólno pojętej fantastyce - grom fabularnym (RPG), książkom, filmowi, komiksowi, grom planszowym, karcianym i bitewnym.'
|
||||||
category = 'fantasy, books, rpg, games'
|
category = 'fantasy, books, rpg, games'
|
||||||
|
@ -1,41 +1,35 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
|
import re
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class ppeRecipe(BasicNewsRecipe):
|
class ppeRecipe(BasicNewsRecipe):
|
||||||
__author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
|
__author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
|
|
||||||
title = u'ppe.pl'
|
title = u'ppe.pl'
|
||||||
category = u'News'
|
category = u'News'
|
||||||
description = u'Portal o konsolach i grach wideo.'
|
description = u'Portal o konsolach i grach wideo.'
|
||||||
cover_url=''
|
extra_css = '.categories > li {list-style: none; display: inline;} .galmini > li {list-style: none; float: left;} .calibre_navbar {clear: both;}'
|
||||||
remove_empty_feeds= True
|
remove_empty_feeds = True
|
||||||
no_stylesheets=True
|
|
||||||
oldest_article = 1
|
|
||||||
max_articles_per_feed = 100000
|
|
||||||
recursions = 0
|
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
oldest_article = 7
|
||||||
|
max_articles_per_feed = 100
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
simultaneous_downloads = 2
|
remove_empty_feeds = True
|
||||||
|
remove_attributes = ['style']
|
||||||
|
|
||||||
keep_only_tags =[]
|
keep_only_tags = [dict(attrs={'class':'box'})]
|
||||||
keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'news-heading'}))
|
remove_tags = [dict(attrs={'class':['voltage-1', 'voltage-2', 'encyklopedia', 'nag', 'related', 'comment_form', 'komentarze-box']})]
|
||||||
keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'tresc-poziom'}))
|
|
||||||
|
|
||||||
remove_tags =[]
|
feeds = [
|
||||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'bateria1'}))
|
('Newsy', 'http://ppe.pl/rss.html'),
|
||||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'bateria2'}))
|
('Recenzje', 'http://ppe.pl/rss-recenzje.html'),
|
||||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'bateria3'}))
|
('Publicystyka', 'http://ppe.pl/rss-publicystyka.html'),
|
||||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'news-photo'}))
|
]
|
||||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'fbl'}))
|
|
||||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'info'}))
|
|
||||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'links'}))
|
|
||||||
|
|
||||||
remove_tags.append(dict(name = 'div', attrs = {'style' : 'padding: 4px'}))
|
def get_cover_url(self):
|
||||||
|
soup = self.index_to_soup('http://www.ppe.pl/psx_extreme.html')
|
||||||
feeds = [
|
part = soup.find(attrs={'class':'archiwum-foto'})['style']
|
||||||
('Newsy', 'feed://ppe.pl/rss/rss.xml'),
|
part = re.search("'(.+)'", part).group(1).replace('_min', '')
|
||||||
]
|
return 'http://www.ppe.pl' + part
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import re
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import Comment
|
from calibre.ebooks.BeautifulSoup import Comment
|
||||||
|
|
||||||
@ -11,6 +12,7 @@ class PurePC(BasicNewsRecipe):
|
|||||||
language = 'pl'
|
language = 'pl'
|
||||||
masthead_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg'
|
masthead_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg'
|
||||||
cover_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg'
|
cover_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg'
|
||||||
|
extra_css = '.wykres_logo {float: left; margin-right: 5px;}'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
keep_only_tags= [dict(id='content')]
|
keep_only_tags= [dict(id='content')]
|
||||||
remove_tags_after= dict(attrs={'class':'fivestar-widget'})
|
remove_tags_after= dict(attrs={'class':'fivestar-widget'})
|
||||||
@ -19,11 +21,14 @@ class PurePC(BasicNewsRecipe):
|
|||||||
|
|
||||||
|
|
||||||
def append_page(self, soup, appendtag):
|
def append_page(self, soup, appendtag):
|
||||||
nexturl= appendtag.find(attrs={'class':'pager-next'})
|
lasturl = appendtag.find(attrs={'class':'pager-last'})
|
||||||
if nexturl:
|
if lasturl:
|
||||||
while nexturl:
|
regex = re.search('(.+?2C)(\d+)', lasturl.a['href'])
|
||||||
soup2 = self.index_to_soup('http://www.purepc.pl'+ nexturl.a['href'])
|
baseurl = regex.group(1).replace('?page=0%2C', '?page=1%2C')
|
||||||
nexturl=soup2.find(attrs={'class':'pager-next'})
|
baseurl = 'http://www.purepc.pl' + baseurl
|
||||||
|
nr = int(regex.group(2))
|
||||||
|
for page_nr in range(1, nr+1):
|
||||||
|
soup2 = self.index_to_soup(baseurl+str(page_nr))
|
||||||
pagetext = soup2.find(attrs={'class':'article'})
|
pagetext = soup2.find(attrs={'class':'article'})
|
||||||
pos = len(appendtag.contents)
|
pos = len(appendtag.contents)
|
||||||
appendtag.insert(pos, pagetext)
|
appendtag.insert(pos, pagetext)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user