mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update various Polish recipes
This commit is contained in:
parent
6c224d75cf
commit
35d15d0eb5
@ -7,6 +7,7 @@ class Archeowiesci(BasicNewsRecipe):
|
|||||||
language = 'pl'
|
language = 'pl'
|
||||||
cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg'
|
cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg'
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
|
needs_subscription='optional'
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
auto_cleanup = True
|
auto_cleanup = True
|
||||||
remove_tags=[dict(name='span', attrs={'class':['post-ratings', 'post-ratings-loading']})]
|
remove_tags=[dict(name='span', attrs={'class':['post-ratings', 'post-ratings-loading']})]
|
||||||
@ -16,6 +17,16 @@ class Archeowiesci(BasicNewsRecipe):
|
|||||||
feeds = BasicNewsRecipe.parse_feeds(self)
|
feeds = BasicNewsRecipe.parse_feeds(self)
|
||||||
for feed in feeds:
|
for feed in feeds:
|
||||||
for article in feed.articles[:]:
|
for article in feed.articles[:]:
|
||||||
if 'subskrypcja' in article.title:
|
if self.username is None and 'subskrypcja' in article.title:
|
||||||
feed.articles.remove(article)
|
feed.articles.remove(article)
|
||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
|
def get_browser(self):
    """Return the browser used for downloads, logged in when possible.

    When both ``self.username`` and ``self.password`` are set, the
    WordPress login form at archeowiesci.pl is filled in and submitted
    before the browser is handed back; otherwise the browser is returned
    as created.
    """
    # NOTE(review): the base implementation is called without an instance;
    # this assumes BasicNewsRecipe.get_browser is callable on the class
    # (e.g. a classmethod) in the targeted calibre version -- confirm.
    browser = BasicNewsRecipe.get_browser()
    if self.username is None or self.password is None:
        # No (complete) credentials: anonymous access.
        return browser
    browser.open('http://archeowiesci.pl/wp-login.php')
    browser.select_form(name='loginform')
    browser['log'] = self.username
    browser['pwd'] = self.password
    browser.submit()
    return browser
|
@ -1,15 +1,18 @@
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
import re
|
||||||
class Astronomia_pl(BasicNewsRecipe):
|
class Astronomia_pl(BasicNewsRecipe):
|
||||||
title = u'Astronomia.pl'
|
title = u'Astronomia.pl'
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
description = 'Astronomia - polish astronomy site'
|
description = 'Astronomia - polish astronomy site'
|
||||||
|
masthead_url = 'http://www.astronomia.pl/grafika/logo.gif'
|
||||||
cover_url = 'http://www.astronomia.pl/grafika/logo.gif'
|
cover_url = 'http://www.astronomia.pl/grafika/logo.gif'
|
||||||
category = 'astronomy, science'
|
category = 'astronomy, science'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
oldest_article = 8
|
oldest_article = 8
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
#no_stylesheets=True
|
extra_css='#h2 {font-size: 18px;}'
|
||||||
|
no_stylesheets=True
|
||||||
|
preprocess_regexps = [(re.compile(ur'<b>Przeczytaj także:.*?</BODY>', re.DOTALL), lambda match: '</BODY>') ]
|
||||||
remove_tags_before=dict(name='div', attrs={'id':'a1'})
|
remove_tags_before=dict(name='div', attrs={'id':'a1'})
|
||||||
keep_only_tags=[dict(name='div', attrs={'id':['a1', 'h2']})]
|
keep_only_tags=[dict(name='div', attrs={'id':['a1', 'h2']})]
|
||||||
feeds = [(u'Wiadomości z astronomii i astronautyki', u'http://www.astronomia.pl/rss/')]
|
feeds = [(u'Wiadomości z astronomii i astronautyki', u'http://www.astronomia.pl/rss/')]
|
||||||
|
@ -4,16 +4,17 @@ class Benchmark_pl(BasicNewsRecipe):
|
|||||||
title = u'Benchmark.pl'
|
title = u'Benchmark.pl'
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
description = u'benchmark.pl -IT site'
|
description = u'benchmark.pl -IT site'
|
||||||
|
masthead_url = 'http://www.benchmark.pl/i/logo-footer.png'
|
||||||
cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif'
|
cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif'
|
||||||
category = 'IT'
|
category = 'IT'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
oldest_article = 8
|
oldest_article = 8
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets=True
|
no_stylesheets=True
|
||||||
preprocess_regexps = [(re.compile(ur'\bWięcej o .*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
|
preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;"> Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')]
|
||||||
keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})]
|
keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})]
|
||||||
remove_tags_after=dict(name='div', attrs={'class':'body'})
|
remove_tags_after=dict(name='div', attrs={'class':'body'})
|
||||||
remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']})]
|
remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
|
||||||
INDEX= 'http://www.benchmark.pl'
|
INDEX= 'http://www.benchmark.pl'
|
||||||
feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
|
feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
|
||||||
(u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')]
|
(u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')]
|
||||||
|
@ -10,10 +10,11 @@ class Biolog_pl(BasicNewsRecipe):
|
|||||||
description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.'
|
description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.'
|
||||||
category = 'biology'
|
category = 'biology'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
|
masthead_url= 'http://www.biolog.pl/naukowy,portal,biolog.png'
|
||||||
cover_url='http://www.biolog.pl/naukowy,portal,biolog.png'
|
cover_url='http://www.biolog.pl/naukowy,portal,biolog.png'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
#keeps_only_tags=[dict(id='main')]
|
#keeps_only_tags=[dict(id='main')]
|
||||||
remove_tags_before=dict(id='main')
|
remove_tags_before=dict(id='main')
|
||||||
remove_tags_after=dict(name='a', attrs={'name':'komentarze'})
|
remove_tags_after=dict(name='a', attrs={'name':'komentarze'})
|
||||||
remove_tags=[dict(name='img', attrs={'alt':'Komentarze'})]
|
remove_tags=[dict(name='img', attrs={'alt':'Komentarze'}), dict(name='span', attrs={'class':'menu_odsylacze'})]
|
||||||
feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')]
|
feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')]
|
||||||
|
@ -1,16 +1,20 @@
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
|
||||||
class CD_Action(BasicNewsRecipe):
|
class CD_Action(BasicNewsRecipe):
|
||||||
title = u'CD-Action'
|
title = u'CD-Action'
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
description = 'cdaction.pl - polish magazine about games site'
|
description = 'cdaction.pl - polish games magazine site'
|
||||||
category = 'games'
|
category = 'games'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
oldest_article = 8
|
oldest_article = 8
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets= True
|
no_stylesheets= True
|
||||||
cover_url =u'http://s.cdaction.pl/obrazki/logo-CD-Action_172k9.JPG'
|
|
||||||
keep_only_tags= dict(id='news_content')
|
keep_only_tags= dict(id='news_content')
|
||||||
remove_tags_after= dict(name='div', attrs={'class':'tresc'})
|
remove_tags_after= dict(name='div', attrs={'class':'tresc'})
|
||||||
feeds = [(u'Newsy', u'http://www.cdaction.pl/rss_newsy.xml')]
|
feeds = [(u'Newsy', u'http://www.cdaction.pl/rss_newsy.xml')]
|
||||||
|
|
||||||
|
|
||||||
|
def get_cover_url(self):
    """Look up the current cover on the magazine page and return its URL.

    Fetches http://www.cdaction.pl/magazyn/, takes the ``href`` of the
    first anchor inside the first ``div`` of the element with id
    ``wspolnik`` and stores it, prefixed with the site root, in
    ``self.cover_url``.

    Returns the value of ``self.cover_url`` afterwards. If the expected
    markup is missing, ``self.cover_url`` is left untouched and whatever
    it already holds (``None`` when unset) is returned instead of raising.
    """
    soup = self.index_to_soup('http://www.cdaction.pl/magazyn/')
    box = soup.find(id='wspolnik')
    # Walk box -> div -> a step by step; any missing link in the chain
    # yields None rather than an AttributeError/TypeError.
    inner = getattr(box, 'div', None)
    link = getattr(inner, 'a', None)
    href = link.get('href') if link is not None else None
    if href:
        self.cover_url = 'http://www.cdaction.pl' + href
    return getattr(self, 'cover_url', None)
|
@ -5,6 +5,7 @@ class CGM(BasicNewsRecipe):
|
|||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
description = u'Codzienna Gazeta Muzyczna'
|
description = u'Codzienna Gazeta Muzyczna'
|
||||||
|
masthead_url='http://www.cgm.pl/img/header/logo.gif'
|
||||||
cover_url = 'http://www.krafcy.com/foto/tinymce/Image/cgm%281%29.jpg'
|
cover_url = 'http://www.krafcy.com/foto/tinymce/Image/cgm%281%29.jpg'
|
||||||
category = 'music'
|
category = 'music'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
@ -23,21 +24,19 @@ class CGM(BasicNewsRecipe):
|
|||||||
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
gallery=soup.find('div', attrs={'class':'galleryFlash'})
|
||||||
|
if gallery:
|
||||||
|
img=gallery.div
|
||||||
|
gallery.img.extract()
|
||||||
|
if img:
|
||||||
|
img=img['style']
|
||||||
|
img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')]
|
||||||
|
gallery.contents[1].name='img'
|
||||||
|
gallery.contents[1]['src']=img
|
||||||
for item in soup.findAll(style=True):
|
for item in soup.findAll(style=True):
|
||||||
del item['style']
|
del item['style']
|
||||||
ad=soup.findAll('a')
|
ad=soup.findAll('a')
|
||||||
for r in ad:
|
for r in ad:
|
||||||
if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']:
|
if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']:
|
||||||
r.extract()
|
r.extract()
|
||||||
gallery=soup.find('div', attrs={'class':'galleryFlash'})
|
|
||||||
if gallery:
|
|
||||||
img=gallery.find('embed')
|
|
||||||
if img:
|
|
||||||
img=img['src'][35:]
|
|
||||||
img='http://www.cgm.pl/_vault/_gallery/_photo/'+img
|
|
||||||
param=gallery.findAll(name='param')
|
|
||||||
for i in param:
|
|
||||||
i.extract()
|
|
||||||
gallery.contents[1].name='img'
|
|
||||||
gallery.contents[1]['src']=img
|
|
||||||
return soup
|
return soup
|
@ -7,10 +7,11 @@ class Computerworld_pl(BasicNewsRecipe):
|
|||||||
description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
|
description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
|
||||||
category = 'IT'
|
category = 'IT'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
|
masthead_url= 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif'
|
||||||
no_stylesheets=True
|
no_stylesheets=True
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
keep_only_tags=[dict(name='div', attrs={'id':'s'})]
|
keep_only_tags=[dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})]
|
||||||
remove_tags_after=dict(name='div', attrs={'class':'rMobi'})
|
remove_tags_after=dict(name='div', attrs={'class':'rMobi'})
|
||||||
remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
|
remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
|
||||||
feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]
|
feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]
|
||||||
|
@ -7,6 +7,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
|
|||||||
__licence__ ='GPL v3'
|
__licence__ ='GPL v3'
|
||||||
category = 'IT'
|
category = 'IT'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
|
masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png'
|
||||||
cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
|
cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
|
||||||
description = u'Aktualności i blogi z dobreprogramy.pl'
|
description = u'Aktualności i blogi z dobreprogramy.pl'
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
@ -16,7 +17,8 @@ class Dobreprogramy_pl(BasicNewsRecipe):
|
|||||||
oldest_article = 8
|
oldest_article = 8
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
preprocess_regexps = [(re.compile(ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ]
|
preprocess_regexps = [(re.compile(ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ]
|
||||||
remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
|
keep_only_tags=[dict(attrs={'class':['news', 'entry single']})]
|
||||||
keep_only_tags = [dict(name='div', attrs={'class':['mainBar', 'newsContent', 'postTitle title', 'postInfo', 'contentText', 'content']})]
|
remove_tags = [dict(name='div', attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']})]
|
||||||
|
#remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
|
||||||
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
|
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
|
||||||
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]
|
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]
|
||||||
|
@ -8,15 +8,17 @@ class Dziennik_pl(BasicNewsRecipe):
|
|||||||
description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.'
|
description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.'
|
||||||
category = 'newspaper'
|
category = 'newspaper'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg'
|
masthead_url= 'http://5.s.dziennik.pl/images/logos.png'
|
||||||
|
cover_url= 'http://5.s.dziennik.pl/images/logos.png'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
remove_javascript=True
|
remove_javascript=True
|
||||||
remove_empty_feeds=True
|
remove_empty_feeds=True
|
||||||
preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')]
|
extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
|
||||||
|
preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('<p><strong><a href=".*?">>>> CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')]
|
||||||
keep_only_tags=[dict(id='article')]
|
keep_only_tags=[dict(id='article')]
|
||||||
remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})]
|
remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})]
|
||||||
feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
|
feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
|
||||||
(u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
|
(u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
|
||||||
(u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
|
(u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
|
||||||
@ -30,6 +32,12 @@ class Dziennik_pl(BasicNewsRecipe):
|
|||||||
(u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
|
(u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
|
||||||
(u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]
|
(u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]
|
||||||
|
|
||||||
|
def skip_ad_pages(self, soup):
    """Follow the "CZYTAJ DALEJ" ("read more") link of an interstitial page.

    soup -- parsed page that was downloaded for the article.

    Returns the raw content fetched from the link target when the page
    contains an anchor titled ``CZYTAJ DALEJ`` that carries an ``href``;
    returns ``None`` otherwise.
    """
    link = soup.find(name='a', attrs={'title': 'CZYTAJ DALEJ'})
    if link is None:
        return None
    target = link.get('href')
    if not target:
        # Anchor without a destination: nothing to follow.
        return None
    return self.index_to_soup(target, raw=True)
|
||||||
|
|
||||||
def append_page(self, soup, appendtag):
|
def append_page(self, soup, appendtag):
|
||||||
tag=soup.find('a', attrs={'class':'page_next'})
|
tag=soup.find('a', attrs={'class':'page_next'})
|
||||||
if tag:
|
if tag:
|
||||||
@ -56,3 +64,4 @@ class Dziennik_pl(BasicNewsRecipe):
|
|||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
self.append_page(soup, soup.body)
|
self.append_page(soup, soup.body)
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
@ -10,7 +10,8 @@ class Filmweb_pl(BasicNewsRecipe):
|
|||||||
oldest_article = 8
|
oldest_article = 8
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets= True
|
no_stylesheets= True
|
||||||
extra_css = '.hdrBig {font-size:22px;}'
|
remove_empty_feeds=True
|
||||||
|
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
|
||||||
remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})]
|
remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})]
|
||||||
keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})]
|
keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})]
|
||||||
feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'),
|
feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'),
|
||||||
|
@ -4,10 +4,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
|||||||
class Gazeta_Wyborcza(BasicNewsRecipe):
|
class Gazeta_Wyborcza(BasicNewsRecipe):
|
||||||
title = u'Gazeta Wyborcza'
|
title = u'Gazeta Wyborcza'
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
cover_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
|
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
description ='news from gazeta.pl'
|
description ='news from gazeta.pl'
|
||||||
category='newspaper'
|
category='newspaper'
|
||||||
|
publication_type = 'newspaper'
|
||||||
|
masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
|
||||||
INDEX='http://wyborcza.pl'
|
INDEX='http://wyborcza.pl'
|
||||||
remove_empty_feeds= True
|
remove_empty_feeds= True
|
||||||
oldest_article = 3
|
oldest_article = 3
|
||||||
@ -81,3 +82,10 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
|
|||||||
return url
|
return url
|
||||||
else:
|
else:
|
||||||
return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020')
|
return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020')
|
||||||
|
|
||||||
|
def get_cover_url(self):
    """Scrape the cover image URL in two steps and return it.

    Step one fetches http://wyborcza.pl/0,76762,3751429.html and reads the
    ``href`` of the anchor inside the fourth child of the element with id
    ``GWmini2``. Step two fetches that page and uses the ``src`` of its
    first ``img`` as the cover, stored in ``self.cover_url``.

    Returns the value of ``self.cover_url`` afterwards. If either page
    lacks the expected markup, ``self.cover_url`` is left untouched and
    whatever it already holds (``None`` when unset) is returned instead
    of raising.
    """
    index_page = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html')
    box = index_page.find(id='GWmini2')
    try:
        issue_href = box.contents[3].a['href']
    except (AttributeError, IndexError, KeyError, TypeError):
        # Box missing, too few children, no anchor, or anchor without href.
        return getattr(self, 'cover_url', None)
    issue_page = self.index_to_soup('http://wyborcza.pl/' + issue_href)
    image = getattr(issue_page, 'img', None)
    src = image.get('src') if image is not None else None
    if src:
        self.cover_url = 'http://wyborcza.pl' + src
    return getattr(self, 'cover_url', None)
|
||||||
|
@ -8,29 +8,31 @@ class Gry_online_pl(BasicNewsRecipe):
|
|||||||
language = 'pl'
|
language = 'pl'
|
||||||
oldest_article = 13
|
oldest_article = 13
|
||||||
INDEX= 'http://www.gry-online.pl/'
|
INDEX= 'http://www.gry-online.pl/'
|
||||||
cover_url='http://www.gry-online.pl/img/1st_10/1st-gol-logo.png'
|
masthead_url='http://www.gry-online.pl/im/gry-online-logo.png'
|
||||||
|
cover_url='http://www.gry-online.pl/im/gry-online-logo.png'
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets= True
|
no_stylesheets= True
|
||||||
extra_css = 'p.wn1{font-size:22px;}'
|
keep_only_tags=[dict(name='div', attrs={'class':'gc660'})]
|
||||||
remove_tags_after= [dict(name='div', attrs={'class':['tresc-newsa']})]
|
remove_tags=[dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})]
|
||||||
keep_only_tags = [dict(name='div', attrs={'class':['txthead']}), dict(name='p', attrs={'class':['wtx1', 'wn1', 'wob']}), dict(name='a', attrs={'class':['num_str_nex']})]
|
|
||||||
#remove_tags= [dict(name='div', attrs={'class':['news_plat']})]
|
|
||||||
feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')]
|
feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')]
|
||||||
|
|
||||||
|
|
||||||
def append_page(self, soup, appendtag):
|
def append_page(self, soup, appendtag):
|
||||||
nexturl = soup.find('a', attrs={'class':'num_str_nex'})
|
tag = appendtag.find('div', attrs={'class':'n5p'})
|
||||||
if appendtag.find('a', attrs={'class':'num_str_nex'}) is not None:
|
if tag:
|
||||||
appendtag.find('a', attrs={'class':'num_str_nex'}).replaceWith('\n')
|
nexturls=tag.findAll('a')
|
||||||
if nexturl is not None:
|
for nexturl in nexturls[1:]:
|
||||||
if 'strona' in nexturl.div.string:
|
try:
|
||||||
nexturl= self.INDEX + nexturl['href']
|
soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href'])
|
||||||
soup2 = self.index_to_soup(nexturl)
|
except:
|
||||||
pagetext = soup2.findAll(name='p', attrs={'class':['wtx1', 'wn1', 'wob']})
|
soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href'])
|
||||||
for tag in pagetext:
|
pagetext = soup2.find(attrs={'class':'gc660'})
|
||||||
pos = len(appendtag.contents)
|
for r in pagetext.findAll(name='header'):
|
||||||
appendtag.insert(pos, tag)
|
r.extract()
|
||||||
self.append_page(soup2, appendtag)
|
pos = len(appendtag.contents)
|
||||||
|
appendtag.insert(pos, pagetext)
|
||||||
|
for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}):
|
||||||
|
r.extract()
|
||||||
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class AdvancedUserRecipe1327062445(BasicNewsRecipe):
|
class AdvancedUserRecipe1327062445(BasicNewsRecipe):
|
||||||
@ -7,10 +8,13 @@ class AdvancedUserRecipe1327062445(BasicNewsRecipe):
|
|||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
auto_cleanup = True
|
auto_cleanup = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg'
|
no_stylesheets = True
|
||||||
feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]
|
|
||||||
__author__ = 'faber1971'
|
__author__ = 'faber1971'
|
||||||
description = 'Collection of Italian marketing websites - v1.00 (28, January 2012)'
|
description = 'Collection of Italian marketing websites - v1.01 (19, February 2012)'
|
||||||
language = 'it'
|
language = 'it'
|
||||||
|
|
||||||
|
remove_tags = [
|
||||||
|
dict(name='ul', attrs={'id':'ads0'})
|
||||||
|
]
|
||||||
|
masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg'
|
||||||
|
feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
import re
|
||||||
class naczytniki(BasicNewsRecipe):
|
class naczytniki(BasicNewsRecipe):
|
||||||
title = u'naczytniki.pl'
|
title = u'naczytniki.pl'
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
|
masthead_url= 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png'
|
||||||
cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png'
|
cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
description ='everything about e-readers'
|
description ='everything about e-readers'
|
||||||
@ -10,6 +11,7 @@ class naczytniki(BasicNewsRecipe):
|
|||||||
no_stylesheets=True
|
no_stylesheets=True
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
|
preprocess_regexps = [(re.compile(ur'<p><br><b>Zobacz także:</b></p>.*?</body>', re.DOTALL), lambda match: '</body>') ]
|
||||||
remove_tags_after= dict(name='div', attrs={'class':'sociable'})
|
remove_tags_after= dict(name='div', attrs={'class':'sociable'})
|
||||||
keep_only_tags=[dict(name='div', attrs={'class':'post'})]
|
keep_only_tags=[dict(name='div', attrs={'class':'post'})]
|
||||||
remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})]
|
remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})]
|
||||||
|
@ -1,21 +1,33 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
import re
|
||||||
|
|
||||||
class Nowa_Fantastyka(BasicNewsRecipe):
|
class Nowa_Fantastyka(BasicNewsRecipe):
|
||||||
title = u'Nowa Fantastyka'
|
title = u'Nowa Fantastyka'
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
|
__modified_by__ = 'zaslav'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
encoding='latin2'
|
encoding='latin2'
|
||||||
description ='site for fantasy readers'
|
description ='site for fantasy readers'
|
||||||
category='fantasy'
|
category='fantasy'
|
||||||
|
masthead_url='http://farm5.static.flickr.com/4133/4956658792_7ba7fbf562.jpg'
|
||||||
|
#extra_css='.tytul {font-size: 20px;}' #not working
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
INDEX='http://www.fantastyka.pl/'
|
INDEX='http://www.fantastyka.pl/'
|
||||||
no_stylesheets=True
|
no_stylesheets=True
|
||||||
needs_subscription = 'optional'
|
needs_subscription = 'optional'
|
||||||
remove_tags_before=dict(attrs={'class':'belka1-tlo-md'})
|
remove_tags_before=dict(attrs={'class':'naglowek2'})
|
||||||
#remove_tags_after=dict(name='span', attrs={'class':'naglowek-oceny'})
|
#remove_tags_after=dict(name='span', attrs={'class':'naglowek-oceny'})
|
||||||
remove_tags_after=dict(name='td', attrs={'class':'belka1-bot'})
|
remove_tags_after=dict(name='form', attrs={'name':'form1'})
|
||||||
remove_tags=[dict(attrs={'class':'avatar2'}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'})]
|
remove_tags=[dict(attrs={'class':['avatar2', 'belka-margin', 'naglowek2']}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'}), dict(name='form')]
|
||||||
|
preprocess_regexps = [
|
||||||
|
(re.compile(r'\<table .*?\>'), lambda match: ''),
|
||||||
|
(re.compile(r'\<td.*?\>'), lambda match: ''),
|
||||||
|
(re.compile(r'\<center\>'), lambda match: '')]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def find_articles(self, url):
|
def find_articles(self, url):
|
||||||
articles = []
|
articles = []
|
||||||
@ -41,10 +53,10 @@ class Nowa_Fantastyka(BasicNewsRecipe):
|
|||||||
|
|
||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
soup = self.index_to_soup('http://www.fantastyka.pl/1.html')
|
soup = self.index_to_soup('http://www.e-kiosk.pl/nowa_fantastyka')
|
||||||
cover=soup.find(name='img', attrs={'class':'okladka'})
|
self.cover_url='http://www.e-kiosk.pl' + soup.find(name='a', attrs={'class':'img'})['href']
|
||||||
self.cover_url=self.INDEX+ cover['src']
|
|
||||||
return getattr(self, 'cover_url', self.cover_url)
|
return getattr(self, 'cover_url', self.cover_url)
|
||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
@ -56,3 +68,18 @@ class Nowa_Fantastyka(BasicNewsRecipe):
|
|||||||
br['pass'] = self.password
|
br['pass'] = self.password
|
||||||
br.submit()
|
br.submit()
|
||||||
return br
|
return br
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
    """Strip presentational markup from a downloaded article, in place.

    soup -- parsed article page; it is modified and returned.

    Removes every ``style``, ``font`` and ``align`` attribute, renames all
    ``tr`` elements to ``div``, and then gives the first element with class
    ``tytul`` a fixed bold 20px inline style.
    """
    for attribute in ('style', 'font', 'align'):
        for tag in soup.findAll(**{attribute: True}):
            del tag[attribute]
    for row in soup.findAll(name='tr'):
        row.name = 'div'
    # Styled after the stripping pass above so the inline style survives.
    title = soup.find(attrs={'class': 'tytul'})
    if title is not None:
        title['style'] = 'font-size: 20px; font-weight: bold;'
    return soup
|
||||||
|
@ -1,14 +1,16 @@
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
import re
|
||||||
class Tablety_pl(BasicNewsRecipe):
|
class Tablety_pl(BasicNewsRecipe):
|
||||||
title = u'Tablety.pl'
|
title = u'Tablety.pl'
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
description = u'tablety.pl - latest tablet news'
|
description = u'tablety.pl - latest tablet news'
|
||||||
|
masthead_url= 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
|
||||||
cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
|
cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
|
||||||
category = 'IT'
|
category = 'IT'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
oldest_article = 8
|
oldest_article = 8
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
|
preprocess_regexps = [(re.compile(ur'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')]
|
||||||
remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
|
remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
|
||||||
remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'})
|
remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'})
|
||||||
remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})]
|
remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})]
|
||||||
|
@ -4,10 +4,12 @@ class Ubuntu_pl(BasicNewsRecipe):
|
|||||||
title = u'UBUNTU.pl'
|
title = u'UBUNTU.pl'
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
description = 'UBUNTU.pl - polish ubuntu community site'
|
description = 'UBUNTU.pl - polish ubuntu community site'
|
||||||
|
masthead_url= 'http://ubuntu.pl/img/logo.jpg'
|
||||||
cover_url = 'http://ubuntu.pl/img/logo.jpg'
|
cover_url = 'http://ubuntu.pl/img/logo.jpg'
|
||||||
category = 'linux, IT'
|
category = 'linux, IT'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
remove_empty_feeds = True
|
||||||
oldest_article = 8
|
oldest_article = 8
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
extra_css = '#main {text-align:left;}'
|
extra_css = '#main {text-align:left;}'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user