new newspapers by fenuks
34
recipes/dziennik_baltycki.recipe
Normal file
@ -0,0 +1,34 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class DziennikBaltycki(BasicNewsRecipe):
    """Calibre news recipe for the Dziennik Baltycki regional newspaper.

    Articles come from the RSS feeds listed in ``feeds`` and are downloaded
    from their ``drukuj`` (print) URLs; the cover is scraped from prasa24.pl.
    """

    title = u'Dziennik Ba\u0142tycki'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Dziennik Bałtycki. Najnowsze Wiadomości Trójmiasto i Wiadomości Pomorskie. Czytaj!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/dziennikbaltycki.png?24'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    # Disabled clean-up patterns, kept for reference. NOTE: this file does not
    # import `re`, so the import must be added before re-enabling them.
    #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
    # The dz.png element is used as the end-of-article marker and is itself
    # listed in remove_tags below.
    remove_tags_after = dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'})
    remove_tags = [
        dict(id='mat-podobne'),
        dict(name='a', attrs={'class': 'czytajDalej'}),
        dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'}),
    ]

    feeds = [
        (u'Wiadomo\u015bci', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_wiadomosci.xml?201302'),
        (u'Sport', u'http://dziennikbaltycki.feedsportal.com/c/32980/f/533756/index.rss?201302'),
        (u'Rejsy', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_rejsy.xml?201302'),
        (u'Biznes na Pomorzu', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_biznesnapomorzu.xml?201302'),
        (u'GOM', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_gom.xml?201302'),
        (u'Opinie', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_opinie.xml?201302'),
        (u'Pitawal Pomorski', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_pitawalpomorski.xml?201302'),
    ]

    def print_version(self, url):
        """Return `url` with 'artykul' replaced by 'drukuj'."""
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        """Return the raw page behind an 'Advertisement' page, else None.

        When the page title contains 'Advertisement', the first link with an
        ``href`` on the page is fetched and its raw content returned.
        """
        title_tag = soup.find('title')
        if title_tag is None:
            return None
        # `x in tag` tests membership among the tag's child nodes rather than
        # doing a substring search, so look at the title's text instead.
        title_text = u''.join(title_tag.findAll(text=True))
        if 'Advertisement' not in title_text:
            return None
        link = soup.find('a', href=True)
        if link is None:
            return None
        return self.index_to_soup(link['href'], raw=True)

    def get_cover_url(self):
        """Return the cover image URL scraped from prasa24.pl.

        ``self.cover_url`` is only overwritten when the expected markup is
        found; otherwise the previously set value (if any) is returned.
        """
        soup = self.index_to_soup('http://www.prasa24.pl/gazeta/dziennik-baltycki/')
        container = soup.find(id='pojemnik')
        image = container.find('img', src=True) if container is not None else None
        if image is not None:
            self.cover_url = image['src']
        return getattr(self, 'cover_url', None)
|
35
recipes/dziennik_lodzki.recipe
Normal file
@ -0,0 +1,35 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class DziennikLodzki(BasicNewsRecipe):
    """Calibre news recipe for the Dziennik Lodzki regional newspaper.

    Articles come from the RSS feeds listed in ``feeds`` and are downloaded
    from their ``drukuj`` (print) URLs; the cover is scraped from prasa24.pl.
    """

    title = u'Dziennik \u0141\xf3dzki'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Dziennik Łódzki. Najnowsze Wiadomości Łódź. Czytaj Wiadomości Łódzkie!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/dzienniklodzki.png?24'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    # Disabled clean-up patterns, kept for reference. NOTE: this file does not
    # import `re`, so the import must be added before re-enabling them.
    #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
    # The dz.png element is used as the end-of-article marker and is itself
    # listed in remove_tags below.
    remove_tags_after = dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'})
    remove_tags = [
        dict(id='mat-podobne'),
        dict(name='a', attrs={'class': 'czytajDalej'}),
        dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'}),
    ]

    feeds = [
        (u'Na sygnale', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_nasygnale.xml?201302'),
        (u'\u0141\xf3d\u017a', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_lodz.xml?201302'),
        (u'Opinie', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_opinie.xml?201302'),
        (u'Pieni\u0105dze', u'http://dzienniklodzki.feedsportal.com/c/32980/f/533763/index.rss?201302'),
        (u'Kultura', u'http://dzienniklodzki.feedsportal.com/c/32980/f/533762/index.rss?201302'),
        (u'Sport', u'http://dzienniklodzki.feedsportal.com/c/32980/f/533761/index.rss?201302'),
        (u'Akcje', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_akcje.xml?201302'),
        (u'M\xf3j Reporter', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_mojreporter.xml?201302'),
        (u'Studni\xf3wki', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_studniowki.xml?201302'),
        (u'Kraj', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_kraj.xml?201302'),
        (u'Zdrowie', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_zdrowie.xml?201302'),
    ]

    def print_version(self, url):
        """Return `url` with 'artykul' replaced by 'drukuj'."""
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        """Return the raw page behind an 'Advertisement' page, else None.

        When the page title contains 'Advertisement', the first link with an
        ``href`` on the page is fetched and its raw content returned.
        """
        title_tag = soup.find('title')
        if title_tag is None:
            return None
        # `x in tag` tests membership among the tag's child nodes rather than
        # doing a substring search, so look at the title's text instead.
        title_text = u''.join(title_tag.findAll(text=True))
        if 'Advertisement' not in title_text:
            return None
        link = soup.find('a', href=True)
        if link is None:
            return None
        return self.index_to_soup(link['href'], raw=True)

    def get_cover_url(self):
        """Return the cover image URL scraped from prasa24.pl.

        ``self.cover_url`` is only overwritten when the expected markup is
        found; otherwise the previously set value (if any) is returned.
        """
        soup = self.index_to_soup('http://www.prasa24.pl/gazeta/dziennik-lodzki/')
        container = soup.find(id='pojemnik')
        image = container.find('img', src=True) if container is not None else None
        if image is not None:
            self.cover_url = image['src']
        return getattr(self, 'cover_url', None)
|
78
recipes/dziennik_wschodni.recipe
Normal file
@ -0,0 +1,78 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class DziennikWschodni(BasicNewsRecipe):
    """Calibre news recipe for the Dziennik Wschodni regional portal.

    Articles come from the RSS feeds listed in ``feeds``; multi-page photo
    stories are merged into a single article by ``append_page``.
    """

    title = u'Dziennik Wschodni'
    __author__ = 'fenuks'
    description = u'Dziennik Wschodni - portal regionalny województwa lubelskiego.'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    extra_css = 'ul {list-style: none; padding:0; margin:0;}'
    INDEX = 'http://www.dziennikwschodni.pl'
    masthead_url = INDEX + '/images/top_logo.png'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    ignore_duplicate_articles = {'title', 'url'}

    # Strip inline "read also" teasers up to the end of their link. The
    # patterns contain no backslashes, so a plain u'' literal is equivalent
    # to ur'' and also parses on Python 3.
    preprocess_regexps = [
        (re.compile(u'Czytaj:.*?</a>', re.DOTALL), lambda match: ''),
        (re.compile(u'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(u'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(u'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
    ]

    keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
    remove_tags = [
        dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
                 'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
                 'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
        dict(attrs={'class': 'articleFunctions'}),
    ]

    feeds = [
        (u'Wszystkie', u'http://www.dziennikwschodni.pl/rss.xml'),
        (u'Lublin', u'http://www.dziennikwschodni.pl/lublin.xml'),
        (u'Zamość', u'http://www.dziennikwschodni.pl/zamosc.xml'),
        (u'Biała Podlaska', u'http://www.dziennikwschodni.pl/biala_podlaska.xml'),
        (u'Chełm', u'http://www.dziennikwschodni.pl/chelm.xml'),
        (u'Kraśnik', u'http://www.dziennikwschodni.pl/krasnik.xml'),
        (u'Puławy', u'http://www.dziennikwschodni.pl/pulawy.xml'),
        (u'Świdnik', u'http://www.dziennikwschodni.pl/swidnik.xml'),
        (u'Łęczna', u'http://www.dziennikwschodni.pl/leczna.xml'),
        (u'Lubartów', u'http://www.dziennikwschodni.pl/lubartow.xml'),
        (u'Sport', u'http://www.dziennikwschodni.pl/sport.xml'),
        (u'Praca', u'http://www.dziennikwschodni.pl/praca.xml'),
        (u'Dom', u'http://www.dziennikwschodni.pl/dom.xml'),
        (u'Moto', u'http://www.dziennikwschodni.pl/moto.xml'),
        (u'Zdrowie', u'http://www.dziennikwschodni.pl/zdrowie.xml'),
    ]

    def get_cover_url(self):
        """Return the URL of the cover image from the site's JEDYNKI section.

        The first link inside ``#covers`` is followed and the image inside
        ``#cover`` on that page is used. ``self.cover_url`` is only
        overwritten when the expected markup is found; otherwise the
        previously set value (if any) is returned.
        """
        soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
        covers = soup.find(id='covers')
        link = covers.find('a', href=True) if covers is not None else None
        if link is not None:
            soup = self.index_to_soup(self.INDEX + link['href'])
            cover = soup.find(id='cover')
            image = cover.find(name='img', src=True) if cover is not None else None
            if image is not None:
                self.cover_url = self.INDEX + image['src']
        return getattr(self, 'cover_url', None)

    def append_page(self, soup, appendtag):
        """Append the remaining pages of a photo story to `appendtag`.

        Does nothing unless `soup` has a ``photoNavigationPages`` counter
        with a parsable page count and a ``photoNavigationNext`` link.
        """
        counter = soup.find('span', attrs={'class': 'photoNavigationPages'})
        if counter is None:
            return
        # Join all text nodes: `.string` is None when the span has more than
        # one child. The page count is the last number after the final '/'.
        label = u''.join(counter.findAll(text=True))
        total = re.search(r'(\d+)\D*$', label.rpartition('/')[-1])
        next_link = soup.find(attrs={'class': 'photoNavigationNext'})
        href = next_link.get('href') if next_link is not None else None
        if total is None or not href:
            return
        number = int(total.group(1))
        # presumably the "next" href ends with a one-digit page number, so
        # dropping the last character leaves the common prefix -- TODO confirm
        baseurl = self.INDEX + href[:-1]

        for nav in appendtag.findAll(attrs={'class': 'photoNavigation'}):
            nav.extract()

        # Parts copied from every following page, in this order.
        wanted = (
            dict(id='photoContainer'),
            dict(attrs={'class': 'photoMeta'}),
            dict(attrs={'class': 'photoStoryText'}),
        )
        for nr in range(2, number + 1):
            page = self.index_to_soup(baseurl + str(nr))
            for selector in wanted:
                fragment = page.find(**selector)
                if fragment is not None:
                    appendtag.insert(len(appendtag.contents), fragment)

    def preprocess_html(self, soup):
        """Merge multi-page photo stories into `soup` and return it."""
        self.append_page(soup, soup.body)
        return soup
|
34
recipes/dziennik_zachodni.recipe
Normal file
@ -0,0 +1,34 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class DziennikZachodni(BasicNewsRecipe):
    """Calibre news recipe for the Dziennik Zachodni regional newspaper.

    Articles come from the RSS feeds listed in ``feeds`` and are downloaded
    from their ``drukuj`` (print) URLs; the cover is scraped from prasa24.pl.
    """

    title = u'Dziennik Zachodni'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Dziennik Zachodni. Najnowsze Wiadomości Śląskie. Wiadomości Śląsk. Czytaj!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/dziennikzachodni.png?24'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    # Disabled clean-up patterns, kept for reference. NOTE: this file does not
    # import `re`, so the import must be added before re-enabling them.
    #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
    # The dz.png element is used as the end-of-article marker and is itself
    # listed in remove_tags below.
    remove_tags_after = dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'})
    remove_tags = [
        dict(id='mat-podobne'),
        dict(name='a', attrs={'class': 'czytajDalej'}),
        dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'}),
        dict(attrs={'href': 'http://www.dziennikzachodni.pl/piano'}),
    ]

    feeds = [
        (u'Wszystkie', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533764/index.rss?201302'),
        (u'Wiadomo\u015bci', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533765/index.rss?201302'),
        (u'Regiony', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_regiony.xml?201302'),
        # NOTE(review): URL follows the <site>_opinie.xml naming used by the
        # other feeds of this publisher -- confirm it resolves.
        (u'Opinie', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_opinie.xml?201302'),
        (u'Blogi', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_blogi.xml?201302'),
        (u'Serwisy', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_serwisy.xml?201302'),
        (u'Sport', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533766/index.rss?201302'),
        (u'M\xf3j Reporter', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_mojreporter.xml?201302'),
        (u'Na narty', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_nanarty.xml?201302'),
        (u'Drogi', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_drogi.xml?201302'),
        (u'Pieni\u0105dze', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533768/index.rss?201302'),
    ]

    def print_version(self, url):
        """Return `url` with 'artykul' replaced by 'drukuj'."""
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        """Return the raw page behind an 'Advertisement' page, else None.

        When the page title contains 'Advertisement', the first link with an
        ``href`` on the page is fetched and its raw content returned.
        """
        title_tag = soup.find('title')
        if title_tag is None:
            return None
        # `x in tag` tests membership among the tag's child nodes rather than
        # doing a substring search, so look at the title's text instead.
        title_text = u''.join(title_tag.findAll(text=True))
        if 'Advertisement' not in title_text:
            return None
        link = soup.find('a', href=True)
        if link is None:
            return None
        return self.index_to_soup(link['href'], raw=True)

    def get_cover_url(self):
        """Return the cover image URL scraped from prasa24.pl.

        ``self.cover_url`` is only overwritten when the expected markup is
        found; otherwise the previously set value (if any) is returned.
        """
        soup = self.index_to_soup('http://www.prasa24.pl/gazeta/dziennik-zachodni/')
        container = soup.find(id='pojemnik')
        image = container.find('img', src=True) if container is not None else None
        if image is not None:
            self.cover_url = image['src']
        return getattr(self, 'cover_url', None)
|
74
recipes/echo_dnia.recipe
Normal file
@ -0,0 +1,74 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class EchoDnia(BasicNewsRecipe):
    """Calibre news recipe for the Echo Dnia regional portal.

    Articles come from the RSS feeds listed in ``feeds``; multi-page photo
    stories are merged into a single article by ``append_page``.
    """

    title = u'Echo Dnia'
    __author__ = 'fenuks'
    description = u'Echo Dnia - portal regionalny świętokrzyskiego radomskiego i podkarpackiego. Najnowsze wiadomości z Twojego regionu, galerie, video, mp3.'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    extra_css = 'ul {list-style: none; padding:0; margin:0;}'
    INDEX = 'http://www.echodnia.eu'
    masthead_url = INDEX + '/images/top_logo.png'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    ignore_duplicate_articles = {'title', 'url'}

    # Strip inline "read also" teasers up to the end of their link. The
    # patterns contain no backslashes, so a plain u'' literal is equivalent
    # to ur'' and also parses on Python 3.
    preprocess_regexps = [
        (re.compile(u'Czytaj:.*?</a>', re.DOTALL), lambda match: ''),
        (re.compile(u'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(u'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(u'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
    ]

    keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
    remove_tags = [
        dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
                 'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
                 'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
        dict(attrs={'class': 'articleFunctions'}),
    ]

    feeds = [
        (u'Wszystkie', u'http://www.echodnia.eu/rss.xml'),
        (u'Świętokrzyskie', u'http://www.echodnia.eu/swietokrzyskie.xml'),
        (u'Radomskie', u'http://www.echodnia.eu/radomskie.xml'),
        (u'Podkarpackie', u'http://www.echodnia.eu/podkarpackie.xml'),
        (u'Sport \u015bwi\u0119tokrzyski', u'http://www.echodnia.eu/sport_swi.xml'),
        (u'Sport radomski', u'http://www.echodnia.eu/sport_rad.xml'),
        (u'Sport podkarpacki', u'http://www.echodnia.eu/sport_pod.xml'),
        (u'Pi\u0142ka no\u017cna', u'http://www.echodnia.eu/pilka.xml'),
        (u'Praca', u'http://www.echodnia.eu/praca.xml'),
        (u'Dom', u'http://www.echodnia.eu/dom.xml'),
        (u'Auto', u'http://www.echodnia.eu/auto.xml'),
        (u'Zdrowie', u'http://www.echodnia.eu/zdrowie.xml'),
    ]

    def get_cover_url(self):
        """Return the URL of the cover image from the site's JEDYNKI section.

        The first link inside ``#covers`` is followed and the image inside
        ``#cover`` on that page is used. ``self.cover_url`` is only
        overwritten when the expected markup is found; otherwise the
        previously set value (if any) is returned.
        """
        soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
        covers = soup.find(id='covers')
        link = covers.find('a', href=True) if covers is not None else None
        if link is not None:
            soup = self.index_to_soup(self.INDEX + link['href'])
            cover = soup.find(id='cover')
            image = cover.find(name='img', src=True) if cover is not None else None
            if image is not None:
                self.cover_url = self.INDEX + image['src']
        return getattr(self, 'cover_url', None)

    def append_page(self, soup, appendtag):
        """Append the remaining pages of a photo story to `appendtag`.

        Does nothing unless `soup` has a ``photoNavigationPages`` counter
        with a parsable page count and a ``photoNavigationNext`` link.
        """
        counter = soup.find('span', attrs={'class': 'photoNavigationPages'})
        if counter is None:
            return
        # Join all text nodes: `.string` is None when the span has more than
        # one child. The page count is the last number after the final '/'.
        label = u''.join(counter.findAll(text=True))
        total = re.search(r'(\d+)\D*$', label.rpartition('/')[-1])
        next_link = soup.find(attrs={'class': 'photoNavigationNext'})
        href = next_link.get('href') if next_link is not None else None
        if total is None or not href:
            return
        number = int(total.group(1))
        # presumably the "next" href ends with a one-digit page number, so
        # dropping the last character leaves the common prefix -- TODO confirm
        baseurl = self.INDEX + href[:-1]

        for nav in appendtag.findAll(attrs={'class': 'photoNavigation'}):
            nav.extract()

        # Parts copied from every following page, in this order.
        wanted = (
            dict(id='photoContainer'),
            dict(attrs={'class': 'photoMeta'}),
            dict(attrs={'class': 'photoStoryText'}),
        )
        for nr in range(2, number + 1):
            page = self.index_to_soup(baseurl + str(nr))
            for selector in wanted:
                fragment = page.find(**selector)
                if fragment is not None:
                    appendtag.insert(len(appendtag.contents), fragment)

    def preprocess_html(self, soup):
        """Merge multi-page photo stories into `soup` and return it."""
        self.append_page(soup, soup.body)
        return soup
|
34
recipes/gazeta_krakowska.recipe
Normal file
@ -0,0 +1,34 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GazetaKrakowska(BasicNewsRecipe):
    """Calibre news recipe for the Gazeta Krakowska regional newspaper.

    Articles come from the RSS feeds listed in ``feeds`` and are downloaded
    from their ``drukuj`` (print) URLs; the cover is scraped from prasa24.pl.
    """

    title = u'Gazeta Krakowska'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Gazeta Krakowska. Najnowsze Wiadomości Kraków. Informacje Kraków. Czytaj!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/gazetakrakowska.png?24'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    # Disabled clean-up patterns, kept for reference. NOTE: this file does not
    # import `re`, so the import must be added before re-enabling them.
    #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
    # The dz.png element is used as the end-of-article marker and is itself
    # listed in remove_tags below.
    remove_tags_after = dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'})
    remove_tags = [
        dict(id='mat-podobne'),
        dict(name='a', attrs={'class': 'czytajDalej'}),
        dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'}),
    ]

    feeds = [
        (u'Fakty24', u'http://gazetakrakowska.feedsportal.com/c/32980/f/533770/index.rss?201302'),
        (u'Krak\xf3w', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_krakow.xml?201302'),
        (u'Tarn\xf3w', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_tarnow.xml?201302'),
        (u'Nowy S\u0105cz', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_nsacz.xml?201302'),
        (u'Ma\u0142. Zach.', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_malzach.xml?201302'),
        (u'Podhale', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_podhale.xml?201302'),
        (u'Sport', u'http://gazetakrakowska.feedsportal.com/c/32980/f/533771/index.rss?201302'),
        (u'Kultura', u'http://gazetakrakowska.feedsportal.com/c/32980/f/533772/index.rss?201302'),
        (u'Opinie', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_opinie.xml?201302'),
        (u'Magazyn', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_magazyn.xml?201302'),
    ]

    def print_version(self, url):
        """Return `url` with 'artykul' replaced by 'drukuj'."""
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        """Return the raw page behind an 'Advertisement' page, else None.

        When the page title contains 'Advertisement', the first link with an
        ``href`` on the page is fetched and its raw content returned.
        """
        title_tag = soup.find('title')
        if title_tag is None:
            return None
        # `x in tag` tests membership among the tag's child nodes rather than
        # doing a substring search, so look at the title's text instead.
        title_text = u''.join(title_tag.findAll(text=True))
        if 'Advertisement' not in title_text:
            return None
        link = soup.find('a', href=True)
        if link is None:
            return None
        return self.index_to_soup(link['href'], raw=True)

    def get_cover_url(self):
        """Return the cover image URL scraped from prasa24.pl.

        ``self.cover_url`` is only overwritten when the expected markup is
        found; otherwise the previously set value (if any) is returned.
        """
        soup = self.index_to_soup('http://www.prasa24.pl/gazeta/gazeta-krakowska/')
        container = soup.find(id='pojemnik')
        image = container.find('img', src=True) if container is not None else None
        if image is not None:
            self.cover_url = image['src']
        return getattr(self, 'cover_url', None)
|
64
recipes/gazeta_lubuska.recipe
Normal file
@ -0,0 +1,64 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GazetaLubuska(BasicNewsRecipe):
    """Calibre news recipe for the Gazeta Lubuska regional portal.

    Articles come from the RSS feeds listed in ``feeds``; multi-page photo
    stories are merged into a single article by ``append_page``.
    """

    title = u'Gazeta Lubuska'
    __author__ = 'fenuks'
    description = u'Gazeta Lubuska - portal regionalny województwa lubuskiego.'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    extra_css = 'ul {list-style: none; padding:0; margin:0;}'
    INDEX = 'http://www.gazetalubuska.pl'
    masthead_url = INDEX + '/images/top_logo.png'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    ignore_duplicate_articles = {'title', 'url'}

    # Strip inline "read also" teasers up to the end of their link. The
    # patterns contain no backslashes, so a plain u'' literal is equivalent
    # to ur'' and also parses on Python 3.
    preprocess_regexps = [
        (re.compile(u'Czytaj:.*?</a>', re.DOTALL), lambda match: ''),
        (re.compile(u'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(u'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(u'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
    ]

    keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
    remove_tags = [
        dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
                 'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
                 'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
        dict(attrs={'class': 'articleFunctions'}),
    ]

    feeds = [
        (u'Wszystkie', u'http://www.gazetalubuska.pl/rss.xml'),
        (u'Drezdenko', u'http://www.gazetalubuska.pl/drezdenko.xml'),
        (u'G\u0142og\xf3w', u'http://www.gazetalubuska.pl/glogow.xml'),
        (u'Gorz\xf3w Wielkopolski', u'http://www.gazetalubuska.pl/gorzow-wielkopolski.xml'),
        (u'Gubin', u'http://www.gazetalubuska.pl/gubin.xml'),
        (u'Kostrzyn', u'http://www.gazetalubuska.pl/kostrzyn.xml'),
        (u'Krosno Odrza\u0144skie', u'http://www.gazetalubuska.pl/krosno-odrzanskie.xml'),
        (u'Lubsko', u'http://www.gazetalubuska.pl/lubsko.xml'),
        (u'Mi\u0119dzych\xf3d', u'http://www.gazetalubuska.pl/miedzychod.xml'),
        (u'Mi\u0119dzyrzecz', u'http://www.gazetalubuska.pl/miedzyrzecz.xml'),
        (u'Nowa S\xf3l', u'http://www.gazetalubuska.pl/nowa-sol.xml'),
        (u'S\u0142ubice', u'http://www.gazetalubuska.pl/slubice.xml'),
        (u'Strzelce Kraje\u0144skie', u'http://www.gazetalubuska.pl/strzelce-krajenskie.xml'),
        (u'Sulech\xf3w', u'http://www.gazetalubuska.pl/sulechow.xml'),
        (u'Sul\u0119cin', u'http://www.gazetalubuska.pl/sulecin.xml'),
        (u'\u015awi\u0119bodzin', u'http://www.gazetalubuska.pl/swiebodzin.xml'),
        (u'Wolsztyn', u'http://www.gazetalubuska.pl/wolsztyn.xml'),
        (u'Wschowa', u'http://www.gazetalubuska.pl/wschowa.xml'),
        (u'Zielona G\xf3ra', u'http://www.gazetalubuska.pl/zielona-gora.xml'),
        (u'\u017baga\u0144', u'http://www.gazetalubuska.pl/zagan.xml'),
        (u'\u017bary', u'http://www.gazetalubuska.pl/zary.xml'),
        (u'Sport', u'http://www.gazetalubuska.pl/sport.xml'),
        (u'Auto', u'http://www.gazetalubuska.pl/auto.xml'),
        (u'Dom', u'http://www.gazetalubuska.pl/dom.xml'),
        (u'Praca', u'http://www.gazetalubuska.pl/praca.xml'),
        (u'Zdrowie', u'http://www.gazetalubuska.pl/zdrowie.xml'),
    ]

    def get_cover_url(self):
        """Return the URL of the cover image from the site's JEDYNKI section.

        The first link inside ``#covers`` is followed and the image inside
        ``#cover`` on that page is used. ``self.cover_url`` is only
        overwritten when the expected markup is found; otherwise the
        previously set value (if any) is returned.
        """
        soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
        covers = soup.find(id='covers')
        link = covers.find('a', href=True) if covers is not None else None
        if link is not None:
            soup = self.index_to_soup(self.INDEX + link['href'])
            cover = soup.find(id='cover')
            image = cover.find(name='img', src=True) if cover is not None else None
            if image is not None:
                self.cover_url = self.INDEX + image['src']
        return getattr(self, 'cover_url', None)

    def append_page(self, soup, appendtag):
        """Append the remaining pages of a photo story to `appendtag`.

        Does nothing unless `soup` has a ``photoNavigationPages`` counter
        with a parsable page count and a ``photoNavigationNext`` link.
        """
        counter = soup.find('span', attrs={'class': 'photoNavigationPages'})
        if counter is None:
            return
        # Join all text nodes: `.string` is None when the span has more than
        # one child. The page count is the last number after the final '/'.
        label = u''.join(counter.findAll(text=True))
        total = re.search(r'(\d+)\D*$', label.rpartition('/')[-1])
        next_link = soup.find(attrs={'class': 'photoNavigationNext'})
        href = next_link.get('href') if next_link is not None else None
        if total is None or not href:
            return
        number = int(total.group(1))
        # presumably the "next" href ends with a one-digit page number, so
        # dropping the last character leaves the common prefix -- TODO confirm
        baseurl = self.INDEX + href[:-1]

        for nav in appendtag.findAll(attrs={'class': 'photoNavigation'}):
            nav.extract()

        # Parts copied from every following page, in this order.
        wanted = (
            dict(id='photoContainer'),
            dict(attrs={'class': 'photoMeta'}),
            dict(attrs={'class': 'photoStoryText'}),
        )
        for nr in range(2, number + 1):
            page = self.index_to_soup(baseurl + str(nr))
            for selector in wanted:
                fragment = page.find(**selector)
                if fragment is not None:
                    appendtag.insert(len(appendtag.contents), fragment)

    def preprocess_html(self, soup):
        """Merge multi-page photo stories into `soup` and return it."""
        self.append_page(soup, soup.body)
        return soup
|
34
recipes/gazeta_wroclawska.recipe
Normal file
@ -0,0 +1,34 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GazetaWroclawska(BasicNewsRecipe):
    """Calibre news recipe for the Gazeta Wroclawska regional newspaper.

    Articles come from the RSS feeds listed in ``feeds`` and are downloaded
    from their ``drukuj`` (print) URLs; the cover is scraped from prasa24.pl.
    """

    title = u'Gazeta Wroc\u0142awska'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Gazeta Wrocławska. Najnowsze Wiadomości Wrocław, Informacje Wrocław. Czytaj!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/gazetawroclawska.png?24'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    # Disabled clean-up patterns, kept for reference. NOTE: this file does not
    # import `re`, so the import must be added before re-enabling them.
    #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
    # The dz.png element is used as the end-of-article marker and is itself
    # listed in remove_tags below.
    remove_tags_after = dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'})
    remove_tags = [
        dict(id='mat-podobne'),
        dict(name='a', attrs={'class': 'czytajDalej'}),
        dict(attrs={'src': 'http://nm.dz.com.pl/dz.png'}),
    ]

    feeds = [
        (u'Fakty24', u'http://gazetawroclawska.feedsportal.com/c/32980/f/533775/index.rss?201302'),
        (u'Region', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_region.xml?201302'),
        (u'Kultura', u'http://gazetawroclawska.feedsportal.com/c/32980/f/533777/index.rss?201302'),
        (u'Sport', u'http://gazetawroclawska.feedsportal.com/c/32980/f/533776/index.rss?201302'),
        (u'Z archiwum', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_zarchiwum.xml?201302'),
        (u'M\xf3j reporter', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_mojreporter.xml?201302'),
        (u'Historia', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_historia.xml?201302'),
        (u'Listy do redakcji', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_listydoredakcji.xml?201302'),
        (u'Na drogach', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_nadrogach.xml?201302'),
    ]

    def print_version(self, url):
        """Return `url` with 'artykul' replaced by 'drukuj'."""
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        """Return the raw page behind an 'Advertisement' page, else None.

        When the page title contains 'Advertisement', the first link with an
        ``href`` on the page is fetched and its raw content returned.
        """
        title_tag = soup.find('title')
        if title_tag is None:
            return None
        # `x in tag` tests membership among the tag's child nodes rather than
        # doing a substring search, so look at the title's text instead.
        title_text = u''.join(title_tag.findAll(text=True))
        if 'Advertisement' not in title_text:
            return None
        link = soup.find('a', href=True)
        if link is None:
            return None
        return self.index_to_soup(link['href'], raw=True)

    def get_cover_url(self):
        """Return the cover image URL scraped from prasa24.pl.

        ``self.cover_url`` is only overwritten when the expected markup is
        found; otherwise the previously set value (if any) is returned.
        """
        soup = self.index_to_soup('http://www.prasa24.pl/gazeta/gazeta-wroclawska/')
        container = soup.find(id='pojemnik')
        image = container.find('img', src=True) if container is not None else None
        if image is not None:
            self.cover_url = image['src']
        return getattr(self, 'cover_url', None)
|
63
recipes/gazeta_wspolczesna.recipe
Normal file
@ -0,0 +1,63 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GazetaWspolczesna(BasicNewsRecipe):
|
||||
title = u'Gazeta Wsp\xf3\u0142czesna'
|
||||
__author__ = 'fenuks'
|
||||
description = u'Gazeta Współczesna - portal regionalny.'
|
||||
category = 'newspaper'
|
||||
language = 'pl'
|
||||
encoding = 'iso-8859-2'
|
||||
extra_css = 'ul {list-style: none; padding:0; margin:0;}'
|
||||
INDEX = 'http://www.wspolczesna.pl'
|
||||
masthead_url = INDEX + '/images/top_logo.png'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
remove_empty_feeds = True
|
||||
no_stylesheets = True
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
|
||||
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
|
||||
|
||||
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
|
||||
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
|
||||
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
|
||||
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
|
||||
dict(attrs={'class':'articleFunctions'})]
|
||||
|
||||
feeds = [(u'Wszystkie', u'http://www.wspolczesna.pl/rss.xml'), (u'August\xf3w', u'http://www.wspolczesna.pl/augustow.xml'), (u'Bia\u0142ystok', u'http://www.wspolczesna.pl/bialystok.xml'), (u'Bielsk Podlaski', u'http://www.wspolczesna.pl/bielsk.xml'), (u'E\u0142k', u'http://www.wspolczesna.pl/elk.xml'), (u'Grajewo', u'http://www.wspolczesna.pl/grajewo.xml'), (u'Go\u0142dap', u'http://www.wspolczesna.pl/goldap.xml'), (u'Hajn\xf3wka', u'http://www.wspolczesna.pl/hajnowka.xml'), (u'Kolno', u'http://www.wspolczesna.pl/kolno.xml'), (u'\u0141om\u017ca', u'http://www.wspolczesna.pl/lomza.xml'), (u'Mo\u0144ki', u'http://www.wspolczesna.pl/monki.xml'), (u'Olecko', u'http://www.wspolczesna.pl/olecko.xml'), (u'Ostro\u0142\u0119ka', u'http://www.wspolczesna.pl/ostroleka.xml'), (u'Powiat Bia\u0142ostocki', u'http://www.wspolczesna.pl/powiat.xml'), (u'Sejny', u'http://www.wspolczesna.pl/sejny.xml'), (u'Siemiatycze', u'http://www.wspolczesna.pl/siemiatycze.xml'), (u'Sok\xf3\u0142ka', u'http://www.wspolczesna.pl/sokolka.xml'), (u'Suwa\u0142ki', u'http://www.wspolczesna.pl/suwalki.xml'), (u'Wysokie Mazowieckie', u'http://www.wspolczesna.pl/wysokie.xml'), (u'Zambr\xf3w', u'http://www.wspolczesna.pl/zambrow.xml'), (u'Sport', u'http://www.wspolczesna.pl/sport.xml'), (u'Praca', u'http://www.wspolczesna.pl/praca.xml'), (u'Dom', u'http://www.wspolczesna.pl/dom.xml'), (u'Auto', u'http://www.wspolczesna.pl/auto.xml'), (u'Zdrowie', u'http://www.wspolczesna.pl/zdrowie.xml')]
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
|
||||
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
|
||||
soup = self.index_to_soup(nexturl)
|
||||
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
|
||||
if tag:
|
||||
number = int(tag.string.rpartition('/')[-1].replace(' ', ''))
|
||||
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
|
||||
|
||||
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
|
||||
r.extract()
|
||||
for nr in range(2, number+1):
|
||||
soup2 = self.index_to_soup(baseurl + str(nr))
|
||||
pagetext = soup2.find(id='photoContainer')
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoMeta'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoStoryText'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
|
||||
def preprocess_html(self, soup):
    """Merge follow-up gallery pages into the page body and return soup.

    Delegates to append_page with the document's <body> as the tag that
    receives the extra content; the same soup object is returned.
    """
    body = soup.body
    self.append_page(soup, body)
    return soup
|
83
recipes/gcn.recipe
Normal file
@ -0,0 +1,83 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GCN(BasicNewsRecipe):
|
||||
title = u'Gazeta Codziennej Nowiny'
|
||||
__author__ = 'fenuks'
|
||||
description = u'nowiny24.pl - portal regionalny województwa podkarpackiego.'
|
||||
category = 'newspaper'
|
||||
language = 'pl'
|
||||
encoding = 'iso-8859-2'
|
||||
extra_css = 'ul {list-style: none; padding:0; margin:0;}'
|
||||
INDEX = 'http://www.nowiny24.pl'
|
||||
masthead_url = INDEX + '/images/top_logo.png'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
remove_empty_feeds = True
|
||||
no_stylesheets = True
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
|
||||
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
|
||||
|
||||
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
|
||||
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
|
||||
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
|
||||
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
|
||||
dict(attrs={'class':'articleFunctions'})]
|
||||
|
||||
feeds = [(u'Wszystkie', u'http://www.nowiny24.pl/rss.xml'),
|
||||
(u'Podkarpacie', u'http://www.nowiny24.pl/podkarpacie.xml'),
|
||||
(u'Bieszczady', u'http://www.nowiny24.pl/bieszczady.xml'),
|
||||
(u'Rzeszów', u'http://www.nowiny24.pl/rzeszow.xml'),
|
||||
(u'Przemyśl', u'http://www.nowiny24.pl/przemysl.xml'),
|
||||
(u'Leżajsk', u'http://www.nowiny24.pl/lezajsk.xml'),
|
||||
(u'Łańcut', u'http://www.nowiny24.pl/lancut.xml'),
|
||||
(u'Dębica', u'http://www.nowiny24.pl/debica.xml'),
|
||||
(u'Jarosław', u'http://www.nowiny24.pl/jaroslaw.xml'),
|
||||
(u'Krosno', u'http://www.nowiny24.pl/krosno.xml'),
|
||||
(u'Mielec', u'http://www.nowiny24.pl/mielec.xml'),
|
||||
(u'Nisko', u'http://www.nowiny24.pl/nisko.xml'),
|
||||
(u'Sanok', u'http://www.nowiny24.pl/sanok.xml'),
|
||||
(u'Stalowa Wola', u'http://www.nowiny24.pl/stalowawola.xml'),
|
||||
(u'Tarnobrzeg', u'http://www.nowiny24.pl/tarnobrzeg.xml'),
|
||||
(u'Sport', u'http://www.nowiny24.pl/sport.xml'),
|
||||
(u'Dom', u'http://www.nowiny24.pl/dom.xml'),
|
||||
(u'Auto', u'http://www.nowiny24.pl/auto.xml'),
|
||||
(u'Praca', u'http://www.nowiny24.pl/praca.xml'),
|
||||
(u'Zdrowie', u'http://www.nowiny24.pl/zdrowie.xml'),
|
||||
(u'Wywiady', u'http://www.nowiny24.pl/wywiady.xml')]
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
|
||||
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
|
||||
soup = self.index_to_soup(nexturl)
|
||||
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
|
||||
if tag:
|
||||
number = int(tag.string.rpartition('/')[-1].replace(' ', ''))
|
||||
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
|
||||
|
||||
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
|
||||
r.extract()
|
||||
for nr in range(2, number+1):
|
||||
soup2 = self.index_to_soup(baseurl + str(nr))
|
||||
pagetext = soup2.find(id='photoContainer')
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoMeta'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoStoryText'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
return soup
|
34
recipes/glos_wielkopolski.recipe
Normal file
@ -0,0 +1,34 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GlosWielkopolski(BasicNewsRecipe):
    """Calibre news recipe for Głos Wielkopolski (see ``description``).

    Articles come from the RSS channels in ``feeds`` and are downloaded in
    their print version (see ``print_version``).
    """

    title = u'G\u0142os Wielkopolski'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Głos Wielkopolski. Najnowsze Wiadomości Poznań. Czytaj Informacje Poznań!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/gloswielkopolski.png?24'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
    # The dz.png image marks where article content ends: everything after
    # it is dropped, and the image itself is removed as well.
    remove_tags_after = dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})
    remove_tags = [
        dict(id='mat-podobne'),
        dict(name='a', attrs={'class':'czytajDalej'}),
        dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}),
    ]

    # (section title, feed URL) pairs.
    feeds = [
        (u'Wszystkie', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533779/index.rss?201302'),
        (u'Wiadomo\u015bci', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533780/index.rss?201302'),
        (u'Sport', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533781/index.rss?201302'),
        (u'Kultura', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533782/index.rss?201302'),
        (u'Porady', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_porady.xml?201302'),
        (u'Blogi', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_blogi.xml?201302'),
        (u'Nasze akcje', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_naszeakcje.xml?201302'),
        (u'Opinie', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_opinie.xml?201302'),
        (u'Magazyn', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_magazyn.xml?201302'),
    ]

    def print_version(self, url):
        """Return the print-view URL: every 'artykul' in url becomes 'drukuj'."""
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        """Return the raw page behind an interstitial ad page, else None.

        A page counts as an ad page when 'Advertisement' is found in its
        <title> tag; the target of the first link on the page is then
        fetched and returned unparsed. Pages without a <title>, or ad pages
        without a usable link, yield None instead of raising.
        """
        title = soup.title
        if title is None or 'Advertisement' not in title:
            return None
        link = soup.find('a')
        nexturl = link.get('href') if link is not None else None
        if not nexturl:
            return None
        return self.index_to_soup(nexturl, raw=True)

    def get_cover_url(self):
        """Store and return the cover URL scraped from prasa24.pl.

        Uses the 'src' of the first image inside the element with id
        'pojemnik' on the newspaper's prasa24.pl page.
        """
        soup = self.index_to_soup('http://www.prasa24.pl/gazeta/glos-wielkopolski/')
        self.cover_url = soup.find(id='pojemnik').img['src']
        return self.cover_url
|
BIN
recipes/icons/dziennik_baltycki.png
Normal file
After Width: | Height: | Size: 865 B |
BIN
recipes/icons/dziennik_lodzki.png
Normal file
After Width: | Height: | Size: 461 B |
BIN
recipes/icons/dziennik_wschodni.png
Normal file
After Width: | Height: | Size: 414 B |
BIN
recipes/icons/dziennik_zachodni.png
Normal file
After Width: | Height: | Size: 431 B |
BIN
recipes/icons/echo_dnia.png
Normal file
After Width: | Height: | Size: 1.1 KiB |
BIN
recipes/icons/gazeta_krakowska.png
Normal file
After Width: | Height: | Size: 398 B |
BIN
recipes/icons/gazeta_lubuska.png
Normal file
After Width: | Height: | Size: 1.1 KiB |
BIN
recipes/icons/gazeta_wroclawska.png
Normal file
After Width: | Height: | Size: 470 B |
BIN
recipes/icons/gazeta_wspolczesna.png
Normal file
After Width: | Height: | Size: 921 B |
BIN
recipes/icons/gcn.png
Normal file
After Width: | Height: | Size: 554 B |
BIN
recipes/icons/glos_wielkopolski.png
Normal file
After Width: | Height: | Size: 446 B |
BIN
recipes/icons/kurier_lubelski.png
Normal file
After Width: | Height: | Size: 483 B |
BIN
recipes/icons/kurier_poranny.png
Normal file
After Width: | Height: | Size: 354 B |
BIN
recipes/icons/kurier_szczecinski.png
Normal file
After Width: | Height: | Size: 1.1 KiB |
BIN
recipes/icons/nto.png
Normal file
After Width: | Height: | Size: 416 B |
BIN
recipes/icons/trojmiasto_pl.png
Normal file
After Width: | Height: | Size: 537 B |
34
recipes/kurier_lubelski.recipe
Normal file
@ -0,0 +1,34 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class KurierLubelski(BasicNewsRecipe):
    """Calibre news recipe for Kurier Lubelski (see ``description``).

    Articles come from the RSS channels in ``feeds`` and are downloaded in
    their print version (see ``print_version``).
    """

    title = u'Kurier Lubelski'
    __author__ = 'fenuks'
    description = u'Gazeta Regionalna Kurier Lubelski. Najnowsze Wiadomości Lublin. Czytaj Informacje Lublin!'
    category = 'newspaper'
    language = 'pl'
    encoding = 'iso-8859-2'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/kurierlubelski.png?24'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
    # The dz.png image marks where article content ends: everything after
    # it is dropped, and the image itself is removed as well.
    remove_tags_after = dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})
    remove_tags = [
        dict(id='mat-podobne'),
        dict(name='a', attrs={'class':'czytajDalej'}),
        dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}),
    ]

    # (section title, feed URL) pairs.
    feeds = [
        (u'Wiadomo\u015bci', u'http://kurierlubelski.feedsportal.com/c/32980/f/533785/index.rss?201302'),
        (u'Region', u'http://www.kurierlubelski.pl/rss/kurierlubelski_region.xml?201302'),
        (u'Sport', u'http://kurierlubelski.feedsportal.com/c/32980/f/533786/index.rss?201302'),
        (u'Kultura', u'http://kurierlubelski.feedsportal.com/c/32980/f/533787/index.rss?201302'),
        (u'Rozmaito\u015bci', u'http://www.kurierlubelski.pl/rss/kurierlubelski_rozmaitosci.xml?201302'),
        (u'Dom', u'http://www.kurierlubelski.pl/rss/kurierlubelski_dom.xml?201302'),
        (u'Serwisy', u'http://www.kurierlubelski.pl/rss/kurierlubelski_serwisy.xml?201302'),
        (u'Motofakty', u'http://www.kurierlubelski.pl/rss/kurierlubelski_motofakty.xml?201302'),
        (u'M\xf3j Reporter', u'http://www.kurierlubelski.pl/rss/kurierlubelski_mojreporter.xml?201302'),
        (u'Praca', u'http://www.kurierlubelski.pl/rss/kurierlubelski_praca.xml?201302'),
    ]

    def print_version(self, url):
        """Return the print-view URL: every 'artykul' in url becomes 'drukuj'."""
        return url.replace('artykul', 'drukuj')

    def skip_ad_pages(self, soup):
        """Return the raw page behind an interstitial ad page, else None.

        A page counts as an ad page when 'Advertisement' is found in its
        <title> tag; the target of the first link on the page is then
        fetched and returned unparsed. Pages without a <title>, or ad pages
        without a usable link, yield None instead of raising.
        """
        title = soup.title
        if title is None or 'Advertisement' not in title:
            return None
        link = soup.find('a')
        nexturl = link.get('href') if link is not None else None
        if not nexturl:
            return None
        return self.index_to_soup(nexturl, raw=True)

    def get_cover_url(self):
        """Store and return the cover URL scraped from prasa24.pl.

        Uses the 'src' of the first image inside the element with id
        'pojemnik' on the newspaper's prasa24.pl page.
        """
        soup = self.index_to_soup('http://www.prasa24.pl/gazeta/kurier-lubelski/')
        self.cover_url = soup.find(id='pojemnik').img['src']
        return self.cover_url
|
78
recipes/kurier_poranny.recipe
Normal file
@ -0,0 +1,78 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class KurierPoranny(BasicNewsRecipe):
|
||||
title = u'Kurier Poranny'
|
||||
__author__ = 'fenuks'
|
||||
description = u'Kurier Poranny | poranny.pl - portal miejski Białegostoku,informacje,wydarzenia'
|
||||
category = 'newspaper'
|
||||
language = 'pl'
|
||||
encoding = 'iso-8859-2'
|
||||
extra_css = 'ul {list-style: none; padding:0; margin:0;}'
|
||||
INDEX = 'http://www.poranny.pl'
|
||||
masthead_url = INDEX + '/images/top_logo.png'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
remove_empty_feeds = True
|
||||
no_stylesheets = True
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
|
||||
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
|
||||
|
||||
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
|
||||
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
|
||||
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
|
||||
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
|
||||
dict(attrs={'class':'articleFunctions'})]
|
||||
|
||||
|
||||
feeds = [(u'Wszystkie', u'http://www.poranny.pl/rss.xml'),
|
||||
(u'Białystok', u'http://www.poranny.pl/bialystok.xml'),
|
||||
(u'Bielsk Podlaski', u'http://www.poranny.pl/bielskpodlaski.xml'),
|
||||
(u'Czarna Białostocka', u'http://www.poranny.pl/czarnabialostocka.xml'),
|
||||
(u'Hajnówka', u'http://www.poranny.pl/hajnowka.xml'),
|
||||
(u'Łapy', u'http://www.poranny.pl/lapy.xml'),
|
||||
(u'Sokółka', u'http://www.poranny.pl/sokolka.xml'),
|
||||
(u'Supraśl', u'http://www.poranny.pl/suprasl.xml'),
|
||||
(u'Wasilków', u'http://www.poranny.pl/wasilkow.xml'),
|
||||
(u'Sport', u'http://www.poranny.pl/sport.xml'),
|
||||
(u'Praca', u'http://www.poranny.pl/praca.xml'),
|
||||
(u'Kultura', u'http://www.poranny.pl/kultura.xml'),
|
||||
(u'Dom', u'http://www.poranny.pl/dom.xml'),
|
||||
(u'Auto', u'http://www.poranny.pl/auto.xml'),
|
||||
(u'Polityka', u'http://www.poranny.pl/polityka.xml')]
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
|
||||
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
|
||||
soup = self.index_to_soup(nexturl)
|
||||
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
|
||||
if tag:
|
||||
number = int(tag.string.rpartition('/')[-1].replace(' ', ''))
|
||||
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
|
||||
|
||||
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
|
||||
r.extract()
|
||||
for nr in range(2, number+1):
|
||||
soup2 = self.index_to_soup(baseurl + str(nr))
|
||||
pagetext = soup2.find(id='photoContainer')
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoMeta'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoStoryText'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
return soup
|
27
recipes/kurier_szczecinski.recipe
Normal file
@ -0,0 +1,27 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class KurierSzczecinski(BasicNewsRecipe):
    """Calibre news recipe for 24kurier.pl (see ``description``).

    Purely declarative: the class only sets attributes and relies on the
    base class for all downloading and clean-up.
    """

    title = u'Kurier Szczeci\u0144ski'
    __author__ = 'fenuks'
    description = u'24Kurier jest portalem Kuriera Szczecińskiego. Zawiera aktualności ze Szczecina oraz wiadomości regionalne z województwa zachodniopomorskiego. '
    category = 'newspaper'
    #publication_type = ''
    language = 'pl'
    #encoding = ''
    #extra_css = ''
    cover_url = 'http://www.24kurier.pl/Administracja/Img/24kurier_logo-copy-po-zapis'
    #masthead_url = ''
    use_embedded_content = False
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}

    # Keep the 'section' element, minus icons, ads and the comment block;
    # everything after the comment block is cut off as well.
    keep_only_tags = [dict(attrs={'class':'section'})]
    remove_tags = [dict(attrs={'class':['Ikonki', 'rek', 'artComments']})]
    remove_tags_after = dict(attrs={'class':'artComments'})
    #remove_tags_before = dict()

    # (section title, feed URL) pairs.
    feeds = [
        (u'Aktualno\u015bci', u'http://www.24kurier.pl/cmspages/articles_rss.aspx'),
        (u'Kraj', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=kraj'),
        (u'\u015awiat', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=swiat'),
        (u'Sport', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=sport'),
        (u'Kultura', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=kultura'),
        (u'Gospodarka', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=gospodarka'),
        (u'Nauka', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=nauka'),
        (u'Region', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=region'),
        (u'Szczecin', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=szczecin'),
        (u'Bia\u0142ogard', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=bialogard'),
        (u'Choszczno', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=choszczno'),
        (u'Drawsko', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=drawsko'),
        (u'Goleni\xf3w', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=goleniow'),
        (u'Gryfice', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=gryfice'),
        (u'Gryfino', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=gryfino'),
        (u'Kamie\u0144 Pomorski', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=kamien'),
        (u'Ko\u0142obrzeg', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=kolobrzeg'),
        (u'Koszalin', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=koszalin'),
        (u'\u0141obez', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=lobez'),
        (u'My\u015blib\xf3rz', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=mysliborz'),
        (u'Police', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=police'),
        (u'Pyrzyce', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=pyrzyce'),
        (u'S\u0142awno', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=slawno'),
        (u'Stargard', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=stargard'),
        (u'Szczecinek', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=szczecinek'),
        (u'\u015awidwin', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=swidwin'),
        (u'\u015awinouj\u015bcie', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=swinoujscie'),
        (u'Wa\u0142cz', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=walcz'),
    ]
|
63
recipes/nto.recipe
Normal file
@ -0,0 +1,63 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class NTO(BasicNewsRecipe):
|
||||
title = u'Nowa Trybuna Opolska'
|
||||
__author__ = 'fenuks'
|
||||
description = u'Nowa Trybuna Opolska - portal regionalny województwa opolskiego.'
|
||||
category = 'newspaper'
|
||||
language = 'pl'
|
||||
encoding = 'iso-8859-2'
|
||||
extra_css = 'ul {list-style: none; padding:0; margin:0;}'
|
||||
INDEX = 'http://www.nto.pl'
|
||||
masthead_url = INDEX + '/images/top_logo.png'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
remove_empty_feeds = True
|
||||
no_stylesheets = True
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
|
||||
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
|
||||
|
||||
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
|
||||
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
|
||||
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
|
||||
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
|
||||
dict(attrs={'class':'articleFunctions'})]
|
||||
|
||||
feeds = [(u'Wszystkie', u'http://www.nto.pl/rss.xml'), (u'Region', u'http://www.nto.pl/region.xml'), (u'Brzeg', u'http://www.nto.pl/brzeg.xml'), (u'G\u0142ubczyce', u'http://www.nto.pl/glubczyce.xml'), (u'K\u0119dzierzyn-Ko\u017ale', u'http://www.nto.pl/kedzierzynkozle.xml'), (u'Kluczbork', u'http://www.nto.pl/kluczbork.xml'), (u'Krapkowice', u'http://www.nto.pl/krapkowice.xml'), (u'Namys\u0142\xf3w', u'http://www.nto.pl/namyslow.xml'), (u'Nysa', u'http://www.nto.pl/nysa.xml'), (u'Olesno', u'http://www.nto.pl/olesno.xml'), (u'Opole', u'http://www.nto.pl/opole.xml'), (u'Prudnik', u'http://www.nto.pl/prudnik.xml'), (u'Strzelce Opolskie', u'http://www.nto.pl/strzelceopolskie.xml'), (u'Sport', u'http://www.nto.pl/sport.xml'), (u'Polska i \u015bwiat', u'http://www.nto.pl/apps/pbcs.dll/section?Category=RSS&channel=KRAJSWIAT'), (u'Zdrowy styl', u'http://www.nto.pl/apps/pbcs.dll/section?Category=rss_zdrowystyl'), (u'Reporta\u017c', u'http://www.nto.pl/reportaz.xml'), (u'Studia', u'http://www.nto.pl/akademicka.xml')]
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
|
||||
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
|
||||
soup = self.index_to_soup(nexturl)
|
||||
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
|
||||
if tag:
|
||||
number = int(tag.string.rpartition('/')[-1].replace(' ', ''))
|
||||
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
|
||||
|
||||
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
|
||||
r.extract()
|
||||
for nr in range(2, number+1):
|
||||
soup2 = self.index_to_soup(baseurl + str(nr))
|
||||
pagetext = soup2.find(id='photoContainer')
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoMeta'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoStoryText'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
return soup
|
37
recipes/trojmiasto_pl.recipe
Normal file
@ -0,0 +1,37 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Trojmiasto(BasicNewsRecipe):
|
||||
title = u'Tr\xf3jmiasto.pl'
|
||||
__author__ = 'fenuks'
|
||||
description = u'Wiadomości, imprezy, wydarzenia, spektakle.Gdańsk, Gdynia, Sopot - NOCLEGI, Katalog firm, repertuar kin, wydarzenia, przewodnik, mapa, kwatery, hotele. Portal regionalny trojmiasto.pl'
|
||||
category = ''
|
||||
#publication_type = ''
|
||||
language = 'pl'
|
||||
encoding = 'iso-8859-2'
|
||||
extra_css = 'ul {list-style: none; padding:0; margin:0;}'
|
||||
cover_url = 'http://www.trojmiasto.pl/_img/toplong2/logo_trojmiasto.gif'
|
||||
#masthead_url = ''
|
||||
use_embedded_content = False
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
remove_empty_feeds = True
|
||||
remove_javascript = True
|
||||
remove_attributes = ['style', 'font']
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
|
||||
preprocess_regexps = [(re.compile(ur'<strong>Czytaj więcej.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'<strong>Zobacz też.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(ur'<b>[A-ZĄĆĘŁŃÓŚŹŻ \-,.:]*?</b>', re.DOTALL), lambda match: ''),]
|
||||
|
||||
#keep_only_tags = []
|
||||
remove_tags = [dict(id=['logo', 'font_small', 'font_big']), dict(attrs={'class':['title-long', 'ankieta', 'newsletter-inside-content newsletter-wrap', 'copyright_box',
|
||||
'logo', 'btn btn-photo-add', 'related-info-wrap', 'nTabs', 'article-list', 'rate-player horizontal', 'type-box', 'rate-player'
|
||||
'hover-nav', 'live-head tC', 'prev-link', 'next-link', 'ie6']}), dict(attrs={'title':[u'drukuj artykuł', u'podziel się na Facebooku', u'prześlij artykuł']})]
|
||||
remove_tags_after = dict(attrs={'class':'author-wrap'})
|
||||
remove_tags_before = dict(attrs={'class':'text-container'})
|
||||
|
||||
feeds = [(u'Wszystkie', u'http://rss.trojmiasto.pl/rss,0.xml'), (u'Fakty i opinie', u'http://rss.trojmiasto.pl/rss,1.xml'), (u'Sport', u'http://rss.trojmiasto.pl/rss,2.xml'), (u'Dom', u'http://rss.trojmiasto.pl/rss,3.xml'), (u'Moto', u'http://rss.trojmiasto.pl/rss,4.xml'), (u'Nauka', u'http://rss.trojmiasto.pl/rss,5.xml'), (u'Rozrywka', u'http://rss.trojmiasto.pl/rss,6.xml'), (u'Kultura', u'http://rss.trojmiasto.pl/rss,7.xml'), (u'Rowery', u'http://rss.trojmiasto.pl/rss,8.xml'), (u'Dziecko', u'http://rss.trojmiasto.pl/rss,9.xml'), (u'Zdrowie i uroda', u'http://rss.trojmiasto.pl/rss,10.xml'), (u'Praca', u'http://rss.trojmiasto.pl/rss,11.xml'), (u'Artyku\u0142y czytelnik\xf3w', u'http://rss.trojmiasto.pl/rss,12.xml'), (u'Korki', u'http://rss.trojmiasto.pl/rss,13.xml'), (u'Historia', u'http://rss.trojmiasto.pl/rss,14.xml'), (u'Biznes', u'http://rss.trojmiasto.pl/rss,16.xml'), (u'Kryminalne Tr\xf3jmiasto', u'http://rss.trojmiasto.pl/rss,17.xml'), (u'Przewodnik', u'http://rss.trojmiasto.pl/rss,18.xml'), (u'Aktywne Tr\xf3jmiasto', u'http://rss.trojmiasto.pl/rss,19.xml'), (u'Delux', u'http://rss.trojmiasto.pl/rss,20.xml')]
|
||||
|
||||
def print_version(self, url):
|
||||
return url + '?print=1'
|