recipe update

fenuks 2014-01-16 22:02:16 +01:00
parent f38246d400
commit eb8c47124f
23 changed files with 131 additions and 285 deletions

View File

@@ -1,3 +1,4 @@
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class Android_com_pl(BasicNewsRecipe):
@@ -6,8 +7,9 @@ class Android_com_pl(BasicNewsRecipe):
     description = u'Android.com.pl - to największe w Polsce centrum Android OS. Znajdziesz tu: nowości, forum, pomoc, recenzje, gry, aplikacje.'
     category = 'Android, mobile'
     language = 'pl'
-    use_embedded_content=True
-    cover_url =u'http://android.com.pl/wp-content/themes/android/images/logo.png'
+    use_embedded_content = True
+    cover_url = 'http://android.com.pl/wp-content/themes/android/images/logo.png'
     oldest_article = 8
     max_articles_per_feed = 100
+    preprocess_regexps = [(re.compile(ur'<p>.{,1}</p>', re.DOTALL), lambda match: '')]
     feeds = [(u'Android', u'http://android.com.pl/feed/')]
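The new preprocess_regexps entry strips paragraphs whose body is at most one character, i.e. empty or near-empty <p> blocks in the embedded feed content. A minimal sketch of the effect (Python 2, matching the recipe's ur'' prefix; the sample HTML is invented for illustration):

    import re
    pattern = re.compile(ur'<p>.{,1}</p>', re.DOTALL)  # {,1} is read as {0,1}
    html = u'<p></p><p>x</p><p>real content</p>'
    print pattern.sub(u'', html)  # -> <p>real content</p>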

View File

@@ -10,13 +10,9 @@ class AstroNEWS(BasicNewsRecipe):
     #extra_css= 'table {text-align: left;}'
     no_stylesheets=True
     cover_url='http://news.astronet.pl/img/logo_news.jpg'
+    remove_attributes = ['width', 'align']
     remove_tags=[dict(name='hr')]
     feeds = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')]

     def print_version(self, url):
         return url.replace('astronet.pl/', 'astronet.pl/print.cgi?')
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(align=True):
-            del item['align']
-        return soup

View File

@@ -14,7 +14,6 @@ class BadaniaNet(BasicNewsRecipe):
     preprocess_regexps = [(re.compile(r"<h4>Tekst sponsoruje</h4>", re.IGNORECASE), lambda m: ''),]
     remove_empty_feeds = True
     use_embedded_content = False
-    remove_tags = [dict(attrs={'class':['omc-flex-category', 'omc-comment-count', 'omc-single-tags']})]
-    remove_tags_after = dict(attrs={'class':'omc-single-tags'})
-    keep_only_tags = [dict(id='omc-full-article')]
+    remove_tags = []
+    keep_only_tags = [dict(name='article')]
     feeds = [(u'Psychologia', u'http://badania.net/category/psychologia/feed/'), (u'Technologie', u'http://badania.net/category/technologie/feed/'), (u'Biologia', u'http://badania.net/category/biologia/feed/'), (u'Chemia', u'http://badania.net/category/chemia/feed/'), (u'Zdrowie', u'http://badania.net/category/zdrowie/'), (u'Seks', u'http://badania.net/category/psychologia-ewolucyjna-tematyka-seks/feed/')]

View File

@@ -19,14 +19,16 @@ class cdrinfo(BasicNewsRecipe):
     no_stylesheets = True
     remove_empty_feeds = True
     remove_javascript = True
-    remove_attributes = ['style']
-    preprocess_regexps = [(re.compile(u'<p[^>]*?>Uprzejmie prosimy o przestrzeganie netykiety.+?www\.gravatar\.com</a>\.</p>', re.DOTALL), lambda match: '')]
+    remove_attributes = ['style', 'onmouseover']
+    preprocess_regexps = [(re.compile(u'<p[^>]*?>Uprzejmie prosimy o przestrzeganie netykiety.+?www\.gravatar\.com</a>\.</p>', re.DOTALL), lambda match: ''),
+                          (re.compile(u'<p[^>]*?>.{,2}</p>', re.DOTALL), lambda match: '')]
     ignore_duplicate_articles = {'title', 'url'}
     keep_only_tags = [dict(name='input', attrs={'name':'ref'}), dict(id=['text', 'text2'])]
-    remove_tags = [dict(attrs={'class':['navigation', 'sociable', 'last6news']}), dict(name='hr'), dict(id='respond')]
+    remove_tags = [dict(attrs={'class':['navigation', 'sociable', 'last6news']}), dict(name=['hr', 'br']), dict(id='respond')]
     remove_tags_after = dict(id='artnawigacja')
-    feeds = [(u'Wiadomości', 'http://feeds.feedburner.com/cdrinfo'), (u'Recenzje', 'http://www.cdrinfo.pl/rss/rss_recenzje.php'),
+    feeds = [(u'Wiadomości', 'http://feeds.feedburner.com/cdrinfo'),
+             (u'Recenzje', 'http://www.cdrinfo.pl/rss/rss_recenzje.php'),
              (u'Konsole', 'http://konsole.cdrinfo.pl/rss/rss_konsole_news.xml'),
              (u'Pliki', 'http://www.cdrinfo.pl/rss/rss_pliki.xml')
              ]

View File

@@ -14,31 +14,31 @@ class CGM(BasicNewsRecipe):
     remove_empty_feeds= True
     max_articles_per_feed = 100
     no_stylesheets = True
-    extra_css = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;} h2 {color:black;}'
+    extra_css = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;} h2 {color:black;} img {display: block;} ul.galleryImagesList {list-style: none;} li.item {float: left;} .calibrenavbar {clear: both;}'
     remove_tags_before=dict(id='mainContent')
     remove_tags_after=dict(name='div', attrs={'class':'fbContainer'})
     remove_tags=[dict(name='div', attrs={'class':['fbContainer', 'socials']}),
                  dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}),
-                 dict(id=['movieShare', 'container'])]
+                 dict(id=['movieShare', 'container']), dict(name='br')]
     feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'), (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'),
              (u'Recenzje', u'http://www.cgm.pl/rss,1,news.xml')]

     def preprocess_html(self, soup):
-        gallery=soup.find('div', attrs={'class':'galleryFlash'})
-        if gallery:
-            img=gallery.div
+        gallery = soup.find('div', attrs={'class':'galleryFlash'})
+        if gallery and gallery.div:
+            img = gallery.div
             gallery.img.extract()
             if img:
-                img=img['style']
-                img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')]
-                gallery.contents[1].name='img'
-                gallery.contents[1]['src']=img
+                img = img['style']
+                img = 'http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')]
+                gallery.contents[1].name = 'img'
+                gallery.contents[1]['src'] = img
                 pos = len(gallery.contents)
                 gallery.insert(pos, BeautifulSoup('<br />'))
         for item in soup.findAll(style=True):
             del item['style']
-        ad=soup.findAll('a')
+        ad = soup.findAll('a')
         for r in ad:
             if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']:
                 r.extract()
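The rewritten preprocess_html still recovers the gallery image URL by slicing the inline style attribute between 'url(' and ')'. A small illustration of that slice, using an invented style string:

    style = 'background-image: url(/uploads/galleries/1/foo.jpg); width: 600px'
    src = 'http://www.cgm.pl' + style[style.find('url(')+4:style.find(')')]
    # src == 'http://www.cgm.pl/uploads/galleries/1/foo.jpg'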

View File

@@ -16,7 +16,7 @@ class Dzieje(BasicNewsRecipe):
     remove_javascript = True
     no_stylesheets = True
     keep_only_tags = [dict(name='h1', attrs={'class':'title'}), dict(id='content-area')]
-    remove_tags = [dict(attrs={'class':'field field-type-computed field-field-tagi'}), dict(id='dogory')]
+    remove_tags = [dict(attrs={'class':'field field-type-computed field-field-tagi'}), dict(id='dogory'), dict(name='blockquote')]
     #feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]

     def append_page(self, soup, appendtag):

View File

@@ -18,22 +18,22 @@ class Dziennik_pl(BasicNewsRecipe):
     remove_javascript = True
     remove_empty_feeds = True
     ignore_duplicate_articles = {'title', 'url'}
-    extra_css = 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
+    extra_css = 'ul {list-style: none; padding: 0; margin: 0;} .foto {float: left;} .clr {clear: both;}'
     preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('<p><strong><a href=".*?">&gt;&gt;&gt; CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')]
     keep_only_tags = [dict(id='article')]
-    remove_tags = [dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})]
+    remove_tags = [dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']}), dict(name='ins'), dict(name='br')]
     feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
             (u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
             (u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
             (u'Kobieta', u'http://rss.dziennik.pl/Dziennik-Kobieta'),
             (u'Auto', u'http://rss.dziennik.pl/Dziennik-Auto'),
             (u'Rozrywka', u'http://rss.dziennik.pl/Dziennik-Rozrywka'),
             (u'Film', u'http://rss.dziennik.pl/Dziennik-Film'),
             (u'Muzyka' , u'http://rss.dziennik.pl/Dziennik-Muzyka'),
             (u'Kultura', u'http://rss.dziennik.pl/Dziennik-Kultura'),
             (u'Nauka', u'http://rss.dziennik.pl/Dziennik-Nauka'),
             (u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
             (u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]

     def skip_ad_pages(self, soup):
         tag = soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'})

View File

@@ -25,7 +25,7 @@ class EchoDnia(BasicNewsRecipe):
     keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
     remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
                            'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
-                           'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
+                           'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline', 'articleZoomText']),
                    dict(attrs={'class':'articleFunctions'})]
     feeds = [(u'Wszystkie', u'http://www.echodnia.eu/rss.xml'),

View File

@ -7,11 +7,11 @@ class Fotoblogia_pl(BasicNewsRecipe):
category = 'photography' category = 'photography'
language = 'pl' language = 'pl'
masthead_url = 'http://img.interia.pl/komputery/nimg/u/0/fotoblogia21.jpg' masthead_url = 'http://img.interia.pl/komputery/nimg/u/0/fotoblogia21.jpg'
cover_url= 'http://fotoblogia.pl/images/2009/03/fotoblogia2.jpg' cover_url = 'http://fotoblogia.pl/images/2009/03/fotoblogia2.jpg'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
keep_only_tags=[dict(name='div', attrs={'class':['post-view post-standard', 'photo-container']})] keep_only_tags = [dict(name='article')]
remove_tags=[dict(attrs={'class':['external fotoblogia', 'categories', 'tags']})] remove_tags = [dict(attrs={'class':'article-related'})]
feeds = [(u'Wszystko', u'http://fotoblogia.pl/feed/rss2')] feeds = [(u'Wszystko', u'http://fotoblogia.pl/feed/rss2')]

View File

@@ -16,19 +16,12 @@ class GazetaLubuska(BasicNewsRecipe):
     max_articles_per_feed = 100
     remove_empty_feeds = True
     no_stylesheets = True
-    use_embedded_content = False
     ignore_duplicate_articles = {'title', 'url'}
-    preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-                          (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
-    keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
-    remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
-                           'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
-                           'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
-                   dict(attrs={'class':'articleFunctions'})]
     feeds = [(u'Wszystkie', u'http://www.gazetalubuska.pl/rss.xml'), (u'Dreznenko', u'http://www.gazetalubuska.pl/drezdenko.xml'), (u'G\u0142og\xf3w', u'http://www.gazetalubuska.pl/glogow.xml'), (u'Gorz\xf3w Wielkopolski', u'http://www.gazetalubuska.pl/gorzow-wielkopolski.xml'), (u'Gubin', u'http://www.gazetalubuska.pl/gubin.xml'), (u'Kostrzyn', u'http://www.gazetalubuska.pl/kostrzyn.xml'), (u'Krosno Odrza\u0144skie', u'http://www.gazetalubuska.pl/krosno-odrzanskie.xml'), (u'Lubsko', u'http://www.gazetalubuska.pl/lubsko.xml'), (u'Mi\u0119dzych\xf3d', u'http://www.gazetalubuska.pl/miedzychod.xml'), (u'Mi\u0119dzyrzecz', u'http://www.gazetalubuska.pl/miedzyrzecz.xml'), (u'Nowa S\xf3l', u'http://www.gazetalubuska.pl/nowa-sol.xml'), (u'S\u0142ubice', u'http://www.gazetalubuska.pl/slubice.xml'), (u'Strzelce Kraje\u0144skie', u'http://www.gazetalubuska.pl/strzelce-krajenskie.xml'), (u'Sulech\xf3w', u'http://www.gazetalubuska.pl/sulechow.xml'), (u'Sul\u0119cin', u'http://www.gazetalubuska.pl/sulecin.xml'), (u'\u015awi\u0119bodzin', u'http://www.gazetalubuska.pl/swiebodzin.xml'), (u'Wolsztyn', u'http://www.gazetalubuska.pl/wolsztyn.xml'), (u'Wschowa', u'http://www.gazetalubuska.pl/wschowa.xml'), (u'Zielona G\xf3ra', u'http://www.gazetalubuska.pl/zielona-gora.xml'), (u'\u017baga\u0144', u'http://www.gazetalubuska.pl/zagan.xml'), (u'\u017bary', u'http://www.gazetalubuska.pl/zary.xml'), (u'Sport', u'http://www.gazetalubuska.pl/sport.xml'), (u'Auto', u'http://www.gazetalubuska.pl/auto.xml'), (u'Dom', u'http://www.gazetalubuska.pl/dom.xml'), (u'Praca', u'http://www.gazetalubuska.pl/praca.xml'), (u'Zdrowie', u'http://www.gazetalubuska.pl/zdrowie.xml')]
+    keep_only_tags = [dict(id='article')]

     def get_cover_url(self):
         soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
@@ -37,33 +30,12 @@ class GazetaLubuska(BasicNewsRecipe):
         self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
         return getattr(self, 'cover_url', self.cover_url)

-    def append_page(self, soup, appendtag):
-        tag = soup.find('span', attrs={'class':'photoNavigationPages'})
-        if tag:
-            number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
-            baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
-
-            for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
-                r.extract()
-            for nr in range(2, number+1):
-                soup2 = self.index_to_soup(baseurl + str(nr))
-                pagetext = soup2.find(id='photoContainer')
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                pagetext = soup2.find(attrs={'class':'photoMeta'})
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                pagetext = soup2.find(attrs={'class':'photoStoryText'})
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
-        for comment in comments:
-            comment.extract()
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        return soup
+    def decode_feedportal_url(self, url):
+        link = url.rpartition('l/0L0S')[2][:-12]
+        replaces = (('0B', '.'), ('0C', '/'), ('0H', ','), ('0D', '?'), ('0F', '='), ('0A', '0'), ('0I', '_'))
+        for t in replaces:
+            link = link.replace(*t)
+        return 'http://' + link
+
+    def print_version(self, url):
+        return self.decode_feedportal_url(url) + '&Template=printpicart'
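The decode_feedportal_url helper introduced here (and repeated in the other regional-paper recipes below) unescapes feedsportal-style redirect links: the target address sits after the 'l/0L0S' marker, with punctuation encoded as two-character codes (0B for '.', 0C for '/', 0H for ',', and so on) and a fixed 12-character suffix such as '/story01.htm'. A rough worked example with a made-up link of that shape:

    url = ('http://rss.feedsportal.com/c/0/f/0/l/'
           '0L0Swww0Bgazetalubuska0Bpl0Cartykul0C1230Hstory0Bhtml/story01.htm')
    link = url.rpartition('l/0L0S')[2][:-12]
    # link == 'www0Bgazetalubuska0Bpl0Cartykul0C1230Hstory0Bhtml'
    # after the replace loop the method returns:
    # 'http://www.gazetalubuska.pl/artykul/123,story.html'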

View File

@@ -16,17 +16,9 @@ class GazetaPomorska(BasicNewsRecipe):
     max_articles_per_feed = 100
     remove_empty_feeds = True
     no_stylesheets = True
-    use_embedded_content = False
     ignore_duplicate_articles = {'title', 'url'}
-    preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-                          (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
-    keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
-    remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
-                           'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
-                           'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
-                   dict(attrs={'class':'articleFunctions'})]
     feeds = [(u'Wszystkie', u'http://www.pomorska.pl/rss.xml'),
             (u'Region', u'http://www.pomorska.pl/region.xml'),
             (u'Bydgoszcz', u'http://www.pomorska.pl/bydgoszcz.xml'),
@@ -57,6 +49,8 @@ class GazetaPomorska(BasicNewsRecipe):
             #(u'Reporta\u017c', u'http://www.pomorska.pl/reportaz.xml'),
             (u'Gospodarka', u'http://www.pomorska.pl/gospodarka.xml')]
+
+    keep_only_tags = [dict(id='article')]

     def get_cover_url(self):
         soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
         nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
@@ -64,33 +58,12 @@ class GazetaPomorska(BasicNewsRecipe):
         self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
         return getattr(self, 'cover_url', self.cover_url)

-    def append_page(self, soup, appendtag):
-        tag = soup.find('span', attrs={'class':'photoNavigationPages'})
-        if tag:
-            number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
-            baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
-
-            for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
-                r.extract()
-            for nr in range(2, number+1):
-                soup2 = self.index_to_soup(baseurl + str(nr))
-                pagetext = soup2.find(id='photoContainer')
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                pagetext = soup2.find(attrs={'class':'photoMeta'})
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                pagetext = soup2.find(attrs={'class':'photoStoryText'})
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
-        for comment in comments:
-            comment.extract()
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        return soup
+    def decode_feedportal_url(self, url):
+        link = url.rpartition('l/0L0S')[2][:-12]
+        replaces = (('0B', '.'), ('0C', '/'), ('0H', ','), ('0D', '?'), ('0F', '='), ('0A', '0'), ('0I', '_'))
+        for t in replaces:
+            link = link.replace(*t)
+        return 'http://' + link
+
+    def print_version(self, url):
+        return self.decode_feedportal_url(url) + '&Template=printpicart'

View File

@@ -16,19 +16,13 @@ class GazetaWspolczesna(BasicNewsRecipe):
     max_articles_per_feed = 100
     remove_empty_feeds = True
     no_stylesheets = True
-    use_embedded_content = False
     ignore_duplicate_articles = {'title', 'url'}
-    preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-                          (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
-    keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
-    remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
-                           'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
-                           'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
-                   dict(attrs={'class':'articleFunctions'})]
     feeds = [(u'Wszystkie', u'http://www.wspolczesna.pl/rss.xml'), (u'August\xf3w', u'http://www.wspolczesna.pl/augustow.xml'), (u'Bia\u0142ystok', u'http://www.wspolczesna.pl/bialystok.xml'), (u'Bielsk Podlaski', u'http://www.wspolczesna.pl/bielsk.xml'), (u'E\u0142k', u'http://www.wspolczesna.pl/elk.xml'), (u'Grajewo', u'http://www.wspolczesna.pl/grajewo.xml'), (u'Go\u0142dap', u'http://www.wspolczesna.pl/goldap.xml'), (u'Hajn\xf3wka', u'http://www.wspolczesna.pl/hajnowka.xml'), (u'Kolno', u'http://www.wspolczesna.pl/kolno.xml'), (u'\u0141om\u017ca', u'http://www.wspolczesna.pl/lomza.xml'), (u'Mo\u0144ki', u'http://www.wspolczesna.pl/monki.xml'), (u'Olecko', u'http://www.wspolczesna.pl/olecko.xml'), (u'Ostro\u0142\u0119ka', u'http://www.wspolczesna.pl/ostroleka.xml'), (u'Powiat Bia\u0142ostocki', u'http://www.wspolczesna.pl/powiat.xml'), (u'Sejny', u'http://www.wspolczesna.pl/sejny.xml'), (u'Siemiatycze', u'http://www.wspolczesna.pl/siemiatycze.xml'), (u'Sok\xf3\u0142ka', u'http://www.wspolczesna.pl/sokolka.xml'), (u'Suwa\u0142ki', u'http://www.wspolczesna.pl/suwalki.xml'), (u'Wysokie Mazowieckie', u'http://www.wspolczesna.pl/wysokie.xml'), (u'Zambr\xf3w', u'http://www.wspolczesna.pl/zambrow.xml'), (u'Sport', u'http://www.wspolczesna.pl/sport.xml'), (u'Praca', u'http://www.wspolczesna.pl/praca.xml'), (u'Dom', u'http://www.wspolczesna.pl/dom.xml'), (u'Auto', u'http://www.wspolczesna.pl/auto.xml'), (u'Zdrowie', u'http://www.wspolczesna.pl/zdrowie.xml')]
+    keep_only_tags = [dict(id='article')]

     def get_cover_url(self):
         soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
         nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
@@ -36,33 +30,12 @@ class GazetaWspolczesna(BasicNewsRecipe):
         self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
         return getattr(self, 'cover_url', self.cover_url)

-    def append_page(self, soup, appendtag):
-        tag = soup.find('span', attrs={'class':'photoNavigationPages'})
-        if tag:
-            number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
-            baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
-
-            for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
-                r.extract()
-            for nr in range(2, number+1):
-                soup2 = self.index_to_soup(baseurl + str(nr))
-                pagetext = soup2.find(id='photoContainer')
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                pagetext = soup2.find(attrs={'class':'photoMeta'})
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                pagetext = soup2.find(attrs={'class':'photoStoryText'})
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
-        for comment in comments:
-            comment.extract()
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        return soup
+    def decode_feedportal_url(self, url):
+        link = url.rpartition('l/0L0S')[2][:-12]
+        replaces = (('0B', '.'), ('0C', '/'), ('0H', ','), ('0D', '?'), ('0F', '='), ('0A', '0'), ('0I', '_'))
+        for t in replaces:
+            link = link.replace(*t)
+        return 'http://' + link
+
+    def print_version(self, url):
+        return self.decode_feedportal_url(url) + '&Template=printpicart'

View File

@@ -99,9 +99,8 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
     def get_cover_url(self):
         soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html')
-        cover = soup.find(id='GWmini2')
-        soup = self.index_to_soup('http://wyborcza.pl/' + cover.contents[3].a['href'])
-        self.cover_url = 'http://wyborcza.pl' + soup.img['src']
+        cover = soup.find(attrs={'class':'gallerycontent'})
+        self.cover_url = cover.ul.li.a.img['src'].replace('P.jpg', '.jpg')
         return getattr(self, 'cover_url', self.cover_url)

     def image_url_processor(self, baseurl, url):

View File

@@ -18,14 +18,7 @@ class GCN(BasicNewsRecipe):
     no_stylesheets = True
     ignore_duplicate_articles = {'title', 'url'}
     remove_attributes = ['style']
-    preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-                          (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
-    keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
-    remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
-                           'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
-                           'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
-                   dict(attrs={'class':'articleFunctions'})]
+    use_embedded_content = False

     feeds = [(u'Wszystkie', u'http://www.nowiny24.pl/rss.xml'),
             (u'Podkarpacie', u'http://www.nowiny24.pl/podkarpacie.xml'),
@@ -49,6 +42,8 @@ class GCN(BasicNewsRecipe):
             (u'Zdrowie', u'http://www.nowiny24.pl/zdrowie.xml'),
             (u'Wywiady', u'http://www.nowiny24.pl/wywiady.xml')]
+
+    keep_only_tags = [dict(id='article')]

     def get_cover_url(self):
         soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
         nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
@@ -56,33 +51,12 @@ class GCN(BasicNewsRecipe):
         self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
         return getattr(self, 'cover_url', self.cover_url)

-    def append_page(self, soup, appendtag):
-        tag = soup.find('span', attrs={'class':'photoNavigationPages'})
-        if tag:
-            number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
-            baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
-
-            for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
-                r.extract()
-            for nr in range(2, number+1):
-                soup2 = self.index_to_soup(baseurl + str(nr))
-                pagetext = soup2.find(id='photoContainer')
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                pagetext = soup2.find(attrs={'class':'photoMeta'})
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                pagetext = soup2.find(attrs={'class':'photoStoryText'})
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
-        for comment in comments:
-            comment.extract()
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        return soup
+    def decode_feedportal_url(self, url):
+        link = url.rpartition('l/0L0S')[2][:-12]
+        replaces = (('0B', '.'), ('0C', '/'), ('0H', ','), ('0D', '?'), ('0F', '='), ('0A', '0'), ('0I', '_'))
+        for t in replaces:
+            link = link.replace(*t)
+        return 'http://' + link
+
+    def print_version(self, url):
+        return self.decode_feedportal_url(url) + '&Template=printpicart'

View File

@@ -16,7 +16,7 @@ class Gram_pl(BasicNewsRecipe):
     #extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
     cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
     keep_only_tags= [dict(id='articleModule')]
-    remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter', 'twitter-share-button']}), dict(name='aside')]
+    remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter', 'twitter-share-button']}), dict(name='aside'), dict(id='metaColumn')]
     feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
             (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')
             ]

View File

@@ -15,7 +15,8 @@ class GryOnlinePl(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets = True
     keep_only_tags = [dict(name='div', attrs={'class':['gc660', 'gc660 S013', 'news_endpage_tit', 'news_container', 'news']})]
-    remove_tags = [dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'S013-npb', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})]
+    remove_tags = [dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'S013-npb', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2',
+                                  'twitter-share-button']})]
     feeds = [
         (u'Newsy', 'http://www.gry-online.pl/rss/news.xml'),
         ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')]
@@ -44,7 +45,7 @@ class GryOnlinePl(BasicNewsRecipe):
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
-            for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}):
+            for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry', 'imh10b']}):
                 r.extract()
             comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
             for comment in comments:
@@ -80,7 +81,7 @@ class GryOnlinePl(BasicNewsRecipe):
             [comment.extract() for comment in comments]
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
-            for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry', 'S018strony']}):
+            for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry', 'S018strony', 'imh10b']}):
                 r.extract()
             comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
             for comment in comments:

View File

@@ -30,4 +30,7 @@ class Kosmonauta(BasicNewsRecipe):
             href = a['href']
             if not href.startswith('http'):
                 a['href'] = self.INDEX + href
+        for a in soup.findAll(name='img'):
+            if a.has_key('style') and 'float:' in a['style']:
+                a['class'] = 'thumb-left'
         return soup

View File

@@ -17,16 +17,7 @@ class KurierPoranny(BasicNewsRecipe):
     remove_empty_feeds = True
     no_stylesheets = True
     ignore_duplicate_articles = {'title', 'url'}
-    use_embedded_content = False
-    preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-                          (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
-    keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
-    remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
-                           'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
-                           'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
-                   dict(attrs={'class':'articleFunctions'})]
     feeds = [(u'Wszystkie', u'http://www.poranny.pl/rss.xml'),
             (u'Białystok', u'http://www.poranny.pl/bialystok.xml'),
@@ -44,6 +35,8 @@ class KurierPoranny(BasicNewsRecipe):
             (u'Auto', u'http://www.poranny.pl/auto.xml'),
             (u'Polityka', u'http://www.poranny.pl/polityka.xml')]
+
+    keep_only_tags = [dict(id='article')]

     def get_cover_url(self):
         soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
         nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
@@ -51,34 +44,12 @@ class KurierPoranny(BasicNewsRecipe):
         self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
         return getattr(self, 'cover_url', self.cover_url)

-    def append_page(self, soup, appendtag):
-        tag = soup.find('span', attrs={'class':'photoNavigationPages'})
-        if tag:
-            number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
-            baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
-
-            for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
-                r.extract()
-            for nr in range(2, number+1):
-                soup2 = self.index_to_soup(baseurl + str(nr))
-                pagetext = soup2.find(id='photoContainer')
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                pagetext = soup2.find(attrs={'class':'photoMeta'})
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                pagetext = soup2.find(attrs={'class':'photoStoryText'})
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
-        for comment in comments:
-            comment.extract()
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        return soup
+    def decode_feedportal_url(self, url):
+        link = url.rpartition('l/0L0S')[2][:-12]
+        replaces = (('0B', '.'), ('0C', '/'), ('0H', ','), ('0D', '?'), ('0F', '='), ('0A', '0'), ('0I', '_'))
+        for t in replaces:
+            link = link.replace(*t)
+        return 'http://' + link
+
+    def print_version(self, url):
+        return self.decode_feedportal_url(url) + '&Template=printpicart'

View File

@@ -12,6 +12,7 @@ class media2_pl(BasicNewsRecipe):
     description = u'Media2.pl to jeden z najczęściej odwiedzanych serwisów dla profesjonalistów z branży medialnej, telekomunikacyjnej, public relations oraz nowych technologii.'
     masthead_url = 'http://media2.pl/res/logo/www.png'
     cover_url = 'http://media2.pl/res/logo/www.png'
+    INDEX = 'http://media2.pl'
     remove_empty_feeds = True
     oldest_article = 7
     max_articles_per_feed = 100
@@ -22,10 +23,16 @@ class media2_pl(BasicNewsRecipe):
     extra_css = '''.news-lead{font-weight: bold; }'''
     keep_only_tags = [dict(name = 'div', attrs = {'class' : 'news-item tpl-big'})]
-    remove_tags = [dict(name = 'span', attrs = {'class' : 'news-comments'}), dict(name = 'div', attrs = {'class' : 'item-sidebar'}), dict(name = 'div', attrs = {'class' : 'news-tags'})]
+    remove_tags = [dict(name = 'span', attrs = {'class' : 'news-comments'}), dict(name = 'div', attrs = {'class' : ['item-sidebar', 'news-inline-promo nobbtext']}),
+                   dict(name = 'div', attrs = {'class' : 'news-tags'})]
     feeds = [(u'Media2', u'http://feeds.feedburner.com/media2'), (u'Internet', u'http://feeds.feedburner.com/media2/internet'),
             (u'Media', 'http://feeds.feedburner.com/media2/media'), (u'Telekomunikacja', 'http://feeds.feedburner.com/media2/telekomunikacja'),
             (u'Reklama/PR', 'http://feeds.feedburner.com/media2/reklama-pr'), (u'Technologie', 'http://feeds.feedburner.com/media2/technologie'),
             (u'Badania', 'http://feeds.feedburner.com/media2/badania')
             ]
+
+    def image_url_processor(self, baseurl, url):
+        if url[0] == '/':
+            url = self.INDEX + url
+        return url

View File

@@ -17,18 +17,12 @@ class NTO(BasicNewsRecipe):
     remove_empty_feeds = True
     no_stylesheets = True
     ignore_duplicate_articles = {'title', 'url'}
-    use_embedded_content = False
-    preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-                          (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
-    keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
-    remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
-                           'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
-                           'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
-                   dict(attrs={'class':'articleFunctions'})]
     feeds = [(u'Wszystkie', u'http://www.nto.pl/rss.xml'), (u'Region', u'http://www.nto.pl/region.xml'), (u'Brzeg', u'http://www.nto.pl/brzeg.xml'), (u'G\u0142ubczyce', u'http://www.nto.pl/glubczyce.xml'), (u'K\u0119dzierzyn-Ko\u017ale', u'http://www.nto.pl/kedzierzynkozle.xml'), (u'Kluczbork', u'http://www.nto.pl/kluczbork.xml'), (u'Krapkowice', u'http://www.nto.pl/krapkowice.xml'), (u'Namys\u0142\xf3w', u'http://www.nto.pl/namyslow.xml'), (u'Nysa', u'http://www.nto.pl/nysa.xml'), (u'Olesno', u'http://www.nto.pl/olesno.xml'), (u'Opole', u'http://www.nto.pl/opole.xml'), (u'Prudnik', u'http://www.nto.pl/prudnik.xml'), (u'Strzelce Opolskie', u'http://www.nto.pl/strzelceopolskie.xml'), (u'Sport', u'http://www.nto.pl/sport.xml'), (u'Polska i \u015bwiat', u'http://www.nto.pl/apps/pbcs.dll/section?Category=RSS&channel=KRAJSWIAT'), (u'Zdrowy styl', u'http://www.nto.pl/apps/pbcs.dll/section?Category=rss_zdrowystyl'), (u'Reporta\u017c', u'http://www.nto.pl/reportaz.xml'), (u'Studia', u'http://www.nto.pl/akademicka.xml')]
+    keep_only_tags = [dict(id='article')]

     def get_cover_url(self):
         soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
         nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
@@ -36,33 +30,12 @@ class NTO(BasicNewsRecipe):
         self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
         return getattr(self, 'cover_url', self.cover_url)

-    def append_page(self, soup, appendtag):
-        tag = soup.find('span', attrs={'class':'photoNavigationPages'})
-        if tag:
-            number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
-            baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
-
-            for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
-                r.extract()
-            for nr in range(2, number+1):
-                soup2 = self.index_to_soup(baseurl + str(nr))
-                pagetext = soup2.find(id='photoContainer')
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                pagetext = soup2.find(attrs={'class':'photoMeta'})
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-                pagetext = soup2.find(attrs={'class':'photoStoryText'})
-                if pagetext:
-                    pos = len(appendtag.contents)
-                    appendtag.insert(pos, pagetext)
-        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
-        for comment in comments:
-            comment.extract()
-
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        return soup
+    def decode_feedportal_url(self, url):
+        link = url.rpartition('l/0L0S')[2][:-12]
+        replaces = (('0B', '.'), ('0C', '/'), ('0H', ','), ('0D', '?'), ('0F', '='), ('0A', '0'), ('0I', '_'))
+        for t in replaces:
+            link = link.replace(*t)
+        return 'http://' + link
+
+    def print_version(self, url):
+        return self.decode_feedportal_url(url) + '&Template=printpicart'

View File

@@ -17,6 +17,7 @@ class presseurop(BasicNewsRecipe):
     oldest_article = 7
     max_articles_per_feed = 100
     auto_cleanup = True
+    remove_empty_feeds = True
     feeds = [
         (u'Polityka', u'http://www.presseurop.eu/pl/taxonomy/term/1/%2A/feed'),

File diff suppressed because one or more lines are too long

View File

@@ -14,5 +14,5 @@ class Tablety_pl(BasicNewsRecipe):
     max_articles_per_feed = 100
     preprocess_regexps = [(re.compile(ur'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')]
     keep_only_tags = [dict(id='news_block')]
-    remove_tags=[dict(attrs={'class':['comments_icon', 'wp-polls', 'entry-comments', 'wp-polls-loading', 'ts-fab-wrapper', 'entry-footer']})]
+    remove_tags=[dict(attrs={'class':['comments_icon', 'wp-polls', 'entry-comments', 'wp-polls-loading', 'ts-fab-wrapper', 'entry-footer', 'social-custom']})]
     feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]