mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
recipe update
This commit is contained in:
parent
f38246d400
commit
eb8c47124f
@ -1,3 +1,4 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Android_com_pl(BasicNewsRecipe):
|
||||
@ -6,8 +7,9 @@ class Android_com_pl(BasicNewsRecipe):
|
||||
description = u'Android.com.pl - to największe w Polsce centrum Android OS. Znajdziesz tu: nowości, forum, pomoc, recenzje, gry, aplikacje.'
|
||||
category = 'Android, mobile'
|
||||
language = 'pl'
|
||||
use_embedded_content=True
|
||||
cover_url =u'http://android.com.pl/wp-content/themes/android/images/logo.png'
|
||||
use_embedded_content = True
|
||||
cover_url = 'http://android.com.pl/wp-content/themes/android/images/logo.png'
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
feeds = [(u'Android', u'http://android.com.pl/feed/')]
|
||||
preprocess_regexps = [(re.compile(ur'<p>.{,1}</p>', re.DOTALL), lambda match: '')]
|
||||
feeds = [(u'Android', u'http://android.com.pl/feed/')]
|
@ -10,13 +10,9 @@ class AstroNEWS(BasicNewsRecipe):
|
||||
#extra_css= 'table {text-align: left;}'
|
||||
no_stylesheets=True
|
||||
cover_url='http://news.astronet.pl/img/logo_news.jpg'
|
||||
remove_attributes = ['width', 'align']
|
||||
remove_tags=[dict(name='hr')]
|
||||
feeds = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')]
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('astronet.pl/', 'astronet.pl/print.cgi?')
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(align=True):
|
||||
del item['align']
|
||||
return soup
|
||||
|
@ -14,7 +14,6 @@ class BadaniaNet(BasicNewsRecipe):
|
||||
preprocess_regexps = [(re.compile(r"<h4>Tekst sponsoruje</h4>", re.IGNORECASE), lambda m: ''),]
|
||||
remove_empty_feeds = True
|
||||
use_embedded_content = False
|
||||
remove_tags = [dict(attrs={'class':['omc-flex-category', 'omc-comment-count', 'omc-single-tags']})]
|
||||
remove_tags_after = dict(attrs={'class':'omc-single-tags'})
|
||||
keep_only_tags = [dict(id='omc-full-article')]
|
||||
remove_tags = []
|
||||
keep_only_tags = [dict(name='article')]
|
||||
feeds = [(u'Psychologia', u'http://badania.net/category/psychologia/feed/'), (u'Technologie', u'http://badania.net/category/technologie/feed/'), (u'Biologia', u'http://badania.net/category/biologia/feed/'), (u'Chemia', u'http://badania.net/category/chemia/feed/'), (u'Zdrowie', u'http://badania.net/category/zdrowie/'), (u'Seks', u'http://badania.net/category/psychologia-ewolucyjna-tematyka-seks/feed/')]
|
@ -19,14 +19,16 @@ class cdrinfo(BasicNewsRecipe):
|
||||
no_stylesheets = True
|
||||
remove_empty_feeds = True
|
||||
remove_javascript = True
|
||||
remove_attributes = ['style']
|
||||
preprocess_regexps = [(re.compile(u'<p[^>]*?>Uprzejmie prosimy o przestrzeganie netykiety.+?www\.gravatar\.com</a>\.</p>', re.DOTALL), lambda match: '')]
|
||||
remove_attributes = ['style', 'onmouseover']
|
||||
preprocess_regexps = [(re.compile(u'<p[^>]*?>Uprzejmie prosimy o przestrzeganie netykiety.+?www\.gravatar\.com</a>\.</p>', re.DOTALL), lambda match: ''),
|
||||
(re.compile(u'<p[^>]*?>.{,2}</p>', re.DOTALL), lambda match: '')]
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
|
||||
keep_only_tags = [dict(name='input', attrs={'name':'ref'}), dict(id=['text', 'text2'])]
|
||||
remove_tags = [dict(attrs={'class':['navigation', 'sociable', 'last6news']}), dict(name='hr'), dict(id='respond')]
|
||||
remove_tags = [dict(attrs={'class':['navigation', 'sociable', 'last6news']}), dict(name=['hr', 'br']), dict(id='respond')]
|
||||
remove_tags_after = dict(id='artnawigacja')
|
||||
feeds = [(u'Wiadomości', 'http://feeds.feedburner.com/cdrinfo'), (u'Recenzje', 'http://www.cdrinfo.pl/rss/rss_recenzje.php'),
|
||||
feeds = [(u'Wiadomości', 'http://feeds.feedburner.com/cdrinfo'),
|
||||
(u'Recenzje', 'http://www.cdrinfo.pl/rss/rss_recenzje.php'),
|
||||
(u'Konsole', 'http://konsole.cdrinfo.pl/rss/rss_konsole_news.xml'),
|
||||
(u'Pliki', 'http://www.cdrinfo.pl/rss/rss_pliki.xml')
|
||||
]
|
||||
|
@ -14,31 +14,31 @@ class CGM(BasicNewsRecipe):
|
||||
remove_empty_feeds= True
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
extra_css = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;} h2 {color:black;}'
|
||||
extra_css = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;} h2 {color:black;} img {display: block;} ul.galleryImagesList {list-style: none;} li.item {float: left;} .calibrenavbar {clear: both;}'
|
||||
remove_tags_before=dict(id='mainContent')
|
||||
remove_tags_after=dict(name='div', attrs={'class':'fbContainer'})
|
||||
remove_tags=[dict(name='div', attrs={'class':['fbContainer', 'socials']}),
|
||||
dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}),
|
||||
dict(id=['movieShare', 'container'])]
|
||||
dict(id=['movieShare', 'container']), dict(name='br')]
|
||||
feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'), (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'),
|
||||
(u'Recenzje', u'http://www.cgm.pl/rss,1,news.xml')]
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
gallery=soup.find('div', attrs={'class':'galleryFlash'})
|
||||
if gallery:
|
||||
img=gallery.div
|
||||
gallery = soup.find('div', attrs={'class':'galleryFlash'})
|
||||
if gallery and gallery.div:
|
||||
img = gallery.div
|
||||
gallery.img.extract()
|
||||
if img:
|
||||
img=img['style']
|
||||
img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')]
|
||||
gallery.contents[1].name='img'
|
||||
gallery.contents[1]['src']=img
|
||||
img = img['style']
|
||||
img = 'http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')]
|
||||
gallery.contents[1].name = 'img'
|
||||
gallery.contents[1]['src'] = img
|
||||
pos = len(gallery.contents)
|
||||
gallery.insert(pos, BeautifulSoup('<br />'))
|
||||
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
ad=soup.findAll('a')
|
||||
ad = soup.findAll('a')
|
||||
for r in ad:
|
||||
if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']:
|
||||
r.extract()
|
||||
|
@ -16,7 +16,7 @@ class Dzieje(BasicNewsRecipe):
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
keep_only_tags = [dict(name='h1', attrs={'class':'title'}), dict(id='content-area')]
|
||||
remove_tags = [dict(attrs={'class':'field field-type-computed field-field-tagi'}), dict(id='dogory')]
|
||||
remove_tags = [dict(attrs={'class':'field field-type-computed field-field-tagi'}), dict(id='dogory'), dict(name='blockquote')]
|
||||
#feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
|
@ -18,22 +18,22 @@ class Dziennik_pl(BasicNewsRecipe):
|
||||
remove_javascript = True
|
||||
remove_empty_feeds = True
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
extra_css = 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
|
||||
extra_css = 'ul {list-style: none; padding: 0; margin: 0;} .foto {float: left;} .clr {clear: both;}'
|
||||
preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('<p><strong><a href=".*?">>>> CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')]
|
||||
keep_only_tags = [dict(id='article')]
|
||||
remove_tags = [dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})]
|
||||
remove_tags = [dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']}), dict(name='ins'), dict(name='br')]
|
||||
feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
|
||||
(u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
|
||||
(u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
|
||||
(u'Kobieta', u'http://rss.dziennik.pl/Dziennik-Kobieta'),
|
||||
(u'Auto', u'http://rss.dziennik.pl/Dziennik-Auto'),
|
||||
(u'Rozrywka', u'http://rss.dziennik.pl/Dziennik-Rozrywka'),
|
||||
(u'Film', u'http://rss.dziennik.pl/Dziennik-Film'),
|
||||
(u'Muzyka' , u'http://rss.dziennik.pl/Dziennik-Muzyka'),
|
||||
(u'Kultura', u'http://rss.dziennik.pl/Dziennik-Kultura'),
|
||||
(u'Nauka', u'http://rss.dziennik.pl/Dziennik-Nauka'),
|
||||
(u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
|
||||
(u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]
|
||||
(u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
|
||||
(u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
|
||||
(u'Kobieta', u'http://rss.dziennik.pl/Dziennik-Kobieta'),
|
||||
(u'Auto', u'http://rss.dziennik.pl/Dziennik-Auto'),
|
||||
(u'Rozrywka', u'http://rss.dziennik.pl/Dziennik-Rozrywka'),
|
||||
(u'Film', u'http://rss.dziennik.pl/Dziennik-Film'),
|
||||
(u'Muzyka' , u'http://rss.dziennik.pl/Dziennik-Muzyka'),
|
||||
(u'Kultura', u'http://rss.dziennik.pl/Dziennik-Kultura'),
|
||||
(u'Nauka', u'http://rss.dziennik.pl/Dziennik-Nauka'),
|
||||
(u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
|
||||
(u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]
|
||||
|
||||
def skip_ad_pages(self, soup):
|
||||
tag = soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'})
|
||||
|
@ -25,7 +25,7 @@ class EchoDnia(BasicNewsRecipe):
|
||||
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
|
||||
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
|
||||
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
|
||||
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
|
||||
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline', 'articleZoomText']),
|
||||
dict(attrs={'class':'articleFunctions'})]
|
||||
|
||||
feeds = [(u'Wszystkie', u'http://www.echodnia.eu/rss.xml'),
|
||||
|
@ -7,11 +7,11 @@ class Fotoblogia_pl(BasicNewsRecipe):
|
||||
category = 'photography'
|
||||
language = 'pl'
|
||||
masthead_url = 'http://img.interia.pl/komputery/nimg/u/0/fotoblogia21.jpg'
|
||||
cover_url= 'http://fotoblogia.pl/images/2009/03/fotoblogia2.jpg'
|
||||
cover_url = 'http://fotoblogia.pl/images/2009/03/fotoblogia2.jpg'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
keep_only_tags=[dict(name='div', attrs={'class':['post-view post-standard', 'photo-container']})]
|
||||
remove_tags=[dict(attrs={'class':['external fotoblogia', 'categories', 'tags']})]
|
||||
keep_only_tags = [dict(name='article')]
|
||||
remove_tags = [dict(attrs={'class':'article-related'})]
|
||||
feeds = [(u'Wszystko', u'http://fotoblogia.pl/feed/rss2')]
|
||||
|
@ -16,19 +16,12 @@ class GazetaLubuska(BasicNewsRecipe):
|
||||
max_articles_per_feed = 100
|
||||
remove_empty_feeds = True
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
|
||||
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
|
||||
|
||||
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
|
||||
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
|
||||
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
|
||||
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
|
||||
dict(attrs={'class':'articleFunctions'})]
|
||||
|
||||
feeds = [(u'Wszystkie', u'http://www.gazetalubuska.pl/rss.xml'), (u'Dreznenko', u'http://www.gazetalubuska.pl/drezdenko.xml'), (u'G\u0142og\xf3w', u'http://www.gazetalubuska.pl/glogow.xml'), (u'Gorz\xf3w Wielkopolski', u'http://www.gazetalubuska.pl/gorzow-wielkopolski.xml'), (u'Gubin', u'http://www.gazetalubuska.pl/gubin.xml'), (u'Kostrzyn', u'http://www.gazetalubuska.pl/kostrzyn.xml'), (u'Krosno Odrza\u0144skie', u'http://www.gazetalubuska.pl/krosno-odrzanskie.xml'), (u'Lubsko', u'http://www.gazetalubuska.pl/lubsko.xml'), (u'Mi\u0119dzych\xf3d', u'http://www.gazetalubuska.pl/miedzychod.xml'), (u'Mi\u0119dzyrzecz', u'http://www.gazetalubuska.pl/miedzyrzecz.xml'), (u'Nowa S\xf3l', u'http://www.gazetalubuska.pl/nowa-sol.xml'), (u'S\u0142ubice', u'http://www.gazetalubuska.pl/slubice.xml'), (u'Strzelce Kraje\u0144skie', u'http://www.gazetalubuska.pl/strzelce-krajenskie.xml'), (u'Sulech\xf3w', u'http://www.gazetalubuska.pl/sulechow.xml'), (u'Sul\u0119cin', u'http://www.gazetalubuska.pl/sulecin.xml'), (u'\u015awi\u0119bodzin', u'http://www.gazetalubuska.pl/swiebodzin.xml'), (u'Wolsztyn', u'http://www.gazetalubuska.pl/wolsztyn.xml'), (u'Wschowa', u'http://www.gazetalubuska.pl/wschowa.xml'), (u'Zielona G\xf3ra', u'http://www.gazetalubuska.pl/zielona-gora.xml'), (u'\u017baga\u0144', u'http://www.gazetalubuska.pl/zagan.xml'), (u'\u017bary', u'http://www.gazetalubuska.pl/zary.xml'), (u'Sport', u'http://www.gazetalubuska.pl/sport.xml'), (u'Auto', u'http://www.gazetalubuska.pl/auto.xml'), (u'Dom', u'http://www.gazetalubuska.pl/dom.xml'), (u'Praca', u'http://www.gazetalubuska.pl/praca.xml'), (u'Zdrowie', u'http://www.gazetalubuska.pl/zdrowie.xml')]
|
||||
|
||||
keep_only_tags = [dict(id='article')]
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
|
||||
@ -37,33 +30,12 @@ class GazetaLubuska(BasicNewsRecipe):
|
||||
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
|
||||
if tag:
|
||||
number = int(tag.string.rpartition('/')[-1].replace(' ', ''))
|
||||
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
|
||||
def decode_feedportal_url(self, url):
|
||||
link = url.rpartition('l/0L0S')[2][:-12]
|
||||
replaces = (('0B', '.'), ('0C', '/'), ('0H', ','), ('0D', '?'), ('0F', '='), ('0A', '0'), ('0I', '_'))
|
||||
for t in replaces:
|
||||
link = link.replace(*t)
|
||||
return 'http://' + link
|
||||
|
||||
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
|
||||
r.extract()
|
||||
for nr in range(2, number+1):
|
||||
soup2 = self.index_to_soup(baseurl + str(nr))
|
||||
pagetext = soup2.find(id='photoContainer')
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoMeta'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoStoryText'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
|
||||
comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
|
||||
for comment in comments:
|
||||
comment.extract()
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
return soup
|
||||
def print_version(self, url):
|
||||
return self.decode_feedportal_url(url) + '&Template=printpicart'
|
@ -16,17 +16,9 @@ class GazetaPomorska(BasicNewsRecipe):
|
||||
max_articles_per_feed = 100
|
||||
remove_empty_feeds = True
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
|
||||
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
|
||||
|
||||
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
|
||||
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
|
||||
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
|
||||
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
|
||||
dict(attrs={'class':'articleFunctions'})]
|
||||
|
||||
feeds = [(u'Wszystkie', u'http://www.pomorska.pl/rss.xml'),
|
||||
(u'Region', u'http://www.pomorska.pl/region.xml'),
|
||||
(u'Bydgoszcz', u'http://www.pomorska.pl/bydgoszcz.xml'),
|
||||
@ -57,6 +49,8 @@ class GazetaPomorska(BasicNewsRecipe):
|
||||
#(u'Reporta\u017c', u'http://www.pomorska.pl/reportaz.xml'),
|
||||
(u'Gospodarka', u'http://www.pomorska.pl/gospodarka.xml')]
|
||||
|
||||
keep_only_tags = [dict(id='article')]
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
|
||||
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
|
||||
@ -64,33 +58,12 @@ class GazetaPomorska(BasicNewsRecipe):
|
||||
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
|
||||
if tag:
|
||||
number = int(tag.string.rpartition('/')[-1].replace(' ', ''))
|
||||
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
|
||||
def decode_feedportal_url(self, url):
|
||||
link = url.rpartition('l/0L0S')[2][:-12]
|
||||
replaces = (('0B', '.'), ('0C', '/'), ('0H', ','), ('0D', '?'), ('0F', '='), ('0A', '0'), ('0I', '_'))
|
||||
for t in replaces:
|
||||
link = link.replace(*t)
|
||||
return 'http://' + link
|
||||
|
||||
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
|
||||
r.extract()
|
||||
for nr in range(2, number+1):
|
||||
soup2 = self.index_to_soup(baseurl + str(nr))
|
||||
pagetext = soup2.find(id='photoContainer')
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoMeta'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoStoryText'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
|
||||
comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
|
||||
for comment in comments:
|
||||
comment.extract()
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
return soup
|
||||
def print_version(self, url):
|
||||
return self.decode_feedportal_url(url) + '&Template=printpicart'
|
||||
|
@ -16,19 +16,13 @@ class GazetaWspolczesna(BasicNewsRecipe):
|
||||
max_articles_per_feed = 100
|
||||
remove_empty_feeds = True
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
|
||||
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
|
||||
|
||||
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
|
||||
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
|
||||
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
|
||||
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
|
||||
dict(attrs={'class':'articleFunctions'})]
|
||||
|
||||
feeds = [(u'Wszystkie', u'http://www.wspolczesna.pl/rss.xml'), (u'August\xf3w', u'http://www.wspolczesna.pl/augustow.xml'), (u'Bia\u0142ystok', u'http://www.wspolczesna.pl/bialystok.xml'), (u'Bielsk Podlaski', u'http://www.wspolczesna.pl/bielsk.xml'), (u'E\u0142k', u'http://www.wspolczesna.pl/elk.xml'), (u'Grajewo', u'http://www.wspolczesna.pl/grajewo.xml'), (u'Go\u0142dap', u'http://www.wspolczesna.pl/goldap.xml'), (u'Hajn\xf3wka', u'http://www.wspolczesna.pl/hajnowka.xml'), (u'Kolno', u'http://www.wspolczesna.pl/kolno.xml'), (u'\u0141om\u017ca', u'http://www.wspolczesna.pl/lomza.xml'), (u'Mo\u0144ki', u'http://www.wspolczesna.pl/monki.xml'), (u'Olecko', u'http://www.wspolczesna.pl/olecko.xml'), (u'Ostro\u0142\u0119ka', u'http://www.wspolczesna.pl/ostroleka.xml'), (u'Powiat Bia\u0142ostocki', u'http://www.wspolczesna.pl/powiat.xml'), (u'Sejny', u'http://www.wspolczesna.pl/sejny.xml'), (u'Siemiatycze', u'http://www.wspolczesna.pl/siemiatycze.xml'), (u'Sok\xf3\u0142ka', u'http://www.wspolczesna.pl/sokolka.xml'), (u'Suwa\u0142ki', u'http://www.wspolczesna.pl/suwalki.xml'), (u'Wysokie Mazowieckie', u'http://www.wspolczesna.pl/wysokie.xml'), (u'Zambr\xf3w', u'http://www.wspolczesna.pl/zambrow.xml'), (u'Sport', u'http://www.wspolczesna.pl/sport.xml'), (u'Praca', u'http://www.wspolczesna.pl/praca.xml'), (u'Dom', u'http://www.wspolczesna.pl/dom.xml'), (u'Auto', u'http://www.wspolczesna.pl/auto.xml'), (u'Zdrowie', u'http://www.wspolczesna.pl/zdrowie.xml')]
|
||||
|
||||
keep_only_tags = [dict(id='article')]
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
|
||||
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
|
||||
@ -36,33 +30,12 @@ class GazetaWspolczesna(BasicNewsRecipe):
|
||||
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
|
||||
if tag:
|
||||
number = int(tag.string.rpartition('/')[-1].replace(' ', ''))
|
||||
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
|
||||
def decode_feedportal_url(self, url):
|
||||
link = url.rpartition('l/0L0S')[2][:-12]
|
||||
replaces = (('0B', '.'), ('0C', '/'), ('0H', ','), ('0D', '?'), ('0F', '='), ('0A', '0'), ('0I', '_'))
|
||||
for t in replaces:
|
||||
link = link.replace(*t)
|
||||
return 'http://' + link
|
||||
|
||||
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
|
||||
r.extract()
|
||||
for nr in range(2, number+1):
|
||||
soup2 = self.index_to_soup(baseurl + str(nr))
|
||||
pagetext = soup2.find(id='photoContainer')
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoMeta'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoStoryText'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
|
||||
comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
|
||||
for comment in comments:
|
||||
comment.extract()
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
return soup
|
||||
def print_version(self, url):
|
||||
return self.decode_feedportal_url(url) + '&Template=printpicart'
|
||||
|
@ -99,9 +99,8 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html')
|
||||
cover = soup.find(id='GWmini2')
|
||||
soup = self.index_to_soup('http://wyborcza.pl/' + cover.contents[3].a['href'])
|
||||
self.cover_url = 'http://wyborcza.pl' + soup.img['src']
|
||||
cover = soup.find(attrs={'class':'gallerycontent'})
|
||||
self.cover_url = cover.ul.li.a.img['src'].replace('P.jpg', '.jpg')
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
def image_url_processor(self, baseurl, url):
|
||||
|
@ -18,14 +18,7 @@ class GCN(BasicNewsRecipe):
|
||||
no_stylesheets = True
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
remove_attributes = ['style']
|
||||
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
|
||||
|
||||
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
|
||||
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
|
||||
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
|
||||
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
|
||||
dict(attrs={'class':'articleFunctions'})]
|
||||
use_embedded_content = False
|
||||
|
||||
feeds = [(u'Wszystkie', u'http://www.nowiny24.pl/rss.xml'),
|
||||
(u'Podkarpacie', u'http://www.nowiny24.pl/podkarpacie.xml'),
|
||||
@ -49,6 +42,8 @@ class GCN(BasicNewsRecipe):
|
||||
(u'Zdrowie', u'http://www.nowiny24.pl/zdrowie.xml'),
|
||||
(u'Wywiady', u'http://www.nowiny24.pl/wywiady.xml')]
|
||||
|
||||
keep_only_tags = [dict(id='article')]
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
|
||||
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
|
||||
@ -56,33 +51,12 @@ class GCN(BasicNewsRecipe):
|
||||
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
|
||||
if tag:
|
||||
number = int(tag.string.rpartition('/')[-1].replace(' ', ''))
|
||||
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
|
||||
def decode_feedportal_url(self, url):
|
||||
link = url.rpartition('l/0L0S')[2][:-12]
|
||||
replaces = (('0B', '.'), ('0C', '/'), ('0H', ','), ('0D', '?'), ('0F', '='), ('0A', '0'), ('0I', '_'))
|
||||
for t in replaces:
|
||||
link = link.replace(*t)
|
||||
return 'http://' + link
|
||||
|
||||
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
|
||||
r.extract()
|
||||
for nr in range(2, number+1):
|
||||
soup2 = self.index_to_soup(baseurl + str(nr))
|
||||
pagetext = soup2.find(id='photoContainer')
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoMeta'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoStoryText'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
|
||||
comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
|
||||
for comment in comments:
|
||||
comment.extract()
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
return soup
|
||||
def print_version(self, url):
|
||||
return self.decode_feedportal_url(url) + '&Template=printpicart'
|
@ -16,7 +16,7 @@ class Gram_pl(BasicNewsRecipe):
|
||||
#extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
|
||||
cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
|
||||
keep_only_tags= [dict(id='articleModule')]
|
||||
remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter', 'twitter-share-button']}), dict(name='aside')]
|
||||
remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter', 'twitter-share-button']}), dict(name='aside'), dict(id='metaColumn')]
|
||||
feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
|
||||
(u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')
|
||||
]
|
||||
|
@ -15,7 +15,8 @@ class GryOnlinePl(BasicNewsRecipe):
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
keep_only_tags = [dict(name='div', attrs={'class':['gc660', 'gc660 S013', 'news_endpage_tit', 'news_container', 'news']})]
|
||||
remove_tags = [dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'S013-npb', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})]
|
||||
remove_tags = [dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'S013-npb', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2',
|
||||
'twitter-share-button']})]
|
||||
feeds = [
|
||||
(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'),
|
||||
('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')]
|
||||
@ -44,7 +45,7 @@ class GryOnlinePl(BasicNewsRecipe):
|
||||
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}):
|
||||
for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry', 'imh10b']}):
|
||||
r.extract()
|
||||
comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
|
||||
for comment in comments:
|
||||
@ -80,7 +81,7 @@ class GryOnlinePl(BasicNewsRecipe):
|
||||
[comment.extract() for comment in comments]
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry', 'S018strony']}):
|
||||
for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry', 'S018strony', 'imh10b']}):
|
||||
r.extract()
|
||||
comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
|
||||
for comment in comments:
|
||||
|
@ -30,4 +30,7 @@ class Kosmonauta(BasicNewsRecipe):
|
||||
href = a['href']
|
||||
if not href.startswith('http'):
|
||||
a['href'] = self.INDEX + href
|
||||
for a in soup.findAll(name='img'):
|
||||
if a.has_key('style') and 'float:' in a['style']:
|
||||
a['class'] = 'thumb-left'
|
||||
return soup
|
@ -17,16 +17,7 @@ class KurierPoranny(BasicNewsRecipe):
|
||||
remove_empty_feeds = True
|
||||
no_stylesheets = True
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
|
||||
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
|
||||
|
||||
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
|
||||
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
|
||||
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
|
||||
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
|
||||
dict(attrs={'class':'articleFunctions'})]
|
||||
|
||||
use_embedded_content = False
|
||||
|
||||
feeds = [(u'Wszystkie', u'http://www.poranny.pl/rss.xml'),
|
||||
(u'Białystok', u'http://www.poranny.pl/bialystok.xml'),
|
||||
@ -44,6 +35,8 @@ class KurierPoranny(BasicNewsRecipe):
|
||||
(u'Auto', u'http://www.poranny.pl/auto.xml'),
|
||||
(u'Polityka', u'http://www.poranny.pl/polityka.xml')]
|
||||
|
||||
keep_only_tags = [dict(id='article')]
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
|
||||
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
|
||||
@ -51,34 +44,12 @@ class KurierPoranny(BasicNewsRecipe):
|
||||
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
|
||||
if tag:
|
||||
number = int(tag.string.rpartition('/')[-1].replace(' ', ''))
|
||||
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
|
||||
def decode_feedportal_url(self, url):
|
||||
link = url.rpartition('l/0L0S')[2][:-12]
|
||||
replaces = (('0B', '.'), ('0C', '/'), ('0H', ','), ('0D', '?'), ('0F', '='), ('0A', '0'), ('0I', '_'))
|
||||
for t in replaces:
|
||||
link = link.replace(*t)
|
||||
return 'http://' + link
|
||||
|
||||
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
|
||||
r.extract()
|
||||
for nr in range(2, number+1):
|
||||
soup2 = self.index_to_soup(baseurl + str(nr))
|
||||
pagetext = soup2.find(id='photoContainer')
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoMeta'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoStoryText'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
|
||||
comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
|
||||
for comment in comments:
|
||||
comment.extract()
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
return soup
|
||||
def print_version(self, url):
|
||||
return self.decode_feedportal_url(url) + '&Template=printpicart'
|
||||
|
@ -12,6 +12,7 @@ class media2_pl(BasicNewsRecipe):
|
||||
description = u'Media2.pl to jeden z najczęściej odwiedzanych serwisów dla profesjonalistów z branży medialnej, telekomunikacyjnej, public relations oraz nowych technologii.'
|
||||
masthead_url = 'http://media2.pl/res/logo/www.png'
|
||||
cover_url = 'http://media2.pl/res/logo/www.png'
|
||||
INDEX = 'http://media2.pl'
|
||||
remove_empty_feeds = True
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
@ -22,10 +23,16 @@ class media2_pl(BasicNewsRecipe):
|
||||
extra_css = '''.news-lead{font-weight: bold; }'''
|
||||
|
||||
keep_only_tags = [dict(name = 'div', attrs = {'class' : 'news-item tpl-big'})]
|
||||
remove_tags = [dict(name = 'span', attrs = {'class' : 'news-comments'}), dict(name = 'div', attrs = {'class' : 'item-sidebar'}), dict(name = 'div', attrs = {'class' : 'news-tags'})]
|
||||
remove_tags = [dict(name = 'span', attrs = {'class' : 'news-comments'}), dict(name = 'div', attrs = {'class' : ['item-sidebar', 'news-inline-promo nobbtext']}),
|
||||
dict(name = 'div', attrs = {'class' : 'news-tags'})]
|
||||
|
||||
feeds = [(u'Media2', u'http://feeds.feedburner.com/media2'), (u'Internet', u'http://feeds.feedburner.com/media2/internet'),
|
||||
(u'Media', 'http://feeds.feedburner.com/media2/media'), (u'Telekomunikacja', 'http://feeds.feedburner.com/media2/telekomunikacja'),
|
||||
(u'Reklama/PR', 'http://feeds.feedburner.com/media2/reklama-pr'), (u'Technologie', 'http://feeds.feedburner.com/media2/technologie'),
|
||||
(u'Badania', 'http://feeds.feedburner.com/media2/badania')
|
||||
]
|
||||
]
|
||||
|
||||
def image_url_processor(self, baseurl, url):
|
||||
if url[0] == '/':
|
||||
url = self.INDEX + url
|
||||
return url
|
@ -17,18 +17,12 @@ class NTO(BasicNewsRecipe):
|
||||
remove_empty_feeds = True
|
||||
no_stylesheets = True
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
|
||||
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
|
||||
|
||||
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
|
||||
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
|
||||
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
|
||||
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
|
||||
dict(attrs={'class':'articleFunctions'})]
|
||||
use_embedded_content = False
|
||||
|
||||
feeds = [(u'Wszystkie', u'http://www.nto.pl/rss.xml'), (u'Region', u'http://www.nto.pl/region.xml'), (u'Brzeg', u'http://www.nto.pl/brzeg.xml'), (u'G\u0142ubczyce', u'http://www.nto.pl/glubczyce.xml'), (u'K\u0119dzierzyn-Ko\u017ale', u'http://www.nto.pl/kedzierzynkozle.xml'), (u'Kluczbork', u'http://www.nto.pl/kluczbork.xml'), (u'Krapkowice', u'http://www.nto.pl/krapkowice.xml'), (u'Namys\u0142\xf3w', u'http://www.nto.pl/namyslow.xml'), (u'Nysa', u'http://www.nto.pl/nysa.xml'), (u'Olesno', u'http://www.nto.pl/olesno.xml'), (u'Opole', u'http://www.nto.pl/opole.xml'), (u'Prudnik', u'http://www.nto.pl/prudnik.xml'), (u'Strzelce Opolskie', u'http://www.nto.pl/strzelceopolskie.xml'), (u'Sport', u'http://www.nto.pl/sport.xml'), (u'Polska i \u015bwiat', u'http://www.nto.pl/apps/pbcs.dll/section?Category=RSS&channel=KRAJSWIAT'), (u'Zdrowy styl', u'http://www.nto.pl/apps/pbcs.dll/section?Category=rss_zdrowystyl'), (u'Reporta\u017c', u'http://www.nto.pl/reportaz.xml'), (u'Studia', u'http://www.nto.pl/akademicka.xml')]
|
||||
|
||||
keep_only_tags = [dict(id='article')]
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
|
||||
nexturl = self.INDEX + soup.find(id='covers').find('a')['href']
|
||||
@ -36,33 +30,12 @@ class NTO(BasicNewsRecipe):
|
||||
self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
|
||||
if tag:
|
||||
number = int(tag.string.rpartition('/')[-1].replace(' ', ''))
|
||||
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
|
||||
def decode_feedportal_url(self, url):
|
||||
link = url.rpartition('l/0L0S')[2][:-12]
|
||||
replaces = (('0B', '.'), ('0C', '/'), ('0H', ','), ('0D', '?'), ('0F', '='), ('0A', '0'), ('0I', '_'))
|
||||
for t in replaces:
|
||||
link = link.replace(*t)
|
||||
return 'http://' + link
|
||||
|
||||
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
|
||||
r.extract()
|
||||
for nr in range(2, number+1):
|
||||
soup2 = self.index_to_soup(baseurl + str(nr))
|
||||
pagetext = soup2.find(id='photoContainer')
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoMeta'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
pagetext = soup2.find(attrs={'class':'photoStoryText'})
|
||||
if pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
|
||||
comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
|
||||
for comment in comments:
|
||||
comment.extract()
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
return soup
|
||||
def print_version(self, url):
|
||||
return self.decode_feedportal_url(url) + '&Template=printpicart'
|
||||
|
@ -17,6 +17,7 @@ class presseurop(BasicNewsRecipe):
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
auto_cleanup = True
|
||||
remove_empty_feeds = True
|
||||
|
||||
feeds = [
|
||||
(u'Polityka', u'http://www.presseurop.eu/pl/taxonomy/term/1/%2A/feed'),
|
||||
|
File diff suppressed because one or more lines are too long
@ -14,5 +14,5 @@ class Tablety_pl(BasicNewsRecipe):
|
||||
max_articles_per_feed = 100
|
||||
preprocess_regexps = [(re.compile(ur'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')]
|
||||
keep_only_tags = [dict(id='news_block')]
|
||||
remove_tags=[dict(attrs={'class':['comments_icon', 'wp-polls', 'entry-comments', 'wp-polls-loading', 'ts-fab-wrapper', 'entry-footer']})]
|
||||
remove_tags=[dict(attrs={'class':['comments_icon', 'wp-polls', 'entry-comments', 'wp-polls-loading', 'ts-fab-wrapper', 'entry-footer', 'social-custom']})]
|
||||
feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]
|
Loading…
x
Reference in New Issue
Block a user