This commit is contained in:
Kovid Goyal 2016-10-16 15:43:49 +05:30
commit c080a08b16
4 changed files with 20 additions and 46 deletions

View File

@ -1,35 +0,0 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Archeowiesci(BasicNewsRecipe):
    """Recipe for the archeowiesci.pl news feed.

    Credentials are optional. When no username is configured, articles whose
    title contains 'subskrypcja' are dropped from the parsed feeds
    (presumably these are subscriber-only items -- confirm against the site).
    """

    title = u'Archeowieści'
    __author__ = 'fenuks'
    category = 'archeology'
    language = 'pl'
    description = u'Z pasją o przeszłości'
    cover_url = 'http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg'
    oldest_article = 7
    needs_subscription = 'optional'
    max_articles_per_feed = 100
    auto_cleanup = True
    remove_tags = [
        dict(name='span', attrs={'class': ['post-ratings', 'post-ratings-loading']}),
    ]
    feeds = [(u'Archeowieści', u'http://archeowiesci.pl/feed/')]

    def parse_feeds(self):
        """Return the base-class feeds, minus 'subskrypcja' articles when no
        username is set."""
        parsed = BasicNewsRecipe.parse_feeds(self)
        for feed in parsed:
            # Walk a snapshot so removing from feed.articles is safe.
            for entry in list(feed.articles):
                if self.username is None and 'subskrypcja' in entry.title:
                    feed.articles.remove(entry)
        return parsed

    def get_browser(self):
        """Return the base-class browser, logged in through the site's
        wp-login form when both username and password are available."""
        browser = BasicNewsRecipe.get_browser(self)
        if self.username is None or self.password is None:
            # No complete credentials: hand back the anonymous browser.
            return browser
        browser.open('http://archeowiesci.pl/wp-login.php')
        browser.select_form(name='loginform')
        browser['log'] = self.username
        browser['pwd'] = self.password
        browser.submit()
        return browser

View File

@ -20,8 +20,8 @@ class Ciekawostki_Historyczne(BasicNewsRecipe):
remove_empty_feeds = True
keep_only_tags = [dict(name='div', attrs={'class': 'post'})]
recursions = 5
remove_tags = [dict(id='singlepostinfo'), dict(
attrs={'class': ['books short floatRight', 'unprintable', 'booksTable', 'bawmrp']})]
remove_tags = [dict(id=['catapult-cookie-bar','header','footer','rightcolumn','singlepostinfo']), dict(
attrs={'class': ['ubm_banner','ciekawostki-slider-popular','books short floatRight', 'unprintable', 'booksTable', 'bawmrp']})]
feeds = [
(u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'),

View File

@ -11,14 +11,15 @@ from lxml import html
class GN(BasicNewsRecipe):
__author__ = 'Piotr Kontek, Tomasz Długosz'
title = u'Gość Niedzielny'
publisher = 'Wydawnictwo Kurii Metropolitalnej w Katowicach'
description = 'Ogólnopolski tygodnik katolicki - fragmenty artykułów z aktualnego numeru'
encoding = 'utf-8'
no_stylesheets = True
language = 'pl'
remove_javascript = True
masthead_url = 'http://m.gosc.pl/static/themes/czerwony_gosc-mobile/logo.png'
def find_last_issue(self):
raw = self.index_to_soup(
@ -30,15 +31,21 @@ class GN(BasicNewsRecipe):
return page[0]
def parse_index(self):
soup = self.index_to_soup('http://gosc.pl' + self.find_last_issue())
self.last_issue = self.find_last_issue()
soup = self.index_to_soup('http://gosc.pl' + self.last_issue)
self.cover_url = 'http://www.gosc.pl' + \
soup.find('div', attrs={'class': 'fl-w100 release-wp'}
).findAll('a')[-4].contents[0]['src']
feeds = []
# wstepniak
a = soup.find('div', attrs={'class': 'release-wp-b'}).find('a')
# editorial:
a = soup.find('div', attrs={'class': 'release-wp-b'})
art = a.find('a')
articles = [
{'title': self.tag_to_string(a),
'url': 'http://www.gosc.pl' + a['href']
{'title': self.tag_to_string(art),
'url': 'http://www.gosc.pl' + art['href'],
'description': self.tag_to_string(a.find('p', attrs={'class': 'b lead'}))
}]
feeds.append((u'Wstępniak', articles))
feeds.append((u'Na dobry początek', articles))
# kategorie
for addr in soup.findAll('a', attrs={'href': re.compile('kategoria')}):
if addr.string != u'wszystkie artyku\u0142y z tej kategorii \xbb':
@ -51,11 +58,13 @@ class GN(BasicNewsRecipe):
return feeds
# Generator: for each matching <div> found in main_block, yields a dict
# describing the article linked by the first <a> inside that div.
# NOTE(review): this span comes from a rendered diff with +/- markers and
# indentation stripped, so replaced and replacement lines appear back to back.
def find_articles(self, main_block):
# NOTE(review): presumably the pre-change selector, superseded by the next line.
for a in main_block.findAll('div', attrs={'class': ['prev_doc_n1 prev_doc_img21']}):
for a in main_block.findAll('div', attrs={'class': ['attachmentContent']}):
# The first anchor inside the div supplies both the title text and the href.
art = a.find('a')
yield {
'title': self.tag_to_string(art),
# NOTE(review): presumably the pre-change 'url' entry (no trailing comma);
# the line after it looks like its replacement.
'url': 'http://www.gosc.pl' + art['href']
# The href is appended to the site root, i.e. it is treated as site-relative.
'url': 'http://www.gosc.pl' + art['href'],
# Text of <b class="time"> with the literal 'DODANE' replaced by a space.
'date': self.tag_to_string(a.find('b', attrs={'class': 'time'})).replace('DODANE', ' '),
# Text of the <div class="txt"> element inside the matched div.
'description': self.tag_to_string(a.find('div', attrs={'class': 'txt'}))
}
def append_page(self, soup, appendtag):

Binary file not shown.

Before

Width:  |  Height:  |  Size: 331 B