From 0aa65bc9de80852c2325ca072e0dacd8c2ec9f34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Sat, 15 Oct 2016 00:11:52 +0200 Subject: [PATCH 1/3] recipes: add cover, masthead, descriptions and dates in section menus for gosc_niedzielny --- recipes/gosc_niedzielny.recipe | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/recipes/gosc_niedzielny.recipe b/recipes/gosc_niedzielny.recipe index f736562353..d2a2f80942 100644 --- a/recipes/gosc_niedzielny.recipe +++ b/recipes/gosc_niedzielny.recipe @@ -11,14 +11,15 @@ from lxml import html class GN(BasicNewsRecipe): - __author__ = 'Piotr Kontek, Tomasz Długosz' title = u'Gość Niedzielny' + publisher = 'Wydawnictwo Kurii Metropolitalnej w Katowicach' description = 'Ogólnopolski tygodnik katolicki - fragmenty artykułów z aktualnego numeru' encoding = 'utf-8' no_stylesheets = True language = 'pl' remove_javascript = True + masthead_url = 'http://m.gosc.pl/static/themes/czerwony_gosc-mobile/logo.png' def find_last_issue(self): raw = self.index_to_soup( @@ -30,15 +31,21 @@ class GN(BasicNewsRecipe): return page[0] def parse_index(self): - soup = self.index_to_soup('http://gosc.pl' + self.find_last_issue()) + self.last_issue = self.find_last_issue() + soup = self.index_to_soup('http://gosc.pl' + self.last_issue) + self.cover_url = 'http://www.gosc.pl' + \ + soup.find('div', attrs={'class': 'fl-w100 release-wp'} + ).findAll('a')[-4].contents[0]['src'] feeds = [] - # wstepniak - a = soup.find('div', attrs={'class': 'release-wp-b'}).find('a') + # editorial: + a = soup.find('div', attrs={'class': 'release-wp-b'}) + art = a.find('a') articles = [ - {'title': self.tag_to_string(a), - 'url': 'http://www.gosc.pl' + a['href'] + {'title': self.tag_to_string(art), + 'url': 'http://www.gosc.pl' + art['href'], + 'description': self.tag_to_string(a.find('p', attrs={'class': 'b lead'})) }] - feeds.append((u'Wstępniak', articles)) + feeds.append((u'Na dobry początek', 
articles)) # kategorie for addr in soup.findAll('a', attrs={'href': re.compile('kategoria')}): if addr.string != u'wszystkie artyku\u0142y z tej kategorii \xbb': @@ -51,11 +58,13 @@ class GN(BasicNewsRecipe): return feeds def find_articles(self, main_block): - for a in main_block.findAll('div', attrs={'class': ['prev_doc_n1 prev_doc_img21']}): + for a in main_block.findAll('div', attrs={'class': ['attachmentContent']}): art = a.find('a') yield { 'title': self.tag_to_string(art), - 'url': 'http://www.gosc.pl' + art['href'] + 'url': 'http://www.gosc.pl' + art['href'], + 'date': self.tag_to_string(a.find('b', attrs={'class': 'time'})).replace('DODANE', ' '), + 'description': self.tag_to_string(a.find('div', attrs={'class': 'txt'})) } def append_page(self, soup, appendtag): From f279dc923a38dd42033bbee2a9916d7e38bd2cfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Sat, 15 Oct 2016 20:47:13 +0200 Subject: [PATCH 2/3] recipes: remove archeowiesci as it is suspended by the author - http://archeowiesci.pl/2016/09/12/archeowiesci-biora-przerwe/ --- recipes/archeowiesci.recipe | 35 --------------------------------- recipes/icons/archeowiesci.png | Bin 331 -> 0 bytes 2 files changed, 35 deletions(-) delete mode 100644 recipes/archeowiesci.recipe delete mode 100644 recipes/icons/archeowiesci.png diff --git a/recipes/archeowiesci.recipe b/recipes/archeowiesci.recipe deleted file mode 100644 index 43517e3465..0000000000 --- a/recipes/archeowiesci.recipe +++ /dev/null @@ -1,35 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - - -class Archeowiesci(BasicNewsRecipe): - title = u'Archeowieści' - __author__ = 'fenuks' - category = 'archeology' - language = 'pl' - description = u'Z pasją o przeszłości' - cover_url = 'http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg' - oldest_article = 7 - needs_subscription = 'optional' - max_articles_per_feed = 100 - auto_cleanup = True - remove_tags = [ - dict(name='span', attrs={'class': 
['post-ratings', 'post-ratings-loading']})] - feeds = [(u'Archeowieści', u'http://archeowiesci.pl/feed/')] - - def parse_feeds(self): - feeds = BasicNewsRecipe.parse_feeds(self) - for feed in feeds: - for article in feed.articles[:]: - if self.username is None and 'subskrypcja' in article.title: - feed.articles.remove(article) - return feeds - - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - if self.username is not None and self.password is not None: - br.open('http://archeowiesci.pl/wp-login.php') - br.select_form(name='loginform') - br['log'] = self.username - br['pwd'] = self.password - br.submit() - return br diff --git a/recipes/icons/archeowiesci.png b/recipes/icons/archeowiesci.png deleted file mode 100644 index ac7f45a99ad394e4f5ad99eff5f758aa8f1f9769..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 331 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!63?wyl`GbK}X@F0ND<>Baa{mASUqe+zOHEZv zO-)BbT~ABXKu6nHPuE0W&(uKA)X>1($iTwb$jZdn%GB7})Wp`p+`-Dy$;R5n&eqMr z&dtHz!_mRh$uT0pFD4`?AtJ1$qO7B)s;8l*r=f25?%gL(p1gVU=8G3EzJ2=!wCkKM zJIH0sB|(0{zP`Ta`>tsKxlNufjv*3~p$855nhiu;9J7NR@;O@z{x4prv8+$}_U}jj z(XMA^jO@qt0#8U8Ma0Bx@K&>cFZ?mq3rRCQXv8l3+L^za`ef4W3AG3 z?1_55hw_QwCYFr<`tF9TQR8z-d Date: Sat, 15 Oct 2016 21:41:03 +0200 Subject: [PATCH 3/3] recipes: remove most of garbage tags from ciekawostki_historyczne --- recipes/ciekawostki_historyczne.recipe | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/ciekawostki_historyczne.recipe b/recipes/ciekawostki_historyczne.recipe index e77388c28f..a62b383833 100644 --- a/recipes/ciekawostki_historyczne.recipe +++ b/recipes/ciekawostki_historyczne.recipe @@ -20,8 +20,8 @@ class Ciekawostki_Historyczne(BasicNewsRecipe): remove_empty_feeds = True keep_only_tags = [dict(name='div', attrs={'class': 'post'})] recursions = 5 - remove_tags = [dict(id='singlepostinfo'), dict( - attrs={'class': ['books short floatRight', 'unprintable', 'booksTable', 'bawmrp']})] + remove_tags = 
[dict(id=['catapult-cookie-bar','header','footer','rightcolumn','singlepostinfo']), dict( + attrs={'class': ['ubm_banner','ciekawostki-slider-popular','books short floatRight', 'unprintable', 'booksTable', 'bawmrp']})] feeds = [ (u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'),