From 298d7cec05cff111c13601c05ae5bd70279b58b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Sun, 16 Oct 2016 22:38:03 +0200 Subject: [PATCH] recipes: gosc_niedzielny - add content not assigned to columns --- recipes/gosc_niedzielny.recipe | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/recipes/gosc_niedzielny.recipe b/recipes/gosc_niedzielny.recipe index d2a2f80942..a76fa4e38e 100644 --- a/recipes/gosc_niedzielny.recipe +++ b/recipes/gosc_niedzielny.recipe @@ -19,6 +19,7 @@ class GN(BasicNewsRecipe): no_stylesheets = True language = 'pl' remove_javascript = True + ignore_duplicate_articles = {'url'} masthead_url = 'http://m.gosc.pl/static/themes/czerwony_gosc-mobile/logo.png' def find_last_issue(self): @@ -46,7 +47,7 @@ class GN(BasicNewsRecipe): 'description': self.tag_to_string(a.find('p', attrs={'class': 'b lead'})) }] feeds.append((u'Na dobry początek', articles)) - # kategorie + # columns: for addr in soup.findAll('a', attrs={'href': re.compile('kategoria')}): if addr.string != u'wszystkie artyku\u0142y z tej kategorii \xbb': main_block = self.index_to_soup( @@ -55,6 +56,20 @@ class GN(BasicNewsRecipe): if len(articles) > 0: section = addr.string feeds.append((section, articles)) + # not assigned content: + page = 1 + not_assigned = [] + while True: + soup = self.index_to_soup( + 'http://gosc.pl' + self.last_issue.replace('przeglad', 'wszystko') + '/' + str(page)) + articles = list(self.find_articles(soup)) + not_assigned.extend(articles) + page += 1 + pages = soup.find('span', attrs={'class': 'pgr_nrs'}) + if str(page) not in [self.tag_to_string(x)[1] for x in pages.findAll('a')]: + break + + feeds.append((u'Nieprzypisane', not_assigned)) return feeds def find_articles(self, main_block):