diff --git a/recipes/gosc_full.recipe b/recipes/gosc_full.recipe index aee4e3a394..976b65b11d 100644 --- a/recipes/gosc_full.recipe +++ b/recipes/gosc_full.recipe @@ -3,7 +3,7 @@ from __future__ import unicode_literals __license__ = 'GPL v3' __copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com \ - 2013-2014, Tomasz Długosz, tomek3d@gmail.com' + 2013-2016, Tomasz Długosz, tomek3d@gmail.com' from calibre.web.feeds.news import BasicNewsRecipe import re @@ -20,11 +20,12 @@ class GN(BasicNewsRecipe): no_stylesheets = True language = 'pl' remove_javascript = True + ignore_duplicate_articles = {'url'} masthead_url = 'http://m.gosc.pl/static/themes/czerwony_gosc-mobile/logo.png' def find_last_issue(self): raw = self.index_to_soup( - 'http://m.gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/', raw=True) + 'http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/', raw=True) doc = html.fromstring(raw) page = doc.xpath( '//div[@class="search-result release-result"]/div[1]/div[1]/a/@href') @@ -41,7 +42,6 @@ class GN(BasicNewsRecipe): soup.find('div', attrs={'class': 'fl-w100 release-wp'} ).findAll('a')[-4].contents[0]['src'] feeds = [] - enlisted = [] # editorial: a = soup.find('div', attrs={'class': 'release-wp-b'}) art = a.find('a') @@ -51,7 +51,6 @@ class GN(BasicNewsRecipe): 'description': self.tag_to_string(a.find('p', attrs={'class': 'b lead'})) }] feeds.append((u'Na dobry początek', articles)) - enlisted.append(articles[0].get('url')) # columns: for addr in soup.findAll('a', attrs={'href': re.compile('kategoria')}): if addr.string != u'wszystkie artyku\u0142y z tej kategorii \xbb': @@ -61,8 +60,6 @@ class GN(BasicNewsRecipe): if len(articles) > 0: section = addr.string feeds.append((section, articles)) - enlisted.extend(list(article.get('url') - for article in articles)) # not assigned content: page = 1 not_assigned = [] @@ -70,24 +67,23 @@ class GN(BasicNewsRecipe): soup = self.index_to_soup( 'http://gosc.pl' + self.last_issue.replace('przeglad', 'wszystko') + '/' + str(page)) articles = list(self.find_articles(soup)) - not_assigned.extend( - [x for x in articles if x.get('url') not in enlisted]) + not_assigned.extend(articles) page += 1 pages = soup.find('span', attrs={'class': 'pgr_nrs'}) if str(page) not in [self.tag_to_string(x)[1] for x in pages.findAll('a')]: break - feeds.insert(1, (u'Nieprzypisane', not_assigned)) + feeds.append((u'Nieprzypisane', not_assigned)) return feeds def find_articles(self, main_block): - for a in main_block.findAll('div', attrs={'class': ['prev_doc2', 'sr-document']}): + for a in main_block.findAll('div', attrs={'class': ['attachmentContent']}): art = a.find('a') yield { 'title': self.tag_to_string(art), 'url': 'http://www.gosc.pl' + art['href'], - 'date': self.tag_to_string(a.find('p', attrs={'class': 'sr-date'})), - 'description': self.tag_to_string(a.find('p', attrs={'class': 'sr-lead'})) + 'date': self.tag_to_string(a.find('b', attrs={'class': 'time'})).replace('DODANE', ' '), + 'description': self.tag_to_string(a.find('div', attrs={'class': 'txt'})) } def append_page(self, soup, appendtag): @@ -121,8 +117,8 @@ class GN(BasicNewsRecipe): ] remove_tags = [ - dict(name='p', attrs={'class': ['r tr', 'l l-2', 'wykop', 'tags']}), - dict(name='div', attrs={'class': ['doc_actions', 'cf', 'fr1_cl']}), + dict(name='p', attrs={'class': ['r tr', 'l l-2', 'wykop', 'l l-2 doc-source']}), + dict(name='div', attrs={'class': ['doc_actions', 'cf', 'fr1_cl','txt__social-icons','txt__tags']}), dict(name='div', attrs={'id': 'vote'}), dict(name='link'), dict(name='a', attrs={'class': 'img_enlarge'})