recipes: update gosc_full

Tomasz Długosz 2016-10-18 22:18:16 +02:00
parent 9a221dd0dd
commit 6bce8c5524


@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 
 __license__ = 'GPL v3'
 __copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com \
-                2013-2014, Tomasz Długosz, tomek3d@gmail.com'
+                2013-2016, Tomasz Długosz, tomek3d@gmail.com'
 
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
@@ -20,11 +20,12 @@ class GN(BasicNewsRecipe):
     no_stylesheets = True
     language = 'pl'
     remove_javascript = True
+    ignore_duplicate_articles = {'url'}
     masthead_url = 'http://m.gosc.pl/static/themes/czerwony_gosc-mobile/logo.png'
 
     def find_last_issue(self):
         raw = self.index_to_soup(
-            'http://m.gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/', raw=True)
+            'http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/', raw=True)
         doc = html.fromstring(raw)
         page = doc.xpath(
             '//div[@class="search-result release-result"]/div[1]/div[1]/a/@href')
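The added ignore_duplicate_articles = {'url'} line is the key to most of this diff: BasicNewsRecipe can deduplicate articles by URL on its own, which is why the hand-maintained enlisted list disappears from the hunks below. A minimal sketch of the built-in behaviour, using a made-up recipe class and URL (ignore_duplicate_articles itself is a documented BasicNewsRecipe option):

# Illustrative sketch only, not part of the recipe: with
# ignore_duplicate_articles set, calibre silently drops any article whose
# URL was already queued, so parse_index() no longer has to track seen URLs.
from calibre.web.feeds.news import BasicNewsRecipe


class DedupSketch(BasicNewsRecipe):
    title = 'dedup sketch'
    ignore_duplicate_articles = {'url'}  # {'title'} or {'title', 'url'} also work

    def parse_index(self):
        art = {'title': 'Example', 'url': 'http://www.gosc.pl/doc/1',
               'date': '', 'description': ''}
        # The same URL shows up in two sections; only the first copy is kept.
        return [('Section A', [art]), ('Section B', [dict(art)])]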
@@ -41,7 +42,6 @@ class GN(BasicNewsRecipe):
         soup.find('div', attrs={'class': 'fl-w100 release-wp'}
                   ).findAll('a')[-4].contents[0]['src']
         feeds = []
-        enlisted = []
         # editorial:
         a = soup.find('div', attrs={'class': 'release-wp-b'})
         art = a.find('a')
@@ -51,7 +51,6 @@ class GN(BasicNewsRecipe):
             'description': self.tag_to_string(a.find('p', attrs={'class': 'b lead'}))
         }]
         feeds.append((u'Na dobry początek', articles))
-        enlisted.append(articles[0].get('url'))
         # columns:
         for addr in soup.findAll('a', attrs={'href': re.compile('kategoria')}):
             if addr.string != u'wszystkie artyku\u0142y z tej kategorii \xbb':
@@ -61,8 +60,6 @@ class GN(BasicNewsRecipe):
                 if len(articles) > 0:
                     section = addr.string
                     feeds.append((section, articles))
-                    enlisted.extend(list(article.get('url')
-                                         for article in articles))
         # not assigned content:
         page = 1
         not_assigned = []
@@ -70,24 +67,23 @@ class GN(BasicNewsRecipe):
             soup = self.index_to_soup(
                 'http://gosc.pl' + self.last_issue.replace('przeglad', 'wszystko') + '/' + str(page))
             articles = list(self.find_articles(soup))
-            not_assigned.extend(
-                [x for x in articles if x.get('url') not in enlisted])
+            not_assigned.extend(articles)
             page += 1
             pages = soup.find('span', attrs={'class': 'pgr_nrs'})
             if str(page) not in [self.tag_to_string(x)[1] for x in pages.findAll('a')]:
                 break
 
-        feeds.insert(1, (u'Nieprzypisane', not_assigned))
+        feeds.append((u'Nieprzypisane', not_assigned))
         return feeds
 
     def find_articles(self, main_block):
-        for a in main_block.findAll('div', attrs={'class': ['prev_doc2', 'sr-document']}):
+        for a in main_block.findAll('div', attrs={'class': ['attachmentContent']}):
             art = a.find('a')
             yield {
                 'title': self.tag_to_string(art),
                 'url': 'http://www.gosc.pl' + art['href'],
-                'date': self.tag_to_string(a.find('p', attrs={'class': 'sr-date'})),
-                'description': self.tag_to_string(a.find('p', attrs={'class': 'sr-lead'}))
+                'date': self.tag_to_string(a.find('b', attrs={'class': 'time'})).replace('DODANE', ' '),
+                'description': self.tag_to_string(a.find('div', attrs={'class': 'txt'}))
             }
 
     def append_page(self, soup, appendtag):
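The rewritten find_articles() tracks a markup change on the site: article teasers now sit in div.attachmentContent blocks, with the date in b.time (prefixed with the word "DODANE", Polish for "added", hence the replace()) and the lead text in div.txt. Below is a self-contained sketch of the extraction against a stand-in fragment; the HTML is reconstructed from the selectors in this diff, not captured from gosc.pl, and plain BeautifulSoup stands in for calibre's soup and tag_to_string():

from bs4 import BeautifulSoup

# Stand-in markup, assumed from the selectors in the hunk above.
fragment = '''
<div class="attachmentContent">
  <a href="/doc/3512013.Przyklad">Przyklad</a>
  <b class="time">DODANE 18.10.2016</b>
  <div class="txt">Short lead of the article.</div>
</div>
'''

block = BeautifulSoup(fragment, 'html.parser').find(
    'div', attrs={'class': ['attachmentContent']})
art = block.find('a')
print({
    'title': art.get_text(),
    'url': 'http://www.gosc.pl' + art['href'],
    'date': block.find('b', attrs={'class': 'time'}).get_text().replace('DODANE', ' '),
    'description': block.find('div', attrs={'class': 'txt'}).get_text(),
})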
@@ -121,8 +117,8 @@ class GN(BasicNewsRecipe):
     ]
 
     remove_tags = [
-        dict(name='p', attrs={'class': ['r tr', 'l l-2', 'wykop', 'tags']}),
-        dict(name='div', attrs={'class': ['doc_actions', 'cf', 'fr1_cl']}),
+        dict(name='p', attrs={'class': ['r tr', 'l l-2', 'wykop', 'l l-2 doc-source']}),
+        dict(name='div', attrs={'class': ['doc_actions', 'cf', 'fr1_cl','txt__social-icons','txt__tags']}),
         dict(name='div', attrs={'id': 'vote'}),
         dict(name='link'),
         dict(name='a', attrs={'class': 'img_enlarge'})
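The widened remove_tags entries strip the site's new social-icon and tag widgets from article pages. Existing entries like 'l l-2' suggest the matching compares the full class attribute string, which would explain why the multi-class source line is listed as the single entry 'l l-2 doc-source' rather than as separate tokens. A small sketch of the intended effect, using stand-in HTML and plain bs4 to approximate what calibre does with each remove_tags entry (the markup and the matching assumption are both mine, not from the commit):

from bs4 import BeautifulSoup

page = BeautifulSoup(
    '<p class="l l-2 doc-source">source credit</p>'
    '<div class="txt__social-icons">share buttons</div>'
    '<div class="txt__tags">tag list</div>'
    '<p class="txt">kept paragraph</p>',
    'html.parser')

# Approximation of remove_tags handling: find each spec, then drop the tag.
for spec in (
    dict(name='p', attrs={'class': 'l l-2 doc-source'}),   # full-string class match
    dict(name='div', attrs={'class': ['txt__social-icons', 'txt__tags']}),
):
    for tag in page.find_all(spec['name'], attrs=spec['attrs']):
        tag.extract()

print(page)  # -> only <p class="txt">kept paragraph</p> remains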