From 333b4bc970f127ad1526a16c2af71647193fabbe Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 4 Feb 2012 08:53:25 +0530 Subject: [PATCH] Il Manifesto by Giacomo Lacava. Fixes #926374 (ENHANCEMENT: News Recipe for Italian newspaper "il manifesto") --- recipes/ilmanifesto.recipe | 110 +++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 recipes/ilmanifesto.recipe diff --git a/recipes/ilmanifesto.recipe b/recipes/ilmanifesto.recipe new file mode 100644 index 0000000000..d7428cebb2 --- /dev/null +++ b/recipes/ilmanifesto.recipe @@ -0,0 +1,110 @@ +from calibre import strftime +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +MANIFESTO_BASEURL = 'http://www.ilmanifesto.it/' + +class IlManifesto(BasicNewsRecipe): + title = 'Il Manifesto' + __author__ = 'Giacomo Lacava' + description = 'quotidiano comunista - ultima edizione html disponibile' + publication_type = 'newspaper' + publisher = 'il manifesto coop. editrice a r.l.' + language = 'it' + + oldest_article = 2 + max_articles_per_feed = 100 + delay = 1 + no_stylesheets = True + simultaneous_downloads = 5 + timeout = 30 + auto_cleanup = True + remove_tags = [dict(name='div', attrs={'class':'column_1 float_left'})] + remove_tags_before = dict(name='div',attrs={'class':'column_2 float_right'}) + remove_tags_after = dict(id='myPrintArea') + + manifesto_index = None + manifesto_datestr = None + + def _set_manifesto_index(self): + if self.manifesto_index == None: + startUrl = MANIFESTO_BASEURL + 'area-abbonati/in-edicola/' + startSoup = self.index_to_soup(startUrl) + lastEdition = startSoup.findAll('div',id='accordion_inedicola')[1].find('a')['href'] + del(startSoup) + self.manifesto_index = MANIFESTO_BASEURL + lastEdition + urlsplit = lastEdition.split('/') + self.manifesto_datestr = urlsplit[-1] + if urlsplit[-1] == '': + self.manifesto_datestr = urlsplit[-2] + + + + def get_cover_url(self): + self._set_manifesto_index() + url = MANIFESTO_BASEURL + 'fileadmin/archivi/in_edicola/%sprimapagina.gif' % self.manifesto_datestr + return url + + def parse_index(self): + self._set_manifesto_index() + soup = self.index_to_soup(self.manifesto_index) + feedLinks = soup.find('div',id='accordion_inedicola').findAll('a') + result = [] + for feed in feedLinks: + articles = [] + feedName = feed.find('h2').string + feedUrl = MANIFESTO_BASEURL + feed['href'] + feedSoup = self.index_to_soup(feedUrl) + indexRoot = feedSoup.find('div',attrs={'class':'column1'}) + for div in indexRoot.findAll('div',attrs={'class':'strumenti1_inedicola'}): + artLink = div.find('a') + if artLink is None: continue # empty div + title = artLink.string + url = MANIFESTO_BASEURL + artLink['href'] + + description = '' + descNode = div.find('div',attrs={'class':'text_12'}) + if descNode is not None: + description = descNode.string + + author = '' + authNode = div.find('div',attrs={'class':'firma'}) + if authNode is not None: + author = authNode.string + + articleText = '' + article = { + 'title':title, + 'url':url, + 'date': strftime('%d %B %Y'), + 'description': description, + 'content': articleText, + 'author': author + } + articles.append(article) + result.append((feedName,articles)) + return result + + + def extract_readable_article(self, html, url): + + bs = BeautifulSoup(html) + col1 = bs.find('div',attrs={'class':'column1'}) + + content = col1.find('div',attrs={'class':'bodytext'}) + title = bs.find(id='titolo_articolo').string + author = col1.find('span',attrs={'class':'firma'}) + subtitle = '' + subNode = col1.findPrevious('div',attrs={'class':'occhiello_rosso'}) + if subNode is not None: + subtitle = subNode + summary = '' + sommNode = bs.find('div',attrs={'class':'sommario'}) + if sommNode is not None: + summary = sommNode + + template = "%(title)s

%(title)s

%(subtitle)s

%(author)s

%(summary)s
%(content)s
" + del(bs) + return template % dict(title=title,subtitle=subtitle,author=author,summary=summary,content=content) + +