diff --git a/resources/recipes/newsweek_polska.recipe b/resources/recipes/newsweek_polska.recipe
new file mode 100644
index 0000000000..31dd8ccddd
--- /dev/null
+++ b/resources/recipes/newsweek_polska.recipe
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Newsweek(BasicNewsRecipe):
+    EDITION = 0
+
+    title = u'Newsweek Polska'
+    __author__ = 'Mateusz Kielar'
+    description = 'Weekly magazine'
+    encoding = 'utf-8'
+    no_stylesheets = True
+    language = 'en'
+    remove_javascript = True
+
+    keep_only_tags =[]
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'article'}))
+
+    remove_tags =[]
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'copy'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'url'}))
+
+    extra_css = '''
+        .body {font-size: small}
+        .author {font-size: x-small}
+        .lead {font-size: x-small}
+        .title{font-size: x-large; font-weight: bold}
+        '''
+
+    def print_version(self, url):
+        return url.replace("http://www.newsweek.pl/artykuly/wydanie/" + str(self.EDITION), "http://www.newsweek.pl/artykuly") + '/print'
+
+    def find_last_full_issue(self):
+        page = self.index_to_soup('http://www.newsweek.pl/Frames/IssueCover.aspx')
+        issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
+        page = self.index_to_soup(issue)
+        issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
+        page = self.index_to_soup(issue)
+        self.EDITION = page.find('a', attrs={'target' : '_parent'})['href'].replace('/wydania/','')
+
+    def parse_index(self):
+        self.find_last_full_issue()
+        soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + str(self.EDITION))
+        img = soup.find('img', id="ctl00_C1_PaperIsssueView_IssueImage", src=True)
+        self.cover_url = img['src']
+        feeds = []
+        parent = soup.find(id='content-left-big')
+        for txt in parent.findAll(attrs={'class':'txt_normal_red strong'}):
+            section = self.tag_to_string(txt).capitalize()
+            articles = list(self.find_articles(txt))
+            feeds.append((section, articles))
+        return feeds
+
+    def find_articles(self, txt):
+        for a in txt.findAllNext( attrs={'class':['strong','hr']}):
+            if a.name in "div":
+                break
+            yield {
+                'title' : self.tag_to_string(a),
+                'url' : 'http://www.newsweek.pl'+a['href'],
+                'date' : '',
+                'description' : ''
+            }
+
+
diff --git a/resources/recipes/polityka.recipe b/resources/recipes/polityka.recipe
new file mode 100644
index 0000000000..ab31e148aa
--- /dev/null
+++ b/resources/recipes/polityka.recipe
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Polityka(BasicNewsRecipe):
+
+    title = u'Polityka'
+    __author__ = 'Mateusz Kielar'
+    description = 'Weekly magazine. Last archive issue'
+    encoding = 'utf-8'
+    no_stylesheets = True
+    language = 'en'
+    remove_javascript = True
+
+    remove_tags_before = dict(dict(name = 'h2', attrs = {'class' : 'box_nag'}))
+    remove_tags_after = dict(dict(name = 'div', attrs = {'class' : 'box_footer'}))
+
+    remove_tags =[]
+    remove_tags.append(dict(name = 'h2', attrs = {'class' : 'box_nag'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'box_footer'}))
+
+
+    extra_css = '''
+        h1 {font-size: x-large; font-weight: bold}
+        '''
+
+    def parse_index(self):
+        soup = self.index_to_soup('http://archiwum.polityka.pl/')
+        box_img3 = soup.findAll(attrs={'class' : 'box_img3'})
+        feeds = []
+        last = 0
+        self.cover_url = 'http://archiwum.polityka.pl' + box_img3[-1].find('img')['src']
+        last_edition = 'http://archiwum.polityka.pl' + box_img3[-1].find('a')['href']
+
+        while True:
+            index = self.index_to_soup(last_edition)
+
+
+            box_list = index.findAll('div', attrs={'class' : 'box_list'})
+            if len(box_list) == 0:
+                break
+
+            articles = {}
+            for box in box_list:
+                for div in box.findAll('div', attrs={'class': 'list_tresc'}):
+                    article_page = self.index_to_soup('http://archiwum.polityka.pl' + div.a['href'],)
+                    section = self.tag_to_string(article_page.find('h2', attrs = {'class' : 'box_nag'})).split('/')[0].lstrip().rstrip()
+                    print section
+                    if not articles.has_key(section):
+                        articles[section] = []
+                    articles[section].append( {
+                        'title' : self.tag_to_string(div.a),
+                        'url' : 'http://archiwum.polityka.pl' + div.a['href'],
+                        'date' : '',
+                        'description' : ''
+                        })
+
+            for section in articles:
+                feeds.append((section, articles[section]))
+
+            last_edition = last_edition.replace('http://archiwum.polityka.pl/wydanie/' + str(last), 'http://archiwum.polityka.pl/wydanie/' + str(last + 1))
+            last = last + 1
+
+        return feeds
+