From c63425f0b030811981b89ce1185b84588ec5176e Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 30 Nov 2010 19:43:16 -0700
Subject: [PATCH] Fix #7686 (Updated recipes for Newsweek Polska and Polityka,
 new recipes for Esensja, Histmag and Wprost)

---
 resources/recipes/esenja.recipe          | 87 ++++++++++++++++++++++
 resources/recipes/histmag.recipe         | 59 +++++++++++++++
 resources/recipes/newsweek_polska.recipe | 53 ++++++++++----
 resources/recipes/polityka.recipe        |  7 +-
 resources/recipes/wprost.recipe          | 91 ++++++++++++++++++++++++
 5 files changed, 278 insertions(+), 19 deletions(-)
 create mode 100644 resources/recipes/esenja.recipe
 create mode 100644 resources/recipes/histmag.recipe
 create mode 100644 resources/recipes/wprost.recipe

diff --git a/resources/recipes/esenja.recipe b/resources/recipes/esenja.recipe
new file mode 100644
index 0000000000..b8b94ad66e
--- /dev/null
+++ b/resources/recipes/esenja.recipe
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, matek09, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class Esensja(BasicNewsRecipe):
+
+    title = u'Esensja'
+    __author__ = 'matek09'
+    description = 'Monthly magazine'
+    encoding = 'utf-8'
+    no_stylesheets = True
+    language = 'pl'
+    remove_javascript = True
+    HREF = '0'
+
+    #keep_only_tags = []
+    #keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'article'}))
+    remove_tags_before = dict(name = 'div', attrs = {'class' : 't-title'})
+    remove_tags_after = dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'})
+
+    remove_tags = []
+    remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_top.gif'}))
+    remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 't-title2 nextpage'}))
+
+    extra_css = '''
+        .t-title {font-size: x-large; font-weight: bold; text-align: left}
+        .t-author {font-size: x-small; text-align: left}
+        .t-title2 {font-size: x-small; font-style: italic; text-align: left}
+        .text {font-size: small; text-align: left}
+        .annot-ref {font-style: italic; text-align: left}
+    '''
+
+    preprocess_regexps = [(re.compile(r'alt="[^"]*"'),
+                           lambda match: '')]
+
+    def parse_index(self):
+        soup = self.index_to_soup('http://www.esensja.pl/magazyn/')
+        a = soup.find('a', attrs={'href' : re.compile('.*/index.html')})
+        year = a['href'].split('/')[0]
+        month = a['href'].split('/')[1]
+        self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/'
+        soup = self.index_to_soup(self.HREF + '01.html')
+        self.cover_url = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/img/ilustr/cover_b.jpg'
+        feeds = []
+        intro = soup.find('div', attrs={'class' : 'n-title'})
+        introduction = {'title' : self.tag_to_string(intro.a),
+                        'url' : self.HREF + intro.a['href'],
+                        'date' : '',
+                        'description' : ''}
+        chapter = 'Wprowadzenie'
+        subchapter = ''
+        articles = []
+        articles.append(introduction)
+        for tag in intro.findAllNext(attrs={'class': ['chapter', 'subchapter', 'n-title']}):
+            if tag.name == 'td':
+                if len(articles) > 0:
+                    section = chapter
+                    if len(subchapter) > 0:
+                        section += ' - ' + subchapter
+                    feeds.append((section, articles))
+                    articles = []
+                if tag['class'] == 'chapter':
+                    chapter = self.tag_to_string(tag).capitalize()
+                    subchapter = ''
+                else:
+                    subchapter = self.tag_to_string(tag)
+                continue
+            articles.append({'title' : self.tag_to_string(tag.a), 'url' : self.HREF + tag.a['href'], 'date' : '', 'description' : ''})
+
+            a = self.index_to_soup(self.HREF + tag.a['href'])
+            i = 1
+            while True:
+                div = a.find('div', attrs={'class' : 't-title2 nextpage'})
+                if div is not None:
+                    a = self.index_to_soup(self.HREF + div.a['href'])
+                    articles.append({'title' : self.tag_to_string(tag.a) + ' c. d. ' + str(i), 'url' : self.HREF + div.a['href'], 'date' : '', 'description' : ''})
+                    i = i + 1
+                else:
+                    break
+
+        return feeds
diff --git a/resources/recipes/histmag.recipe b/resources/recipes/histmag.recipe
new file mode 100644
index 0000000000..38956e7995
--- /dev/null
+++ b/resources/recipes/histmag.recipe
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, matek09, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class Histmag(BasicNewsRecipe):
+
+    title = u'Histmag'
+    __author__ = 'matek09'
+    description = u"Artykuly historyczne i publicystyczne"
+    encoding = 'utf-8'
+    no_stylesheets = True
+    language = 'pl'
+    remove_javascript = True
+    #max_articles_per_feed = 1
+    remove_tags_before = dict(name = 'div', attrs = {'id' : 'article'})
+    remove_tags_after = dict(name = 'h2', attrs = {'class' : 'komentarze'})
+    #keep_only_tags = []
+    #keep_only_tags.append(dict(name = 'h2'))
+    #keep_only_tags.append(dict(name = 'p'))
+
+    remove_tags = []
+    remove_tags.append(dict(name = 'p', attrs = {'class' : 'podpis'}))
+    remove_tags.append(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
+    remove_tags.append(dict(name = 'img', attrs = {'src' : 'style/buttons/wesprzyjnas-1.jpg'}))
+
+    preprocess_regexps = [(re.compile(r''), lambda match: '<br><br>'),
+                          (re.compile(r''), lambda match: '<br><br>')]
+
+    extra_css = '''
+        .left {font-size: x-small}
+        .right {font-size: x-small}
+    '''
+
+    def find_articles(self, soup):
+        articles = []
+        for div in soup.findAll('div', attrs={'class' : 'text'}):
+            articles.append({
+                'title' : self.tag_to_string(div.h3.a),
+                'url' : 'http://www.histmag.org/' + div.h3.a['href'],
+                'date' : self.tag_to_string(div.next('p')).split('|')[0],
+                'description' : self.tag_to_string(div.next('p', podpis=False)),
+                })
+        return articles
+
+    def parse_index(self):
+        soup = self.index_to_soup('http://histmag.org/?arc=4&dx=0')
+        feeds = []
+        feeds.append((u"Artykuly historyczne", self.find_articles(soup)))
+        soup = self.index_to_soup('http://histmag.org/?arc=5&dx=0')
+        feeds.append((u"Artykuly publicystyczne", self.find_articles(soup)))
+        soup = self.index_to_soup('http://histmag.org/?arc=1&dx=0')
+        feeds.append((u"Wydarzenia", self.find_articles(soup)))
+
+        return feeds
diff --git a/resources/recipes/newsweek_polska.recipe b/resources/recipes/newsweek_polska.recipe
index 31dd8ccddd..4227a88026 100644
--- a/resources/recipes/newsweek_polska.recipe
+++ b/resources/recipes/newsweek_polska.recipe
@@ -1,19 +1,22 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'
+__copyright__ = '2010, matek09, matek09@gmail.com'
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Newsweek(BasicNewsRecipe):
-    EDITION = 0
+    FIND_LAST_FULL_ISSUE = True
+    EDITION = '0'
+    EXCLUDE_LOCKED = True
+    LOCKED_ICO = 'http://www.newsweek.pl/bins/media/static/newsweek/img/ico_locked.gif'
 
     title = u'Newsweek Polska'
-    __author__ = 'Mateusz Kielar'
+    __author__ = 'matek09'
     description = 'Weekly magazine'
     encoding = 'utf-8'
     no_stylesheets = True
-    language = 'en'
+    language = 'pl'
     remove_javascript = True
 
     keep_only_tags =[]
@@ -33,34 +36,54 @@ class Newsweek(BasicNewsRecipe):
     def print_version(self, url):
         return url.replace("http://www.newsweek.pl/artykuly/wydanie/" + str(self.EDITION), "http://www.newsweek.pl/artykuly") + '/print'
 
+    def is_locked(self, a):
+        return a.findNext('img')['src'] == self.LOCKED_ICO
+
+    def is_full(self, issue_soup):
+        return len(issue_soup.findAll('img', attrs={'src' : self.LOCKED_ICO})) <= 1
+
     def find_last_full_issue(self):
-        page = self.index_to_soup('http://www.newsweek.pl/Frames/IssueCover.aspx')
-        issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
-        page = self.index_to_soup(issue)
-        issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
-        page = self.index_to_soup(issue)
-        self.EDITION = page.find('a', attrs={'target' : '_parent'})['href'].replace('/wydania/','')
+        frame_url = 'http://www.newsweek.pl/Frames/IssueCover.aspx'
+        while True:
+            frame_soup = self.index_to_soup(frame_url)
+            self.EDITION = frame_soup.find('a', attrs={'target' : '_parent'})['href'].replace('/wydania/','')
+            issue_soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
+            if self.is_full(issue_soup):
+                break
+            frame_url = 'http://www.newsweek.pl/Frames/' + frame_soup.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
 
     def parse_index(self):
-        self.find_last_full_issue()
-        soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + str(self.EDITION))
+        if self.FIND_LAST_FULL_ISSUE:
+            self.find_last_full_issue()
+        soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
         img = soup.find('img', id="ctl00_C1_PaperIsssueView_IssueImage", src=True)
         self.cover_url = img['src']
         feeds = []
         parent = soup.find(id='content-left-big')
         for txt in parent.findAll(attrs={'class':'txt_normal_red strong'}):
-            section = self.tag_to_string(txt).capitalize()
             articles = list(self.find_articles(txt))
-            feeds.append((section, articles))
+            if len(articles) > 0:
+                section = self.tag_to_string(txt).capitalize()
+                feeds.append((section, articles))
         return feeds
 
     def find_articles(self, txt):
         for a in txt.findAllNext( attrs={'class':['strong','hr']}):
             if a.name in "div":
                 break
+            if (not self.FIND_LAST_FULL_ISSUE) and self.EXCLUDE_LOCKED and self.is_locked(a):
+                continue
             yield {
                 'title' : self.tag_to_string(a),
-                'url' : 'http://www.newsweek.pl'+a['href'],
+                'url' : 'http://www.newsweek.pl' + a['href'],
                 'date' : '',
                 'description' : ''
                 }
diff --git a/resources/recipes/polityka.recipe b/resources/recipes/polityka.recipe
index ab31e148aa..16ccae6085 100644
--- a/resources/recipes/polityka.recipe
+++ b/resources/recipes/polityka.recipe
@@ -1,18 +1,18 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'
+__copyright__ = '2010, matek09, matek09@gmail.com'
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Polityka(BasicNewsRecipe):
 
     title = u'Polityka'
-    __author__ = 'Mateusz Kielar'
+    __author__ = 'matek09'
     description = 'Weekly magazine. Last archive issue'
     encoding = 'utf-8'
     no_stylesheets = True
-    language = 'en'
+    language = 'pl'
     remove_javascript = True
 
     remove_tags_before = dict(dict(name = 'h2', attrs = {'class' : 'box_nag'}))
@@ -48,7 +48,6 @@ class Polityka(BasicNewsRecipe):
         for div in box.findAll('div', attrs={'class': 'list_tresc'}):
             article_page = self.index_to_soup('http://archiwum.polityka.pl' + div.a['href'],)
             section = self.tag_to_string(article_page.find('h2', attrs = {'class' : 'box_nag'})).split('/')[0].lstrip().rstrip()
-            print section
             if not articles.has_key(section):
                 articles[section] = []
             articles[section].append( {
diff --git a/resources/recipes/wprost.recipe b/resources/recipes/wprost.recipe
new file mode 100644
index 0000000000..b317571981
--- /dev/null
+++ b/resources/recipes/wprost.recipe
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, matek09, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class Wprost(BasicNewsRecipe):
+    EDITION = 0
+    FIND_LAST_FULL_ISSUE = True
+    EXCLUDE_LOCKED = True
+    ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif'
+
+    title = u'Wprost'
+    __author__ = 'matek09'
+    description = 'Weekly magazine'
+    encoding = 'ISO-8859-2'
+    no_stylesheets = True
+    language = 'pl'
+    remove_javascript = True
+
+    remove_tags_before = dict(name = 'div', attrs = {'id' : 'print-layer'})
+    remove_tags_after = dict(name = 'div', attrs = {'id' : 'print-layer'})
+
+    '''keep_only_tags = []
+    keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))'''
+
+    preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''),
+                          (re.compile(r'display: block;'), lambda match: '')]
+
+    remove_tags = []
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'def element-date'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'def silver'}))
+    remove_tags.append(dict(name = 'div', attrs = {'id' : 'content-main-column-right'}))
+
+    extra_css = '''
+        .div-header {font-size: x-small; font-weight: bold}
+    '''
+    #h2 {font-size: x-large; font-weight: bold}
+
+    def is_blocked(self, a):
+        return a.findNextSibling('img') is not None
+
+    def find_last_issue(self):
+        soup = self.index_to_soup('http://www.wprost.pl/archiwum/')
+        if self.FIND_LAST_FULL_ISSUE:
+            ico_blocked = soup.findAll('img', attrs={'src' : self.ICO_BLOCKED})
+            a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
+        else:
+            a = soup.find('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
+        self.EDITION = a['href'].replace('/tygodnik/?I=', '')
+        self.cover_url = a.img['src']
+
+    def parse_index(self):
+        self.find_last_issue()
+        soup = self.index_to_soup('http://www.wprost.pl/tygodnik/?I=' + self.EDITION)
+        feeds = []
+        for main_block in soup.findAll(attrs={'class':'main-block-s3 s3-head head-red3'}):
+            articles = list(self.find_articles(main_block))
+            if len(articles) > 0:
+                section = self.tag_to_string(main_block)
+                feeds.append((section, articles))
+        return feeds
+
+    def find_articles(self, main_block):
+        for a in main_block.findAllNext(attrs={'style':['','padding-top: 15px;']}):
+            if a.name == 'td':
+                break
+            if self.EXCLUDE_LOCKED and self.is_blocked(a):
+                continue
+            yield {
+                'title' : self.tag_to_string(a),
+                'url' : 'http://www.wprost.pl' + a['href'],
+                'date' : '',
+                'description' : ''
+                }
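
A quick way to smoke-test any of these recipes is calibre's test mode for
ebook-convert. This is a minimal sketch, assuming a local calibre install;
the recipe path and the output filename here are arbitrary examples:

    # Build a small test EPUB from the recipe source; --test limits the
    # fetch to a couple of articles per feed, and -vv prints verbose
    # progress, which is useful when debugging parse_index().
    ebook-convert resources/recipes/wprost.recipe wprost_test.epub --test -vv

The same invocation works for the other four .recipe files in this patch.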