diff --git a/recipes/wprost.recipe b/recipes/wprost.recipe index b317571981..b271665125 100644 --- a/recipes/wprost.recipe +++ b/recipes/wprost.recipe @@ -2,90 +2,92 @@ __license__ = 'GPL v3' __copyright__ = '2010, matek09, matek09@gmail.com' +__copyright__ = 'Modified 2011, Mariusz Wolek ' from calibre.web.feeds.news import BasicNewsRecipe import re class Wprost(BasicNewsRecipe): - EDITION = 0 - FIND_LAST_FULL_ISSUE = True - EXCLUDE_LOCKED = True - ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif' + EDITION = 0 + FIND_LAST_FULL_ISSUE = True + EXCLUDE_LOCKED = True + ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif' - title = u'Wprost' - __author__ = 'matek09' - description = 'Weekly magazine' - encoding = 'ISO-8859-2' - no_stylesheets = True - language = 'pl' - remove_javascript = True + title = u'Wprost' + __author__ = 'matek09' + description = 'Weekly magazine' + encoding = 'ISO-8859-2' + no_stylesheets = True + language = 'pl' + remove_javascript = True - remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) + remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) + remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - '''keep_only_tags =[] - keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))''' + '''keep_only_tags =[] + keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))''' - preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), - (re.compile(r'display: block;'), lambda match: '')] + preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), + (re.compile(r'display: block;'), lambda match: ''), + (re.compile(r'\\\<\/table\>'), lambda match: ''), + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\
'), lambda match: '')] + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'class' : 'def element-date'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'def silver'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'content-main-column-right'})) - remove_tags =[] - remove_tags.append(dict(name = 'div', attrs = {'class' : 'def element-date'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'def silver'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'content-main-column-right'})) - - - extra_css = ''' - .div-header {font-size: x-small; font-weight: bold} - ''' + extra_css = ''' + .div-header {font-size: x-small; font-weight: bold} + ''' #h2 {font-size: x-large; font-weight: bold} - def is_blocked(self, a): - if a.findNextSibling('img') is None: - return False - else: - return True + def is_blocked(self, a): + if a.findNextSibling('img') is None: + return False + else: + return True - def find_last_issue(self): - soup = self.index_to_soup('http://www.wprost.pl/archiwum/') - a = 0 - if self.FIND_LAST_FULL_ISSUE: - ico_blocked = soup.findAll('img', attrs={'src' : self.ICO_BLOCKED}) - a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile('Zobacz spis tre.ci')}) - else: - a = soup.find('a', attrs={'title' : re.compile('Zobacz spis tre.ci')}) - self.EDITION = a['href'].replace('/tygodnik/?I=', '') - self.cover_url = a.img['src'] + def find_last_issue(self): + soup = self.index_to_soup('http://www.wprost.pl/archiwum/') + a = 0 + if self.FIND_LAST_FULL_ISSUE: + ico_blocked = soup.findAll('img', attrs={'src' : self.ICO_BLOCKED}) + a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile('Zobacz spis tre.ci')}) + else: + a = soup.find('a', attrs={'title' : re.compile('Zobacz spis tre.ci')}) + self.EDITION = a['href'].replace('/tygodnik/?I=', '') + self.cover_url = a.img['src'] - def parse_index(self): - self.find_last_issue() - soup = self.index_to_soup('http://www.wprost.pl/tygodnik/?I=' + self.EDITION) - feeds = [] - for main_block in soup.findAll(attrs={'class':'main-block-s3 s3-head head-red3'}): - articles = list(self.find_articles(main_block)) - if len(articles) > 0: - section = self.tag_to_string(main_block) - feeds.append((section, articles)) - return feeds - - def find_articles(self, main_block): - for a in main_block.findAllNext( attrs={'style':['','padding-top: 15px;']}): - if a.name in "td": - break - if self.EXCLUDE_LOCKED & self.is_blocked(a): - continue - yield { - 'title' : self.tag_to_string(a), - 'url' : 'http://www.wprost.pl' + a['href'], - 'date' : '', - 'description' : '' - } + def parse_index(self): + self.find_last_issue() + soup = self.index_to_soup('http://www.wprost.pl/tygodnik/?I=' + self.EDITION) + feeds = [] + for main_block in soup.findAll(attrs={'class':'main-block-s3 s3-head head-red3'}): + articles = list(self.find_articles(main_block)) + if len(articles) > 0: + section = self.tag_to_string(main_block) + feeds.append((section, articles)) + return feeds + def find_articles(self, main_block): + for a in main_block.findAllNext( attrs={'style':['','padding-top: 15px;']}): + if a.name in "td": + break + if self.EXCLUDE_LOCKED & self.is_blocked(a): + continue + yield { + 'title' : self.tag_to_string(a), + 'url' : 'http://www.wprost.pl' + a['href'], + 'date' : '', + 'description' : '' + }