Updated wprost

2025-07-09 03:04:10 -04:00 · 2011-06-26 20:39:35 -06:00 · 2011-06-26 20:39:35 -06:00 · f5c8e0c264
commit f5c8e0c264
parent 5cbba547f8
1 changed files with 69 additions and 67 deletions
--- a/recipes/wprost.recipe
+++ b/recipes/wprost.recipe
@ -2,90 +2,92 @@

 __license__   = 'GPL v3'
 __copyright__ = '2010, matek09, matek09@gmail.com'
+__copyright__ = 'Modified 2011,  Mariusz Wolek <mariusz_dot_wolek @ gmail dot com>'

 from calibre.web.feeds.news import BasicNewsRecipe
 import re

 class Wprost(BasicNewsRecipe):
-	EDITION = 0
-	FIND_LAST_FULL_ISSUE = True
-	EXCLUDE_LOCKED = True
-	ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif'
+        EDITION = 0
+        FIND_LAST_FULL_ISSUE = True
+        EXCLUDE_LOCKED = True
+        ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif'

-	title = u'Wprost'
-	__author__ = 'matek09'
-	description = 'Weekly magazine'
-	encoding = 'ISO-8859-2'
-	no_stylesheets = True
-	language = 'pl'
-	remove_javascript = True
+        title = u'Wprost'
+        __author__ = 'matek09'
+        description = 'Weekly magazine'
+        encoding = 'ISO-8859-2'
+        no_stylesheets = True
+        language = 'pl'
+        remove_javascript = True

-	remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
-	remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
+        remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
+        remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))

-	'''keep_only_tags =[]
-	keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'}))
-	keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'}))
-	keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'}))
-	keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))'''
+        '''keep_only_tags =[]
+        keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'}))
+        keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'}))
+        keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'}))
+        keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))'''

-	preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''),
-						(re.compile(r'display: block;'), lambda match: '')]
+        preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''),
+        (re.compile(r'display: block;'), lambda match: ''),
+        (re.compile(r'\<td\>\<tr\>\<\/table\>'), lambda match: ''),
+        (re.compile(r'\<table .*?\>'), lambda match: ''),
+        (re.compile(r'\<tr>'), lambda match: ''),
+        (re.compile(r'\<td .*?\>'), lambda match: '')]

+        remove_tags =[]
+        remove_tags.append(dict(name = 'div', attrs = {'class' : 'def element-date'}))
+        remove_tags.append(dict(name = 'div', attrs = {'class' : 'def silver'}))
+        remove_tags.append(dict(name = 'div', attrs = {'id' : 'content-main-column-right'}))

-	remove_tags =[]
-	remove_tags.append(dict(name = 'div', attrs = {'class' : 'def element-date'}))
-	remove_tags.append(dict(name = 'div', attrs = {'class' : 'def silver'}))
-	remove_tags.append(dict(name = 'div', attrs = {'id' : 'content-main-column-right'}))
-
-
-	extra_css = '''
-					.div-header {font-size: x-small; font-weight: bold}
-					'''
+        extra_css = '''
+                                        .div-header {font-size: x-small; font-weight: bold}
+                                        '''
 #h2 {font-size: x-large; font-weight: bold}
-	def is_blocked(self, a):
-		if a.findNextSibling('img') is None:
-			return False
-		else:
-			return True
+        def is_blocked(self, a):
+                if a.findNextSibling('img') is None:
+                        return False
+                else:
+                        return True



-	def find_last_issue(self):
-		soup = self.index_to_soup('http://www.wprost.pl/archiwum/')
-		a = 0
-		if self.FIND_LAST_FULL_ISSUE:
-			ico_blocked = soup.findAll('img', attrs={'src' : self.ICO_BLOCKED})
-			a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
-		else:
-			a = soup.find('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
-		self.EDITION = a['href'].replace('/tygodnik/?I=', '')
-		self.cover_url = a.img['src']
+        def find_last_issue(self):
+                soup = self.index_to_soup('http://www.wprost.pl/archiwum/')
+                a = 0
+                if self.FIND_LAST_FULL_ISSUE:
+                        ico_blocked = soup.findAll('img', attrs={'src' : self.ICO_BLOCKED})
+                        a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
+                else:
+                        a = soup.find('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
+                self.EDITION = a['href'].replace('/tygodnik/?I=', '')
+                self.cover_url = a.img['src']



-	def parse_index(self):
-		self.find_last_issue()
-		soup = self.index_to_soup('http://www.wprost.pl/tygodnik/?I=' + self.EDITION)
-		feeds = []
-		for main_block in soup.findAll(attrs={'class':'main-block-s3 s3-head head-red3'}):
-			articles = list(self.find_articles(main_block))
-			if len(articles) > 0:
-				section = self.tag_to_string(main_block)
-				feeds.append((section, articles))
-		return feeds
-
-	def find_articles(self, main_block):
-		for a in main_block.findAllNext( attrs={'style':['','padding-top: 15px;']}):
-			if a.name in "td":
-				break
-			if self.EXCLUDE_LOCKED & self.is_blocked(a):
-				continue
-			yield {
-				'title' : self.tag_to_string(a),
-				'url'   : 'http://www.wprost.pl' + a['href'],
-				'date'  : '',
-				'description' : ''
-				}
+        def parse_index(self):
+                self.find_last_issue()
+                soup = self.index_to_soup('http://www.wprost.pl/tygodnik/?I=' + self.EDITION)
+                feeds = []
+                for main_block in soup.findAll(attrs={'class':'main-block-s3 s3-head head-red3'}):
+                        articles = list(self.find_articles(main_block))
+                        if len(articles) > 0:
+                                section = self.tag_to_string(main_block)
+                                feeds.append((section, articles))
+                return feeds

+        def find_articles(self, main_block):
+                for a in main_block.findAllNext( attrs={'style':['','padding-top: 15px;']}):
+                        if a.name in "td":
+                                break
+                        if self.EXCLUDE_LOCKED & self.is_blocked(a):
+                                continue
+                        yield {
+                                'title' : self.tag_to_string(a),
+                                'url'   : 'http://www.wprost.pl' + a['href'],
+                                'date'  : '',
+                                'description' : ''
+                                }