Update Newsweek Polska

Kovid Goyal 2013-10-17 08:26:05 +05:30
parent 66a88b5d6e
commit 42cc5b2813

#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com; 2012-2013, admroz, a.rozewicki@gmail.com'

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from string import capwords
import datetime
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class Newsweek(BasicNewsRecipe):

    # how many issues to go back, 0 means get the most current one
    BACK_ISSUES = 1

    EDITION = '0'
    DATE = None
    YEAR = datetime.datetime.now().year

    title = u'Newsweek Polska'
    __author__ = 'matek09, admroz'
    description = 'Weekly magazine'
    encoding = 'utf-8'
    language = 'pl'
    remove_javascript = True

    temp_files = []
    articles_are_obfuscated = True
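
    # With articles_are_obfuscated set to True, calibre calls
    # get_obfuscated_article() for every article URL and then fetches the
    # local file returned by that method instead of the original URL.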

    #
    # Parses article contents from one page
    #
    def get_article_divs(self, css, main_section):
        strs = []

        # get all divs with given css class
        article_divs = main_section.findAll('div', attrs={'class' : css})
        for article_div in article_divs:

            # remove sections like 'read more...' etc.
            for p in article_div.findAll('p'):

                if p.find('span', attrs={'style' : 'color: #800000; font-size: medium;'}):
                    p.extract()
                    continue

                if p.find('span', attrs={'style' : 'font-size: medium; color: #800000;'}):
                    p.extract()
                    continue

                if p.find('span', attrs={'style' : 'font-size: medium;'}):
                    p.extract()
                    continue

                if p.find('span', attrs={'style' : 'color: #800000;'}):
                    p.extract()
                    continue

                obj = p.find('object')
                if obj:
                    obj.extract()
                    continue

                strong = p.find('strong')
                if strong:
                    newest = re.compile("Tekst pochodzi z najnowszego numeru Tygodnika Newsweek")
                    if newest.search(str(strong)):
                        strong.extract()
                        continue

                itunes = p.find('a')
                if itunes:
                    reurl = re.compile("itunes.apple.com")
                    if reurl.search(str(itunes['href'])):
                        p.extract()
                        continue

                imagedesc = p.find('div', attrs={'class' : 'image-desc'})
                if imagedesc:
                    redesc = re.compile("Okładka numeru")
                    if redesc.search(str(imagedesc)):
                        p.extract()
                        continue

            # get actual contents
            for content in article_div.contents:
                strs.append("".join(str(content)))

        # return contents as a string
        return unicode("".join(strs))

    #
    # Articles can be divided into several pages; this method parses them recursively
    #
    def get_article_page(self, br, url, page):
        br.open(url)
        source = br.response().read()

        html = ''

        matches = re.search(r'<article>(.*)</article>', source, re.DOTALL)
        if matches is None:
            print "no article tag found, returning..."
            return html

        main_section = BeautifulSoup(matches.group(0))

        if page == 0:
            title = main_section.find('h1')
            html = html + unicode(title)

            authors = ''
            authorBox = main_section.find('div', attrs={'class' : 'AuthorBox'})
            if authorBox is not None:
                authorH4 = authorBox.find('h4')
                if authorH4 is not None:
                    authors = self.tag_to_string(authorH4)
            html = html + unicode(authors)

            info = main_section.find('p', attrs={'class' : 'lead'})
            html = html + unicode(info)

        # these long hashed class names come straight from the site's article markup
        html = html + self.get_article_divs('3917dc34e07c9c7180df2ea9ef103361845c8af42b71f51b960059226090a1ac articleStart', main_section)
        html = html + self.get_article_divs('3917dc34e07c9c7180df2ea9ef103361845c8af42b71f51b960059226090a1ac', main_section)

        nextPage = main_section.find('a', attrs={'class' : 'next'})
        if nextPage:
            html = html + self.get_article_page(br, nextPage['href'], page + 1)

        return html
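
    # Note: only page 0 contributes the title, author box and lead paragraph;
    # later pages, reached through the 'next' link, only add more body divs.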

    #
    # Parses each article
    #
    def get_obfuscated_article(self, url):
        br = self.get_browser()

        html = self.get_article_page(br, url, 0)

        self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
        self.temp_files[-1].write(html)
        self.temp_files[-1].close()

        return self.temp_files[-1].name
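
    # The path returned here is what calibre actually downloads for the
    # article, so the temporary file must contain the fully assembled HTML.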

    #
    # Goes back the given number of issues. It also knows how to go back
    # to the previous year if there are not enough issues in the current one
    #
    def find_last_issue(self, archive_url):
        archive_soup = self.index_to_soup(archive_url, True)

        # workaround: the page HTML is too broken for soup.find() to locate
        # the issue list, so extract the sub-HTML we need with a regex first
        matches = re.search(r'<ul class="rightIssueList">(.*?)</ul>', archive_soup, re.DOTALL)
        if matches is None:
            return

        subSoup = BeautifulSoup(matches.group(0))
        issueLinks = subSoup.findAll('a')

        # check whether we need to go back to the previous year
        if len(issueLinks) > self.BACK_ISSUES:
            link = issueLinks[self.BACK_ISSUES]
            self.EDITION = link['href'].replace('http://www.newsweek.pl/wydania/', '')
            self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
        else:
            self.BACK_ISSUES = self.BACK_ISSUES - len(issueLinks)
            self.YEAR = self.YEAR - 1
            self.find_last_issue(archive_url + '/' + str(self.YEAR))
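
    # Example: with BACK_ISSUES = 5 and only 3 issues listed for the current
    # year, BACK_ISSUES drops to 2 and the method recurses into the previous
    # year's archive (e.g. archive_url + '/2012').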

    #
    # Looks for the last issue which we want to download. Then goes over each
    # section and article and stores them (assigning articles to sections)
    #
    def parse_index(self):
        archive_url = 'http://www.newsweek.pl/wydania/archiwum'
        self.find_last_issue(archive_url)
        soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)

        matches = re.search(r'<div class="Issue-Entry">(.*)ARTICLE_BOTTOM', soup.prettify(), re.DOTALL)
        if matches is None:
            return

        main_section = BeautifulSoup(matches.group(0))

        # date
        matches = re.search(r'(\d{2}-\d{2}-\d{4})', self.tag_to_string(main_section.find('h2')))
        if matches:
            self.DATE = matches.group(0)

        # cover
        img = main_section.find(lambda tag: tag.name == 'img' and tag.has_key('alt') and tag.has_key('title'))
        self.cover_url = img['src']

        feeds = []
        articles = {}
        sections = []

        # sections
        for sectionUl in main_section.findAll('ul', attrs={'class' : 'whatsin'}):

            # section header
            header = sectionUl.find('li', attrs={'class' : 'header'})
            if header is None:
                continue

            section = capwords(self.tag_to_string(header))

            # articles in the section
            articleUl = sectionUl.find('ul')
            if articleUl is None:
                continue

            for articleLi in articleUl.findAll('li'):
                # skip articles that are closed (locked)
                closed = articleLi.find('span', attrs={'class' : 'closeart'})
                if closed is not None:
                    continue

                article = self.create_article(articleLi)
                if article is None:
                    continue

                if articles.has_key(section):
                    articles[section].append(article)
                else:
                    articles[section] = [article]
                    sections.append(section)

        for section in sections:
            feeds.append((section, articles[section]))

        return feeds
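
    # calibre expects parse_index() to return a list of
    # (section_title, list_of_article_dicts) tuples, which is exactly the
    # structure assembled in feeds above.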

    #
    # Creates each article's metadata (skips locked ones). The content will
    # be extracted later by another method (get_obfuscated_article).
    #
    def create_article(self, articleLi):
        article = {}

        a = articleLi.find('a')
        if a is None:
            return None

        article['title'] = self.tag_to_string(a)
        article['url'] = a['href']
        article['date'] = self.DATE
        article['description'] = ''

        return article
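

# A quick way to exercise changes like this one (assuming a local calibre
# install and that the recipe is saved as newsweek_polska.recipe) is calibre's
# recipe test mode, for example:
#
#   ebook-convert newsweek_polska.recipe out.epub --test -vv
#
# --test limits the download to a couple of articles from a couple of
# sections, which keeps the feedback loop short.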