Update Newsweek Polska

Kovid Goyal 2013-10-17 08:26:05 +05:30
parent 66a88b5d6e
commit 42cc5b2813


@@ -2,18 +2,20 @@
 #!/usr/bin/env python
 __license__ = 'GPL v3'
-__copyright__ = '2010, matek09, matek09@gmail.com; 2012, admroz, a.rozewicki@gmail.com'
+__copyright__ = '2010, matek09, matek09@gmail.com; 2012-2013, admroz, a.rozewicki@gmail.com'
 
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
 from string import capwords
 import datetime
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
 class Newsweek(BasicNewsRecipe):
 
     # how many issues to go back, 0 means get the most current one
-    BACK_ISSUES = 2
+    BACK_ISSUES = 1
 
     EDITION = '0'
     DATE = None
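A note on the BACK_ISSUES change: the counter is simply an index into a newest-first list of issue links, so 0 selects the current issue and 1 (the new default) selects the one before it. A minimal sketch, with invented issue URLs purely for illustration:

# hypothetical, newest-first list of issue links (invented URLs)
issues = ['http://www.newsweek.pl/wydania/current',
          'http://www.newsweek.pl/wydania/previous',
          'http://www.newsweek.pl/wydania/two-back']

BACK_ISSUES = 1            # this commit changes the default from 2 to 1
print issues[BACK_ISSUES]  # -> the previous issue, not the current one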
@@ -30,59 +32,117 @@ class Newsweek(BasicNewsRecipe):
     articles_are_obfuscated = True
 
+    #
+    # Parses article contents from one page
+    #
+    def get_article_divs(self, css, main_section):
+        strs = []
+
+        # get all divs with given css class
+        article_divs = main_section.findAll('div', attrs={'class' : css})
+        for article_div in article_divs:
+
+            # remove sections like 'read more...' etc.
+            for p in article_div.findAll('p'):
+                if p.find('span', attrs={'style' : 'color: #800000; font-size: medium;'}):
+                    p.extract()
+                    continue
+                if p.find('span', attrs={'style' : 'font-size: medium; color: #800000;'}):
+                    p.extract()
+                    continue
+                if p.find('span', attrs={'style' : 'font-size: medium;'}):
+                    p.extract()
+                    continue
+                if p.find('span', attrs={'style' : 'color: #800000;'}):
+                    p.extract()
+                    continue
+                obj = p.find('object')
+                if obj:
+                    obj.extract()
+                    continue
+                strong = p.find('strong')
+                if strong:
+                    newest = re.compile("Tekst pochodzi z najnowszego numeru Tygodnika Newsweek")
+                    if newest.search(str(strong)):
+                        strong.extract()
+                        continue
+                itunes = p.find('a')
+                if itunes:
+                    reurl = re.compile("itunes.apple.com")
+                    if reurl.search(str(itunes['href'])):
+                        p.extract()
+                        continue
+                imagedesc = p.find('div', attrs={'class' : 'image-desc'})
+                if imagedesc:
+                    redesc = re.compile("Okładka numeru")
+                    if (redesc.search(str(imagedesc))):
+                        p.extract()
+                        continue
+
+            # get actual contents
+            for content in article_div.contents:
+                strs.append("".join(str(content)))
+
+        # return contents as a string
+        return unicode("".join(strs))
+
+    #
+    # Articles can be divided into several pages, this method parses them recursively
+    #
+    def get_article_page(self, br, url, page):
+        br.open(url)
+        source = br.response().read()
+
+        html = ''
+
+        matches = re.search(r'<article>(.*)</article>', source, re.DOTALL)
+        if matches is None:
+            print "no article tag found, returning..."
+            return
+
+        main_section = BeautifulSoup(matches.group(0))
+
+        if page == 0:
+            title = main_section.find('h1')
+            html = html + unicode(title)
+
+            authors = ''
+            authorBox = main_section.find('div', attrs={'class' : 'AuthorBox'})
+            if authorBox is not None:
+                authorH4 = authorBox.find('h4')
+                if authorH4 is not None:
+                    authors = self.tag_to_string(authorH4)
+            html = html + unicode(authors)
+
+            info = main_section.find('p', attrs={'class' : 'lead'})
+            html = html + unicode(info)
+
+        html = html + self.get_article_divs('3917dc34e07c9c7180df2ea9ef103361845c8af42b71f51b960059226090a1ac articleStart', main_section)
+        html = html + self.get_article_divs('3917dc34e07c9c7180df2ea9ef103361845c8af42b71f51b960059226090a1ac', main_section)
+
+        nextPage = main_section.find('a', attrs={'class' : 'next'})
+        if nextPage:
+            html = html + self.get_article_page(br, nextPage['href'], page+1)
+
+        return html
     #
     # Parses each article
     #
     def get_obfuscated_article(self, url):
         br = self.get_browser()
-        br.open(url)
-        source = br.response().read()
-        page = self.index_to_soup(source)
-        main_section = page.find(id='mainSection')
-        title = main_section.find('h1')
-        info = main_section.find('ul', attrs={'class' : 'articleInfo'})
-        authors = info.find('li').find('h4')
-        article = main_section.find('div', attrs={'id' : 'article'})
-
-        # remove related articles box
-        related = article.find('div', attrs={'class' : 'relatedBox'})
-        if related is not None:
-            related.extract()
-
-        # remove div with social networking links and links to
-        # other articles in web version
-        for div in article.findAll('div'):
-            if div.find('span', attrs={'class' : 'google-plus'}):
-                div.extract()
-
-            for p in div.findAll('p'):
-                if p.find('span', attrs={'style' : 'color: rgb(255, 0, 0);'}):
-                    p.extract()
-                    continue
-
-                for a in p.findAll('a'):
-                    if a.find('span', attrs={'style' : 'font-size: larger;'}):
-                        a.extract()
-
-        html = unicode(title) + unicode(authors) + unicode(article)
-
-        next = main_section.find('li', attrs={'class' : 'next'})
-        while next:
-            url = next.find('a')['href']
-            br.open(url)
-            source = br.response().read()
-            page = self.index_to_soup(source)
-            main_section = page.find(id='mainSection')
-            article = main_section.find('div', attrs={'id' : 'article'})
-            aside = article.find(id='articleAside')
-            if aside is not None:
-                aside.extract()
-            html = html + unicode(article)
-            next = main_section.find('li', attrs={'class' : 'next'})
+        html = self.get_article_page(br, url, 0)
 
         self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
         self.temp_files[-1].write(html)
         self.temp_files[-1].close()
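The new get_article_page follows one pattern throughout: fetch a page, cut the <article> fragment out of the raw markup with a regex, parse only that fragment with BeautifulSoup, and recurse while a 'next' link exists. A minimal, self-contained sketch of that extract-then-parse step; the sample markup and helper name below are invented, only the pattern mirrors the recipe:

import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup

def parse_fragment(source, pattern):
    # cut the interesting fragment out of otherwise messy markup,
    # then hand only that fragment to BeautifulSoup
    matches = re.search(pattern, source, re.DOTALL)
    if matches is None:
        return None
    return BeautifulSoup(matches.group(0))

# invented sample input; in the recipe 'source' comes from br.response().read()
soup = parse_fragment('<body><article><h1>Tytul</h1></article></body>',
                      r'<article>(.*)</article>')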
@@ -94,19 +154,26 @@ class Newsweek(BasicNewsRecipe):
     # to the previous year if there are not enough issues in the current one
     #
     def find_last_issue(self, archive_url):
-        archive_soup = self.index_to_soup(archive_url)
-        select = archive_soup.find('select', attrs={'id' : 'paper_issue_select'})
-        options = select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value'))
+        archive_soup = self.index_to_soup(archive_url, True)
+
+        # workaround: the html is so messed up that find() on the soup returns None,
+        # so we extract the sub-html we need with a regex instead
+        matches = re.search(r'<ul class="rightIssueList">(.*?)</ul>', archive_soup, re.DOTALL)
+        if matches is None:
+            return
+
+        subSoup = BeautifulSoup(matches.group(0))
+        issueLinks = subSoup.findAll('a')
 
         # check if need to go back to previous year
-        if len(options) > self.BACK_ISSUES:
-            option = options[self.BACK_ISSUES];
-            self.EDITION = option['value'].replace('http://www.newsweek.pl/wydania/','')
+        if len(issueLinks) > self.BACK_ISSUES:
+            link = issueLinks[self.BACK_ISSUES];
+            self.EDITION = link['href'].replace('http://www.newsweek.pl/wydania/','')
             self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
         else:
-            self.BACK_ISSUES = self.BACK_ISSUES - len(options)
+            self.BACK_ISSUES = self.BACK_ISSUES - len(issueLinks)
             self.YEAR = self.YEAR - 1
-            self.find_last_issue(archive_url + ',' + str(self.YEAR))
+            self.find_last_issue(archive_url + '/' + str(self.YEAR))
 
     #
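The year fallback in find_last_issue is a counting exercise: if the current archive page lists more links than BACK_ISSUES, pick the link at that index; otherwise subtract the number of links found and retry the previous year's archive page. A plain-Python sketch of that logic under assumed link counts (the function name and numbers are made up for illustration):

def pick_issue(back_issues, links_per_year):
    # links_per_year: hypothetical counts of issue links per archive page,
    # current year first, e.g. [3, 40] means 3 issues published so far this year
    year_offset = 0
    while back_issues >= links_per_year[year_offset]:
        back_issues -= links_per_year[year_offset]  # not enough links, go back one year
        year_offset += 1
    return year_offset, back_issues                 # which year's page, which link on it

print pick_issue(5, [3, 40])  # -> (1, 2): the third link of last year's archive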
@@ -117,26 +184,47 @@ class Newsweek(BasicNewsRecipe):
         archive_url = 'http://www.newsweek.pl/wydania/archiwum'
         self.find_last_issue(archive_url)
         soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
-        self.DATE = self.tag_to_string(soup.find('span', attrs={'class' : 'data'}))
-        main_section = soup.find(id='mainSection')
+
+        matches = re.search(r'<div class="Issue-Entry">(.*)ARTICLE_BOTTOM', soup.prettify(), re.DOTALL)
+        if matches is None:
+            return
+        main_section = BeautifulSoup(matches.group(0))
+
+        # date
+        matches = re.search(r'(\d{2}-\d{2}-\d{4})', self.tag_to_string(main_section.find('h2')))
+        if matches:
+            self.DATE = matches.group(0)
+
+        # cover
         img = main_section.find(lambda tag: tag.name == 'img' and tag.has_key('alt') and tag.has_key('title'))
         self.cover_url = img['src']
 
         feeds = []
         articles = {}
         sections = []
 
-        news_list = main_section.find('ul', attrs={'class' : 'newsList'})
-        section = 'Inne'
-        for li in news_list.findAll('li'):
-            h3 = li.find('h3')
-            if h3 is not None:
-                section = capwords(self.tag_to_string(h3))
-                continue
-            else:
-                h2 = li.find('h2')
-                if h2 is not None:
-                    article = self.create_article(h2)
+        # sections
+        for sectionUl in main_section.findAll('ul', attrs={'class' : 'whatsin'}):
+
+            # section header
+            header = sectionUl.find('li', attrs={'class' : 'header'})
+            if header is None:
+                continue
+
+            section = capwords(self.tag_to_string(header))
+
+            # articles in section
+            articleUl = sectionUl.find('ul')
+            if articleUl is None:
+                continue
+
+            for articleLi in articleUl.findAll('li'):
+                # check if article is closed, which should be skipped
+                closed = articleLi.find('span', attrs={'class' : 'closeart'})
+                if closed is not None:
+                    continue
+
+                article = self.create_article(articleLi)
                 if article is None :
                     continue
@@ -146,9 +234,14 @@ class Newsweek(BasicNewsRecipe):
                 articles[section] = [article]
                 sections.append(section)
 
         for section in sections:
+            # print("%s -> %d" % (section, len(articles[section])))
+            #
+            # for article in articles[section]:
+            #     print(" - %s" % article)
             feeds.append((section, articles[section]))
 
         return feeds
@@ -156,19 +249,16 @@ class Newsweek(BasicNewsRecipe):
     # Creates each article's metadata (skips locked ones). The content will
     # be extracted later by another method (get_obfuscated_article).
     #
-    def create_article(self, h2):
+    def create_article(self, articleLi):
         article = {}
-        a = h2.find('a')
+
+        a = articleLi.find('a')
         if a is None:
             return None
 
         article['title'] = self.tag_to_string(a)
         article['url'] = a['href']
         article['date'] = self.DATE
-        desc = h2.findNext('p')
-        if desc is not None:
-            article['description'] = self.tag_to_string(desc)
-        else:
-            article['description'] = ''
+        article['description'] = ''
+
         return article