#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function

__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com; 2012-2013, admroz, a.rozewicki@gmail.com'

import re
import datetime
from string import capwords

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class Newsweek(BasicNewsRecipe):

    # how many issues to go back, 0 means get the most current one
    BACK_ISSUES = 1

    EDITION = '0'
    DATE = None
    YEAR = datetime.datetime.now().year

    title = u'Newsweek Polska'
    __author__ = 'matek09, admroz'
    description = 'Weekly magazine'
    encoding = 'utf-8'
    language = 'pl'
    remove_javascript = True

    temp_files = []
    articles_are_obfuscated = True

    #
    # Parses article contents from one page
    #
    def get_article_divs(self, css, main_section):
        strs = []

        # get all divs with the given css class
        article_divs = main_section.findAll('div', attrs={'class': css})
        for article_div in article_divs:

            # remove sections like 'read more...' etc.
            for p in article_div.findAll('p'):

                if p.find('span', attrs={'style': 'color: #800000; font-size: medium;'}):
                    p.extract()
                    continue

                if p.find('span', attrs={'style': 'font-size: medium; color: #800000;'}):
                    p.extract()
                    continue

                if p.find('span', attrs={'style': 'font-size: medium;'}):
                    p.extract()
                    continue

                if p.find('span', attrs={'style': 'color: #800000;'}):
                    p.extract()
                    continue

                obj = p.find('object')
                if obj:
                    obj.extract()
                    continue

                strong = p.find('strong')
                if strong:
                    newest = re.compile(
                        "Tekst pochodzi z najnowszego numeru Tygodnika Newsweek")
                    if newest.search(str(strong)):
                        strong.extract()
                        continue

                itunes = p.find('a')
                if itunes:
                    reurl = re.compile("itunes.apple.com")
                    if reurl.search(str(itunes['href'])):
                        p.extract()
                        continue

                imagedesc = p.find('div', attrs={'class': 'image-desc'})
                if imagedesc:
                    redesc = re.compile("Okładka numeru")
                    if redesc.search(str(imagedesc)):
                        p.extract()
                        continue

            # get actual contents
            for content in article_div.contents:
                strs.append("".join(str(content)))

        # return contents as a string
        return u"".join(strs)
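
    # Illustrative use (comment only, names are placeholders): get_article_page()
    # below calls, roughly,
    #   body_html = self.get_article_divs('<css class of the body divs>', main_section)
    # and appends the result, i.e. the children of every matching div serialized
    # into one unicode string with the promotional paragraphs already stripped.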

    #
    # Articles can be divided into several pages; this method parses them recursively
    #
    def get_article_page(self, br, url, page):
        br.open(url)
        source = br.response().read()

        html = ''

        matches = re.search(r'<article>(.*)</article>', source, re.DOTALL)
        if matches is None:
            print("no article tag found, returning...")
            return

        main_section = BeautifulSoup(matches.group(0))

        if page == 0:
            # type(u'') is unicode on py2 and str on py3; calling it on a tag
            # serializes the tag to markup
            title = main_section.find('h1')
            html = html + type(u'')(title)

            authors = ''
            authorBox = main_section.find('div', attrs={'class': 'AuthorBox'})
            if authorBox is not None:
                authorH4 = authorBox.find('h4')
                if authorH4 is not None:
                    authors = self.tag_to_string(authorH4)
            html = html + type(u'')(authors)

            info = main_section.find('p', attrs={'class': 'lead'})
            html = html + type(u'')(info)

        # the article body lives in divs whose css class is a site-generated
        # token, combined with 'articleStart' for the opening chunk
        html = html + self.get_article_divs(
            '3917dc34e07c9c7180df2ea9ef103361845c8af42b71f51b960059226090a1ac articleStart', main_section)
        html = html + self.get_article_divs(
            '3917dc34e07c9c7180df2ea9ef103361845c8af42b71f51b960059226090a1ac', main_section)

        nextPage = main_section.find('a', attrs={'class': 'next'})
        if nextPage:
            html = html + self.get_article_page(br, nextPage['href'], page + 1)

        return html
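
    # Because articles_are_obfuscated is True, calibre hands each article URL to
    # get_obfuscated_article() below and reads the returned local file instead of
    # fetching the URL itself; the method stitches all pages of the article into
    # that temporary file. (Summary of the BasicNewsRecipe contract as used here.)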

    #
    # Parses each article
    #
    def get_obfuscated_article(self, url):
        br = self.get_browser()
        html = self.get_article_page(br, url, 0)
        self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
        self.temp_files[-1].write(html)
        self.temp_files[-1].close()

        return self.temp_files[-1].name

    #
    # Goes back the given number of issues. It also knows how to go back
    # to the previous year if there are not enough issues in the current one
    #
    def find_last_issue(self, archive_url):
        archive_soup = self.index_to_soup(archive_url, True)

        # workaround because the html is so messed up that find() on the soup
        # returns None, so we extract the sub-html we need with a regex
        # NOTE: the original pattern matched the issue-list <ul> by its class;
        # that selector was lost, so the generic <ul> match below is an assumption
        matches = re.search(
            r'<ul[^>]*>(.*?)</ul>', archive_soup, re.DOTALL)
        if matches is None:
            return

        subSoup = BeautifulSoup(matches.group(0))
        issueLinks = subSoup.findAll('a')

        # check if we need to go back to the previous year
        if len(issueLinks) > self.BACK_ISSUES:
            link = issueLinks[self.BACK_ISSUES]
            self.EDITION = link['href'].replace(
                'http://www.newsweek.pl/wydania/', '')
            self.index_to_soup(
                'http://www.newsweek.pl/wydania/' + self.EDITION)
        else:
            self.BACK_ISSUES = self.BACK_ISSUES - len(issueLinks)
            self.YEAR = self.YEAR - 1
            self.find_last_issue(archive_url + '/' + str(self.YEAR))
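
    # Worked example of the rollover above (comment only): with BACK_ISSUES = 3
    # and only 2 issue links on this year's archive page, the else branch leaves
    # BACK_ISSUES = 1, decrements YEAR, and recurses into
    # archive_url + '/' + str(self.YEAR), so the second newest issue of the
    # previous year ends up selected.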

    #
    # Looks for the last issue we want to download, then goes over each
    # section and article and stores them (assigning articles to sections)
    #
    def parse_index(self):
        archive_url = 'http://www.newsweek.pl/wydania/archiwum'

        self.find_last_issue(archive_url)

        soup = self.index_to_soup(
            'http://www.newsweek.pl/wydania/' + self.EDITION)

        # NOTE: the opening tag of this pattern was lost; the original matched a
        # specific issue-contents <div>, so the generic <div> match is an assumption
        matches = re.search(
            r'<div[^>]*>(.*)ARTICLE_BOTTOM', soup.prettify(), re.DOTALL)
        if matches is None:
            return

        main_section = BeautifulSoup(matches.group(0))

        # date
        matches = re.search(r'(\d{2}-\d{2}-\d{4})',
                            self.tag_to_string(main_section.find('h2')))
        if matches:
            self.DATE = matches.group(0)

        # cover
        img = main_section.find('img', src=True, alt=True, title=True)
        self.cover_url = img['src']

        feeds = []
        articles = {}
        sections = []

        # sections
        for sectionUl in main_section.findAll('ul', attrs={'class': 'whatsin'}):

            # section header
            header = sectionUl.find('li', attrs={'class': 'header'})
            if header is None:
                continue

            section = capwords(self.tag_to_string(header))

            # articles in section
            articleUl = sectionUl.find('ul')
            if articleUl is None:
                continue

            for articleLi in articleUl.findAll('li'):
                # closed (locked) articles are skipped
                closed = articleLi.find('span', attrs={'class': 'closeart'})
                if closed is not None:
                    continue

                article = self.create_article(articleLi)
                if article is None:
                    continue

                if section in articles:
                    articles[section].append(article)
                else:
                    articles[section] = [article]
                    sections.append(section)

        for section in sections:
            # print("%s -> %d" % (section, len(articles[section])))
            #
            # for article in articles[section]:
            #     print(" - %s" % article)

            feeds.append((section, articles[section]))

        return feeds

    #
    # Creates each article's metadata (skips locked ones). The content is
    # extracted later by another method (get_obfuscated_article).
    #
    def create_article(self, articleLi):
        article = {}

        a = articleLi.find('a')
        if a is None:
            return None

        article['title'] = self.tag_to_string(a)
        article['url'] = a['href']
        article['date'] = self.DATE
        article['description'] = ''

        return article
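
# Development note: a quick way to exercise this recipe is calibre's recipe test
# mode (assuming the file is saved as, e.g., newsweek_polska.recipe and a local
# calibre install provides ebook-convert):
#
#   ebook-convert newsweek_polska.recipe .epub --test -vv --debug-pipeline debug
#
# --test limits the run to a couple of articles per feed, which is enough to
# check parse_index() and get_obfuscated_article() end to end.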