Update Newsweek Polska

Kovid Goyal 2013-10-17 08:26:05 +05:30
parent 66a88b5d6e
commit 42cc5b2813


@@ -2,18 +2,20 @@
 #!/usr/bin/env python
 __license__ = 'GPL v3'
-__copyright__ = '2010, matek09, matek09@gmail.com; 2012, admroz, a.rozewicki@gmail.com'
+__copyright__ = '2010, matek09, matek09@gmail.com; 2012-2013, admroz, a.rozewicki@gmail.com'
 
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
 from string import capwords
 import datetime
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
 class Newsweek(BasicNewsRecipe):
 
     # how many issues to go back, 0 means get the most current one
-    BACK_ISSUES = 2
+    BACK_ISSUES = 1
 
     EDITION = '0'
     DATE = None
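A note on the BACK_ISSUES change: the counter is simply an index into a newest-first list of issue links, so 0 selects the current issue and 1 (the new default) selects the one before it. A minimal sketch, with invented issue URLs purely for illustration:

# hypothetical, newest-first list of issue links (invented URLs)
issues = ['http://www.newsweek.pl/wydania/current',
          'http://www.newsweek.pl/wydania/previous',
          'http://www.newsweek.pl/wydania/two-back']

BACK_ISSUES = 1            # this commit changes the default from 2 to 1
print issues[BACK_ISSUES]  # -> the previous issue, not the current one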
@@ -30,59 +32,117 @@ class Newsweek(BasicNewsRecipe):
     articles_are_obfuscated = True
 
+    #
+    # Parses article contents from one page
+    #
+    def get_article_divs(self, css, main_section):
+        strs = []
+
+        # get all divs with given css class
+        article_divs = main_section.findAll('div', attrs={'class' : css})
+        for article_div in article_divs:
+
+            # remove sections like 'read more...' etc.
+            for p in article_div.findAll('p'):
+                if p.find('span', attrs={'style' : 'color: #800000; font-size: medium;'}):
+                    p.extract()
+                    continue
+                if p.find('span', attrs={'style' : 'font-size: medium; color: #800000;'}):
+                    p.extract()
+                    continue
+                if p.find('span', attrs={'style' : 'font-size: medium;'}):
+                    p.extract()
+                    continue
+                if p.find('span', attrs={'style' : 'color: #800000;'}):
+                    p.extract()
+                    continue
+                obj = p.find('object')
+                if obj:
+                    obj.extract()
+                    continue
+                strong = p.find('strong')
+                if strong:
+                    newest = re.compile("Tekst pochodzi z najnowszego numeru Tygodnika Newsweek")
+                    if newest.search(str(strong)):
+                        strong.extract()
+                        continue
+                itunes = p.find('a')
+                if itunes:
+                    reurl = re.compile("itunes.apple.com")
+                    if reurl.search(str(itunes['href'])):
+                        p.extract()
+                        continue
+                imagedesc = p.find('div', attrs={'class' : 'image-desc'})
+                if imagedesc:
+                    redesc = re.compile("Okładka numeru")
+                    if (redesc.search(str(imagedesc))):
+                        p.extract()
+                        continue
+
+            # get actual contents
+            for content in article_div.contents:
+                strs.append("".join(str(content)))
+
+        # return contents as a string
+        return unicode("".join(strs))
+
+    #
+    # Articles can be divided into several pages, this method parses them recursively
+    #
+    def get_article_page(self, br, url, page):
+        br.open(url)
+        source = br.response().read()
+
+        html = ''
+
+        matches = re.search(r'<article>(.*)</article>', source, re.DOTALL)
+        if matches is None:
+            print "no article tag found, returning..."
+            return
+
+        main_section = BeautifulSoup(matches.group(0))
+
+        if page == 0:
+            title = main_section.find('h1')
+            html = html + unicode(title)
+
+            authors = ''
+            authorBox = main_section.find('div', attrs={'class' : 'AuthorBox'})
+            if authorBox is not None:
+                authorH4 = authorBox.find('h4')
+                if authorH4 is not None:
+                    authors = self.tag_to_string(authorH4)
+            html = html + unicode(authors)
+
+            info = main_section.find('p', attrs={'class' : 'lead'})
+            html = html + unicode(info)
+
+        html = html + self.get_article_divs('3917dc34e07c9c7180df2ea9ef103361845c8af42b71f51b960059226090a1ac articleStart', main_section)
+        html = html + self.get_article_divs('3917dc34e07c9c7180df2ea9ef103361845c8af42b71f51b960059226090a1ac', main_section)
+
+        nextPage = main_section.find('a', attrs={'class' : 'next'})
+        if nextPage:
+            html = html + self.get_article_page(br, nextPage['href'], page+1)
+
+        return html
     #
     # Parses each article
     #
     def get_obfuscated_article(self, url):
         br = self.get_browser()
-        br.open(url)
-        source = br.response().read()
-        page = self.index_to_soup(source)
-        main_section = page.find(id='mainSection')
-        title = main_section.find('h1')
-        info = main_section.find('ul', attrs={'class' : 'articleInfo'})
-        authors = info.find('li').find('h4')
-        article = main_section.find('div', attrs={'id' : 'article'})
-
-        # remove related articles box
-        related = article.find('div', attrs={'class' : 'relatedBox'})
-        if related is not None:
-            related.extract()
-
-        # remove div with social networking links and links to
-        # other articles in web version
-        for div in article.findAll('div'):
-            if div.find('span', attrs={'class' : 'google-plus'}):
-                div.extract()
-
-            for p in div.findAll('p'):
-                if p.find('span', attrs={'style' : 'color: rgb(255, 0, 0);'}):
-                    p.extract()
-                    continue
-
-                for a in p.findAll('a'):
-                    if a.find('span', attrs={'style' : 'font-size: larger;'}):
-                        a.extract()
-
-        html = unicode(title) + unicode(authors) + unicode(article)
-
-        next = main_section.find('li', attrs={'class' : 'next'})
-        while next:
-            url = next.find('a')['href']
-            br.open(url)
-            source = br.response().read()
-            page = self.index_to_soup(source)
-            main_section = page.find(id='mainSection')
-            article = main_section.find('div', attrs={'id' : 'article'})
-            aside = article.find(id='articleAside')
-            if aside is not None:
-                aside.extract()
-            html = html + unicode(article)
-            next = main_section.find('li', attrs={'class' : 'next'})
+        html = self.get_article_page(br, url, 0)
 
         self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
         self.temp_files[-1].write(html)
         self.temp_files[-1].close()
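The new get_article_page follows one pattern throughout: fetch a page, cut the <article> fragment out of the raw markup with a regex, parse only that fragment with BeautifulSoup, and recurse while a 'next' link exists. A minimal, self-contained sketch of that extract-then-parse step; the sample markup and helper name below are invented, only the pattern mirrors the recipe:

import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup

def parse_fragment(source, pattern):
    # cut the interesting fragment out of otherwise messy markup,
    # then hand only that fragment to BeautifulSoup
    matches = re.search(pattern, source, re.DOTALL)
    if matches is None:
        return None
    return BeautifulSoup(matches.group(0))

# invented sample input; in the recipe 'source' comes from br.response().read()
soup = parse_fragment('<body><article><h1>Tytul</h1></article></body>',
                      r'<article>(.*)</article>')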
@@ -94,19 +154,26 @@ class Newsweek(BasicNewsRecipe):
     # to the previous year if there are not enough issues in the current one
     #
     def find_last_issue(self, archive_url):
-        archive_soup = self.index_to_soup(archive_url)
-        select = archive_soup.find('select', attrs={'id' : 'paper_issue_select'})
-        options = select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value'))
+        archive_soup = self.index_to_soup(archive_url, True)
+
+        # workaround: the html is so messed up that find() on the soup returns None,
+        # so we extract the sub-html we need with a regex instead
+        matches = re.search(r'<ul class="rightIssueList">(.*?)</ul>', archive_soup, re.DOTALL)
+        if matches is None:
+            return
+
+        subSoup = BeautifulSoup(matches.group(0))
+        issueLinks = subSoup.findAll('a')
 
         # check if need to go back to previous year
-        if len(options) > self.BACK_ISSUES:
-            option = options[self.BACK_ISSUES];
-            self.EDITION = option['value'].replace('http://www.newsweek.pl/wydania/','')
+        if len(issueLinks) > self.BACK_ISSUES:
+            link = issueLinks[self.BACK_ISSUES];
+            self.EDITION = link['href'].replace('http://www.newsweek.pl/wydania/','')
             self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
         else:
-            self.BACK_ISSUES = self.BACK_ISSUES - len(options)
+            self.BACK_ISSUES = self.BACK_ISSUES - len(issueLinks)
             self.YEAR = self.YEAR - 1
-            self.find_last_issue(archive_url + ',' + str(self.YEAR))
+            self.find_last_issue(archive_url + '/' + str(self.YEAR))
 
     #
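The year fallback in find_last_issue is a counting exercise: if the current archive page lists more links than BACK_ISSUES, pick the link at that index; otherwise subtract the number of links found and retry the previous year's archive page. A plain-Python sketch of that logic under assumed link counts (the function name and numbers are made up for illustration):

def pick_issue(back_issues, links_per_year):
    # links_per_year: hypothetical counts of issue links per archive page,
    # current year first, e.g. [3, 40] means 3 issues published so far this year
    year_offset = 0
    while back_issues >= links_per_year[year_offset]:
        back_issues -= links_per_year[year_offset]  # not enough links, go back one year
        year_offset += 1
    return year_offset, back_issues                 # which year's page, which link on it

print pick_issue(5, [3, 40])  # -> (1, 2): the third link of last year's archive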
@@ -117,26 +184,47 @@ class Newsweek(BasicNewsRecipe):
         archive_url = 'http://www.newsweek.pl/wydania/archiwum'
         self.find_last_issue(archive_url)
         soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
-        self.DATE = self.tag_to_string(soup.find('span', attrs={'class' : 'data'}))
-        main_section = soup.find(id='mainSection')
+
+        matches = re.search(r'<div class="Issue-Entry">(.*)ARTICLE_BOTTOM', soup.prettify(), re.DOTALL)
+        if matches is None:
+            return
+        main_section = BeautifulSoup(matches.group(0))
+
+        # date
+        matches = re.search(r'(\d{2}-\d{2}-\d{4})', self.tag_to_string(main_section.find('h2')))
+        if matches:
+            self.DATE = matches.group(0)
+
+        # cover
         img = main_section.find(lambda tag: tag.name == 'img' and tag.has_key('alt') and tag.has_key('title'))
         self.cover_url = img['src']
 
         feeds = []
         articles = {}
         sections = []
 
-        news_list = main_section.find('ul', attrs={'class' : 'newsList'})
-        section = 'Inne'
-        for li in news_list.findAll('li'):
-            h3 = li.find('h3')
-            if h3 is not None:
-                section = capwords(self.tag_to_string(h3))
-                continue
-            else:
-                h2 = li.find('h2')
-                if h2 is not None:
-                    article = self.create_article(h2)
+        # sections
+        for sectionUl in main_section.findAll('ul', attrs={'class' : 'whatsin'}):
+
+            # section header
+            header = sectionUl.find('li', attrs={'class' : 'header'})
+            if header is None:
+                continue
+
+            section = capwords(self.tag_to_string(header))
+
+            # articles in section
+            articleUl = sectionUl.find('ul')
+            if articleUl is None:
+                continue
+
+            for articleLi in articleUl.findAll('li'):
+                # check if article is closed, which should be skipped
+                closed = articleLi.find('span', attrs={'class' : 'closeart'})
+                if closed is not None:
+                    continue
+
+                article = self.create_article(articleLi)
                 if article is None :
                     continue
@@ -146,9 +234,14 @@ class Newsweek(BasicNewsRecipe):
                 articles[section] = [article]
                 sections.append(section)
 
         for section in sections:
+            # print("%s -> %d" % (section, len(articles[section])))
+            #
+            # for article in articles[section]:
+            #     print(" - %s" % article)
             feeds.append((section, articles[section]))
 
         return feeds
@@ -156,19 +249,16 @@ class Newsweek(BasicNewsRecipe):
     # Creates each article's metadata (skips locked ones). The content will
     # be extracted later by another method (get_obfuscated_article).
     #
-    def create_article(self, h2):
+    def create_article(self, articleLi):
         article = {}
-        a = h2.find('a')
+
+        a = articleLi.find('a')
         if a is None:
             return None
 
         article['title'] = self.tag_to_string(a)
         article['url'] = a['href']
         article['date'] = self.DATE
-        desc = h2.findNext('p')
-        if desc is not None:
-            article['description'] = self.tag_to_string(desc)
-        else:
-            article['description'] = ''
+        article['description'] = ''
+
         return article