#!/usr/bin/env python
# -*- coding: utf-8; mode: python -*-

__license__ = 'GPL v3'
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
__version__ = '0.98'

''' http://brandeins.de - Wirtschaftsmagazin '''

import re
import string

from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.recipes import BasicNewsRecipe


class BrandEins(BasicNewsRecipe):

    title = u'brand eins'
    __author__ = 'Constantin Hofstetter'
    description = u'Wirtschaftsmagazin'
    publisher = 'brandeins.de'
    category = 'politics, business, wirtschaft, Germany'
    use_embedded_content = False
    lang = 'de-DE'
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'de'
    publication_type = 'magazine'
    needs_subscription = 'optional'
    # Prevent the conversion date from being appended to the title
    timefmt = ''

    # Which issue to fetch: 2 is the last complete magazine (default),
    # 1 is the newest but still incomplete one, 3 is the one before 2, etc.
    # The value can be overridden via the username field.
    default_issue = 2
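
    # Example: entering "1" in calibre's username field fetches the newest
    # (still incomplete) issue instead of the default last complete one.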

    keep_only_tags = [
        dict(name='div', attrs={'id': 'theContent'}),
        dict(name='div', attrs={'id': 'sidebar'}),
        dict(name='div', attrs={'class': 'intro'}),
        dict(name='p', attrs={'class': 'bodytext'}),
        dict(name='div', attrs={'class': 'single_image'}),
    ]

    '''
    brandeins.de
    '''

    def postprocess_html(self, soup, first):
        # Move the image from the sidebar right below the first h3
        first_h3 = soup.find(name='div', attrs={'id': 'theContent'}).find('h3')
        for imgdiv in soup.findAll(name='div', attrs={'class': 'single_image'}):
            if len(first_h3.findNextSiblings('div', {'class': 'intro'})) >= 1:
                # An intro block follows the h3: place the image after it
                first_h3.findNextSiblings('div', {'class': 'intro'})[0].parent.insert(4, imgdiv)
            else:
                first_h3.parent.insert(2, imgdiv)

        # Now remove the sidebar
        soup.find(name='div', attrs={'id': 'sidebar'}).extract()

        # Remove the rating image (stars) from the h3
        for img in first_h3.findAll(name='img'):
            img.extract()

        # Mark the intro texts as italic
        for div in soup.findAll(name='div', attrs={'class': 'intro'}):
            for p in div.findAll('p'):
                content = self.tag_to_string(p)
                new_p = "<p><i>" + content + "</i></p>"
                p.replaceWith(new_p)
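        # NB: feeding replaceWith() a raw HTML string relies on calibre's
        # bundled BeautifulSoup 3 emitting the string unescaped; BeautifulSoup 4
        # would escape it as plain text instead.
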
        # Change <h3> to <h1>
        header = soup.find("h3")
        if header:
            tag = Tag(soup, "h1")
            tag.insert(0, header.contents[0])
            header.replaceWith(tag)

        return soup

    def get_cover(self, soup):
        cover_url = None
        cover_item = soup.find('div', attrs={'class': 'cover_image'})
        if cover_item:
            cover_url = 'http://www.brandeins.de/' + cover_item.img['src']
        return cover_url

    def parse_index(self):
        feeds = []
        issue_map = {}

        archive = "http://www.brandeins.de/archiv.html"

        issue = self.default_issue
        if self.username:
            try:
                issue = int(self.username)
            except ValueError:
                pass

        soup = self.index_to_soup(archive)
        issue_list = soup.findAll('div', attrs={'class': 'tx-brandeinsmagazine-pi1'})[0].findAll('a')
        issue_list = [i for i in issue_list if i.get('onmouseover', False)]
        for i in issue_list:
            issue_number_string = i.get('onmouseover', False)
            if issue_number_string:
                match = re.match(r"^switch_magazine\(([0-9]+), ([0-9]+)\)$", issue_number_string)
                if match is None:
                    continue
                issue_number = "%04i%02i" % (int(match.group(1)), int(match.group(2)))
                issue_map[issue_number] = i
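
        # issue_map now maps sortable "YYYYMM" keys to the archive links,
        # e.g. onmouseover="switch_magazine(2010, 7)" yields the key "201007".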
        keys = sorted(issue_map.keys(), reverse=True)
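        # Keys sort newest first, so keys[issue - 1] is the issue-th newest
        # magazine (1 = current issue, 2 = last complete one, ...).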
        selected_issue_key = keys[issue - 1]
        selected_issue = issue_map[selected_issue_key]
        url = selected_issue.get('href', False)
        # Title option 1: bake the issue and year into the magazine title:
        # self.title = "brand eins " + selected_issue_key[4:] + "/" + selected_issue_key[0:4]
        # Title option 2 (used here): keep the plain title and show issue/year
        # via timefmt instead.
        url = 'http://brandeins.de/' + url
        self.timefmt = ' ' + selected_issue_key[4:] + '/' + selected_issue_key[:4]

        # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
        titles_and_articles = self.brand_eins_parse_issue(url)
        if titles_and_articles:
            for title, articles in titles_and_articles:
                feeds.append((title, articles))
        return feeds

    def brand_eins_parse_issue(self, url):
        soup = self.index_to_soup(url)
        self.cover_url = self.get_cover(soup)
        article_lists = [
            soup.find('div', attrs={'class': 'subColumnLeft articleList'}),
            soup.find('div', attrs={'class': 'subColumnRight articleList'}),
        ]

        titles_and_articles = []
        current_articles = []
        chapter_title = "Editorial"
        self.log('Found Chapter:', chapter_title)

        # Remove the last list of links (just the imprint and the prize draw)
        article_lists[1].findAll('ul')[-1].extract()

        for article_list in article_lists:
            for chapter in article_list.findAll('ul'):
                if len(chapter.findPreviousSiblings('h3')) >= 1:
                    new_chapter_title = string.capwords(self.tag_to_string(chapter.findPreviousSiblings('h3')[0]))
                    if new_chapter_title != chapter_title:
                        titles_and_articles.append([chapter_title, current_articles])
                        current_articles = []
                        self.log('Found Chapter:', new_chapter_title)
                        chapter_title = new_chapter_title
                for li in chapter.findAll('li'):
                    a = li.find('a', href=True)
                    if a is None:
                        continue
                    title = self.tag_to_string(a)
                    url = a.get('href', False)
                    if not url or not title:
                        continue
                    url = 'http://brandeins.de/' + url
                    if len(a.parent.findNextSiblings('p')) >= 1:
                        # The paragraph after the link holds the teaser text
                        description = self.tag_to_string(a.parent.findNextSiblings('p')[0])
                    else:
                        description = ''

                    self.log('\t\tFound article:', title)
                    self.log('\t\t\t', url)
                    self.log('\t\t\t', description)

                    current_articles.append({'title': title, 'url': url, 'description': description, 'date': ''})
        titles_and_articles.append([chapter_title, current_articles])
        return titles_and_articles
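
# A minimal sketch of how to test this recipe from the command line (the
# file name brand_eins.recipe is just an example); the username field is how
# the issue number is passed in:
#
#   ebook-convert brand_eins.recipe brand_eins.epub --username 1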