calibre/recipes/respekt_magazine.recipe

173 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright: tomashnyk@gmail.com
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'tomashnyk@gmail.com'
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
# This imports the version bundled with Calibre
import lxml
from lxml.builder import E
respekt_url = 'http://www.respekt.cz'
class respektRecipe(BasicNewsRecipe):
    # Calibre recipe that downloads the current print edition of the Czech
    # weekly Respekt (respekt.cz) for logged-in subscribers.
    __author__ = 'Tomáš Hnyk'
    publisher = u'Respekt Publishing a. s.'
    description = u'Articles from the print edition'
    title = u'Respekt Magazine Print'
    encoding = 'utf-8'
    language = 'cs'
    # Tiny politeness delay between article fetches (seconds).
    delay = 0.001
    remove_javascript = True
    # Keep only the article proper: drop everything before the <h1> headline
    # and everything after the post content container.
    remove_tags_before = dict(name='h1')
    remove_tags_after = [dict(id='postcontent')]
    # Strip ads, share widgets, surveys, galleries, inline quotes and scripts.
    remove_tags = [
        dict(name='div',attrs={'id':['postsharepopup','survey-respondents']}),
        dict(name='div',attrs={'class':['ad','ad-content','adinarticle','ad-caption','post-actions','authorship-note','quote','postgallery']}),
        dict(name='a',attrs={'class':['quote','authorship-face']}),
        dict(name='span',attrs={'class':'embed'}),
        dict(name='svg'),
        dict(name='script')
    ]
    # Print-like typography; .indent_first_line is applied in preprocess_html,
    # .box-* style the grey side boxes, .post-subtitle is created from the perex.
    extra_css = 'p {text-align:justify;margin-top:0;margin-bottom:0} \
        ul {color:black} \
        .frame-caption {font-weight:normal;font-size:50%;font-style:italic;} \
        h1 {font-size:150%;margin-bottom:0;} \
        h2 {font-size:100%;margin-bottom:0;} \
        .post-subtitle {margin-top:0;} \
        h3 {font-size:100%;margin-bottom:0;margin-top:0;} \
        .box-title {background-color: lightgray;font-size:150%;font-weight:bold;margin-left:12%;margin-right:12%;margin-top:12%;margin-bottom:0;} \
        .box-content {background-color:lightgray;margin-left:12%;margin-right:12%;margin-top:0;margin-bottom:12%;} \
        p.indent_first_line {text-indent:30px;} \
        a {text-decoration:none;color:black;}'
    # Respekt articles are paywalled; calibre will prompt for credentials.
    needs_subscription = True
    def get_browser(self):
        # Return a mechanize browser, logged in at respekt.cz when the user
        # supplied both a username and a password.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.respekt.cz/uzivatel/prihlaseni')
            # Select the login form by its HTML id.
            for form in br.forms():
                if form.attrs.get('id') == 'frm-loginForm':
                    br.form = form
                    break
            # NOTE(review): relies on the loop variable `form` still being
            # bound after `break`; if the form is ever missing, this fills
            # whatever form came last — confirm the id is stable.
            for c in form.controls:
                if 'name' in c.attrs:
                    if c.attrs['name'] == 'username':
                        c.value = self.username
                    if c.attrs['name'] == 'password':
                        c.value = self.password
            br.submit()
        return br
# So that remove_tags_before works for this section
def preprocess_raw_html(self, raw_html, url):
root = lxml.html.fromstring(raw_html)
if root.xpath("//title")[0].text == (u"Respekt • Despekt • RESPEKT"):
raw_html = re.sub("h2","h1",raw_html)
return raw_html
    def parse_index(self):
        # Build the issue index: locate the current edition from the weekly
        # overview page, then collect (section_name, [articles]) pairs.
        raw1 = self.index_to_soup('http://www.respekt.cz/tydenik/', raw=True)
        root1 = lxml.html.fromstring(raw1)
        # First attribute of the hero-issue link is taken as its href —
        # TODO confirm attribute order is stable on the live page.
        current_edition_url = root1.xpath("//div[@class='heroissue']/a")[0].items()[0][1]
        raw2 = self.index_to_soup('http://www.respekt.cz/' + current_edition_url, raw=True)
        root2 = lxml.html.fromstring(raw2)
        self.cover_url = root2.xpath("//i[contains(@class, 'heroissue-cover')]")[0].get("data-src")
        # Fetch date
        date_text = root2.xpath("//time[@class='heroissue-date']")[0].text.split(',')[1]
        s = date_text.split(" ")
        # Are the dates of the issue in the same month and year?
        # The word count of the date text distinguishes the formats.
        if len(s) == 4 or len(s) == 7:
            date = "/".join([s[1].split(".")[0],s[2].split(".")[0],s[-1]])
        elif len(s) == 8:
            date = "/".join([s[1].split(".")[0],s[2].split(".")[0],s[3]])
        # NOTE(review): if the date text matches neither branch, `date` is
        # unbound and the next line raises NameError — confirm the site
        # never emits another format.
        self.conversion_options = {'pubdate':date}
        # Title like "Respekt magazine #NN/YYYY" from the edition URL tail.
        self.title = "Respekt magazine #" + "/".join(current_edition_url.split("/")[-1:-3:-1])
        ans = []
        for section in root2.xpath("//div[@class='col-md-6']/div[@class='issuedetail-categorized-sectionname']"):
            section_name = section.text
            articles = []
            # Article links are the following siblings of the section header;
            # their own .text is whitespace (titles live in child spans), so
            # the loop stops at the first sibling with real text.
            article = section.getnext()
            while hasattr(article, 'text') and not article.text.strip():
                title = article.xpath("span[@class='issuedetail-categorized-title']")[0].text
                url = respekt_url + article.xpath("@href")[0]
                articles.append({'title':title,'url':url})
                article = article.getnext()
            ans.append((section_name,articles))
        return ans
def cleanup(self):
self.browser.open('https://www.respekt.cz/?do=logout')
def preprocess_html(self,soup):
raw = u''.join(type(u'')(a) for a in soup.contents)
root = lxml.html.fromstring(raw)
# Fix Letem světem
if "Letem sv" in root.xpath("//title")[0].text:
p = root.xpath("//p")
for par in p[:]:
next = par.getnext()
if par.getchildren():
child = par.getchildren()[0]
if hasattr(next,"tag") and next.tag == "h2" and hasattr(child,"tag") and child.tag == "strong":
text = child.text_content()
if next.text:
next.text = next.text + u"" + text
else:
if next.getchildren():
next_child = next.getchildren()[0]
next_child.text = next_child.text + u"" + text
par.getparent().remove(par)
# Insert text length
text = root.xpath("//div[@id='postcontent']")[0]
article_length = u"" + str(len(text.text_content().split(' '))) + ' slov'
try:
aut = root.xpath("//div[@class='authorship-names']")[0]
if aut.getchildren() and aut.getchildren()[0].tag == 'a':
for i,t in enumerate(aut.getchildren()):
if i == 0:
t.text = 'Autor: ' + t.text + ' '
else:
t.text = t.text + ' '
# Remove link
e = E.span(t.text)
t.getparent().replace(t,e)
else:
t = root.xpath("//span[@class='post-author-name']")[0]
t.text = ('Autor: ' + t.text + ' ')
root.xpath("//div[@class='authorship-names']")[0].append(E.span(article_length))
except:
pass
# Make images visible
pictures = root.xpath("//picture")
for picture in pictures:
image = picture.xpath("//source")[0]
image_link = [a for a in image.get('srcset').split(' ') if a[:4] == "http"][-1]
e=E.img({"src":image_link})
picture.getparent().replace(picture,e)
# Properly indent
paragraphs = root.xpath('//p')
paragraphs.reverse()
# First paragraph is never indented
for par in paragraphs[:-1]:
prev = par.getprevious()
# Do not indent after headings
if hasattr(prev,'tag') and not (prev.tag in ['h2','h3']):
par.attrib['class']="indent_first_line"
# Fix subtitle for Téma
try:
o = root.xpath("//p[@class='post-perex']")[0]
e = E.h2({"class":"post-subtitle"})
e.text = o.text
o.getparent().replace(o,e)
except:
pass
return(BeautifulSoup(lxml.etree.tostring(root,encoding='unicode')))