Update Respekt Magazine
Merge branch 'master' of https://github.com/felagund/calibre
This commit is contained in: commit 72a211f6c2

recipes/respekt_magazine.recipe (new file, 189 lines)
@@ -0,0 +1,189 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright: tomashnyk@gmail.com

__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'tomashnyk@gmail.com'

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
# This imports the version bundled with Calibre
import lxml.etree
import lxml.html
from lxml.builder import E

respekt_url = 'http://www.respekt.cz'

class respektRecipe(BasicNewsRecipe):
    __author__ = 'Tomáš Hnyk'
    publisher = u'Respekt Publishing a. s.'
    description = u'Articles from the print edition'
    encoding = 'utf-8'
    language = 'cs'
    remove_javascript = True
    remove_tags_before = dict(name='h1')
    remove_tags_after = [dict(id='postcontent')]
    remove_tags = [dict(name='div',attrs={'id':['postsharepopup','survey-respondents']}),
                   dict(name='div',attrs={'class':['ad','ad-content','adinarticle','ad-caption','post-actions','authorship-note','quote','postgallery']}),
                   dict(name='a',attrs={'class':['quote','authorship-face']}),
                   dict(name='span',attrs={'class':'embed'}),
                   dict(name='svg'),
                   dict(name='script')]

    extra_css = 'p {text-align:justify;margin-top:0;margin-bottom:0} \
        ul {color:black} \
        .frame-caption {font-weight:normal;font-size:50%;font-style:italic;} \
        h1 {font-size:150%;margin-bottom:0;} \
        h2 {font-size:100%;margin-bottom:0;} \
        .post-subtitle {margin-top:0;} \
        h3 {font-size:100%;margin-bottom:0;margin-top:0;} \
        .box-title {background-color: lightgray;font-size:150%;font-weight:bold;margin-left:12%;margin-right:12%;margin-top:12%;margin-bottom:0;} \
        .box-content {background-color:lightgray;margin-left:12%;margin-right:12%;margin-top:0;margin-bottom:12%;} \
        p.indent_first_line {text-indent:30px;} \
        a {text-decoration:none;color:black;}'

    needs_subscription = True

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.respekt.cz/')
            # Locate the login form, then fill in the credentials and submit
            for form in br.forms():
                if form.attrs.get('id') == 'frm-authBox-loginForm':
                    br.form = form
                    break
            for c in form.controls:
                if 'name' in c.attrs:
                    if c.attrs['name'] == 'username':
                        c.value = self.username
                    if c.attrs['name'] == 'password':
                        c.value = self.password
            br.submit()
        return br

    # So that remove_tags_before works for this section
    def preprocess_raw_html(self, raw_html, url):
        root = lxml.html.fromstring(raw_html)
        if root.xpath("//title")[0].text == (u"Respekt • Despekt • RESPEKT"):
            raw_html = re.sub("h2","h1",raw_html)
        return raw_html

    def parse_index(self):
        raw1 = self.index_to_soup('http://www.respekt.cz/tydenik/', raw=True)
        root1 = lxml.html.fromstring(raw1)
        current_edition_url = root1.xpath("//div[@class='heroissue']/a")[0].items()[0][1]
        raw2 = self.index_to_soup('http://www.respekt.cz/' + current_edition_url, raw=True)
        root2 = lxml.html.fromstring(raw2)
        self.cover_url = root2.xpath("//i[contains(@class, 'heroissue-cover')]")[0].get("data-src")
        # Fetch date
        date_text = root2.xpath("//time[@class='heroissue-date']")[0].text.split(',')[1]
        s = date_text.split(" ")
        # Are the dates of the issue in the same month and year?
        if len(s) == 4 or len(s) == 7:
            date = "/".join([s[1].split(".")[0],s[2].split(".")[0],s[-1]])
        elif len(s) == 8:
            date = "/".join([s[1].split(".")[0],s[2].split(".")[0],s[3]])
        self.conversion_options = {'pubdate':date}
        self.title = "Respekt magazine #" + "/".join(current_edition_url.split("/")[-1:-3:-1])
        ans = []
        for section in root2.xpath("//div[@class='col-md-6']/div[@class='issuedetail-categorized-sectionname']"):
            section_name = section.text
            articles = []
            article = section.getnext()
            while hasattr(article, 'text') and not article.text.strip():
                title = article.xpath("span[@class='issuedetail-categorized-title']")[0].text
                url = respekt_url + article.xpath("@href")[0]
                articles.append({'title':title,'url':url})
                article = article.getnext()
            ans.append((section_name,articles))
        # Highlighted articles are listed separately on the issue page,
        # so put each of them back at the start of the section it belongs to
        highlights = zip(root2.xpath("//a[@class='issuedetail-highlighted-item']"),root2.xpath("//div[@class='issuedetail-highlighted-title']"))
        highlights.reverse()
        sections = [i[0] for i in ans]
        for l,t in highlights:
            title = t.text
            link = l.xpath('@href')[0]
            raw3 = self.index_to_soup(respekt_url + link, raw=True)
            root3 = lxml.html.fromstring(raw3)
            topics = [i.text.strip() for i in root3.xpath("//div[contains(@class, 'post-topics')]/a")]
            # The name of the section changes its position
            if u"Téma" in topics:
                section_name = "Fokus"
            elif u"Rozhovor" in topics:
                section_name = "Rozhovor"
            else:
                for t in topics:
                    if t in sections:
                        section_name = t
                        break
            for i in ans:
                if i[0] == section_name:
                    i[1].insert(-(len(i[1])),{'title':title,'url':respekt_url+link})
            if section_name == u"Rozhovor":
                ans.insert(sections.index(u'Fokus')+1,(section_name,[{'title':title,'url':respekt_url+link}]))
        return ans

    def cleanup(self):
        self.browser.open('https://www.respekt.cz/?do=logout')

    def preprocess_html(self,soup):
        raw = u''.join(unicode(a) for a in soup.contents)
        root = lxml.html.fromstring(raw)
        # Fix Letem světem
        if "Letem sv" in root.xpath("//title")[0].text:
            p = root.xpath("//p")
            for par in p[:]:
                next = par.getnext()
                if par.getchildren():
                    child = par.getchildren()[0]
                    if hasattr(next,"tag") and next.tag == "h2" and hasattr(child,"tag") and child.tag == "strong":
                        text = child.text_content()
                        if next.text:
                            next.text = next.text + u" • " + text
                        else:
                            if next.getchildren():
                                next_child = next.getchildren()[0]
                                next_child.text = next_child.text + u" • " + text
                        par.getparent().remove(par)
        # Insert text length
        text = root.xpath("//div[@id='postcontent']")[0]
        article_length = u" • " + str(len(text.text_content().split(' '))) + ' slov'
        try:
            aut = root.xpath("//div[@class='authorship-names']")[0]
            if aut.getchildren() and aut.getchildren()[0].tag == 'a':
                t = aut.getchildren()[0]
                t.text = 'Autor: ' + t.text + ' '
                # Remove link
                e = E.span(t.text)
                t.getparent().replace(t,e)
            else:
                t = root.xpath("//span[@class='post-author-name']")[0]
                t.text = ('Autor: ' + t.text + ' ')
            root.xpath("//div[@class='authorship-names']")[0].append(E.span(article_length))
        except:
            pass
        # Make images visible
        pictures = root.xpath("//picture")
        for picture in pictures:
            # Search relative to this <picture> element, not the whole document
            image = picture.xpath(".//source")[0]
            image_link = [a for a in image.get('srcset').split(' ') if a[:4] == "http"][-1]
            e = E.img({"src":image_link})
            picture.getparent().replace(picture,e)
        # Properly indent
        paragraphs = root.xpath('//p')
        paragraphs.reverse()
        # First paragraph is never indented
        for par in paragraphs[:-1]:
            prev = par.getprevious()
            # Do not indent after headings
            if hasattr(prev,'tag') and not (prev.tag in ['h2','h3']):
                par.attrib['class'] = "indent_first_line"
        # Fix subtitle for Téma
        try:
            o = root.xpath("//p[@class='post-perex']")[0]
            e = E.h2({"class":"post-subtitle"})
            e.text = o.text
            o.getparent().replace(o,e)
        except:
            pass
        return BeautifulSoup(lxml.etree.tostring(root,encoding=unicode))
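
To try the recipe locally, it can be fed straight to calibre's ebook-convert command. This is only a sketch, assuming a working calibre installation and a valid Respekt subscription; the credentials and output file name below are placeholders:

    ebook-convert respekt_magazine.recipe respekt.epub --username you@example.com --password 'your-password'

Adding --test makes calibre fetch only a couple of articles per section, which keeps the run short while the XPath expressions are being debugged.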