#!/usr/bin/python
# -*- coding: utf-8 -*-
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright: tomashnyk@gmail.com

__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'tomashnyk@gmail.com'

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
# These import the lxml version bundled with Calibre.
# NOTE: ``lxml.html``/``lxml.etree`` must be imported explicitly — a bare
# ``import lxml`` does not make the ``html`` submodule available.
import lxml.etree
import lxml.html
from lxml.builder import E

respekt_url = 'http://www.respekt.cz'


class respektRecipe(BasicNewsRecipe):
    """Download articles of the current print edition of Respekt magazine.

    Requires a subscription (username/password); logs in in get_browser()
    and logs out again in cleanup().
    """

    __author__ = 'Tomáš Hnyk'
    publisher = u'Respekt Publishing a. s.'
    description = u'Articles from the print edition'
    encoding = 'utf-8'
    language = 'cs'
    remove_javascript = True
    remove_tags_before = dict(name='h1')
    remove_tags_after = [dict(id='postcontent')]
    remove_tags = [dict(name='div',attrs={'id':['postsharepopup','survey-respondents']}),
        dict(name='div',attrs={'class':['ad','ad-content','adinarticle','ad-caption','post-actions','authorship-note','quote','postgallery']}),
        dict(name='a',attrs={'class':['quote','authorship-face']}),
        dict(name='span',attrs={'class':'embed'}),
        dict(name='svg'),
        dict(name='script')]

    extra_css = 'p {text-align:justify;margin-top:0;margin-bottom:0} \
        ul {color:black} \
        .frame-caption {font-weight:normal;font-size:50%;font-style:italic;} \
        h1 {font-size:150%;margin-bottom:0;} \
        h2 {font-size:100%;margin-bottom:0;} \
        .post-subtitle {margin-top:0;} \
        h3 {font-size:100%;margin-bottom:0;margin-top:0;} \
        .box-title {background-color: lightgray;font-size:150%;font-weight:bold;margin-left:12%;margin-right:12%;margin-top:12%;margin-bottom:0;} \
        .box-content {background-color:lightgray;margin-left:12%;margin-right:12%;margin-top:0;margin-bottom:12%;} \
        p.indent_first_line {text-indent:30px;} \
        a {text-decoration:none;color:black;}'

    needs_subscription = True

    def get_browser(self):
        """Return a logged-in browser (fills in the respekt.cz login form)."""
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.respekt.cz/')
            for form in br.forms():
                if form.attrs.get('id') == 'frm-authBox-loginForm':
                    br.form = form
                    break
            for c in br.form.controls:
                if 'name' in c.attrs:
                    if c.attrs['name'] == 'username':
                        c.value = self.username
                    if c.attrs['name'] == 'password':
                        c.value = self.password
            br.submit()
        return br

    # So that remove_tags_before works for this section
    def preprocess_raw_html(self, raw_html, url):
        """Promote h2 to h1 on the Despekt page so remove_tags_before matches."""
        root = lxml.html.fromstring(raw_html)
        if root.xpath("//title")[0].text == (u"Respekt • Despekt • RESPEKT"):
            # Only rewrite actual <h2>/</h2> tags, not every "h2" substring
            # that may occur in URLs or attribute values.
            raw_html = re.sub(r"(</?)h2", r"\1h1", raw_html)
        return raw_html

    def parse_index(self):
        """Build the list of (section, articles) for the current issue."""
        raw1 = self.index_to_soup('http://www.respekt.cz/tydenik/', raw=True)
        root1 = lxml.html.fromstring(raw1)
        current_edition_url = root1.xpath("//div[@class='heroissue']/a")[0].items()[0][1]
        raw2 = self.index_to_soup('http://www.respekt.cz/' + current_edition_url, raw=True)
        root2 = lxml.html.fromstring(raw2)
        self.cover_url = root2.xpath("//i[contains(@class, 'heroissue-cover')]")[0].get("data-src")
        # Fetch date
        date_text = root2.xpath("//time[@class='heroissue-date']")[0].text.split(',')[1]
        s = date_text.split(" ")
        # Are the dates of the issue in the same month and year?
        date = None
        if len(s) == 4 or len(s) == 7:
            date = "/".join([s[1].split(".")[0],s[2].split(".")[0],s[-1]])
        elif len(s) == 8:
            date = "/".join([s[1].split(".")[0],s[2].split(".")[0],s[3]])
        # Only set a pubdate when the date string was in a recognized format;
        # previously an unrecognized format raised NameError here.
        if date is not None:
            self.conversion_options = {'pubdate':date}
        self.title = "Respekt magazine #" + "/".join(current_edition_url.split("/")[-1:-3:-1])
        ans = []
        for section in root2.xpath("//div[@class='col-md-6']/div[@class='issuedetail-categorized-sectionname']"):
            section_name = section.text
            articles = []
            article = section.getnext()
            # Article anchors carry only whitespace text of their own (the
            # title lives in a child span); stop at the first non-anchor
            # sibling.  Guard against .text being None.
            while hasattr(article, 'text') and not (article.text or '').strip():
                title = article.xpath("span[@class='issuedetail-categorized-title']")[0].text
                url = respekt_url + article.xpath("@href")[0]
                articles.append({'title':title,'url':url})
                article = article.getnext()
            ans.append((section_name,articles))
        # list() is needed on Python 3, where zip() returns an iterator
        # without a .reverse() method.
        highlights = list(zip(root2.xpath("//a[@class='issuedetail-highlighted-item']"),
                              root2.xpath("//div[@class='issuedetail-highlighted-title']")))
        highlights.reverse()
        sections = [i[0] for i in ans]
        for l,t in highlights:
            title = t.text
            link = l.xpath('@href')[0]
            raw3 = self.index_to_soup(respekt_url + link, raw=True)
            root3 = lxml.html.fromstring(raw3)
            topics = [i.text.strip() for i in root3.xpath("//div[contains(@class, 'post-topics')]/a")]
            # The name of the section changes its position
            if u"Téma" in topics:
                section_name = "Fokus"
            elif u"Rozhovor" in topics:
                section_name = "Rozhovor"
            else:
                for t in topics:
                    if t in sections:
                        section_name = t
                        break
            for i in ans:
                if i[0] == section_name:
                    # Highlighted articles go to the front of their section.
                    i[1].insert(0,{'title':title,'url':respekt_url+link})
            if section_name == u"Rozhovor":
                ans.insert(sections.index(u'Fokus')+1,(section_name,[{'title':title,'url':respekt_url+link}]))
        return ans

    def cleanup(self):
        """Log out so the subscription session is not left open."""
        self.browser.open('https://www.respekt.cz/?do=logout')

    def preprocess_html(self,soup):
        raw = u''.join(str(a) for a in soup.contents)
        root = lxml.html.fromstring(raw)
        # Fix Letem světem: merge each lead-in paragraph (whose first child is
        # a <strong>) into the following h2 heading, separated by a bullet.
        if "Letem sv" in root.xpath("//title")[0].text:
            p = root.xpath("//p")
            for par in p[:]:
                next = par.getnext()
                if par.getchildren():
                    child = par.getchildren()[0]
                    if hasattr(next,"tag") and next.tag == "h2" and hasattr(child,"tag") and child.tag == "strong":
                        text = child.text_content()
                        if next.text:
                            next.text = next.text + u" • " + text
                        else:
                            if next.getchildren():
                                next_child = next.getchildren()[0]
                                next_child.text = next_child.text + u" • " + text
                        par.getparent().remove(par)
        # Insert text length
        text = root.xpath("//div[@id='postcontent']")[0]
        article_length = u" • " + str(len(text.text_content().split(' '))) + ' slov'
        try:
            aut = root.xpath("//div[@class='authorship-names']")[0]
            if aut.getchildren() and aut.getchildren()[0].tag == 'a':
                t = aut.getchildren()[0]
                t.text = 'Autor: ' + t.text + ' '
                # Remove link
                e = E.span(t.text)
                t.getparent().replace(t,e)
            else:
                t = root.xpath("//span[@class='post-author-name']")[0]
                t.text = ('Autor: ' + t.text + ' ')
            root.xpath("//div[@class='authorship-names']")[0].append(E.span(article_length))
        except Exception:
            # Best effort: some pages have no authorship block at all.
            pass
        # Make images visible.  The XPath must be relative (".//source"):
        # an absolute "//source" would always pick the document's first
        # <source>, giving every picture the same image.
        pictures = root.xpath("//picture")
        for picture in pictures:
            image = picture.xpath(".//source")[0]
            image_link = [a for a in image.get('srcset').split(' ') if a[:4] == "http"][-1]
            e=E.img({"src":image_link})
            picture.getparent().replace(picture,e)
        # Properly indent
        paragraphs = root.xpath('//p')
        paragraphs.reverse()
        # First paragraph is never indented
        for par in paragraphs[:-1]:
            prev = par.getprevious()
            # Do not indent after headings
            if hasattr(prev,'tag') and not (prev.tag in ['h2','h3']):
                par.attrib['class']="indent_first_line"
        # Fix subtitle for Téma
        try:
            o = root.xpath("//p[@class='post-perex']")[0]
            e = E.h2({"class":"post-subtitle"})
            e.text = o.text
            o.getparent().replace(o,e)
        except Exception:
            pass
        # encoding=str makes lxml return a unicode string (Python 3;
        # the original passed the py2-only name ``unicode`` here).
        return(BeautifulSoup(lxml.etree.tostring(root,encoding=str)))