calibre/recipes/respekt_magazine.recipe

173 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright: tomashnyk@gmail.com
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'tomashnyk@gmail.com'
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
# This imports the version bundled with Calibre
import lxml
from lxml.builder import E
respekt_url = 'http://www.respekt.cz'
class respektRecipe(BasicNewsRecipe):
    # Calibre recipe that downloads the current print edition of the Czech
    # weekly Respekt (respekt.cz) for logged-in subscribers.
    __author__ = 'Tomáš Hnyk'
    publisher = u'Respekt Publishing a. s.'
    description = u'Articles from the print edition'
    title = u'Respekt Magazine Print'
    encoding = 'utf-8'
    language = 'cs'
    # Tiny politeness delay between article fetches (seconds).
    delay = 0.001
    remove_javascript = True
    # Keep only the article proper: drop everything before the <h1> headline
    # and everything after the post content container.
    remove_tags_before = dict(name='h1')
    remove_tags_after = [dict(id='postcontent')]
    # Strip ads, share widgets, surveys, galleries, inline quotes and scripts.
    remove_tags = [
        dict(name='div',attrs={'id':['postsharepopup','survey-respondents']}),
        dict(name='div',attrs={'class':['ad','ad-content','adinarticle','ad-caption','post-actions','authorship-note','quote','postgallery']}),
        dict(name='a',attrs={'class':['quote','authorship-face']}),
        dict(name='span',attrs={'class':'embed'}),
        dict(name='svg'),
        dict(name='script')
    ]
    # Print-like typography; .indent_first_line is applied in preprocess_html,
    # .box-* style the grey side boxes, .post-subtitle is created from the perex.
    extra_css = 'p {text-align:justify;margin-top:0;margin-bottom:0} \
        ul {color:black} \
        .frame-caption {font-weight:normal;font-size:50%;font-style:italic;} \
        h1 {font-size:150%;margin-bottom:0;} \
        h2 {font-size:100%;margin-bottom:0;} \
        .post-subtitle {margin-top:0;} \
        h3 {font-size:100%;margin-bottom:0;margin-top:0;} \
        .box-title {background-color: lightgray;font-size:150%;font-weight:bold;margin-left:12%;margin-right:12%;margin-top:12%;margin-bottom:0;} \
        .box-content {background-color:lightgray;margin-left:12%;margin-right:12%;margin-top:0;margin-bottom:12%;} \
        p.indent_first_line {text-indent:30px;} \
        a {text-decoration:none;color:black;}'
    # Respekt articles are paywalled; calibre will prompt for credentials.
    needs_subscription = True
    def get_browser(self):
        # Return a mechanize browser, logged in at respekt.cz when the user
        # supplied both a username and a password.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.respekt.cz/uzivatel/prihlaseni')
            # Select the login form by its HTML id.
            for form in br.forms():
                if form.attrs.get('id') == 'frm-loginForm':
                    br.form = form
                    break
            # NOTE(review): relies on the loop variable `form` still being
            # bound after `break`; if the form is ever missing, this fills
            # whatever form came last — confirm the id is stable.
            for c in form.controls:
                if 'name' in c.attrs:
                    if c.attrs['name'] == 'username':
                        c.value = self.username
                    if c.attrs['name'] == 'password':
                        c.value = self.password
            br.submit()
        return br
# So that remove_tags_before works for this section
def preprocess_raw_html(self, raw_html, url):
root = lxml.html.fromstring(raw_html)
if root.xpath("//title")[0].text == (u"Respekt • Despekt • RESPEKT"):
raw_html = re.sub("h2","h1",raw_html)
return raw_html
    def parse_index(self):
        # Build the issue index: locate the current edition from the weekly
        # overview page, then collect (section_name, [articles]) pairs.
        raw1 = self.index_to_soup('http://www.respekt.cz/tydenik/', raw=True)
        root1 = lxml.html.fromstring(raw1)
        # First attribute of the hero-issue link is taken as its href —
        # TODO confirm attribute order is stable on the live page.
        current_edition_url = root1.xpath("//div[@class='heroissue']/a")[0].items()[0][1]
        raw2 = self.index_to_soup('http://www.respekt.cz/' + current_edition_url, raw=True)
        root2 = lxml.html.fromstring(raw2)
        self.cover_url = root2.xpath("//i[contains(@class, 'heroissue-cover')]")[0].get("data-src")
        # Fetch date
        date_text = root2.xpath("//time[@class='heroissue-date']")[0].text.split(',')[1]
        s = date_text.split(" ")
        # Are the dates of the issue in the same month and year?
        # The word count of the date text distinguishes the formats.
        if len(s) == 4 or len(s) == 7:
            date = "/".join([s[1].split(".")[0],s[2].split(".")[0],s[-1]])
        elif len(s) == 8:
            date = "/".join([s[1].split(".")[0],s[2].split(".")[0],s[3]])
        # NOTE(review): if the date text matches neither branch, `date` is
        # unbound and the next line raises NameError — confirm the site
        # never emits another format.
        self.conversion_options = {'pubdate':date}
        # Title like "Respekt magazine #NN/YYYY" from the edition URL tail.
        self.title = "Respekt magazine #" + "/".join(current_edition_url.split("/")[-1:-3:-1])
        ans = []
        for section in root2.xpath("//div[@class='col-md-6']/div[@class='issuedetail-categorized-sectionname']"):
            section_name = section.text
            articles = []
            # Article links are the following siblings of the section header;
            # their own .text is whitespace (titles live in child spans), so
            # the loop stops at the first sibling with real text.
            article = section.getnext()
            while hasattr(article, 'text') and not article.text.strip():
                title = article.xpath("span[@class='issuedetail-categorized-title']")[0].text
                url = respekt_url + article.xpath("@href")[0]
                articles.append({'title':title,'url':url})
                article = article.getnext()
            ans.append((section_name,articles))
        return ans
def cleanup(self):
self.browser.open('https://www.respekt.cz/?do=logout')
def preprocess_html(self,soup):
raw = u''.join(type(u'')(a) for a in soup.contents)
root = lxml.html.fromstring(raw)
# Fix Letem světem
if "Letem sv" in root.xpath("//title")[0].text:
p = root.xpath("//p")
for par in p[:]:
next = par.getnext()
if par.getchildren():
child = par.getchildren()[0]
if hasattr(next,"tag") and next.tag == "h2" and hasattr(child,"tag") and child.tag == "strong":
text = child.text_content()
if next.text:
next.text = next.text + u"" + text
else:
if next.getchildren():
next_child = next.getchildren()[0]
next_child.text = next_child.text + u"" + text
par.getparent().remove(par)
# Insert text length
text = root.xpath("//div[@id='postcontent']")[0]
article_length = u"" + str(len(text.text_content().split(' '))) + ' slov'
try:
aut = root.xpath("//div[@class='authorship-names']")[0]
if aut.getchildren() and aut.getchildren()[0].tag == 'a':
for i,t in enumerate(aut.getchildren()):
if i == 0:
t.text = 'Autor: ' + t.text + ' '
else:
t.text = t.text + ' '
# Remove link
e = E.span(t.text)
t.getparent().replace(t,e)
else:
t = root.xpath("//span[@class='post-author-name']")[0]
t.text = ('Autor: ' + t.text + ' ')
root.xpath("//div[@class='authorship-names']")[0].append(E.span(article_length))
except:
pass
# Make images visible
pictures = root.xpath("//picture")
for picture in pictures:
image = picture.xpath("//source")[0]
image_link = [a for a in image.get('srcset').split(' ') if a[:4] == "http"][-1]
e=E.img({"src":image_link})
picture.getparent().replace(picture,e)
# Properly indent
paragraphs = root.xpath('//p')
paragraphs.reverse()
# First paragraph is never indented
for par in paragraphs[:-1]:
prev = par.getprevious()
# Do not indent after headings
if hasattr(prev,'tag') and not (prev.tag in ['h2','h3']):
par.attrib['class']="indent_first_line"
# Fix subtitle for Téma
try:
o = root.xpath("//p[@class='post-perex']")[0]
e = E.h2({"class":"post-subtitle"})
e.text = o.text
o.getparent().replace(o,e)
except:
pass
return(BeautifulSoup(lxml.etree.tostring(root,encoding='unicode')))