#!/usr/bin/python
# -*- coding: utf-8 -*-
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright: tomashnyk@gmail.com

__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'tomashnyk@gmail.com'

import re

import lxml
import lxml.etree
import lxml.html
from lxml.builder import E

from calibre.ebooks.BeautifulSoup import BeautifulSoup  # This imports the version bundled with Calibre
from calibre.web.feeds.recipes import BasicNewsRecipe

respekt_url = 'http://www.respekt.cz'

# Matches only the tag name of opening/closing <h2> tags, so that URLs,
# attribute values and article text containing "h2" are left untouched.
_H2_TAG = re.compile(r'(</?)h2\b')


def _page_title(root):
    """Return the text of the first <title> element, or u'' if there is none."""
    titles = root.xpath("//title")
    if titles and titles[0].text:
        return titles[0].text
    return u''


def _issue_start_date(date_text):
    """Extract the first day of an issue as 'day/month/year'.

    *date_text* is the part of the issue heading after the comma. It is split
    on single spaces and the number of tokens decides where the year is:
    4 or 7 tokens mean the year is the last token, 8 tokens mean both dates
    carry their own year and the first one is at index 3.

    Returns None when the token count matches none of the known layouts.
    """
    s = date_text.split(" ")
    # Are the dates of the issue in the same month and year?
    if len(s) in (4, 7):
        year = s[-1]
    elif len(s) == 8:
        year = s[3]
    else:
        return None
    return "/".join([s[1].split(".")[0], s[2].split(".")[0], year])


def _merge_letem_svetem(root):
    """Fold a bold lead-in paragraph into the <h2> heading that follows it."""
    for par in root.xpath("//p"):
        heading = par.getnext()
        children = par.getchildren()
        # Reset on every paragraph so a child of a previous paragraph is
        # never reused for a paragraph without children.
        child = children[0] if children else None
        if heading is None or heading.tag != "h2":
            continue
        if child is None or child.tag != "strong":
            continue
        text = child.text_content()
        if heading.text:
            heading.text = heading.text + u" • " + text
        elif heading.getchildren():
            first = heading.getchildren()[0]
            first.text = (first.text or u"") + u" • " + text
        par.getparent().remove(par)


def _add_author_and_length(root):
    """Prefix the author names with 'Autor: ' and append the word count."""
    content = root.xpath("//div[@id='postcontent']")
    if content:
        # split() without arguments also splits on newlines and collapses
        # repeated whitespace, so empty strings are not counted as words.
        word_count = len(content[0].text_content().split())
        article_length = u" • " + str(word_count) + ' slov'
    else:
        article_length = None
    try:
        aut = root.xpath("//div[@class='authorship-names']")[0]
        children = aut.getchildren()
        if children and children[0].tag == 'a':
            for i, link in enumerate(children):
                label = link.text + ' '
                if i == 0:
                    label = 'Autor: ' + label
                # Remove link
                aut.replace(link, E.span(label))
        else:
            name = root.xpath("//span[@class='post-author-name']")[0]
            name.text = 'Autor: ' + name.text + ' '
        if article_length is not None:
            aut.append(E.span(article_length))
    except Exception:
        # Best effort: pages without the expected author markup are left
        # as they are.
        pass


def _replace_pictures(root):
    """Replace every <picture> by a plain <img> so that images are visible."""
    for picture in root.xpath("//picture"):
        parent = picture.getparent()
        # The leading dot restricts the search to this <picture>; a bare
        # "//source" would search the whole document.
        sources = picture.xpath(".//source")
        if parent is None or not sources:
            continue
        candidates = (sources[0].get('srcset') or '').split(' ')
        links = [a for a in candidates if a.startswith("http")]
        if not links:
            continue
        parent.replace(picture, E.img({"src": links[-1]}))


def _indent_paragraphs(root):
    """Mark paragraphs whose first line should be indented."""
    # First paragraph is never indented
    for par in root.xpath('//p')[1:]:
        prev = par.getprevious()
        # Do not indent after headings
        if prev is not None and prev.tag not in ('h2', 'h3'):
            par.attrib['class'] = "indent_first_line"


def _promote_perex_to_subtitle(root):
    """Turn the first 'post-perex' paragraph into an <h2> subtitle (Téma)."""
    perex = root.xpath("//p[@class='post-perex']")
    if not perex:
        return
    parent = perex[0].getparent()
    if parent is None:
        return
    subtitle = E.h2({"class": "post-subtitle"})
    subtitle.text = perex[0].text
    parent.replace(perex[0], subtitle)


class respektRecipe(BasicNewsRecipe):
    """Calibre recipe that downloads the current print issue of Respekt."""

    __author__ = 'Tomáš Hnyk'
    publisher = u'Respekt Publishing a. s.'
    description = u'Articles from the print edition'
    title = u'Respekt Magazine – Print'
    encoding = 'utf-8'
    language = 'cs'
    delay = 0.001
    remove_javascript = True
    remove_tags_before = dict(name='h1')
    remove_tags_after = [dict(id='postcontent')]
    remove_tags = [
        dict(name='div', attrs={'id': ['postsharepopup', 'survey-respondents']}),
        dict(name='div', attrs={'class': ['ad', 'ad-content', 'adinarticle', 'ad-caption', 'post-actions', 'authorship-note', 'quote', 'postgallery']}),
        dict(name='a', attrs={'class': ['quote', 'authorship-face']}),
        dict(name='span', attrs={'class': 'embed'}),
        dict(name='svg'),
        dict(name='script'),
    ]
    extra_css = 'p {text-align:justify;margin-top:0;margin-bottom:0} \
                ul {color:black} \
                .frame-caption {font-weight:normal;font-size:50%;font-style:italic;} \
                h1 {font-size:150%;margin-bottom:0;} \
                h2 {font-size:100%;margin-bottom:0;} \
                .post-subtitle {margin-top:0;} \
                h3 {font-size:100%;margin-bottom:0;margin-top:0;} \
                .box-title {background-color: lightgray;font-size:150%;font-weight:bold;margin-left:12%;margin-right:12%;margin-top:12%;margin-bottom:0;} \
                .box-content {background-color:lightgray;margin-left:12%;margin-right:12%;margin-top:0;margin-bottom:12%;} \
                p.indent_first_line {text-indent:30px;} \
                a {text-decoration:none;color:black;}'
    needs_subscription = True

    def get_browser(self):
        """Return a browser, logged in when username and password are set.

        Raises ValueError if the login page does not contain the form with
        id 'frm-loginForm'.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is None or self.password is None:
            return br
        br.open('https://www.respekt.cz/uzivatel/prihlaseni')
        login_form = None
        for form in br.forms():
            if form.attrs.get('id') == 'frm-loginForm':
                login_form = form
                break
        if login_form is None:
            # Fail with a clear message instead of filling in and submitting
            # whichever form happened to be iterated last.
            raise ValueError("Login form 'frm-loginForm' not found on the Respekt login page")
        br.form = login_form
        for c in login_form.controls:
            name = c.attrs.get('name')
            if name == 'username':
                c.value = self.username
            elif name == 'password':
                c.value = self.password
        br.submit()
        return br

    # So that remove_tags_before works for this section
    def preprocess_raw_html(self, raw_html, url):
        """Turn <h2> tags into <h1> tags on the Despekt page.

        Pages with any other title are returned unchanged.
        """
        root = lxml.html.fromstring(raw_html)
        if _page_title(root) == u"Respekt • Despekt • RESPEKT":
            raw_html = _H2_TAG.sub(r'\1h1', raw_html)
        return raw_html

    def parse_index(self):
        """Build the list of (section name, articles) for the current issue.

        Also sets the cover URL, the recipe title and, when the issue date
        can be parsed, the 'pubdate' conversion option.
        """
        raw1 = self.index_to_soup(respekt_url + '/tydenik/', raw=True)
        root1 = lxml.html.fromstring(raw1)
        issue_link = root1.xpath("//div[@class='heroissue']/a")[0]
        # Read the href explicitly rather than relying on it being the first
        # attribute of the link.
        current_edition_url = issue_link.get('href')
        raw2 = self.index_to_soup(respekt_url + '/' + current_edition_url, raw=True)
        root2 = lxml.html.fromstring(raw2)
        self.cover_url = root2.xpath("//i[contains(@class, 'heroissue-cover')]")[0].get("data-src")
        # Fetch date
        date_text = root2.xpath("//time[@class='heroissue-date']")[0].text.split(',')[1]
        date = _issue_start_date(date_text)
        if date is not None:
            self.conversion_options = {'pubdate': date}
        else:
            self.log.warn('Could not parse issue date from %r' % date_text)
        self.title = "Respekt magazine #" + "/".join(current_edition_url.split("/")[-1:-3:-1])
        ans = []
        for section in root2.xpath("//div[@class='col-md-6']/div[@class='issuedetail-categorized-sectionname']"):
            section_name = section.text
            articles = []
            # Article links are the following siblings that carry no text of
            # their own; the walk stops at the first sibling with text or at
            # the end of the siblings.
            article = section.getnext()
            while article is not None and not (article.text or '').strip():
                title = article.xpath("span[@class='issuedetail-categorized-title']")[0].text
                url = respekt_url + article.xpath("@href")[0]
                articles.append({'title': title, 'url': url})
                article = article.getnext()
            ans.append((section_name, articles))
        return ans

    def cleanup(self):
        """Log out of the site."""
        self.browser.open('https://www.respekt.cz/?do=logout')

    def preprocess_html(self, soup):
        """Clean up one article and return it as a new BeautifulSoup tree."""
        raw = u''.join(type(u'')(a) for a in soup.contents)
        root = lxml.html.fromstring(raw)
        # Fix Letem světem
        if "Letem sv" in _page_title(root):
            _merge_letem_svetem(root)
        # Insert text length
        _add_author_and_length(root)
        # Make images visible
        _replace_pictures(root)
        # Properly indent
        _indent_paragraphs(root)
        # Fix subtitle for Téma
        _promote_perex_to_subtitle(root)
        return BeautifulSoup(lxml.etree.tostring(root, encoding='unicode'))