Make Respekt Magazine recipe work again and remove dysfunctional Respekt-Web recipe following a rewrite of the harvested website

felagund 2016-09-13 13:15:32 +02:00
parent 159ba40d56
commit 28c733ad8f
2 changed files with 148 additions and 332 deletions

@@ -12,137 +12,178 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag
 #This imports the version bundled with Calibre
 import lxml
 from lxml.builder import E
+respekt_url = 'http://www.respekt.cz'
 class respektRecipe(BasicNewsRecipe):
-    __author__ = u'Tomáš Hnyk'
+    __author__ = 'Tomáš Hnyk'
     title = u'Respekt - Magazine'
     publisher = u'Respekt Publishing a. s.'
-    description = u'Articles from the printed edition, password needed for full access'
-    encoding = 'cp1250'
+    description = u'Articles from the print edition'
+    encoding = 'utf-8'
     language = 'cs'
     remove_javascript = True
-    extra_css = 'p {text-align:justify} \
-        ul {color:black} \
-        .image_caption {font-size:50%;font-style:italic;} \
-        .author {text-align:left;} \
-        p.indent_first_line {text-indent:30px;}'
-    remove_tags_before = dict(name='div',attrs={'class':['l']})
-    remove_tags_after = dict(id='text')
-    remove_tags = [dict(name='ul', attrs={'class':['tabs-d'],'id':['comm']}), \
-        dict(name='div',attrs={'class':['slot','reklama','date']}), \
-        dict(name='span', attrs={'class':['detail-vykrik']}), \
-        dict(name='p', attrs={'class':['detail-vykrik']}), \
-        dict(name='div', attrs={'id':['col123d-video','col123d-infographic','col123d-gallery','col12d-discussion']}), # soup>lxml>soup in preprocess requires this
-        dict(name='strong', attrs={'class':['detail-vykrik']}),
+    remove_tags_before = dict(name='h1')
+    remove_tags_after = [dict(id='postcontent')]
+    remove_tags = [dict(name='div',attrs={'id':['postsharepopup','survey-respondents']}), \
+        dict(name='div',attrs={'class':['ad','ad-content','adinarticle','ad-caption','post-actions','authorship-note','quote','postgallery']}), \
+        dict(name='a',attrs={'class':['quote','authorship-face']}), \
+        dict(name='span',attrs={'class':'embed'}), \
+        dict(name='svg'), \
         dict(name='script')]
-    # this makes authors left-aligned by not using the author class)
-    preprocess_regexps = [(re.compile(r'<div class="author">', re.DOTALL|re.IGNORECASE), lambda match: '<div class="">')]
-    # remove empty tags
-    preprocess_regexps.append((re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
-    preprocess_regexps.append((re.compile(r'<strong>&nbsp;</strong>', re.DOTALL|re.IGNORECASE), lambda match: '&nbsp;'))
-    preprocess_regexps.append((re.compile(r'<p></p>', re.DOTALL|re.IGNORECASE), lambda match: ''))
-    preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: ''))
-    preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: ''))
-    def get_cover_url(self):
-        soup = self.index_to_soup('http://respekt.ihned.cz/')
-        cover = soup.findAll('div', attrs={'class':'cover'})[0].find('img')['src']
-        return cover
+    extra_css = 'p {text-align:justify;margin-top:0;margin-bottom:0} \
+        ul {color:black} \
+        .frame-caption {font-weight:normal;font-size:50%;font-style:italic;} \
+        h1 {font-size:150%;margin-bottom:0;} \
+        h2 {font-size:100%;margin-bottom:0;} \
+        .post-subtitle {margin-top:0;} \
+        h3 {font-size:100%;margin-bottom:0;margin-top:0;} \
+        .box-title {background-color: lightgray;font-size:150%;font-weight:bold;margin-left:12%;margin-right:12%;margin-top:12%;margin-bottom:0;} \
+        .box-content {background-color:lightgray;margin-left:12%;margin-right:12%;margin-top:0;margin-bottom:12%;} \
+        p.indent_first_line {text-indent:30px;} \
+        a {text-decoration:none;color:black;}'
     needs_subscription = True
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         if self.username is not None and self.password is not None:
-            br.open('http://muj-ucet.ihned.cz/')
-            br.select_form(name='login')
-            br['login[nick]'] = self.username
-            br['login[pass]'] = self.password
+            br.open('https://www.respekt.cz/')
+            for form in br.forms():
+                if form.attrs.get('id') == 'frm-authBox-loginForm':
+                    br.form = form
+                    break
+            for c in form.controls:
+                if 'name' in c.attrs:
+                    if c.attrs['name'] == 'username':
+                        c.value = self.username
+                    if c.attrs['name'] == 'password':
+                        c.value = self.password
            br.submit()
         return br
-    # So that remove_tags_before works for this section
-    def preprocess_raw_html(self, raw_html, url):
-        root = lxml.html.fromstring(raw_html)
-        if root.xpath("//title")[0].text == (u"Respekt • Despekt • RESPEKT"):
-            raw_html = re.sub("h2","h1",raw_html)
-        return raw_html
     def parse_index(self):
-        raw = self.index_to_soup('http://respekt.ihned.cz/aktualni-cislo/', raw=True)
-        root = lxml.html.fromstring(raw)
+        raw1 = self.index_to_soup('http://www.respekt.cz/tydenik/', raw=True)
+        root1 = lxml.html.fromstring(raw1)
+        current_edition_url = root1.xpath("//div[@class='heroissue']/a")[0].items()[0][1]
+        raw2 = self.index_to_soup('http://www.respekt.cz/' + current_edition_url, raw=True)
+        root2 = lxml.html.fromstring(raw2)
+        self.cover_url = root2.xpath("//i[contains(@class, 'heroissue-cover')]")[0].get("data-src")
+        # Fetch date
+        date_text = root2.xpath("//time[@class='heroissue-date']")[0].text.split(',')[1]
+        s = date_text.split(" ")
+        # Are the dates of the issue in the same month and year?
+        if len(s) == 4 or len(s) == 7:
+            date = "/".join([s[1].split(".")[0],s[2].split(".")[0],s[-1]])
+        elif len(s) == 8:
+            date = "/".join([s[1].split(".")[0],s[2].split(".")[0],s[3]])
+        self.conversion_options = {'pubdate':date}
+        self.title = "Respekt magazine #" + "/".join(current_edition_url.split("/")[-1:-3:-1])
         ans = []
-        for article in root.xpath("//div[@class='ow-enclose']/div[@class='ow']"):
-            section_title = article.xpath(".//span[text()='(rubrika: ']")[0].find("a").text
-            date = article.xpath("span[@class='date-author']")[0].text[:-3]
-            title = article.find("h2").find("a").text
-            url = article.find('h2').find('a').get('href')
-            link = {'title':title,'url':url,'date':date}
-            for section in ans:
-                if section[0] == section_title:
-                    section[1].append(link)
-                    break
-            else:
-                ans.append((section_title,[link]))
+        for section in root2.xpath("//div[@class='col-md-6']/div[@class='issuedetail-categorized-sectionname']"):
+            section_name = section.text
+            articles = []
+            article = section.getnext()
+            while hasattr(article, 'text') and not article.text.strip():
+                title = article.xpath("span[@class='issuedetail-categorized-title']")[0].text
+                url = respekt_url + article.xpath("@href")[0]
+                articles.append({'title':title,'url':url})
+                article = article.getnext()
+            ans.append((section_name,articles))
+        highlights = zip(root2.xpath("//a[@class='issuedetail-highlighted-item']"),root2.xpath("//div[@class='issuedetail-highlighted-title']"))
+        highlights.reverse()
+        sections = [i[0] for i in ans]
+        for l,t in highlights:
+            title = t.text
+            link = l.xpath('@href')[0]
+            raw3 = self.index_to_soup(respekt_url + link, raw=True)
+            root3 = lxml.html.fromstring(raw3)
+            topics = [i.text.strip() for i in root3.xpath("//div[contains(@class, 'post-topics')]/a")]
+            # The name of the section changes its position
+            if u"Téma" in topics:
+                section_name = "Fokus"
+            elif u"Rozhovor" in topics:
+                section_name = "Rozhovor"
+            else:
+                for t in topics:
+                    if t in sections:
+                        section_name = t
+                        break
+            for i in ans:
+                if i[0] == section_name:
+                    i[1].insert(-(len(i[1])),{'title':title,'url':respekt_url+link})
+            if section_name == u"Rozhovor":
+                ans.insert(sections.index(u'Fokus')+1,(section_name,[{'title':title,'url':respekt_url+link}]))
         return ans
     def cleanup(self):
-        self.browser.open('http://muj-ucet.ihned.cz/?login[logout]=1')
+        self.browser.open('https://www.respekt.cz/?do=logout')
     def preprocess_html(self,soup):
         raw = u''.join(unicode(a) for a in soup.contents)
         root = lxml.html.fromstring(raw)
-        # Make image captions visible
-        body = root.xpath("//div[@id='text']")[0]
-        add = 0
-        for index, element in enumerate(body):
-            try:
-                if element.tag == 'img':
-                    body.insert(index+add+1,E.p(element.get('title'),{"class":"image_caption"}))
-                    add += 1
-            except:
-                pass
-        # Add length of the articles in words after author
-        article_length = str(len(body.text_content().split(' '))) + ' slov'
-        root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length))
-        # Make perex (subheading) start on a new line
-        root.xpath("//h1")[0].append(E.br(''))
-        # Indent paragraphs when typographically suitable
-        parse = True
-        # There are only single paragraphs in these sections
-        if root.xpath("//title")[0].text == u"Deset českých zpráv, které by vás neměly minout | Deset českých zpráv - RESPEKT.IHNED.CZ":
-            parse = False
-        if root.xpath("//title")[0].text == u"Deset zahraničních zpráv, které by vás neměly minout | Deset světových zpráv - RESPEKT.IHNED.CZ":
-            parse = False
-        if parse:
-            # First paragraph is never indented
-            paragraphs = root.xpath('//p')
-            # Clear the formatting a little bit by removing these attributes
-            for par in paragraphs:
-                if 'class' in par.keys():
-                    if par.attrib['class'] == 'detail-odstavec':
-                        par.attrib.pop('class')
-            paragraphs.reverse()
-            for par in paragraphs[:-1]:
-                try:
-                    # <strong> in the beginning of this paragraph means no indenting as well as ellipses as the only text in paragraph
-                    if len(par) > 0:
-                        if (par.text is None and par.getchildren()[0].tag == 'strong'):
-                            continue
-                    elif par.getprevious().text == u'\u2026':
-                        continue
-                    indent = False
-                    # Either indent if the paragraphs are the same
-                    if par.getprevious().attrib == par.attrib:
-                        indent = True
-                    # Or else if the first paragraph of the text was special
-                    if 'class' in par.getprevious().keys():
-                        par_name = par.getprevious().attrib['class']
-                        if par_name == '01prvniodstavecrepublicblok' or par_name == 'Zkladnodstavec' or par_name == '01titulekhlavn':
-                            indent = True
-                    if indent:
-                        for key in par.keys():
-                            par.attrib.pop(key)
-                        par.attrib['class']="indent_first_line"
-                except:
-                    pass
+        # Fix Letem světem
+        if "Letem sv" in root.xpath("//title")[0].text:
+            p = root.xpath("//p")
+            for par in p[:]:
+                next = par.getnext()
+                if par.getchildren():
+                    child = par.getchildren()[0]
+                    if hasattr(next,"tag") and next.tag == "h2" and hasattr(child,"tag") and child.tag == "strong":
+                        text = child.text_content()
+                        if next.text:
+                            next.text = next.text + u" • " + text
+                        else:
+                            if next.getchildren():
+                                next_child = next.getchildren()[0]
+                                next_child.text = next_child.text + u" • " + text
+                        par.getparent().remove(par)
+        # Insert text length
+        text = root.xpath("//div[@id='postcontent']")[0]
+        article_length = u" • " + str(len(text.text_content().split(' '))) + ' slov'
+        try:
+            aut = root.xpath("//div[@class='authorship-names']")[0]
+            if aut.getchildren() and aut.getchildren()[0].tag == 'a':
+                t = aut.getchildren()[0]
+                t.text = 'Autor: ' + t.text + ' '
+                # Remove link
+                e = E.span(t.text)
+                t.getparent().replace(t,e)
+            else:
+                t = root.xpath("//span[@class='post-author-name']")[0]
+                t.text = ('Autor: ' + t.text + ' ')
+            root.xpath("//div[@class='authorship-names']")[0].append(E.span(article_length))
+        except:
+            pass
+        # Make images visible
+        pictures = root.xpath("//picture")
+        for picture in pictures:
+            image = picture.xpath("//source")[0]
+            image_link = [a for a in image.get('srcset').split(' ') if a[:4] == "http"][-1]
+            e = E.img({"src":image_link})
+            picture.getparent().replace(picture,e)
+        # Properly indent
+        paragraphs = root.xpath('//p')
+        paragraphs.reverse()
+        # First paragraph is never indented
+        for par in paragraphs[:-1]:
+            prev = par.getprevious()
+            # Do not indent after headings
+            if hasattr(prev,'tag') and not (prev.tag in ['h2','h3']):
+                par.attrib['class']="indent_first_line"
+        # Fix subtitle for Téma
+        try:
+            o = root.xpath("//p[@class='post-perex']")[0]
+            e = E.h2({"class":"post-subtitle"})
+            e.text = o.text
+            o.getparent().replace(o,e)
+        except:
+            pass
         return(BeautifulSoup(lxml.etree.tostring(root,encoding=unicode)))
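
The rewritten get_browser above cannot use br.select_form(name=...) because the redesigned site's login form carries no name attribute, only an id, so it walks br.forms() and fills the controls by name. A minimal standalone sketch of the same mechanize pattern, with a placeholder URL, form id, and credentials rather than the live Respekt values:

import mechanize

def login(url, form_id, username, password):
    # url, form_id and the credentials are hypothetical placeholders;
    # the recipe above uses https://www.respekt.cz/ and the id
    # 'frm-authBox-loginForm'.
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.open(url)
    # select_form(name=...) cannot find an unnamed form,
    # so scan all forms and match on the id attribute instead
    for form in br.forms():
        if form.attrs.get('id') == form_id:
            br.form = form
            break
    else:
        raise ValueError('no form with id %r found' % form_id)
    # With a form selected, controls can be filled by name
    br['username'] = username
    br['password'] = password
    return br.submit()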

@@ -1,225 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright: tomashnyk@gmail.com

__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'tomashnyk@gmail.com'

import re,os,datetime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag
from calibre.constants import config_dir, CONFIG_DIR_MODE
#This imports the version bundled with Calibre
import lxml
from lxml.builder import E

class respektWebRecipe(BasicNewsRecipe):
    __author__ = u'Tomáš Hnyk'
    title = u'Respekt - Web'
    publisher = u'Respekt Publishing a. s.'
    description = u'Free articles from respekt.cz website'
    encoding = 'cp1250'
    language = 'cs'
    remove_javascript = True
    cover_url = 'http://respekt.ihned.cz/img/R/respekt_logo.png'
    extra_css = 'p {text-align:justify} \
        ul {color:black} \
        .image_caption {font-size:50%;font-style:italic;} \
        .author {text-align:left;} \
        p.indent_first_line {text-indent:30px;}'
    remove_tags_before = dict(name='div',attrs={'class':['l']})
    remove_tags_after = dict(id='text')
    remove_tags = [dict(name='ul', attrs={'class':['tabs-d'],'id':['comm']}), \
        dict(name='div',attrs={'class':['slot','reklama','date']}), \
        dict(name='span', attrs={'class':['detail-vykrik']}), \
        dict(name='p', attrs={'class':['detail-vykrik']}), \
        dict(name='div', attrs={'id':['col123d-video','col123d-infographic','col123d-gallery','col12d-discussion']}), # soup>lxml>soup in preprocess requires this
        dict(name='strong', attrs={'class':['detail-vykrik']}),
        dict(name='script')]
    # this makes authors left-aligned by not using the author class)
    preprocess_regexps = [(re.compile(r'<div class="author">', re.DOTALL|re.IGNORECASE), lambda match: '<div class="">')]
    # remove empty tags
    preprocess_regexps.append((re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
    preprocess_regexps.append((re.compile(r'<strong>&nbsp;</strong>', re.DOTALL|re.IGNORECASE), lambda match: '&nbsp;'))
    preprocess_regexps.append((re.compile(r'<p></p>', re.DOTALL|re.IGNORECASE), lambda match: ''))
    preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: ''))
    preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: ''))

    def parse_index(self):
        # Read already downloaded articles
        recipe_dir = os.path.join(config_dir,'recipes')
        old_articles = os.path.join(recipe_dir,self.title.encode('utf-8').replace('/',':'))
        past_items = []
        if os.path.exists(old_articles):
            with file(old_articles) as f:
                for h in f:
                    l = h.strip().split(" ")
                    past_items.append((l[0]," ".join(l[1:])))
        old_urls = [x[0] for x in past_items]
        count_items = {}
        current_items = []
        # Keep a list of only 20 latest articles for each section
        past_items.reverse()
        for item in past_items:
            if item[1] in count_items.keys():
                if count_items[item[1]] < 20:
                    count_items[item[1]] += 1
                    current_items.append(item)
            else:
                count_items[item[1]] = 1
                current_items.append(item)
        current_items.reverse()
        sections = []
        # Get the webpages to download lists of articles from
        raw = self.index_to_soup('http://respekt.ihned.cz/sloupky-redaktoru/', raw=True)
        root = lxml.html.fromstring(raw)
        sections = []
        for section in root.xpath("//div[@class='ow-enclose sr']/table/tr/td"):
            try:
                url = section.find('a').get('href')
                if not ('?m=authors&person[id]=' in url):
                    sections.append((url,section.find('a').find('b').text))
            except:
                pass
        sections.append(('http://respekt.ihned.cz/respekt-dj/','Respekt DJ'))
        sections.append(('http://respekt.ihned.cz/fokus/','Fokus'))
        sections.append(('http://respekt.ihned.cz/respekt-hub/','Respekt Hub'))
        sections.append(('http://respekt.ihned.cz/rozhovory/','Rozhovory'))
        sections.append(('http://respekt.ihned.cz/glosy/','Glosy'))
        # Get the list of articles
        ans = []
        for section in sections:
            raw = self.index_to_soup(section[0], raw=True)
            root = lxml.html.fromstring(raw)
            list_of_articles = []
            articles = root.xpath("//div[@class='ow-enclose']/div[@class='ow']")
            # Sort the articles in a section from oldest to newest
            articles.reverse()
            for article in articles:
                date = getattr(article.xpath("span[@class='date-author']")[0],'text','')[:-3]
                author = getattr(article.xpath("span[@class='date-author']")[0].find("a"),'text','')
                title = getattr(article.find("h2").find("a"),'text','')
                url = article.find('h2').find('a').get('href')
                # Only download new articles
                if url not in old_urls:
                    old_urls.append(url)
                    current_items.append((url,section[1]))
                    list_of_articles.append({'title':title,'url':url,'date':date,'author':author})
            # Redownload this page next time if it is still being updated (between 7 and 17 GMT generally, so make the limits a little bit bigger)
            if section[1] == 'Respekt DJ':
                if list_of_articles:
                    if datetime.datetime.today().weekday() in range(0,5) and 6 < datetime.datetime.utcnow().hour < 17:
                        #list_of_articles = list_of_articles[:-1]
                        current_items = current_items[:-1]
            if list_of_articles:
                ans.append((section[1],list_of_articles))
        # Write already downloaded articles
        with file(old_articles,'w') as f:
            f.write('\n'.join('{} {}'.format(*x) for x in current_items))
        return ans

    # For some reason, the following does not work:
    # preprocess_regexps.append((re.compile(r'<br/><br/>', re.DOTALL|re.IGNORECASE), lambda match: '</p><p>'))
    def preprocess_raw_html(self, raw_html, url):
        return re.sub("<br /><br />","</p><p>",raw_html)

    def preprocess_html(self,soup):
        raw = u''.join(unicode(a) for a in soup.contents)
        root = lxml.html.fromstring(raw)
        # Make image captions visible
        body = root.xpath("//div[@id='text']")[0]
        add = 0
        for index, element in enumerate(body):
            try:
                if element.tag == 'img':
                    body.insert(index+add+1,E.p(element.get('title'),{"class":"image_caption"}))
                    add += 1
            except:
                pass
        # Give captions shown on the website the same style
        try:
            root.xpath("//div[@class='hlavni-obrazek-popis']")[0].attrib['class'] = 'image_caption'
        except:
            pass
        # For DJ, the perex is always the same, so remove it
        if root.xpath("//title")[0].text.split("|")[-1] == u' Respekt DJ - RESPEKT.CZ':
            perex = root.xpath("//div[@id='perex']")[0]
            clean = root.xpath("//div[@class='clean']")[0]
            perex.getparent().remove(perex)
            clean.getparent().remove(clean)
            # DJ section gets mal-formatted on kindle otherwise
            for i in root.xpath("//h2[@class='d-dj-t']"):
                i.attrib['class'] = ''
                E.style = "font-size:60%;font-weight:normal;"
                time = E('span',i.getprevious().text_content(),style = E.style)
                # Time should be ahead of the title
                time.tail = ' ' + i.text
                i.text = ''
                i.insert(0,time)
            for i in root.xpath("//div[@class='d-dj-d']"):
                i.attrib['class'] = ''
                i.xpath("div/span")[0].text = ''
            for i in root.xpath("//div[@class='d-dj-b']"):
                i.attrib['class'] = ''
            # Give captions shown on the website the same style
            root.xpath("//div[@class='hlavni-obrazekDJ-popis']")[0].attrib['class'] = 'image_caption'
            # Reverse the entries so that the earliest are at the top
            entries = root.xpath("//div[@class='d-dj-i']")
            entries.reverse()
            dj_body = entries[0].getparent()
            for entry in entries:
                dj_body.remove(entry)
                dj_body.append(entry)
        # We are not interested in this paragraph as it stays the same and is essentially an ad
        if root.xpath("//title")[0].text.split("|")[-1] == u' Audit Jana Macháčka - Respekt.iHNed.cz':
            ad = root.xpath("//p[@id='ajmonf']")[0]
            ad.getparent().remove(ad)
        # Add length of the articles in words after author
        article_length = str(len(body.text_content().split(' '))) + ' slov'
        root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length))
        # Make perex (subheading) start on a new line
        root.xpath("//h1")[0].append(E.br(''))
        # Indent paragraphs when typographically suitable
        # First paragraph is never indented
        paragraphs = root.xpath('//p')
        # Clear the formatting a little bit by removing these attributes
        for par in paragraphs:
            if 'class' in par.keys():
                if par.attrib['class'] == 'detail-odstavec':
                    par.attrib.pop('class')
        paragraphs.reverse()
        for par in paragraphs[:-1]:
            try:
                # <strong> in the beginning of this paragraph means no indenting as well as ellipses as the only text in paragraph
                if len(par) > 0:
                    if (par.text is None and par.getchildren()[0].tag == 'strong'):
                        continue
                elif par.getprevious().text == u'\u2026':
                    continue
                indent = False
                # Either indent if the paragraphs are the same
                if par.getprevious().attrib == par.attrib:
                    indent = True
                # Or else if the first paragraph of the text was special
                if 'class' in par.getprevious().keys():
                    par_name = par.getprevious().attrib['class']
                    if par_name == '01prvniodstavecrepublicblok' or par_name == 'Zkladnodstavec' or par_name == '01titulekhlavn':
                        indent = True
                if indent:
                    for key in par.keys():
                        par.attrib.pop(key)
                    par.attrib['class']="indent_first_line"
            except:
                pass
        return(BeautifulSoup(lxml.etree.tostring(root,encoding=unicode)))
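
The subtle part of the deleted recipe is the bookkeeping in parse_index: it persists (url, section) pairs to a flat file and keeps only the 20 newest entries per section so the history file cannot grow without bound. The same logic, distilled into a standalone helper (the function name and signature are illustrative, not part of the recipe):

def prune_history(items, cap=20):
    # items: (url, section) pairs ordered oldest to newest,
    # as read from the recipe's history file
    counts = {}
    kept = []
    # Walk newest to oldest, keeping at most `cap` entries per section
    for url, section in reversed(items):
        if counts.get(section, 0) < cap:
            counts[section] = counts.get(section, 0) + 1
            kept.append((url, section))
    # Restore oldest-to-newest order
    kept.reverse()
    return kept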