mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Update Respekt
Merge branch 'master' of https://github.com/felagund/calibre
This commit is contained in:
commit
42b2eeb3bc
@ -1,37 +0,0 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import unicode_literals
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
import re
|
||||
|
||||
class respektRecipe(BasicNewsRecipe):
    """Fetch recent articles and blog posts from the Czech weekly Respekt."""

    __author__ = 'bubak'
    title = u'Respekt'
    publisher = u'Respekt'
    description = 'Respekt'
    language = 'cs'
    encoding = 'cp1250'

    oldest_article = 1
    max_articles_per_feed = 20

    cover_url = 'http://respekt.ihned.cz/img/R/respekt_logo.png'
    no_stylesheets = True
    remove_javascript = True

    feeds = [
        (u'Všechny články', u'http://respekt.ihned.cz/index.php?p=R00000_rss'),
        (u'Blogy', u'http://blog.respekt.ihned.cz/?p=Rb00VR_rss'),
        # (u'Respekt DJ', u'http://respekt.ihned.cz/index.php?p=R00RDJ_rss')
    ]

    # Keep the article detail only; strip the tool bars around it.
    keep_only_tags = []
    remove_tags_before = dict(name='div', attrs={'id': ['detail']})
    remove_tags_after = dict(name='div', attrs={'class': 'd-tools'})
    remove_tags = [dict(name='div', attrs={'class': ['d-tools', 'actions']})]

    # Cut everything outside the article body; paywalled remainders are
    # replaced by a short Czech notice ("the rest of the article is paid").
    preprocess_regexps = [
        (re.compile(r'<div class="paid-zone".*', re.DOTALL | re.IGNORECASE),
         lambda match: 'Za zbytek článku je nutno platit. </body>'),
        (re.compile(r'.*<div class="mm-ow">', re.DOTALL | re.IGNORECASE),
         lambda match: '<body>'),
        (re.compile(r'<div class="col3">.*', re.DOTALL | re.IGNORECASE),
         lambda match: '</body>'),
    ]
|
||||
|
||||
|
||||
|
148
recipes/respekt_magazine.recipe
Normal file
148
recipes/respekt_magazine.recipe
Normal file
@ -0,0 +1,148 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
||||
# Copyright: tomashnyk@gmail.com
|
||||
|
||||
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
||||
__copyright__ = 'tomashnyk@gmail.com'
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag
|
||||
#This imports the version bundled with Calibre
|
||||
import lxml
|
||||
from lxml.builder import E
|
||||
|
||||
class respektRecipe(BasicNewsRecipe):
    """Download the current printed issue of the Czech weekly Respekt.

    The issue index is scraped from http://respekt.ihned.cz/aktualni-cislo/;
    a publisher account (``needs_subscription``) is required for full text.
    """

    __author__ = u'Tomáš Hnyk'
    title = u'Respekt - Magazine'
    publisher = u'Respekt Publishing a. s.'
    description = u'Articles from the printed edition, password needed for full access'
    encoding = 'cp1250'
    language = 'cs'
    remove_javascript = True
    # Justified paragraphs plus the CSS classes that preprocess_html() below
    # attaches: image_caption and indent_first_line.
    extra_css = 'p {text-align:justify} \
        ul {color:black} \
        .image_caption {font-size:50%;font-style:italic;} \
        .author {text-align:left;} \
        p.indent_first_line {text-indent:30px;}'
    remove_tags_before = dict(name='div',attrs={'class':['l']})
    remove_tags_after = dict(id='text')
    remove_tags = [dict(name='ul', attrs={'class':['tabs-d'],'id':['comm']}), \
        dict(name='div',attrs={'class':['slot','reklama','date']}), \
        dict(name='span', attrs={'class':['detail-vykrik']}), \
        dict(name='p', attrs={'class':['detail-vykrik']}), \
        dict(name='div', attrs={'id':['col123d-video','col123d-infographic','col123d-gallery','col12d-discussion']}), # soup>lxml>soup in preprocess requires this
        dict(name='strong', attrs={'class':['detail-vykrik']}),
        dict(name='script')]
    # this makes authors left-aligned by not using the author class)
    preprocess_regexps = [(re.compile(r'<div class="author">', re.DOTALL|re.IGNORECASE), lambda match: '<div class="">')]
    # remove empty tags
    # NOTE(review): the next two patterns render identically in this copy; the
    # second one was presumably '<strong>&nbsp;</strong>' originally — verify
    # against the upstream recipe before changing either.
    preprocess_regexps.append((re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
    preprocess_regexps.append((re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
    preprocess_regexps.append((re.compile(r'<p></p>', re.DOTALL|re.IGNORECASE), lambda match: ''))
    # Strip inline font sizing/colors so the recipe stylesheet wins.
    preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: ''))
    preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: ''))

    def get_cover_url(self):
        """Return the cover image URL scraped from the magazine homepage."""
        soup = self.index_to_soup('http://respekt.ihned.cz/')
        cover = soup.findAll('div', attrs={'class':'cover'})[0].find('img')['src']
        return cover

    needs_subscription = True

    def get_browser(self):
        """Log in to the publisher account so paid articles are reachable."""
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://muj-ucet.ihned.cz/')
            br.select_form(name='login')
            br['login[nick]'] = self.username
            br['login[pass]'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Build the feed list from the current-issue index page.

        Returns a list of ``(section_title, [article_dict, ...])`` tuples,
        grouping articles by the "rubrika" (section) label shown next to each.
        """
        raw = self.index_to_soup('http://respekt.ihned.cz/aktualni-cislo/', raw=True)
        root = lxml.html.fromstring(raw)
        ans = []
        for article in root.xpath("//div[@class='ow-enclose']/div[@class='ow']"):
            section_title = article.xpath(".//span[text()='(rubrika: ']")[0].find("a").text
            # The last three characters of the date-author span are dropped.
            date = article.xpath("span[@class='date-author']")[0].text[:-3]
            title = article.find("h2").find("a").text
            url = article.find('h2').find('a').get('href')
            link = {'title':title,'url':url,'date':date}
            # Append to an existing section if one matches, else start a new one.
            for section in ans:
                if section[0] == section_title:
                    section[1].append(link)
                    break
            else:
                ans.append((section_title,[link]))
        return ans

    def cleanup(self):
        """Log out of the publisher account once downloading is finished."""
        self.browser.open('http://muj-ucet.ihned.cz/?login[logout]=1')

    def preprocess_html(self,soup):
        """Post-process one article: captions, word count, typographic indents.

        The soup is round-tripped through lxml because the structural edits
        below (insertions, attribute rewrites) use lxml's tree API, then
        converted back to BeautifulSoup for calibre.
        """
        raw = u''.join(unicode(a) for a in soup.contents)
        root = lxml.html.fromstring(raw)

        # Make image captions visible: insert each image's title attribute as
        # a styled <p> directly after the <img> in the article body.
        body = root.xpath("//div[@id='text']")[0]
        add = 0  # running offset: each inserted caption shifts later indices
        for index, element in enumerate(body):
            try:
                if element.tag == 'img':
                    body.insert(index+add+1,E.p(element.get('title'),{"class":"image_caption"}))
                    add += 1
            except:
                pass

        # Add length of the articles in words after author ("slov" = "words").
        article_length = str(len(body.text_content().split(' '))) + ' slov'
        root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length))

        # Make perex (subheading) start on a new line
        root.xpath("//h1")[0].append(E.br(''))

        # Indent paragraphs when typographically suitable
        parse = True
        # There are only single paragraphs in these sections
        if root.xpath("//title")[0].text == u"Deset českých zpráv, které by vás neměly minout | Deset českých zpráv - RESPEKT.IHNED.CZ":
            parse = False
        if root.xpath("//title")[0].text == u"Deset zahraničních zpráv, které by vás neměly minout | Deset světových zpráv - RESPEKT.IHNED.CZ":
            parse = False
        if parse:
            # First paragraph is never indented
            paragraphs = root.xpath('//p')
            # Clear the formatting a little bit by removing these attributes
            for par in paragraphs:
                if 'class' in par.keys():
                    if par.attrib['class'] == 'detail-odstavec':
                        par.attrib.pop('class')
            # Walk the paragraphs backwards; the [:-1] slice skips the first
            # (never indented) paragraph of the article.
            paragraphs.reverse()
            for par in paragraphs[:-1]:
                try:
                    # <strong> in the beginning of this paragraph means no indenting as well as ellipses as the only text in paragraph
                    if len(par) > 0:
                        if (par.text is None and par.getchildren()[0].tag == 'strong'):
                            continue
                        elif par.getprevious().text == u'\u2026':
                            continue
                    indent = False
                    # Either indent if the paragraphs are the same
                    if par.getprevious().attrib == par.attrib:
                        indent = True
                    # Or else if the first paragraph of the text was special
                    if 'class' in par.getprevious().keys():
                        par_name = par.getprevious().attrib['class']
                        if par_name == '01prvniodstavecrepublicblok' or par_name == 'Zkladnodstavec' or par_name == '01titulekhlavn':
                            indent = True
                    if indent:
                        for key in par.keys():
                            par.attrib.pop(key)
                        par.attrib['class']="indent_first_line"
                except:
                    pass

        return(BeautifulSoup(lxml.etree.tostring(root,encoding=unicode)))
|
225
recipes/respekt_web.recipe
Normal file
225
recipes/respekt_web.recipe
Normal file
@ -0,0 +1,225 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
||||
# Copyright: tomashnyk@gmail.com
|
||||
|
||||
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
||||
__copyright__ = 'tomashnyk@gmail.com'
|
||||
|
||||
import re,os,datetime
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag
|
||||
from calibre.constants import config_dir, CONFIG_DIR_MODE
|
||||
#This imports the version bundled with Calibre
|
||||
import lxml
|
||||
from lxml.builder import E
|
||||
|
||||
class respektWebRecipe(BasicNewsRecipe):
    """Download free articles from the Respekt website.

    Already-downloaded article URLs are persisted to a file under calibre's
    config directory so only new articles are fetched on each run.
    """

    __author__ = u'Tomáš Hnyk'
    title = u'Respekt - Web'
    publisher = u'Respekt Publishing a. s.'
    description = u'Free articles from respekt.cz website'
    encoding = 'cp1250'
    language = 'cs'
    remove_javascript = True
    cover_url = 'http://respekt.ihned.cz/img/R/respekt_logo.png'
    # Justified paragraphs plus the CSS classes that preprocess_html() below
    # attaches: image_caption and indent_first_line.
    extra_css = 'p {text-align:justify} \
        ul {color:black} \
        .image_caption {font-size:50%;font-style:italic;} \
        .author {text-align:left;} \
        p.indent_first_line {text-indent:30px;}'
    remove_tags_before = dict(name='div',attrs={'class':['l']})
    remove_tags_after = dict(id='text')
    remove_tags = [dict(name='ul', attrs={'class':['tabs-d'],'id':['comm']}), \
        dict(name='div',attrs={'class':['slot','reklama','date']}), \
        dict(name='span', attrs={'class':['detail-vykrik']}), \
        dict(name='p', attrs={'class':['detail-vykrik']}), \
        dict(name='div', attrs={'id':['col123d-video','col123d-infographic','col123d-gallery','col12d-discussion']}), # soup>lxml>soup in preprocess requires this
        dict(name='strong', attrs={'class':['detail-vykrik']}),
        dict(name='script')]
    # this makes authors left-aligned by not using the author class)
    preprocess_regexps = [(re.compile(r'<div class="author">', re.DOTALL|re.IGNORECASE), lambda match: '<div class="">')]
    # remove empty tags
    # NOTE(review): the next two patterns render identically in this copy; the
    # second one was presumably '<strong>&nbsp;</strong>' originally — verify
    # against the upstream recipe before changing either.
    preprocess_regexps.append((re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
    preprocess_regexps.append((re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
    preprocess_regexps.append((re.compile(r'<p></p>', re.DOTALL|re.IGNORECASE), lambda match: ''))
    # Strip inline font sizing/colors so the recipe stylesheet wins.
    preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: ''))
    preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: ''))

    def parse_index(self):
        """Collect new articles from editor columns and a few fixed sections.

        Keeps a per-section history (last 20 URLs) in a state file named after
        the recipe title, and returns only articles not seen before as a list
        of ``(section_name, [article_dict, ...])`` tuples.
        """
        # Read already downloaded articles
        recipe_dir = os.path.join(config_dir,'recipes')
        # '/' is not allowed in file names, so it is replaced in the title.
        old_articles = os.path.join(recipe_dir,self.title.encode('utf-8').replace('/',':'))
        past_items = []
        if os.path.exists(old_articles):
            # Each state-file line is "<url> <section name>".
            with file(old_articles) as f:
                for h in f:
                    l = h.strip().split(" ")
                    past_items.append((l[0]," ".join(l[1:])))
        old_urls = [x[0] for x in past_items]
        count_items = {}
        current_items = []
        # Keep a list of only 20 latest articles for each section
        past_items.reverse()
        for item in past_items:
            if item[1] in count_items.keys():
                if count_items[item[1]] < 20:
                    count_items[item[1]] += 1
                    current_items.append(item)
            else:
                count_items[item[1]] = 1
                current_items.append(item)
        current_items.reverse()

        sections = []
        # Get the webpages to download lists of articles from
        raw = self.index_to_soup('http://respekt.ihned.cz/sloupky-redaktoru/', raw=True)
        root = lxml.html.fromstring(raw)
        sections = []
        for section in root.xpath("//div[@class='ow-enclose sr']/table/tr/td"):
            try:
                url = section.find('a').get('href')
                # Skip per-author listing links; keep only column sections.
                if not ('?m=authors&person[id]=' in url):
                    sections.append((url,section.find('a').find('b').text))
            except:
                pass
        sections.append(('http://respekt.ihned.cz/respekt-dj/','Respekt DJ'))
        sections.append(('http://respekt.ihned.cz/fokus/','Fokus'))
        sections.append(('http://respekt.ihned.cz/respekt-hub/','Respekt Hub'))
        sections.append(('http://respekt.ihned.cz/rozhovory/','Rozhovory'))
        sections.append(('http://respekt.ihned.cz/glosy/','Glosy'))

        # Get the list of articles
        ans = []
        for section in sections:
            raw = self.index_to_soup(section[0], raw=True)
            root = lxml.html.fromstring(raw)
            list_of_articles = []
            articles = root.xpath("//div[@class='ow-enclose']/div[@class='ow']")
            # Sort the articles in a section from oldest to newest
            articles.reverse()
            for article in articles:
                date = getattr(article.xpath("span[@class='date-author']")[0],'text','')[:-3]
                author = getattr(article.xpath("span[@class='date-author']")[0].find("a"),'text','')
                title = getattr(article.find("h2").find("a"),'text','')
                url = article.find('h2').find('a').get('href')
                # Only download new articles
                if url not in old_urls:
                    old_urls.append(url)
                    current_items.append((url,section[1]))
                    list_of_articles.append({'title':title,'url':url,'date':date,'author':author})
            # Redownload this page next time if it is still being updated (between 7 and 17 GMT generally, so make the limits a little bit bigger)
            if section[1] == 'Respekt DJ':
                if list_of_articles:
                    if datetime.datetime.today().weekday() in range(0,5) and 6 < datetime.datetime.utcnow().hour < 17:
                        #list_of_articles = list_of_articles[:-1]
                        current_items = current_items[:-1]
            if list_of_articles:
                ans.append((section[1],list_of_articles))
        # Write already downloaded articles
        with file(old_articles,'w') as f:
            f.write('\n'.join('{} {}'.format(*x) for x in current_items))
        return ans

    # For some reason, the following does not work:
    # preprocess_regexps.append((re.compile(r'<br/><br/>', re.DOTALL|re.IGNORECASE), lambda match: '</p><p>'))
    def preprocess_raw_html(self, raw_html, url):
        """Turn double <br /> breaks into real paragraph boundaries."""
        return re.sub("<br /><br />","</p><p>",raw_html)

    def preprocess_html(self,soup):
        """Post-process one article: captions, DJ-section fixes, word count,
        typographic indents.

        The soup is round-tripped through lxml because the structural edits
        below use lxml's tree API, then converted back for calibre.
        """
        raw = u''.join(unicode(a) for a in soup.contents)
        root = lxml.html.fromstring(raw)
        # Make image captions visible: insert each image's title attribute as
        # a styled <p> directly after the <img> in the article body.
        body = root.xpath("//div[@id='text']")[0]
        add = 0  # running offset: each inserted caption shifts later indices
        for index, element in enumerate(body):
            try:
                if element.tag == 'img':
                    body.insert(index+add+1,E.p(element.get('title'),{"class":"image_caption"}))
                    add += 1
            except:
                pass
        # Make captions visible on the website have the same style
        try:
            root.xpath("//div[@class='hlavni-obrazek-popis']")[0].attrib['class'] = 'image_caption'
        except:
            pass
        # For DJ, the perex is always the same, so remove it
        if root.xpath("//title")[0].text.split("|")[-1] == u' Respekt DJ - RESPEKT.CZ':

            perex = root.xpath("//div[@id='perex']")[0]
            clean = root.xpath("//div[@class='clean']")[0]
            perex.getparent().remove(perex)
            clean.getparent().remove(clean)

            # DJ section gets mal-formatted on kindle otherwise
            for i in root.xpath("//h2[@class='d-dj-t']"):
                i.attrib['class'] = ''
                E.style = "font-size:60%;font-weight:normal;"
                time = E('span',i.getprevious().text_content(),style = E.style)
                # Time should be ahead of the title
                time.tail = ' ' + i.text
                i.text = ''
                i.insert(0,time)
            for i in root.xpath("//div[@class='d-dj-d']"):
                i.attrib['class'] = ''
                i.xpath("div/span")[0].text = ''
            for i in root.xpath("//div[@class='d-dj-b']"):
                i.attrib['class'] = ''

            # Make captions visible on the website have the same style
            root.xpath("//div[@class='hlavni-obrazekDJ-popis']")[0].attrib['class'] = 'image_caption'

            # Reverse the entries so that the earliest are at the top
            entries = root.xpath("//div[@class='d-dj-i']")
            entries.reverse()
            dj_body = entries[0].getparent()
            # Re-append in reversed order; remove-then-append moves each node.
            for entry in entries:
                dj_body.remove(entry)
                dj_body.append(entry)

        # We are not interested in this paragraph as it stays the same and is essentialy an ad
        if root.xpath("//title")[0].text.split("|")[-1] == u' Audit Jana Macháčka - Respekt.iHNed.cz':
            ad = root.xpath("//p[@id='ajmonf']")[0]
            ad.getparent().remove(ad)

        # Add length of the articles in words after author ("slov" = "words").
        article_length = str(len(body.text_content().split(' '))) + ' slov'
        root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length))

        # Make perex (subheading) start on a new line
        root.xpath("//h1")[0].append(E.br(''))

        # Indent paragraphs when typographically suitable
        # First paragraph is never indented
        paragraphs = root.xpath('//p')
        # Clear the formatting a little bit by removing these attributes
        for par in paragraphs:
            if 'class' in par.keys():
                if par.attrib['class'] == 'detail-odstavec':
                    par.attrib.pop('class')
        # Walk the paragraphs backwards; the [:-1] slice skips the first
        # (never indented) paragraph of the article.
        paragraphs.reverse()
        for par in paragraphs[:-1]:
            try:
                # <strong> in the beginning of this paragraph means no indenting as well as ellipses as the only text in paragraph
                if len(par) > 0:
                    if (par.text is None and par.getchildren()[0].tag == 'strong'):
                        continue
                    elif par.getprevious().text == u'\u2026':
                        continue
                indent = False
                # Either indent if the paragraphs are the same
                if par.getprevious().attrib == par.attrib:
                    indent = True
                # Or else if the first paragraph of the text was special
                if 'class' in par.getprevious().keys():
                    par_name = par.getprevious().attrib['class']
                    if par_name == '01prvniodstavecrepublicblok' or par_name == 'Zkladnodstavec' or par_name == '01titulekhlavn':
                        indent = True
                if indent:
                    for key in par.keys():
                        par.attrib.pop(key)
                    par.attrib['class']="indent_first_line"
            except:
                pass
        return(BeautifulSoup(lxml.etree.tostring(root,encoding=unicode)))
|
Loading…
x
Reference in New Issue
Block a user