From 069e77b9b9f0cc44a81d7b209a53a0d6a9c7b116 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 19 Nov 2015 08:13:47 +0530
Subject: [PATCH] Remove no longer working recipes

---
 recipes/deia.recipe             |  70 ----------
 recipes/respekt_magazine.recipe | 148 ---------------------
 recipes/respekt_web.recipe      | 225 --------------------------------
 3 files changed, 443 deletions(-)
 delete mode 100644 recipes/deia.recipe
 delete mode 100644 recipes/respekt_magazine.recipe
 delete mode 100644 recipes/respekt_web.recipe

diff --git a/recipes/deia.recipe b/recipes/deia.recipe
deleted file mode 100644
index ee17d58765..0000000000
--- a/recipes/deia.recipe
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env python2
-__license__ = 'GPL v3'
-__author__ = 'Gerardo Diez'
-__copyright__ = 'Gerardo Diez'
-description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
-__docformat__ = 'restructuredtext en'
-
-'''
-deia.com
-'''
-from calibre.web.feeds.recipes import BasicNewsRecipe
-
-class Deia(BasicNewsRecipe):
-    title = 'Deia'
-    __author__ = 'Gerardo Diez'
-    publisher = 'Editorial Iparraguirre, S.A'
-    category = 'news, politics, finances, world, spain, euskadi'
-    publication_type = 'newspaper'
-    oldest_article = 1
-    max_articles_per_feed = 100
-    simultaneous_downloads = 10
-    cover_url = 'http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
-    timefmt = '[%a, %d %b, %Y]'
-    encoding = 'utf8'
-    language = 'es'
-    remove_javascript = True
-    remove_tags_after = dict(id='Texto')
-    remove_tags_before = dict(id='Texto')
-    remove_tags = [dict(name='div', attrs={'class':['Herramientas ', 'Multimedia']})]
-    no_stylesheets = True
-    extra_css = 'h1 {margin-bottom: .15em;font-size: 2.7em; font-family: Georgia, "Times New Roman", Times, serif;} .Antetitulo {margin: 1em 0;text-transform: uppercase;color: #999;} .PieFoto {margin: .1em 0;padding: .5em .5em .5em .5em;background: #F0F0F0;} .PieFoto p {margin-bottom: 0;font-family: Georgia,"Times New Roman",Times,serif;font-weight: bold; font-style: italic; color: #666;}'
-    keep_only_tags = [dict(name='div', attrs={'class':['Texto ', 'NoticiaFicha ']})]
-    feeds = [
-        (u'Bizkaia', u'http://www.deia.com/index.php/services/rss?seccion=bizkaia'),
-        (u'Bilbao', u'http://www.deia.com/index.php/services/rss?seccion=bilbao'),
-        (u'Hemendik eta Handik', u'http://www.deia.com/index.php/services/rss?seccion=hemendik-eta-handik'),
-        (u'Margen Derecha', u'http://www.deia.com/index.php/services/rss?seccion=margen-derecha'),
-        (u'Encartaciones y Margen Izquierda', u'http://www.deia.com/index.php/services/rss?seccion=margen-izquierda-encartaciones'),
-        (u'Costa', u'http://www.deia.com/index.php/services/rss?seccion=costa'),
-        (u'Duranguesado', u'http://www.deia.com/index.php/services/rss?seccion=duranguesado'),
-        (u'Llodio-Nervión', u'http://www.deia.com/index.php/services/rss?seccion=llodio-nervion'),
-        (u'Arratia-Nervión', u'http://www.deia.com/index.php/services/rss?seccion=arratia-nervion'),
-        (u'Uribe-Txorierri', u'http://www.deia.com/index.php/services/rss?seccion=uribe-txorierri'),
-        (u'Ecos de sociedad', u'http://www.deia.com/index.php/services/rss?seccion=ecos-de-sociedad'),
-        (u'Sucesos', u'http://www.deia.com/index.php/services/rss?seccion=sucesos'),
-        (u'Política', u'http://www.deia.com/index.php/services/rss?seccion=politica'),
-        (u'Euskadi', u'http://www.deia.com/index.php/services/rss?seccion=politica/euskadi'),
-        (u'España', u'http://www.deia.com/index.php/services/rss?seccion=politica/espana'),
-        (u'Sociedad', u'http://www.deia.com/index.php/services/rss?seccion=sociedad'),
-        (u'Euskadi', u'http://www.deia.com/index.php/services/rss?seccion=socidad/euskadi'),
-        (u'Sociedad.España', u'http://www.deia.com/index.php/services/rss?seccion=sociedad/espana'),
-        (u'Ocio y Cultura', u'http://www.deia.com/index.php/services/rss?seccion=ocio-y-cultura'),
-        #(u'Cultura', u'http://www.deia.com/index.php/services/rss?seccion=cultura'),
-        #(u'Ocio', u'http://www.deia.com/index.php/services/rss?seccion=ocio'),
-        (u'On', u'http://www.deia.com/index.php/services/rss?seccion=on'),
-        (u'Agenda', u'http://www.deia.com/index.php/services/rss?seccion=agenda'),
-        (u'Comunicación', u'http://www.deia.com/index.php/services/rss?seccion=comunicacion'),
-        (u'Viajes', u'http://www.deia.com/index.php/services/rss?seccion=viajes'),
-        (u'¡Mundo!', u'http://www.deia.com/index.php/services/rss?seccion=que-mundo'),
-        (u'Humor', u'http://www.deia.com/index.php/services/rss?seccion=humor'),
-        (u'Opinión', u'http://www.deia.com/index.php/services/rss?seccion=opinion'),
-        (u'Editorial', u'http://www.deia.com/index.php/services/rss?seccion=editorial'),
-        (u'Tribuna abierta', u'http://www.deia.com/index.php/services/rss?seccion=tribuna-abierta'),
-        (u'Colaboración', u'http://www.deia.com/index.php/services/rss?seccion=colaboracion'),
-        (u'Columnistas', u'http://www.deia.com/index.php/services/rss?seccion=columnistas'),
-        (u'Deportes', u'http://www.deia.com/index.php/services/rss?seccion=deportes'),
-        (u'Athletic', u'http://www.deia.com/index.php/services/rss?seccion=athletic'),
-        (u'Economía', 'http://www.deia.com/index.php/services/rss?seccion=economia'),
-        (u'Mundo', u'http://www.deia.com/index.php/services/rss?seccion=mundo')]
-
diff --git a/recipes/respekt_magazine.recipe b/recipes/respekt_magazine.recipe
deleted file mode 100644
index 2d5998d5e0..0000000000
--- a/recipes/respekt_magazine.recipe
+++ /dev/null
@@ -1,148 +0,0 @@
-#!/usr/bin/python2
-# -*- coding: utf-8 -*-
-# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
-# Copyright: tomashnyk@gmail.com
-
-__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
-__copyright__ = 'tomashnyk@gmail.com'
-
-import re
-from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-# This imports the version bundled with Calibre
-import lxml
-from lxml.builder import E
-
-class respektRecipe(BasicNewsRecipe):
-    __author__ = u'Tomáš Hnyk'
-    title = u'Respekt - Magazine'
-    publisher = u'Respekt Publishing a. s.'
-    description = u'Articles from the printed edition, password needed for full access'
-    encoding = 'cp1250'
-    language = 'cs'
-    remove_javascript = True
-    extra_css = 'p {text-align:justify} \
-        ul {color:black} \
-        .image_caption {font-size:50%;font-style:italic;} \
-        .author {text-align:left;} \
-        p.indent_first_line {text-indent:30px;}'
-    remove_tags_before = dict(name='div', attrs={'class':['l']})
-    remove_tags_after = dict(id='text')
-    remove_tags = [dict(name='ul', attrs={'class':['tabs-d'], 'id':['comm']}), \
-        dict(name='div', attrs={'class':['slot', 'reklama', 'date']}), \
-        dict(name='span', attrs={'class':['detail-vykrik']}), \
-        dict(name='p', attrs={'class':['detail-vykrik']}), \
-        dict(name='div', attrs={'id':['col123d-video', 'col123d-infographic', 'col123d-gallery', 'col12d-discussion']}),  # soup>lxml>soup in preprocess requires this
-        dict(name='strong', attrs={'class':['detail-vykrik']}),
-        dict(name='script')]
-    # this makes authors left-aligned by not using the author class
-    preprocess_regexps = [(re.compile(r'', re.DOTALL|re.IGNORECASE), lambda match: '')]
-    # remove empty tags
-    preprocess_regexps.append((re.compile(r' ', re.DOTALL|re.IGNORECASE), lambda match: ' '))
-    preprocess_regexps.append((re.compile(r' ', re.DOTALL|re.IGNORECASE), lambda match: ' '))
-    preprocess_regexps.append((re.compile(r' ', re.DOTALL|re.IGNORECASE), lambda match: ''))
-    preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: ''))
-    preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: ''))
-
-    def get_cover_url(self):
-        soup = self.index_to_soup('http://respekt.ihned.cz/')
-        cover = soup.findAll('div', attrs={'class':'cover'})[0].find('img')['src']
-        return cover
-
-    needs_subscription = True
-
-    def get_browser(self):
-        br = BasicNewsRecipe.get_browser(self)
-        if self.username is not None and self.password is not None:
-            br.open('http://muj-ucet.ihned.cz/')
-            br.select_form(name='login')
-            br['login[nick]'] = self.username
-            br['login[pass]'] = self.password
-            br.submit()
-        return br
-
-    def parse_index(self):
-        raw = self.index_to_soup('http://respekt.ihned.cz/aktualni-cislo/', raw=True)
-        root = lxml.html.fromstring(raw)
-        ans = []
-        for article in root.xpath("//div[@class='ow-enclose']/div[@class='ow']"):
-            section_title = article.xpath(".//span[text()='(rubrika: ']")[0].find("a").text
-            date = article.xpath("span[@class='date-author']")[0].text[:-3]
-            title = article.find("h2").find("a").text
-            url = article.find('h2').find('a').get('href')
-            link = {'title':title, 'url':url, 'date':date}
-            for section in ans:
-                if section[0] == section_title:
-                    section[1].append(link)
-                    break
-            else:
-                ans.append((section_title, [link]))
-        return ans
-
-    def cleanup(self):
-        self.browser.open('http://muj-ucet.ihned.cz/?login[logout]=1')
-
-    def preprocess_html(self, soup):
-        raw = u''.join(unicode(a) for a in soup.contents)
-        root = lxml.html.fromstring(raw)
-
-        # Make image captions visible
-        body = root.xpath("//div[@id='text']")[0]
-        add = 0
-        for index, element in enumerate(body):
-            try:
-                if element.tag == 'img':
-                    body.insert(index+add+1, E.p(element.get('title'), {"class":"image_caption"}))
-                    add += 1
-            except:
-                pass
-
-        # Add length of the articles in words after author
-        article_length = str(len(body.text_content().split(' '))) + ' slov'
-        root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length))
-
-        # Make perex (subheading) start on a new line
-        root.xpath("//h1")[0].append(E.br(''))
-
-        # Indent paragraphs when typographically suitable
-        parse = True
-        # There are only single paragraphs in these sections
-        if root.xpath("//title")[0].text == u"Deset českých zpráv, které by vás neměly minout | Deset českých zpráv - RESPEKT.IHNED.CZ":
-            parse = False
-        if root.xpath("//title")[0].text == u"Deset zahraničních zpráv, které by vás neměly minout | Deset světových zpráv - RESPEKT.IHNED.CZ":
-            parse = False
-        if parse:
-            # First paragraph is never indented
-            paragraphs = root.xpath('//p')
-            # Clear the formatting a little bit by removing these attributes
-            for par in paragraphs:
-                if 'class' in par.keys():
-                    if par.attrib['class'] == 'detail-odstavec':
-                        par.attrib.pop('class')
-            paragraphs.reverse()
-            for par in paragraphs[:-1]:
-                try:
-                    # <strong> in the beginning of this paragraph means no indenting as well as ellipses as the only text in paragraph
-                    if len(par) > 0:
-                        if (par.text is None and par.getchildren()[0].tag == 'strong'):
-                            continue
-                        elif par.getprevious().text == u'\u2026':
-                            continue
-                    indent = False
-                    # Either indent if the paragraphs are the same
-                    if par.getprevious().attrib == par.attrib:
-                        indent = True
-                    # Or else if the first paragraph of the text was special
-                    if 'class' in par.getprevious().keys():
-                        par_name = par.getprevious().attrib['class']
-                        if par_name == '01prvniodstavecrepublicblok' or par_name == 'Zkladnodstavec' or par_name == '01titulekhlavn':
-                            indent = True
-                    if indent:
-                        for key in par.keys():
-                            par.attrib.pop(key)
-                        par.attrib['class'] = "indent_first_line"
-                except:
-                    pass
-
-        return(BeautifulSoup(lxml.etree.tostring(root, encoding=unicode)))
diff --git a/recipes/respekt_web.recipe b/recipes/respekt_web.recipe
deleted file mode 100644
index b65d57459d..0000000000
--- a/recipes/respekt_web.recipe
+++ /dev/null
@@ -1,225 +0,0 @@
-#!/usr/bin/python2
-# -*- coding: utf-8 -*-
-# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
-# Copyright: tomashnyk@gmail.com
-
-__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
-__copyright__ = 'tomashnyk@gmail.com'
-
-import re, os, datetime
-from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre.constants import config_dir
-# This imports the version bundled with Calibre
-import lxml
-from lxml.builder import E
-
-class respektWebRecipe(BasicNewsRecipe):
-    __author__ = u'Tomáš Hnyk'
-    title = u'Respekt - Web'
-    publisher = u'Respekt Publishing a. s.'
-    description = u'Free articles from respekt.cz website'
-    encoding = 'cp1250'
-    language = 'cs'
-    remove_javascript = True
-    cover_url = 'http://respekt.ihned.cz/img/R/respekt_logo.png'
-    extra_css = 'p {text-align:justify} \
-        ul {color:black} \
-        .image_caption {font-size:50%;font-style:italic;} \
-        .author {text-align:left;} \
-        p.indent_first_line {text-indent:30px;}'
-    remove_tags_before = dict(name='div', attrs={'class':['l']})
-    remove_tags_after = dict(id='text')
-    remove_tags = [dict(name='ul', attrs={'class':['tabs-d'], 'id':['comm']}),
-        dict(name='div', attrs={'class':['slot', 'reklama', 'date']}),
-        dict(name='span', attrs={'class':['detail-vykrik']}),
-        dict(name='p', attrs={'class':['detail-vykrik']}),
-        dict(name='div', attrs={'id':['col123d-video', 'col123d-infographic', 'col123d-gallery', 'col12d-discussion']}),  # soup>lxml>soup in preprocess requires this
-        dict(name='strong', attrs={'class':['detail-vykrik']}),
-        dict(name='script')]
-    # this makes authors left-aligned by not using the author class
-    preprocess_regexps = [(re.compile(r'', re.DOTALL|re.IGNORECASE), lambda match: '')]
-    # remove empty tags
-    preprocess_regexps.append((re.compile(r' ', re.DOTALL|re.IGNORECASE), lambda match: ' '))
-    preprocess_regexps.append((re.compile(r' ', re.DOTALL|re.IGNORECASE), lambda match: ' '))
-    preprocess_regexps.append((re.compile(r' ', re.DOTALL|re.IGNORECASE), lambda match: ''))
-    preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: ''))
-    preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: ''))
-
-    def parse_index(self):
-        # Read already downloaded articles
-        recipe_dir = os.path.join(config_dir, 'recipes')
-        old_articles = os.path.join(recipe_dir, self.title)
-        past_items = []
-        if os.path.exists(old_articles):
-            with file(old_articles) as f:
-                for h in f:
-                    l = h.strip().split(" ")
-                    past_items.append((l[0], " ".join(l[1:])))
-        old_urls = [x[0] for x in past_items]
-        count_items = {}
-        current_items = []
-        # Keep a list of only 20 latest articles for each section
-        past_items.reverse()
-        for item in past_items:
-            if item[1] in count_items.keys():
-                if count_items[item[1]] < 20:
-                    count_items[item[1]] += 1
-                    current_items.append(item)
-            else:
-                count_items[item[1]] = 1
-                current_items.append(item)
-        current_items.reverse()
-
-        sections = []
-        # Get the webpages to download lists of articles from
-        raw = self.index_to_soup('http://respekt.ihned.cz/sloupky-redaktoru/', raw=True)
-        root = lxml.html.fromstring(raw)
-        sections = []
-        for section in root.xpath("//div[@class='ow-enclose sr']/table/tr/td"):
-            try:
-                url = section.find('a').get('href')
-                if not ('?m=authors&person[id]=' in url):
-                    sections.append((url, section.find('a').find('b').text))
-            except:
-                pass
-        sections.append(('http://respekt.ihned.cz/respekt-dj/', 'Respekt DJ'))
-        sections.append(('http://respekt.ihned.cz/fokus/', 'Fokus'))
-        sections.append(('http://respekt.ihned.cz/respekt-hub/', 'Respekt Hub'))
-        sections.append(('http://respekt.ihned.cz/rozhovory/', 'Rozhovory'))
-        sections.append(('http://respekt.ihned.cz/glosy/', 'Glosy'))
-
-        # Get the list of articles
-        ans = []
-        for section in sections:
-            raw = self.index_to_soup(section[0], raw=True)
-            root = lxml.html.fromstring(raw)
-            list_of_articles = []
-            articles = root.xpath("//div[@class='ow-enclose']/div[@class='ow']")
-            # Sort the articles in a section from oldest to newest
-            articles.reverse()
-            for article in articles:
-                date = getattr(article.xpath("span[@class='date-author']")[0], 'text', '')[:-3]
-                author = getattr(article.xpath("span[@class='date-author']")[0].find("a"), 'text', '')
-                title = getattr(article.find("h2").find("a"), 'text', '')
-                url = article.find('h2').find('a').get('href')
-                # Only download new articles
-                if url not in old_urls:
-                    old_urls.append(url)
-                    current_items.append((url, section[1]))
-                    list_of_articles.append({'title':title, 'url':url, 'date':date, 'author':author})
-            # Redownload this page next time if it is still being updated (between 7 and 17 GMT generally, so make the limits a little bit bigger)
-            if section[1] == 'Respekt DJ':
-                if list_of_articles:
-                    if datetime.datetime.today().weekday() in range(0,5) and 6 < datetime.datetime.utcnow().hour < 17:
-                        # list_of_articles = list_of_articles[:-1]
-                        current_items = current_items[:-1]
-            if list_of_articles:
-                ans.append((section[1], list_of_articles))
-        # Write already downloaded articles
-        with file(old_articles, 'w') as f:
-            f.write('\n'.join('{} {}'.format(*x) for x in current_items))
-        return ans
-
-    # For some reason, the following does not work:
-    # preprocess_regexps.append((re.compile(r'', re.DOTALL|re.IGNORECASE), lambda match: ''))
-    def preprocess_raw_html(self, raw_html, url):
-        return re.sub("", "", raw_html)
-
-    def preprocess_html(self, soup):
-        raw = u''.join(unicode(a) for a in soup.contents)
-        root = lxml.html.fromstring(raw)
-        # Make image captions visible
-        body = root.xpath("//div[@id='text']")[0]
-        add = 0
-        for index, element in enumerate(body):
-            try:
-                if element.tag == 'img':
-                    body.insert(index+add+1, E.p(element.get('title'), {"class":"image_caption"}))
-                    add += 1
-            except:
-                pass
-        # Make captions visible on the website have the same style
-        try:
-            root.xpath("//div[@class='hlavni-obrazek-popis']")[0].attrib['class'] = 'image_caption'
-        except:
-            pass
-        # For DJ, the perex is always the same, so remove it
-        if root.xpath("//title")[0].text.split("|")[-1] == u' Respekt DJ - RESPEKT.CZ':
-
-            perex = root.xpath("//div[@id='perex']")[0]
-            clean = root.xpath("//div[@class='clean']")[0]
-            perex.getparent().remove(perex)
-            clean.getparent().remove(clean)
-
-            # DJ section gets mal-formatted on kindle otherwise
-            for i in root.xpath("//h2[@class='d-dj-t']"):
-                i.attrib['class'] = ''
-                E.style = "font-size:60%;font-weight:normal;"
-                time = E('span', i.getprevious().text_content(), style=E.style)
-                # Time should be ahead of the title
-                time.tail = ' ' + i.text
-                i.text = ''
-                i.insert(0, time)
-            for i in root.xpath("//div[@class='d-dj-d']"):
-                i.attrib['class'] = ''
-                i.xpath("div/span")[0].text = ''
-            for i in root.xpath("//div[@class='d-dj-b']"):
-                i.attrib['class'] = ''
-
-            # Make captions visible on the website have the same style
-            root.xpath("//div[@class='hlavni-obrazekDJ-popis']")[0].attrib['class'] = 'image_caption'
-
-            # Reverse the entries so that the earliest are at the top
-            entries = root.xpath("//div[@class='d-dj-i']")
-            entries.reverse()
-            dj_body = entries[0].getparent()
-            for entry in entries:
-                dj_body.remove(entry)
-                dj_body.append(entry)
-
-        # We are not interested in this paragraph as it stays the same and is essentially an ad
-        if root.xpath("//title")[0].text.split("|")[-1] == u' Audit Jana Macháčka - Respekt.iHNed.cz':
-            ad = root.xpath("//p[@id='ajmonf']")[0]
-            ad.getparent().remove(ad)
-
-        # Add length of the articles in words after author
-        article_length = str(len(body.text_content().split(' '))) + ' slov'
-        root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length))
-
-        # Make perex (subheading) start on a new line
-        root.xpath("//h1")[0].append(E.br(''))
-
-        # Indent paragraphs when typographically suitable
-        # First paragraph is never indented
-        paragraphs = root.xpath('//p')
-        # Clear the formatting a little bit by removing these attributes
-        for par in paragraphs:
-            if 'class' in par.keys():
-                if par.attrib['class'] == 'detail-odstavec':
-                    par.attrib.pop('class')
-        paragraphs.reverse()
-        for par in paragraphs[:-1]:
-            try:
-                # <strong> in the beginning of this paragraph means no indenting as well as ellipses as the only text in paragraph
-                if len(par) > 0:
-                    if (par.text is None and par.getchildren()[0].tag == 'strong'):
-                        continue
-                    elif par.getprevious().text == u'\u2026':
-                        continue
-                indent = False
-                # Either indent if the paragraphs are the same
-                if par.getprevious().attrib == par.attrib:
-                    indent = True
-                # Or else if the first paragraph of the text was special
-                if 'class' in par.getprevious().keys():
-                    par_name = par.getprevious().attrib['class']
-                    if par_name == '01prvniodstavecrepublicblok' or par_name == 'Zkladnodstavec' or par_name == '01titulekhlavn':
-                        indent = True
-                if indent:
-                    for key in par.keys():
-                        par.attrib.pop(key)
-                    par.attrib['class'] = "indent_first_line"
-            except:
-                pass
-
-        return(BeautifulSoup(lxml.etree.tostring(root, encoding=unicode)))
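
Note: all three files removed by this patch are calibre news-download recipes built on the
same BasicNewsRecipe API. For reference, a minimal sketch of a recipe of the same shape is
shown below; the class name, feed title, and RSS URL are placeholders, not a real
publication:

    from calibre.web.feeds.recipes import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        # Metadata shown in calibre's "Fetch news" scheduler
        title = u'Example News'
        __author__ = u'Example Author'
        language = 'en'
        # Download limits: how far back to look, and how many items per feed
        oldest_article = 7
        max_articles_per_feed = 100
        # Cleanup options the removed recipes also relied on
        no_stylesheets = True
        remove_javascript = True
        # (section name, RSS URL) pairs; calibre fetches each feed and
        # assembles the articles into an e-book
        feeds = [
            (u'Front page', u'http://example.com/rss.xml'),
        ]

Recipes that need more control than a plain feed list override hooks such as parse_index(),
preprocess_html(), or get_browser(), which is exactly what the removed Respekt recipes did
for custom section scraping, login handling, and HTML cleanup.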