from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag


def new_tag(soup, name, attrs=()):
    """Create a new tag called *name* that belongs to *soup*.

    If *soup* exposes a ``new_tag`` factory it is used, with *attrs*
    converted to a dict. Otherwise a ``Tag`` is constructed directly,
    passing ``None`` instead of an empty *attrs*.

    :param soup: the parsed document the new tag is created for.
    :param name: the tag name, e.g. ``'head'``.
    :param attrs: attributes in any form accepted by ``dict()``; may be
        empty.
    :return: the newly created tag (not yet inserted into the tree).
    """
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        # No factory method on the soup object: build the Tag directly.
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))


class LevanteRecipe(BasicNewsRecipe):
    """Recipe for Levante (El Mercantil Valenciano), built from its RSS feeds."""

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    version = 1
    language = 'es'
    description = u'El Mercantil Valenciano'

    title = u'Levante'

    oldest_article = 2
    max_articles_per_feed = 100
    encoding = 'latin1'

    no_stylesheets = True
    remove_javascript = True

    # Feeds taken from http://www.levante-emv.com/servicios/rss/rss.jsp?pServicio=rss
    # Feed titles are without accented characters for now. Hope to resolve
    # this in the future.
    # Each entry is a (feed title, feed URL) pair.
    feeds = [
        (u'Portada Valencia', u'http://www.levante-emv.com/elementosInt/rss/1'),
        (u'Portada Castello', u'http://www.levante-emv.com/elementosInt/rss/2'),
        (u'Portada Alacant', u'http://www.levante-emv.com/elementosInt/rss/3'),
        (u'Lo Mas Leido', u'http://www.levante-emv.com/elementosInt/rss/LoMas'),
        (u'Seccion al minuto', u'http://www.levante-emv.com/elementosInt/rss/AlMinuto'),
        (u'Comunidad Valenciana', u'http://www.levante-emv.com/elementosInt/rss/19'),
        (u'Valencia', u'http://www.levante-emv.com/elementosInt/rss/16'),
        (u'Castello', u'http://www.levante-emv.com/elementosInt/rss/4'),
        (u'Alacant', u'http://www.levante-emv.com/elementosInt/rss/17'),
        (u'Comarcas', u'http://www.levante-emv.com/elementosInt/rss/12'),
        (u'Espana', u'http://www.levante-emv.com/elementosInt/rss/6'),
        (u'Internacional', u'http://www.levante-emv.com/elementosInt/rss/7'),
        (u'Opinion', u'http://www.levante-emv.com/elementosInt/rss/5'),
        (u'Economia', u'http://www.levante-emv.com/elementosInt/rss/8'),
        (u'Sociedad', u'http://www.levante-emv.com/elementosInt/rss/9'),
        (u'Sucesos', u'http://www.levante-emv.com/elementosInt/rss/10'),
        (u'Deportes', u'http://www.levante-emv.com/elementosInt/rss/11'),
        (u'Motor', u'http://www.levante-emv.com/elementosInt/rss/31'),
        (u'Panorama', u'http://www.levante-emv.com/elementosInt/rss/18'),
        (u'Salud y Vida', u'http://www.levante-emv.com/elementosInt/rss/20'),
        (u'Ciencia y Salud', u'http://www.levante-emv.com/elementosInt/rss/44'),
        (u'Ciencia e Investigacion', u'http://www.levante-emv.com/elementosInt/rss/23'),
        (u'Ensenanza', u'http://www.levante-emv.com/elementosInt/rss/22'),
        (u'Fiestas y Tradiciones', u'http://www.levante-emv.com/elementosInt/rss/24'),
        (u'Club Diario', u'http://www.levante-emv.com/elementosInt/rss/26'),
        (u'Juntos', u'http://www.levante-emv.com/elementosInt/rss/33'),
        (u'Integrados', u'http://www.levante-emv.com/elementosInt/rss/35'),
        (u'Agenda', u'http://www.levante-emv.com/elementosInt/rss/36'),
        (u'Cultura', u'http://www.levante-emv.com/elementosInt/rss/39'),
        (u'Tecnologia', u'http://www.levante-emv.com/elementosInt/rss/40'),
        (u'Gente', u'http://www.levante-emv.com/elementosInt/rss/41'),
        (u'Television', u'http://www.levante-emv.com/elementosInt/rss/42'),
        (u'Participa', u'http://www.levante-emv.com/elementosInt/rss/45'),
    ]

    # Tag specifications for the headline, the subtitle and the article text.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'noticia_titular'}),
        dict(name='div', attrs={'class': 'subtitulo'}),
        dict(name='div', attrs={'id': 'noticia_texto', 'class': 'noticia_texto'}),
    ]

    def preprocess_html(self, soup):
        """Replace the document's ``<head>`` with a fresh, empty one.

        :param soup: the parsed article page; it is modified in place.
        :return: the same *soup* object.
        """
        # Nuke some real crappy html
        their_head = soup.head
        # A page without a <head> has nothing to remove; calling extract()
        # on None would raise AttributeError.
        if their_head is not None:
            their_head.extract()

        my_head = new_tag(soup, 'head')
        soup.insert(0, my_head)

        return soup