from calibre.web.feeds.news import BasicNewsRecipe


class ElUniversalImpresaRecipe(BasicNewsRecipe):
    """Calibre recipe for the print edition of El Universal.

    The list of feeds is built by scraping the print-edition index page
    (see ``parse_index``); every article URL is then rewritten by
    ``print_version`` and the downloaded page is cleaned up by
    ``preprocess_html``.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'es_MX'
    version = 1

    title = u'El Universal (Edici\u00F3n Impresa)'
    publisher = u'El Universal'
    category = u'News, Mexico'
    description = u'News from Mexico'

    remove_empty_feeds = True
    remove_javascript = True

    # Site root; article hrefs found on the index page are appended to it.
    INDEX = 'http://www.eluniversal.com.mx'

    extra_css = ''' body{font-family:verdana,arial,helvetica,geneva,sans-serif;} '''

    # NOTE(review): 'language' is 'en' here while the recipe-level
    # ``language`` is 'es_MX' -- confirm which one is intended.
    conversion_options = {'comments': description, 'tags': category,
                          'language': 'en', 'publisher': publisher,
                          'linearize_tables': True}

    def parse_index(self):
        """Scrape the print-edition index page into a list of feeds.

        Returns:
            A list of ``(feed_title, articles)`` tuples, where each article
            is a dict with the keys ``title``, ``date``, ``url`` and
            ``description``. The first feed is always ``'Primera Plana'``.

        Raises:
            ValueError: if the index table (``<table width="500">``) is not
                present in the downloaded page.
        """
        soup = self.index_to_soup(
            'http://www.eluniversal.com.mx/edicion_impresa.html')

        table = soup.find('table', attrs={'width': '500'})
        if table is None:
            # find() returns None when nothing matches; fail with a message
            # that names the cause instead of an opaque AttributeError.
            raise ValueError(
                'El Universal index table (width="500") not found; '
                'the page layout may have changed')

        index = []

        # Cells with class "arnegro12" hold one link plus free text; the
        # link becomes the article, the remaining text its description.
        articles = []
        for td in table.findAll(
                lambda tag: tag.name == 'td' and
                tag.get('class') == 'arnegro12'):
            a = td.a
            if a is None or not a.get('href'):
                # No usable link in this cell: nothing to download.
                continue
            # Detach the link first so that the text extracted from the
            # cell afterwards no longer includes the link text.
            a.extract()
            title = self.tag_to_string(a)
            url = self.INDEX + a['href']
            description = self.tag_to_string(td)
            articles.append({'title': title, 'date': None, 'url': url,
                             'description': description})
        index.append(('Primera Plana', articles))

        # Attribute-less cells each describe one feed: the first link with
        # non-empty text gives the feed title, the following links are the
        # feed's articles.
        for td in table.findAll(
                lambda tag: tag.name == 'td' and len(tag.attrs) == 0):
            articles = []
            feed_title = None
            for a in td.findAll('a'):
                if not feed_title:
                    feed_title = self.tag_to_string(a)
                    continue
                href = a.get('href')
                if not href:
                    continue
                articles.append({'title': self.tag_to_string(a),
                                 'date': None,
                                 'url': self.INDEX + href,
                                 'description': ''})
            # Without a title no article can have been collected, so such
            # a cell would only contribute an empty, untitled feed.
            if feed_title:
                index.append((feed_title, articles))

        return index

    def print_version(self, url):
        """Return the URL of the printer-friendly page for ``url``.

        The last path segment is prefixed with ``vi_``. URLs containing
        ``'wcarton'`` yield ``None``.
        """
        # NOTE(review): 'wcarton' presumably marks cartoon pages that have
        # no print version -- confirm.
        if 'wcarton' in url:
            return None

        main, _sep, article_id = url.rpartition('/')
        return main + '/vi_' + article_id

    def preprocess_html(self, soup):
        """Clean up a downloaded article page and return the soup.

        Removes the first ``<table>`` (if any) and every ``<p>`` without
        text, then turns the ``<font color="#0F046A">`` element (if any)
        into an ``<h1>`` stripped of its presentational attributes.
        """
        table = soup.find('table')
        if table is not None:
            table.extract()

        for p in soup.findAll('p'):
            if not self.tag_to_string(p).strip():
                p.extract()

        tag = soup.find('font', attrs={'color': '#0F046A'})
        if tag is not None:
            # NOTE(review): 'helvetica,' and 'sans-serif' are presumably
            # stray attributes produced by an unquoted face=... value in
            # the site's markup -- verify against a live page.
            for attr in ['color', 'face', 'helvetica,', 'sans-serif',
                         'size']:
                if tag.get(attr) is not None:
                    del tag[attr]
            tag.name = 'h1'

        return soup