import re

from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe


def new_tag(soup, name, attrs=()):
    """Create a new tag called *name* that belongs to *soup*.

    When the soup object exposes a ``new_tag`` factory it is used, with
    *attrs* converted to a dict. Otherwise the ``Tag`` class is instantiated
    directly, passing ``None`` when *attrs* is empty.

    :param soup: the parsed document the tag is created for
    :param name: tag name, e.g. ``'div'``
    :param attrs: iterable of ``(key, value)`` attribute pairs
    :return: the newly created tag (not yet inserted into the tree)
    """
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


class JoopRecipe(BasicNewsRecipe):
    """Recipe for joop.nl; articles are taken from the site's footer lists."""

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    country = 'NL'
    version = 1

    title = u'Joop'
    publisher = u'Vara'
    category = u'News, Politics, Discussion'
    description = u'Political blog from the Netherlands'

    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False

    no_stylesheets = True
    remove_javascript = True

    # Only the author header, the date headline and the article body are kept.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'author_head clearfix photo'}),
        dict(name='h2', attrs={'class': 'columnhead smallline'}),
        dict(name='div', attrs={'class': re.compile('article.*')}),
    ]

    # The joop_* classes are assigned to elements by preprocess_html().
    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
        img {margin-right: 0.4em;}
        h3 {font-size: medium; font-style: italic; font-weight: normal;}
        h2 {font-size: xx-large; font-weight: bold}
        sub {color: #666666; font-size: x-small; font-weight: normal;}
        div.joop_byline {font-size: large}
        div.joop_byline_job {font-size: small; color: #696969;}
        div.joop_date {font-size: x-small; font-style: italic; margin-top: 0.6em}
        '''

    # Front page that is scraped; also prefixed to every article href.
    INDEX = 'http://www.joop.nl'

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    def parse_index(self):
        """Build the list of sections and their articles from the front page.

        For every known section name, the ``<h2>`` inside the ``#footer`` div
        whose rendered contents equal that name is located, and the links in
        the ``<ul class="linklist">`` that follows it become the articles.

        :return: list of ``(section_title, articles)`` tuples, where each
            article is a dict with ``title``, ``date``, ``url`` and
            ``description`` keys. Sections for which no article was found
            are left out. The list is empty when the footer is missing.
        """
        sections = ['Politiek', 'Wereld', 'Economie', 'Groen', 'Media',
                    'Leven', 'Show', 'Opinies']
        soup = self.index_to_soup(self.INDEX)
        answer = []

        footer = soup.find('div', attrs={'id': 'footer'})
        if footer is None:
            # Without the footer there is nothing to scrape; bail out instead
            # of failing with an AttributeError on None below.
            self.log.warn('Joop: no footer div found on', self.INDEX)
            return answer

        for section in sections:
            articles = self._section_articles(footer, section)
            if articles:
                answer.append((section, articles))

        return answer

    def _section_articles(self, footer, section):
        """Return the article dicts listed under *section* inside *footer*."""
        heading = footer.find(
            lambda tag: tag.name == 'h2' and
            tag.renderContents().decode('utf-8') == section)
        if heading is None:
            return []

        link_list = heading.findNextSibling('ul', 'linklist')
        if link_list is None:
            return []

        articles = []
        for item in link_list.findAll('li'):
            anchor = item.a
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # List items without a usable link cannot become articles.
                continue
            articles.append({
                'title': self.tag_to_string(anchor),
                'date': None,
                'url': self.INDEX + href,
                'description': '',
            })
        return articles

    def preprocess_html(self, soup):
        """Rewrite the author and date headers so that extra_css can style them.

        * When the author header div is present, the first ``<h2>`` of the
          document is turned into ``<div class="joop_byline">`` (its first
          ``<span>``, if any, into ``<div class="joop_byline_job">``) and put
          in place of the author header div.
        * The ``<h2 class="columnhead smallline">`` is replaced by a
          ``<div class="joop_date">`` holding the first text node of its
          ``<span class="info">``. It is left untouched when that span or
          its text is missing.

        :param soup: parsed article page
        :return: the same *soup* object, modified in place
        """
        author_head = soup.find('div', 'author_head clearfix photo')
        if author_head is not None:
            byline = soup.find('h2')
            if byline is not None:
                byline.name = 'div'
                byline['class'] = 'joop_byline'
                job = byline.find('span')
                if job is not None:
                    job.name = 'div'
                    job['class'] = 'joop_byline_job'
                author_head.replaceWith(byline)

        date_head = soup.find('h2', attrs={'class': 'columnhead smallline'})
        if date_head is not None:
            info = date_head.find('span', 'info')
            txt = info.find(text=True) if info is not None else None
            # Appending None to a tag is an error, so only swap in the date
            # div when there is actual text to show.
            if txt is not None:
                date_div = new_tag(soup, 'div', attrs=[('class', 'joop_date')])
                date_div.append(txt)
                date_head.replaceWith(date_div)

        return soup