# calibre/recipes/starwars.recipe

# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe


class TheForce(BasicNewsRecipe):
    """calibre news recipe for the 'The Force' feed on theforce.net.

    The class attributes configure the download (feed URL, article age
    limit, source encoding, which tags to keep or drop), and
    ``preprocess_html`` trims boilerplate out of each parsed article.
    """

    title = u'The Force'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    encoding = 'cp1252'

    remove_stylesheets = True
    conversion_options = {'linearize_tables': True}
    # Everything after the 'KonaBody' div is dropped from the article.
    remove_tags_after = dict(name='div', attrs={'class': 'KonaBody'})
    # The story lives in the table cell identified by this background image.
    keep_only_tags = dict(
        name='td', attrs={'background': '/images/span/tile_story_bgtile.gif'})
    remove_tags = [
        dict(name='iframe'),
    ]

    feeds = [
        ('The Force',
         'http://www.theforce.net/outnews/tfnrdf.xml'),
    ]

    def preprocess_html(self, soup):
        """Strip trailing boilerplate and unwrap linked images.

        Three clean-up passes are applied to ``soup`` in place:

        1. The first ``<i>`` element containing the Facebook promo text is
           removed together with everything that follows it.
        2. The ``<table>`` enclosing the first element with class
           ``articleoption`` is removed together with everything that
           follows it.
        3. Every ``<img>`` wrapped in an ``<a href=...>`` whose href carries
           a non-empty ``?key=value`` query is replaced by turning the
           anchor itself into an ``<img>`` whose ``src`` is that value.

        :param soup: parsed article document (BeautifulSoup tree).
        :return: the same ``soup`` object, modified in place.
        """
        for tag in soup.findAll(name='i'):
            if 'Remember to join the Star Wars Insider Facebook' in self.tag_to_string(tag):
                # Drop the promo line and everything after it.
                for x in tag.findAllNext():
                    x.extract()
                tag.extract()
                break

        option = soup.find(attrs={'class': 'articleoption'})
        if option is not None:
            table = option.findParent('table')
            # Only prune when an enclosing table exists; previously a missing
            # table led to calling .extract() on None (AttributeError).
            if table is not None:
                for x in table.findAllNext():
                    x.extract()
                table.extract()

        for img in soup.findAll('img', src=True):
            a = img.findParent('a', href=True)
            if a is None:
                continue
            # Take the text after the first '=' in the last '?'-separated
            # part of the href; an empty result means nothing to rewrite.
            url = a.get('href').split('?')[-1].partition('=')[-1]
            if url:
                # Replace the link + inner image by a single <img> that
                # points at the URL taken from the link's query string.
                img.extract()
                a.name = 'img'
                a['src'] = url
                del a['href']
        return soup