mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
54 lines
1.6 KiB
Plaintext
54 lines
1.6 KiB
Plaintext
# -*- coding: utf-8 -*-
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
|
|
class TheForce(BasicNewsRecipe):
    """Calibre news recipe for the 'The Force' feed on theforce.net."""

    title = u'The Force'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    encoding = 'cp1252'

    remove_stylesheets = True
    conversion_options = {'linearize_tables': True}

    # Tag specifications consumed by the BasicNewsRecipe machinery: the story
    # is taken from the <td> with the tiled story background, content after
    # the 'KonaBody' <div> is discarded, and iframes are stripped.
    remove_tags_after = dict(name='div', attrs={'class': 'KonaBody'})
    keep_only_tags = dict(
        name='td', attrs={'background': '/images/span/tile_story_bgtile.gif'})
    remove_tags = [
        dict(name='iframe'),
    ]

    feeds = [
        ('The Force',
         'http://www.theforce.net/outnews/tfnrdf.xml'),
    ]

    def preprocess_html(self, soup):
        """Trim trailing boilerplate and rewrite linked images in *soup*.

        Three passes are made over the parsed article, in this order:

        1. The first ``<i>`` element whose text contains the "Remember to
           join the Star Wars Insider Facebook" notice is removed together
           with everything that follows it in the document.
        2. If an element with class ``articleoption`` exists and has an
           enclosing ``<table>``, that table and everything after it are
           removed.
        3. Every ``<img src=...>`` nested in an ``<a href=...>`` is replaced:
           the anchor itself is turned into an ``<img>`` whose ``src`` is the
           text after the first ``=`` in the last ``?``-separated part of the
           href (presumably the full-size image URL -- verify against the
           site), and the original image is dropped. Links whose href yields
           an empty value are left untouched.

        The soup is modified in place and also returned.
        """

        def cut_from(node):
            # Remove everything following `node` in document order (this
            # includes its own descendants), then `node` itself.
            for following in node.findAllNext():
                following.extract()
            node.extract()

        for italic in soup.findAll(name='i'):
            if 'Remember to join the Star Wars Insider Facebook' in self.tag_to_string(italic):
                cut_from(italic)
                break

        option = soup.find(attrs={'class': 'articleoption'})
        if option is not None:
            table = option.findParent('table')
            if table is not None:
                cut_from(table)

        for img in soup.findAll('img', src=True):
            link = img.findParent('a', href=True)
            if link is None:
                continue
            url = link.get('href').split('?')[-1].partition('=')[-1]
            if not url:
                continue
            # The anchor takes over as the image element. The original <img>
            # is detached, so there is no point updating its src afterwards.
            img.extract()
            link.name = 'img'
            link['src'] = url
            del link['href']
        return soup
|