New recipe Darknet by Oliver Niesner

This commit is contained in:
Kovid Goyal 2009-08-26 08:28:11 -06:00
parent 4bcede833d
commit fd2d6bdd3d
6 changed files with 69 additions and 25 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 KiB

View File

@ -55,7 +55,7 @@ recipe_modules = ['recipe_' + r for r in (
'eltiempo_hn', 'slate', 'tnxm', 'bbcvietnamese', 'vnexpress', 'eltiempo_hn', 'slate', 'tnxm', 'bbcvietnamese', 'vnexpress',
'volksrant', 'theeconomictimes_india', 'ourdailybread', 'volksrant', 'theeconomictimes_india', 'ourdailybread',
'monitor', 'republika', 'beta', 'beta_en', 'glasjavnosti', 'monitor', 'republika', 'beta', 'beta_en', 'glasjavnosti',
'esquire', 'livemint', 'thedgesingapore', 'esquire', 'livemint', 'thedgesingapore', 'darknet',
)] )]

View File

@ -0,0 +1,43 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetch darknet.
'''
from calibre.web.feeds.news import BasicNewsRecipe
class darknet(BasicNewsRecipe):
    # News recipe for the "darknet" feed: declares the feed to download and
    # the page elements to strip from each fetched article. All behaviour
    # comes from BasicNewsRecipe; this class only sets configuration values.

    # --- Identity / metadata -------------------------------------------
    __author__ = 'Oliver Niesner'
    title = 'darknet'
    description = 'Ethical hacking and security news'
    # `_` is not defined in this file; presumably the translation function
    # that calibre installs as a builtin -- confirm before running standalone.
    language = _('English')

    # --- What to fetch --------------------------------------------------
    # Each entry is a (feed title, feed URL) pair.
    feeds = [
        ('darknet', 'http://feedproxy.google.com/darknethackers'),
    ]
    oldest_article = 180
    max_articles_per_feed = 40
    # False: do not rely on the content embedded in the feed itself.
    use_embedded_content = False

    # --- Presentation ---------------------------------------------------
    timefmt = ' [%b %d %Y]'
    no_stylesheets = True

    # --- Page clean-up --------------------------------------------------
    # Tag specifications to drop from every article page.
    # NOTE(review): these ids/classes cannot be checked against the live
    # darknet markup from this file -- verify they still match.
    remove_tags = [
        # Elements selected by id.
        {'id': 'navi_top'},
        {'id': 'navi_bottom'},
        {'id': 'logo'},
        {'id': 'login_suche'},
        {'id': 'navi_login'},
        {'id': 'breadcrumb'},
        {'id': 'subtitle'},
        {'id': 'bannerzone'},
        # Elements selected by tag name plus CSS class.
        {'name': 'span', 'attrs': {'class': 'rsaquo'}},
        {'name': 'span', 'attrs': {'class': 'next'}},
        {'name': 'span', 'attrs': {'class': 'prev'}},
        {'name': 'div', 'attrs': {'class': 'news_logo'}},
        {'name': 'div', 'attrs': {'class': 'nextprev'}},
        {'name': 'p', 'attrs': {'class': 'news_option'}},
        {'name': 'p', 'attrs': {'class': 'news_foren'}},
    ]
    # Marker element for trimming the page: the article body container.
    remove_tags_after = [
        {'name': 'div', 'attrs': {'class': 'entrybody'}},
    ]

View File

@ -32,7 +32,6 @@ class elektrolese(BasicNewsRecipe):
feeds = [ (u'electrolese', u'http://elektrolese.blogspot.com/feeds/posts/default?alt=rss') ] feeds = [ (u'elektrolese', u'http://elektrolese.blogspot.com/feeds/posts/default?alt=rss') ]

View File

@ -19,16 +19,24 @@ class hnaDe(BasicNewsRecipe):
timefmt = ' [%d %b %Y]' timefmt = ' [%d %b %Y]'
max_articles_per_feed = 40 max_articles_per_feed = 40
no_stylesheets = True no_stylesheets = True
remove_javascript = True
encoding = 'iso-8859-1' encoding = 'iso-8859-1'
remove_tags = [dict(id='topnav'), remove_tags = [dict(id='topnav'),
dict(id='nav_main'), dict(id='nav_main'),
dict(id='teaser'),
dict(id='suchen'), dict(id='suchen'),
dict(id='superbanner'),
dict(id='navigation'),
dict(id='skyscraper'),
dict(id=''), dict(id=''),
dict(name='span'), dict(name='span'),
dict(name='ul', attrs={'class':'linklist'}), dict(name='ul', attrs={'class':'linklist'}),
dict(name='a', attrs={'href':'#'}), dict(name='a', attrs={'href':'#'}),
dict(name='div', attrs={'class':'hlist'}),
dict(name='div', attrs={'class':'subc noprint'}),
dict(name='p', attrs={'class':'breadcrumb'}), dict(name='p', attrs={'class':'breadcrumb'}),
dict(name='a', attrs={'style':'cursor:hand'}),
dict(name='p', attrs={'class':'h5'})] dict(name='p', attrs={'class':'h5'})]
#remove_tags_after = [dict(name='div', attrs={'class':'rahmenbreaking'})] #remove_tags_after = [dict(name='div', attrs={'class':'rahmenbreaking'})]
remove_tags_after = [dict(name='a', attrs={'href':'#'})] remove_tags_after = [dict(name='a', attrs={'href':'#'})]
@ -38,3 +46,4 @@ class hnaDe(BasicNewsRecipe):

View File

@ -6,6 +6,7 @@ Fetch Linuxdevices.
''' '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Sueddeutsche(BasicNewsRecipe): class Sueddeutsche(BasicNewsRecipe):
@ -16,22 +17,22 @@ class Sueddeutsche(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
timefmt = ' [%a %d %b %Y]' timefmt = ' [%a %d %b %Y]'
max_articles_per_feed = 50 max_articles_per_feed = 50
language = _('English')
no_stylesheets = True no_stylesheets = True
html2epub_options = 'linearize_tables = True\nbase_font_size2=14' language = _('English')
html2lrf_options = ['--ignore-tables'] remove_javascript = True
conversion_options {' linearize_tables' : True}
encoding = 'latin1' encoding = 'latin1'
remove_tags_after = [dict(id='nointelliTXT')] remove_tags_after = [dict(id='intelliTxt')]
filter_regexps = [r'ad\.doubleclick\.net'] filter_regexps = [r'ad\.doubleclick\.net']
remove_tags = [dict(name='div', attrs={'class':'bannerSuperBanner'}), remove_tags = [dict(name='div', attrs={'class':'bannerSuperBanner'}),
dict(name='div', attrs={'class':'bannerSky'}), dict(name='div', attrs={'class':'bannerSky'}),
dict(name='div', attrs={'border':'0'}),
dict(name='div', attrs={'class':'footerLinks'}), dict(name='div', attrs={'class':'footerLinks'}),
dict(name='div', attrs={'class':'seitenanfang'}), dict(name='div', attrs={'class':'seitenanfang'}),
dict(name='td', attrs={'class':'mar5'}), dict(name='td', attrs={'class':'mar5'}),
dict(name='td', attrs={'class':'mar5'}),
dict(name='table', attrs={'class':'pageAktiv'}), dict(name='table', attrs={'class':'pageAktiv'}),
dict(name='table', attrs={'class':'xartable'}), dict(name='table', attrs={'class':'xartable'}),
dict(name='table', attrs={'class':'wpnavi'}), dict(name='table', attrs={'class':'wpnavi'}),
@ -40,24 +41,26 @@ class Sueddeutsche(BasicNewsRecipe):
dict(name='table', attrs={'class':'artikelBox'}), dict(name='table', attrs={'class':'artikelBox'}),
dict(name='table', attrs={'class':'kommentare'}), dict(name='table', attrs={'class':'kommentare'}),
dict(name='table', attrs={'class':'pageBoxBot'}), dict(name='table', attrs={'class':'pageBoxBot'}),
dict(name='table', attrs={'td':'height="3"'}),
dict(name='table', attrs={'class':'contentpaneopen'}),
dict(name='td', attrs={'nowrap':'nowrap'}), dict(name='td', attrs={'nowrap':'nowrap'}),
dict(name='td', attrs={'valign':'middle'}),
dict(name='td', attrs={'align':'left'}), dict(name='td', attrs={'align':'left'}),
dict(name='td', attrs={'align':'center'}),
dict(name='td', attrs={'height':'5'}), dict(name='td', attrs={'height':'5'}),
dict(name='td', attrs={'class':'ArticleWidgetsHeadline'}),
dict(name='div', attrs={'class':'artikelBox navigatorBox'}), dict(name='div', attrs={'class':'artikelBox navigatorBox'}),
dict(name='div', attrs={'class':'similar-article-box'}), dict(name='div', attrs={'class':'similar-article-box'}),
dict(name='div', attrs={'class':'videoBigHack'}), dict(name='div', attrs={'class':'videoBigHack'}),
dict(name='td', attrs={'class':'artikelDruckenRight'}), dict(name='td', attrs={'class':'artikelDruckenRight'}),
dict(name='td', attrs={'class':'width="200"'}), dict(name='td', attrs={'class':'width="200"'}),
dict(name='span', attrs={'class':'content_rating'}),
dict(name='a', attrs={'href':'http://www.addthis.com/bookmark.php'}),
dict(name='a', attrs={'href':'/news'}), dict(name='a', attrs={'href':'/news'}),
dict(name='a', attrs={'href':'/'}),
dict(name='a', attrs={'href':'/articles'}),
dict(name='a', attrs={'href':'/cgi-bin/survey/survey.cgi'}), dict(name='a', attrs={'href':'/cgi-bin/survey/survey.cgi'}),
dict(name='a', attrs={'href':'/cgi-bin/board/UltraBoard.pl'}), dict(name='a', attrs={'href':'/cgi-bin/board/UltraBoard.pl'}),
dict(name='iframe'), dict(name='iframe'),
dict(name='form'), dict(name='form'),
dict(name='span', attrs={'class':'hidePrint'}), dict(name='span', attrs={'class':'hidePrint'}),
dict(id='ArticleWidgets'),
dict(id='headerLBox'), dict(id='headerLBox'),
dict(id='nointelliTXT'), dict(id='nointelliTXT'),
dict(id='rechteSpalte'), dict(id='rechteSpalte'),
@ -69,27 +72,18 @@ class Sueddeutsche(BasicNewsRecipe):
dict(id='nnav-headerteaser'), dict(id='nnav-headerteaser'),
dict(id='nnav-head'), dict(id='nnav-head'),
dict(id='nnav-top'), dict(id='nnav-top'),
dict(id='nnav-logodiv'),
dict(id='nnav-logo'),
dict(id='nnav-oly'),
dict(id='readcomment')] dict(id='readcomment')]
feeds = [ (u'Linuxdevices', u'http://www.linuxdevices.com/backend/headlines.rss') ] feeds = [ (u'Linuxdevices', u'http://www.linuxfordevices.com/rss.xml') ]
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(re.compile('^a')):
item.extract()
match = re.compile(r"^Related") match = re.compile(r"^Related")
for item in soup.findAll('b', text=match): for item in soup.findAll('b', text=match):
item.extract() item.extract()
for item in soup.findAll(re.compile('^li')):
item.extract()
for item in soup.findAll(re.compile('^ul')): for item in soup.findAll(re.compile('^ul')):
item.extract() item.extract()
for item in soup.find(re.compile('^br')):
item.extract()
for item in soup.findAll('br', limit=10): for item in soup.findAll('br', limit=10):
item.extract() item.extract()
return soup return soup
@ -101,4 +95,3 @@ class Sueddeutsche(BasicNewsRecipe):
return soup return soup