__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal '
''' Fetch Linuxdevices. '''

import re

from calibre.web.feeds.news import BasicNewsRecipe


class LinuxDevices(BasicNewsRecipe):
    """Recipe that fetches the Linuxdevices RSS feed and strips page chrome.

    The class is purely declarative apart from two HTML hooks:
    ``preprocess_html`` drops a few leftover elements from the parsed page
    and ``postprocess_html`` renames table markup to ``div`` elements.
    """

    title = u'Linuxdevices'
    description = 'News about Linux driven Hardware'
    __author__ = 'Oliver Niesner'
    use_embedded_content = False
    timefmt = ' [%a %d %b %Y]'
    max_articles_per_feed = 50
    no_stylesheets = True
    language = 'en'
    remove_javascript = True
    conversion_options = {'linearize_tables': True}
    encoding = 'latin1'

    remove_tags_after = [{'id': 'intelliTxt'}]
    filter_regexps = [r'ad\.doubleclick\.net']

    # Element selectors handed to the base recipe for removal. The order of
    # the original list is preserved.
    remove_tags = [
        # Banners, footers and navigation containers.
        {'name': 'div', 'attrs': {'class': 'bannerSuperBanner'}},
        {'name': 'div', 'attrs': {'class': 'bannerSky'}},
        {'name': 'div', 'attrs': {'border': '0'}},
        {'name': 'div', 'attrs': {'class': 'footerLinks'}},
        {'name': 'div', 'attrs': {'class': 'seitenanfang'}},
        {'name': 'td', 'attrs': {'class': 'mar5'}},
        # Layout tables.
        {'name': 'table', 'attrs': {'class': 'pageAktiv'}},
        {'name': 'table', 'attrs': {'class': 'xartable'}},
        {'name': 'table', 'attrs': {'class': 'wpnavi'}},
        {'name': 'table', 'attrs': {'class': 'bgcontent absatz'}},
        {'name': 'table', 'attrs': {'class': 'footer'}},
        {'name': 'table', 'attrs': {'class': 'artikelBox'}},
        {'name': 'table', 'attrs': {'class': 'kommentare'}},
        {'name': 'table', 'attrs': {'class': 'pageBoxBot'}},
        # NOTE(review): this selects a <table> carrying an attribute literally
        # named "td"; possibly a <td height="3"> was intended. Kept verbatim.
        {'name': 'table', 'attrs': {'td': 'height="3"'}},
        {'name': 'table', 'attrs': {'class': 'contentpaneopen'}},
        # Table cells.
        {'name': 'td', 'attrs': {'nowrap': 'nowrap'}},
        {'name': 'td', 'attrs': {'align': 'left'}},
        {'name': 'td', 'attrs': {'height': '5'}},
        {'name': 'td', 'attrs': {'class': 'ArticleWidgetsHeadline'}},
        {'name': 'div', 'attrs': {'class': 'artikelBox navigatorBox'}},
        {'name': 'div', 'attrs': {'class': 'similar-article-box'}},
        {'name': 'div', 'attrs': {'class': 'videoBigHack'}},
        {'name': 'td', 'attrs': {'class': 'artikelDruckenRight'}},
        # NOTE(review): the class value is the text 'width="200"'; possibly a
        # width attribute was intended. Kept verbatim.
        {'name': 'td', 'attrs': {'class': 'width="200"'}},
        {'name': 'span', 'attrs': {'class': 'content_rating'}},
        # Links selected by their exact href value.
        {'name': 'a', 'attrs': {'href': 'http://www.addthis.com/bookmark.php'}},
        {'name': 'a', 'attrs': {'href': '/news'}},
        {'name': 'a', 'attrs': {'href': '/cgi-bin/survey/survey.cgi'}},
        {'name': 'a', 'attrs': {'href': '/cgi-bin/board/UltraBoard.pl'}},
        # Whole element types.
        {'name': 'iframe'},
        {'name': 'form'},
        {'name': 'span', 'attrs': {'class': 'hidePrint'}},
        # Elements selected by id only.
        {'id': 'ArticleWidgets'},
        {'id': 'headerLBox'},
        {'id': 'nointelliTXT'},
        {'id': 'rechteSpalte'},
        {'id': 'newsticker-list-small'},
        {'id': 'ntop5'},
        {'id': 'ntop5send'},
        {'id': 'ntop5commented'},
        {'id': 'nnav-bgheader'},
        {'id': 'nnav-headerteaser'},
        {'id': 'nnav-head'},
        {'id': 'nnav-top'},
        {'id': 'readcomment'},
    ]

    feeds = [
        (u'Linuxdevices', u'http://www.linuxfordevices.com/rss.xml'),
    ]

    def preprocess_html(self, soup):
        """Extract "Related" bold matches, ``ul`` elements and up to ten ``br``.

        The three lookups run strictly one after another, each on the tree
        as left by the previous extraction, so the ``br`` limit is counted
        after the ``ul`` elements are already gone.

        Returns the same ``soup`` object, modified in place.
        """
        related_pattern = re.compile(r"^Related")

        # Thunks keep every query lazy: it is evaluated only once the
        # preceding removals have been applied to the tree.
        lookups = (
            lambda: soup.findAll('b', text=related_pattern),
            lambda: soup.findAll(re.compile('^ul')),
            lambda: soup.findAll('br', limit=10),
        )
        for lookup in lookups:
            for node in lookup():
                node.extract()
        return soup

    def postprocess_html(self, soup, first):
        """Rename every ``table``, ``tr`` and ``td`` element to ``div``.

        ``first`` is accepted but not used here. Returns the same ``soup``
        object, modified in place.
        """
        table_elements = soup.findAll(name=['table', 'tr', 'td'])
        for element in table_elements:
            element.name = 'div'
        return soup