__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic '
'''
www.haaretz.com
'''

import re
from time import gmtime

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class HaaretzPrint_en(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the Haaretz print edition.

    The section index pages listed in ``feeds`` are scraped by
    ``parse_index`` (they are fetched as HTML via ``index_to_soup``), and
    each article is downloaded through the URL produced by
    ``print_version``.
    """

    title = 'Haaretz - print edition'
    __author__ = 'Darko Miletic'
    description = "Haaretz.com is the world's leading English-language Website for real-time news and analysis of Israel and the Middle East."
    publisher = 'Haaretz'
    category = "news, Haaretz, Israel news, Israel newspapers, Israel business news, Israel financial news, Israeli news,Israeli newspaper, Israeli newspapers, news from Israel, news in Israel, news Israel, news on Israel, newspaper Israel, Israel sports news, Israel diplomacy news"
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en_IL'
    publication_type = 'newspaper'
    # Site root; section paths, article hrefs and the print-page path are
    # all appended to it.
    PREFIX = 'http://www.haaretz.com'
    masthead_url = PREFIX + '/images/logos/logoGrey.gif'
    extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } '

    # NOTE(review): as written, the non-greedy pattern '.*?' can only match
    # the empty string, so replacing matches with '' removes nothing. The
    # intended delimiters look like they were lost -- confirm against the
    # upstream recipe before relying on this entry.
    preprocess_regexps = [
        (re.compile(r'.*?', re.DOTALL | re.IGNORECASE), lambda match: ''),
    ]

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    keep_only_tags = [dict(attrs={'id': 'threecolumns'})]
    remove_attributes = ['width', 'height']
    remove_tags = [
        dict(name=['iframe', 'link', 'object', 'embed']),
        dict(name='div', attrs={'class': 'rightcol'}),
    ]

    # (section title, section index page URL) pairs consumed by parse_index.
    feeds = [
        (u'News', PREFIX + u'/print-edition/news'),
        (u'Opinion', PREFIX + u'/print-edition/opinion'),
        (u'Business', PREFIX + u'/print-edition/business'),
        (u'Real estate', PREFIX + u'/print-edition/real-estate'),
        (u'Sports', PREFIX + u'/print-edition/sports'),
        (u'Travel', PREFIX + u'/print-edition/travel'),
        (u'Books', PREFIX + u'/print-edition/books'),
        (u'Food & Wine', PREFIX + u'/print-edition/food-wine'),
        (u'Arts & Leisure', PREFIX + u'/print-edition/arts-leisure'),
        (u'Features', PREFIX + u'/print-edition/features'),
    ]

    def print_version(self, url):
        """Return the print-page URL for the article at ``url``.

        The article identifier is the last path segment of ``url``.
        Trailing slashes are stripped first so that a URL ending in '/'
        does not yield an empty identifier.
        """
        article = url.rstrip('/').rpartition('/')[2]
        return self.PREFIX + '/misc/article-print-page/' + article

    def parse_index(self):
        """Scrape every section index page and return the article lists.

        Returns a list of ``(feed title, articles)`` tuples, where each
        article is a dict with the keys ``title``, ``date``, ``url`` and
        ``description``. Index entries without a headline span, without a
        link inside it, or whose link has no ``href`` are skipped.
        """
        # No date is read from the page; every article is stamped with the
        # current UTC time, computed once for the whole run.
        fetched_at = strftime('%a, %d %b %Y %H:%M:%S +0000', gmtime())

        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            self.report_progress(
                0, _('Fetching feed') + ' %s...' % (feedtitle or feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for item in soup.findAll(attrs={'class': 'text'}):
                headline = item.find(
                    'span', attrs={'class': 'h3 font-weight-normal'})
                if not headline:
                    continue

                # A headline span without an anchor (or an anchor without
                # an href) cannot be turned into an article URL; skip it
                # rather than abort the whole download.
                link = headline.a
                href = link.get('href') if link is not None else None
                if not href:
                    continue

                teaser = item.find('p')
                summary = self.tag_to_string(teaser) if teaser else ''
                articles.append({
                    'title': self.tag_to_string(link),
                    'date': fetched_at,
                    'url': self.PREFIX + href,
                    'description': summary,
                })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds

    def preprocess_html(self, soup):
        """Remove every inline ``style`` attribute from ``soup`` and return it."""
        for item in soup.findAll(style=True):
            del item['style']
        return soup