diff --git a/recipes/haaretz_en.recipe b/recipes/haaretz_en.recipe
index 4404624aff..ade32ae5ea 100644
--- a/recipes/haaretz_en.recipe
+++ b/recipes/haaretz_en.recipe
@@ -1,16 +1,15 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010-2012, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.haaretz.com
 '''
 
 import re
-from calibre import strftime
-from time import gmtime
+import urllib
 from calibre.web.feeds.news import BasicNewsRecipe
 
-class HaaretzPrint_en(BasicNewsRecipe):
-    title = 'Haaretz - print edition'
+class Haaretz_en(BasicNewsRecipe):
+    title = 'Haaretz'
     __author__ = 'Darko Miletic'
     description = "Haaretz.com is the world's leading English-language Website for real-time news and analysis of Israel and the Middle East."
     publisher = 'Haaretz'
@@ -21,10 +20,16 @@ class HaaretzPrint_en(BasicNewsRecipe):
     encoding = 'utf8'
     use_embedded_content = False
     language = 'en_IL'
+    needs_subscription = True
+    remove_empty_feeds = True
     publication_type = 'newspaper'
     PREFIX = 'http://www.haaretz.com'
-    masthead_url = PREFIX + '/images/logos/logoGrey.gif'
-    extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } '
+    masthead_url = PREFIX + '/images/logos/HaaretzLogo.gif'
+    extra_css = """
+                body{font-family: Verdana,Arial,Helvetica,sans-serif }
+                h1, .articleBody {font-family: Georgia, serif}
+                .authorBar {font-size: small}
+                """
 
     preprocess_regexps = [(re.compile(r'</body>.*?</html>', re.DOTALL|re.IGNORECASE),lambda match: '</body></html>')]
 
@@ -44,53 +49,42 @@ class HaaretzPrint_en(BasicNewsRecipe):
 
     feeds = [
-              (u'News'          , PREFIX + u'/print-edition/news'         )
-             ,(u'Opinion'       , PREFIX + u'/print-edition/opinion'      )
-             ,(u'Business'      , PREFIX + u'/print-edition/business'     )
-             ,(u'Real estate'   , PREFIX + u'/print-edition/real-estate'  )
-             ,(u'Sports'        , PREFIX + u'/print-edition/sports'       )
-             ,(u'Travel'        , PREFIX + u'/print-edition/travel'       )
-             ,(u'Books'         , PREFIX + u'/print-edition/books'        )
-             ,(u'Food & Wine'   , PREFIX + u'/print-edition/food-wine'    )
-             ,(u'Arts & Leisure', PREFIX + u'/print-edition/arts-leisure' )
-             ,(u'Features'      , PREFIX + u'/print-edition/features'     )
+              (u'Headlines'             , 'http://feeds.feedburner.com/haaretz/LBao'                               )
+             ,(u'Opinion'               , 'http://feeds.feedburner.com/haaretz/opinions'                           )
+             ,(u'Defence and diplomacy' , 'http://feeds.feedburner.com/DefenseAndDiplomacy'                        )
+             ,(u'National'              , 'http://feeds.feedburner.com/haaretz/National'                           )
+             ,(u'International'         , 'http://feeds.feedburner.com/InternationalRss'                           )
+             ,(u'Jewish World'          , 'http://feeds.feedburner.com/JewishWorldRss'                             )
+             ,(u'Business'              , 'http://feeds.feedburner.com/BusinessPrintRss'                           )
+             ,(u'Real Estate'           , 'http://feeds.feedburner.com/RealEstatePrintRss'                         )
+             ,(u'Features'              , 'http://feeds.feedburner.com/FeaturesPrintRss'                           )
+             ,(u'Arts & Leisure'        , 'http://feeds.feedburner.com/ArtsAndLeisureRss'                          )
+             ,(u'Books'                 , 'http://www.haaretz.com/cmlink/books-rss-1.264947?localLinksEnabled=false')
+             ,(u'Food & Wine'           , 'http://feeds.feedburner.com/FoodAndWinePrintRss'                        )
+             ,(u'Sports'                , 'http://feeds.feedburner.com/haaretz/Sport'                              )
             ]
 
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        br.open(self.PREFIX)
+        if self.username is not None and self.password is not None:
+            data = urllib.urlencode({ 'cb':'parseEngReply'
+                                     ,'newsso':'true'
+                                     ,'fromlogin':'true'
+                                     ,'layer':'eng_login'
+                                     ,'userName':self.username
+                                     ,'password':self.password
+                                   })
+            br.open('https://sso.haaretz.com/sso/sso/signIn',data)
+        return br
+
+    def get_article_url(self, article):
+        url = BasicNewsRecipe.get_article_url(self, article)
+        return self.browser.open_novisit(url).geturl()
 
     def print_version(self, url):
         article = url.rpartition('/')[2]
         return 'http://www.haaretz.com/misc/article-print-page/' + article
 
-    def parse_index(self):
-        totalfeeds = []
-        lfeeds = self.get_feeds()
-        for feedobj in lfeeds:
-            feedtitle, feedurl = feedobj
-            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
-            articles = []
-            soup = self.index_to_soup(feedurl)
-            for item in soup.findAll(attrs={'class':'text'}):
-                sp = item.find('span',attrs={'class':'h3 font-weight-normal'})
-                desc = item.find('p')
-                description = ''
-                if sp:
-                    if desc:
-                        description = self.tag_to_string(desc)
-                    link = sp.a
-                    url = self.PREFIX + link['href']
-                    title = self.tag_to_string(link)
-                    times = strftime('%a, %d %b %Y %H:%M:%S +0000',gmtime())
-                    articles.append({
-                                      'title'      :title
-                                     ,'date'       :times
-                                     ,'url'        :url
-                                     ,'description':description
-                                    })
-            totalfeeds.append((feedtitle, articles))
-        return totalfeeds
-
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+    def preprocess_raw_html(self, raw, url):
+        return '<html><head>'+raw[raw.find('</head>'):]