mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
91 lines
4.4 KiB
Plaintext
91 lines
4.4 KiB
Plaintext
__license__ = 'GPL v3'
|
|
__copyright__ = '2010-2012, Darko Miletic <darko.miletic at gmail.com>'
|
|
'''
|
|
www.haaretz.com
|
|
'''
|
|
|
|
import re
|
|
import urllib
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
class Haaretz_en(BasicNewsRecipe):
    """Recipe for the English edition of Haaretz (www.haaretz.com).

    Articles are discovered through the RSS feeds listed in ``feeds``.
    Each feed link is resolved to its final URL and then rewritten to the
    site's print-page URL before download. When a username and password
    are supplied, ``get_browser`` posts them to the Haaretz single sign-on
    endpoint first.
    """

    # --- Publication metadata -------------------------------------------
    title = 'Haaretz'
    __author__ = 'Darko Miletic'
    description = "Haaretz.com is the world's leading English-language Website for real-time news and analysis of Israel and the Middle East."
    publisher = 'Haaretz'
    category = "news, Haaretz, Israel news, Israel newspapers, Israel business news, Israel financial news, Israeli news,Israeli newspaper, Israeli newspapers, news from Israel, news in Israel, news Israel, news on Israel, newspaper Israel, Israel sports news, Israel diplomacy news"

    # --- Fetch / conversion settings consumed by BasicNewsRecipe ---------
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en_IL'
    needs_subscription = True
    remove_empty_feeds = True
    publication_type = 'newspaper'

    # Site root; also opened once in get_browser() before signing in.
    PREFIX = 'http://www.haaretz.com'
    masthead_url = PREFIX + '/images/logos/HaaretzLogo.gif'
    extra_css = """
        body{font-family: Verdana,Arial,Helvetica,sans-serif }
        h1, .articleBody {font-family: Georgia, serif}
        .authorBar {font-size: small}
    """

    # Collapse anything between </body> and </html> into a clean document
    # tail.
    preprocess_regexps = [(re.compile(r'</body>.*?</html>', re.DOTALL|re.IGNORECASE),lambda match: '</body></html>')]

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # --- HTML clean-up rules ----------------------------------------------
    keep_only_tags = [dict(attrs={'id':'threecolumns'})]
    remove_attributes = ['width','height']
    remove_tags = [
        dict(name=['iframe','link','object','embed']),
        dict(name='div',attrs={'class':'rightcol'}),
    ]

    # (section title, feed URL) pairs.
    feeds = [
        (u'Headlines', 'http://feeds.feedburner.com/haaretz/LBao'),
        (u'Opinion', 'http://feeds.feedburner.com/haaretz/opinions'),
        (u'Defence and diplomacy', 'http://feeds.feedburner.com/DefenseAndDiplomacy'),
        (u'National', 'http://feeds.feedburner.com/haaretz/National'),
        (u'International', 'http://feeds.feedburner.com/InternationalRss'),
        (u'Jewish World', 'http://feeds.feedburner.com/JewishWorldRss'),
        (u'Business', 'http://feeds.feedburner.com/BusinessPrintRss'),
        (u'Real Estate', 'http://feeds.feedburner.com/RealEstatePrintRss'),
        (u'Features', 'http://feeds.feedburner.com/FeaturesPrintRss'),
        (u'Arts & Leisure', 'http://feeds.feedburner.com/ArtsAndLeisureRss'),
        (u'Books', 'http://www.haaretz.com/cmlink/books-rss-1.264947?localLinksEnabled=false'),
        (u'Food & Wine', 'http://feeds.feedburner.com/FoodAndWinePrintRss'),
        (u'Sports', 'http://feeds.feedburner.com/haaretz/Sport'),
    ]

    def get_browser(self):
        """Return the recipe browser, signed in when credentials are set.

        The site root is always opened first. If both ``self.username`` and
        ``self.password`` are not ``None``, the credentials are URL-encoded
        together with the fixed login form fields and posted to the Haaretz
        SSO sign-in URL.
        """
        # ``urlencode`` moved to ``urllib.parse`` in Python 3; fall back to
        # the Python 2 location so the recipe runs on either interpreter.
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode

        br = BasicNewsRecipe.get_browser(self)
        br.open(self.PREFIX)
        if self.username is not None and self.password is not None:
            data = urlencode({
                'cb': 'parseEngReply',
                'newsso': 'true',
                'fromlogin': 'true',
                'layer': 'eng_login',
                'userName': self.username,
                'password': self.password,
            })
            br.open('https://sso.haaretz.com/sso/sso/signIn', data)
        return br

    def get_article_url(self, article):
        """Return the final URL for ``article`` after opening its feed link.

        The link reported by the base class is opened with
        ``open_novisit`` and the response's ``geturl()`` is returned --
        presumably to resolve the feed host's redirect to the real
        haaretz.com address (NOTE(review): confirm against live feeds).
        """
        url = BasicNewsRecipe.get_article_url(self, article)
        return self.browser.open_novisit(url).geturl()

    def print_version(self, url):
        """Map an article URL to its print-page URL.

        Only the text after the last ``/`` of ``url`` is kept and appended
        to the fixed print-page prefix.
        """
        article = url.rpartition('/')[2]
        return 'http://www.haaretz.com/misc/article-print-page/' + article

    def preprocess_raw_html(self, raw, url):
        """Replace everything before ``</head>`` with a minimal ``<html><head>``.

        If ``raw`` contains no ``</head>`` it is returned unchanged.
        Slicing from ``find()``'s ``-1`` result would keep only the last
        character and discard the whole article.
        """
        head_end = raw.find('</head>')
        if head_end < 0:
            return raw
        return '<html><head>' + raw[head_end:]