calibre/recipes/haaretz_en.recipe

97 lines
4.3 KiB
Plaintext

__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.haaretz.com
'''
import re
from calibre import strftime
from time import gmtime
from calibre.web.feeds.news import BasicNewsRecipe
class HaaretzPrint_en(BasicNewsRecipe):
title = 'Haaretz - print edition'
__author__ = 'Darko Miletic'
description = "Haaretz.com is the world's leading English-language Website for real-time news and analysis of Israel and the Middle East."
publisher = 'Haaretz'
category = "news, Haaretz, Israel news, Israel newspapers, Israel business news, Israel financial news, Israeli news,Israeli newspaper, Israeli newspapers, news from Israel, news in Israel, news Israel, news on Israel, newspaper Israel, Israel sports news, Israel diplomacy news"
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en_IL'
publication_type = 'newspaper'
PREFIX = 'http://www.haaretz.com'
masthead_url = PREFIX + '/images/logos/logoGrey.gif'
extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } '
preprocess_regexps = [(re.compile(r'</body>.*?</html>', re.DOTALL|re.IGNORECASE),lambda match: '</body></html>')]
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher': publisher
, 'language' : language
}
keep_only_tags = [dict(attrs={'id':'threecolumns'})]
remove_attributes = ['width','height']
remove_tags = [
dict(name=['iframe','link','object','embed'])
,dict(name='div',attrs={'class':'rightcol'})
]
feeds = [
(u'News' , PREFIX + u'/print-edition/news' )
,(u'Opinion' , PREFIX + u'/print-edition/opinion' )
,(u'Business' , PREFIX + u'/print-edition/business' )
,(u'Real estate' , PREFIX + u'/print-edition/real-estate' )
,(u'Sports' , PREFIX + u'/print-edition/sports' )
,(u'Travel' , PREFIX + u'/print-edition/travel' )
,(u'Books' , PREFIX + u'/print-edition/books' )
,(u'Food & Wine' , PREFIX + u'/print-edition/food-wine' )
,(u'Arts & Leisure', PREFIX + u'/print-edition/arts-leisure' )
,(u'Features' , PREFIX + u'/print-edition/features' )
]
def print_version(self, url):
article = url.rpartition('/')[2]
return 'http://www.haaretz.com/misc/article-print-page/' + article
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll(attrs={'class':'text'}):
sp = item.find('span',attrs={'class':'h3 font-weight-normal'})
desc = item.find('p')
description = ''
if sp:
if desc:
description = self.tag_to_string(desc)
link = sp.a
url = self.PREFIX + link['href']
title = self.tag_to_string(link)
times = strftime('%a, %d %b %Y %H:%M:%S +0000',gmtime())
articles.append({
'title' :title
,'date' :times
,'url' :url
,'description':description
})
totalfeeds.append((feedtitle, articles))
return totalfeeds
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup