Fix #6095 (Haaretz recipe broken)

This commit is contained in:
Kovid Goyal 2010-07-05 08:06:17 -06:00
parent 2d18d57c5d
commit 75d1de87b5
3 changed files with 69 additions and 30 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB

View File

@ -1,56 +1,95 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
haaretz.com
www.haaretz.com
'''
import re
from calibre import strftime
from time import gmtime
from calibre.web.feeds.news import BasicNewsRecipe
class Haaretz_en(BasicNewsRecipe):
title = 'Haaretz in English'
class HaaretzPrint_en(BasicNewsRecipe):
title = 'Haaretz - print edition'
__author__ = 'Darko Miletic'
description = 'Haaretz.com, the online edition of Haaretz Newspaper in Israel, and analysis from Israel and the Middle East. Haaretz.com provides extensive and in-depth coverage of Israel, the Jewish World and the Middle East, including defense, diplomacy, the Arab-Israeli conflict, the peace process, Israeli politics, Jerusalem affairs, international relations, Iran, Iraq, Syria, Lebanon, the Palestinian Authority, the West Bank and the Gaza Strip, the Israeli business world and Jewish life in Israel and the Diaspora. '
publisher = 'haaretz.com'
category = 'news, politics, Israel'
description = "Haaretz.com is the world's leading English-language Website for real-time news and analysis of Israel and the Middle East."
publisher = 'Haaretz'
category = "news, Haaretz, Israel news, Israel newspapers, Israel business news, Israel financial news, Israeli news,Israeli newspaper, Israeli newspapers, news from Israel, news in Israel, news Israel, news on Israel, newspaper Israel, Israel sports news, Israel diplomacy news"
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'cp1252'
encoding = 'utf8'
use_embedded_content = False
language = 'en_IL'
publication_type = 'newspaper'
remove_empty_feeds = True
masthead_url = 'http://www.haaretz.com/images/logos/logoGrey.gif'
PREFIX = 'http://www.haaretz.com'
masthead_url = PREFIX + '/images/logos/logoGrey.gif'
extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } '
preprocess_regexps = [(re.compile(r'</body>.*?</html>', re.DOTALL|re.IGNORECASE),lambda match: '</body></html>')]
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
'comment' : description
, 'tags' : category
, 'publisher': publisher
, 'language' : language
}
remove_tags = [dict(name='div', attrs={'class':['rightcol']}),dict(name='table')]
remove_tags_before = dict(name='h1')
remove_tags_after = dict(attrs={'id':'innerArticle'})
keep_only_tags = [dict(attrs={'id':'content'})]
keep_only_tags = [dict(attrs={'id':'threecolumns'})]
remove_attributes = ['width','height']
remove_tags = [
dict(name=['iframe','link','object','embed'])
,dict(name='div',attrs={'class':'rightcol'})
]
feeds = [
(u'Opinion' , u'http://www.haaretz.com/cmlink/opinion-rss-1.209234?localLinksEnabled=false' )
,(u'Defense and diplomacy' , u'http://www.haaretz.com/cmlink/defense-and-diplomacy-rss-1.208894?localLinksEnabled=false')
,(u'National' , u'http://www.haaretz.com/cmlink/national-rss-1.208896?localLinksEnabled=false' )
,(u'International' , u'http://www.haaretz.com/cmlink/international-rss-1.208898?localLinksEnabled=false' )
,(u'Jewish World' , u'http://www.haaretz.com/cmlink/jewish-world-rss-1.209085?localLinksEnabled=false' )
,(u'Business' , u'http://www.haaretz.com/cmlink/business-print-rss-1.264904?localLinksEnabled=false' )
,(u'Real Estate' , u'http://www.haaretz.com/cmlink/real-estate-print-rss-1.264977?localLinksEnabled=false' )
,(u'Features' , u'http://www.haaretz.com/cmlink/features-print-rss-1.264912?localLinksEnabled=false' )
,(u'Arts and leisure' , u'http://www.haaretz.com/cmlink/arts-and-leisure-rss-1.286090?localLinksEnabled=false' )
,(u'Books' , u'http://www.haaretz.com/cmlink/books-rss-1.264947?localLinksEnabled=false' )
,(u'Food and Wine' , u'http://www.haaretz.com/cmlink/food-and-wine-print-rss-1.265034?localLinksEnabled=false' )
,(u'Sports' , u'http://www.haaretz.com/cmlink/sports-rss-1.286092?localLinksEnabled=false' )
(u'News' , PREFIX + u'/print-edition/news' )
,(u'Opinion' , PREFIX + u'/print-edition/opinion' )
,(u'Business' , PREFIX + u'/print-edition/business' )
,(u'Real estate' , PREFIX + u'/print-edition/real-estate' )
,(u'Sports' , PREFIX + u'/print-edition/sports' )
,(u'Travel' , PREFIX + u'/print-edition/travel' )
,(u'Books' , PREFIX + u'/print-edition/books' )
,(u'Food & Wine' , PREFIX + u'/print-edition/food-wine' )
,(u'Arts & Leisure', PREFIX + u'/print-edition/arts-leisure' )
,(u'Features' , PREFIX + u'/print-edition/features' )
]
def print_version(self, url):
article = url.rpartition('/')[2]
return 'http://www.haaretz.com/misc/article-print-page/' + article
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll(attrs={'class':'text'}):
sp = item.find('span',attrs={'class':'h3 font-weight-normal'})
desc = item.find('p')
description = ''
if sp:
if desc:
description = self.tag_to_string(desc)
link = sp.a
url = self.PREFIX + link['href']
title = self.tag_to_string(link)
times = strftime('%a, %d %b %Y %H:%M:%S +0000',gmtime())
articles.append({
'title' :title
,'date' :times
,'url' :url
,'description':description
})
totalfeeds.append((feedtitle, articles))
return totalfeeds
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']

View File

@ -22,7 +22,7 @@ E = ElementMaker(namespace=NS, nsmap={None:NS})
def iterate_over_builtin_recipe_files():
exclude = ['craigslist', 'iht', 'outlook_india', 'toronto_sun',
'indian_express', 'india_today']
'indian_express', 'india_today', 'livemint']
d = os.path.dirname
base = os.path.join(d(d(d(d(d(d(os.path.abspath(__file__))))))), 'resources', 'recipes')
for x in os.walk(base):