mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #6095 (Haaretz recipe broken)
This commit is contained in:
parent
2d18d57c5d
commit
75d1de87b5
BIN
resources/images/news/haaretz.png
Normal file
BIN
resources/images/news/haaretz.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.2 KiB |
@ -1,56 +1,95 @@
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
'''
|
'''
|
||||||
haaretz.com
|
www.haaretz.com
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
import re
|
||||||
|
from calibre import strftime
|
||||||
|
from time import gmtime
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class Haaretz_en(BasicNewsRecipe):
|
class HaaretzPrint_en(BasicNewsRecipe):
|
||||||
title = 'Haaretz in English'
|
title = 'Haaretz - print edition'
|
||||||
__author__ = 'Darko Miletic'
|
__author__ = 'Darko Miletic'
|
||||||
description = 'Haaretz.com, the online edition of Haaretz Newspaper in Israel, and analysis from Israel and the Middle East. Haaretz.com provides extensive and in-depth coverage of Israel, the Jewish World and the Middle East, including defense, diplomacy, the Arab-Israeli conflict, the peace process, Israeli politics, Jerusalem affairs, international relations, Iran, Iraq, Syria, Lebanon, the Palestinian Authority, the West Bank and the Gaza Strip, the Israeli business world and Jewish life in Israel and the Diaspora. '
|
description = "Haaretz.com is the world's leading English-language Website for real-time news and analysis of Israel and the Middle East."
|
||||||
publisher = 'haaretz.com'
|
publisher = 'Haaretz'
|
||||||
category = 'news, politics, Israel'
|
category = "news, Haaretz, Israel news, Israel newspapers, Israel business news, Israel financial news, Israeli news,Israeli newspaper, Israeli newspapers, news from Israel, news in Israel, news Israel, news on Israel, newspaper Israel, Israel sports news, Israel diplomacy news"
|
||||||
oldest_article = 2
|
oldest_article = 2
|
||||||
max_articles_per_feed = 200
|
max_articles_per_feed = 200
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
encoding = 'cp1252'
|
encoding = 'utf8'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
language = 'en_IL'
|
language = 'en_IL'
|
||||||
publication_type = 'newspaper'
|
publication_type = 'newspaper'
|
||||||
remove_empty_feeds = True
|
PREFIX = 'http://www.haaretz.com'
|
||||||
masthead_url = 'http://www.haaretz.com/images/logos/logoGrey.gif'
|
masthead_url = PREFIX + '/images/logos/logoGrey.gif'
|
||||||
extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } '
|
extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } '
|
||||||
|
|
||||||
|
preprocess_regexps = [(re.compile(r'</body>.*?</html>', re.DOTALL|re.IGNORECASE),lambda match: '</body></html>')]
|
||||||
|
|
||||||
conversion_options = {
|
conversion_options = {
|
||||||
'comment' : description
|
'comment' : description
|
||||||
, 'tags' : category
|
, 'tags' : category
|
||||||
, 'publisher' : publisher
|
, 'publisher': publisher
|
||||||
, 'language' : language
|
, 'language' : language
|
||||||
}
|
}
|
||||||
|
|
||||||
remove_tags = [dict(name='div', attrs={'class':['rightcol']}),dict(name='table')]
|
keep_only_tags = [dict(attrs={'id':'threecolumns'})]
|
||||||
remove_tags_before = dict(name='h1')
|
remove_attributes = ['width','height']
|
||||||
remove_tags_after = dict(attrs={'id':'innerArticle'})
|
remove_tags = [
|
||||||
keep_only_tags = [dict(attrs={'id':'content'})]
|
dict(name=['iframe','link','object','embed'])
|
||||||
|
,dict(name='div',attrs={'class':'rightcol'})
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
(u'Opinion' , u'http://www.haaretz.com/cmlink/opinion-rss-1.209234?localLinksEnabled=false' )
|
(u'News' , PREFIX + u'/print-edition/news' )
|
||||||
,(u'Defense and diplomacy' , u'http://www.haaretz.com/cmlink/defense-and-diplomacy-rss-1.208894?localLinksEnabled=false')
|
,(u'Opinion' , PREFIX + u'/print-edition/opinion' )
|
||||||
,(u'National' , u'http://www.haaretz.com/cmlink/national-rss-1.208896?localLinksEnabled=false' )
|
,(u'Business' , PREFIX + u'/print-edition/business' )
|
||||||
,(u'International' , u'http://www.haaretz.com/cmlink/international-rss-1.208898?localLinksEnabled=false' )
|
,(u'Real estate' , PREFIX + u'/print-edition/real-estate' )
|
||||||
,(u'Jewish World' , u'http://www.haaretz.com/cmlink/jewish-world-rss-1.209085?localLinksEnabled=false' )
|
,(u'Sports' , PREFIX + u'/print-edition/sports' )
|
||||||
,(u'Business' , u'http://www.haaretz.com/cmlink/business-print-rss-1.264904?localLinksEnabled=false' )
|
,(u'Travel' , PREFIX + u'/print-edition/travel' )
|
||||||
,(u'Real Estate' , u'http://www.haaretz.com/cmlink/real-estate-print-rss-1.264977?localLinksEnabled=false' )
|
,(u'Books' , PREFIX + u'/print-edition/books' )
|
||||||
,(u'Features' , u'http://www.haaretz.com/cmlink/features-print-rss-1.264912?localLinksEnabled=false' )
|
,(u'Food & Wine' , PREFIX + u'/print-edition/food-wine' )
|
||||||
,(u'Arts and leisure' , u'http://www.haaretz.com/cmlink/arts-and-leisure-rss-1.286090?localLinksEnabled=false' )
|
,(u'Arts & Leisure', PREFIX + u'/print-edition/arts-leisure' )
|
||||||
,(u'Books' , u'http://www.haaretz.com/cmlink/books-rss-1.264947?localLinksEnabled=false' )
|
,(u'Features' , PREFIX + u'/print-edition/features' )
|
||||||
,(u'Food and Wine' , u'http://www.haaretz.com/cmlink/food-and-wine-print-rss-1.265034?localLinksEnabled=false' )
|
|
||||||
,(u'Sports' , u'http://www.haaretz.com/cmlink/sports-rss-1.286092?localLinksEnabled=false' )
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def print_version(self, url):
    """Return the printer-friendly address for the article at *url*.

    Only the text following the last ``/`` in *url* is kept; it is appended
    to the site's print-page path. A *url* without any ``/`` is used whole.
    """
    print_base = 'http://www.haaretz.com/misc/article-print-page/'
    # Last path segment of the article address.
    slug = url.rsplit('/', 1)[-1]
    return print_base + slug
|
||||||
|
|
||||||
|
def parse_index(self):
    """Build the article index by scraping each section page.

    Every ``(title, url)`` pair returned by ``self.get_feeds()`` is fetched
    with ``self.index_to_soup``. Within the page, each element carrying the
    class ``text`` is examined: the headline link is taken from its
    ``span`` with class ``h3 font-weight-normal`` and the summary, when
    present, from its first ``p``.

    Returns a list of ``(feed_title, articles)`` tuples. Each article is a
    dict with the keys ``title``, ``date``, ``url`` and ``description``.
    Items without a usable headline link are skipped.
    """
    totalfeeds = []
    # No per-article date is read from the page, so every entry is stamped
    # with the time of the fetch. Computed once: it is loop-invariant.
    fetched_at = strftime('%a, %d %b %Y %H:%M:%S +0000', gmtime())
    for feedtitle, feedurl in self.get_feeds():
        self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
        articles = []
        soup = self.index_to_soup(feedurl)
        for item in soup.findAll(attrs={'class':'text'}):
            headline = item.find('span', attrs={'class':'h3 font-weight-normal'})
            if headline is None:
                continue
            link = headline.a
            href = link.get('href') if link is not None else None
            if not href:
                # A headline span with no anchor (or an anchor with no
                # href) would otherwise abort the whole download.
                continue
            # Only site-relative links need the prefix; prepending it to an
            # already absolute address would yield an unusable URL.
            if href.startswith(('http://', 'https://')):
                url = href
            else:
                url = self.PREFIX + href
            summary = item.find('p')
            description = self.tag_to_string(summary) if summary is not None else ''
            articles.append({
                'title'      : self.tag_to_string(link)
                ,'date'       : fetched_at
                ,'url'        : url
                ,'description': description
            })
        totalfeeds.append((feedtitle, articles))
    return totalfeeds
|
||||||
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for item in soup.findAll(style=True):
|
for item in soup.findAll(style=True):
|
||||||
del item['style']
|
del item['style']
|
||||||
|
@ -22,7 +22,7 @@ E = ElementMaker(namespace=NS, nsmap={None:NS})
|
|||||||
|
|
||||||
def iterate_over_builtin_recipe_files():
|
def iterate_over_builtin_recipe_files():
|
||||||
exclude = ['craigslist', 'iht', 'outlook_india', 'toronto_sun',
|
exclude = ['craigslist', 'iht', 'outlook_india', 'toronto_sun',
|
||||||
'indian_express', 'india_today']
|
'indian_express', 'india_today', 'livemint']
|
||||||
d = os.path.dirname
|
d = os.path.dirname
|
||||||
base = os.path.join(d(d(d(d(d(d(os.path.abspath(__file__))))))), 'resources', 'recipes')
|
base = os.path.join(d(d(d(d(d(d(os.path.abspath(__file__))))))), 'resources', 'recipes')
|
||||||
for x in os.walk(base):
|
for x in os.walk(base):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user