From 6250465558c3cdc41eba0a86f561940c15bbad3c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 9 Jan 2018 20:57:47 +0530 Subject: [PATCH] Update Danas Fixes #1742163 [Updated recipe for Danas](https://bugs.launchpad.net/calibre/+bug/1742163) [Updated recipe for Danas](https://bugs.launchpad.net/calibre/+bug/1742163) --- recipes/danas.recipe | 171 +++++++++++++++++----------------------- recipes/icons/danas.png | Bin 485 -> 553 bytes 2 files changed, 71 insertions(+), 100 deletions(-) diff --git a/recipes/danas.recipe b/recipes/danas.recipe index 1c95993b72..3fe59c7426 100644 --- a/recipes/danas.recipe +++ b/recipes/danas.recipe @@ -1,131 +1,102 @@ +# -*- mode: python -*- +# -*- coding: utf-8 -*- + __license__ = 'GPL v3' -__copyright__ = '2008-2011, Darko Miletic ' +__copyright__ = '2008-2018, Darko Miletic ' ''' danas.rs ''' -import re from calibre.web.feeds.news import BasicNewsRecipe +from datetime import date, timedelta class Danas(BasicNewsRecipe): title = 'Danas' __author__ = 'Darko Miletic' description = 'Dnevne novine sa vestima iz sveta, politike, ekonomije, kulture, sporta, Beograda, Novog Sada i cele Srbije.' - publisher = 'Danas d.o.o.' - category = 'news, politics, Serbia' + publisher = 'DAN GRAF d.o.o.' + category = 'danas Dnevne novine politika drustvo ekonomija svet sport beograd vojvodina periskop terazije Beograd Srbija Novi Sad Nis Kragujevac Vojvodina politics Serbia' # noqa oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = False use_embedded_content = False encoding = 'utf-8' - masthead_url = 'http://www.danas.rs/images/basic/danas.gif' + masthead_url = 'http://www.danas.rs/images/basic/logo.png' language = 'sr' remove_javascript = True publication_type = 'newspaper' remove_empty_feeds = True - extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} - @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} - .article,.articledescription,body,.lokacija,.feed{font-family: Tahoma,Arial,Helvetica,sans1,sans-serif} - .nadNaslov,h1,.preamble{font-family: Georgia,"Times New Roman",Times,serif1,serif} - .antrfileText{border-left: 2px solid #999999; - margin-left: 0.8em; - padding-left: 1.2em; - margin-bottom: 0; - margin-top: 0} - h2,.datum,.lokacija,.autor{font-size: small} - .autor{text-transform: uppercase} - .antrfileNaslov{border-left: 2px solid #999999; - margin-left: 0.8em; - padding-left: 1.2em; - font-weight:bold; - margin-bottom: 0; - margin-top: 0} + auto_cleanup = True + auto_cleanup_keep = '//h1[@class="title"] | //div[@class="subTitle"] | //div[@class="author"] | //div[@class="published"]' + resolve_internal_links = True + INDEX = "http://www.danas.rs" + extra_css = """ + .author{font-size: small} + .published {font-size: small} img{margin-bottom: 0.8em} - .naslovTemeDana{font-size: small} """ conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + 'comment': description, + 'tags': category, + 'publisher': publisher, + 'language': language } - preprocess_regexps = [ - (re.compile(u'\u0110'), lambda match: u'\u00D0'), (re.compile( - u'\u2018'), lambda match: '‘') # left single quotation mark - # right single quotation mark - # single low-9 quotation mark - # single high-reversed-9 quotation mark - # left double quotation mark - # right double quotation mark - # double low-9 quotation mark - # double high-reversed-9 quotation mark - # latin small letter o with circumflex - # latin small letter o with dieaeresis - # latin small letter a with acute - # latin small letter a with dieaeresis - , (re.compile(u'\u2019'), lambda match: '’'), (re.compile(u'\u201a'), lambda match: '‘'), (re.compile(u'\u201b'), lambda match: '’'), (re.compile(u'\u201c'), lambda match: '“'), (re.compile(u'\u201d'), lambda match: '”'), (re.compile(u'\u201e'), lambda match: '“'), (re.compile(u'\u201f'), lambda match: '”'), (re.compile(u'\u00f4'), lambda match: '“'), (re.compile(u'\u00f6'), lambda match: '”'), (re.compile(u'\u00e1'), lambda match: ' '), (re.compile(u'\u00e4'), lambda match: ' ') # noqa - ] - - keep_only_tags = [dict(name='div', attrs={'id': 'left'})] - remove_tags = [ - dict(name='div', attrs={'class': ['width_1_4', 'metaClanka', 'baner', 'listaVesti', 'article_nav']}), dict( - name='div', attrs={'id': 'comments'}), dict(name=['object', 'link', 'iframe', 'meta']) - ] - remove_attributes = ['w:st', 'st'] - feeds = [ - - (u'Politika', u'http://www.danas.rs/rss/rss.asp?column_id=27'), - (u'Hronika', u'http://www.danas.rs/rss/rss.asp?column_id=2'), - (u'Drustvo', u'http://www.danas.rs/rss/rss.asp?column_id=24'), - (u'Dijalog', u'http://www.danas.rs/rss/rss.asp?column_id=1'), - (u'Ekonomija', u'http://www.danas.rs/rss/rss.asp?column_id=6'), - (u'Svet', u'http://www.danas.rs/rss/rss.asp?column_id=25'), - (u'Srbija', u'http://www.danas.rs/rss/rss.asp?column_id=28'), - (u'Kultura', u'http://www.danas.rs/rss/rss.asp?column_id=5'), - (u'Sport', u'http://www.danas.rs/rss/rss.asp?column_id=13'), - (u'Scena', u'http://www.danas.rs/rss/rss.asp?column_id=42'), - (u'Feljton', u'http://www.danas.rs/rss/rss.asp?column_id=19'), - (u'Periskop', u'http://www.danas.rs/rss/rss.asp?column_id=4'), - (u'Famozno', u'http://www.danas.rs/rss/rss.asp?column_id=47'), - (u'Sluzbena beleska', u'http://www.danas.rs/rss/rss.asp?column_id=48'), - (u'Suocavanja', u'http://www.danas.rs/rss/rss.asp?column_id=49'), - (u'Moj Izbor', u'http://www.danas.rs/rss/rss.asp?column_id=50'), - (u'Direktno', u'http://www.danas.rs/rss/rss.asp?column_id=51'), - (u'I tome slicno', u'http://www.danas.rs/rss/rss.asp?column_id=52'), - (u'No longer and not yet', u'http://www.danas.rs/rss/rss.asp?column_id=53'), - (u'Resetovanje', u'http://www.danas.rs/rss/rss.asp?column_id=54'), - (u'Iza scene', u'http://www.danas.rs/rss/rss.asp?column_id=60'), - (u'Drustvoslovlje', u'http://www.danas.rs/rss/rss.asp?column_id=55'), - (u'Zvaka u pepeljari', u'http://www.danas.rs/rss/rss.asp?column_id=56'), - (u'Vostani Serbie', u'http://www.danas.rs/rss/rss.asp?column_id=57'), - (u'Med&Jad-a', u'http://www.danas.rs/rss/rss.asp?column_id=58'), - (u'Svetlosti pozornice', u'http://www.danas.rs/rss/rss.asp?column_id=59'), - (u'Dva cvancika', u'http://www.danas.rs/rss/rss.asp?column_id=65'), - (u'Iz kornera', u'http://www.danas.rs/rss/rss.asp?column_id=64') + (u'Politika', u'http://www.danas.rs/politika.56.html'), + (u'Drustvo', u'http://www.danas.rs/drustvo.55.html'), + (u'Dijalog', u'http://www.danas.rs/dijalog.46.html'), + (u'Drustvo', u'http://www.danas.rs/drustvo.55.html'), + (u'Ekonomija', u'http://www.danas.rs/ekonomija.4.html'), + (u'Svet', u'http://www.danas.rs/svet.1160.html'), + (u'Sport', u'http://www.danas.rs/sport.18.html'), + (u'Kultura', u'http://www.danas.rs/kultura.11.html'), + (u'Scena', u'http://www.danas.rs/scena.561.html'), + (u'Zivot', u'http://www.danas.rs/zivot.1140.html'), + (u'Auto', u'http://www.danas.rs/auto.1171.html'), + (u'IT', u'http://www.danas.rs/it.29.html'), + (u'Ljudi', u'http://www.danas.rs/ljudi.1141.html'), + (u'Beograd', u'http://www.danas.rs/beograd.39.html'), ] - def preprocess_html(self, soup): - for tagn in ['st1:place', 'st1:city', 'st1:country-region', 'st1:state']: - for item in soup.body.findAll(tagn): - item.name = 'span' - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll('a'): - if item.has_key('name'): # noqa - item.extract() - for item in soup.findAll('img'): - if not item.has_key('alt'): # noqa - item['alt'] = 'image' - return soup - - def print_version(self, url): - return url + '&action=print' - def get_cover_url(self): - cover_url = None - soup = self.index_to_soup('http://www.danas.rs/') - for citem in soup.findAll('img'): - if citem['src'].endswith('naslovna.jpg') or citem['src'].endswith('naslovna1.jpg'): - return 'http://www.danas.rs' + citem['src'] - return cover_url + td = date.today() - timedelta(1) + cpart = str(td.year) + '/' + str(td.month) + '/' + str(td.day) + return 'http://www.danas.rs/upload/images/' + cpart + '/Naslovna_velika.jpg' + + def parse_index(self): + totalfeeds = [] + lfeeds = self.get_feeds() + for feedobj in lfeeds: + feedtitle, feedurl = feedobj + self.report_progress( + 0, + _('Fetching feed') + ' %s...' % + (feedtitle if feedtitle else feedurl) + ) + articles = [] + soup = self.index_to_soup(feedurl) + for item in soup.findAll( + 'div', attrs={ + 'class': lambda x: x and 'newsBox' in x.split() + } + ): + atag = item.find('h2').a + ptag = item.find('p', attrs={'class': 'lead'}) + artdate = item.find('div', attrs={'class': 'date'}) + url = self.INDEX + atag['href'] + title = self.tag_to_string(atag) + description = '' + if ptag: + description = self.tag_to_string(ptag) + date = self.tag_to_string(artdate) + articles.append({ + 'title': title, + 'date': date, + 'url': url, + 'description': description + }) + totalfeeds.append((feedtitle, articles)) + return totalfeeds diff --git a/recipes/icons/danas.png b/recipes/icons/danas.png index 0cddcfbdabd611231c12497cf630b9e0ebbeeb21..85ae185e48c19f613f069ff28080aae877f7aa81 100644 GIT binary patch delta 539 zcmV+$0_6SW1E~a%8Gi-<001BJ|6u?C0q{vgK~y-6rISr-6Hyd~pF4MEl4eL#6Vn>S z7MoPD(h?LGT^6+<5)=dl7jC5>f`6c(t`+})E5VgJ5!|^DKfr|*bc*ob9=`ZWb!81w2G~BM z#VjW$^E~@h=i}>@80z>OoF3un!Yq4-k^pF#0i8P+j&SSJ6xcqP5^RsN~8h`L7fZY~nrbc=*XgL8JgxCn+ zOWoo6{kM>i;D@kO=gM4(@@NJCvD$Lgsk<+Ew^)l`=n{euGD-5;0e)6n@VyO|38v1) zxgrndkE4m$L64N8;rI;aEd~o|2qTahYqm$b6EI%PQkvP%v3-M7jux0H<$CfFJN=D) dpK&|=1r*;{vbIvB+eQEY002ovPDHLkV1no<`jY?v delta 470 zcmV;{0V)2e1my#e8Gix*005AYXf^-<0jx#xs0{P^?xFBT2=-+W_;TFv0LkRf9eL-Ga?8Mz9?4PGu;wB5Pm(1dNb{vd1k z@a=cu$}6(@+ZaMuFvPEA2w5RlzAI_rdDYrI3?7RZe3vjd&wp1d-1hYScc=ykwczAq z2Cu~osT&wP7v(O$3=s^TdXm9^DacO8d1^I#P&7Pm>OPv`oWU5nIUF1L;5C$u$A?jZU6=GY54g4Hy_Y% zk*h&A1T4#0et+rTKSCOKff}N~8r&DAEId!ZhUXuDFr{x~h+PBnxA)@aEjJNx+*+bQ||G1TF)i(3L