From 478f91061126e56371556bd5da07f87676862ab9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 27 Dec 2022 11:01:56 +0530 Subject: [PATCH] Update HNA --- recipes/hna.recipe | 97 +++++++++++----------------------------------- 1 file changed, 23 insertions(+), 74 deletions(-) diff --git a/recipes/hna.recipe b/recipes/hna.recipe index b742143119..4649e420b3 100644 --- a/recipes/hna.recipe +++ b/recipes/hna.recipe @@ -5,8 +5,7 @@ __copyright__ = '2008, Kovid Goyal ' Fetch Hessisch Niedersachsische Allgemeine. ''' -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe, classes class hnaDe(BasicNewsRecipe): @@ -20,78 +19,28 @@ class hnaDe(BasicNewsRecipe): max_articles_per_feed = 40 no_stylesheets = True remove_javascript = True - auto_cleanup = True encoding = 'utf-8' + masthead_url = 'https://idcdn.de/west/assets/hna-de/img/logo--cf5324e1.svg' - remove_tags = [dict(id='topnav'), - dict(id='nav_main'), - dict(id='teaser'), - dict(id='suchen'), - dict(id='superbanner'), - dict(id='navigation'), - dict(id='skyscraper'), - dict(id='idHeaderSearchForm'), - dict(id='idHeaderSearchBar'), - dict(id='idLoginBarWrap'), - dict(id='idAccountButtons'), - dict(id='idHeadButtons'), - dict(id='idBoxesWrap'), - dict(id='idJSMainNavigation'), - dict(id=''), - dict(name='span'), - dict(name='ul', attrs={'class': 'linklist'}), - dict(name='ul', attrs={ - 'class': 'idMainNavi idJSActive idHeadHomeBtn'}), - dict(name='ul', attrs={ - 'class': 'idHiddenNavi idNaviSubcategories'}), - dict(name='a', attrs={'href': '#'}), - dict(name='a', attrs={'class': 'idImgLink'}), - dict(name='a', attrs={'class': 'idListLink'}), - dict(name='div', attrs={'class': 'hlist'}), - dict(name='div', attrs={'class': 'idTabWrap'}), - dict(name='li', attrs={ - 'class': 'idButton idIsLoginGroup idHeaderRegister '}), - dict(name='li', attrs={'class': 'idVideoBar idFirst'}), - dict(name='li', attrs={ - 'class': 'idSetStartPageLink idLast'}), - dict(name='li', attrs={'class': 'idKinderNetzBar idLast'}), - dict(name='li', attrs={'class': 'idFotoBar '}), - dict(name='div', attrs={'class': 'subc noprint'}), - dict(name='div', attrs={'class': 'idTxtLay'}), - dict(name='div', attrs={ - 'class': 'idLay idClStandard idStaticHtml'}), - dict(name='div', attrs={'class': 'idHeaderWrap'}), - dict(name='div', attrs={ - 'class': 'idLay idRss idClStandard'}), - dict(name='div', attrs={ - 'class': 'idLay idClStandard idLeadStoriesFocus idLeadStoriesFocusOverlay '}), - dict(name='div', attrs={ - 'class': 'idTeaserLay idTeaserFloat idMediaLeft idLast'}), - dict(name='div', attrs={ - 'class': 'idHeaderButtons idAccountButtons'}), - dict(name='div', attrs={ - 'class': 'idTeaserLay idTeaserWithImg idSize4 idMediaLeft'}), - dict(name='div', attrs={ - 'class': 'idHeaderButtons idHeadButtons'}), - dict(name='div', attrs={ - 'class': 'idHeaderButtons idSetStartPage'}), - dict(name='div', attrs={ - 'class': 'idLay idClHl idTeaserList '}), - dict(name='div', attrs={'class': 'idNavigationWrap'}), - dict(name='div', attrs={'class': 'idBreadcrumbWrap'}), - dict(name='div', attrs={'class': 'idBoxesWrap'}), - dict(name='div', attrs={'class': 'idBreadcrumb'}), - dict(name='div', attrs={ - 'class': 'idLay idAdvertising idClStandard '}), - dict(name='span', attrs={'class': 'idHeadLineIntro'}), - dict(name='p', attrs={'class': 'breadcrumb'}), - dict(name='a', attrs={'style': 'cursor:hand'}), - dict(name='p', attrs={'class': 'h5'}), - dict(name='p', attrs={'class': 'idMoreEnd'})] - remove_tags_after = [ - dict(name='div', attrs={'class': 'idTxtLay idStaticHtmlIEHelper'})] + def get_cover_url(self): + soup = self.index_to_soup('https://epaper.meinehna.de/') + if a := soup.find('a', attrs={'class':'edition-cover__link'}): + if citem := a.find('img', src=True): + return citem['src'] - feeds = [('hna_soehre', 'http://feeds2.feedburner.com/hna/soehre'), - ('hna_kassel', 'http://feeds2.feedburner.com/hna/kassel'), - ('hna_KSV', 'http://feeds2.feedburner.com/hna/ksv'), - ('hna_kultur', 'http://feeds2.feedburner.com/hna/kultur')] + keep_only_tags = [ + dict(name='article', attrs={'class':lambda x: x and 'id-Story' in x.split()}) + ] + remove_tags = [ + classes( + 'id-DonaldBreadcrumb id-StoryElement-interactionBar id-Recommendation ' + 'id-Comments id-Comments--targetHelper id-StoryElement-inArticleReco' + ) + ] + + feeds = [ + ('hna_soehre', 'http://feeds2.feedburner.com/hna/soehre'), + ('hna_kassel', 'http://feeds2.feedburner.com/hna/kassel'), + ('hna_KSV', 'http://feeds2.feedburner.com/hna/ksv'), + ('hna_kultur', 'http://feeds2.feedburner.com/hna/kultur') + ]