diff --git a/recipes/wirtscafts_woche.recipe b/recipes/wirtscafts_woche.recipe index cec1ad998f..5f3fd389c6 100644 --- a/recipes/wirtscafts_woche.recipe +++ b/recipes/wirtscafts_woche.recipe @@ -1,115 +1,102 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2013, Armin Geller' ## -# Written: May 2013 (new coding) -# Version: 4.4 -# Last update: 2018-03-22 -# Update AGE 2013-01-05; 2018-03-01; 2018-03-21; Modified by Hegi 2013-04-28, 2015-04-18, 2018-03-13, 2018-03-22 -## +## Written: May 2013 (new coding) +## Version: 4.4 +## Last update: 2020-12-29 +## ''' Fetch WirtschaftsWoche Online ''' -import re from calibre.web.feeds.news import BasicNewsRecipe - - class WirtschaftsWocheOnline(BasicNewsRecipe): - title = u'WirtschaftsWoche Online' - __author__ = 'Armin Geller' - description = u'Wirtschaftswoche Online - basierend auf den RRS-Feeds von Wiwo.de' - tags = 'Nachrichten, Blog, Wirtschaft' - publisher = 'Verlagsgruppe Handelsblatt GmbH / Redaktion WirtschaftsWoche Online' - category = 'business, economy, news, Germany' - publication_type = 'weekly magazine' - oldest_article = 7 + title = u'WirtschaftsWoche Online' + __author__ = 'Armin Geller' # Update AGE 2013-01-05; 2018-03-01 + description = u'German Online Portal of WirtschaftsWoche' + publisher = 'Verlagsgruppe Handelsblatt GmbH Redaktion WirtschaftsWoche Online' + category = 'business, economy, news, Germany' + publication_type = 'weekly magazine' + oldest_article = 7 max_articles_per_feed = 100 - simultaneous_downloads = 20 - - auto_cleanup = False - no_stylesheets = True - remove_javascript = True - remove_empty_feeds = True - timefmt = ' [%a, %d %b %Y]' - language = 'de' - encoding = 'UTF-8' - cover_source = 'https://kaufhaus.handelsblatt.com/downloads/wirtschaftswoche-emagazin-p1952.html' - masthead_url = 'http://www.wiwo.de/images/wiwo_logo/5748610/1-formatOriginal.png' - - conversion_options = {'smarten_punctuation': True, - 'authors': publisher, - 'publisher': publisher} - - # don't duplicate articles from "Schlagzeilen" / "Exklusiv" to other rubrics - ignore_duplicate_articles = {'title', 'url'} - - # if you want to reduce size for an b/w or E-ink device, uncomment the following 4 lines: - # compress_news_images = True - # compress_news_images_auto_size = 16 - # scale_news_images = (400,300) - # compress_news_images_max_size = 35 + + auto_cleanup = False + no_stylesheets = True + remove_javascript = True + remove_empty_feeds = True + timefmt = ' [%a, %d %b %Y]' + language = 'de_DE' + encoding = 'UTF-8' + cover_source = 'https://www.ikiosk.de/shop/epaper/wirtschaftswoche.html' + masthead_url = 'http://www.wiwo.de/images/wiwo_logo/5748610/1-formatOriginal.png' + + def get_cover_url(self): - soup = self.index_to_soup(self.cover_source) - style = soup.find('img', alt='WirtschaftsWoche eMagazin', style=True)[ - 'style'] - self.cover_url = style.partition('(')[-1].rpartition(')')[0] - return self.cover_url + cover_source_soup = self.index_to_soup(self.cover_source) + preview_image_div = cover_source_soup.find(attrs={'class':'gallery'}) + return preview_image_div.a.img['src'] + - preprocess_regexps = [ - ( - re.compile( - r'([^<]*)()', re.DOTALL | re.IGNORECASE), - lambda match: match.group(1) + '. ' + match.group(2) - ), - ( - re.compile( - r'(c-overline--article">[^>]*)()', re.DOTALL | re.IGNORECASE), - lambda match: match.group(1) + ': ' + match.group(2) - ) - ] + extra_css = ''' + h1, h2 {font-size: 1.6em; text-align: left} + .c-leadtext {font-size: 1em; font-style: italic; font-weight: normal} + h4 {font-size: 1.3em;text-align: left} + h5, h6, a {font-size: 1em;text-align: left} + .c-metadata {font-size: .75em;text-align: left; font-style: italic} + ''' - extra_css = ''' - h1, h2 {font-size: 1.6em; text-align: left} - .c-leadtext {font-size: 1.2em; font-weight:bold; text-align: left} - h3, h4 {font-size: 1.3em; text-align: left} - h5, h6, a {font-size: 1em; text-align: left} - .c-metadata {font-size: .75em; text-align: left; font-style: italic} - .hcf-location-mark {font-style: italic; font-weight:bold} - .c-overline {font-size: 1em; text-align: left;font-style: normal; font-weight:bold} - ''' - - keep_only_tags = [ - dict(name='div', attrs={'class': [ - 'o-article__element', 'o-article__content c-richText', 'o-article__content-element o-article__content-element--richtext', - 'o-article__content c-richText', 'c-leadtext' - ]}) - ] + keep_only_tags = [ + dict(name='div', attrs={'class':['o-article__element', + 'o-article__content c-richText', + 'o-article__content-element o-article__content-element--richtext' + ]}) + ] remove_tags = [ - dict(name='div', attrs={'class': [ - 'c-pagination u-flex', 'c-standard-article-teaser', 'c-pagination u-flex ajaxify', 'c-socialshare u-margin-xxl ', - 'c-advertisment__fullWidth c-advertisment--P3_desktop' - ]}), - dict(attrs={'class': lambda x: x and 'c-socialshare' in x.split()}) - ] + dict(name='div', attrs={'class':['c-pagination u-flex', + 'c-standard-article-teaser', + 'c-pagination u-flex ajaxify', + 'c-socialshare u-margin-xxl ', + 'c-list', #AGe 2020-12-29 + ]}) + ] feeds = [ - (u'Schlagzeilen', u'http://www.wiwo.de/contentexport/feed/rss/schlagzeilen'), - (u'Exklusiv', u'http://www.wiwo.de/contentexport/feed/rss/exklusiv'), - # (u'Themen', u'http://www.wiwo.de/contentexport/feed/rss/themen'), # AGE no print version - (u'Unternehmen', u'http://www.wiwo.de/contentexport/feed/rss/unternehmen'), - (u'Finanzen', u'http://www.wiwo.de/contentexport/feed/rss/finanzen'), - (u'Politik', u'http://www.wiwo.de/contentexport/feed/rss/politik'), - (u'Erfolg', u'http://www.wiwo.de/contentexport/feed/rss/erfolg'), - (u'Technologie', u'http://www.wiwo.de/contentexport/feed/rss/technologie'), - # (u'Green-WiWo', u'http://green.wiwo.de/feed/rss/') # AGE offline - ] + (u'Schlagzeilen', u'http://www.wiwo.de/contentexport/feed/rss/schlagzeilen'), + (u'Exklusiv', u'http://www.wiwo.de/contentexport/feed/rss/exklusiv'), +# (u'Themen', u'http://www.wiwo.de/contentexport/feed/rss/themen'), # AGE no print version + (u'Unternehmen', u'http://www.wiwo.de/contentexport/feed/rss/unternehmen'), + (u'Finanzen', u'http://www.wiwo.de/contentexport/feed/rss/finanzen'), + (u'Politik', u'http://www.wiwo.de/contentexport/feed/rss/politik'), + (u'Erfolg', u'http://www.wiwo.de/contentexport/feed/rss/erfolg'), + (u'Technologie', u'http://www.wiwo.de/contentexport/feed/rss/technologie'), +# (u'Green-WiWo', u'http://green.wiwo.de/feed/rss/') # AGE offline + ] + +# For hegi # AGE new 2018-03-21 + + # Add ': ' between headline part one and two + # Wandel kostet Milliarden + ': ' + SUV und China sollen Audi wieder nach vorne bringen + # https://www.wiwo.de/unternehmen/auto/wandel-kostet-milliarden-suv-und-china-sollen-audi-wieder-nach-vorne-bringen/21069566.html + + # debug: + # input:

Wandel kostet Milliarden SUV und China sollen Audi wieder nach vorne bringen

+ # parsed:

Wandel kostet Milliarden SUV und China sollen Audi wieder nach vorne bringen

+ # processed:

Wandel kostet Milliarden SUV und China sollen Audi wieder nach vorne bringen

+ # structure:

Wandel kostet Milliarden SUV und China sollen Audi wieder nach vorne bringen

+ # epub:

Wandel kostet Milliarden SUV und China sollen Audi wieder nach vorne bringen

+ + preprocess_regexps = [ + (re.compile(r'(c-overline--article">[^>]*)()', re.DOTALL|re.IGNORECASE), lambda match: match.group(1) + ': ' + match.group(2)), + ] + +# /For hegi # one page n times url: https://www.wiwo.de/finanzen/geldanlage/bla-bla/21020646.html # all in one page article url: https://www.wiwo.de/finanzen/geldanlage/bla-bla/21020646-all.html def print_version(self, url): - main, sep, tail = url.rpartition('.') - return main + '-all' + sep + tail + main, sep, rest = url.rpartition('.') + return main + '-all' + sep + rest