'''
scmp.com

calibre news recipe for the South China Morning Post.
'''

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    '''
    Build a tag-matching spec keyed on CSS class names.

    :param classes: class names separated by single spaces.
    :return: ``{'attrs': {'class': matcher}}``, where ``matcher`` takes the
        value of a tag's ``class`` attribute and returns a truthy result
        only when that value shares at least one name with ``classes``.
    '''
    wanted = frozenset(classes.split(' '))

    def matcher(value):
        # A missing or empty class attribute is handed back unchanged
        # (it is already falsy, so the tag does not match).
        if not value:
            return value
        return frozenset(value.split()) & wanted

    return {'attrs': {'class': matcher}}


class SCMP(BasicNewsRecipe):
    '''Recipe settings and hooks for downloading SCMP articles.'''

    # --- Publication metadata ---
    title = 'South China Morning Post'
    __author__ = 'llam'
    description = (
        "SCMP.com, Hong Kong's premier online English daily provides "
        "exclusive up-to-date news, audio video news, podcasts, RSS Feeds, "
        "Blogs, breaking news, top stories, award winning news and analysis "
        "on Hong Kong and China."
    )
    publisher = 'South China Morning Post Publishers Ltd.'
    language = 'en_CN'
    publication_type = 'newspaper'

    # --- Download behaviour ---
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    remove_empty_feeds = True
    # Credentials are optional; get_browser() only logs in when both are set.
    needs_subscription = 'optional'

    # --- Content selection ---
    keep_only_tags = [
        dict(name='h1'),
        classes(
            'field-name-field-subheading scmp-gallery-swiper pane-node-body field-name-field-authors'
        ),
        dict(itemprop=['dateCreated', 'dateModified']),
    ]
    remove_tags = [
        dict(name='button'),
    ]

    # (section title, RSS URL) pairs
    feeds = [
        (u'Business', u'http://www.scmp.com/rss/business.xml'),
        (u'Hong Kong', u'http://www.scmp.com/rss/hong_kong.xml'),
        (u'China', u'http://www.scmp.com/rss/china.xml'),
        (u'Asia & World', u'http://www.scmp.com/rss/news_asia_world.xml'),
        (u'Opinion', u'http://www.scmp.com/rss/opinion.xml'),
        (u'LifeSTYLE', u'http://www.scmp.com/rss/lifestyle.xml'),
        (u'Sport', u'http://www.scmp.com/rss/sport.xml'),
    ]

    def get_browser(self):
        '''
        Return the base recipe's browser, logged in when credentials exist.

        When both ``self.username`` and ``self.password`` are not ``None``,
        the login page is opened, its first form is selected, the ``name``
        and ``pass`` fields are filled in and the form is submitted.
        Otherwise the browser is returned without any login attempt.
        '''
        browser = BasicNewsRecipe.get_browser(self)
        # Uncomment to trace the HTTP exchange while debugging:
        # browser.set_debug_http(True)
        # browser.set_debug_responses(True)
        # browser.set_debug_redirects(True)
        if self.username is None or self.password is None:
            return browser

        browser.open('https://www.scmp.com/user/login')
        browser.select_form(nr=0)
        browser['name'] = self.username
        browser['pass'] = self.password
        browser.submit()
        return browser

    def preprocess_html(self, soup):
        '''
        Copy ``data-original`` into ``src`` for every ``<img>`` that has it.

        Presumably the site lazy-loads images and keeps the real URL in
        ``data-original`` -- not verifiable from this file.

        :param soup: parsed article document; modified in place.
        :return: the same ``soup`` object.
        '''
        lazy_images = soup.findAll('img', attrs={'data-original': True})
        for image in lazy_images:
            image['src'] = image['data-original']
        return soup