import re

from calibre.web.feeds.recipes import BasicNewsRecipe

# (feed title, RSS section slug) pairs used to build the feed list below.
# Comment out sections you are not interested in.
sections = [
    ("정치", "politics"),
    ("사회", "national"),
    ("경제", "economy"),
    ("국제", "international"),
    ("사설칼럼", "editorials"),
    ("의학과학", "science"),
    ("문화연예", "culture"),
    ("스포츠", "sports"),
    ("사람속으로", "inmul"),
    # The following sections are optional and disabled by default.
    # Uncomment to enable.
    # ("건강", "health"),
    # ("레저", "leisure"),
    # ("도서", "book"),
    # ("공연", "show"),
    # ("여성", "woman"),
    # ("여행", "travel"),
    # ("생활정보", "lifeinfo"),
]


class Donga(BasicNewsRecipe):
    """Calibre news recipe that pulls articles from the donga.com RSS feeds."""

    language = "ko"
    title = "동아일보"
    description = "동아일보 기사"
    __author__ = "Minsik Cho"
    ignore_duplicate_articles = {"title", "url"}
    compress_news_images = True
    no_stylesheets = True
    oldest_article = 2
    encoding = "utf-8"

    # RSS feed URL syntax:
    # https://rss.donga.com/[section].xml
    # The loop variable is named feed_title so it does not shadow the
    # recipe's own ``title`` attribute defined above.
    feeds = [
        (feed_title, "https://rss.donga.com/" + section + ".xml")
        for (feed_title, section) in sections
    ]

    # Remove logo and print buttons
    remove_tags = [
        dict(name="div", attrs={"class": "popHeaderWrap"}),
        dict(name="div", attrs={"class": "etc"}),
    ]

    def print_version(self, url):
        """Return the print-view URL for an article URL.

        Article URLs have the form
            https://www.donga.com/news/[section]/article/all/[date]/[gid]/1
        and the matching print view is
            https://www.donga.com/news/View?gid=[gid]&date=[date]

        If ``url`` does not contain a ``/all/[date]/[gid]`` part with
        numeric date and gid, it is returned unchanged instead of raising
        ``AttributeError`` on the failed match.
        """
        # Require at least one digit for each field so an empty date/gid
        # can never produce a malformed print URL.
        match = re.search(r"(?<=/all/)([0-9]+)/([0-9]+)", url)
        if match is None:
            return url
        date, gid = match.groups()
        return "https://www.donga.com/news/View?gid=" + gid + "&date=" + date