Various Korean news sources by Hoje Lee

Also update various existing Korean news sources.

Merge branch 'master' of https://github.com/hojel/calibre
This commit is contained in:
Kovid Goyal 2015-09-17 10:11:38 +05:30
commit f665617c9f
7 changed files with 214 additions and 45 deletions

28
recipes/chosun.recipe Normal file
View File

@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2015, Hoje Lee <hojelei at gmail.com>'
'''
Profile to download Chosun.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ChosunDotcom(BasicNewsRecipe):
    """Recipe for Chosun.com (Chosun Ilbo), built from its section RSS feeds."""

    # Recipe metadata.
    title = u'조선일보'  # "Chosun Ilbo"
    description = u'조선닷컴 기사'  # "Chosun.com articles"
    language = 'ko'
    __author__ = 'Hoje Lee'

    # Download limits (calibre documents oldest_article as a number of days).
    oldest_article = 7
    max_articles_per_feed = 10

    # No hand-written tag filters are defined in this recipe; article
    # cleanup is left to calibre's automatic cleanup.
    auto_cleanup = True

    # (section title, RSS URL) pairs.
    feeds = [
        (u'정치', 'http://www.chosun.com/site/data/rss/politics.xml'),  # Politics
        (u'조선비즈', 'http://biz.chosun.com/site/data/rss/rss.xml'),  # Chosun Biz
        (u'사회', 'http://www.chosun.com/site/data/rss/national.xml'),  # Society
        (u'문화', 'http://www.chosun.com/site/data/rss/culture.xml'),  # Culture
        (u'국제', 'http://www.chosun.com/site/data/rss/international.xml'),  # International
        (u'오피니언', 'http://www.chosun.com/site/data/rss/editorials.xml'),  # Opinion
        (u'스포츠', 'http://www.chosun.com/site/data/rss/sports.xml'),  # Sports
        (u'연예', 'http://www.chosun.com/site/data/rss/ent.xml'),  # Entertainment
    ]

View File

@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2011, Seongkyoun Yoo <seongkyoun.yoo at gmail.com>'
'''
@ -6,42 +7,34 @@ Profile to download The Hankyoreh
from calibre.web.feeds.news import BasicNewsRecipe
class Hankyoreh(BasicNewsRecipe):
title = u'Hankyoreh'
language = 'ko'
title = u'한겨례'
description = u'The Hankyoreh News articles'
__author__ = 'Seongkyoun Yoo'
oldest_article = 5
recursions = 1
max_articles_per_feed = 5
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 10
no_stylesheets = True
remove_javascript = True
keep_only_tags = [
dict(name='tr', attrs={'height':['60px']}),
dict(id=['fontSzArea'])
dict(name='div', attrs ={'class':['article-head']}),
dict(name='div', attrs ={'class':['article-text']}),
]
remove_tags = [
dict(target='_blank'),
dict(name='td', attrs={'style':['padding: 10px 8px 5px 8px;']}),
dict(name='iframe', attrs={'width':['590']}),
]
remove_tags_after = [
dict(target='_top')
]
remove_tags = [
dict(name='p', attrs={'class':['category']}),
]
remove_tags_after = dict(id={'ad_box01'})
feeds = [
('All News','http://www.hani.co.kr/rss/'),
('Politics','http://www.hani.co.kr/rss/politics/'),
('Economy','http://www.hani.co.kr/rss/economy/'),
('Society','http://www.hani.co.kr/rss/society/'),
('International','http://www.hani.co.kr/rss/international/'),
('Culture','http://www.hani.co.kr/rss/culture/'),
('Sports','http://www.hani.co.kr/rss/sports/'),
('Science','http://www.hani.co.kr/rss/science/'),
('Opinion','http://www.hani.co.kr/rss/opinion/'),
('Cartoon','http://www.hani.co.kr/rss/cartoon/'),
('English Edition','http://www.hani.co.kr/rss/english_edition/'),
('Specialsection','http://www.hani.co.kr/rss/specialsection/'),
('Hanionly','http://www.hani.co.kr/rss/hanionly/'),
('Hkronly','http://www.hani.co.kr/rss/hkronly/'),
('Multihani','http://www.hani.co.kr/rss/multihani/'),
('Lead','http://www.hani.co.kr/rss/lead/'),
('Newsrank','http://www.hani.co.kr/rss/newsrank/'),
#(u'전체기사', 'http://www.hani.co.kr/ilram/rss/hkr_news_list_all.xml'),
(u'정치', 'http://www.hani.co.kr/ilram/rss/hkr_news_list_politics.xml'),
#(u'경제', 'http://www.hani.co.kr/rss/economy/'),
(u'사회', 'http://www.hani.co.kr/ilram/rss/hkr_news_list_society.xml'),
#(u'국제', 'http://www.hani.co.kr/rss/international/'),
(u'문화', 'http://www.hani.co.kr/ilram/rss/hkr_news_list_culture.xml'),
(u'스포츠', 'http://www.hani.co.kr/ilram/rss/hkr_news_list_sports.xml'),
#(u'과학', 'http://www.hani.co.kr/rss/science/'),
(u'사설·칼럼','http://www.hani.co.kr/ilram/rss/hkr_news_list_opinion.xml'),
(u'만화만평', 'http://www.hani.co.kr/ilram/rss/hkr_news_list_cartoon.xml'),
#(u'한겨례섹션','http://www.hani.co.kr/rss/specialsection/'),
]

View File

@ -9,17 +9,30 @@ class Hankyoreh21(BasicNewsRecipe):
title = u'Hankyoreh21'
language = 'ko'
description = u'The Hankyoreh21 Magazine articles'
__author__ = 'Seongkyoun Yoo'
oldest_article = 20
recursions = 1
max_articles_per_feed = 120
no_stylesheets = True
__author__ = 'Seongkyoun Yoo'
oldest_article = 30
max_articles_per_feed = 10
no_stylesheets = True
remove_javascript = True
keep_only_tags = [
dict(name='font', attrs={'class':'t18bk'}),
dict(id=['fontSzArea'])
dict(name='header', attrs ={'class':['article_head']}),
dict(name='div', attrs ={'class':['article_body']}),
]
remove_tags = [
dict(name='div', attrs ={'class':['article_tools']}),
]
feeds = [
('Hani21','http://h21.hani.co.kr/rss/ '),
#('전체기사', 'http://h21.hani.co.kr/rss/'),
('표지이야기','http://h21.hani.co.kr/rss/cover/'),
('특집', 'http://h21.hani.co.kr/rss/special/'),
('정치', 'http://h21.hani.co.kr/rss/politics/'),
('경제', 'http://h21.hani.co.kr/rss/economy/'),
('사회', 'http://h21.hani.co.kr/rss/society/'),
('세계', 'http://h21.hani.co.kr/rss/world/'),
('문화', 'http://h21.hani.co.kr/rss/culture/'),
]
def get_article_url(self, article):
    """Return the article's URL, made absolute when the feed gives a path.

    The URL is obtained from ``BasicNewsRecipe.get_article_url``. When it
    starts with ``/`` it is treated as site-relative and prefixed with
    ``http://h21.hani.co.kr``; any other value is returned unchanged.

    :param article: the feed entry handed over by calibre.
    :return: the (possibly prefixed) URL, or the base implementation's
        empty/``None`` result when the entry carries no usable link.
    """
    org_url = BasicNewsRecipe.get_article_url(self, article)
    # Guard against a missing or empty link: indexing ``org_url[0]`` would
    # raise TypeError on None and IndexError on '' and abort the download.
    if org_url and org_url.startswith('/'):
        return "http://h21.hani.co.kr" + org_url
    return org_url

35
recipes/joongang.recipe Normal file
View File

@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2015, Hoje Lee <hojelei at gmail.com>'
'''
Profile to download Joongang Ilbo
'''
from calibre.web.feeds.news import BasicNewsRecipe
class JoongangIlbo(BasicNewsRecipe):
    """Recipe for the Joongang Ilbo newspaper, built from rss.joins.com feeds."""

    # Recipe metadata.
    title = u'중앙일보'  # "Joongang Ilbo"
    description = u'중앙일보 신문 기사'  # "Joongang Ilbo newspaper articles"
    language = 'ko'
    __author__ = 'Hoje Lee'

    # Download limits (calibre documents oldest_article as a number of days).
    oldest_article = 7
    max_articles_per_feed = 5

    # No hand-written tag filters are defined in this recipe; article
    # cleanup is left to calibre's automatic cleanup.
    auto_cleanup = True

    # (section title, RSS URL) pairs. Entries that are commented out are
    # disabled feeds kept for reference.
    feeds = [
        # (u'전체기사', 'http://rss.joins.com/joins_news_list.xml'),
        (u'주요기사', 'http://rss.joins.com/joins_homenews_list.xml'),  # Top stories
        # (u'경제', 'http://rss.joins.com/joins_money_list.xml'),
        # (u'사회', 'http://rss.joins.com/joins_life_list.xml'),
        # (u'정치', 'http://rss.joins.com/joins_politics_list.xml'),

        # Most-read news feeds.
        (u'전체기사', 'http://rss.joins.com/sonagi/joins_sonagi_total_list.xml'),  # All articles
        (u'경제', 'http://rss.joins.com/sonagi/joins_sonagi_money_list.xml'),  # Economy
        (u'스포츠', 'http://rss.joins.com/sonagi/joins_sonagi_sports_list.xml'),  # Sports
        (u'연예', 'http://rss.joins.com/sonagi/joins_sonagi_star_list.xml'),  # Entertainment
        (u'사회', 'http://rss.joins.com/sonagi/joins_sonagi_life_list.xml'),  # Society
        (u'정치', 'http://rss.joins.com/sonagi/joins_sonagi_politics_list.xml'),  # Politics
        (u'지구촌', 'http://rss.joins.com/sonagi/joins_sonagi_world_list.xml'),  # World
        (u'IT과학', 'http://rss.joins.com/sonagi/joins_sonagi_it_list.xml'),  # IT & Science
        (u'사설', 'http://rss.joins.com/sonagi/joins_sonagi_opinion_list.xml'),  # Editorials
    ]

View File

@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2011, Seongkyoun Yoo <seongkyoun.yoo at gmail.com>'
'''
@ -5,26 +6,34 @@ Profile to download The Kyungyhang
'''
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Kyungyhang(BasicNewsRecipe):
title = u'Kyungyhang'
language = 'ko'
description = u'The Kyungyhang Shinmun articles'
__author__ = 'Seongkyoun Yoo'
oldest_article = 20
recursions = 2
max_articles_per_feed = 20
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 10
no_stylesheets = True
remove_javascript = True
preprocess_regexps = [
(re.compile("<div class='ad_movFocus'.*</html>", re.DOTALL|re.IGNORECASE), lambda match: '</html>'),
]
keep_only_tags = [
dict(name='div', attrs ={'class':['article_title_wrap']}),
dict(name='div', attrs ={'class':['viewHeader']}),
dict(name='span', attrs ={'class':['article_txt']})
]
remove_tags_after = dict(id={'sub_bottom'})
remove_tags = [
dict(name='div', attrs={'class':['widget_top_dable']}),
dict(name='div', attrs={'class':['article_bottom_ad']}),
dict(name='div', attrs={'class':['article_date']}),
dict(name='iframe'),
dict(id={'TdHot'}),
dict(name='div', attrs={'class':['btn_list','bline','linebottom','bestArticle']}),
@ -33,5 +42,14 @@ class Kyungyhang(BasicNewsRecipe):
]
feeds = [
('All News','http://www.khan.co.kr/rss/rssdata/total_news.xml'),
#(u'전체기사','http://www.khan.co.kr/rss/rssdata/total_news.xml'),
(u'정치', 'http://www.khan.co.kr/rss/rssdata/politic_news.xml'),
(u'경제', 'http://www.khan.co.kr/rss/rssdata/economy_news.xml'),
(u'사회', 'http://www.khan.co.kr/rss/rssdata/society_news.xml'),
(u'세계', 'http://www.khan.co.kr/rss/rssdata/kh_world.xml'),
(u'스포츠', 'http://www.khan.co.kr/rss/rssdata/kh_sports.xml'),
(u'문화', 'http://www.khan.co.kr/rss/rssdata/culture_news.xml'),
(u'연예', 'http://www.khan.co.kr/rss/rssdata/kh_entertainment.xml'),
(u'IT', 'http://www.khan.co.kr/rss/rssdata/it_news.xml'),
(u'오피니언','http://www.khan.co.kr/rss/rssdata/opinion_news.xml'),
]

34
recipes/maekyung.recipe Normal file
View File

@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2015, Hoje Lee <hojelei at gmail.com>'
'''
Profile to download Maeil Business
'''
from calibre.web.feeds.news import BasicNewsRecipe
class MaeilBusiness(BasicNewsRecipe):
    """Recipe for the Maeil Business newspaper, built from file.mk.co.kr feeds."""

    # Recipe metadata.
    title = u'매일경제'  # "Maeil Business"
    description = u'매일경제 신문 기사'  # "Maeil Business newspaper articles"
    language = 'ko'
    __author__ = 'Hoje Lee'

    # Download limits (calibre documents oldest_article as a number of days).
    oldest_article = 7
    max_articles_per_feed = 10

    # No hand-written tag filters are defined in this recipe; article
    # cleanup is left to calibre's automatic cleanup.
    auto_cleanup = True

    # (section title, RSS URL) pairs. The commented-out entry is a disabled
    # feed kept for reference.
    feeds = [
        (u'헤드라인', 'http://file.mk.co.kr/news/rss/rss_30000001.xml'),  # Headlines
        # (u'전체뉴스', 'http://file.mk.co.kr/news/rss/rss_40300001.xml'),
        (u'경제', 'http://file.mk.co.kr/news/rss/rss_30100041.xml'),  # Economy
        (u'정치', 'http://file.mk.co.kr/news/rss/rss_30200030.xml'),  # Politics
        (u'사회', 'http://file.mk.co.kr/news/rss/rss_50400012.xml'),  # Society
        (u'국제', 'http://file.mk.co.kr/news/rss/rss_30300018.xml'),  # International
        (u'기업ㆍ경영', 'http://file.mk.co.kr/news/rss/rss_50100032.xml'),  # Business & Management
        (u'증권', 'http://file.mk.co.kr/news/rss/rss_50200011.xml'),  # Securities
        (u'부동산', 'http://file.mk.co.kr/news/rss/rss_50300009.xml'),  # Real estate
        (u'문화ㆍ연예', 'http://file.mk.co.kr/news/rss/rss_30000023.xml'),  # Culture & Entertainment
        (u'패션', 'http://file.mk.co.kr/news/rss/rss_72000001.xml'),  # Fashion
        (u'스포츠', 'http://file.mk.co.kr/news/rss/rss_71000001.xml'),  # Sports
        (u'게임', 'http://file.mk.co.kr/news/rss/rss_50700001.xml'),  # Games
        (u'오피니언', 'http://file.mk.co.kr/news/rss/rss_30500041.xml'),  # Opinion
    ]

48
recipes/sisainlive.recipe Normal file
View File

@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2015, Hoje Lee <hojelei at gmail.com>'
'''
Profile to download SisaIN Live
'''
from calibre.web.feeds.news import BasicNewsRecipe
class SisaINLive(BasicNewsRecipe):
    """Recipe for SisaIN Live, built from the sisainlive.com RSS feeds."""

    # Recipe metadata.
    title = u'시사인 라이브'  # "SisaIN Live"
    description = u'시사인 라이브 기사'  # "SisaIN Live articles"
    language = 'ko'
    __author__ = 'Hoje Lee'

    # Download limits (calibre documents oldest_article as a number of days).
    oldest_article = 30
    max_articles_per_feed = 10

    # Article cleanup is left to calibre's automatic cleanup.
    auto_cleanup = True

    # Disabled manual-cleanup alternative, kept for reference only:
    #
    # no_stylesheets = True
    # remove_javascript = True
    # keep_only_tags = [
    #     dict(name='div', attrs ={'class':['View_Title']}),
    #     dict(name='div', attrs ={'class':['View_Info']}),
    #     dict(name='div', attrs ={'class':['View_Time']}),
    #     dict(id='articleBody'),
    # ]
    # remove_tags = [
    #     dict(name='table', attrs ={'width':['320'], 'height':['265']}),
    # ]

    # (section title, RSS URL) pairs. The commented-out entry is a disabled
    # feed kept for reference.
    feeds = [
        # (u'전체기사', 'http://www.sisainlive.com/rss.xml'),
        (u'인기기사', 'http://www.sisainlive.com/rss/clickTop.xml'),  # Popular articles
        (u'커버스토리', 'http://www.sisainlive.com/rss/SRN121.xml'),  # Cover story
        (u'특집', 'http://www.sisainlive.com/rss/SRN122.xml'),  # Special feature
        (u'정치', 'http://www.sisainlive.com/rss/S1N15.xml'),  # Politics
        (u'경제', 'http://www.sisainlive.com/rss/S1N16.xml'),  # Economy
        (u'사회', 'http://www.sisainlive.com/rss/S1N17.xml'),  # Society
        (u'문화', 'http://www.sisainlive.com/rss/S1N18.xml'),  # Culture
        (u'국제.한반도', 'http://www.sisainlive.com/rss/S1N4.xml'),  # International / Korean Peninsula
        (u'실용.과학', 'http://www.sisainlive.com/rss/S1N6.xml'),  # Practical / Science
        (u'휴먼&휴', 'http://www.sisainlive.com/rss/S1N19.xml'),  # Human & leisure (approximate translation)
        (u'인터뷰.오피니언', 'http://www.sisainlive.com/rss/S1N5.xml'),  # Interview / Opinion
        (u'사진.만화', 'http://www.sisainlive.com/rss/S1N7.xml'),  # Photo / Cartoon
        (u'별책부록', 'http://www.sisainlive.com/rss/S1N14.xml'),  # Supplement
    ]