diff --git a/recipes/chosun.recipe b/recipes/chosun.recipe new file mode 100644 index 0000000000..b3952e0670 --- /dev/null +++ b/recipes/chosun.recipe @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL v3' +__copyright__ = '2015, Hoje Lee ' +''' +Profile to download Chosun.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class ChosunDotcom(BasicNewsRecipe): + language = 'ko' + title = u'조선일보' + description = u'조선닷컴 기사' + __author__ = 'Hoje Lee' + oldest_article = 7 + max_articles_per_feed = 10 + auto_cleanup = True + + feeds = [ + (u'정치', 'http://www.chosun.com/site/data/rss/politics.xml'), + (u'조선비즈','http://biz.chosun.com/site/data/rss/rss.xml'), + (u'사회', 'http://www.chosun.com/site/data/rss/national.xml'), + (u'문화', 'http://www.chosun.com/site/data/rss/culture.xml'), + (u'국제', 'http://www.chosun.com/site/data/rss/international.xml'), + (u'오피니언','http://www.chosun.com/site/data/rss/editorials.xml'), + (u'스포츠', 'http://www.chosun.com/site/data/rss/sports.xml'), + (u'연예', 'http://www.chosun.com/site/data/rss/ent.xml'), + ] diff --git a/recipes/hankyoreh.recipe b/recipes/hankyoreh.recipe index 7da23fa0db..03f8346f36 100644 --- a/recipes/hankyoreh.recipe +++ b/recipes/hankyoreh.recipe @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- __license__ = 'GPL v3' __copyright__ = '2011, Seongkyoun Yoo ' ''' @@ -6,42 +7,34 @@ Profile to download The Hankyoreh from calibre.web.feeds.news import BasicNewsRecipe class Hankyoreh(BasicNewsRecipe): - title = u'Hankyoreh' language = 'ko' + title = u'한겨레' description = u'The Hankyoreh News articles' __author__ = 'Seongkyoun Yoo' - oldest_article = 5 - recursions = 1 - max_articles_per_feed = 5 - no_stylesheets = True + oldest_article = 7 + max_articles_per_feed = 10 + no_stylesheets = True + remove_javascript = True + keep_only_tags = [ - dict(name='tr', attrs={'height':['60px']}), - dict(id=['fontSzArea']) + dict(name='div', attrs ={'class':['article-head']}), + dict(name='div', attrs 
={'class':['article-text']}), ] - remove_tags = [ - dict(target='_blank'), - dict(name='td', attrs={'style':['padding: 10px 8px 5px 8px;']}), - dict(name='iframe', attrs={'width':['590']}), - ] - remove_tags_after = [ - dict(target='_top') - ] + remove_tags = [ + dict(name='p', attrs={'class':['category']}), + ] + remove_tags_after = dict(id={'ad_box01'}) + feeds = [ - ('All News','http://www.hani.co.kr/rss/'), - ('Politics','http://www.hani.co.kr/rss/politics/'), - ('Economy','http://www.hani.co.kr/rss/economy/'), - ('Society','http://www.hani.co.kr/rss/society/'), - ('International','http://www.hani.co.kr/rss/international/'), - ('Culture','http://www.hani.co.kr/rss/culture/'), - ('Sports','http://www.hani.co.kr/rss/sports/'), - ('Science','http://www.hani.co.kr/rss/science/'), - ('Opinion','http://www.hani.co.kr/rss/opinion/'), - ('Cartoon','http://www.hani.co.kr/rss/cartoon/'), - ('English Edition','http://www.hani.co.kr/rss/english_edition/'), - ('Specialsection','http://www.hani.co.kr/rss/specialsection/'), - ('Hanionly','http://www.hani.co.kr/rss/hanionly/'), - ('Hkronly','http://www.hani.co.kr/rss/hkronly/'), - ('Multihani','http://www.hani.co.kr/rss/multihani/'), - ('Lead','http://www.hani.co.kr/rss/lead/'), - ('Newsrank','http://www.hani.co.kr/rss/newsrank/'), + #(u'전체기사', 'http://www.hani.co.kr/ilram/rss/hkr_news_list_all.xml'), + (u'정치', 'http://www.hani.co.kr/ilram/rss/hkr_news_list_politics.xml'), + #(u'경제', 'http://www.hani.co.kr/rss/economy/'), + (u'사회', 'http://www.hani.co.kr/ilram/rss/hkr_news_list_society.xml'), + #(u'국제', 'http://www.hani.co.kr/rss/international/'), + (u'문화', 'http://www.hani.co.kr/ilram/rss/hkr_news_list_culture.xml'), + (u'스포츠', 'http://www.hani.co.kr/ilram/rss/hkr_news_list_sports.xml'), + #(u'과학', 'http://www.hani.co.kr/rss/science/'), + (u'사설·칼럼','http://www.hani.co.kr/ilram/rss/hkr_news_list_opinion.xml'), + (u'만화만평', 'http://www.hani.co.kr/ilram/rss/hkr_news_list_cartoon.xml'), + 
#(u'한겨레섹션','http://www.hani.co.kr/rss/specialsection/'), ] diff --git a/recipes/hankyoreh21.recipe b/recipes/hankyoreh21.recipe index 8e633b4ebe..6724df10f8 100644 --- a/recipes/hankyoreh21.recipe +++ b/recipes/hankyoreh21.recipe @@ -9,17 +9,30 @@ class Hankyoreh21(BasicNewsRecipe): title = u'Hankyoreh21' language = 'ko' description = u'The Hankyoreh21 Magazine articles' - __author__ = 'Seongkyoun Yoo' - oldest_article = 20 - recursions = 1 - max_articles_per_feed = 120 - no_stylesheets = True + __author__ = 'Seongkyoun Yoo' + oldest_article = 30 + max_articles_per_feed = 10 + no_stylesheets = True remove_javascript = True keep_only_tags = [ - dict(name='font', attrs={'class':'t18bk'}), - dict(id=['fontSzArea']) + dict(name='header', attrs ={'class':['article_head']}), + dict(name='div', attrs ={'class':['article_body']}), ] + remove_tags = [ + dict(name='div', attrs ={'class':['article_tools']}), + ] feeds = [ - ('Hani21','http://h21.hani.co.kr/rss/ '), + #('전체기사', 'http://h21.hani.co.kr/rss/'), + (u'표지이야기','http://h21.hani.co.kr/rss/cover/'), + (u'특집', 'http://h21.hani.co.kr/rss/special/'), + (u'정치', 'http://h21.hani.co.kr/rss/politics/'), + (u'경제', 'http://h21.hani.co.kr/rss/economy/'), + (u'사회', 'http://h21.hani.co.kr/rss/society/'), + (u'세계', 'http://h21.hani.co.kr/rss/world/'), + (u'문화', 'http://h21.hani.co.kr/rss/culture/'), ] + + def get_article_url(self, article): + org_url = BasicNewsRecipe.get_article_url(self, article) + return "http://h21.hani.co.kr"+org_url if org_url and org_url[0]=='/' else org_url # link may be None or empty diff --git a/recipes/joongang.recipe b/recipes/joongang.recipe new file mode 100644 index 0000000000..487e1bfe23 --- /dev/null +++ b/recipes/joongang.recipe @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL v3' +__copyright__ = '2015, Hoje Lee ' +''' +Profile to download Joongang Ilbo +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class JoongangIlbo(BasicNewsRecipe): + language = 'ko' + title = u'중앙일보' + description = u'중앙일보 신문 기사' + __author__ = 
'Hoje Lee' + oldest_article = 7 + max_articles_per_feed = 5 + auto_cleanup = True + + feeds = [ + #(u'전체기사', 'http://rss.joins.com/joins_news_list.xml'), + (u'주요기사', 'http://rss.joins.com/joins_homenews_list.xml'), + #(u'경제', 'http://rss.joins.com/joins_money_list.xml'), + #(u'사회', 'http://rss.joins.com/joins_life_list.xml'), + #(u'정치', 'http://rss.joins.com/joins_politics_list.xml'), + ### Most-viewed news + (u'전체기사', 'http://rss.joins.com/sonagi/joins_sonagi_total_list.xml'), + (u'경제', 'http://rss.joins.com/sonagi/joins_sonagi_money_list.xml'), + (u'스포츠', 'http://rss.joins.com/sonagi/joins_sonagi_sports_list.xml'), + (u'연예', 'http://rss.joins.com/sonagi/joins_sonagi_star_list.xml'), + (u'사회', 'http://rss.joins.com/sonagi/joins_sonagi_life_list.xml'), + (u'정치', 'http://rss.joins.com/sonagi/joins_sonagi_politics_list.xml'), + (u'지구촌', 'http://rss.joins.com/sonagi/joins_sonagi_world_list.xml'), + (u'IT과학', 'http://rss.joins.com/sonagi/joins_sonagi_it_list.xml'), + (u'사설', 'http://rss.joins.com/sonagi/joins_sonagi_opinion_list.xml'), + ] diff --git a/recipes/kyungyhang.recipe b/recipes/kyungyhang.recipe index 7fe5b88612..e4f751fd9a 100644 --- a/recipes/kyungyhang.recipe +++ b/recipes/kyungyhang.recipe @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- __license__ = 'GPL v3' __copyright__ = '2011, Seongkyoun Yoo ' ''' @@ -5,26 +6,34 @@ Profile to download The Kyungyhang ''' from calibre.web.feeds.news import BasicNewsRecipe +import re class Kyungyhang(BasicNewsRecipe): title = u'Kyungyhang' language = 'ko' description = u'The Kyungyhang Shinmun articles' __author__ = 'Seongkyoun Yoo' - oldest_article = 20 - recursions = 2 - max_articles_per_feed = 20 - no_stylesheets = True + oldest_article = 7 + max_articles_per_feed = 10 + no_stylesheets = True remove_javascript = True + preprocess_regexps = [ + (re.compile("
", re.DOTALL|re.IGNORECASE), lambda match: ''), + ] + keep_only_tags = [ dict(name='div', attrs ={'class':['article_title_wrap']}), + dict(name='div', attrs ={'class':['viewHeader']}), dict(name='span', attrs ={'class':['article_txt']}) ] remove_tags_after = dict(id={'sub_bottom'}) remove_tags = [ + dict(name='div', attrs={'class':['widget_top_dable']}), + dict(name='div', attrs={'class':['article_bottom_ad']}), + dict(name='div', attrs={'class':['article_date']}), dict(name='iframe'), dict(id={'TdHot'}), dict(name='div', attrs={'class':['btn_list','bline','linebottom','bestArticle']}), @@ -33,5 +42,14 @@ class Kyungyhang(BasicNewsRecipe): ] feeds = [ - ('All News','http://www.khan.co.kr/rss/rssdata/total_news.xml'), + #(u'전체기사','http://www.khan.co.kr/rss/rssdata/total_news.xml'), + (u'정치', 'http://www.khan.co.kr/rss/rssdata/politic_news.xml'), + (u'경제', 'http://www.khan.co.kr/rss/rssdata/economy_news.xml'), + (u'사회', 'http://www.khan.co.kr/rss/rssdata/society_news.xml'), + (u'세계', 'http://www.khan.co.kr/rss/rssdata/kh_world.xml'), + (u'스포츠', 'http://www.khan.co.kr/rss/rssdata/kh_sports.xml'), + (u'문화', 'http://www.khan.co.kr/rss/rssdata/culture_news.xml'), + (u'연예', 'http://www.khan.co.kr/rss/rssdata/kh_entertainment.xml'), + (u'IT', 'http://www.khan.co.kr/rss/rssdata/it_news.xml'), + (u'오피니언','http://www.khan.co.kr/rss/rssdata/opinion_news.xml'), ] diff --git a/recipes/maekyung.recipe b/recipes/maekyung.recipe new file mode 100644 index 0000000000..d29af3ec3c --- /dev/null +++ b/recipes/maekyung.recipe @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL v3' +__copyright__ = '2015, Hoje Lee ' +''' +Profile to download Maeil Business +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class MaeilBusiness(BasicNewsRecipe): + language = 'ko' + title = u'매일경제' + description = u'매일경제 신문 기사' + __author__ = 'Hoje Lee' + oldest_article = 7 + max_articles_per_feed = 10 + auto_cleanup = True + + feeds = [ + (u'헤드라인', 
'http://file.mk.co.kr/news/rss/rss_30000001.xml'), + #(u'전체뉴스', 'http://file.mk.co.kr/news/rss/rss_40300001.xml'), + (u'경제', 'http://file.mk.co.kr/news/rss/rss_30100041.xml'), + (u'정치', 'http://file.mk.co.kr/news/rss/rss_30200030.xml'), + (u'사회', 'http://file.mk.co.kr/news/rss/rss_50400012.xml'), + (u'국제', 'http://file.mk.co.kr/news/rss/rss_30300018.xml'), + (u'기업ㆍ경영','http://file.mk.co.kr/news/rss/rss_50100032.xml'), + (u'증권', 'http://file.mk.co.kr/news/rss/rss_50200011.xml'), + (u'부동산', 'http://file.mk.co.kr/news/rss/rss_50300009.xml'), + (u'문화ㆍ연예','http://file.mk.co.kr/news/rss/rss_30000023.xml'), + (u'패션', 'http://file.mk.co.kr/news/rss/rss_72000001.xml'), + (u'스포츠', 'http://file.mk.co.kr/news/rss/rss_71000001.xml'), + (u'게임', 'http://file.mk.co.kr/news/rss/rss_50700001.xml'), + (u'오피니언', 'http://file.mk.co.kr/news/rss/rss_30500041.xml'), + ] diff --git a/recipes/sisainlive.recipe b/recipes/sisainlive.recipe new file mode 100644 index 0000000000..ccdd2820e0 --- /dev/null +++ b/recipes/sisainlive.recipe @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL v3' +__copyright__ = '2015, Hoje Lee ' +''' +Profile to download SisaIN Live +''' +from calibre.web.feeds.news import BasicNewsRecipe + +class SisaINLive(BasicNewsRecipe): + language = 'ko' + title = u'시사인 라이브' + description = u'시사인 라이브 기사' + __author__ = 'Hoje Lee' + oldest_article = 30 + max_articles_per_feed = 10 + auto_cleanup = True + """ + # manual cleanup + no_stylesheets = True + remove_javascript = True + + keep_only_tags = [ + dict(name='div', attrs ={'class':['View_Title']}), + dict(name='div', attrs ={'class':['View_Info']}), + dict(name='div', attrs ={'class':['View_Time']}), + dict(id='articleBody'), + ] + remove_tags = [ + dict(name='table', attrs ={'width':['320'], 'height':['265']}), + ] + """ + + feeds = [ + #(u'전체기사', 'http://www.sisainlive.com/rss.xml'), + (u'인기기사', 'http://www.sisainlive.com/rss/clickTop.xml'), + (u'커버스토리','http://www.sisainlive.com/rss/SRN121.xml'), + (u'특집', 
'http://www.sisainlive.com/rss/SRN122.xml'), + (u'정치', 'http://www.sisainlive.com/rss/S1N15.xml'), + (u'경제', 'http://www.sisainlive.com/rss/S1N16.xml'), + (u'사회', 'http://www.sisainlive.com/rss/S1N17.xml'), + (u'문화', 'http://www.sisainlive.com/rss/S1N18.xml'), + (u'국제.한반도','http://www.sisainlive.com/rss/S1N4.xml'), + (u'실용.과학', 'http://www.sisainlive.com/rss/S1N6.xml'), + (u'휴먼&휴', 'http://www.sisainlive.com/rss/S1N19.xml'), + (u'인터뷰.오피니언','http://www.sisainlive.com/rss/S1N5.xml'), + (u'사진.만화', 'http://www.sisainlive.com/rss/S1N7.xml'), + (u'별책부록', 'http://www.sisainlive.com/rss/S1N14.xml'), + ]