From 36364192c36efbbe198c36969e8716f33c1bba06 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 22 Jul 2010 13:10:57 -0600
Subject: [PATCH] daum.net by trustin
---
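Note: a quick way to exercise the recipe before merging, assuming a local calibre
install with ebook-convert on the PATH (output filename is arbitrary):

    ebook-convert daum_net.recipe daum_net.epub --test -vv
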
resources/recipes/daum_net.recipe | 112 ++++++++++++++++++++++++++++++
src/calibre/utils/resources.py | 2 +-
2 files changed, 113 insertions(+), 1 deletion(-)
create mode 100644 resources/recipes/daum_net.recipe
diff --git a/resources/recipes/daum_net.recipe b/resources/recipes/daum_net.recipe
new file mode 100644
index 0000000000..68ed574b61
--- /dev/null
+++ b/resources/recipes/daum_net.recipe
@@ -0,0 +1,112 @@
+import re
+from datetime import date, timedelta
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class MediaDaumRecipe(BasicNewsRecipe):
+ title = u'\uBBF8\uB514\uC5B4 \uB2E4\uC74C \uC624\uB298\uC758 \uC8FC\uC694 \uB274\uC2A4'
+ description = 'Articles from media.daum.net'
+ __author__ = 'trustin'
+ language = 'ko'
+ max_articles = 100
+
+ timefmt = ''
+ masthead_url = 'http://img-media.daum-img.net/2010ci/service_news.gif'
+ cover_margins = (18,18,'grey99')
+ no_stylesheets = True
+ remove_tags_before = dict(id='GS_con')
+ remove_tags_after = dict(id='GS_con')
+    remove_tags = [dict(attrs={'class':[
+                       'bline',
+                       'GS_vod',
+                       ]}),
+                   dict(id=[
+                       'GS_swf_poll',
+                       'ad250',
+                       ]),
+                   dict(name=['script', 'noscript', 'style', 'object'])]
+    preprocess_regexps = [
+        # Escape stray '<' characters so they are not parsed as broken tags.
+        (re.compile(r'<\s+', re.DOTALL|re.IGNORECASE),
+            lambda match: '&lt; '),
+        # Remove runs of three or more consecutive <br> tags.
+        (re.compile(r'(<br[^>]*>[ \t\r\n]*){3,}', re.DOTALL|re.IGNORECASE),
+            lambda match: ''),
+        # Drop the Korean end-of-article marker '(\uB05D)' and everything after it.
+        (re.compile(u'\(\uB05D\)[ \t\r\n]*<br[^>]*>.*', re.DOTALL|re.IGNORECASE),
+            lambda match: ''),
+        # Drop trailing bracketed footers: copyright marks, reporter credits
+        # and popular-news links appended to the article body.
+        (re.compile(u'(<br[^>]*>[ \t\r\n]*)*(\u25B6|\u25CF|\u261E|\u24D2|\(c\))*\[[^\]]*(\u24D2|\(c\)|\uAE30\uC0AC|\uC778\uAE30[^\]]*\uB274\uC2A4)[^\]]*\].*', re.DOTALL|re.IGNORECASE),
+            lambda match: ''),
+        ]
+
+ def parse_index(self):
+        today = date.today()
+ articles = []
+ articles = self.parse_list_page(articles, today)
+ articles = self.parse_list_page(articles, today - timedelta(1))
+ return [('\uBBF8\uB514\uC5B4 \uB2E4\uC74C \uC624\uB298\uC758 \uC8FC\uC694 \uB274\uC2A4', articles)]
+
+
+ def parse_list_page(self, articles, date):
+ if len(articles) >= self.max_articles:
+ return articles
+
+ for page in range(1, 10):
+ soup = self.index_to_soup('http://media.daum.net/primary/total/list.html?cateid=100044&date=%(date)s&page=%(page)d' % {'date': date.strftime('%Y%m%d'), 'page': page})
+ done = True
+ for item in soup.findAll('dl'):
+ dt = item.find('dt', { 'class': 'tit' })
+ dd = item.find('dd', { 'class': 'txt' })
+ if dt is None:
+ break
+ a = dt.find('a', href=True)
+ url = 'http://media.daum.net/primary/total/' + a['href']
+ title = self.tag_to_string(dt)
+ if dd is None:
+ description = ''
+ else:
+ description = self.tag_to_string(dd)
+ articles.append(dict(title=title, description=description, url=url, content=''))
+ done = len(articles) >= self.max_articles
+ if done:
+ break
+ if done:
+ break
+ return articles
+
+
+ def preprocess_html(self, soup):
+ return self.strip_anchors(soup)
+
+ def strip_anchors(self, soup):
+ for para in soup.findAll(True):
+ aTags = para.findAll('a')
+ for a in aTags:
+ if a.img is None:
+ a.replaceWith(a.renderContents().decode('utf-8','replace'))
+ return soup
diff --git a/src/calibre/utils/resources.py b/src/calibre/utils/resources.py
index 7f87cb4fc3..dd600eb627 100644
--- a/src/calibre/utils/resources.py
+++ b/src/calibre/utils/resources.py
@@ -47,7 +47,7 @@ class PathResolver(object):
break
if ans is None:
- ans = os.path.join(self.location[0], *path.split('/'))
+ ans = os.path.join(self.locations[0], *path.split('/'))
self.cache[path] = ans