# resources/recipes/daum_net.recipe
#
# NOTE(review): this file was recovered from a transport-mangled patch in
# which every "<tag"-like sequence inside the regex string literals was
# stripped (e.g. "(<br[^>]*>" survived only as "(]*>").  The HTML-tag
# patterns in preprocess_regexps below are a best-effort reconstruction --
# verify against the upstream calibre daum_net.recipe before shipping.
import re
from datetime import date, timedelta

from calibre.web.feeds.recipes import BasicNewsRecipe


class MediaDaumRecipe(BasicNewsRecipe):
    """Fetch the day's major news articles from media.daum.net (Korean).

    Scrapes the paginated "primary/total" list pages for today and,
    if fewer than ``max_articles`` were found, yesterday as backfill.
    """

    # Korean: "Media Daum -- today's major news" (keep the exact literal).
    title = u'\uBBF8\uB514\uC5B4 \uB2E4\uC74C \uC624\uB298\uC758 \uC8FC\uC694 \uB274\uC2A4'
    description = 'Articles from media.daum.net'
    __author__ = 'trustin'
    language = 'ko'
    max_articles = 100  # hard cap across both days' list pages

    timefmt = ''
    masthead_url = 'http://img-media.daum-img.net/2010ci/service_news.gif'
    cover_margins = (18, 18, 'grey99')
    no_stylesheets = True
    # The article body lives in the element with id 'GS_con'; keep only that.
    remove_tags_before = dict(id='GS_con')
    remove_tags_after = dict(id='GS_con')
    remove_tags = [
        dict(attrs={'class': [
            'bline',
            'GS_vod',
        ]}),
        dict(id=[
            'GS_swf_poll',
            'ad250',
        ]),
        dict(name=['script', 'noscript', 'style', 'object']),
    ]
    # Cleanup rules, applied to the raw HTML before parsing:
    #  - escape a stray "<" that is not opening a tag;
    #  - collapse runs of three or more <br> into a double line break;
    #  - drop everything after the Korean end-of-article marker "(끝)";
    #  - drop the trailing bracketed copyright / byline block
    #    (markers: ▶ ● ☞ ⓒ (c), or "기사" / "인기...뉴스" inside [...]).
    # Reconstructed -- original tag text was lost; see NOTE at top of file.
    preprocess_regexps = [
        (re.compile(r'<\s+', re.DOTALL | re.IGNORECASE),
            lambda match: '&lt; '),
        (re.compile(r'(<br[^>]*>[ \t\r\n]*){3,}', re.DOTALL | re.IGNORECASE),
            lambda match: '<br><br>'),
        (re.compile(u'\\(\uB05D\\)[ \t\r\n]*<br[^>]*>.*',
                    re.DOTALL | re.IGNORECASE),
            lambda match: ''),
        (re.compile(u'(<br[^>]*>[ \t\r\n]*)*'
                    u'(\u25B6|\u25CF|\u261E|\u24D2|\\(c\\))*'
                    u'\\[[^\\]]*(\u24D2|\\(c\\)|\uAE30\uC0AC|'
                    u'\uC778\uAE30[^\\]]*\uB274\uC2A4)[^\\]]*\\].*',
                    re.DOTALL | re.IGNORECASE),
            lambda match: ''),
    ]

    def parse_index(self):
        """Build the single feed: today's articles, then yesterday's.

        Returns the one-feed list calibre expects:
        ``[(feed_title, [article_dict, ...])]``.
        """
        today = date.today()
        articles = []
        articles = self.parse_list_page(articles, today)
        articles = self.parse_list_page(articles, today - timedelta(1))
        return [(u'\uBBF8\uB514\uC5B4 \uB2E4\uC74C \uC624\uB298\uC758'
                 u' \uC8FC\uC694 \uB274\uC2A4', articles)]

    def parse_list_page(self, articles, day):
        """Append articles listed for *day* to *articles* and return it.

        Walks up to 9 paginated list pages for the given date; each entry
        is a <dl> whose <dt class="tit"> holds the linked title and whose
        optional <dd class="txt"> holds the summary.  Stops as soon as
        ``self.max_articles`` is reached, or when a page has no entries.
        (Parameter renamed from ``date`` to avoid shadowing datetime.date.)
        """
        if len(articles) >= self.max_articles:
            return articles

        for page in range(1, 10):
            soup = self.index_to_soup(
                'http://media.daum.net/primary/total/list.html'
                '?cateid=100044&date=%(date)s&page=%(page)d'
                % {'date': day.strftime('%Y%m%d'), 'page': page})
            # An empty page ends pagination: if no <dl> items are found,
            # `done` keeps its True value and we stop scanning pages.
            done = True
            for item in soup.findAll('dl'):
                dt = item.find('dt', {'class': 'tit'})
                dd = item.find('dd', {'class': 'txt'})
                if dt is None:
                    break
                a = dt.find('a', href=True)
                url = 'http://media.daum.net/primary/total/' + a['href']
                title = self.tag_to_string(dt)
                description = '' if dd is None else self.tag_to_string(dd)
                articles.append(dict(title=title, description=description,
                                     url=url, content=''))
                done = len(articles) >= self.max_articles
                if done:
                    break
            if done:
                break
        return articles

    def preprocess_html(self, soup):
        # Only post-processing needed is removing anchor wrappers.
        return self.strip_anchors(soup)

    def strip_anchors(self, soup):
        """Replace every text-only <a> with its rendered contents.

        Anchors that wrap an <img> are kept so images still display.
        """
        for para in soup.findAll(True):
            for a in para.findAll('a'):
                if a.img is None:
                    a.replaceWith(
                        a.renderContents().decode('utf-8', 'replace'))
        return soup