import re
from datetime import date, timedelta
from calibre.web.feeds.recipes import BasicNewsRecipe


class MediaDaumRecipe(BasicNewsRecipe):
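    # A calibre news-download recipe. To try it locally, something like
    #   ebook-convert media_daum.recipe out.epub --test
    # (the file name here is illustrative) fetches a reduced article set.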
title = u'\uBBF8\uB514\uC5B4 \uB2E4\uC74C \uC624\uB298\uC758 \uC8FC\uC694 \uB274\uC2A4'
description = 'Articles from media.daum.net'
__author__ = 'trustin'
language = 'ko'
max_articles = 100
timefmt = ''
masthead_url = 'http://img-media.daum-img.net/2010ci/service_news.gif'
cover_margins = (18, 18, 'grey99')
no_stylesheets = True
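    # Keep only the article body: GS_con is the story container, so everything
    # before and after it is dropped.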
remove_tags_before = dict(id='GS_con')
remove_tags_after = dict(id='GS_con')
remove_tags = [dict(attrs={'class': [
'bline',
'GS_vod',
]}),
dict(id=[
'GS_swf_poll',
'ad250',
]),
dict(name=['script', 'noscript', 'style', 'object'])]
    preprocess_regexps = [
        # Escape stray '<' characters that are followed by whitespace, so the
        # parser does not treat them as tag openers.
        (re.compile(r'<\s+', re.DOTALL | re.IGNORECASE),
         lambda match: '&lt; '),
        # Collapse runs of three or more <br> tags down to two.
        (re.compile(r'(<br[^>]*>[ \t\r\n]*){3,}', re.DOTALL | re.IGNORECASE),
         lambda match: '<br><br>'),
        # Strip <br> runs that directly precede a closing tag.
        (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</div>', re.DOTALL | re.IGNORECASE),
         lambda match: '</div>'),
        (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</p>', re.DOTALL | re.IGNORECASE),
         lambda match: '</p>'),
        (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</td>', re.DOTALL | re.IGNORECASE),
         lambda match: '</td>'),
        (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</strong>', re.DOTALL | re.IGNORECASE),
         lambda match: '</strong>'),
        (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</b>', re.DOTALL | re.IGNORECASE),
         lambda match: '</b>'),
        (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</em>', re.DOTALL | re.IGNORECASE),
         lambda match: '</em>'),
        (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</i>', re.DOTALL | re.IGNORECASE),
         lambda match: '</i>'),
        # Drop everything after the end-of-article marker '(\uB05D)'.
        (re.compile(r'\(\uB05D\)[ \t\r\n]*<br[^>]*>.*', re.DOTALL | re.IGNORECASE),
         lambda match: ''),
        # Strip <br> runs that directly follow an opening tag.
        (re.compile(r'(<div(?:\s[^>]*)?>)(<br[^>]*>[ \t\r\n]*)*', re.DOTALL | re.IGNORECASE),
         lambda match: match.group(1)),
        (re.compile(r'(<p(?:\s[^>]*)?>)(<br[^>]*>[ \t\r\n]*)*', re.DOTALL | re.IGNORECASE),
         lambda match: match.group(1)),
        (re.compile(r'(<td(?:\s[^>]*)?>)(<br[^>]*>[ \t\r\n]*)*', re.DOTALL | re.IGNORECASE),
         lambda match: match.group(1)),
        (re.compile(r'(<strong(?:\s[^>]*)?>)(<br[^>]*>[ \t\r\n]*)*', re.DOTALL | re.IGNORECASE),
         lambda match: match.group(1)),
        # Drop trailing reporter/copyright footers such as '[\u24D2 ... \uAE30\uC0AC ...]'
        # and anything that follows them.
        (re.compile(u'(<br[^>]*>[ \t\r\n]*)*(\u25B6|\u25CF|\u261E|\u24D2|\(c\))*\[[^\]]*(\u24D2|\(c\)|\uAE30\uC0AC|\uC778\uAE30[^\]]*\uB274\uC2A4)[^\]]*\].*', re.DOTALL | re.IGNORECASE),  # noqa
         lambda match: ''),
    ]
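
    # Build a single-section index from the portal's daily headline listings:
    # today's pages first, then yesterday's, until max_articles is reached.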
def parse_index(self):
today = date.today()
articles = []
articles = self.parse_list_page(articles, today)
articles = self.parse_list_page(articles, today - timedelta(1))
return [('\uBBF8\uB514\uC5B4 \uB2E4\uC74C \uC624\uB298\uC758 \uC8FC\uC694 \uB274\uC2A4', articles)]
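
    # Scrape one day's paginated headline list (up to 9 pages). Each entry is
    # a <dl> whose <dt class="tit"> holds the linked title and whose
    # <dd class="txt"> holds the summary.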
    def parse_list_page(self, articles, day):
if len(articles) >= self.max_articles:
return articles
for page in range(1, 10):
            soup = self.index_to_soup('http://media.daum.net/primary/total/list.html?cateid=100044&date=%(date)s&page=%(page)d' % {
                'date': day.strftime('%Y%m%d'), 'page': page})
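            # Assume this is the last page unless the article cap has still
            # not been reached after consuming every <dl> item below.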
done = True
for item in soup.findAll('dl'):
dt = item.find('dt', {'class': 'tit'})
dd = item.find('dd', {'class': 'txt'})
if dt is None:
break
a = dt.find('a', href=True)
url = 'http://media.daum.net/primary/total/' + a['href']
title = self.tag_to_string(dt)
if dd is None:
description = ''
else:
description = self.tag_to_string(dd)
articles.append(
dict(title=title, description=description, url=url, content=''))
done = len(articles) >= self.max_articles
if done:
break
if done:
break
return articles
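
    # Article pages need only anchor stripping beyond the regexp cleanups above.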
def preprocess_html(self, soup):
return self.strip_anchors(soup)
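
    # Replace every <a> tag that does not wrap an image with its plain
    # contents, so article bodies are not littered with hyperlinks.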
def strip_anchors(self, soup):
for para in soup.findAll(True):
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(
a.renderContents().decode('utf-8', 'replace'))
return soup