#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function

__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
www.todayszaman.com
'''
import re

from calibre.web.feeds.recipes import BasicNewsRecipe

try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin


class TodaysZaman(BasicNewsRecipe):
    title = u'Todays Zaman'
    __author__ = u'spswerling'
    description = 'English-language edition of the Turkish daily "Zaman"'
    max_articles_per_feed = 100
    encoding = 'utf-8'
    category = 'news'
    language = 'en_TR'
    publication_type = 'newspaper'
    cover_img_url = 'http://medya.todayszaman.com/todayszaman/images/logo/todays_yenilogo.bmp'  # yep, bmp
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # On Kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    oldest_article = 1.5
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (Kindle Touch: 600x800)
    useHighResImages = False

    sections = [
        (u'Columnists', u'columnists'),
        (u'Opinion', u'op-ed'),
        (u'World', u'world'),
        (u'National', u'national'),
        (u'Diplomacy', u'diplomacy'),
        (u'Business', u'business'),
    ]

    # Utility for creating remove_tags and keep_only_tags style regex
    # matchers. ('pattern' rather than 'str', to avoid shadowing the builtin.)
    def tag_matcher(elt, attr, pattern):
        return dict(name=elt, attrs={attr: re.compile(pattern, re.IGNORECASE)})

    keep_only_tags = [
        tag_matcher('div', 'class', '^pageNewsDetailContainer$'),
        tag_matcher('div', 'class', '^pageColumnistDetailContainer$'),
    ]

    remove_tags = [
        tag_matcher('div', 'class', 'DetailKeyword'),
        tag_matcher('div', 'class', 'MainContentSocial'),
        tag_matcher('div', 'class', 'SocialNetwork'),
        tag_matcher('div', 'class', 'DetailLeftOther'),
        tag_matcher('div', 'class', 'RelatedNews'),
        tag_matcher('div', 'class', '^topMenuWrapper$'),
        tag_matcher('div', 'class', '^logo$'),
        tag_matcher('a', 'class', 'cf_email'),
    ]

    articles = {}

    def parse_index(self):
        for (sect_title, sect_uri) in self.sections:
            self.parse_section(sect_title, sect_uri)
        ans = []
        for k in self.articles:
            ans.append((k, self.articles[k]))
        return ans

    def parse_section(self, sect_title, sect_uri):
        url = 'http://www.todayszaman.com/' + sect_uri
        print('Start section ' + sect_title + ', ' + url)
        try:
            soup = self.index_to_soup(url)
        except Exception:
            return

        # Find each article link within the section's container divs.
        for div in soup.findAll('div'):
            div_class = div.get('class')
            if div_class:
                if div_class in ['pageColumnistsMainContent',
                                 'pageCategoryContainer']:
                    # print('  DIVCLASS ' + div_class)
                    for link in div.findAll('a', href=True):
                        self.process_link(sect_title, div_class, link)
        print('Finished section: ' + sect_title)

    def process_link(self, section_title, layout, link):
        def p(s):
            print('[PROCESS LINK] ' + s[0:80])
        href = link['href']
        full_href = urljoin('http://www.todayszaman.com/', href)
        next_sib = link.nextSibling
        child_h2 = link.find('h2')
        link_text = self.tag_to_string(link).strip()
        title_node = None

        # The node carrying the article title depends on the section layout.
        if layout in ['pageColumnistsMainContent']:
            if child_h2:
                title_node = child_h2
            else:
                return
        elif layout in ['pageCategoryContainer']:
            top_title = link.find(attrs={'class': 'pageCategoryTopTitle'})
            if top_title:
                title_node = top_title
            elif (not link_text) and (next_sib and next_sib.find('h4')):
                title_node = next_sib.find('h4')
            elif (not link_text) and (next_sib and next_sib.find('h3')):
                title_node = next_sib.find('h3')
            elif link_text:
                title_node = link

        if title_node:
            title = self.tag_to_string(title_node)
            # print('  BING: ' + href + ', ' + title)
            self.queue_article_link(section_title, full_href, title)

    def queue_article_link(self, section, url, title):
        if section not in self.articles:
            self.articles[section] = []
        self.articles[section].append(
            dict(title=title, url=url, date='',
                 description='', author='', content=''))

    def populate_article_metadata(self, article, soup, first):
        def p(s):
            print('[POPULATE METADATA] ' + s[0:80])

        # Columnist pages put the author's name before ' - ' in the title tag.
        tnode = soup.find('title')
        if tnode:
            tstring = self.tag_to_string(tnode)
            if ' - ' in tstring:
                author = tstring.split('-')[0]
                if author:
                    article.author = author
                    article.title = author + ' - ' + article.title.strip()
                    p('Add author to title: ' + author)

        # Known matches: pageNewsDetailDate, pageColumnistDetailLeftDate
        regex = re.compile('(DetailDate|DetailLeftDate)$', re.IGNORECASE)
        date_node = soup.find('div', {'class': regex})
        if date_node:
            date = str(self.tag_to_string(date_node)).split('/')[0]
            date = ','.join(date.split(',')[:2]).strip()
            article.title = date + ' - ' + article.title.strip()
            article.date = date
            p('Add date to title: ' + date)

        strong = soup.find('strong')
        if strong:
            article.text_summary = self.tag_to_string(strong)
            p('Summary: ' + article.text_summary)

    def _dbg_soup_node(self, node):
        s = ' cls: ' + str(node.get('class')).strip() + \
            ' txt: ' + self.tag_to_string(node).strip()
        return s
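
# One way to try a recipe like this locally (assuming a working calibre
# install; the .recipe and output filenames here are arbitrary) is
# calibre's converter in test mode:
#
#   ebook-convert todays_zaman.recipe todays_zaman.epub --test -vv
#
# --test fetches only a couple of articles per feed, and -vv prints the
# recipe's debug output, including the print() calls above.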