calibre/recipes/todays_zaman.recipe

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
www.todayszaman.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
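# urljoin moved to urllib.parse in python 3; fall back to the python 2 name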
try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin


class TodaysZaman(BasicNewsRecipe):
    title = u'Todays Zaman'
    __author__ = u'spswerling'
    description = 'English edition of the Turkish daily "Zaman"'
    max_articles_per_feed = 100
    encoding = 'utf-8'
    category = 'news'
    language = 'en_TR'
    publication_type = 'newspaper'
    cover_img_url = 'http://medya.todayszaman.com/todayszaman/images/logo/todays_yenilogo.bmp'  # yep, bmp
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # on kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    oldest_article = 1.5
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (kindle touch: 600x800)
    useHighResImages = False

    sections = [
        (u'Columnists', u'columnists'),
        (u'Opinion', u'op-ed'),
        (u'World', u'world'),
        (u'National', u'national'),
        (u'Diplomacy', u'diplomacy'),
        (u'Business', u'business'),
    ]
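    # each tuple is (feed title shown in the e-book, URL path segment under
    # todayszaman.com); parse_section() below fetches one index page per tuple.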

    # util for creating remove_tags and keep_only_tags style regex matchers
    def tag_matcher(elt, attr, pattern):
        return dict(name=elt, attrs={attr: re.compile(pattern, re.IGNORECASE)})
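    # e.g. tag_matcher('div', 'class', '^logo$') yields
    # {'name': 'div', 'attrs': {'class': re.compile('^logo$', re.IGNORECASE)}},
    # the attribute-matcher dict form that keep_only_tags/remove_tags accept.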

    keep_only_tags = [
        tag_matcher('div', 'class', '^pageNewsDetailContainer$'),
        tag_matcher('div', 'class', '^pageColumnistDetailContainer$'),
    ]

    remove_tags = [
        tag_matcher('div', 'class', 'DetailKeyword'),
        tag_matcher('div', 'class', 'MainContentSocial'),
        tag_matcher('div', 'class', 'SocialNetwork'),
        tag_matcher('div', 'class', 'DetailLeftOther'),
        tag_matcher('div', 'class', 'RelatedNews'),
        tag_matcher('div', 'class', '^topMenuWrapper$'),
        tag_matcher('div', 'class', '^logo$'),
        tag_matcher('a', 'class', 'cf_email'),
    ]

    articles = {}
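
    # articles maps a section title to its list of article dicts; it is
    # filled by parse_section()/queue_article_link(), and its items become
    # the (title, articles) pairs that parse_index() returns to calibre.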
    def parse_index(self):
        for (sect_title, sect_uri) in self.sections:
            self.parse_section(sect_title, sect_uri)
        ans = []
        for k in self.articles:
            ans.append((k, self.articles[k]))
        return ans

    def parse_section(self, sect_title, sect_uri):
        url = 'http://www.todayszaman.com/' + sect_uri
        print('Start section ' + sect_title + ', ' + url)
        try:
            soup = self.index_to_soup(url)
        except Exception:
            return
        # Find each article
        for div in soup.findAll('div'):
            div_class = div.get('class')
            if div_class:
                if div_class in ['pageColumnistsMainContent',
                                 'pageCategoryContainer']:
                    # print('  DIVCLASS ' + div_class)
                    for link in div.findAll('a', href=True):
                        self.process_link(sect_title, div_class, link)
        print('Finished section: ' + sect_title)
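
    # the two index layouts put the headline in different nodes; try the
    # likely spots (an inner h2, a 'pageCategoryTopTitle' node, a sibling
    # h3/h4, or the link text itself) until one yields a title.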
    def process_link(self, section_title, layout, link):
        def p(s):
            print('[PROCESS LINK] ' + s[0:80])
        href = link['href']
        full_href = urljoin('http://www.todayszaman.com/', href)
        next_sib = link.nextSibling
        child_h2 = link.find('h2')
        link_text = self.tag_to_string(link).strip()
        title_node = None
        if layout in ['pageColumnistsMainContent']:
            if child_h2:
                title_node = child_h2
            else:
                return
        elif layout in ['pageCategoryContainer']:
            top_title = link.find(attrs={'class': 'pageCategoryTopTitle'})
            if top_title:
                title_node = top_title
            elif (not link_text) and (next_sib and next_sib.find('h4')):
                title_node = next_sib.find('h4')
            elif (not link_text) and (next_sib and next_sib.find('h3')):
                title_node = next_sib.find('h3')
            elif link_text:
                title_node = link
        if title_node:
            title = self.tag_to_string(title_node)
            # print('  BING: ' + href + ', ' + title)
            self.queue_article_link(section_title, full_href, title)

    def queue_article_link(self, section, url, title):
        if section not in self.articles:
            self.articles[section] = []
        self.articles[section].append(
            dict(title=title,
                 url=url,
                 date='',
                 description='',
                 author='',
                 content=''))
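
    # calibre calls this after it downloads each article page; use the page
    # itself to back-fill the author, date and summary left blank above.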
    def populate_article_metadata(self, article, soup, first):
        def p(s):
            print('[POPULATE METADATA] ' + s[0:80])
        tnode = soup.find('title')
        if tnode:
            tstring = self.tag_to_string(tnode)
            if ' - ' in tstring:
                author = tstring.split(' - ')[0].strip()
                if author:
                    article.author = author
                    article.title = author + ' - ' + article.title.strip()
                    p('Add author to title: ' + author)
        # known matches: pageNewsDetailDate, pageColumnistDetailLeftDate
        regex = re.compile('(DetailDate|DetailLeftDate)$', re.IGNORECASE)
        date_node = soup.find('div', {'class': regex})
        if date_node:
            date = self.tag_to_string(date_node).split('/')[0]
            date = ','.join(date.split(',')[:2]).strip()
            article.title = date + ' - ' + article.title.strip()
            article.date = date
            p('Add date to title: ' + date)
        strong = soup.find('strong')
        if strong:
            article.text_summary = self.tag_to_string(strong)
            p('Summary: ' + article.text_summary)

    def _dbg_soup_node(self, node):
        s = ' cls: ' + str(node.get('class')).strip() + \
            ' txt: ' + self.tag_to_string(node).strip()
        return s
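
# To smoke-test the recipe from a checkout (assuming calibre's command-line
# tools are installed), something like this should work; --test fetches only
# a couple of articles per feed:
#   ebook-convert todays_zaman.recipe out.epub --test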