#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function

__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
www.todayszaman.com
'''

import re

from calibre.web.feeds.recipes import BasicNewsRecipe

# urljoin moved to urllib.parse in python 3; fall back for python 2
try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin


class TodaysZaman(BasicNewsRecipe):

    title = u'Todays Zaman'
    __author__ = u'spswerling'
    description = 'English version of the Turkish daily "Zaman"'
    max_articles_per_feed = 100
    encoding = 'utf-8'
    category = 'news'
    language = 'en_TR'
    publication_type = 'newspaper'
    cover_img_url = 'http://medya.todayszaman.com/todayszaman/images/logo/todays_yenilogo.bmp'  # yep, bmp
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # on kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    oldest_article = 1.5
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (kindle touch: 600x800)
    useHighResImages = False

    # (feed title, path under www.todayszaman.com) pairs; see parse_index()
    sections = [
        (u'Columnists', u'columnists'),
        (u'Opinion', u'op-ed'),
        (u'World', u'world'),
        (u'National', u'national'),
        (u'Diplomacy', u'diplomacy'),
        (u'Business', u'business'),
    ]

    # util for creating keep_only_tags and remove_tags style regex matchers
    def tag_matcher(elt, attr, pattern):
        return dict(name=elt, attrs={attr: re.compile(pattern, re.IGNORECASE)})

    keep_only_tags = [
        tag_matcher('div', 'class', '^pageNewsDetailContainer$'),
        tag_matcher('div', 'class', '^pageColumnistDetailContainer$'),
    ]
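
    # For illustration, tag_matcher('div', 'class', '^logo$') evaluates to
    #   {'name': 'div', 'attrs': {'class': re.compile('^logo$', re.IGNORECASE)}},
    # the dict form BasicNewsRecipe accepts in keep_only_tags / remove_tags.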

    remove_tags = [
        tag_matcher('div', 'class', 'DetailKeyword'),
        tag_matcher('div', 'class', 'MainContentSocial'),
        tag_matcher('div', 'class', 'SocialNetwork'),
        tag_matcher('div', 'class', 'DetailLeftOther'),
        tag_matcher('div', 'class', 'RelatedNews'),
        tag_matcher('div', 'class', '^topMenuWrapper$'),
        tag_matcher('div', 'class', '^logo$'),
        tag_matcher('a', 'class', 'cf_email'),
    ]

    # section title -> list of article dicts, filled in by parse_section()
    articles = {}

    def parse_index(self):
        for (sect_title, sect_uri) in self.sections:
            self.parse_section(sect_title, sect_uri)

        ans = []
        for k in self.articles:
            ans.append((k, self.articles[k]))
        return ans
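
    # parse_index() returns the (feed title, article list) pairs that
    # BasicNewsRecipe expects; sections that stayed empty are dropped later
    # because remove_empty_feeds is set above.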

    def parse_section(self, sect_title, sect_uri):
        url = 'http://www.todayszaman.com/' + sect_uri
        print('Start section ' + sect_title + ', ' + url)
        try:
            soup = self.index_to_soup(url)
        except Exception:
            return

        # Find each article link in the section's listing page
        for div in soup.findAll('div'):
            div_class = div.get('class')
            if div_class:
                if div_class in ['pageColumnistsMainContent',
                                 'pageCategoryContainer']:
                    for link in div.findAll('a', href=True):
                        self.process_link(sect_title, div_class, link)

        print('Finished section: ' + sect_title)
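
    # The two listing layouts are handled differently below: columnist pages
    # put the headline in an <h2> inside the link, while category pages use a
    # 'pageCategoryTopTitle' node, sibling <h3>/<h4> nodes, or the link text.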

    def process_link(self, section_title, layout, link):
        def p(s):
            print('[PROCESS LINK] ' + s[0:80])

        href = link['href']
        full_href = urljoin('http://www.todayszaman.com/', href)
        next_sib = link.nextSibling
        child_h2 = link.find('h2')
        link_text = self.tag_to_string(link).strip()
        title_node = None

        if layout in ['pageColumnistsMainContent']:
            if child_h2:
                title_node = child_h2
            else:
                return
        elif layout in ['pageCategoryContainer']:
            top_title = link.find(attrs={'class': 'pageCategoryTopTitle'})
            if top_title:
                title_node = top_title
            elif (not link_text) and (next_sib and next_sib.find('h4')):
                title_node = next_sib.find('h4')
            elif (not link_text) and (next_sib and next_sib.find('h3')):
                title_node = next_sib.find('h3')
            elif link_text:
                title_node = link

        if title_node:
            title = self.tag_to_string(title_node)
            self.queue_article_link(section_title, full_href, title)

    def queue_article_link(self, section, url, title):
        if section not in self.articles:
            self.articles[section] = []
        self.articles[section].append(
            dict(title=title,
                 url=url,
                 date='',
                 description='',
                 author='',
                 content=''))
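
    # The dict above uses the standard calibre article keys (title, url,
    # date, description, content); parse_index() hands these lists back to
    # the download framework unchanged.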

    def populate_article_metadata(self, article, soup, first):

        def p(s):
            print('[POPULATE METADATA] ' + s[0:80])

        tnode = soup.find('title')
        if tnode:
            tstring = self.tag_to_string(tnode)
            if ' - ' in tstring:
                author = tstring.split('-')[0]
                if author:
                    article.author = author
                    article.title = author + ' - ' + article.title.strip()
                    p('Add author to title: ' + author)

        # known matches: pageNewsDetailDate, pageColumnistDetailLeftDate
        regex = re.compile('(DetailDate|DetailLeftDate)$', re.IGNORECASE)
        date_node = soup.find('div', {'class': regex})
        if date_node:
            date = self.tag_to_string(date_node).split('/')[0]
            date = ','.join(date.split(',')[:2]).strip()
            article.title = date + ' - ' + article.title.strip()
            article.date = date
            p('Add date to title: ' + date)

        strong = soup.find('strong')
        if strong:
            article.text_summary = self.tag_to_string(strong)
            p('Summary: ' + article.text_summary)
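
    # populate_article_metadata() is the BasicNewsRecipe hook calibre calls
    # for each downloaded article page; `first` is True for the first page
    # of a multi-page article.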

    # Debug helper: render a node's class and text for log messages.
    def _dbg_soup_node(self, node):
        s = ' cls: ' + node.get('class').__str__().strip() + \
            ' txt: ' + self.tag_to_string(node).strip()
        return s
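
# A quick way to test changes to this recipe, assuming a calibre install and
# that this file is saved as todays_zaman.recipe:
#   ebook-convert todays_zaman.recipe output.epub --test -vv
# The --test flag limits the download to a couple of articles per feed.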