added dunyahalleri and dunyahalleri_haftaninozeti

Sukru Alatas 2017-09-04 01:23:11 +03:00
parent 08471c12a4
commit 85a0304bf6
2 changed files with 465 additions and 0 deletions

recipes/dunyahalleri.recipe (new file, 199 lines)

@@ -0,0 +1,199 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""www.dunyahalleri.com"""
import locale
import os
import re
import time
from shutil import copyfile
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.recipes import BasicNewsRecipe
from PIL import Image, ImageDraw, ImageFont
__license__ = 'GPL v3'
__copyright__ = '2017, sukru alatas / alatas.org'
class DunyaHalleri(BasicNewsRecipe):
title = 'Dünya Halleri'
description = 'Gözden Kaçanlar Rehberi'
timefmt = ' [%a, %d %b, %Y]'
publication_type = 'blog'
language = 'tr'
locale = 'tr_TR' # for localized month names
simultaneous_downloads = 5
needs_subscription = False
scale_news_images = True
remove_tags_before = dict(name='span', attrs={'itemprop': 'reviewBody'})
remove_tags_after = dict(
name='div', attrs={'class': 'sharedaddy sd-sharing-enabled'})
remove_tags = [dict(name=['script', 'noscript', 'style', 'footer']),
dict(attrs={'class': ['jsharedaddy sd-sharing-enabled',
'cb-sticky-sidebar', 'sharedaddy sd-sharing-enabled']}),
dict(id=['jp-relatedposts', 'tldr-post-summary', 'tldr-post-summary-buttons'])]
encoding = 'utf_8'
no_stylesheets = True
extra_css = '.caption {color: #998; font-style: italic; font-size: 8pt}'
__author__ = 'Sukru Alatas'
feeds = [("Genel Gündem".decode('utf-8', 'replace'),
'https://www.dunyahalleri.com/genel-gundem/feed/'),
("Teknoloji / Bilim".decode('utf-8', 'replace'),
'https://www.dunyahalleri.com/teknoloji-bilim/feed/'),
("İnternet / Girişimler".decode('utf-8', 'replace'),
'https://www.dunyahalleri.com/internet-girisimler/feed/'),
("Tasarım / İnovasyon".decode('utf-8', 'replace'),
'https://www.dunyahalleri.com/tasarim-inovasyon/feed/'),
("Kültür / Sanat".decode('utf-8', 'replace'), 'https://www.dunyahalleri.com/kultur-sanat/feed/')]
oldest_article = 7
max_articles_per_feed = 50
COVER_WIDTH, COVER_HEIGHT = 590, 750
masthead_url = 'https://www.dunyahalleri.com/wp-content/uploads/2016/07/dh-logo-transparan.png'
cover_url = ''
cover_img_url = 'https://i0.wp.com/www.dunyahalleri.com/wp-content/uploads/2016/04/dh-favico-v2.png'
cover_img_path = ''
def __init__(self, *args, **kwargs):
BasicNewsRecipe.__init__(self, *args, **kwargs)
# for localized month names
locale.setlocale(locale.LC_TIME, self.locale)
if self.output_profile.short_name.startswith('kindle'):
# Reduce image sizes to get file size below amazon's email
# sending threshold
self.web2disk_options.compress_news_images = True
self.web2disk_options.compress_news_images_auto_size = 5
self.log.warn(
'Kindle Output profile being used, reducing image quality '
'to keep file size below amazon email threshold')
def preprocess_html(self, soup):
span = soup.findAll('span', {'itemprop': 'reviewBody'}, limit=1)[0]
# title insert
article_title = soup.title.contents[0]
article_title = article_title.replace(' - Dünya Halleri'.decode('utf-8', 'replace'), '')
h2 = Tag(soup, 'h2')
h2.append(article_title)
span.insert(0, h2)
# featured image insert
meta = soup.find('meta', {'property': 'og:image'})
if meta:
img = Tag(soup, 'img')
img.attrs = [('src', meta['content'])]
span.insert(1, img)
# gallery normalization
for div in soup.findAll('div', {'itemtype': 'http://schema.org/ImageGallery'}):
p = Tag(soup, 'p')
for img in div.findAll('img'):
img.attrs = [(key, value)
for key, value in img.attrs if key in ['src']]
p.append(img)
div.replaceWith(p)
# youtube embed normalization
# this block finds the cover image for each embedded youtube video, then
# replaces the iframe with an "a href" and an "img"
for iframe in soup.findAll('iframe'):
a = Tag(soup, 'a')
caption = Tag(soup, 'pre')
img = Tag(soup, 'img')
m = re.match(
r'https://(www\.)?youtube\.com/(embed/|watch\?v=)'
r'(?P<vid>.*?)(([?&].*)|$|\n)',
iframe['src'])
if m:
# youtube
img_src = 'https://img.youtube.com/vi/' + \
m.group('vid') + '/0.jpg'
a_href = 'https://www.youtube.com/watch?v=' + m.group('vid')
else:
# not youtube: use a default cover image for other embedded pages
img_src = 'http://www.warnerclassics.com/img_style/default_video_m.jpg'
a_href = iframe['src']
img.attrs = [('src', img_src)]
caption.append('Video: ' + a_href)
caption.attrs = [('class', 'caption')]
a.attrs = [('href', a_href), ('target', '_blank')]
a.append(img)
a.append(caption)
iframe.replaceWith(a)
return soup
# cover generator
# original version
# https://www.mobileread.com/forums/showpost.php?p=866553&postcount=5
def get_cover_img_url(self):
return getattr(self, 'cover_img_url', None)
def _download_cover_img(self):
old_cu = None
try:
old_cu = self.get_cover_url()
except:
pass
new_cu = self.get_cover_img_url()
self.cover_url = new_cu
self._download_cover()
outfile = os.path.join(self.output_dir, 'cover_img.jpg')
copyfile(self.cover_path, outfile)
self.cover_url = old_cu
self.cover_img_path = outfile
def download_cover_img(self):
try:
self._download_cover_img()
self.report_progress(
1, ('Downloaded cover to %s') % self.cover_img_path)
except:
self.log.exception('Failed to download cover img')
self.cover_img_path = None
def draw_text(self, draw, text, text_size, top):
font_path = P('fonts/liberation/LiberationSerif-Bold.ttf')
font = ImageFont.truetype(font_path, text_size)
width, height = draw.textsize(text, font=font)
left = max(int((self.COVER_WIDTH - width) / 2.), 0)
draw.text((left, top), text, fill=(0, 0, 0), font=font)
return height
def default_cover(self, cover_file):
title = self.title
date = time.strftime(
'%d %B %Y').decode('utf8', 'replace')
author = 'www.dunyahalleri.com'.decode('utf8', 'replace')
# Texts
img = Image.new(
'RGB', (self.COVER_WIDTH, self.COVER_HEIGHT), 'white')
draw = ImageDraw.Draw(img)
bottom = 15
bottom += self.draw_text(draw, title, 42, bottom)
bottom += 50
bottom += self.draw_text(draw, date, 32, bottom)
bottom += self.draw_text(draw, author, 32, self.COVER_HEIGHT - 45)
# Logo
self.download_cover_img()
if getattr(self, 'cover_img_path', None) is not None:
logo_file = self.cover_img_path
self.report_progress(
1, ('using cover img from %s') % logo_file)
logo = Image.open(logo_file, 'r')
width, height = logo.size
left = max(int((self.COVER_WIDTH - width) / 2.), 0)
top = max(int((self.COVER_HEIGHT - height) / 2.), 0)
img.paste(logo, (left, top))
img = img.convert('RGB').convert('P', palette=Image.ADAPTIVE)
img.convert('RGB').save(cover_file, 'JPEG')
cover_file.flush()
return True
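
Side note (not part of the recipe above): the iframe handling in preprocess_html rests entirely on that one regular expression, so the following is a minimal standalone sketch of what it extracts. The two sample URLs and the names YT_RE and src are invented for illustration only.

#!/usr/bin/env python2
# Standalone sketch: what the YouTube regex used in preprocess_html() yields
# for an illustrative embed URL and for a non-YouTube embed.
import re

YT_RE = re.compile(
    r'https://(www\.)?youtube\.com/(embed/|watch\?v=)'
    r'(?P<vid>.*?)(([?&].*)|$|\n)')

for src in ('https://www.youtube.com/embed/dQw4w9WgXcQ?rel=0',   # sample YouTube embed
            'https://player.vimeo.com/video/76979871'):          # sample non-YouTube embed
    m = YT_RE.match(src)
    if m:
        # same thumbnail / watch-page scheme the recipe builds
        print('https://img.youtube.com/vi/' + m.group('vid') + '/0.jpg')
        print('https://www.youtube.com/watch?v=' + m.group('vid'))
    else:
        print('not a youtube embed, keeping original src: ' + src)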

recipes/dunyahalleri_haftaninozeti.recipe (new file, 266 lines)

@@ -0,0 +1,266 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""www.dunyahalleri.com/haftanin-ozeti"""
import locale
import os
import re
from shutil import copyfile
from contextlib import closing
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag
from calibre.web.feeds.recipes import BasicNewsRecipe
from PIL import Image, ImageDraw, ImageFont
__license__ = 'GPL v3'
__copyright__ = '2017, sukru alatas / alatas.org'
class DunyaHalleri_HaftaninOzeti(BasicNewsRecipe):
title = 'Dünya Halleri - Haftanın Özeti'
description = ('Geçen hafta boyunca Türkiye ve dünyadan haber,'
' site, yazılım, donanım, cihaz, video ve trendler...')
timefmt = ' [%a, %d %b, %Y]'
publication_type = 'blog'
language = 'tr'
locale = 'tr_TR' # for localized month names
simultaneous_downloads = 5
needs_subscription = False
scale_news_images = True
remove_tags_before = dict(name='section', attrs={'itemprop': 'articleBody'})
remove_tags_after = dict(name='div', attrs={'class': 'cb-alert cb-blue'})
remove_tags = [dict(name=['ol', 'h4', 'script', 'noscript', 'style', 'footer']),
dict(name='h1', attrs={
'class': 'entry-title cb-entry-title entry-title cb-title'}),
dict(attrs={'class': ['cb-alert cb-blue', 'woo-sc-box info ',
'sharedaddy sd-sharing-enabled', 'jp-relatedposts']}),
dict(id=['post-pagination', 'plp_inital_pagination'])]
encoding = 'utf_8'
no_stylesheets = True
INDEX = 'https://www.dunyahalleri.com/haftanin-ozeti/feed/'
extra_css = '.caption {color: #998; font-style: italic; font-size: 8pt}'
__author__ = 'Sukru Alatas'
COVER_WIDTH, COVER_HEIGHT = 590, 750
issue_title = ''
issue_date = ''
masthead_url = ''
cover_url = ''
cover_img_url = ''
cover_img_path = ''
def __init__(self, *args, **kwargs):
BasicNewsRecipe.__init__(self, *args, **kwargs)
# for localized month names
locale.setlocale(locale.LC_TIME, self.locale)
if self.output_profile.short_name.startswith('kindle'):
# Reduce image sizes to get file size below amazon's email
# sending threshold
self.web2disk_options.compress_news_images = True
self.web2disk_options.compress_news_images_auto_size = 5
self.log.warn(
'Kindle Output profile being used, reducing image quality '
'to keep file size below amazon email threshold')
# BeautifulSoup XML parser extension
# If index_to_soup is used with XML or RSS, it produces lots of garbage nodes
# and rearranges the tree on its own.
# This function is a very close copy of index_to_soup, but it uses
# BeautifulStoneSoup instead of BeautifulSoup
def xml_to_soup(self, url_or_raw, raw=False):
if re.match(r'\w+://', url_or_raw):
br = self.clone_browser(self.browser)
open_func = getattr(br, 'open_novisit', br.open)
with closing(open_func(url_or_raw)) as f:
_raw = f.read()
if not _raw:
raise RuntimeError(
'Could not fetch index from %s' % url_or_raw)
else:
_raw = url_or_raw
if raw:
return _raw
if not isinstance(_raw, unicode) and self.encoding:
if callable(self.encoding):
_raw = self.encoding(_raw)
else:
_raw = _raw.decode(self.encoding, 'replace')
from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
from calibre.utils.cleantext import clean_xml_chars
if isinstance(_raw, unicode):
_raw = strip_encoding_declarations(_raw)
else:
_raw = xml_to_unicode(
_raw, strip_encoding_pats=True, resolve_entities=True)[0]
_raw = clean_xml_chars(_raw)
return BeautifulStoneSoup(_raw) # <== the difference
def parse_index(self):
from dateutil.parser import parse
# RSS parsing
index = self.xml_to_soup(self.INDEX)
channel = index.rss.channel
self.description = channel.description.contents[0]
self.masthead_url = channel.url.contents[0]
item = channel.item
self.issue_title = item.title.contents[0]
self.issue_date = parse(item.pubdate.contents[0])
base_url = item.link.contents[0]
cover_img_desc = BeautifulSoup(item.description.contents[0])
# this is necessary for cover generator
self.cover_img_url = cover_img_desc.img['src']
soup = self.index_to_soup(base_url)
articles = {}
key = None
ans = []
for li in soup.findNext('ol').findAll('li'):
a = li.find('a', href=True)
if not a:
url = base_url
feed = self.tag_to_string(li, use_alt=True).strip()
pubdate = self.issue_date.strftime('%a, %d %b')
else:
url = base_url + re.sub(r'\.\/', '', a['href'])
feed = self.tag_to_string(a, use_alt=True).strip()
pubdate = self.issue_date.strftime('%a, %d %b')
title = self.issue_title + \
' (' + self.issue_date.strftime('%d %B %Y') + ')'
if feed not in articles:
articles[feed] = []
ans.append(feed)
articles[feed].append(
dict(title=title, url=url, date=pubdate, description='', content=''))
ans = [(key, articles[key]) for key in ans if key in articles]
return ans
def preprocess_html(self, soup):
# gallery normalization
for div in soup.findAll('div', {'itemtype': 'http://schema.org/ImageGallery'}):
p = Tag(soup, 'p')
for img in div.findAll('img'):
img.attrs = [(key, value)
for key, value in img.attrs if key in ['src']]
p.append(img)
div.replaceWith(p)
# youtube embed normalization
# this block finds the cover image for each embedded youtube video, then
# replaces the iframe with an "a href" and an "img"
for iframe in soup.findAll('iframe'):
a = Tag(soup, 'a')
caption = Tag(soup, 'pre')
img = Tag(soup, 'img')
m = re.match(
r'https://(www\.)?youtube\.com/(embed/|watch\?v=)'
r'(?P<vid>.*?)(([?&].*)|$|\n)',
iframe['src'])
if m:
# youtube
img_src = 'https://img.youtube.com/vi/' + \
m.group('vid') + '/0.jpg'
a_href = 'https://www.youtube.com/watch?v=' + m.group('vid')
else:
# not youtube: use a default cover image for other embedded pages
img_src = 'http://www.warnerclassics.com/img_style/default_video_m.jpg'
a_href = iframe['src']
img.attrs = [('src', img_src)]
caption.append('Video: ' + a_href)
caption.attrs = [('class', 'caption')]
a.attrs = [('href', a_href), ('target', '_blank')]
a.append(img)
a.append(caption)
iframe.replaceWith(a)
return soup
# cover generator
# original version https://www.mobileread.com/forums/showpost.php?p=866553&postcount=5
def get_cover_img_url(self):
return getattr(self, 'cover_img_url', None)
def _download_cover_img(self):
old_cu = None
try:
old_cu = self.get_cover_url()
except:
pass
new_cu = self.get_cover_img_url()
self.cover_url = new_cu
self._download_cover()
outfile = os.path.join(self.output_dir, 'cover_img.jpg')
copyfile(self.cover_path, outfile)
self.cover_url = old_cu
self.cover_img_path = outfile
def download_cover_img(self):
try:
self._download_cover_img()
self.report_progress(
1, ('Downloaded cover to %s') % self.cover_img_path)
except:
self.log.exception('Failed to download cover img')
self.cover_img_path = None
def draw_text(self, draw, text, text_size, top):
font_path = P('fonts/liberation/LiberationSerif-Bold.ttf')
font = ImageFont.truetype(font_path, text_size)
width, height = draw.textsize(text, font=font)
left = max(int((self.COVER_WIDTH - width) / 2.), 0)
draw.text((left, top), text, fill=(0, 0, 0), font=font)
return height
def default_cover(self, cover_file):
title = self.issue_title
date = self.issue_date.strftime(
'%d %B %Y').decode('utf8', 'replace')
author = 'www.dunyahalleri.com/haftanin-ozeti'.decode(
'utf8', 'replace')
# Texts
img = Image.new(
'RGB', (self.COVER_WIDTH, self.COVER_HEIGHT), 'white')
draw = ImageDraw.Draw(img)
bottom = 15
bottom += self.draw_text(draw, title, 42, bottom)
bottom += 50
bottom += self.draw_text(draw, date, 32, bottom)
bottom += self.draw_text(draw, author, 32, self.COVER_HEIGHT - 45)
# Logo
self.download_cover_img()
if getattr(self, 'cover_img_path', None) is not None:
logo_file = self.cover_img_path
self.report_progress(
1, ('using cover img from %s') % logo_file)
logo = Image.open(logo_file, 'r')
width, height = logo.size
logo = logo.resize(
(self.COVER_WIDTH, (self.COVER_WIDTH * height / width)), Image.ANTIALIAS)
width, height = logo.size
left = max(int((self.COVER_WIDTH - width) / 2.), 0)
top = max(int((self.COVER_HEIGHT - height) / 2.), 0)
img.paste(logo, (left, top))
img = img.convert('RGB').convert('P', palette=Image.ADAPTIVE)
img.convert('RGB').save(cover_file, 'JPEG')
cover_file.flush()
return True
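
Side note on the structure parse_index() builds in the second recipe: BasicNewsRecipe expects it to return a list of (section title, article list) pairs, each article being a dict with title, url, date and description keys. The sketch below only illustrates that shape; the section name, URL and date values are invented placeholders, not data from the site.

# Shape of the value parse_index() returns above (placeholder values only).
sample_index = [
    ('Genel Gundem',                                 # section title taken from the <li>/<a> text
     [dict(title='Haftanin Ozeti (4 Eylul 2017)',    # issue title plus localized date
           url='https://www.dunyahalleri.com/haftanin-ozeti/placeholder/',  # hypothetical URL
           date='Mon, 04 Sep',                       # issue_date.strftime('%a, %d %b')
           description='',
           content='')]),
]

While developing, either recipe can be exercised quickly with calibre's ebook-convert in --test mode, which fetches only a couple of articles per feed instead of the full issue.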