calibre/recipes/dunyahalleri_haftaninozeti.recipe
2017-09-06 07:52:54 +05:30

268 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""www.dunyahalleri.com/haftanin-ozeti"""
import locale
import os
import re
from shutil import copyfile
from contextlib import closing
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag
from calibre.web.feeds.recipes import BasicNewsRecipe
from PIL import Image, ImageDraw, ImageFont
# Module metadata: license identifier and copyright notice for this recipe.
__license__ = 'GPL v3'
__copyright__ = '2017, sukru alatas / alatas.org'
class DunyaHalleri_HaftaninOzeti(BasicNewsRecipe):
    """Recipe for the weekly digest at www.dunyahalleri.com/haftanin-ozeti.

    The first item of the RSS feed at ``INDEX`` is taken as the current
    issue.  Its page is downloaded, the first ordered list on it is read as
    the table of contents, and every list entry becomes one feed section.
    A cover is drawn locally from the issue title, the issue date and the
    image found in the RSS item description.
    """

    title = 'Dünya Halleri - Haftanın Özeti'
    description = ('Geçen hafta boyunca Türkiye ve dünyadan haber,'
                   ' site, yazılım, donanım, cihaz, video ve trendler...')
    timefmt = ' [%a, %d %b, %Y]'
    publication_type = 'blog'
    language = 'tr'
    # For localized month names.  Inside the class body this name hides the
    # ``locale`` module; methods still see the module as a global and read
    # this string through ``self.locale``.
    locale = 'tr_TR'
    simultaneous_downloads = 5

    needs_subscription = False
    scale_news_images = True

    # Page clean-up rules: where the article body starts/ends and which
    # elements are dropped.
    remove_tags_before = dict(name='section', attrs={'itemprop': 'articleBody'})
    remove_tags_after = dict(name='div', attrs={'class': 'cb-alert cb-blue'})
    remove_tags = [dict(name=['ol', 'h4', 'script', 'noscript', 'style', 'footer']),
                   dict(name='h1', attrs={
                       'class': 'entry-title cb-entry-title entry-title cb-title'}),
                   dict(attrs={'class': ['cb-alert cb-blue', 'woo-sc-box info ',
                                         'sharedaddy sd-sharing-enabled', 'jp-relatedposts']}),
                   dict(id=['post-pagination', 'plp_inital_pagination'])]
    encoding = 'utf_8'
    no_stylesheets = True
    INDEX = 'https://www.dunyahalleri.com/haftanin-ozeti/feed/'
    extra_css = '.caption {color: #998; font-style: italic; font-size: 8pt}'
    __author__ = 'Sukru Alatas'

    # Size in pixels of the generated cover image.
    COVER_WIDTH, COVER_HEIGHT = 590, 750

    # Per-issue state, filled in by parse_index() and the cover helpers.
    issue_title = ''
    issue_date = ''
    masthead_url = ''
    cover_url = ''
    cover_img_url = ''
    cover_img_path = ''

    def __init__(self, *args, **kwargs):
        """Initialise the base recipe, set the time locale and tune images.

        All arguments are forwarded unchanged to ``BasicNewsRecipe``.
        """
        BasicNewsRecipe.__init__(self, *args, **kwargs)

        # for localized month names
        try:
            locale.setlocale(locale.LC_TIME, self.locale)
        except locale.Error:
            # The requested locale is not installed on this system.  Dates
            # are then formatted with the current locale instead of
            # aborting the whole download.
            self.log.warn(
                'Locale %s is not available, month names will not be '
                'localized' % self.locale)

        if self.output_profile.short_name.startswith('kindle'):
            # Reduce image sizes to get file size below amazon's email
            # sending threshold
            self.web2disk_options.compress_news_images = True
            self.web2disk_options.compress_news_images_auto_size = 5
            self.log.warn(
                'Kindle Output profile being used, reducing image quality '
                'to keep file size below amazon email threshold')

    # BeautifulSoup XML parser extension.
    # Using index_to_soup on XML or RSS produces a lot of garbage nodes and
    # reshapes the tree on its own.  This method is a near copy of
    # index_to_soup that uses BeautifulStoneSoup instead of BeautifulSoup.
    def xml_to_soup(self, url_or_raw, raw=False):
        """Fetch (or accept) XML markup and parse it with BeautifulStoneSoup.

        :param url_or_raw: a URL (anything matching ``\\w+://``), which is
            downloaded, or the markup itself.
        :param raw: when true, return the fetched/given data unparsed.
        :return: the raw data if ``raw`` is true, otherwise a
            ``BeautifulStoneSoup`` tree.
        :raises RuntimeError: if a URL was given and nothing was downloaded.
        """
        if re.match(r'\w+://', url_or_raw):
            br = self.clone_browser(self.browser)
            # Use open_novisit when the browser offers it, else plain open.
            open_func = getattr(br, 'open_novisit', br.open)
            with closing(open_func(url_or_raw)) as f:
                _raw = f.read()
            if not _raw:
                raise RuntimeError(
                    'Could not fetch index from %s' % url_or_raw)
        else:
            _raw = url_or_raw

        if raw:
            return _raw

        # Decode byte strings with the recipe encoding, which may be either
        # a codec name or a callable doing the decoding itself.
        if not isinstance(_raw, unicode) and self.encoding:
            if callable(self.encoding):
                _raw = self.encoding(_raw)
            else:
                _raw = _raw.decode(self.encoding, 'replace')

        from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
        from calibre.utils.cleantext import clean_xml_chars
        if isinstance(_raw, unicode):
            _raw = strip_encoding_declarations(_raw)
        else:
            _raw = xml_to_unicode(
                _raw, strip_encoding_pats=True, resolve_entities=True)[0]
        _raw = clean_xml_chars(_raw)
        return BeautifulStoneSoup(_raw)  # <== the difference

    def parse_index(self):
        """Build the feed list for the issue in the first RSS item.

        Side effects: sets ``description``, ``masthead_url``,
        ``issue_title``, ``issue_date`` and ``cover_img_url`` on the recipe.

        :return: a list of ``(section title, [article dict, ...])`` tuples,
            in the order the sections first appear in the issue's ordered
            list.
        """
        from dateutil.parser import parse

        # RSS parsing
        index = self.xml_to_soup(self.INDEX)
        channel = index.rss.channel
        self.description = channel.description.contents[0]
        self.masthead_url = channel.url.contents[0]
        item = channel.item
        self.issue_title = item.title.contents[0]
        self.issue_date = parse(item.pubdate.contents[0])

        base_url = item.link.contents[0]
        cover_img_desc = BeautifulSoup(item.description.contents[0])
        # this is necessary for cover generator
        self.cover_img_url = cover_img_desc.img['src']

        soup = self.index_to_soup(base_url)

        # Every entry shares the issue date and title, so format them once.
        pubdate = self.issue_date.strftime('%a, %d %b')
        title = self.issue_title + \
            ' (' + self.issue_date.strftime('%d %B %Y') + ')'

        articles = {}
        feeds = []  # section titles in first-seen order
        for li in soup.findNext('ol').findAll('li'):
            a = li.find('a', href=True)
            if a:
                # Linked entry: drop any './' from the href and append it to
                # the issue URL.
                url = base_url + re.sub(r'\.\/', '', a['href'])
                feed = self.tag_to_string(a, use_alt=True).strip()
            else:
                # Entry without a link points at the issue page itself.
                url = base_url
                feed = self.tag_to_string(li, use_alt=True).strip()

            if feed not in articles:
                articles[feed] = []
                feeds.append(feed)

            articles[feed].append(
                dict(title=title, url=url, date=pubdate, description='', content=''))

        # Pair each section with its own title (previously an unrelated
        # variable that was always None ended up as the title).
        return [(feed, articles[feed]) for feed in feeds]

    def preprocess_html(self, soup):
        """Flatten image galleries and replace iframes with linked images.

        :param soup: parsed article page; it is modified in place.
        :return: the same ``soup`` object.
        """
        # gallery normalization: each gallery div becomes a <p> holding its
        # images, with every image attribute except 'src' removed
        for div in soup.findAll('div', {'itemtype': 'http://schema.org/ImageGallery'}):
            p = Tag(soup, 'p')
            for img in div.findAll('img'):
                img.attrs = [(key, value)
                             for key, value in img.attrs if key == 'src']
                p.append(img)
            div.replaceWith(p)

        # youtube embedded normalization
        # this block finds the cover image for each embedded youtube video
        # then changes the iframe to "a href" and "img"
        for iframe in soup.findAll('iframe'):
            src = iframe.get('src')
            if not src:
                # Nothing to link to; leave the iframe as it is instead of
                # failing on the missing attribute.
                continue

            a = Tag(soup, 'a')
            caption = Tag(soup, 'pre')
            img = Tag(soup, 'img')

            m = re.match(
                r'https\:\/\/(www\.)?youtube.com\/(embed\/|watch\?v\=)'
                r'(?P<vid>.*?)(([\?\&].*)|$|\n)',
                src)
            if m:
                # youtube
                img_src = 'https://img.youtube.com/vi/' + \
                    m.group('vid') + '/0.jpg'
                a_href = 'https://www.youtube.com/watch?v=' + m.group('vid')
            else:
                # not youtube
                # default cover image for non-youtube embedded pages
                img_src = 'http://www.warnerclassics.com/img_style/default_video_m.jpg'
                a_href = src

            img.attrs = [('src', img_src)]
            caption.append('Video: ' + a_href)
            caption.attrs = [('class', 'caption')]
            a.attrs = [('href', a_href), ('target', '_blank')]
            a.append(img)
            a.append(caption)
            iframe.replaceWith(a)
        return soup

    # cover generator
    # original version https://www.mobileread.com/forums/showpost.php?p=866553&postcount=5
    def get_cover_img_url(self):
        """Return the URL of the issue image, or None if it is not set."""
        return getattr(self, 'cover_img_url', None)

    def _download_cover_img(self):
        """Download the issue image and copy it to ``cover_img.jpg``.

        ``cover_url`` is pointed at the issue image only for the duration of
        the ``_download_cover()`` call and is put back afterwards, also when
        the download or the copy fails.  On success ``cover_img_path`` holds
        the path of the copy inside ``output_dir``.
        """
        try:
            old_cu = self.get_cover_url()
        except Exception:
            old_cu = None

        self.cover_url = self.get_cover_img_url()
        try:
            self._download_cover()
            outfile = os.path.join(self.output_dir, 'cover_img.jpg')
            copyfile(self.cover_path, outfile)
        finally:
            self.cover_url = old_cu
        self.cover_img_path = outfile

    def download_cover_img(self):
        """Best-effort wrapper around ``_download_cover_img``.

        A failure is logged and ``cover_img_path`` is set to None; nothing is
        raised to the caller.
        """
        try:
            self._download_cover_img()
            self.report_progress(
                1, ('Downloaded cover to %s') % self.cover_img_path)
        except Exception:
            self.log.exception('Failed to download cover img')
            self.cover_img_path = None

    def draw_text(self, draw, text, text_size, top):
        """Draw ``text`` in black, horizontally centred on the cover.

        :param draw: the ``ImageDraw`` object to draw on.
        :param text: the text to render.
        :param text_size: font size passed to ``ImageFont.truetype``.
        :param top: y coordinate of the text.
        :return: the height of the rendered text as reported by
            ``draw.textsize``.
        """
        font_path = P('fonts/liberation/LiberationSerif-Bold.ttf')
        font = ImageFont.truetype(font_path, text_size)
        width, height = draw.textsize(text, font=font)
        # Text wider than the cover starts at the left edge.
        left = max(int((self.COVER_WIDTH - width) / 2.), 0)
        draw.text((left, top), text, fill=(0, 0, 0), font=font)
        return height

    def default_cover(self, cover_file):
        """Render the cover as a JPEG into ``cover_file``.

        The cover is a white canvas with the issue title and date at the top
        and the site address near the bottom.  If the issue image can be
        downloaded it is scaled to the cover width and pasted on top,
        vertically centred.

        :param cover_file: writable file object; it is flushed after saving.
        :return: True.
        """
        title = self.issue_title
        date = self.issue_date.strftime(
            '%d %B %Y').decode('utf8', 'replace')
        author = 'www.dunyahalleri.com/haftanin-ozeti'.decode(
            'utf8', 'replace')

        # Texts
        img = Image.new(
            'RGB', (self.COVER_WIDTH, self.COVER_HEIGHT), 'white')
        draw = ImageDraw.Draw(img)
        bottom = 15
        bottom += self.draw_text(draw, title, 42, bottom)
        bottom += 50
        self.draw_text(draw, date, 32, bottom)
        self.draw_text(draw, author, 32, self.COVER_HEIGHT - 45)

        # Logo
        self.download_cover_img()
        if getattr(self, 'cover_img_path', None) is not None:
            logo_file = self.cover_img_path
            self.report_progress(
                1, ('using cover img from %s') % logo_file)
            logo = Image.open(logo_file, 'r')
            width, height = logo.size
            # Scale to the cover width keeping the aspect ratio; floor
            # division keeps the size an integer.
            logo = logo.resize(
                (self.COVER_WIDTH, self.COVER_WIDTH * height // width),
                Image.ANTIALIAS)
            width, height = logo.size
            left = max(int((self.COVER_WIDTH - width) / 2.), 0)
            top = max(int((self.COVER_HEIGHT - height) / 2.), 0)
            img.paste(logo, (left, top))

        # Pass through an adaptive palette before saving as JPEG.
        img = img.convert('RGB').convert('P', palette=Image.ADAPTIVE)
        img.convert('RGB').save(cover_file, 'JPEG')
        cover_file.flush()
        return True