mirror of https://github.com/kovidgoyal/calibre.git
added dunyahalleri and dunyahalleri_haftaninozeti
This commit is contained in:
parent 08471c12a4
commit 85a0304bf6
recipes/dunyahalleri.recipe (new file, 199 lines)
@@ -0,0 +1,199 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""www.dunyahalleri.com"""
import locale
import os
import re
import time

from shutil import copyfile

from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.recipes import BasicNewsRecipe
from PIL import Image, ImageDraw, ImageFont

__license__ = 'GPL v3'
__copyright__ = '2017, sukru alatas / alatas.org'

class DunyaHalleri(BasicNewsRecipe):
    title = 'Dünya Halleri'
    description = 'Gözden Kaçanlar Rehberi'
    timefmt = ' [%a, %d %b, %Y]'
    publication_type = 'blog'
    language = 'tr'
    locale = 'tr_TR'  # for localized month names
    simultaneous_downloads = 5

    needs_subscription = False
    scale_news_images = True

    remove_tags_before = dict(name='span', attrs={'itemprop': 'reviewBody'})
    remove_tags_after = dict(
        name='div', attrs={'class': 'sharedaddy sd-sharing-enabled'})
    remove_tags = [dict(name=['script', 'noscript', 'style', 'footer']),
                   dict(attrs={'class': ['jsharedaddy sd-sharing-enabled',
                                         'cb-sticky-sidebar', 'sharedaddy sd-sharing-enabled']}),
                   dict(id=['jp-relatedposts', 'tldr-post-summary', 'tldr-post-summary-buttons'])]
    encoding = 'utf_8'
    no_stylesheets = True

    extra_css = '.caption {color: #998; font-style: italic; font-size: 8pt}'
    __author__ = 'Sukru Alatas'
    feeds = [("Genel Gündem".decode('utf-8', 'replace'),
              'https://www.dunyahalleri.com/genel-gundem/feed/'),
             ("Teknoloji / Bilim".decode('utf-8', 'replace'),
              'https://www.dunyahalleri.com/teknoloji-bilim/feed/'),
             ("İnternet / Girişimler".decode('utf-8', 'replace'),
              'https://www.dunyahalleri.com/internet-girisimler/feed/'),
             ("Tasarım / İnovasyon".decode('utf-8', 'replace'),
              'https://www.dunyahalleri.com/tasarim-inovasyon/feed/'),
             ("Kültür / Sanat".decode('utf-8', 'replace'), 'https://www.dunyahalleri.com/kultur-sanat/feed/')]
    oldest_article = 7
    max_articles_per_feed = 50

    COVER_WIDTH, COVER_HEIGHT = 590, 750
    masthead_url = 'https://www.dunyahalleri.com/wp-content/uploads/2016/07/dh-logo-transparan.png'
    cover_url = ''
    cover_img_url = 'https://i0.wp.com/www.dunyahalleri.com/wp-content/uploads/2016/04/dh-favico-v2.png'
    cover_img_path = ''

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        # for localized month names
        locale.setlocale(locale.LC_TIME, self.locale)

        if self.output_profile.short_name.startswith('kindle'):
            # Reduce image sizes to get file size below amazon's email
            # sending threshold
            self.web2disk_options.compress_news_images = True
            self.web2disk_options.compress_news_images_auto_size = 5
            self.log.warn(
                'Kindle Output profile being used, reducing image quality '
                'to keep file size below amazon email threshold')

    def preprocess_html(self, soup):
        span = soup.findAll('span', {'itemprop': 'reviewBody'}, limit=1)[0]

        # title insert
        article_title = soup.title.contents[0]
        article_title = article_title.replace(
            ' - Dünya Halleri'.decode('utf-8', 'replace'), '')
        h2 = Tag(soup, 'h2')
        h2.append(article_title)
        span.insert(0, h2)

        # featured image insert
        meta = soup.find('meta', {'property': 'og:image'})
        if meta:
            img = Tag(soup, 'img')
            img.attrs = [('src', meta['content'])]
            span.insert(1, img)

        # gallery normalization
        for div in soup.findAll('div', {'itemtype': 'http://schema.org/ImageGallery'}):
            p = Tag(soup, 'p')
            for img in div.findAll('img'):
                img.attrs = [(key, value)
                             for key, value in img.attrs if key in ['src']]
                p.append(img)
            div.replaceWith(p)

        # embedded youtube normalization
        # this block finds the cover image for each embedded youtube video,
        # then replaces the iframe with an "a href" wrapping an "img"
        for iframe in soup.findAll('iframe'):
            a = Tag(soup, 'a')
            caption = Tag(soup, 'pre')
            img = Tag(soup, 'img')

            m = re.match(
                r'https\:\/\/(www\.)?youtube.com\/(embed\/|watch\?v\=)'
                r'(?P<vid>.*?)(([\?\&].*)|$|\n)',
                iframe['src'])
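            # illustrative example (added, not in the original recipe): for a
            # src of 'https://www.youtube.com/embed/abc123?rel=0' the named
            # group 'vid' captures 'abc123', so the thumbnail below resolves
            # to 'https://img.youtube.com/vi/abc123/0.jpg'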
            if m:
                # youtube
                img_src = 'https://img.youtube.com/vi/' + \
                    m.group('vid') + '/0.jpg'
                a_href = 'https://www.youtube.com/watch?v=' + m.group('vid')
            else:
                # not youtube
                # default cover image for non-youtube embedded pages
                img_src = 'http://www.warnerclassics.com/img_style/default_video_m.jpg'
                a_href = iframe['src']

            img.attrs = [('src', img_src)]
            caption.append('Video: ' + a_href)
            caption.attrs = [('class', 'caption')]
            a.attrs = [('href', a_href), ('target', '_blank')]
            a.append(img)
            a.append(caption)
            iframe.replaceWith(a)
        return soup

    # cover generator
    # original version:
    # https://www.mobileread.com/forums/showpost.php?p=866553&postcount=5
    def get_cover_img_url(self):
        return getattr(self, 'cover_img_url', None)

    def _download_cover_img(self):
        old_cu = None
        try:
            old_cu = self.get_cover_url()
        except:
            pass
        new_cu = self.get_cover_img_url()
        self.cover_url = new_cu
        self._download_cover()

        outfile = os.path.join(self.output_dir, 'cover_img.jpg')
        copyfile(self.cover_path, outfile)
        self.cover_url = old_cu
        self.cover_img_path = outfile

    def download_cover_img(self):
        try:
            self._download_cover_img()
            self.report_progress(
                1, ('Downloaded cover to %s') % self.cover_img_path)
        except:
            self.log.exception('Failed to download cover img')
            self.cover_img_path = None

    def draw_text(self, draw, text, text_size, top):
        font_path = P('fonts/liberation/LiberationSerif-Bold.ttf')
        font = ImageFont.truetype(font_path, text_size)
        width, height = draw.textsize(text, font=font)
        left = max(int((self.COVER_WIDTH - width) / 2.), 0)
        draw.text((left, top), text, fill=(0, 0, 0), font=font)
        return height

    def default_cover(self, cover_file):
        title = self.title
        date = time.strftime(
            '%d %B %Y').decode('utf8', 'replace')
        author = 'www.dunyahalleri.com'.decode('utf8', 'replace')
        # Texts
        img = Image.new(
            'RGB', (self.COVER_WIDTH, self.COVER_HEIGHT), 'white')
        draw = ImageDraw.Draw(img)
        bottom = 15
        bottom += self.draw_text(draw, title, 42, bottom)
        bottom += 50
        bottom += self.draw_text(draw, date, 32, bottom)
        bottom += self.draw_text(draw, author, 32, self.COVER_HEIGHT - 45)
        # Logo
        self.download_cover_img()
        if getattr(self, 'cover_img_path', None) is not None:
            logo_file = self.cover_img_path
            self.report_progress(
                1, ('using cover img from %s') % logo_file)
            logo = Image.open(logo_file, 'r')
            width, height = logo.size
            left = max(int((self.COVER_WIDTH - width) / 2.), 0)
            top = max(int((self.COVER_HEIGHT - height) / 2.), 0)
            img.paste(logo, (left, top))
        img = img.convert('RGB').convert('P', palette=Image.ADAPTIVE)
        img.convert('RGB').save(cover_file, 'JPEG')
        cover_file.flush()
        return True

recipes/dunyahalleri_haftaninozeti.recipe (new file, 266 lines)
@@ -0,0 +1,266 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""www.dunyahalleri.com/haftanin-ozeti"""
import locale
import os
import re

from shutil import copyfile

from contextlib import closing
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag
from calibre.web.feeds.recipes import BasicNewsRecipe
from PIL import Image, ImageDraw, ImageFont

__license__ = 'GPL v3'
__copyright__ = '2017, sukru alatas / alatas.org'

class DunyaHalleri_HaftaninOzeti(BasicNewsRecipe):
    title = 'Dünya Halleri - Haftanın Özeti'
    description = ('Geçen hafta boyunca Türkiye ve dünyadan haber,'
                   ' site, yazılım, donanım, cihaz, video ve trendler...')
    timefmt = ' [%a, %d %b, %Y]'
    publication_type = 'blog'
    language = 'tr'
    locale = 'tr_TR'  # for localized month names
    simultaneous_downloads = 5

    needs_subscription = False
    scale_news_images = True

    remove_tags_before = dict(name='section', attrs={'itemprop': 'articleBody'})
    remove_tags_after = dict(name='div', attrs={'class': 'cb-alert cb-blue'})
    remove_tags = [dict(name=['ol', 'h4', 'script', 'noscript', 'style', 'footer']),
                   dict(name='h1', attrs={
                       'class': 'entry-title cb-entry-title entry-title cb-title'}),
                   dict(attrs={'class': ['cb-alert cb-blue', 'woo-sc-box info ',
                                         'sharedaddy sd-sharing-enabled', 'jp-relatedposts']}),
                   dict(id=['post-pagination', 'plp_inital_pagination'])]
    encoding = 'utf_8'
    no_stylesheets = True
    INDEX = 'https://www.dunyahalleri.com/haftanin-ozeti/feed/'
    extra_css = '.caption {color: #998; font-style: italic; font-size: 8pt}'
    __author__ = 'Sukru Alatas'

    COVER_WIDTH, COVER_HEIGHT = 590, 750
    issue_title = ''
    issue_date = ''
    masthead_url = ''
    cover_url = ''
    cover_img_url = ''
    cover_img_path = ''

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        # for localized month names
        locale.setlocale(locale.LC_TIME, self.locale)

        if self.output_profile.short_name.startswith('kindle'):
            # Reduce image sizes to get file size below amazon's email
            # sending threshold
            self.web2disk_options.compress_news_images = True
            self.web2disk_options.compress_news_images_auto_size = 5
            self.log.warn(
                'Kindle Output profile being used, reducing image quality '
                'to keep file size below amazon email threshold')

    # BeautifulSoup XML parser extension.
    # Running index_to_soup on XML/RSS produces lots of garbage nodes and
    # reshapes the tree to suit HTML. This function is a near copy of
    # index_to_soup, but it uses BeautifulStoneSoup instead of BeautifulSoup.
    def xml_to_soup(self, url_or_raw, raw=False):
        if re.match(r'\w+://', url_or_raw):
            br = self.clone_browser(self.browser)
            open_func = getattr(br, 'open_novisit', br.open)
            with closing(open_func(url_or_raw)) as f:
                _raw = f.read()
            if not _raw:
                raise RuntimeError(
                    'Could not fetch index from %s' % url_or_raw)
        else:
            _raw = url_or_raw

        if raw:
            return _raw

        if not isinstance(_raw, unicode) and self.encoding:
            if callable(self.encoding):
                _raw = self.encoding(_raw)
            else:
                _raw = _raw.decode(self.encoding, 'replace')

        from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
        from calibre.utils.cleantext import clean_xml_chars

        if isinstance(_raw, unicode):
            _raw = strip_encoding_declarations(_raw)
        else:
            _raw = xml_to_unicode(
                _raw, strip_encoding_pats=True, resolve_entities=True)[0]

        _raw = clean_xml_chars(_raw)
        return BeautifulStoneSoup(_raw)  # <== the difference
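
    # added note (not in the original recipe): because BeautifulStoneSoup
    # parses the feed as plain XML, parse_index below can address it directly
    # as index.rss.channel and channel.item instead of scraping mangled HTML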

    def parse_index(self):
        from dateutil.parser import parse

        # RSS parsing
        index = self.xml_to_soup(self.INDEX)

        channel = index.rss.channel

        self.description = channel.description.contents[0]
        self.masthead_url = channel.url.contents[0]

        item = channel.item
        self.issue_title = item.title.contents[0]
        self.issue_date = parse(item.pubdate.contents[0])

        base_url = item.link.contents[0]
        cover_img_desc = BeautifulSoup(item.description.contents[0])
        # this is necessary for the cover generator
        self.cover_img_url = cover_img_desc.img['src']

        soup = self.index_to_soup(base_url)
        articles = {}
        ans = []

        for li in soup.findNext('ol').findAll('li'):
            a = li.find('a', href=True)

            if not a:
                url = base_url
                feed = self.tag_to_string(li, use_alt=True).strip()
            else:
                url = base_url + re.sub(r'\.\/', '', a['href'])
                feed = self.tag_to_string(a, use_alt=True).strip()
            pubdate = self.issue_date.strftime('%a, %d %b')

            title = self.issue_title + \
                ' (' + self.issue_date.strftime('%d %B %Y') + ')'

            if feed not in articles:
                articles[feed] = []
                ans.append(feed)

            articles[feed].append(
                dict(title=title, url=url, date=pubdate, description='', content=''))

        ans = [(key, articles[key]) for key in ans if key in articles]
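        # added note (not in the original recipe): ans is a list of
        # (section_title, [article_dict, ...]) tuples, the structure calibre
        # expects parse_index to return, one section per entry of the summary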
        return ans

    def preprocess_html(self, soup):
        # gallery normalization
        for div in soup.findAll('div', {'itemtype': 'http://schema.org/ImageGallery'}):
            p = Tag(soup, 'p')
            for img in div.findAll('img'):
                img.attrs = [(key, value)
                             for key, value in img.attrs if key in ['src']]
                p.append(img)
            div.replaceWith(p)

        # embedded youtube normalization
        # this block finds the cover image for each embedded youtube video,
        # then replaces the iframe with an "a href" wrapping an "img"
        for iframe in soup.findAll('iframe'):
            a = Tag(soup, 'a')
            caption = Tag(soup, 'pre')
            img = Tag(soup, 'img')

            m = re.match(
                r'https\:\/\/(www\.)?youtube.com\/(embed\/|watch\?v\=)'
                r'(?P<vid>.*?)(([\?\&].*)|$|\n)',
                iframe['src'])
            if m:
                # youtube
                img_src = 'https://img.youtube.com/vi/' + \
                    m.group('vid') + '/0.jpg'
                a_href = 'https://www.youtube.com/watch?v=' + m.group('vid')
            else:
                # not youtube
                # default cover image for non-youtube embedded pages
                img_src = 'http://www.warnerclassics.com/img_style/default_video_m.jpg'
                a_href = iframe['src']

            img.attrs = [('src', img_src)]
            caption.append('Video: ' + a_href)
            caption.attrs = [('class', 'caption')]
            a.attrs = [('href', a_href), ('target', '_blank')]
            a.append(img)
            a.append(caption)
            iframe.replaceWith(a)
        return soup

    # cover generator
    # original version: https://www.mobileread.com/forums/showpost.php?p=866553&postcount=5
    def get_cover_img_url(self):
        return getattr(self, 'cover_img_url', None)

    def _download_cover_img(self):
        old_cu = None
        try:
            old_cu = self.get_cover_url()
        except:
            pass
        new_cu = self.get_cover_img_url()
        self.cover_url = new_cu
        self._download_cover()

        outfile = os.path.join(self.output_dir, 'cover_img.jpg')
        copyfile(self.cover_path, outfile)
        self.cover_url = old_cu
        self.cover_img_path = outfile

    def download_cover_img(self):
        try:
            self._download_cover_img()
            self.report_progress(
                1, ('Downloaded cover to %s') % self.cover_img_path)
        except:
            self.log.exception('Failed to download cover img')
            self.cover_img_path = None

    def draw_text(self, draw, text, text_size, top):
        font_path = P('fonts/liberation/LiberationSerif-Bold.ttf')
        font = ImageFont.truetype(font_path, text_size)
        width, height = draw.textsize(text, font=font)
        left = max(int((self.COVER_WIDTH - width) / 2.), 0)
        draw.text((left, top), text, fill=(0, 0, 0), font=font)
        return height

    def default_cover(self, cover_file):
        title = self.issue_title
        date = self.issue_date.strftime(
            '%d %B %Y').decode('utf8', 'replace')
        author = 'www.dunyahalleri.com/haftanin-ozeti'.decode(
            'utf8', 'replace')
        # Texts
        img = Image.new(
            'RGB', (self.COVER_WIDTH, self.COVER_HEIGHT), 'white')
        draw = ImageDraw.Draw(img)
        bottom = 15
        bottom += self.draw_text(draw, title, 42, bottom)
        bottom += 50
        bottom += self.draw_text(draw, date, 32, bottom)
        bottom += self.draw_text(draw, author, 32, self.COVER_HEIGHT - 45)
        # Logo
        self.download_cover_img()
        if getattr(self, 'cover_img_path', None) is not None:
            logo_file = self.cover_img_path
            self.report_progress(
                1, ('using cover img from %s') % logo_file)
            logo = Image.open(logo_file, 'r')
            width, height = logo.size
            logo = logo.resize(
                (self.COVER_WIDTH, (self.COVER_WIDTH * height / width)), Image.ANTIALIAS)
            width, height = logo.size
            left = max(int((self.COVER_WIDTH - width) / 2.), 0)
            top = max(int((self.COVER_HEIGHT - height) / 2.), 0)
            img.paste(logo, (left, top))
        img = img.convert('RGB').convert('P', palette=Image.ADAPTIVE)
        img.convert('RGB').save(cover_file, 'JPEG')
        cover_file.flush()
        return True