diff --git a/recipes/dunyahalleri.recipe b/recipes/dunyahalleri.recipe
new file mode 100644
index 0000000000..8c50d923c5
--- /dev/null
+++ b/recipes/dunyahalleri.recipe
@@ -0,0 +1,199 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+"""www.dunyahalleri.com"""
+import locale
+import os
+import re
+import time
+
+from shutil import copyfile
+
+from calibre.ebooks.BeautifulSoup import Tag
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from PIL import Image, ImageDraw, ImageFont
+
+__license__ = 'GPL v3'
+__copyright__ = '2017, sukru alatas / alatas.org'
+
+
+class DunyaHalleri(BasicNewsRecipe):
+    title = 'Dünya Halleri'
+    description = 'Gözden Kaçanlar Rehberi'
+    timefmt = ' [%a, %d %b, %Y]'
+    publication_type = 'blog'
+    language = 'tr'
+    locale = 'tr_TR'  # for localized month names
+    simultaneous_downloads = 5
+
+    needs_subscription = False
+    scale_news_images = True
+
+    # Keep only the review body; strip sharing widgets, scripts and footers.
+    remove_tags_before = dict(name='span', attrs={'itemprop': 'reviewBody'})
+    remove_tags_after = dict(
+        name='div', attrs={'class': 'sharedaddy sd-sharing-enabled'})
+    remove_tags = [dict(name=['script', 'noscript', 'style', 'footer']),
+                   dict(attrs={'class': ['jsharedaddy sd-sharing-enabled',
+                                         'cb-sticky-sidebar', 'sharedaddy sd-sharing-enabled']}),
+                   dict(id=['jp-relatedposts', 'tldr-post-summary', 'tldr-post-summary-buttons'])]
+    encoding = 'utf_8'
+    no_stylesheets = True
+
+    extra_css = '.caption {color: #998; font-style: italic; font-size: 8pt}'
+    __author__ = 'Sukru Alatas'
+    feeds = [("Genel Gündem".decode('utf-8', 'replace'),
+              'https://www.dunyahalleri.com/genel-gundem/feed/'),
+             ("Teknoloji / Bilim".decode('utf-8', 'replace'),
+              'https://www.dunyahalleri.com/teknoloji-bilim/feed/'),
+             ("İnternet / Girişimler".decode('utf-8', 'replace'),
+              'https://www.dunyahalleri.com/internet-girisimler/feed/'),
+             ("Tasarım / İnovasyon".decode('utf-8', 'replace'),
+              'https://www.dunyahalleri.com/tasarim-inovasyon/feed/'),
+             ("Kültür / Sanat".decode('utf-8', 'replace'),
+              'https://www.dunyahalleri.com/kultur-sanat/feed/')]
+    oldest_article = 7
+    max_articles_per_feed = 50
+
+    COVER_WIDTH, COVER_HEIGHT = 590, 750
+    masthead_url = 'https://www.dunyahalleri.com/wp-content/uploads/2016/07/dh-logo-transparan.png'
+    cover_url = ''
+    cover_img_url = 'https://i0.wp.com/www.dunyahalleri.com/wp-content/uploads/2016/04/dh-favico-v2.png'
+    cover_img_path = ''
+
+    def __init__(self, *args, **kwargs):
+        BasicNewsRecipe.__init__(self, *args, **kwargs)
+        # for localized month names
+        locale.setlocale(locale.LC_TIME, self.locale)
+
+        if self.output_profile.short_name.startswith('kindle'):
+            # Reduce image sizes to get file size below amazon's email
+            # sending threshold
+            self.web2disk_options.compress_news_images = True
+            self.web2disk_options.compress_news_images_auto_size = 5
+            self.log.warn(
+                'Kindle Output profile being used, reducing image quality '
+                'to keep file size below amazon email threshold')
+
+    def preprocess_html(self, soup):
+        span = soup.findAll('span', {'itemprop': 'reviewBody'}, limit=1)[0]
+
+        # title insert
+        # NOTE: str.replace returns a new string, so the result must be
+        # assigned back or the site-name suffix is never stripped.
+        article_title = soup.title.contents[0]
+        article_title = article_title.replace(
+            ' - Dünya Halleri'.decode('utf-8', 'replace'), '')
+        h2 = Tag(soup, 'h2')
+        h2.append(article_title)
+        span.insert(0, h2)
+
+        # featured image insert
+        meta = soup.findAll('meta', {'property': 'og:image'}, limit=1)[0]
+        if meta:
+            img = Tag(soup, 'img')
+            img.attrs = [('src', meta['content'])]
+            span.insert(1, img)
+
+        # gallery normalization
+        for div in soup.findAll('div', {'itemtype': 'http://schema.org/ImageGallery'}):
+            p = Tag(soup, 'p')
+            for img in div.findAll('img'):
+                img.attrs = [(key, value)
+                             for key, value in img.attrs if key in ['src']]
+                p.append(img)
+            div.replaceWith(p)
+
+        # youtube embeded normalization
+        # this block finds the cover image for each embeded youtube video then
+        # changes it to "a href" and "img"
+        for iframe in soup.findAll('iframe'):
+            a = Tag(soup, 'a')
+            caption = Tag(soup, 'pre')
+            img = Tag(soup, 'img')
+
+            # Named group 'vid' is required below by m.group('vid').
+            m = re.match(
+                r'https\:\/\/(www\.)?youtube.com\/(embed\/|watch\?v\=)'
+                r'(?P<vid>.*?)(([\?\&].*)|$|\n)',
+                iframe['src'])
+            if m:
+                # youtube
+                img_src = 'https://img.youtube.com/vi/' + \
+                    m.group('vid') + '/0.jpg'
+                a_href = 'https://www.youtube.com/watch?v=' + m.group('vid')
+            else:
+                # not youtube
+                # default cover image for non-youtube embeded pages
+                img_src = 'http://www.warnerclassics.com/img_style/default_video_m.jpg'
+                a_href = iframe['src']
+
+            img.attrs = [('src', img_src)]
+            caption.append('Video: ' + a_href)
+            caption.attrs = [('class', 'caption')]
+            a.attrs = [('href', a_href), ('target', '_blank')]
+            a.append(img)
+            a.append(caption)
+            iframe.replaceWith(a)
+        return soup
+
+    # cover generator
+    # original version
+    # https://www.mobileread.com/forums/showpost.php?p=866553&postcount=5
+    def get_cover_img_url(self):
+        return getattr(self, 'cover_img_url', None)
+
+    def _download_cover_img(self):
+        old_cu = None
+        try:
+            old_cu = self.get_cover_url()
+        except:
+            pass
+        new_cu = self.get_cover_img_url()
+        self.cover_url = new_cu
+        self._download_cover()
+
+        outfile = os.path.join(self.output_dir, 'cover_img.jpg')
+        copyfile(self.cover_path, outfile)
+        self.cover_url = old_cu
+        self.cover_img_path = outfile
+
+    def download_cover_img(self):
+        try:
+            self._download_cover_img()
+            self.report_progress(
+                1, ('Downloaded cover to %s') % self.cover_img_path)
+        except:
+            self.log.exception('Failed to download cover img')
+            self.cover_img_path = None
+
+    def draw_text(self, draw, text, text_size, top):
+        font_path = P('fonts/liberation/LiberationSerif-Bold.ttf')
+        font = ImageFont.truetype(font_path, text_size)
+        width, height = draw.textsize(text, font=font)
+        left = max(int((self.COVER_WIDTH - width) / 2.), 0)
+        draw.text((left, top), text, fill=(0, 0, 0), font=font)
+        return height
+
+    def default_cover(self, cover_file):
+        title = self.title
+        date = time.strftime(
+            '%d %B %Y').decode('utf8', 'replace')
+        author = 'www.dunyahalleri.com'.decode('utf8', 'replace')
+        # Texts
+        img = Image.new(
+            'RGB', (self.COVER_WIDTH, self.COVER_HEIGHT), 'white')
+        draw = ImageDraw.Draw(img)
+        bottom = 15
+        bottom += self.draw_text(draw, title, 42, bottom)
+        bottom += 50
+        bottom += self.draw_text(draw, date, 32, bottom)
+        bottom += self.draw_text(draw, author, 32, self.COVER_HEIGHT - 45)
+        # Logo
+        self.download_cover_img()
+        if getattr(self, 'cover_img_path', None) is not None:
+            logo_file = self.cover_img_path
+            self.report_progress(
+                1, ('using cover img from %s') % logo_file)
+            logo = Image.open(logo_file, 'r')
+            width, height = logo.size
+            left = max(int((self.COVER_WIDTH - width) / 2.), 0)
+            top = max(int((self.COVER_HEIGHT - height) / 2.), 0)
+            img.paste(logo, (left, top))
+        img = img.convert('RGB').convert('P', palette=Image.ADAPTIVE)
+        img.convert('RGB').save(cover_file, 'JPEG')
+        cover_file.flush()
+        return True
diff --git a/recipes/dunyahalleri_haftaninozeti.recipe b/recipes/dunyahalleri_haftaninozeti.recipe
new file mode 100644
index 0000000000..bd794ea49b
--- /dev/null
+++ b/recipes/dunyahalleri_haftaninozeti.recipe
@@ -0,0 +1,266 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+"""www.dunyahalleri.com/haftanin-ozeti"""
+import locale
+import os
+import re
+
+from shutil import copyfile
+
+from contextlib import closing
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from PIL import Image, ImageDraw, ImageFont
+
+__license__ = 'GPL v3'
+__copyright__ = '2017, sukru alatas / alatas.org'
+
+
+class DunyaHalleri_HaftaninOzeti(BasicNewsRecipe):
+    title = 'Dünya Halleri - Haftanın Özeti'
+    description = ('Geçen hafta boyunca Türkiye ve dünyadan haber,'
+                   ' site, yazılım, donanım, cihaz, video ve trendler...')
+    timefmt = ' [%a, %d %b, %Y]'
+    publication_type = 'blog'
+    language = 'tr'
+    locale = 'tr_TR'  # for localized month names
+    simultaneous_downloads = 5
+
+    needs_subscription = False
+    scale_news_images = True
+
+    remove_tags_before = dict(name='section', attrs={'itemprop': 'articleBody'})
+    remove_tags_after = dict(name='div', attrs={'class': 'cb-alert cb-blue'})
+    remove_tags = [dict(name=['ol', 'h4', 'script', 'noscript', 'style', 'footer']),
+                   dict(name='h1', attrs={
+                        'class': 'entry-title cb-entry-title entry-title cb-title'}),
+                   dict(attrs={'class': ['cb-alert cb-blue', 'woo-sc-box info ',
+                                         'sharedaddy sd-sharing-enabled', 'jp-relatedposts']}),
+                   dict(id=['post-pagination', 'plp_inital_pagination'])]
+    encoding = 'utf_8'
+    no_stylesheets = True
+    INDEX = 'https://www.dunyahalleri.com/haftanin-ozeti/feed/'
+    extra_css = '.caption {color: #998; font-style: italic; font-size: 8pt}'
+    __author__ = 'Sukru Alatas'
+
+    COVER_WIDTH, COVER_HEIGHT = 590, 750
+    issue_title = ''
+    issue_date = ''
+    masthead_url = ''
+    cover_url = ''
+    cover_img_url = ''
+    cover_img_path = ''
+
+    def __init__(self, *args, **kwargs):
+        BasicNewsRecipe.__init__(self, *args, **kwargs)
+        # for localized month names
+        locale.setlocale(locale.LC_TIME, self.locale)
+
+        if self.output_profile.short_name.startswith('kindle'):
+            # Reduce image sizes to get file size below amazon's email
+            # sending threshold
+            self.web2disk_options.compress_news_images = True
+            self.web2disk_options.compress_news_images_auto_size = 5
+            self.log.warn(
+                'Kindle Output profile being used, reducing image quality '
+                'to keep file size below amazon email threshold')
+
+    # BeautifulSoup xml parser extension
+    # If you use index_to_soup with xml or rss, it outputs lots of garbage node,
+    # and change the tree for its own.
+    # This function very very similar copy of index_to_soup but it uses
+    # BeautifulStoneSoup instead of BeautifulSoup
+    def xml_to_soup(self, url_or_raw, raw=False):
+        if re.match(r'\w+://', url_or_raw):
+            br = self.clone_browser(self.browser)
+            open_func = getattr(br, 'open_novisit', br.open)
+            with closing(open_func(url_or_raw)) as f:
+                _raw = f.read()
+            if not _raw:
+                raise RuntimeError(
+                    'Could not fetch index from %s' % url_or_raw)
+        else:
+            _raw = url_or_raw
+
+        if raw:
+            return _raw
+
+        if not isinstance(_raw, unicode) and self.encoding:
+            if callable(self.encoding):
+                _raw = self.encoding(_raw)
+            else:
+                _raw = _raw.decode(self.encoding, 'replace')
+
+        from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
+        from calibre.utils.cleantext import clean_xml_chars
+
+        if isinstance(_raw, unicode):
+            _raw = strip_encoding_declarations(_raw)
+        else:
+            _raw = xml_to_unicode(
+                _raw, strip_encoding_pats=True, resolve_entities=True)[0]
+
+        _raw = clean_xml_chars(_raw)
+        return BeautifulStoneSoup(_raw)  # <== the difference
+
+    def parse_index(self):
+        from dateutil.parser import parse
+
+        # RSS parsing
+        index = self.xml_to_soup(self.INDEX)
+
+        channel = index.rss.channel
+
+        self.description = channel.description.contents[0]
+        self.masthead_url = channel.url.contents[0]
+
+        item = channel.item
+        self.issue_title = item.title.contents[0]
+        self.issue_date = parse(item.pubdate.contents[0])
+
+        base_url = item.link.contents[0]
+        cover_img_desc = BeautifulSoup(item.description.contents[0])
+        # this is necessary for cover generator
+        self.cover_img_url = cover_img_desc.img['src']
+
+        soup = self.index_to_soup(base_url)
+        articles = {}
+        key = None
+        ans = []
+
+        for li in soup.findNext('ol').findAll('li'):
+            a = li.find('a', href=True)
+
+            if not a:
+                url = base_url
+                feed = self.tag_to_string(li, use_alt=True).strip()
+                pubdate = self.issue_date.strftime('%a, %d %b')
+            else:
+                url = base_url + re.sub(r'\.\/', '', a['href'])
+                feed = self.tag_to_string(a, use_alt=True).strip()
+                pubdate = self.issue_date.strftime('%a, %d %b')
+
+            title = self.issue_title + \
+                ' (' + self.issue_date.strftime('%d %B %Y') + ')'
+
+            if not articles.has_key(feed):
+                articles[feed] = []
+                ans.append(feed)
+
+            articles[feed].append(
+                dict(title=title, url=url, date=pubdate, description='', content=''))
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans
+
+    def preprocess_html(self, soup):
+        # gallery normalization
+        for div in soup.findAll('div', {'itemtype': 'http://schema.org/ImageGallery'}):
+            p = Tag(soup, 'p')
+            for img in div.findAll('img'):
+                img.attrs = [(key, value)
+                             for key, value in img.attrs if key in ['src']]
+                p.append(img)
+            div.replaceWith(p)
+
+        # youtube embeded normalization
+        # this block finds the cover image for each embeded youtube video then
+        # changes it to "a href" and "img"
+        for iframe in soup.findAll('iframe'):
+            a = Tag(soup, 'a')
+            caption = Tag(soup, 'pre')
+            img = Tag(soup, 'img')
+
+            # Named group 'vid' is required below by m.group('vid').
+            m = re.match(
+                r'https\:\/\/(www\.)?youtube.com\/(embed\/|watch\?v\=)'
+                r'(?P<vid>.*?)(([\?\&].*)|$|\n)',
+                iframe['src'])
+            if m:
+                # youtube
+                img_src = 'https://img.youtube.com/vi/' + \
+                    m.group('vid') + '/0.jpg'
+                a_href = 'https://www.youtube.com/watch?v=' + m.group('vid')
+            else:
+                # not youtube
+                # default cover image for non-youtube embeded pages
+                img_src = 'http://www.warnerclassics.com/img_style/default_video_m.jpg'
+                a_href = iframe['src']
+
+            img.attrs = [('src', img_src)]
+            caption.append('Video: ' + a_href)
+            caption.attrs = [('class', 'caption')]
+            a.attrs = [('href', a_href), ('target', '_blank')]
+            a.append(img)
+            a.append(caption)
+            iframe.replaceWith(a)
+        return soup
+
+    # cover generator
+    # original version https://www.mobileread.com/forums/showpost.php?p=866553&postcount=5
+    def get_cover_img_url(self):
+        return getattr(self, 'cover_img_url', None)
+
+    def _download_cover_img(self):
+        old_cu = None
+        try:
+            old_cu = self.get_cover_url()
+        except:
+            pass
+        new_cu = self.get_cover_img_url()
+        self.cover_url = new_cu
+        self._download_cover()
+
+        outfile = os.path.join(self.output_dir, 'cover_img.jpg')
+        copyfile(self.cover_path, outfile)
+        self.cover_url = old_cu
+        self.cover_img_path = outfile
+
+    def download_cover_img(self):
+        try:
+            self._download_cover_img()
+            self.report_progress(
+                1, ('Downloaded cover to %s') % self.cover_img_path)
+        except:
+            self.log.exception('Failed to download cover img')
+            self.cover_img_path = None
+
+    def draw_text(self, draw, text, text_size, top):
+        font_path = P('fonts/liberation/LiberationSerif-Bold.ttf')
+        font = ImageFont.truetype(font_path, text_size)
+        width, height = draw.textsize(text, font=font)
+        left = max(int((self.COVER_WIDTH - width) / 2.), 0)
+        draw.text((left, top), text, fill=(0, 0, 0), font=font)
+        return height
+
+    def default_cover(self, cover_file):
+        title = self.issue_title
+        date = self.issue_date.strftime(
+            '%d %B %Y').decode('utf8', 'replace')
+        author = 'www.dunyahalleri.com/haftanin-ozeti'.decode(
+            'utf8', 'replace')
+        # Texts
+        img = Image.new(
+            'RGB', (self.COVER_WIDTH, self.COVER_HEIGHT), 'white')
+        draw = ImageDraw.Draw(img)
+        bottom = 15
+        bottom += self.draw_text(draw, title, 42, bottom)
+        bottom += 50
+        bottom += self.draw_text(draw, date, 32, bottom)
+        bottom += self.draw_text(draw, author, 32, self.COVER_HEIGHT - 45)
+        # Logo
+        self.download_cover_img()
+        if getattr(self, 'cover_img_path', None) is not None:
+            logo_file = self.cover_img_path
+            self.report_progress(
+                1, ('using cover img from %s') % logo_file)
+            logo = Image.open(logo_file, 'r')
+            width, height = logo.size
+            logo = logo.resize(
+                (self.COVER_WIDTH, (self.COVER_WIDTH * height / width)), Image.ANTIALIAS)
+            width, height = logo.size
+            left = max(int((self.COVER_WIDTH - width) / 2.), 0)
+            top = max(int((self.COVER_HEIGHT - height) / 2.), 0)
+            img.paste(logo, (left, top))
+        img = img.convert('RGB').convert('P', palette=Image.ADAPTIVE)
+        img.convert('RGB').save(cover_file, 'JPEG')
+        cover_file.flush()
+        return True