added dunyahalleri and dunyahalleri_haftaninozeti

Sukru Alatas 2017-09-04 01:23:11 +03:00
parent 08471c12a4
commit 85a0304bf6
2 changed files with 465 additions and 0 deletions

recipes/dunyahalleri.recipe (new file, 199 lines)

@@ -0,0 +1,199 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""www.dunyahalleri.com"""
import locale
import os
import re
import time
from shutil import copyfile
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.recipes import BasicNewsRecipe
from PIL import Image, ImageDraw, ImageFont
__license__ = 'GPL v3'
__copyright__ = '2017, sukru alatas / alatas.org'
class DunyaHalleri(BasicNewsRecipe):
title = 'Dünya Halleri'
description = 'Gözden Kaçanlar Rehberi'
timefmt = ' [%a, %d %b, %Y]'
publication_type = 'blog'
language = 'tr'
locale = 'tr_TR' # for localized month names
simultaneous_downloads = 5
needs_subscription = False
scale_news_images = True
remove_tags_before = dict(name='span', attrs={'itemprop': 'reviewBody'})
remove_tags_after = dict(
name='div', attrs={'class': 'sharedaddy sd-sharing-enabled'})
remove_tags = [dict(name=['script', 'noscript', 'style', 'footer']),
dict(attrs={'class': ['jsharedaddy sd-sharing-enabled',
'cb-sticky-sidebar', 'sharedaddy sd-sharing-enabled']}),
dict(id=['jp-relatedposts', 'tldr-post-summary', 'tldr-post-summary-buttons'])]
encoding = 'utf_8'
no_stylesheets = True
extra_css = '.caption {color: #998; font-style: italic; font-size: 8pt}'
__author__ = 'Sukru Alatas'
feeds = [("Genel Gündem".decode('utf-8', 'replace'),
'https://www.dunyahalleri.com/genel-gundem/feed/'),
("Teknoloji / Bilim".decode('utf-8', 'replace'),
'https://www.dunyahalleri.com/teknoloji-bilim/feed/'),
("İnternet / Girişimler".decode('utf-8', 'replace'),
'https://www.dunyahalleri.com/internet-girisimler/feed/'),
("Tasarım / İnovasyon".decode('utf-8', 'replace'),
'https://www.dunyahalleri.com/tasarim-inovasyon/feed/'),
("Kültür / Sanat".decode('utf-8', 'replace'), 'https://www.dunyahalleri.com/kultur-sanat/feed/')]
oldest_article = 7
max_articles_per_feed = 50
COVER_WIDTH, COVER_HEIGHT = 590, 750
masthead_url = 'https://www.dunyahalleri.com/wp-content/uploads/2016/07/dh-logo-transparan.png'
cover_url = ''
cover_img_url = 'https://i0.wp.com/www.dunyahalleri.com/wp-content/uploads/2016/04/dh-favico-v2.png'
cover_img_path = ''
def __init__(self, *args, **kwargs):
BasicNewsRecipe.__init__(self, *args, **kwargs)
# for localized month names
locale.setlocale(locale.LC_TIME, self.locale)
if self.output_profile.short_name.startswith('kindle'):
# Reduce image sizes to get file size below amazon's email
# sending threshold
self.web2disk_options.compress_news_images = True
self.web2disk_options.compress_news_images_auto_size = 5
self.log.warn(
'Kindle Output profile being used, reducing image quality '
'to keep file size below amazon email threshold')
def preprocess_html(self, soup):
span = soup.findAll('span', {'itemprop': 'reviewBody'}, limit=1)[0]
# title insert
article_title = soup.title.contents[0]
article_title = article_title.replace(' - Dünya Halleri'.decode('utf-8', 'replace'), '')
h2 = Tag(soup, 'h2')
h2.append(article_title)
span.insert(0, h2)
# featured image insert
meta = soup.find('meta', {'property': 'og:image'})
if meta:
img = Tag(soup, 'img')
img.attrs = [('src', meta['content'])]
span.insert(1, img)
# gallery normalization
for div in soup.findAll('div', {'itemtype': 'http://schema.org/ImageGallery'}):
p = Tag(soup, 'p')
for img in div.findAll('img'):
img.attrs = [(key, value)
for key, value in img.attrs if key in ['src']]
p.append(img)
div.replaceWith(p)
# youtube embed normalization
# this block finds the cover image for each embedded youtube video, then
# replaces the iframe with an "a href" and an "img"
for iframe in soup.findAll('iframe'):
a = Tag(soup, 'a')
caption = Tag(soup, 'pre')
img = Tag(soup, 'img')
m = re.match(
r'https://(www\.)?youtube\.com/(embed/|watch\?v=)'
r'(?P<vid>.*?)(([?&].*)|$|\n)',
iframe['src'])
if m:
# youtube
img_src = 'https://img.youtube.com/vi/' + \
m.group('vid') + '/0.jpg'
a_href = 'https://www.youtube.com/watch?v=' + m.group('vid')
else:
# not youtube: use a default cover image for other embedded pages
img_src = 'http://www.warnerclassics.com/img_style/default_video_m.jpg'
a_href = iframe['src']
img.attrs = [('src', img_src)]
caption.append('Video: ' + a_href)
caption.attrs = [('class', 'caption')]
a.attrs = [('href', a_href), ('target', '_blank')]
a.append(img)
a.append(caption)
iframe.replaceWith(a)
return soup
# cover generator
# original version
# https://www.mobileread.com/forums/showpost.php?p=866553&postcount=5
def get_cover_img_url(self):
return getattr(self, 'cover_img_url', None)
def _download_cover_img(self):
old_cu = None
try:
old_cu = self.get_cover_url()
except:
pass
new_cu = self.get_cover_img_url()
self.cover_url = new_cu
self._download_cover()
outfile = os.path.join(self.output_dir, 'cover_img.jpg')
copyfile(self.cover_path, outfile)
self.cover_url = old_cu
self.cover_img_path = outfile
def download_cover_img(self):
try:
self._download_cover_img()
self.report_progress(
1, ('Downloaded cover to %s') % self.cover_img_path)
except:
self.log.exception('Failed to download cover img')
self.cover_img_path = None
def draw_text(self, draw, text, text_size, top):
font_path = P('fonts/liberation/LiberationSerif-Bold.ttf')
font = ImageFont.truetype(font_path, text_size)
width, height = draw.textsize(text, font=font)
left = max(int((self.COVER_WIDTH - width) / 2.), 0)
draw.text((left, top), text, fill=(0, 0, 0), font=font)
return height
def default_cover(self, cover_file):
title = self.title
date = time.strftime(
'%d %B %Y').decode('utf8', 'replace')
author = 'www.dunyahalleri.com'.decode('utf8', 'replace')
# Texts
img = Image.new(
'RGB', (self.COVER_WIDTH, self.COVER_HEIGHT), 'white')
draw = ImageDraw.Draw(img)
bottom = 15
bottom += self.draw_text(draw, title, 42, bottom)
bottom += 50
bottom += self.draw_text(draw, date, 32, bottom)
bottom += self.draw_text(draw, author, 32, self.COVER_HEIGHT - 45)
# Logo
self.download_cover_img()
if getattr(self, 'cover_img_path', None) is not None:
logo_file = self.cover_img_path
self.report_progress(
1, ('using cover img from %s') % logo_file)
logo = Image.open(logo_file, 'r')
width, height = logo.size
left = max(int((self.COVER_WIDTH - width) / 2.), 0)
top = max(int((self.COVER_HEIGHT - height) / 2.), 0)
img.paste(logo, (left, top))
img = img.convert('RGB').convert('P', palette=Image.ADAPTIVE)
img.convert('RGB').save(cover_file, 'JPEG')
cover_file.flush()
return True
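
Side note (not part of the recipe above): the iframe handling in preprocess_html rests entirely on that one regular expression, so the following is a minimal standalone sketch of what it extracts. The two sample URLs and the names YT_RE and src are invented for illustration only.

#!/usr/bin/env python2
# Standalone sketch: what the YouTube regex used in preprocess_html() yields
# for an illustrative embed URL and for a non-YouTube embed.
import re

YT_RE = re.compile(
    r'https://(www\.)?youtube\.com/(embed/|watch\?v=)'
    r'(?P<vid>.*?)(([?&].*)|$|\n)')

for src in ('https://www.youtube.com/embed/dQw4w9WgXcQ?rel=0',   # sample YouTube embed
            'https://player.vimeo.com/video/76979871'):          # sample non-YouTube embed
    m = YT_RE.match(src)
    if m:
        # same thumbnail / watch-page scheme the recipe builds
        print('https://img.youtube.com/vi/' + m.group('vid') + '/0.jpg')
        print('https://www.youtube.com/watch?v=' + m.group('vid'))
    else:
        print('not a youtube embed, keeping original src: ' + src)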

recipes/dunyahalleri_haftaninozeti.recipe (new file, 266 lines)

@@ -0,0 +1,266 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""www.dunyahalleri.com/haftanin-ozeti"""
import locale
import os
import re
from shutil import copyfile
from contextlib import closing
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag
from calibre.web.feeds.recipes import BasicNewsRecipe
from PIL import Image, ImageDraw, ImageFont
__license__ = 'GPL v3'
__copyright__ = '2017, sukru alatas / alatas.org'
class DunyaHalleri_HaftaninOzeti(BasicNewsRecipe):
title = 'Dünya Halleri - Haftanın Özeti'
description = ('Geçen hafta boyunca Türkiye ve dünyadan haber,'
' site, yazılım, donanım, cihaz, video ve trendler...')
timefmt = ' [%a, %d %b, %Y]'
publication_type = 'blog'
language = 'tr'
locale = 'tr_TR' # for localized month names
simultaneous_downloads = 5
needs_subscription = False
scale_news_images = True
remove_tags_before = dict(name='section', attrs={'itemprop': 'articleBody'})
remove_tags_after = dict(name='div', attrs={'class': 'cb-alert cb-blue'})
remove_tags = [dict(name=['ol', 'h4', 'script', 'noscript', 'style', 'footer']),
dict(name='h1', attrs={
'class': 'entry-title cb-entry-title entry-title cb-title'}),
dict(attrs={'class': ['cb-alert cb-blue', 'woo-sc-box info ',
'sharedaddy sd-sharing-enabled', 'jp-relatedposts']}),
dict(id=['post-pagination', 'plp_inital_pagination'])]
encoding = 'utf_8'
no_stylesheets = True
INDEX = 'https://www.dunyahalleri.com/haftanin-ozeti/feed/'
extra_css = '.caption {color: #998; font-style: italic; font-size: 8pt}'
__author__ = 'Sukru Alatas'
COVER_WIDTH, COVER_HEIGHT = 590, 750
issue_title = ''
issue_date = ''
masthead_url = ''
cover_url = ''
cover_img_url = ''
cover_img_path = ''
def __init__(self, *args, **kwargs):
BasicNewsRecipe.__init__(self, *args, **kwargs)
# for localized month names
locale.setlocale(locale.LC_TIME, self.locale)
if self.output_profile.short_name.startswith('kindle'):
# Reduce image sizes to get file size below amazon's email
# sending threshold
self.web2disk_options.compress_news_images = True
self.web2disk_options.compress_news_images_auto_size = 5
self.log.warn(
'Kindle Output profile being used, reducing image quality '
'to keep file size below amazon email threshold')
# BeautifulSoup XML parser extension
# If index_to_soup is used with XML or RSS, it produces lots of garbage nodes
# and rearranges the tree on its own.
# This function is a very close copy of index_to_soup, but it uses
# BeautifulStoneSoup instead of BeautifulSoup
def xml_to_soup(self, url_or_raw, raw=False):
if re.match(r'\w+://', url_or_raw):
br = self.clone_browser(self.browser)
open_func = getattr(br, 'open_novisit', br.open)
with closing(open_func(url_or_raw)) as f:
_raw = f.read()
if not _raw:
raise RuntimeError(
'Could not fetch index from %s' % url_or_raw)
else:
_raw = url_or_raw
if raw:
return _raw
if not isinstance(_raw, unicode) and self.encoding:
if callable(self.encoding):
_raw = self.encoding(_raw)
else:
_raw = _raw.decode(self.encoding, 'replace')
from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
from calibre.utils.cleantext import clean_xml_chars
if isinstance(_raw, unicode):
_raw = strip_encoding_declarations(_raw)
else:
_raw = xml_to_unicode(
_raw, strip_encoding_pats=True, resolve_entities=True)[0]
_raw = clean_xml_chars(_raw)
return BeautifulStoneSoup(_raw) # <== the difference
def parse_index(self):
from dateutil.parser import parse
# RSS parsing
index = self.xml_to_soup(self.INDEX)
channel = index.rss.channel
self.description = channel.description.contents[0]
self.masthead_url = channel.url.contents[0]
item = channel.item
self.issue_title = item.title.contents[0]
self.issue_date = parse(item.pubdate.contents[0])
base_url = item.link.contents[0]
cover_img_desc = BeautifulSoup(item.description.contents[0])
# this is necessary for cover generator
self.cover_img_url = cover_img_desc.img['src']
soup = self.index_to_soup(base_url)
articles = {}
key = None
ans = []
for li in soup.findNext('ol').findAll('li'):
a = li.find('a', href=True)
if not a:
url = base_url
feed = self.tag_to_string(li, use_alt=True).strip()
pubdate = self.issue_date.strftime('%a, %d %b')
else:
url = base_url + re.sub(r'\.\/', '', a['href'])
feed = self.tag_to_string(a, use_alt=True).strip()
pubdate = self.issue_date.strftime('%a, %d %b')
title = self.issue_title + \
' (' + self.issue_date.strftime('%d %B %Y') + ')'
if feed not in articles:
articles[feed] = []
ans.append(feed)
articles[feed].append(
dict(title=title, url=url, date=pubdate, description='', content=''))
ans = [(key, articles[key]) for key in ans if key in articles]
return ans
def preprocess_html(self, soup):
# gallery normalization
for div in soup.findAll('div', {'itemtype': 'http://schema.org/ImageGallery'}):
p = Tag(soup, 'p')
for img in div.findAll('img'):
img.attrs = [(key, value)
for key, value in img.attrs if key in ['src']]
p.append(img)
div.replaceWith(p)
# youtube embed normalization
# this block finds the cover image for each embedded youtube video, then
# replaces the iframe with an "a href" and an "img"
for iframe in soup.findAll('iframe'):
a = Tag(soup, 'a')
caption = Tag(soup, 'pre')
img = Tag(soup, 'img')
m = re.match(
r'https://(www\.)?youtube\.com/(embed/|watch\?v=)'
r'(?P<vid>.*?)(([?&].*)|$|\n)',
iframe['src'])
if m:
# youtube
img_src = 'https://img.youtube.com/vi/' + \
m.group('vid') + '/0.jpg'
a_href = 'https://www.youtube.com/watch?v=' + m.group('vid')
else:
# not youtube: use a default cover image for other embedded pages
img_src = 'http://www.warnerclassics.com/img_style/default_video_m.jpg'
a_href = iframe['src']
img.attrs = [('src', img_src)]
caption.append('Video: ' + a_href)
caption.attrs = [('class', 'caption')]
a.attrs = [('href', a_href), ('target', '_blank')]
a.append(img)
a.append(caption)
iframe.replaceWith(a)
return soup
# cover generator
# original version https://www.mobileread.com/forums/showpost.php?p=866553&postcount=5
def get_cover_img_url(self):
return getattr(self, 'cover_img_url', None)
def _download_cover_img(self):
old_cu = None
try:
old_cu = self.get_cover_url()
except:
pass
new_cu = self.get_cover_img_url()
self.cover_url = new_cu
self._download_cover()
outfile = os.path.join(self.output_dir, 'cover_img.jpg')
copyfile(self.cover_path, outfile)
self.cover_url = old_cu
self.cover_img_path = outfile
def download_cover_img(self):
try:
self._download_cover_img()
self.report_progress(
1, ('Downloaded cover to %s') % self.cover_img_path)
except:
self.log.exception('Failed to download cover img')
self.cover_img_path = None
def draw_text(self, draw, text, text_size, top):
font_path = P('fonts/liberation/LiberationSerif-Bold.ttf')
font = ImageFont.truetype(font_path, text_size)
width, height = draw.textsize(text, font=font)
left = max(int((self.COVER_WIDTH - width) / 2.), 0)
draw.text((left, top), text, fill=(0, 0, 0), font=font)
return height
def default_cover(self, cover_file):
title = self.issue_title
date = self.issue_date.strftime(
'%d %B %Y').decode('utf8', 'replace')
author = 'www.dunyahalleri.com/haftanin-ozeti'.decode(
'utf8', 'replace')
# Texts
img = Image.new(
'RGB', (self.COVER_WIDTH, self.COVER_HEIGHT), 'white')
draw = ImageDraw.Draw(img)
bottom = 15
bottom += self.draw_text(draw, title, 42, bottom)
bottom += 50
bottom += self.draw_text(draw, date, 32, bottom)
bottom += self.draw_text(draw, author, 32, self.COVER_HEIGHT - 45)
# Logo
self.download_cover_img()
if getattr(self, 'cover_img_path', None) is not None:
logo_file = self.cover_img_path
self.report_progress(
1, ('using cover img from %s') % logo_file)
logo = Image.open(logo_file, 'r')
width, height = logo.size
logo = logo.resize(
(self.COVER_WIDTH, (self.COVER_WIDTH * height / width)), Image.ANTIALIAS)
width, height = logo.size
left = max(int((self.COVER_WIDTH - width) / 2.), 0)
top = max(int((self.COVER_HEIGHT - height) / 2.), 0)
img.paste(logo, (left, top))
img = img.convert('RGB').convert('P', palette=Image.ADAPTIVE)
img.convert('RGB').save(cover_file, 'JPEG')
cover_file.flush()
return True
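
Side note on the structure parse_index() builds in the second recipe: BasicNewsRecipe expects it to return a list of (section title, article list) pairs, each article being a dict with title, url, date and description keys. The sketch below only illustrates that shape; the section name, URL and date values are invented placeholders, not data from the site.

# Shape of the value parse_index() returns above (placeholder values only).
sample_index = [
    ('Genel Gundem',                                 # section title taken from the <li>/<a> text
     [dict(title='Haftanin Ozeti (4 Eylul 2017)',    # issue title plus localized date
           url='https://www.dunyahalleri.com/haftanin-ozeti/placeholder/',  # hypothetical URL
           date='Mon, 04 Sep',                       # issue_date.strftime('%a, %d %b')
           description='',
           content='')]),
]

While developing, either recipe can be exercised quickly with calibre's ebook-convert in --test mode, which fetches only a couple of articles per feed instead of the full issue.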