#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""www.dunyahalleri.com/haftanin-ozeti"""

import locale
import os
import re
from shutil import copyfile
from contextlib import closing

from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag
from calibre.web.feeds.recipes import BasicNewsRecipe
from PIL import Image, ImageDraw, ImageFont

__license__ = 'GPL v3'
__copyright__ = '2017, sukru alatas / alatas.org'


class DunyaHalleri_HaftaninOzeti(BasicNewsRecipe):
    title = 'Dünya Halleri - Haftanın Özeti'
    description = ('Geçen hafta boyunca Türkiye ve dünyadan haber,'
                   ' site, yazılım, donanım, cihaz, video ve trendler...')
    timefmt = ' [%a, %d %b, %Y]'
    publication_type = 'blog'
    language = 'tr'
    locale = 'tr_TR'  # for localized month names
    simultaneous_downloads = 5
    needs_subscription = False
    scale_news_images = True

    remove_tags_before = dict(name='section', attrs={'itemprop': 'articleBody'})
    remove_tags_after = dict(name='div', attrs={'class': 'cb-alert cb-blue'})
    remove_tags = [
        dict(name=['ol', 'h4', 'script', 'noscript', 'style', 'footer']),
        dict(name='h1',
             attrs={'class': 'entry-title cb-entry-title entry-title cb-title'}),
        dict(attrs={'class': ['cb-alert cb-blue', 'woo-sc-box info ',
                              'sharedaddy sd-sharing-enabled',
                              'jp-relatedposts']}),
        dict(id=['post-pagination', 'plp_inital_pagination'])
    ]

    encoding = 'utf_8'
    no_stylesheets = True
    INDEX = 'https://www.dunyahalleri.com/haftanin-ozeti/feed/'
    extra_css = '.caption {color: #998; font-style: italic; font-size: 8pt}'
    __author__ = 'Sukru Alatas'

    COVER_WIDTH, COVER_HEIGHT = 590, 750

    issue_title = ''
    issue_date = ''
    masthead_url = ''
    cover_url = ''
    cover_img_url = ''
    cover_img_path = ''

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        # for localized month names
        locale.setlocale(locale.LC_TIME, self.locale)

        if self.output_profile.short_name.startswith('kindle'):
            # Reduce image sizes to keep the file size below Amazon's
            # email-delivery threshold
            self.web2disk_options.compress_news_images = True
            self.web2disk_options.compress_news_images_auto_size = 5
            self.log.warn(
                'Kindle output profile in use; reducing image quality '
                'to keep the file size below the Amazon email threshold')

    # BeautifulSoup XML parser extension.
    # Feeding XML or RSS to index_to_soup produces lots of garbage nodes and
    # reshapes the tree to fit HTML. This method is a near-verbatim copy of
    # index_to_soup that parses with BeautifulStoneSoup (an XML parser)
    # instead of BeautifulSoup.
    def xml_to_soup(self, url_or_raw, raw=False):
        if re.match(r'\w+://', url_or_raw):
            br = self.clone_browser(self.browser)
            open_func = getattr(br, 'open_novisit', br.open)
            with closing(open_func(url_or_raw)) as f:
                _raw = f.read()
            if not _raw:
                raise RuntimeError(
                    'Could not fetch index from %s' % url_or_raw)
        else:
            _raw = url_or_raw
        if raw:
            return _raw
        if not isinstance(_raw, unicode) and self.encoding:
            if callable(self.encoding):
                _raw = self.encoding(_raw)
            else:
                _raw = _raw.decode(self.encoding, 'replace')
        from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
        from calibre.utils.cleantext import clean_xml_chars
        if isinstance(_raw, unicode):
            _raw = strip_encoding_declarations(_raw)
        else:
            _raw = xml_to_unicode(
                _raw, strip_encoding_pats=True, resolve_entities=True)[0]
        _raw = clean_xml_chars(_raw)
        return BeautifulStoneSoup(_raw)  # <== the difference

    def parse_index(self):
        from dateutil.parser import parse

        # RSS parsing
        index = self.xml_to_soup(self.INDEX)
        channel = index.rss.channel
        self.description = channel.description.contents[0]
        self.masthead_url = channel.url.contents[0]

        item = channel.item
        self.issue_title = item.title.contents[0]
        self.issue_date = parse(item.pubdate.contents[0])
        base_url = item.link.contents[0]

        cover_img_desc = BeautifulSoup(item.description.contents[0])
        # this is necessary for the cover generator
        self.cover_img_url = cover_img_desc.img['src']

        soup = self.index_to_soup(base_url)

        articles = {}
        ans = []
        for li in soup.findNext('ol').findAll('li'):
            a = li.find('a', href=True)
            if not a:
                url = base_url
                feed = self.tag_to_string(li, use_alt=True).strip()
            else:
                url = base_url + re.sub(r'\.\/', '', a['href'])
                feed = self.tag_to_string(a, use_alt=True).strip()
            pubdate = self.issue_date.strftime('%a, %d %b')
            title = self.issue_title + \
                ' (' + self.issue_date.strftime('%d %B %Y') + ')'
            if feed not in articles:
                articles[feed] = []
                ans.append(feed)
            articles[feed].append(
                dict(title=title, url=url, date=pubdate,
                     description='', content=''))
        ans = [(k, articles[k]) for k in ans if k in articles]
        return ans

    def preprocess_html(self, soup):
        # gallery normalization
        for div in soup.findAll('div',
                                {'itemtype': 'http://schema.org/ImageGallery'}):
            p = Tag(soup, 'p')
            for img in div.findAll('img'):
                img.attrs = [(key, value) for key, value in img.attrs
                             if key in ['src']]
                p.append(img)
            div.replaceWith(p)

        # embedded YouTube normalization
        # This block finds the cover image for each embedded YouTube video,
        # then replaces the iframe with an "a href" wrapping an "img"
        for iframe in soup.findAll('iframe'):
            a = Tag(soup, 'a')
            caption = Tag(soup, 'pre')
            img = Tag(soup, 'img')

            m = re.match(
                r'https\:\/\/(www\.)?youtube.com\/(embed\/|watch\?v\=)'
                r'(?P<vid>.*?)(([\?\&].*)|$|\n)', iframe['src'])
            if m:
                # youtube
                img_src = 'https://img.youtube.com/vi/' + \
                    m.group('vid') + '/0.jpg'
                a_href = 'https://www.youtube.com/watch?v=' + m.group('vid')
            else:
                # not youtube
                # default cover image for non-youtube embedded pages
                img_src = 'http://www.warnerclassics.com/img_style/default_video_m.jpg'
                a_href = iframe['src']

            img.attrs = [('src', img_src)]
            caption.append('Video: ' + a_href)
            caption.attrs = [('class', 'caption')]
            a.attrs = [('href', a_href), ('target', '_blank')]
            a.append(img)
            a.append(caption)
            iframe.replaceWith(a)
        return soup
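    # For reference, the iframe normalization above rewrites an embed such as
    #   <iframe src="https://www.youtube.com/embed/VIDEO_ID"></iframe>
    # into roughly the following (VIDEO_ID is a placeholder, not a real id):
    #   <a href="https://www.youtube.com/watch?v=VIDEO_ID" target="_blank">
    #     <img src="https://img.youtube.com/vi/VIDEO_ID/0.jpg"/>
    #     <pre class="caption">Video: https://www.youtube.com/watch?v=VIDEO_ID</pre>
    #   </a>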
    # cover generator
    # original version:
    # https://www.mobileread.com/forums/showpost.php?p=866553&postcount=5
    def get_cover_img_url(self):
        return getattr(self, 'cover_img_url', None)

    def _download_cover_img(self):
        old_cu = None
        try:
            old_cu = self.get_cover_url()
        except:
            pass
        new_cu = self.get_cover_img_url()
        self.cover_url = new_cu
        self._download_cover()

        outfile = os.path.join(self.output_dir, 'cover_img.jpg')
        copyfile(self.cover_path, outfile)

        self.cover_url = old_cu
        self.cover_img_path = outfile

    def download_cover_img(self):
        try:
            self._download_cover_img()
            self.report_progress(
                1, ('Downloaded cover to %s') % self.cover_img_path)
        except:
            self.log.exception('Failed to download cover img')
            self.cover_img_path = None

    def draw_text(self, draw, text, text_size, top):
        font_path = P('fonts/liberation/LiberationSerif-Bold.ttf')
        font = ImageFont.truetype(font_path, text_size)
        width, height = draw.textsize(text, font=font)
        left = max(int((self.COVER_WIDTH - width) / 2.), 0)
        draw.text((left, top), text, fill=(0, 0, 0), font=font)
        return height

    def default_cover(self, cover_file):
        title = self.issue_title
        date = self.issue_date.strftime('%d %B %Y').decode('utf8', 'replace')
        author = 'www.dunyahalleri.com/haftanin-ozeti'.decode('utf8', 'replace')

        # Texts
        img = Image.new('RGB', (self.COVER_WIDTH, self.COVER_HEIGHT), 'white')
        draw = ImageDraw.Draw(img)
        bottom = 15
        bottom += self.draw_text(draw, title, 42, bottom)
        bottom += 50
        bottom += self.draw_text(draw, date, 32, bottom)
        bottom += self.draw_text(draw, author, 32, self.COVER_HEIGHT - 45)

        # Logo
        self.download_cover_img()
        if getattr(self, 'cover_img_path', None) is not None:
            logo_file = self.cover_img_path
            self.report_progress(1, ('using cover img from %s') % logo_file)
            logo = Image.open(logo_file, 'r')
            width, height = logo.size
            logo = logo.resize(
                (self.COVER_WIDTH, self.COVER_WIDTH * height / width),
                Image.ANTIALIAS)
            width, height = logo.size
            left = max(int((self.COVER_WIDTH - width) / 2.), 0)
            top = max(int((self.COVER_HEIGHT - height) / 2.), 0)
            img.paste(logo, (left, top))

        img = img.convert('RGB').convert('P', palette=Image.ADAPTIVE)
        img.convert('RGB').save(cover_file, 'JPEG')
        cover_file.flush()
        return True
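
# To try this recipe locally (assuming a calibre install is available on the
# command line; the .recipe filename below is illustrative), convert it
# directly. The --test flag fetches only a couple of articles per feed:
#   ebook-convert dunya_halleri_haftanin_ozeti.recipe output.epub --test -vv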