#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2017, Kovid Goyal

'''
private-eye.co.uk
'''

import re
from datetime import datetime, timedelta

from calibre.ebooks.BeautifulSoup import Comment, Tag
from calibre.web.feeds.news import BasicNewsRecipe


def get_classes(tag):
    # Return a tag's classes as a list, whether BeautifulSoup supplies them
    # as a string or as an iterable
    ans = tag.get('class') or ()
    if hasattr(ans, 'split'):
        ans = ans.split()
    return list(ans)


class PrivateEyeRecipe(BasicNewsRecipe):
    title = 'Private Eye Online'
    title_with_date = 'Private Eye Online'
    title_author = 'Private Eye'
    __author__ = 'Sophist at sodalis.co.uk'
    version = 2.10
    issue_no = ''
    description = '''Private Eye is a fortnightly British satirical news and current affairs magazine,\
 edited by Ian Hislop, offering a unique blend of humour, social and political observations and\
 investigative journalism. This e-book is a download of the online edition. The full edition is\
 available only on subscription.'''
    publication_type = 'magazine'
    language = 'en'
    encoding = 'utf-8'

    DOMAIN = 'http://www.private-eye.co.uk/'
    INDEX = DOMAIN + 'current-issue'
    oldest_article = 13
    max_articles_per_feed = 100
    remove_javascript = True
    ignore_duplicate_articles = {'url'}

    conversion_options = {
        'authors': title_author,
        'author_sort': title_author,
        'smarten_punctuation': True,
        'series': title,
        'publisher': title_author,
    }

    remove_tags_before = [
        {'id': 'story', 'class': 'article'},
        {'id': 'page'},
    ]
    remove_tags_after = [
        {'class': 'section'},
    ]
    remove_tags = [
        dict(name='div', attrs={'class': 'sub-nav-bar'}),
        dict(name='img', attrs={'class': 'about-covers'}),
        dict(name='div', attrs={'id': 'follow-us', 'class': 'text'}),
        dict(name='span', attrs={'class': 'section'}),
    ]

    preprocess_regexps = [
        # Rewrite relative image paths into absolute URLs on the site
        (re.compile(r'\.\./grfx', re.DOTALL | re.IGNORECASE),
         lambda match: 'http://www.private-eye.co.uk/grfx'),
    ]

    def fix_url(self, url):
        # Convert a site-relative URL into an absolute one
        if (url.startswith('//') or url.startswith('http://') or
                url.startswith('https://')):
            return url
        if url.startswith('/'):
            url = self.DOMAIN + url[1:]
        elif url.startswith('../'):
            url = self.DOMAIN + url[3:]
        else:
            url = self.DOMAIN + url
        return url

    urls = []
    edition_date = ''

    def add_article(self, title, url, description='', date=None):
        if date is None:
            date = self.edition_date
        if url and url not in self.urls:
            self.urls.append(url)
            self.log.info(
                'Page added: %s: %s: %s (%s)' % (date, title, description, url))
            self.current_articles.append({
                'title': title,
                'url': url,
                'description': description,
                'date': date,
            })

    def page_index_append(self, section):
        if self.current_articles:
            self.page_index.append((section, self.current_articles))
            self.current_articles = []

    # Process the index page to get the content for the e-book
    def parse_index(self):
        self.log.info(
            'Private Eye: v%s, Parse Index: %s' % (self.version, self.INDEX))
        self.page_index = []

        soup = self.index_to_soup(self.INDEX)

        # Strip HTML comments so they cannot confuse the tag matching below
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Get masthead URL
        masthead = soup.find('img', id='site-logo')
        if masthead:
            self.masthead_url = self.fix_url(masthead['src'])
            self.log.debug('Masthead found: %s' % self.masthead_url)
        else:
            self.log.warning('Masthead not found.')

        soup = soup.find('div', id='content')

        # Get cover image
        for img in soup.findAll('img', {'class': 'current-issue'}):
            if img['src'].endswith('_big.jpg'):
                self.cover_url = img['src']
                filename = img['src'].split('/')[-1]
                self.issue_no = filename.replace('_big.jpg', '')
                self.log.debug('Cover image found. Issue: %s' % self.issue_no)
                break
        else:
            self.log.warning('Cover image NOT found')
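
        # Note: the `else` clauses on the `for` loops above and below use
        # Python's for/else idiom - they run only when the loop finishes
        # without hitting `break`, i.e. when nothing matched.
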
        # Get the publication cover date: the page advertises the date of
        # the *next* issue, so step back to get this issue's cover date
        for tag in soup.findAll('span', {'class': 'only-smallest'}):
            tag_contents = tag.contents
            if tag_contents[0].string.lower().split()[:2] == ['next', 'issue']:
                try:
                    day, month, year = tag_contents[2].split()
                    day = ''.join(c for c in day if c.isdigit())
                    date = datetime.strptime(
                        ' '.join((day, month, year)), '%d %B %Y')
                    date -= timedelta(days=11)
                    self.edition_date = datetime.strftime(
                        date, '%d %B %Y').lstrip('0')
                    self.log.debug('Publication date: %s' % self.edition_date)
                    self.title_with_date = self.title + datetime.strftime(
                        date, ' %Y-%m-%d')
                    break
                except Exception:
                    self.log.warning(
                        'Invalid publication date: %s' % tag.contents[2])
        else:
            self.log.warning('Publication date not found')

        # Online articles
        online = soup.find('div', {'id': 'block-left'})

        headline = online.find('span', {'class': 'headline'})
        if headline:
            current_section = headline.string
            self.log.debug('Headline found: %s' % current_section)
        else:
            current_section = 'Online Edition'
            self.log.warning('Headline not found: default used')

        self.current_articles = []
        title, url, descriptions = '', '', []
        for piece in online.contents:
            if isinstance(piece, Tag):
                tag_class = piece.name, ' '.join(get_classes(piece))
                if tag_class == ('span', 'header'):
                    # A new section heading: file the articles collected so far
                    self.page_index_append(current_section)
                    current_section = piece.string
                elif tag_class == ('a', 'header'):
                    # A new article link: file the previous article first
                    self.add_article(title, url, '\r\n'.join(descriptions))
                    title = self.tag_to_string(piece).rstrip(u' »').strip()
                    url = self.fix_url(piece.get('href', ''))
                    descriptions = []
                else:
                    self.add_article(title, url, '\r\n'.join(descriptions))
                    title, url, descriptions = '', '', []
            else:
                # Bare text between tags becomes part of the description
                desc = piece.strip(' \r\n')
                if desc:
                    descriptions.append(desc)
        self.add_article(title, url, '\r\n'.join(descriptions))

        self.add_article(
            'Number Crunching', self.DOMAIN + 'number-crunching', '')
        self.page_index_append(current_section)

        # Process More From This Issue (crossword etc.)
        current_section = ''
        self.current_articles = []
        title, url, descriptions = '', '', []

        # Remove gap-* spacer elements
        for gap in soup.findAll(attrs={'class': True}):
            classes = get_classes(gap)
            for c in classes:
                if c.startswith('gap-'):
                    gap.extract()
                    break

        # Find more items
        more = soup.find('span', {'class': 'section'})
        current_section = more.string
        more = more.findNextSibling()
        while more is not None and more.name == 'div' and get_classes(more) == ['box-contents']:
            title_tag = more.find('a', {'class': 'header-home'})
            if title_tag:
                title = title_tag.string
                if not url:
                    url = self.fix_url(title_tag.get('href', ''))
            desc_tag = more.find('a', {'class': 'header'})
            if desc_tag:
                descriptions.append(self.tag_to_string(desc_tag))
                if not url:
                    url = self.fix_url(desc_tag.get('href', ''))
            self.add_article(title, url, '\r\n'.join(descriptions))
            title, url, descriptions = '', '', []
            more = more.findNextSibling()
        self.page_index_append(current_section)

        # Add the PE About Us page.
        self.add_article(
            'About Private Eye', self.DOMAIN + 'about',
            """Private Eye is the UK's number one best-selling news and current affairs magazine, edited\
 by Ian Hislop. It offers a unique blend of humour, social and political observations and\
 investigative journalism. Published fortnightly, the magazine is read by over 700,000 readers\
 and costs just £1.80 an issue.""",
            date='')
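
        # parse_index() must return a list of (section_title, article_list)
        # tuples; each article is a dict with 'title', 'url', 'description'
        # and 'date' keys, and calibre builds the book's sections and table
        # of contents from this structure.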
        self.page_index_append('About Private Eye')

        self.log.info('Private Eye: Parse Index complete')
        return self.page_index

    def preprocess_html(self, soup):
        # Make sure that image links point to absolute web addresses
        for figure in soup.findAll(
                'a',
                attrs={'href': lambda x: x and (
                    x.endswith('.jpg') or
                    x.endswith('.png') or
                    x.endswith('.gif'))}):
            figure['href'] = self.fix_url(figure['href'])
        return soup

    def postprocess_book(self, oeb, opts, log):
        # Overwrite the book metadata with the issue-specific values
        # collected in parse_index()
        m = oeb.metadata
        m.clear('title')
        m.add('title', self.title_with_date)
        m.clear('authors')
        m.add('authors', self.title_author)
        m.clear('author_sort')
        m.add('author_sort', self.title_author)
        m.clear('series')
        m.add('series', self.title)
        m.clear('series_index')
        m.add('series_index', self.issue_no)
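
# A minimal sketch of how to build and test this recipe from the command
# line, assuming calibre's CLI tools are installed (the file name below is
# illustrative - any name ending in .recipe works):
#
#   ebook-convert private_eye_online.recipe output.epub --test -vv
#
# --test restricts the download to a handful of articles, which is the
# quickest way to exercise parse_index() above; -vv echoes the messages
# logged via self.log.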