'''
Fetch Private Eye (Online Edition)
'''

import re
from datetime import datetime, timedelta

from calibre.web.feeds.news import BasicNewsRecipe


class PrivateEyeRecipe(BasicNewsRecipe):
    '''
    Recipe that builds an e-book from the Private Eye web site.

    The home page is used both as the index of articles (see parse_index)
    and as the source of the cover image and series number (see
    get_cover_url). Metadata discovered while downloading is written into
    conversion_options.
    '''
    ##
    # Last Edited:  2025-04-02
    #
    # Remark:   Version 3.2 2025-04-02
    #               Fix recipe after web-site changes
    #           Version 3.1 2023-07-14
    #               Show crossword on right so clues are continuous down left
    #               Link to crossword image removed
    #               Improve many image layouts
    #           Version 3.0 2023-07-01
    #               Rewrite (by Sophist-UK) to fit latest web pages, correctly identify pages to include
    #               and improve formatting.
    #               Edited to add: inclusion of About page,
    #               identifying series number and publication date and setting metadata.
    #

    # NOTE: `title` is assigned again further down (together with `series`);
    # the later assignment is the one that is in effect once the class body
    # has finished executing.
    title = u'Private Eye (Online Edition)'
    description = u'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop'
    publication_type = 'magazine'
    language = 'en_GB'
    encoding = 'utf-8'
    oldest_article = 13
    max_articles_per_feed = 100
    remove_javascript = True
    ignore_duplicate_articles = {'url'}

    __author__ = u'Martyn Pritchard & Sophist-UK'
    __copyright__ = '2020-2025, Martyn Pritchard & Sophist-UK '

    base_url = 'https://www.private-eye.co.uk/'
    current_issue = 'https://www.private-eye.co.uk/'
    about_page = 'https://www.private-eye.co.uk/about'
    masthead_url = 'https://www.private-eye.co.uk/grfx/logos/logo-new.png'
    author = 'Private Eye'
    series = title = 'Private Eye Online'

    # Defaults; 'series_index', 'pubdate', 'title' and 'title_sort' are
    # overwritten by get_cover_url() / parse_index() when the values can be
    # read from the home page.
    conversion_options = {
        'authors': author,
        'author_sort': author,
        'series': series,
        'series_index': 0,
        'title': title,
        'title_sort': title,
    }

    # Attributes identifying the <div> panels on the home page whose links
    # are collected as articles.
    index_attrs_to_include = [
        {'class': 'footer-block'},
        {'id': 'top-stories'},
        {'id': 'home-content'},
        {'id': 'home-color-content'},
    ]

    # Link texts that are navigation rather than articles.
    titles_to_skip = [
        'Home',
        'more',
        'In This Issue',
    ]

    # Last path component of an article URL -> title to use for that article.
    url_to_section_name = {
        'hp-sauce': 'HP Sauce',
        'in-the-back': 'In the Back',
        'street-of-shame': 'Street of Shame',
        'cartoons': 'Strips and Cartoons',
        'lookalikes': 'Lookalike',
        'number-crunching': 'Number Crunching',
        'mediaballs': 'Dumb Britain',
        'crossword': 'Eye Crossword',
    }

    def get_cover_url(self):
        '''
        Return the URL of the current issue's large cover image, or None.

        The image file name (before the '_big.jpg' suffix) is expected to be
        the issue number; when it parses as an integer it is stored as
        conversion_options['series_index']. Images whose name is not numeric
        are skipped.
        '''
        suffix = '_big.jpg'
        soup = self.index_to_soup(self.current_issue)

        for img in soup.findAll('img', {'class': 'issue-cover'}):
            # An <img> without a src attribute cannot be a cover.
            src = img.get('src') or ''
            if not src.endswith(suffix):
                continue
            # [-1] copes with a src that contains no '/' at all.
            file_name = src.rsplit('/', 1)[-1]
            try:
                series_index = int(file_name[:-len(suffix)])
            except (TypeError, ValueError):
                # wrong big image
                continue
            self.conversion_options.update({'series_index': series_index})
            self.log('series-index:', self.conversion_options['series_index'])
            return src
        return None

    def _update_publication_metadata(self, soup):
        '''
        Set pubdate/title/title_sort in conversion_options from the home page.

        The publication date is taken to be the "next issue on sale" date
        minus 12 days. This is best-effort: if the date cannot be found or
        parsed, a message is logged and conversion_options is left unchanged.
        '''
        issue_box = soup.find('div', attrs={'id': 'issue-box'})
        label = None
        if issue_box is not None:
            label = issue_box.find(text=re.compile(r'NEXT\s+ISSUE'))
        if label is None:
            self.log('Cannot find next issue date on:', self.current_issue)
            return

        try:
            # The date is read from the last child of the element that
            # holds the "NEXT ISSUE" label.
            next_issue_text = label.parent.contents[-1].strip()
        except (AttributeError, IndexError, TypeError):
            self.log('Cannot find next issue date on:', self.current_issue)
            return
        self.log('next_issue_text:', next_issue_text)

        try:
            day, month, year = next_issue_text.split()
            # remove day suffixes e.g. 2nd
            day = ''.join(c for c in day if c.isdigit())
            pub_date = datetime.strptime(' '.join((day, month, year)), '%d %B %Y') - timedelta(12)
        except (TypeError, ValueError):
            # Bad date
            self.log('Cannot parse next issue date from:', next_issue_text)
            return

        self.log('pub-date:', pub_date)
        title = self.title + ' ' + datetime.strftime(pub_date, '%Y-%m-%d')
        self.conversion_options.update({
            'pubdate': datetime.strftime(pub_date, '%d %B %Y').lstrip('0'),
            'title': title,
            'title_sort': title,
        })

    def parse_index(self):
        '''
        Build the list of articles from the home page.

        Returns a single-feed index: [('Private Eye', [article, ...])] where
        each article is a dict with 'title' and 'url'. The About page is
        appended as the final article.

        Raises ValueError when no article links are found in any of the
        panels listed in index_attrs_to_include.
        '''
        soup = self.index_to_soup(self.current_issue)

        # Get publication date - Next issue on sale date - 12 days
        self._update_publication_metadata(soup)

        # Get pages from the various contents panels.
        # Duplicates will be eliminated automatically.
        articles = []
        seen_urls = set()
        for section_attrs in self.index_attrs_to_include:
            section = soup.find('div', attrs=section_attrs)
            if not section:
                continue

            for a in section.findAll('a', href=True):
                url = a.get('href')
                # Strip trailing spaces, newlines and the right-pointing
                # double angle quotation mark (U+00BB) used on "more" links.
                title = a.getText().rstrip(' \u00bb\n')
                if not title or title in self.titles_to_skip:
                    continue

                # Well-known sections get a fixed title.
                known_url = url.rsplit('/', 1)[-1]
                title = self.url_to_section_name.get(known_url, title)

                if not url.startswith('http'):
                    url = self.base_url + url
                if url in seen_urls:
                    continue
                self.log('title:', title, ', url:', url)
                seen_urls.add(url)
                articles.append({
                    'title': title,
                    'url': url,
                })

        if not articles:
            raise ValueError('Private-Eye Online index of pages not found')

        # Add the About page as a final article
        articles.append({
            'title': 'About Private Eye',
            'url': self.about_page,
        })

        self.log('parse_index:', articles)

        return [('Private Eye', articles)]

    def preprocess_html(self, soup):
        '''
        Tidy crossword markup in a downloaded page and return the soup.

        Links pointing at the crossword image are unwrapped (their content
        is kept) and any 'align' attribute is removed from the crossword
        image.
        '''
        crossword_path = re.compile(r'/pictures/crossword/')

        # Remove <a> tag link to crossword image
        for tag in soup.findAll('a', {'href': crossword_path}):
            self.log('Removing link to crossword image...')
            tag.unwrap()

        # Remove align tag in crossword image (so float right works)
        for tag in soup.findAll('img', {'src': crossword_path}):
            if 'align' in tag.attrs:
                self.log('Removing crossword image align attribute...')
                del tag.attrs['align']

        return soup

    # We remove vast swathes of HTML which is not part of the articles.
    # Remove sibling content
    remove_tags_before = [
        {'name': 'div', 'class': 'article'},
        {'name': 'div', 'id': 'page'},
        {'name': 'div', 'id': 'page-wide'},
        {'name': 'div', 'id': 'content'},
        # NOTE(review): the key below is ' attrs' (leading space), not
        # 'attrs', so this entry presumably never matches anything. It is
        # deliberately left untouched here: correcting the key would start
        # stripping content before (and, via the copy below, after) the shop
        # link, which could remove article text. Confirm the intent against
        # the live pages before changing or deleting it.
        {'name': 'a', ' attrs': {'href': 'https://shop.private-eye.co.uk'}},
    ]
    remove_tags_after = remove_tags_before.copy()
    remove_tags_after.append(
        {'name': 'div', 'id': 'about-covers'},
    )
    # Remove non-sibling content
    remove_tags = [
        {'name': 'div', 'attrs': {'id': 'top-bar'}},
        {'name': 'div', 'attrs': {'id': 'header-wide'}},
        {'name': 'div', 'attrs': {'id': 'footer-wide'}},
        {'name': 'div', 'attrs': {'id': 'follow-buttons'}},
        {'name': 'div', 'attrs': {'id': 'sidebar'}},
        {'name': 'div', 'attrs': {'id': 'sections-sidebar'}},
        {'name': 'div', 'attrs': {'id': 'nav-box-sections-mobile'}},
        {'name': 'div', 'attrs': {'id': 'nav-box-pages-mobile'}},
        {'name': 'div', 'attrs': {'id': 'about-covers'}},
        # Key corrected from ' attrs' (leading space) so the shop link is
        # actually matched and removed.
        {'name': 'a', 'attrs': {'href': 'https://shop.private-eye.co.uk'}},
        {'name': 'iframe'},
        {'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/lightbox/')}},
        {'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/news_ticker/')}},
        {'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/media-queries-')}},
    ]

    # Convert headers to h1, strapline to h4
    # NOTE(review): the HTML tags inside this pattern and its replacement
    # were missing from the copy of the file under review and have been
    # restored; confirm the span class names against the live site.
    preprocess_regexps = [
        (
            re.compile(
                r'<span class="headline(?:-new)?">(.*?)</span>\s*(?:<br>\s*)*(?:<span class="text">(.*?)</span>)?',
                re.DOTALL | re.IGNORECASE
            ),
            lambda match: '<h1>' + match[1] + '</h1>' + (('<h4>' + match[2] + '</h4>') if match[2] else '')
        ),
    ]

    # The following extra css is to tweak the formatting of various elements of various article pages.
    # Unfortunately, there are a variety of different pages styles, hence the extended tweak list.
    # Some of these mimic the actual layout.css which does not seem to make it across into the calibre
    # ebook without duplicating it as extra css.
    # However some is new css to tweak output when part of an ebook.
    #
    # NOTE(review): 'minimum-width' is not a CSS property (the property is
    # 'min-width') and 'none' is not a valid value for 'width'. These rules
    # are left byte-for-byte as they were because correcting them would
    # change the rendered layout; verify the output before fixing them.
    extra_css = ' \n '.join([
        '#content img {float: right; width: 45%; minimum-width:350px;}',
        '#content img.cartoon-left {float: left; margin-right: 15px; margin-bottom: 15px;}',
        '#content img.cartoon-right {float: none; margin-bottom: 15px;}',
        '#content img.strip {float: none; width: 100%;}',
        '#content img:first-child {float: none;}',
        '#content img.gnitty-right {float: none; width: 160px;}',
        '#content #story > div[align=right] > img:first-child {float: none; width: 15px;}',
        '#content #story > img:first-child {float: none; height: 100px; width: none; minimum-width: none;}',
        '#content #block-sections img {float: none; width: none;}',
        '#content #block-sections img.lookalike {float: none; width: 100%;}',
        '#content #block-sections img.photo-right {float: right; width: 25%; min-width:120px; margin-left: 15px;}',
        '#content #block-sections > p:last-child > img:first-child {float: none; width: 120px;}',
        '#content #block-sections > p:last-child > img:nth-child(2) {float: none; width: 120px;}',
        '#content #block-sections img.crossword {float: right; width: 40%; margin-left: 15px; min-width: 350px;}',
        '#content #article-caption-box {float: right; background: #222222; display: block; width: 40%; min-width: 250px; font-size: 90%; margin-left: 15px;}',
        '#content #article-caption-box img {float: none; width: 100%; max-width: none;}',
        '#caption-box {color: #ffffff; text-align: center; padding: 5px 20px 15px 20px;}',
        '#whatsapp {border-left: 5px #8aba60 solid; border-right: 5px #8aba60 solid; border-bottom: 5px #8aba60 solid; padding: 0 20px 20px 20px;}',
        '#whatsapp::after {clear:both;}',
        '#whatsapp .whatsapp-left, .whatsapp-right {margin: 0 0 20px 0; padding: 15px; border-radius: 10px;}',
        '#whatsapp .whatsapp-left, .whatsapp-right {font-family: Helvetica, Arial, "sans-serif"; font-weight: 300; font-size: 18px; line-height: 24px;}',
        '#whatsapp .whatsapp-left {text-align: left; margin-right: 30%; background-color: #eeeeee;}',
        '#whatsapp .whatsapp-right {text-align: right; margin-left: 30%; background-color: #dce5ae;}',
        '#whatsapp .whatsapp-left img, #whatsapp .whatsapp-right img {width: 35px; margin: 0 10px; vertical-align: middle;}',
        '#whatsapp .whatsapp-left img.emoji, #whatsapp .whatsapp-right img.emoji {max-width: 35px; margin: 0 5px; vertical-align: middle;}',
    ])