calibre/recipes/theoldie.recipe
2025-01-24 11:14:24 +01:00

253 lines
9.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

'''
Fetch The Oldie (Online Edition)
'''
import re
from datetime import datetime
from calibre.web.feeds.news import BasicNewsRecipe
class PrivateEyeRecipe(BasicNewsRecipe):
    '''
    Calibre news recipe that builds an e-book from The Oldie's web site.

    The book has two sections: "In this issue…" (articles listed on the magazine index page
    plus the pages reachable from its categories menu) and "About The Oldie" (static pages).

    NOTE(review): the class name looks like a leftover from a Private Eye recipe; it is kept
    unchanged so that nothing referring to it breaks.
    '''
    ##
    # Last Edited:  2023-08-07
    #
    # Remark:       Version 1.0 2023-08-07
    #                   Initial version

    title = u'The Oldie (Online Edition)'
    description = ('The Oldie has been dubbed Private Eye for grown-ups and is read by intelligent people who are fed'
                   ' up with the formulaic nature of the celebrity-obsessed national press. The Oldie was cooked up in'
                   ' 1992 by Richard Ingrams (who previously co-founded Private Eye in 1961) as a free-thinking,'
                   ' funny magazine, a light-hearted alternative to a press obsessed with youth and celebrity.'
                   ' The editors claim that the Oldie is ageless and timeless, free of retirement advice, crammed'
                   ' with rejuvenating wit, intelligence and delight.')
    publication_type = 'magazine'
    language = 'en_GB'
    encoding = 'utf-8'
    oldest_article = 31
    max_articles_per_feed = 100
    remove_javascript = True
    ignore_duplicate_articles = {'url'}

    __author__ = u'Sophist-UK'
    __copyright__ = '2023, Sophist-UK <sophist-uk@sodalis.co.uk>'

    # Site layout: everything is addressed relative to web_root.
    web_root = 'https://www.theoldie.co.uk'
    current_issue = web_root + '/magazine'
    about_pages = {
        'About Us': web_root + '/about-us',
        'Our History': web_root + '/about-us/history',
    }
    masthead_url = web_root + '/assets/images/theoldie_logo_22.png'

    # Book metadata. `now` is evaluated once, when the class body runs, so the title carries
    # the year and month of the download (e.g. "The Oldie Online 2023-08").
    name = 'Oldie Online'
    series = 'The ' + name
    now = datetime.now().strftime(' %Y-%m')
    title = series + now  # noqa: PIE794
    title_sort = name + now + ', The'
    conversion_options = {
        'authors': 'The Oldie',
        'author_sort': 'Oldie, The',
        'series': series,
        'series_index': 0,  # replaced by the edition number in get_cover_url() when one is found
        'title': title,
        'title_sort': title_sort,
    }
    cover_suburl = '-front-cover-'

    def abs_url(self, url):
        '''
        Convert a site-relative URL to an absolute one,
        i.e. /cover to https://www.theoldie.co.uk/cover

        URLs that do not start with "/" are returned unchanged.
        '''
        return self.web_root + url if url.startswith('/') else url

    def article_entry(self, title, url, author=None):
        '''
        Create a correctly formatted dict entry for the Calibre parse_index return.

        The 'author' key is only present when a truthy author is supplied.
        '''
        article = {
            'title': title,
            'url': url,
        }
        if author:
            article['author'] = author
        return article

    # Captures the edition number that follows "-front-cover-" in the cover image URL.
    edition_re = re.compile(r'(?:-front-cover-)(\d+)-')

    def get_cover_url(self):
        '''
        Identify the cover image and extract the edition# from its URL.

        Scans the <img> tags of the current-issue page for the first src matching edition_re,
        stores the edition number as conversion_options['series_index'] and returns the
        absolute image URL. Returns None when no image matches.
        '''
        soup = self.index_to_soup(self.current_issue)

        # src=True skips <img> tags that have no src attribute; indexing those with
        # img['src'] would raise KeyError and abort the whole download.
        for img in soup.findAll('img', src=True):
            src = self.abs_url(img['src'])
            editions = self.edition_re.findall(src)
            if not editions:
                continue
            try:
                series_index = int(editions[0])
            except (TypeError, ValueError):
                continue
            self.conversion_options['series_index'] = series_index
            self.log('series-index:', series_index)
            self.log('cover_url:', src)
            return src
        return None

    # oldie links/headings often contain the author (in one of various formats):
    # 1. Title. By author
    # 2. Title by author: subtitle
    # 3. Title: author: subtitle
    # Format 1 is handled with a plain string split; this pattern covers formats 2 and 3.
    title_author_re = re.compile(r'^(.*?)(?:(?: by )|(?:: ))(.*?): (.*?)$')

    def title_author(self, head):
        '''
        Separate author from title (where it is specified).

        Returns a (title, author) tuple:
          'Title. By author'          -> ('Title', 'author')
          'Title by author: subtitle' -> ('Title: subtitle', 'author')
          'Title: author: subtitle'   -> ('Title: subtitle', 'author')
          anything else               -> (head, None)
        '''
        if '. By ' in head:
            # Always hand back a tuple, like the other branches (rsplit yields a list).
            title, author = head.rsplit('. By ', 1)
            return title, author

        # The pattern is anchored at the start, so there is at most one match.
        match = self.title_author_re.search(head)
        if match:
            title_1, author, title_2 = match.groups()
            return ': '.join((title_1, title_2)), author

        return head, None

    def parse_content(self, soup):
        '''
        Return the list of articles from blocks in the content of an index/listing page.

        Within div.content-wrapper, each div.listing-block contributes at most one article:
        the first link (with an href) that contains an <h3>, titled from that first <h3>.
        An empty list is returned when the page has no div.content-wrapper.
        '''
        content_articles = []
        content = soup.find('div', class_='content-wrapper')
        if not content:
            return content_articles

        for block in content.findAll('div', class_='listing-block'):
            for a in block.findAll('a', href=True):
                heading = a.find('h3')
                if heading is None:
                    # Not the headline link (e.g. no heading inside) - try the next link
                    continue
                title, author = self.title_author(heading.getText())
                content_articles.append(self.article_entry(
                    title=title,
                    url=self.abs_url(a.get('href')),
                    author=author,
                ))
                # Only one article per listing block
                break
        return content_articles

    def parse_index(self):
        '''
        Build the book index as a list of (section title, list of article dicts) tuples.

        Raises ValueError when neither the magazine index page nor the categories menu
        yields any article.
        '''
        # The set of pages to be used in the online edition are:
        # 1. The list of articles in the body of the magazine index page
        # 2. The contents / pages linked to by each of the links in the #categories menu
        # 3. The div.only-in-the-magazine contents in the magazine index page
        # 4. The about pages
        # Obviously repeated content is de-duplicated by Calibre

        self.log('masthead_url:', self.masthead_url)

        soup = self.index_to_soup(self.current_issue)

        # 1. The list of articles in the body of the magazine index page
        articles = self.parse_content(soup)

        # 2. The contents / pages linked to by each of the links in the #categories menu
        categories = soup.find('nav', class_='categories')
        if categories is None:
            # A missing menu must not crash with AttributeError; carry on with what we have
            # and let the "no articles" check below decide whether that is fatal.
            self.log('Categories menu not found:', self.current_issue)
            menu_items = []
        else:
            menu_items = categories.findAll('li')

        for li in menu_items:
            a = li.find('a', href=True)
            if a is None:
                # Menu entry without a link - nothing to fetch
                continue
            href = self.abs_url(a.get('href'))
            self.log('Checking page for sub-index:', href)
            content = self.parse_content(self.index_to_soup(href))
            if content:
                self.log('Subpages found:', href, len(content))
                articles.extend(content)
            else:
                # Not a listing page, so treat the linked page itself as the article
                title, author = self.title_author(a.getText())
                articles.append(self.article_entry(
                    title=title,
                    url=href,
                    author=author,
                ))

        if not articles:
            raise ValueError('The Oldie Online index of pages not found')

        # 3. The div.only-in-the-magazine contents in the magazine index page
        articles.append(self.article_entry(
            title='In the full issue…',
            url=self.current_issue,
        ))

        pages = [('In this issue…', articles)]
        self.log('In this issue…', articles)

        # 4. The about pages
        abouts = [
            self.article_entry(title=title, url=url)
            for title, url in self.about_pages.items()
        ]
        if abouts:
            pages.append(('About The Oldie', abouts))
            self.log('About The Oldie', abouts)

        return pages

    def preprocess_html(self, soup):
        '''
        Replace the text of every <h1> with its title part only (author removed).

        Returns the same soup object, modified in place.
        '''
        for h in soup.findAll('h1'):
            title, _author = self.title_author(h.getText())
            self.log('Replacing h1 "', h.getText(), '" with "', title, '"')
            h.string = title
        return soup

    # Remove features not wanted and tweak HTML
    preprocess_regexps = [
        # Remove big blank spaces (paragraphs containing only a line break)
        (
            re.compile(
                r'<p>\s*<br\/?>\s*</p>',
                re.DOTALL | re.IGNORECASE
            ),
            lambda match: ''
        ),
        # Local fix for paragraph HTML issues: join paragraphs that do not end in a full-stop.
        (
            re.compile(
                r'(?<=[^\.\s])\s*</p>\s*<p>',
                re.DOTALL | re.IGNORECASE
            ),
            lambda match: ' '  # space
        ),
    ]

    # We remove vast swathes of HTML which is not part of the articles.
    remove_tags_before = [
        {'name': 'div', 'class': 'container'},
        {'name': 'div', 'class': 'content-wrapper'},
        {'name': 'div', 'class': 'only-in-the-magazine'},
    ]
    remove_tags_after = [
        {'name': 'div', 'class': 'container'},
        {'name': 'div', 'class': 'content-wrapper'},
        {'name': 'h2', 'string': 'Find out more about The Oldie'},
    ]
    # Remove non-sibling content
    remove_tags = [
        {'name': 'nav', 'class': 'categories'},
        {'name': 'div', 'class': 'internal-placeholders'},
        {'name': 'div', 'class': 'leaderboard'},
        {'name': 'div', 'class': 'share'},
        {'name': 'div', 'class': 'most-popular'},
        {'name': 'div', 'class': 'article-convert'},
        # {'name': 'p', 'class': "article-convert"},
        # {'name': 'p', 'class': "meta"},
        {'name': 'hr'},
        {'name': 'a', 'class': 'view-full-screen'},
        {'name': 'div', 'class': 'image-counter'},
        {'name': 'h2', 'string': 'Find out more about The Oldie'},
        {'name': 'a', 'href': re.compile(r'^https?:\/\/issuu.com\/')},
        {'name': 'img', 'src': re.compile(r'\/assets\/images\/icons\/icon-')},
    ]

    # The following extra css is to tweak the formatting of various elements of various article pages.
    extra_css = ' \n '.join([
        'div.image-captions div.caption {text-align: center; font-weight: bold; width:750px;}',
        'p.article-convert {text-align: center;}',
    ])