mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-30 23:00:21 -04:00
253 lines
9.2 KiB
Python
253 lines
9.2 KiB
Python
'''
Fetch The Oldie (Online Edition)
'''
|
||
|
||
import re
|
||
from datetime import datetime
|
||
|
||
from calibre.web.feeds.news import BasicNewsRecipe
|
||
|
||
|
||
class PrivateEyeRecipe(BasicNewsRecipe):
    '''
    Calibre recipe that builds an e-book from the online edition of The Oldie.

    The article list is assembled in parse_index() from the current-issue page,
    the pages linked from its categories menu, and a fixed set of "about" pages.
    get_cover_url() locates the cover image and derives the series index from
    the edition number embedded in the cover image URL.
    '''
    ##
    # Last Edited:  2023-08-07
    #
    # Remark:       Version 1.0 2023-08-07
    #                   Initial version

    title = u'The Oldie (Online Edition)'
    description = ('The Oldie has been dubbed ‘Private Eye for grown-ups’ and is read by intelligent people who are fed'
                   ' up with the formulaic nature of the celebrity-obsessed national press. The Oldie was cooked up in'
                   ' 1992 by Richard Ingrams (who previously co-founded Private Eye in 1961) as a free-thinking,'
                   ' funny magazine, a light-hearted alternative to a press obsessed with youth and celebrity.'
                   ' The editors claim that the Oldie is ageless and timeless, free of retirement advice, crammed'
                   ' with rejuvenating wit, intelligence and delight.')
    publication_type = 'magazine'
    language = 'en_GB'
    encoding = 'utf-8'
    oldest_article = 31
    max_articles_per_feed = 100
    remove_javascript = True
    ignore_duplicate_articles = {'url'}

    __author__ = u'Sophist-UK'
    __copyright__ = '2023, Sophist-UK <sophist-uk@sodalis.co.uk>'

    web_root = 'https://www.theoldie.co.uk'
    current_issue = web_root + '/magazine'
    about_pages = {
        'About Us': web_root + '/about-us',
        'Our History': web_root + '/about-us/history',
    }
    masthead_url = web_root + '/assets/images/theoldie_logo_22.png'
    name = 'Oldie Online'
    series = 'The ' + name
    # Year and month of the download, with a leading space so it can be
    # appended directly to the series name.
    now = datetime.now().strftime(' %Y-%m')
    # Deliberately overrides the plain title above with a dated one.
    title = series + now  # noqa: PIE794
    title_sort = name + now + ', The'
    conversion_options = {
        'authors': 'The Oldie',
        'author_sort': 'Oldie, The',
        'series': series,
        # Placeholder; replaced by get_cover_url() once the edition is known.
        'series_index': 0,
        'title': title,
        'title_sort': title_sort,
    }
    # Marker that precedes the edition number in the cover image URL.
    cover_suburl = '-front-cover-'

    # Convert relative URLS to absolute ones i.e. /cover to https://theoldie.co.uk/cover
    def abs_url(self, url):
        '''
        Return url as an absolute URL.

        Root-relative URLs ('/path') are prefixed with web_root.
        Protocol-relative URLs ('//host/path') are given the scheme of web_root
        rather than being glued onto web_root. Anything else is returned as is.
        '''
        if url.startswith('//'):
            # 'https://www...' -> 'https:'
            scheme = self.web_root.split('//', 1)[0]
            return scheme + url
        if url.startswith('/'):
            return self.web_root + url
        return url

    # Create a correctly formatted DICT entry for Calibre parse_index return
    def article_entry(self, title, url, author=None):
        '''
        Build one article dict with 'title' and 'url' keys.

        An 'author' key is added only when author is truthy.
        '''
        article = {
            'title': title,
            'url': url,
        }
        if author:
            article['author'] = author
        return article

    # Captures the edition number that follows cover_suburl in the cover URL.
    edition_re = re.compile(r'(?:' + re.escape(cover_suburl) + r')(\d+)-')

    # Identify the cover image and extract the edition# from the url
    def get_cover_url(self):
        '''
        Find the cover image on the current-issue page.

        The first <img> whose URL matches edition_re is used. The edition number
        captured from that URL is stored in conversion_options['series_index'].

        Returns the absolute URL of the cover image, or None when no image on
        the page matches.
        '''
        soup = self.index_to_soup(self.current_issue)

        # src=True skips <img> tags without a src attribute, which would
        # otherwise raise KeyError on the lookup below.
        for img in soup.findAll('img', src=True):
            src = self.abs_url(img['src'])
            edition = self.edition_re.search(src)
            if edition is None:
                continue
            try:
                self.conversion_options['series_index'] = int(edition.group(1))
            except ValueError:
                # Not usable as an edition number; try the next image.
                continue
            self.log('series-index:', self.conversion_options['series_index'])
            self.log('cover_url:', src)
            return src
        return None

    # oldie links/headings often contain the author (in one of various formats)
    # 1. Title. By author
    # 2. Title by author: subtitle
    # 3. Title: author: subtitle
    # Format 1 is handled by a plain string split; this pattern covers 2 and 3.
    title_author_re = re.compile(r'^(.*?)(?:(?: by )|(?:: ))(.*?): (.*?)$')

    # Separate author from title (where it is specified)
    def title_author(self, head):
        '''
        Split a heading into its title and author.

        Returns a (title, author) tuple. author is None when the heading does
        not match any of the recognised formats, in which case title is the
        heading unchanged. For the "title by author: subtitle" and
        "title: author: subtitle" formats the returned title is
        "title: subtitle".
        '''
        if '. By ' in head:
            # Split on the last occurrence so a title containing '. By ' survives.
            title, author = head.rsplit('. By ', 1)
            return title, author

        match = self.title_author_re.match(head)
        if match is None:
            return head, None

        title_1, author, title_2 = match.groups()
        return ': '.join((title_1, title_2)), author

    # Return the list of articles from blocks in the content of an index/listing page
    def parse_content(self, soup):
        '''
        Collect article entries from the listing blocks of an index page.

        Only div.listing-block elements inside div.content-wrapper are
        examined. Each block contributes at most one article: the first link
        that contains an <h3>, titled from that link's first <h3>.

        Returns a list of article dicts; empty when the page has no
        div.content-wrapper or no usable listing blocks.
        '''
        content_articles = []

        content = soup.find('div', class_='content-wrapper')
        if not content:
            return content_articles

        for block in content.findAll('div', class_='listing-block'):
            for link in block.findAll('a', href=True):
                heading = link.find('h3')
                if heading is None:
                    continue
                title, author = self.title_author(heading.getText())
                content_articles.append(self.article_entry(
                    title=title,
                    url=self.abs_url(link.get('href')),
                    author=author,
                ))
                # One article per listing block.
                break

        return content_articles

    # Return the articles reachable from the categories menu of the index page
    def _category_articles(self, soup):
        '''
        Collect articles from every page linked in the nav.categories menu.

        A linked page that yields listing-block articles contributes those
        articles; a page that yields none is added as a single article itself.
        Menu items without a link are skipped. Returns an empty list when the
        menu is absent.
        '''
        found = []

        categories = soup.find('nav', class_='categories')
        if categories is None:
            self.log('Categories menu not found on:', self.current_issue)
            return found

        for li in categories.findAll('li'):
            a = li.find('a', href=True)
            if a is None:
                continue
            href = self.abs_url(a.get('href'))
            self.log('Checking page for sub-index:', href)
            content = self.parse_content(self.index_to_soup(href))
            if content:
                self.log('Subpages found:', href, len(content))
                found.extend(content)
            else:
                title, author = self.title_author(a.getText())
                found.append(self.article_entry(
                    title=title,
                    url=href,
                    author=author,
                ))

        return found

    def parse_index(self):
        '''
        Build the list of sections and articles for this issue.

        Returns a list of (section title, list of article dicts) tuples: an
        'In this issue…' section followed by an 'About The Oldie' section.

        Raises ValueError when neither the index page nor the categories menu
        yields any article.
        '''
        # The set of pages to be used in the online edition are:
        # 1. The list of articles in the body of the magazine index page
        # 2. The contents / pages linked to by each of the links in the #categories menu
        # 3. The div.only-in-the-magazine contents in the magazine index page
        # 4. The about pages
        # Obviously repeated content is de-duplicated by Calibre

        self.log('masthead_url:', self.masthead_url)
        soup = self.index_to_soup(self.current_issue)

        # 1. The list of articles in the body of the magazine index page
        articles = self.parse_content(soup)

        # 2. The contents / pages linked to by each of the links in the #categories menu
        articles.extend(self._category_articles(soup))

        if not articles:
            raise ValueError('The Oldie Online index of pages not found')

        # 3. The div.only-in-the-magazine contents in the magazine index page
        articles.append({
            'title': 'In the full issue…',
            'url': self.current_issue,
        })

        pages = [('In this issue…', articles)]
        self.log('In this issue…', articles)

        # 4. The about pages
        abouts = [
            {'title': title, 'url': url}
            for title, url in self.about_pages.items()
        ]

        if abouts:
            pages.append(('About The Oldie', abouts))
            self.log('About The Oldie', abouts)

        return pages

    def preprocess_html(self, soup):
        '''
        Strip the author from every <h1> heading of a downloaded page.

        Each <h1> has its text replaced by the title part returned by
        title_author(). Returns the same soup object.
        '''
        for h in soup.findAll('h1'):
            title, _ = self.title_author(h.getText())
            self.log('Replacing h1 "', h.getText(), '" with "', title, '"')
            h.string = title

        return soup

    # Remove features not wanted and tweak HTML
    preprocess_regexps = [
        # Remove big blank spaces
        (
            re.compile(
                r'<p>\s*<br\/?>\s*</p>',
                re.DOTALL | re.IGNORECASE
            ),
            lambda match: ''
        ),
        # Local fix for paragraph HTML issues join paragraphs that do not end in a full-stop.
        (
            re.compile(
                r'(?<=[^\.\s])\s*</p>\s*<p>',
                re.DOTALL | re.IGNORECASE
            ),
            lambda match: ' '  # space
        ),
    ]

    # We remove vast swathes of HTML which is not part of the articles.
    remove_tags_before = [
        {'name': 'div', 'class': 'container'},
        {'name': 'div', 'class': 'content-wrapper'},
        {'name': 'div', 'class': 'only-in-the-magazine'},
    ]
    remove_tags_after = [
        {'name': 'div', 'class': 'container'},
        {'name': 'div', 'class': 'content-wrapper'},
        {'name': 'h2', 'string': 'Find out more about The Oldie'},
    ]
    # Remove non-sibling content
    remove_tags = [
        {'name': 'nav', 'class': 'categories'},
        {'name': 'div', 'class': 'internal-placeholders'},
        {'name': 'div', 'class': 'leaderboard'},
        {'name': 'div', 'class': 'share'},
        {'name': 'div', 'class': 'most-popular'},
        {'name': 'div', 'class': 'article-convert'},
        # {'name': 'p', 'class': "article-convert"},
        # {'name': 'p', 'class': "meta"},
        {'name': 'hr'},
        {'name': 'a', 'class': 'view-full-screen'},
        {'name': 'div', 'class': 'image-counter'},
        {'name': 'h2', 'string': 'Find out more about The Oldie'},
        {'name': 'a', 'href': re.compile(r'^https?:\/\/issuu.com\/')},
        {'name': 'img', 'src': re.compile(r'\/assets\/images\/icons\/icon-')},
    ]

    # The following extra css is to tweak the formatting of various elements of various article pages.
    extra_css = ' \n '.join([
        'div.image-captions div.caption {text-align: center; font-weight: bold; width:750px;}',
        'p.article-convert {text-align: center;}',
    ])
|