Tweaks to content, formatting and metadata

This commit is contained in:
Sophist 2023-07-01 21:46:25 +01:00 committed by GitHub
parent 3baef4a41e
commit 1def665183
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -4,14 +4,17 @@ Fetch Private Eye (Online Edition)
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime, timedelta
class PrivateEyeRecipe(BasicNewsRecipe): class PrivateEyeRecipe(BasicNewsRecipe):
## ##
# Last Edited: 2023-07-01 # Last Edited: 2023-07-01
# #
# Remark: Version 3.0 # Remark: Version 3.0
# Rewrite (by Sophist-UK) to fix bugs, fit latest web pages, # Rewrite (by Sophist-UK) to fit latest web pages, correctly identify pages to include
# correctly identify pages to include and improve formatting. # and improve formatting.
# Edited to add: inclusion of About page,
# identifying series number and publication date and setting metadata.
# #
title = u'Private Eye (Online Edition)' title = u'Private Eye (Online Edition)'
@ -27,24 +30,60 @@ class PrivateEyeRecipe(BasicNewsRecipe):
__author__ = u'Martyn Pritchard & Sophist-UK' __author__ = u'Martyn Pritchard & Sophist-UK'
__copyright__ = '2020, Martyn Pritchard <MPritchard2k9@gmail.com> & Sophist-UK <sophist-uk@sodalis.co.uk>' __copyright__ = '2020, Martyn Pritchard <MPritchard2k9@gmail.com> & Sophist-UK <sophist-uk@sodalis.co.uk>'
current_issue = 'https://www.private-eye.co.uk/current-issue' current_issue = 'https://www.private-eye.co.uk/current-issue'
masthead_url = 'https://www.private-eye.co.uk/grfx/logos/logo-new.png' about_page = 'https://www.private-eye.co.uk/about'
masthead_url = 'https://www.private-eye.co.uk/grfx/logos/logo-new.png'
author = 'Private Eye'
series = title = 'Private Eye Online'
conversion_options = {
'authors': author,
'author_sort': author,
'series': series,
'series_index': 0,
'title': title,
'title_sort': title,
}
def get_cover_url(self):
    """Return the URL of the current issue's cover image, or None.

    Every <img> on the current-issue page is examined. The cover is the
    first one whose source ends in ``_big.jpg`` and whose file name, minus
    that suffix, is an integer (for example ``1601_big.jpg``). That integer
    is stored as ``series_index`` in ``self.conversion_options`` and logged.

    Returns:
        The ``src`` of the matching image, or None when no image matches.
    """
    suffix = '_big.jpg'
    soup = self.index_to_soup(self.current_issue)

    for img in soup.findAll('img'):
        # An <img> without a src attribute must not abort the scan.
        src = img.get('src')
        if not src or not src.endswith(suffix):
            continue
        # [-1] also works when src contains no '/', where [1] would raise
        # IndexError.
        file_name = src.rsplit('/', 1)[-1]
        try:
            issue_number = int(file_name[:-len(suffix)])
        except ValueError:
            # Some other "big" image whose name is not a number.
            continue
        self.conversion_options.update({'series_index': issue_number})
        self.log('series-index:', issue_number)
        return src
    return None
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.current_issue) soup = self.index_to_soup(self.current_issue)
# Get publication date.
# The text following the first <b> in the current-issue sidebar is
# presumably the next issue's date (per the log message below); this
# issue's date is taken to be 12 days before it.
next_issue_text = None
try:
    sidebar = soup.find('div', attrs={'id': 'current-issue-sidebar'})
    next_issue_text = sidebar.find('b').nextSibling.strip()
    # split() rather than split(' ') so repeated whitespace is tolerated
    day, month, year = next_issue_text.split()
    # Keep only the digits of the day token (drops a suffix such as "th")
    day = ''.join(c for c in day if c.isdigit())
    pub_date = datetime.strptime(' '.join((day, month, year)), '%d %B %Y') - timedelta(12)
except (AttributeError, TypeError, ValueError):
    # Sidebar or <b> not found, or the text is not a "day month year" date.
    # The date only feeds metadata, so log it and carry on.
    self.log('Cannot parse next issue date from:', next_issue_text)
else:
    self.log('pub-date:', pub_date)
    title = self.title + ' ' + pub_date.strftime('%Y-%m-%d')
    self.conversion_options.update({
        'pubdate': pub_date.strftime('%d %B %Y').lstrip('0'),
        'title': title,
        'title_sort': title,
    })
# Get pages first from the sub-menu, and then from the contents panel. # Get pages first from the sub-menu, and then from the contents panel.
# Duplicates will be eliminated automatically. # Duplicates will be eliminated automatically.
articles = []
for menu_attrs in ( for menu_attrs in (
{'class': 'sub-nav-bar', 'id':'sub-nav-box'}, {'class': 'sub-nav-bar', 'id':'sub-nav-box'},
{'class': 'article', 'id': 'block-left'}, {'class': 'article', 'id': 'block-left'},
@ -66,19 +105,30 @@ class PrivateEyeRecipe(BasicNewsRecipe):
if not articles: if not articles:
abort_recipe_processing('Private-Eye Online index of pages not found'); abort_recipe_processing('Private-Eye Online index of pages not found');
index = [('Private Eye', articles)] # Add the About page as a final article
articles.append({
'title': 'About Private Eye',
'url': self.about_page,
})
self.log('parse_index', index) self.log('parse_index:', articles)
return index return [('Private Eye', articles)]
# We remove vast swathes of HTML which is not part of the articles.
remove_tags_before = remove_tags_after = [ # Remove sibling content
remove_tags_before = [
{'name': 'div', 'class': "article"}, {'name': 'div', 'class': "article"},
{'name': 'div', 'id': "page"}, {'name': 'div', 'id': "page"},
{'name': 'div', 'id': "page-wide"}, {'name': 'div', 'id': "page-wide"},
{'name': 'div', 'id': "content"}, {'name': 'div', 'id': "content"},
{'name': 'a', ' attrs': {'href': 'https://shop.private-eye.co.uk'}},
] ]
remove_tags_after = remove_tags_before.copy()
remove_tags_after.append(
{'name': 'div', 'id': 'about-covers'},
)
# Remove non-sibling content
remove_tags = [ remove_tags = [
{'name': 'div', 'attrs': {'id': 'top-bar'}}, {'name': 'div', 'attrs': {'id': 'top-bar'}},
{'name': 'div', 'attrs': {'id': 'header-wide'}}, {'name': 'div', 'attrs': {'id': 'header-wide'}},
@ -87,6 +137,9 @@ class PrivateEyeRecipe(BasicNewsRecipe):
{'name': 'div', 'attrs': {'id': 'sidebar'}}, {'name': 'div', 'attrs': {'id': 'sidebar'}},
{'name': 'div', 'attrs': {'id': 'sections-sidebar'}}, {'name': 'div', 'attrs': {'id': 'sections-sidebar'}},
{'name': 'div', 'attrs': {'id': 'nav-box-sections-mobile'}}, {'name': 'div', 'attrs': {'id': 'nav-box-sections-mobile'}},
{'name': 'div', 'attrs': {'id': 'nav-box-pages-mobile'}},
{'name': 'div', 'attrs': {'id': 'about-covers'}},
{'name': 'a', ' attrs': {'href': 'https://shop.private-eye.co.uk'}},
{'name': 'iframe'}, {'name': 'iframe'},
{'name': 'link', 'attrs': {'href': re.compile('/javastyle/lightbox/')}}, {'name': 'link', 'attrs': {'href': re.compile('/javastyle/lightbox/')}},
{'name': 'link', 'attrs': {'href': re.compile('/javastyle/news_ticker/')}}, {'name': 'link', 'attrs': {'href': re.compile('/javastyle/news_ticker/')}},
@ -97,24 +150,26 @@ class PrivateEyeRecipe(BasicNewsRecipe):
# Substitutions given as (compiled pattern, replacement function) pairs.
preprocess_regexps = [
    (
        # A headline span (class "headline" or "headline-new"), optionally
        # followed by line breaks and a <span class="text"> sub-heading.
        # The line-break part accepts <br>, <br/> and <br />.
        re.compile(
            r'<span class="headline(?:-new)?">(.*?)</span>\s*'
            r'(?:<br\s*/?>\s*)*'
            r'(?:<span class="text">(.*?)</span>)?',
            re.DOTALL | re.IGNORECASE
        ),
        # Headline becomes <h1>; a non-empty sub-heading becomes <h4>.
        lambda match: '<h1>' + match[1] + '</h1>' + (
            ('<h4>' + match[2] + '</h4>') if match[2] else ''
        )
    ),
]
# The following extra css tweaks the formatting of various elements of the article pages.
# Unfortunately, there are a variety of different page styles, hence the extended tweak list.
# Some of these rules mimic the site's actual layout.css, which does not seem to make it across
# into the calibre ebook unless it is duplicated here as extra css.
# Others are new css that tweaks the output when it is part of an ebook.
extra_css = ' \n '.join([ extra_css = ' \n '.join([
'#content img {float: right;}', '#content img {float: right;}',
'#content img.cartoon-left {float: left;}', '#content img.cartoon-left {float: left;}',
'#content img.cartoon-right {float: right;}', '#content img.cartoon-right {float: right;}',
'#content img:first-child {float: none;}', '#content img:first-child {float: none;}',
'#content #block-sections img {float: none;}', '#content #block-sections img {float: none;}',
'#content #block-sections img.crossword {float: none; width: 50%; margin-right: 20px;}',
'#article-caption-box {float: right; background: #222222; display: block; width: 40%; max-width: 40%;}', '#article-caption-box {float: right; background: #222222; display: block; width: 40%; max-width: 40%;}',
'#caption-box {color: #ffffff; text-align: center; padding: 5px 20px 15px 20px;}', '#caption-box {color: #ffffff; text-align: center; padding: 5px 20px 15px 20px;}',
'#whatsapp {border-left: 5px #8aba60 solid; border-right: 5px #8aba60 solid; border-bottom: 5px #8aba60 solid; padding: 0 20px 20px 20px;}', '#whatsapp {border-left: 5px #8aba60 solid; border-right: 5px #8aba60 solid; border-bottom: 5px #8aba60 solid; padding: 0 20px 20px 20px;}',