'''
Fetch The Oldie (Online Edition)
'''
import re
from datetime import datetime

from calibre.web.feeds.news import BasicNewsRecipe


# NOTE(review): the class name mentions Private Eye although this recipe fetches
# The Oldie. It is left unchanged so anything referring to it by name keeps working.
class PrivateEyeRecipe(BasicNewsRecipe):
    '''
    Calibre news recipe that builds an e-book from the online edition of The Oldie.

    The article list is assembled in parse_index() from the magazine index page,
    the pages linked from its categories menu and a fixed set of "about" pages.
    '''
    ##
    # Last Edited:  2023-08-07
    #
    # Remark:       Version 1.0 2023-08-07
    #                   Initial version

    title = u'The Oldie (Online Edition)'
    description = ('The Oldie has been dubbed ‘Private Eye for grown-ups’ and is read by intelligent people who are fed'
                   ' up with the formulaic nature of the celebrity-obsessed national press. The Oldie was cooked up in'
                   ' 1992 by Richard Ingrams (who previously co-founded Private Eye in 1961) as a free-thinking,'
                   ' funny magazine, a light-hearted alternative to a press obsessed with youth and celebrity.'
                   ' The editors claim that the Oldie is ageless and timeless, free of retirement advice, crammed'
                   ' with rejuvenating wit, intelligence and delight.')
    publication_type = 'magazine'
    language = 'en_GB'
    encoding = 'utf-8'
    oldest_article = 31
    max_articles_per_feed = 100
    remove_javascript = True
    ignore_duplicate_articles = {'url'}

    __author__ = u'Sophist-UK'
    # NOTE(review): the trailing space suggests something (possibly an e-mail address in
    # angle brackets) was lost from this string; it is reproduced as found.
    __copyright__ = '2023, Sophist-UK '

    web_root = 'https://www.theoldie.co.uk'
    current_issue = web_root + '/magazine'
    about_pages = {
        'About Us': web_root + '/about-us',
        'Our History': web_root + '/about-us/history',
    }
    masthead_url = web_root + '/assets/images/theoldie_logo_22.png'

    name = 'Oldie Online'
    series = 'The ' + name
    # Evaluated once, when the class body is executed, so title and title_sort agree.
    now = datetime.now().strftime(' %Y-%m')
    # Deliberately overrides the static title above with a dated one.
    title = series + now  # noqa: PIE794
    title_sort = name + now + ', The'
    conversion_options = {
        'authors': 'The Oldie',
        'author_sort': 'Oldie, The',
        'series': series,
        'series_index': 0,  # replaced by get_cover_url() when an edition number is found
        'title': title,
        'title_sort': title_sort,
    }
    cover_suburl = '-front-cover-'

    # Convert relative URLS to absolute ones i.e. /cover to https://theoldie.co.uk/cover
    def abs_url(self, url):
        '''Return url prefixed with web_root when it starts with "/", otherwise unchanged.'''
        return self.web_root + url if url.startswith('/') else url

    # Create a correctly formatted DICT entry for Calibre parse_index return
    def article_entry(self, title, url, author=None):
        '''Build an article dict with 'title' and 'url', plus 'author' when one is given.'''
        article = {
            'title': title,
            'url': url,
        }
        if author:
            article['author'] = author
        return article

    edition_re = re.compile(r'(?:-front-cover-)(\d+)-')

    # Identify the cover image and extract the edition# from the url
    def get_cover_url(self):
        '''
        Return the absolute URL of the first image on the current-issue page whose
        URL matches edition_re, or None when no image matches.

        As a side effect the edition number captured from that URL is stored as
        conversion_options['series_index'].
        '''
        soup = self.index_to_soup(self.current_issue)
        # src=True skips <img> tags that have no src attribute; indexing img['src']
        # on such a tag would raise KeyError.
        for img in soup.findAll('img', src=True):
            src = self.abs_url(img['src'])
            editions = self.edition_re.findall(src)
            if not editions:
                continue
            try:
                self.conversion_options.update({'series_index': int(editions[0])})
                self.log('series-index:', self.conversion_options['series_index'])
            except (TypeError, ValueError):
                # Not a usable edition number: keep looking at the remaining images.
                continue
            self.log('cover_url:', src)
            return src
        return None

    # oldie links/headings often contain the author (in one of various formats):
    # 1. Title. By author
    # 2. Title by author: subtitle
    # 3. Title: author: subtitle
    title_author_re = re.compile(r'^(.*?)(?:(?: by )|(?:: ))(.*?): (.*?)$')

    # Separate author from title (where it is specified)
    def title_author(self, head):
        '''
        Split a heading into (title, author).

        Format 1 is split on the last '. By '. Formats 2 and 3 are matched with
        title_author_re and the title is rebuilt as 'Title: subtitle'. When no
        format matches, (head, None) is returned.
        '''
        if '. By ' in head:
            # The membership test guarantees rsplit yields exactly two parts.
            title, author = head.rsplit('. By ', 1)
            return title, author
        matches = self.title_author_re.findall(head)
        if matches and len(matches[0]) == 3:
            title_1, author, title_2 = matches[0]
            title = ': '.join((title_1, title_2))
            return title, author
        return head, None

    # Return the list of articles from blocks in the content of an index/listing page
    def parse_content(self, soup):
        '''
        Return one article entry per div.listing-block inside div.content-wrapper.

        For each listing block the first link that contains an <h3> is used; the
        text of that link's first <h3> supplies the title and author. Returns an
        empty list when the page has no div.content-wrapper.
        '''
        content_articles = []
        content = soup.find('div', class_='content-wrapper')
        if content is None:
            return content_articles
        for article in content.findAll('div', class_='listing-block'):
            for a in article.findAll('a', href=True):
                heading = a.find('h3')
                if heading is None:
                    continue
                title, author = self.title_author(heading.getText())
                content_articles.append(self.article_entry(
                    title=title,
                    url=self.abs_url(a.get('href')),
                    author=author,
                ))
                # Only one article is taken from each listing block.
                break
        return content_articles

    def parse_index(self):
        '''
        Build the list of (section title, articles) tuples for this edition.

        Raises ValueError when neither the magazine index page nor the category
        pages yield any article.
        '''
        # The set of pages to be used in the online edition are:
        # 1. The list of articles in the body of the magazine index page
        # 2. The contents / pages linked to by each of the links in the #categories menu
        # 3. The div.only-in-the-magazine contents in the magazine index page
        # 4. The about pages
        # Obviously repeated content is de-duplicated by Calibre

        self.log('masthead_url:', self.masthead_url)

        soup = self.index_to_soup(self.current_issue)

        # 1. The list of articles in the body of the magazine index page
        articles = self.parse_content(soup)

        # 2. The contents / pages linked to by each of the links in the #categories menu
        categories = soup.find('nav', class_='categories')
        # A page without the categories menu simply contributes nothing here; the
        # emptiness check below still reports a completely unusable index page.
        category_items = categories.findAll('li') if categories is not None else []
        for li in category_items:
            a = li.find('a', href=True)
            if a is None:
                # Menu entry without a link: nothing to follow.
                continue
            href = self.abs_url(a.get('href'))
            self.log('Checking page for sub-index:', href)
            content = self.parse_content(self.index_to_soup(href))
            if content:
                self.log('Subpages found:', href, len(content))
                articles.extend(content)
            else:
                # No listing found on the page, so the page itself becomes the article.
                title, author = self.title_author(a.getText())
                articles.append(self.article_entry(
                    title=title,
                    url=href,
                    author=author,
                ))

        if not articles:
            raise ValueError('The Oldie Online index of pages not found')

        # 3. The div.only-in-the-magazine contents in the magazine index page
        articles.append({
            'title': 'In the full issue…',
            'url': self.current_issue,
        })

        pages = [('In this issue…', articles)]
        self.log('In this issue…', articles)

        # 4. The about pages
        abouts = [
            {'title': title, 'url': url}
            for title, url in self.about_pages.items()
        ]

        if abouts:
            pages.append(('About The Oldie', abouts))
            self.log('About The Oldie', abouts)

        return pages

    def preprocess_html(self, soup):
        '''Replace the text of every <h1> with its title part (author removed) and return soup.'''
        for h in soup.findAll('h1'):
            title, _author = self.title_author(h.getText())
            self.log('Replacing h1 "', h.getText(), '" with "', title, '"')
            h.string = title
        return soup

    # Remove features not wanted and tweak HTML
    #
    # NOTE(review): in the copy of this file under review both patterns below contained
    # raw line breaks inside single-quoted strings (a syntax error) and look as if
    # HTML tags (e.g. paragraph tags) were stripped from them. The visible pattern
    # text is kept, with the line breaks written as \n escapes, which match the same
    # text. Confirm the intended patterns against the upstream recipe.
    preprocess_regexps = [
        # Remove big blank spaces
        (
            re.compile(
                r'\n\n\s*\s*\n\n',
                re.DOTALL | re.IGNORECASE
            ),
            lambda match: ''
        ),
        # Local fix for paragraph HTML issues join paragraphs that do not end in a full-stop.
        (
            re.compile(
                r'(?<=[^\.\s])\s*\n\n\s*\n\n',
                re.DOTALL | re.IGNORECASE
            ),
            lambda match: ' '  # space
        ),
    ]

    # We remove vast swathes of HTML which is not part of the articles.
    remove_tags_before = [
        {'name': 'div', 'class': 'container'},
        {'name': 'div', 'class': 'content-wrapper'},
        {'name': 'div', 'class': 'only-in-the-magazine'},
    ]
    remove_tags_after = [
        {'name': 'div', 'class': 'container'},
        {'name': 'div', 'class': 'content-wrapper'},
        {'name': 'h2', 'string': 'Find out more about The Oldie'},
    ]
    # Remove non-sibling content
    remove_tags = [
        {'name': 'nav', 'class': 'categories'},
        {'name': 'div', 'class': 'internal-placeholders'},
        {'name': 'div', 'class': 'leaderboard'},
        {'name': 'div', 'class': 'share'},
        {'name': 'div', 'class': 'most-popular'},
        {'name': 'div', 'class': 'article-convert'},
        # {'name': 'p', 'class': "article-convert"},
        # {'name': 'p', 'class': "meta"},
        {'name': 'hr'},
        {'name': 'a', 'class': 'view-full-screen'},
        {'name': 'div', 'class': 'image-counter'},
        {'name': 'h2', 'string': 'Find out more about The Oldie'},
        {'name': 'a', 'href': re.compile(r'^https?:\/\/issuu.com\/')},
        {'name': 'img', 'src': re.compile(r'\/assets\/images\/icons\/icon-')},
    ]

    # The following extra css is to tweak the formatting of various elements of various article pages.
    extra_css = ' \n '.join([
        'div.image-captions div.caption {text-align: center; font-weight: bold; width:750px;}',
        'p.article-convert {text-align: center;}',
    ])