mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-25 07:48:55 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			245 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			245 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python
 | |
| # vim:fileencoding=utf-8
 | |
| from datetime import datetime
 | |
| 
 | |
| from calibre.web.feeds.news import BasicNewsRecipe
 | |
| 
 | |
| 
 | |
class NoArticles(Exception):
    '''Raised when the current edition, or any article in it, cannot be found.'''
 | |
| 
 | |
| 
 | |
class Fokus(BasicNewsRecipe):
    '''Recipe that downloads the current weekly edition of the Swedish magazine Fokus.

    The recipe looks up the most recent edition on the "vara-utgavor" overview page, uses the edition's cover image
    as the e-book cover, and groups the edition's articles into sections by their first listed category.
    '''

    title = 'Fokus'
    main_url = 'https://www.fokus.se'
    description = "The current week's edition of Swedish current-affairs magazine 'Fokus'"
    encoding = 'utf-8'
    __author__ = 'Henrik Holm (https://github.com/h-holm)'
    language = 'sv'
    ignore_duplicate_articles = {'title', 'url'}
    masthead_url = 'https://cdn.fokus.se/app/uploads/fokus/2022/05/12214931/fokus-logo.svg'
    no_stylesheets = True
    compress_news_images = True
    needs_subscription = 'optional'
    use_embedded_content = False
    remove_empty_feeds = True
    scale_news_images_to_device = True
    scale_news_images = (800, 600)
    delay = 3  # Avoid throttling by the server.

    # 1. Center and reduce the size of images and image captions.
    # 2. Make the lead text italic.
    # 3. Make the article metadata text gray and small.
    extra_css = '''
        img { display: block; margin: auto; width: 50%; height: auto; }
        div.calibre-nuked-tag-figure { font-size: small; text-align: center; }
        p.Single__lead, p.Longread__lead { font-style: italic; color:#202020; }
        p.article-metadata { color: gray; font-size:small; }
    '''

    keep_only_tags = [
        dict(name='h1', class_='Single__title'),                # Title of "Single" type articles.
        dict(name='h1', class_='Longread__title'),              # Title of "Longread" type articles.
        dict(name='p', class_='Single__lead'),                  # Lead text of "Single" type articles.
        dict(name='p', class_='Longread__lead'),                # Lead text of "Longread" type articles.
        dict(name='p', class_='article-metadata'),              # Dynamically created by the recipe.
        dict(name='figure', class_='Single__thumbnail'),        # Image of "Single" type articles.
        dict(name='figure', class_='Longread__thumbnail'),      # Image of "Longread" type articles.
        dict(name='div', class_='sesamy-protected-content'),    # Article body.
    ]

    def _no_articles(self, problem: str) -> NoArticles:
        '''Build a `NoArticles` error that starts with `problem` and ends with the shared troubleshooting hint.'''
        return NoArticles(
            f"{problem} Either the '{self.main_url}' server is experiencing issues, in which case you should try "
            'again later, or the website format has changed and the recipe needs updating.'
        )

    def extract_cover_url(self, a_tag) -> str | None:
        '''Given the <a> tag of the current edition, extract the URL of the highest-resolution cover image.

        Cover extraction is best-effort: any failure is logged and `None` is returned, so that a missing cover never
        prevents the articles from being downloaded.

        Args:
            a_tag (Tag | None): The <a> tag of the current edition, expected to wrap a <figure> with an <img>.

        Returns:
            The cover URL, which is also stored in `self.cover_url`, or `None` if no cover could be identified.
        '''
        figure_tag = a_tag.find('figure') if a_tag is not None else None
        img_tag = figure_tag.find('img') if figure_tag is not None else None
        if img_tag is None:
            self.log.error(
                'Failed to extract cover URL! Has the website format changed?\n'
                'No <img> tag found for the current edition.'
            )
            return None

        try:
            # The `srcset` attribute contains a comma-separated list of URLs and their resolutions.
            candidates = []
            for entry in img_tag['srcset'].split(', '):
                # Each entry should be a '<url> <width>w' pair, e.g., '... 578w'. An entry of any other shape fails
                # the unpacking with a ValueError, which is handled below.
                url, resolution = entry.split()
                # Remove the 'w' suffix and cast to an integer so that the widths compare numerically.
                candidates.append((url, int(resolution[:-1])))

            # Pick the URL of the widest image.
            self.cover_url = max(candidates, key=lambda candidate: candidate[1])[0]
        except (KeyError, ValueError) as exc:
            self.log.error(f'Failed to extract cover URL! Has the website format changed?\n{exc}')
            return None

        self.log(f"Identified cover URL: '{self.cover_url}'")
        return self.cover_url

    def _edition_links_by_week(self, a_tags, year: int) -> dict[int, str]:
        '''Map the first calendar week of each edition of `year` to the URL of that edition.

        Args:
            a_tags: The <a> tags of the editions; each one must have an `href` attribute.
            year (int): Only links whose URL contains this year are considered.

        Returns:
            A dict on a `{first_week_number: edition_url}` format. Links without a parsable week number are skipped.
        '''
        editions = {}
        for a_tag in a_tags:
            link = a_tag['href']
            if str(year) not in link:
                continue

            # The last path segment is the date part of the URL. Dropping everything up to the first hyphen removes
            # the year. Editions can cover multiple weeks, e.g., '1-2', '01-03' or '50-51-52', so only the first of
            # the remaining week numbers is kept in order to sort correctly.
            date_part = link.removesuffix('/').split('/')[-1]
            first_week = date_part.split('-', 1)[-1].split('-', 1)[0]
            try:
                editions[int(first_week)] = link
            except ValueError:
                # A single malformed link should not prevent the remaining editions from being considered.
                self.log.debug(f"Skipping edition link with an unparsable week number: '{link}'")
        return editions

    def get_current_edition_url(self) -> str:
        '''Return the URL of the current (weekly) edition of Fokus.

        As a side effect, `self.cover_url` is set to the cover image of that edition if one can be identified.

        Raises:
            NoArticles: If the overview page cannot be fetched or no edition link can be identified on it.
        '''
        current_year = datetime.now().year
        try:
            soup = self.index_to_soup(f'{self.main_url}/vara-utgavor')

            # Identify all <a> tags of class 'Issue' that have an href attribute.
            a_tags = soup.find_all('a', class_='Issue', href=True)

            # Prefer editions of the current year. Early in January, no edition of the new year may be listed yet, in
            # which case the most recent edition of the previous year is the current one.
            edition_links = {}
            for year in (current_year, current_year - 1):
                edition_links = self._edition_links_by_week(a_tags, year)
                if edition_links:
                    break
            if not edition_links:
                raise ValueError(f'no edition links found for {current_year} or {current_year - 1}')

            # The edition with the highest week number is the most recent one.
            current_edition_url = edition_links[max(edition_links)]
            self.log(f"Identified {len(edition_links)} editions, of which the most recent is '{current_edition_url}'.")
        except Exception as exc:
            self.log.error(f'Failed to identify the current edition URL: {exc}')
            raise self._no_articles('Could not find the URL of the current edition.') from exc

        # Now that we know the URL of the current edition, we can use it to identify the cover image. The cover image
        # URLs exist in the `srcset` attribute of the <img> child tag of the <figure> child tag of the <a> tag of the
        # current edition. This happens outside of the `try` block above since a missing cover is not fatal.
        self.extract_cover_url(soup.find('a', class_='Issue', href=current_edition_url))
        return current_edition_url

    def parse_article_blurb(self, article_blurb) -> dict[str, str] | None:
        '''Given a <article> tag of class 'Blurb', parse it into a dict.

        Args:
            article_blurb (Tag): An <article> tag hosting metadata and the URL of an article.

        Returns:
            A dict on a `{'url': str, 'title': str, 'date': str, 'category': str, 'description': str}` format, or
            `None` if the blurb has no link or no title.
        '''
        a_tag = article_blurb.find('a', href=True)
        if not a_tag:
            return None

        url = a_tag['href'].strip().rstrip('/')
        if url.startswith('/'):
            url = f'{self.main_url}{url}'

        # An article without a title cannot be listed, so such a blurb is skipped instead of failing the download.
        title_tag = a_tag.find('h2', {'class': 'Blurb__title'})
        title = self.tag_to_string(title_tag).strip() if title_tag else ''
        if not title:
            self.log.debug(f"Skipping blurb without a title: '{url}'")
            return None

        # The date is optional and defaults to an empty string.
        date = ''
        if date_tag := a_tag.find('time', {'class': 'Blurb__date'}):
            # Results in a Swedish date format, e.g., '23 MAJ 2025'.
            date = self.tag_to_string(date_tag).strip()
            # Add a newline before the date to make it more readable.
            date = f'\n{date}'

        # Assign the article to its first listed category as inferred from the first <li> tag of class
        # 'Blurb__category'. Default to 'Fokus' if no such tag is found.
        category = 'Fokus'
        if category_tag := a_tag.find('li', {'class': 'Blurb__category'}):
            category = self.tag_to_string(category_tag).strip()

        desc = ''
        if desc_tag := a_tag.find('div', {'class': 'Blurb__summary'}):
            desc = self.tag_to_string(desc_tag).strip()

        return {'url': url, 'title': title, 'date': date, 'category': category, 'description': desc}

    def get_article_blurbs(self, soup) -> dict[str, dict[str, str]]:
        '''Given a Fokus webpage `soup`, return a dict of unique article entries found on the page.

        The key of a given entry in the output dictionary is the article URL. The corresponding value is a dictionary
        on a `{'url': str, 'title': str, 'date': str, 'category': str, 'description': str}` format.

        Args:
            soup (BeautifulSoup): The `bs4.BeautifulSoup` soup of a Fokus webpage.

        Returns:
            dict[str, dict[str, str]]: A dict with article URLs as keys and 'article dicts' as values. The dict is
            empty if no article blurbs are found.
        '''

        def _log(article) -> None:
            '''Log a digestible summary of the input `article` blurb.'''
            log_message = f"\t{article['title']} : {article['url']}"
            if article.get('description'):
                log_message += f" : {article['description']}"
            self.log.debug(log_message)

        # Identify all <article> tags of class 'Blurb'.
        self.log('Identifying all articles...')
        try:
            article_blurbs = soup.find_all('article', {'class': 'Blurb'})
        except AttributeError:
            # `soup` is not a parsed page (e.g., `None`); treat that the same as a page without articles.
            article_blurbs = []

        if not article_blurbs:
            self.log.error('Failed to identify any article blurbs.')
            return {}

        # Keying on the URL deduplicates articles that are listed more than once on the page.
        parsed_blurbs = {}
        for article_blurb in article_blurbs:
            if article := self.parse_article_blurb(article_blurb):
                _log(article)
                parsed_blurbs[article['url']] = article

        return parsed_blurbs

    def convert_to_section_lists(
        self,
        articles: dict[str, dict[str, str]],
    ) -> dict[str, list[dict[str, str]]]:
        '''Convert the `articles` dict of dicts to a dict of lists; each list holds the articles of a given section.

        The input dicts are left untouched; the returned article dicts are copies without the 'category' key.

        Args:
            articles (dict[str, dict[str, str]]): A dict of article URLs and article dicts.

        Returns:
            dict[str, list[dict[str, str]]]: A dict on a `{section_title: list[article_dict]}` format.
        '''
        self.log(f'Assigning each of the {len(articles)} articles to a section...')
        section_to_articles = {}
        for article_dict in articles.values():
            section_title = article_dict['category']
            # The 'category' key is not needed in the final output, so it is left out of the copy.
            section_article = {key: value for key, value in article_dict.items() if key != 'category'}
            section_to_articles.setdefault(section_title, []).append(section_article)

        return section_to_articles

    def parse_index(self):
        '''Return the articles of the current edition as a list of `(section_title, list_of_article_dicts)` tuples.

        Raises:
            NoArticles: If the current edition or its articles cannot be identified.
        '''
        current_edition_url = self.get_current_edition_url()
        if not current_edition_url:
            raise self._no_articles('Could not find the URL of the current edition.')
        self.log(f'Current edition URL: {current_edition_url}')

        # Fetch the page of the current edition and identify all unique articles listed on it.
        edition_soup = self.index_to_soup(current_edition_url)
        articles = self.get_article_blurbs(edition_soup)
        if not articles:
            raise self._no_articles('Could not find any articles.')

        # Assign each identified article to a section based on its category.
        section_to_articles = self.convert_to_section_lists(articles)

        # Convert to tuples.
        return list(section_to_articles.items())
 |