diff --git a/recipes/private_eye.recipe b/recipes/private_eye.recipe index bbce44f129..66aaa81210 100644 --- a/recipes/private_eye.recipe +++ b/recipes/private_eye.recipe @@ -1,49 +1,182 @@ +''' +Fetch Private Eye (Online Edition) +''' + import re from calibre.web.feeds.news import BasicNewsRecipe +from datetime import datetime, timedelta +class PrivateEyeRecipe(BasicNewsRecipe): + ## + # Last Edited: 2023-07-01 + # + # Remark: Version 3.0 + # Rewrite (by Sophist-UK) to fit latest web pages, correctly identify pages to include + # and improve formatting. + # Edited to add: inclusion of About page, + # identifying series number and publication date and setting metadata. + # -class AdvancedUserRecipe1359406781(BasicNewsRecipe): - title = u'Private Eye' - publication_type = 'magazine' + title = u'Private Eye (Online Edition)' description = u'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop' - oldest_article = 13 - max_articles_per_feed = 100 - remove_empty_feeds = True - remove_javascript = True - no_stylesheets = True - ignore_duplicate_articles = {'title'} + publication_type = 'magazine' language = 'en_GB' encoding = 'utf-8' - __author__ = u'Martyn Pritchard' - __copyright__ = '2020, Martyn Pritchard ' + oldest_article = 13 + max_articles_per_feed = 100 + remove_javascript = True + ignore_duplicate_articles = {'url'} + + __author__ = u'Martyn Pritchard & Sophist-UK' + __copyright__ = '2020, Martyn Pritchard & Sophist-UK ' + + current_issue = 'https://www.private-eye.co.uk/current-issue' + about_page = 'https://www.private-eye.co.uk/about' + masthead_url = 'https://www.private-eye.co.uk/grfx/logos/logo-new.png' + author = 'Private Eye' + series = title = 'Private Eye Online' + conversion_options = { + 'authors': author, + 'author_sort': author, + 'series': series, + 'series_index': 0, + 'title': title, + 'title_sort': title, + } def get_cover_url(self): - cover_url = None - soup = 
self.index_to_soup('https://www.private-eye.co.uk') - for citem in soup.findAll('img'): - if citem['src'].endswith('big.jpg'): - return citem['src'] - return cover_url + soup = self.index_to_soup(self.current_issue) - remove_tags_before = {'class': "article"} - remove_tags_after = {'class': "article"} - remove_tags = [dict(name='div', attrs={'id': 'sections-sidebar'})] - remove_tags = {'class': "sub-nav-bar"} - remove_tags = [dict(name='a', attrs={'class': 'twitter-share-button'})] - remove_tags = [dict(name='div', attrs={'id': 'nav-box-sections-mobile'})] + for img in soup.findAll('img'): + src = img['src'] + if src.endswith('_big.jpg'): + file_name = src.rsplit('/',1)[1] + if file_name is None: + file_name = src + try: + self.conversion_options.update({'series_index': int(file_name[:-len('_big.jpg')])}) + self.log('series-index:', self.conversion_options['series_index']) + except (TypeError, ValueError): + # wrong big image + continue + return src + return None + def parse_index(self): + soup = self.index_to_soup(self.current_issue) + + # Get publication date + sidebar = soup.find('div', attrs={'id': 'current-issue-sidebar'}) + next_issue_text = sidebar.find('b').nextSibling.strip() + try: + day, month, year = next_issue_text.split(' ') + day = ''.join(c for c in day if c.isdigit()) + pub_date = datetime.strptime(" ".join((day, month, year)), "%d %B %Y") - timedelta(12) + self.log('pub-date:', pub_date) + self.conversion_options.update({'pubdate': datetime.strftime(pub_date, "%d %B %Y").lstrip("0")}) + title = self.title + " " + datetime.strftime(pub_date, "%Y-%m-%d") + self.conversion_options.update({'title': title}) + self.conversion_options.update({'title_sort': title}) + except (TypeError, ValueError): + # Bad date + self.log('Cannot parse next issue date from:', next_issue_text) + + # Get pages first from the sub-menu, and then from the contents panel. + # Duplicates will be eliminated automatically. 
+ articles = [] + for menu_attrs in ( + {'class': 'sub-nav-bar', 'id':'sub-nav-box'}, + {'class': 'article', 'id': 'block-left'}, + ): + menu = soup.find('div', attrs=menu_attrs) + + if not menu: + continue + + for a in menu.findAll('a', href=True): + title = a.getText().rstrip(' »\n') + if not title: + continue + articles.append({ + 'title': title, + 'url': a.get('href'), + }) + + if not articles: + self.abort_recipe_processing('Private-Eye Online index of pages not found') + + # Add the About page as a final article + articles.append({ + 'title': 'About Private Eye', + 'url': self.about_page, + }) + + self.log('parse_index:', articles) + + return [('Private Eye', articles)] + + # We remove vast swathes of HTML that are not part of the articles. + # Remove sibling content + remove_tags_before = [ + {'name': 'div', 'class': "article"}, + {'name': 'div', 'id': "page"}, + {'name': 'div', 'id': "page-wide"}, + {'name': 'div', 'id': "content"}, + {'name': 'a', ' attrs': {'href': 'https://shop.private-eye.co.uk'}}, + ] + remove_tags_after = remove_tags_before.copy() + remove_tags_after.append( + {'name': 'div', 'id': 'about-covers'}, + ) + # Remove non-sibling content + remove_tags = [ + {'name': 'div', 'attrs': {'id': 'top-bar'}}, + {'name': 'div', 'attrs': {'id': 'header-wide'}}, + {'name': 'div', 'attrs': {'id': 'footer-wide'}}, + {'name': 'div', 'attrs': {'id': 'follow-buttons'}}, + {'name': 'div', 'attrs': {'id': 'sidebar'}}, + {'name': 'div', 'attrs': {'id': 'sections-sidebar'}}, + {'name': 'div', 'attrs': {'id': 'nav-box-sections-mobile'}}, + {'name': 'div', 'attrs': {'id': 'nav-box-pages-mobile'}}, + {'name': 'div', 'attrs': {'id': 'about-covers'}}, + {'name': 'a', 'attrs': {'href': 'https://shop.private-eye.co.uk'}}, + {'name': 'iframe'}, + {'name': 'link', 'attrs': {'href': re.compile('/javastyle/lightbox/')}}, + {'name': 'link', 'attrs': {'href': re.compile('/javastyle/news_ticker/')}}, + {'name': 'link', 'attrs': {'href': 
re.compile('/javastyle/media-queries-')}}, + ] + + # Convert headers to h1, strapline to h4 preprocess_regexps = [ ( re.compile( - r'(.*?)\s*(?:
\s*)*(?:(.*?))?', re.DOTALL | re.IGNORECASE - ), lambda match: '' - ), - ( - re.compile( - r'