From 3baef4a41ebd8db91216b51cce835db7372a58dc Mon Sep 17 00:00:00 2001 From: Sophist <3001893+Sophist-UK@users.noreply.github.com> Date: Sat, 1 Jul 2023 16:12:26 +0100 Subject: [PATCH] Improve Private_Eye.Recipe Rewrite (by Sophist-UK) to fit latest web pages, correctly identify pages to include and improve formatting. Please run the existing recipe and this revised one and compare the contents between them. Also compare new output to web site to see how much closer it looks. --- recipes/private_eye.recipe | 128 +++++++++++++++++++++++++++++-------- 1 file changed, 103 insertions(+), 25 deletions(-) diff --git a/recipes/private_eye.recipe b/recipes/private_eye.recipe index bbce44f129..9cfcb77aa8 100644 --- a/recipes/private_eye.recipe +++ b/recipes/private_eye.recipe @@ -1,49 +1,127 @@ +''' +Fetch Private Eye (Online Edition) +''' + import re from calibre.web.feeds.news import BasicNewsRecipe +class PrivateEyeRecipe(BasicNewsRecipe): + ## + # Last Edited: 2023-07-01 + # + # Remark: Version 3.0 + # Rewrite (by Sophist-UK) to fix bugs, fit latest web pages, + # correctly identify pages to include and improve formatting. 
+ # -class AdvancedUserRecipe1359406781(BasicNewsRecipe): - title = u'Private Eye' - publication_type = 'magazine' + title = u'Private Eye (Online Edition)' description = u'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop' - oldest_article = 13 - max_articles_per_feed = 100 - remove_empty_feeds = True - remove_javascript = True - no_stylesheets = True - ignore_duplicate_articles = {'title'} + publication_type = 'magazine' language = 'en_GB' encoding = 'utf-8' - __author__ = u'Martyn Pritchard' - __copyright__ = '2020, Martyn Pritchard ' + oldest_article = 13 + max_articles_per_feed = 100 + remove_javascript = True + ignore_duplicate_articles = {'url'} + + __author__ = u'Martyn Pritchard & Sophist-UK' + __copyright__ = '2020, Martyn Pritchard & Sophist-UK ' + + current_issue = 'https://www.private-eye.co.uk/current-issue' + masthead_url = 'https://www.private-eye.co.uk/grfx/logos/logo-new.png' def get_cover_url(self): - cover_url = None - soup = self.index_to_soup('https://www.private-eye.co.uk') + soup = self.index_to_soup(self.current_issue) for citem in soup.findAll('img'): if citem['src'].endswith('big.jpg'): return citem['src'] - return cover_url + return None - remove_tags_before = {'class': "article"} - remove_tags_after = {'class': "article"} - remove_tags = [dict(name='div', attrs={'id': 'sections-sidebar'})] - remove_tags = {'class': "sub-nav-bar"} - remove_tags = [dict(name='a', attrs={'class': 'twitter-share-button'})] - remove_tags = [dict(name='div', attrs={'id': 'nav-box-sections-mobile'})] + def parse_index(self): + soup = self.index_to_soup(self.current_issue) + key = None + articles = [] + + # Get pages first from the sub-menu, and then from the contents panel. + # Duplicates will be eliminated automatically. 
+ for menu_attrs in ( + {'class': 'sub-nav-bar', 'id':'sub-nav-box'}, + {'class': 'article', 'id': 'block-left'}, + ): + menu = soup.find('div', attrs=menu_attrs) + + if not menu: + continue + + for a in menu.findAll('a', href=True): + title = a.getText().rstrip(' »\n') + if not title: + continue + articles.append({ + 'title': title, + 'url': a.get('href'), + }) + + if not articles: + abort_recipe_processing('Private-Eye Online index of pages not found'); + + index = [('Private Eye', articles)] + + self.log('parse_index', index) + + return index + + + remove_tags_before = remove_tags_after = [ + {'name': 'div', 'class': "article"}, + {'name': 'div', 'id': "page"}, + {'name': 'div', 'id': "page-wide"}, + {'name': 'div', 'id': "content"}, + ] + remove_tags = [ + {'name': 'div', 'attrs': {'id': 'top-bar'}}, + {'name': 'div', 'attrs': {'id': 'header-wide'}}, + {'name': 'div', 'attrs': {'id': 'footer-wide'}}, + {'name': 'div', 'attrs': {'id': 'follow-buttons'}}, + {'name': 'div', 'attrs': {'id': 'sidebar'}}, + {'name': 'div', 'attrs': {'id': 'sections-sidebar'}}, + {'name': 'div', 'attrs': {'id': 'nav-box-sections-mobile'}}, + {'name': 'iframe'}, + {'name': 'link', 'attrs': {'href': re.compile('/javastyle/lightbox/')}}, + {'name': 'link', 'attrs': {'href': re.compile('/javastyle/news_ticker/')}}, + {'name': 'link', 'attrs': {'href': re.compile('/javastyle/media-queries-')}}, + ] + + # Convert headers to h1, strapline to h4 preprocess_regexps = [ ( re.compile( - r'(.*?)', re.DOTALL | re.IGNORECASE - ), lambda match: '' + ), lambda match: '

' + match[0] + '

' ), ( re.compile( - r'