Merge branch 'patch-20' of https://github.com/Sophist-UK/calibre

2025-11-12 17:47:07 -05:00 · 2023-07-02 07:20:00 +05:30 · 2023-07-02 07:20:00 +05:30 · 050c9caa3d
commit 050c9caa3d
parent a3b42042aa 1def665183
1 changed files with 164 additions and 31 deletions
--- a/recipes/private_eye.recipe
+++ b/recipes/private_eye.recipe
@ -1,49 +1,182 @@
+'''
+Fetch Private Eye (Online Edition)
+'''
+
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from datetime import datetime, timedelta

+class PrivateEyeRecipe(BasicNewsRecipe):
+    ##
+    # Last Edited:  2023-07-01
+    #
+    # Remark:       Version 3.0
+    #               Rewrite (by Sophist-UK) to fit latest web pages, correctly identify pages to include
+    #               and improve formatting.
+    #               Edited to add:  inclusion of About page,
+    #                               identifying series number and publication date and setting metadata.
+    #

-class AdvancedUserRecipe1359406781(BasicNewsRecipe):
-    title = u'Private Eye'
-    publication_type = 'magazine'
+    title = u'Private Eye (Online Edition)'
    description = u'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop'
-    oldest_article = 13
-    max_articles_per_feed = 100
-    remove_empty_feeds = True
-    remove_javascript = True
-    no_stylesheets = True
-    ignore_duplicate_articles = {'title'}
+    publication_type = 'magazine'
    language = 'en_GB'
    encoding = 'utf-8'
-    __author__ = u'Martyn Pritchard'
-    __copyright__ = '2020, Martyn Pritchard <MPritchard2k9@gmail.com>'
+    oldest_article = 13
+    max_articles_per_feed = 100
+    remove_javascript = True
+    ignore_duplicate_articles = {'url'}
+
+    __author__ = u'Martyn Pritchard & Sophist-UK'
+    __copyright__ = '2020, Martyn Pritchard <MPritchard2k9@gmail.com> & Sophist-UK <sophist-uk@sodalis.co.uk>'
+
+    current_issue   = 'https://www.private-eye.co.uk/current-issue'
+    about_page      = 'https://www.private-eye.co.uk/about'
+    masthead_url    = 'https://www.private-eye.co.uk/grfx/logos/logo-new.png'
+    author = 'Private Eye'
+    series = title = 'Private Eye Online'
+    conversion_options = {
+        'authors':      author,
+        'author_sort':  author,
+        'series':       series,
+        'series_index': 0,
+        'title':        title,
+        'title_sort':   title,
+    }

    def get_cover_url(self):
-        cover_url = None
-        soup = self.index_to_soup('https://www.private-eye.co.uk')
-        for citem in soup.findAll('img'):
-            if citem['src'].endswith('big.jpg'):
-                return citem['src']
-        return cover_url
+        soup = self.index_to_soup(self.current_issue)

-    remove_tags_before = {'class': "article"}
-    remove_tags_after = {'class': "article"}
-    remove_tags = [dict(name='div', attrs={'id': 'sections-sidebar'})]
-    remove_tags = {'class': "sub-nav-bar"}
-    remove_tags = [dict(name='a', attrs={'class': 'twitter-share-button'})]
-    remove_tags = [dict(name='div', attrs={'id': 'nav-box-sections-mobile'})]
+        for img in soup.findAll('img'):
+            src = img['src']
+            if src.endswith('_big.jpg'):
+                file_name = src.rsplit('/',1)[1]
+                if file_name is None:
+                    file_name = src
+                try:
+                    self.conversion_options.update({'series_index': int(file_name[:-len('_big.jpg')])})
+                    self.log('series-index:', self.conversion_options['series_index'])
+                except (TypeError, ValueError):
+                    # wrong big image
+                    continue
+                return src
+        return None

+    def parse_index(self):
+        soup = self.index_to_soup(self.current_issue)
+
+        # Get publication date
+        sidebar = soup.find('div', attrs={'id': 'current-issue-sidebar'})
+        next_issue_text = sidebar.find('b').nextSibling.strip()
+        try:
+            day, month, year = next_issue_text.split(' ')
+            day = ''.join(c for c in day if c.isdigit())
+            pub_date = datetime.strptime(" ".join((day, month, year)), "%d %B %Y") - timedelta(12)
+            self.log('pub-date:', pub_date)
+            self.conversion_options.update({'pubdate': datetime.strftime(pub_date, "%d %B %Y").lstrip("0")})
+            title = self.title + " " + datetime.strftime(pub_date, "%Y-%m-%d")
+            self.conversion_options.update({'title': title})
+            self.conversion_options.update({'title_sort': title})
+        except (TypeError, ValueError):
+            # Bad date
+            self.log('Cannot parse next issue date from:', next_issue_date)
+
+        # Get pages first from the sub-menu, and then from the contents panel.
+        # Duplicates will be eliminated automatically.
+        articles = []
+        for menu_attrs in (
+            {'class': 'sub-nav-bar', 'id':'sub-nav-box'},
+            {'class': 'article', 'id': 'block-left'},
+        ):
+            menu = soup.find('div', attrs=menu_attrs)
+
+            if not menu:
+                continue
+
+            for a in menu.findAll('a', href=True):
+                title = a.getText().rstrip(' »\n')
+                if not title:
+                    continue
+                articles.append({
+                    'title': title,
+                    'url': a.get('href'),
+                })
+
+        if not articles:
+            abort_recipe_processing('Private-Eye Online index of pages not found');
+
+        # Add the About page as a final article
+        articles.append({
+            'title': 'About Private Eye',
+            'url': self.about_page,
+        })
+
+        self.log('parse_index:', articles)
+
+        return [('Private Eye', articles)]
+
+    # We remove vast swathes of HTML which is not part of the articles.
+    # Remove sibling content
+    remove_tags_before =  [
+        {'name': 'div', 'class': "article"},
+        {'name': 'div', 'id': "page"},
+        {'name': 'div', 'id': "page-wide"},
+        {'name': 'div', 'id': "content"},
+        {'name': 'a', '  attrs': {'href': 'https://shop.private-eye.co.uk'}},
+    ]
+    remove_tags_after = remove_tags_before.copy()
+    remove_tags_after.append(
+        {'name': 'div', 'id': 'about-covers'},
+    )
+    # Remove non-sibling content
+    remove_tags = [
+        {'name': 'div', 'attrs': {'id': 'top-bar'}},
+        {'name': 'div', 'attrs': {'id': 'header-wide'}},
+        {'name': 'div', 'attrs': {'id': 'footer-wide'}},
+        {'name': 'div', 'attrs': {'id': 'follow-buttons'}},
+        {'name': 'div', 'attrs': {'id': 'sidebar'}},
+        {'name': 'div', 'attrs': {'id': 'sections-sidebar'}},
+        {'name': 'div', 'attrs': {'id': 'nav-box-sections-mobile'}},
+        {'name': 'div', 'attrs': {'id': 'nav-box-pages-mobile'}},
+        {'name': 'div', 'attrs': {'id': 'about-covers'}},
+        {'name': 'a', '  attrs': {'href': 'https://shop.private-eye.co.uk'}},
+        {'name': 'iframe'},
+        {'name': 'link', 'attrs': {'href': re.compile('/javastyle/lightbox/')}},
+        {'name': 'link', 'attrs': {'href': re.compile('/javastyle/news_ticker/')}},
+        {'name': 'link', 'attrs': {'href': re.compile('/javastyle/media-queries-')}},
+    ]
+
+    # Convert headers to h1, strapline to h4
    preprocess_regexps = [
        (
            re.compile(
-                r'<a href="https://www.subscription.*?</a>',
+                r'<span class="headline(?:-new)?">(.*?)</span>\s*(?:<br>\s*)*(?:<span class="text">(.*?)</span>)?',
                re.DOTALL | re.IGNORECASE
-            ), lambda match: ''
-        ),
-        (
-            re.compile(
-                r'<a class="twitter-share-button.*?</a>', re.DOTALL | re.IGNORECASE
-            ), lambda match: ''
+            ),
+            lambda match: '<h1>' + match[1] + '</h1>' +
+                         (('<h4>' + match[2] + '</h4>') if match[2] else '')
        ),
    ]

-    feeds = [(u'http://bodybuilder3d.eu5.org/PrivateEyeStat.xml')]
+    # The following extra css is to tweak the formatting of various elements of various article pages.
+    # Unfortunately, there are a variety of different pages styles, hence the extended tweak list.
+    # Some of these mimic the actual layout.css which does not seem to make it across into the calibre
+    # ebook without duplicating it as extra css.
+    # However some is new css to tweak output when part of an ebook.
+    extra_css = ' \n '.join([
+        '#content img {float: right;}',
+        '#content img.cartoon-left {float: left;}',
+        '#content img.cartoon-right {float: right;}',
+        '#content img:first-child {float: none;}',
+        '#content #block-sections img {float: none;}',
+        '#content #block-sections img.crossword {float: none; width: 50%; margin-right: 20px;}',
+        '#article-caption-box {float: right; background: #222222; display: block; width: 40%; max-width: 40%;}',
+        '#caption-box {color: #ffffff; text-align: center; padding: 5px 20px 15px 20px;}',
+        '#whatsapp {border-left: 5px #8aba60 solid; border-right: 5px #8aba60 solid; border-bottom: 5px #8aba60 solid; padding: 0 20px 20px 20px;}',
+        '#whatsapp::after {clear:both;}',
+        '.whatsapp-left, .whatsapp-right {margin: 20px 0px 0px 0px; padding: 15px; border-radius: 10px;}',
+        '.whatsapp-left, .whatsapp-right {font-family: Helvetica, Arial, "sans-serif"; font-weight: 300; font-size: 18px; line-height: 24px;}',
+        '.whatsapp-left {text-align: left; margin-right: 30%; background-color: #eeeeee;}',
+        '.whatsapp-right {text-align: right; margin-left: 30%; background-color: #dce5ae;}',
+        '#whatsapp .whatsapp-left img, #whatsapp .whatsapp-right img {width: 35px; margin: 0 10px; vertical-align: middle;}',
+    ])