'''
Fetch Private Eye (Online Edition)
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime, timedelta


class PrivateEyeRecipe(BasicNewsRecipe):
    ##
    # Last Edited:  2023-07-14
    #
    # Remark:   Version 3.1 2023-07-14
    #               Show crossword on right so clues are continuous down left
    #               Link to crossword image removed
    #               Improve many image layouts
    #           Version 3.0 2023-07-01
    #               Rewrite (by Sophist-UK) to fit latest web pages, correctly identify pages to include
    #               and improve formatting.
    #               Edited to add: inclusion of About page,
    #               identifying series number and publication date and setting metadata.
    #

    title = u'Private Eye (Online Edition)'
    description = u'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop'
    publication_type = 'magazine'
    language = 'en_GB'
    encoding = 'utf-8'
    oldest_article = 13
    max_articles_per_feed = 100
    remove_javascript = True
    ignore_duplicate_articles = {'url'}

    __author__ = u'Martyn Pritchard & Sophist-UK'
    __copyright__ = '2020, Martyn Pritchard & Sophist-UK '

    current_issue = 'https://www.private-eye.co.uk/current-issue'
    about_page = 'https://www.private-eye.co.uk/about'
    masthead_url = 'https://www.private-eye.co.uk/grfx/logos/logo-new.png'
    author = 'Private Eye'
    # NOTE(review): this rebinds ``title``, so the value assigned at the top
    # of the class is overridden by 'Private Eye Online'.
    series = title = 'Private Eye Online'

    # Default metadata; 'series_index', 'title', 'title_sort' and 'pubdate'
    # are refined at download time by get_cover_url() and parse_index().
    conversion_options = {
        'authors': author,
        'author_sort': author,
        'series': series,
        'series_index': 0,
        'title': title,
        'title_sort': title,
    }

    def get_cover_url(self):
        """Return the URL of the current issue's cover image, or None.

        Scans the current-issue page for an image whose ``src`` ends in
        ``_big.jpg`` and whose file name stem is an integer. That integer is
        stored as ``series_index`` in ``conversion_options``.
        """
        suffix = '_big.jpg'
        soup = self.index_to_soup(self.current_issue)
        # ``src=True`` skips <img> tags that have no src attribute at all.
        for img in soup.findAll('img', src=True):
            src = img['src']
            if not src.endswith(suffix):
                continue
            # [-1] also copes with a src that contains no '/' at all.
            file_name = src.rsplit('/', 1)[-1]
            try:
                issue_number = int(file_name[:-len(suffix)])
            except ValueError:
                # wrong big image
                continue
            self.conversion_options.update({'series_index': issue_number})
            self.log('series-index:', self.conversion_options['series_index'])
            return src
        return None

    def _set_publication_metadata(self, soup):
        """Derive the publication date from the sidebar and store it as metadata.

        Best effort only: if the sidebar, its <b> tag or the date text is
        missing or malformed, the problem is logged and the default metadata
        is left untouched.
        """
        next_issue_text = ''
        try:
            sidebar = soup.find('div', attrs={'id': 'current-issue-sidebar'})
            next_issue_text = sidebar.find('b').nextSibling.strip()
            day, month, year = next_issue_text.split()
            # Drop any ordinal suffix ("14th" -> "14") before parsing.
            day = ''.join(c for c in day if c.isdigit())
            pub_date = datetime.strptime(" ".join((day, month, year)), "%d %B %Y") - timedelta(days=12)
        except (AttributeError, TypeError, ValueError):
            # Bad or missing date
            self.log('Cannot parse next issue date from:', next_issue_text)
            return

        self.log('pub-date:', pub_date)
        issue_title = self.title + " " + datetime.strftime(pub_date, "%Y-%m-%d")
        self.conversion_options.update({
            'pubdate': datetime.strftime(pub_date, "%d %B %Y").lstrip("0"),
            'title': issue_title,
            'title_sort': issue_title,
        })

    def parse_index(self):
        """Build the article list for the current issue.

        Returns a single-feed list ``[('Private Eye', articles)]`` where each
        article is a dict with ``title`` and ``url``. The About page is
        appended as the final article.

        Raises:
            ValueError: if no article links are found on the index page.
        """
        soup = self.index_to_soup(self.current_issue)

        # Get publication date
        self._set_publication_metadata(soup)

        # Get pages first from the sub-menu, and then from the contents panel.
        # Duplicates will be eliminated automatically.
        articles = []
        for menu_attrs in (
            {'class': 'sub-nav-bar', 'id': 'sub-nav-box'},
            {'class': 'article', 'id': 'block-left'},
        ):
            menu = soup.find('div', attrs=menu_attrs)
            if not menu:
                continue

            for a in menu.findAll('a', href=True):
                # Strip the trailing right-pointing double angle quote
                # (U+00BB) used as a "more" marker; written as an escape so
                # it survives re-encoding of this file.
                link_title = a.getText().rstrip(' \u00bb\n')
                if not link_title:
                    continue
                articles.append({
                    'title': link_title,
                    'url': a.get('href'),
                })

        if not articles:
            raise ValueError('Private-Eye Online index of pages not found')

        # Add the About page as a final article
        articles.append({
            'title': 'About Private Eye',
            'url': self.about_page,
        })

        self.log('parse_index:', articles)
        return [('Private Eye', articles)]

    def preprocess_html(self, soup):
        """Tidy crossword markup in ``soup`` and return it."""
        crossword = re.compile(r'/pictures/crossword/')

        # Remove <a> tag link to crossword image
        for tag in soup.findAll('a', {'href': crossword}):
            self.log("Removing link to crossword image...")
            tag.unwrap()

        # Remove align tag in crossword image (so float right works)
        for tag in soup.findAll('img', {'src': crossword}):
            if "align" in tag.attrs:
                self.log("Removing crossword image align attribute...")
                del tag.attrs['align']

        return soup

    # We remove vast swathes of HTML which is not part of the articles.

    # Remove sibling content
    # NOTE(review): the last entry uses the key ' attrs' (leading space), so
    # it is not a real ``attrs`` filter and presumably never matches. It is
    # left unchanged here because activating it would change which content is
    # trimmed before/after the article - confirm intent before correcting.
    remove_tags_before = [
        {'name': 'div', 'class': "article"},
        {'name': 'div', 'id': "page"},
        {'name': 'div', 'id': "page-wide"},
        {'name': 'div', 'id': "content"},
        {'name': 'a', ' attrs': {'href': 'https://shop.private-eye.co.uk'}},
    ]
    remove_tags_after = remove_tags_before.copy()
    remove_tags_after.append(
        {'name': 'div', 'id': 'about-covers'},
    )

    # Remove non-sibling content
    remove_tags = [
        {'name': 'div', 'attrs': {'id': 'top-bar'}},
        {'name': 'div', 'attrs': {'id': 'header-wide'}},
        {'name': 'div', 'attrs': {'id': 'footer-wide'}},
        {'name': 'div', 'attrs': {'id': 'follow-buttons'}},
        {'name': 'div', 'attrs': {'id': 'sidebar'}},
        {'name': 'div', 'attrs': {'id': 'sections-sidebar'}},
        {'name': 'div', 'attrs': {'id': 'nav-box-sections-mobile'}},
        {'name': 'div', 'attrs': {'id': 'nav-box-pages-mobile'}},
        {'name': 'div', 'attrs': {'id': 'about-covers'}},
        # Key corrected from ' attrs' (leading space) so links to the shop
        # are actually removed.
        {'name': 'a', 'attrs': {'href': 'https://shop.private-eye.co.uk'}},
        {'name': 'iframe'},
        {'name': 'link', 'attrs': {'href': re.compile('/javastyle/lightbox/')}},
        {'name': 'link', 'attrs': {'href': re.compile('/javastyle/news_ticker/')}},
        {'name': 'link', 'attrs': {'href': re.compile('/javastyle/media-queries-')}},
    ]

    # Convert headers to h1, strapline to h4
    # NOTE(review): the span class names in this pattern ("headline",
    # "headline-new", "text") are an assumption - confirm them against the
    # live site markup / upstream recipe.
    preprocess_regexps = [
        (
            re.compile(
                r'<span class="headline(?:-new)?">(.*?)</span>\s*(?:<br>\s*)*(?:<span class="text">(.*?)</span>)?',
                re.DOTALL | re.IGNORECASE
            ),
            lambda match: '<h1>' + match[1] + '</h1>' +
            (('<h4>' + match[2] + '</h4>') if match[2] else '')
        ),
    ]

    # The following extra css is to tweak the formatting of various elements of various article pages.
    # Unfortunately, there are a variety of different pages styles, hence the extended tweak list.
    # Some of these mimic the actual layout.css which does not seem to make it across into the calibre
    # ebook without duplicating it as extra css.
    # However some is new css to tweak output when part of an ebook.
    #
    # NOTE(review): 'minimum-width' is not a standard CSS property (the
    # standard name is 'min-width') and 'none' is not a valid 'width' value,
    # so those declarations are presumably ignored by renderers. They are left
    # as-is: turning the 350px minimum on would affect every #content image.
    extra_css = ' \n '.join([
        '#content img {float: right; width: 45%; minimum-width:350px;}',
        '#content img.cartoon-left {float: left; margin-right: 15px; margin-bottom: 15px;}',
        '#content img.cartoon-right {float: none; margin-bottom: 15px;}',
        '#content img.strip {float: none; width: 100%;}',
        '#content img:first-child {float: none;}',
        '#content img.gnitty-right {float: none; width: 160px;}',
        '#content #story > div[align=right] > img:first-child {float: none; width: 15px;}',
        '#content #story > img:first-child {float: none; height: 100px; width: none; minimum-width: none;}',
        '#content #block-sections img {float: none; width: none;}',
        '#content #block-sections img.lookalike {float: none; width: 100%;}',
        '#content #block-sections img.photo-right {float: right; width: 25%; min-width:120px; margin-left: 15px;}',
        '#content #block-sections > p:last-child > img:first-child {float: none; width: 120px;}',
        '#content #block-sections > p:last-child > img:nth-child(2) {float: none; width: 120px;}',
        '#content #block-sections img.crossword {float: right; width: 40%; margin-left: 15px; min-width: 350px;}',
        '#content #article-caption-box {float: right; background: #222222; display: block; width: 40%; min-width: 250px; font-size: 90%; margin-left: 15px;}',
        '#content #article-caption-box img {float: none; width: 100%; max-width: none;}',
        '#caption-box {color: #ffffff; text-align: center; padding: 5px 20px 15px 20px;}',
        '#whatsapp {border-left: 5px #8aba60 solid; border-right: 5px #8aba60 solid; border-bottom: 5px #8aba60 solid; padding: 0 20px 20px 20px;}',
        '#whatsapp::after {clear:both;}',
        '#whatsapp .whatsapp-left, .whatsapp-right {margin: 0 0 20px 0; padding: 15px; border-radius: 10px;}',
        '#whatsapp .whatsapp-left, .whatsapp-right {font-family: Helvetica, Arial, "sans-serif"; font-weight: 300; font-size: 18px; line-height: 24px;}',
        '#whatsapp .whatsapp-left {text-align: left; margin-right: 30%; background-color: #eeeeee;}',
        '#whatsapp .whatsapp-right {text-align: right; margin-left: 30%; background-color: #dce5ae;}',
        '#whatsapp .whatsapp-left img, #whatsapp .whatsapp-right img {width: 35px; margin: 0 10px; vertical-align: middle;}',
        '#whatsapp .whatsapp-left img.emoji, #whatsapp .whatsapp-right img.emoji {max-width: 35px; margin: 0 5px; vertical-align: middle;}',
    ])