From eb4d2a590a404e46e42260dce4ccaabfe0bfb10a Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Tue, 9 Jan 2024 19:23:56 +0530 Subject: [PATCH] delete FT Print Edition --- recipes/financial_times_print_edition.recipe | 169 ------------------ .../icons/financial_times_print_edition.png | Bin 1917 -> 0 bytes 2 files changed, 169 deletions(-) delete mode 100644 recipes/financial_times_print_edition.recipe delete mode 100644 recipes/icons/financial_times_print_edition.png diff --git a/recipes/financial_times_print_edition.recipe b/recipes/financial_times_print_edition.recipe deleted file mode 100644 index 223edd4166..0000000000 --- a/recipes/financial_times_print_edition.recipe +++ /dev/null @@ -1,169 +0,0 @@ -import json -import re -from datetime import date -from calibre.web.feeds.news import BasicNewsRecipe, classes -from urllib.parse import quote - - -class ft(BasicNewsRecipe): - title = 'Financial Times - Print Edition' - language = 'en' - __author__ = "Kovid Goyal" - description = 'The Financial Times is one of the world’s leading news organisations, recognised internationally for its authority, integrity and accuracy.' - no_stylesheets = True - remove_javascript = True - remove_empty_feeds = True - ignore_duplicate_articles = {'url'} - remove_attributes = ['style', 'width', 'height'] - masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg' - extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}' - - # needs_subscription = 'optional' - # - # def get_browser(self, *args, **kw): - # br = super().get_browser(*args, **kw) - # if self.username and self.password: - # # ft.com uses a CAPTCHA on its login page so this sadly doesnt work - # br.open('https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com') - # br.select_form(id='email-form') - # br['email'] = self.username - # br.submit() - # br.select_form(id='login-form') - # br['password'] = self.password - # br.submit() - # return br - - def get_browser(self, *args, **kw): - br = super().get_browser(*args, **kw) - br.set_current_header('Referer', 'https://www.google.com/') - return br - - def get_cover_url(self): - from datetime import date - cover = 'http://img.kiosko.net/' + str( - date.today().year - ) + '/' + date.today().strftime('%m') + '/' + date.today( - ).strftime('%d') + '/uk/ft_uk.750.jpg' - br = BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False) - try: - br.open(cover) - except: - index = 'https://en.kiosko.net/uk/np/ft_uk.html' - soup = self.index_to_soup(index) - for image in soup.findAll('img', src=True): - if image['src'].endswith('750.jpg'): - return image['src'] - self.log("\nCover unavailable") - cover = None - return cover - - def parse_index(self): - soup = self.index_to_soup('https://www.ft.com/todaysnewspaper/uk') - # International edition: https://www.ft.com/todaysnewspaper/international - ans = self.ft_parse_index(soup) - if not ans: - is_sunday = date.today().weekday() == 6 - if is_sunday: - raise ValueError( - 'The Financial Times Newspaper is not published on Sundays.' - ) - else: - raise ValueError( - 'The Financial Times Newspaper is not published today.' - ) - return ans - - def ft_parse_index(self, soup): - feeds = [] - for section in soup.findAll(**classes('o-teaser-collection')): - h2 = section.find('h2') - secname = self.tag_to_string(h2) - self.log(secname) - articles = [] - for a in section.findAll( - 'a', href=True, **classes('js-teaser-heading-link') - ): - url = a['href'] - url = 'https://www.ft.com' + url - title = self.tag_to_string(a) - desc = '' - desc_parent = a.findParent('div') - div = desc_parent.find_previous_sibling( - 'div', **classes('o-teaser__meta') - ) - if div is not None: - desc = div.find('a', **classes('o-teaser__tag')) - desc = self.tag_to_string(desc) - prefix = div.find('span', **classes('o-teaser__tag-prefix')) - if prefix is not None: - prefix = self.tag_to_string(prefix) - desc = prefix + ' ' + desc - articles.append({ - 'title': title, - 'url': url, - 'description': desc - }) - self.log('\t', desc) - self.log('\t', title) - self.log('\t\t', url) - if articles: - feeds.append((secname, articles)) - return feeds - - def preprocess_raw_html(self, raw, *a): - # with open('/t/raw.html', 'w') as f: - # f.write(raw) - m = re.search( - r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw - ) - raw = raw[m.start():] - raw = raw.split('>', 1)[1] - # with open('/t/raw.json', 'w') as f: - # f.write(raw) - data = json.JSONDecoder().raw_decode(raw)[0] - title = data['headline'] - body = data['articleBody'] - body = body.replace('\n\n', '

') - - author = '' - if 'author' in data: - try: - author = data['author']['name'] - except TypeError: - author = ' and '.join(x['name'] for x in data['author']) - - image = desc = title_image_url = '' - - def resize_img(img): - a = 'https://www.ft.com/__origami/service/image/v2/images/raw/' - b = quote(img, safe='') - c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400' - # use width = 200, 300, 400,.. 700... - return a + b + c - - if data.get('image'): - image_url = data['image']['url'] - if body.__contains__(image_url) is False: - title_image_url = resize_img(image_url) - image = '

'.format(title_image_url) - # embedded image links - - def insert_image(m): - url = m.group()[1:-1] - if url.__contains__('studio') is False: - url = resize_img(url) - return '

'.format(url) - - body = re.sub(r'\[https://\S+?\]', insert_image, body) - - if data.get('description'): - desc = '

' + data['description'] + '

' - html = '

' + title + '

' + desc + '

' + author + '

' + image + '

' + body - return html - - def preprocess_html(self, soup): - for span in soup.findAll('span'): - p = span.findParent('p') - if p: - p['id'] = 'fig-cap' - return soup diff --git a/recipes/icons/financial_times_print_edition.png b/recipes/icons/financial_times_print_edition.png deleted file mode 100644 index 7681f5bcfcabb549b67dab329df050798f14b99f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1917 zcmV-@2ZH#CP)Nkl;;#k{w*yTi+s+4al;_=KQjCY_x@J1*jfdzg@(gW@ zJlBXZ_|Qd)n)KMza11W1gSTnP2rmUGsB`~WjQ#75mloh*MTP)CpAw!+B zrW`QocG{Z6s;C^uWF@=}YPc)5;W*(sWj@Z|!|` z>+AIgn<*vDk|7TJ#?`fmw9fAAO@46X#G7yZ`0xL`aqYwFD=SB>_aUT^Vs6Uqoz11C zrQp0~G_xs5c>T3kUOfFm(=-gN^l024jE3U_y=r*#_MP9q|G}mA-s=rV)#$*|p+l!% zI#Uft->g5ldi82G8sEP2pU-cs3DL>ZFC01k!`pY)|9tuK<*OgoHXJ{B>hQ7Stq<3( ze>9oaWxp!>J-xAh|G>e;sEWnK#bUUycI);>pL_~YD@TqWJ^XxWbUweZw0x*PocqK3 z9}r25;pKB@R##US7MECMH~#hI#~*(jVmfvD^!Xp1=M=}I{@1r}_WS)1T&9^978X=l z)^#(THOx9hshci>x)4p#b3XW;?W!8AuCC6F=j*!GirzT?`k|wTAKbs|Y+Ym=LdZ=< zNYmXN@9nuWXHK0sTJ_9yXA8-*ND^zyvLqrKyeUi0GGYx<#5yD)Rz8H3(uor%2L1A- zlgC~-+AYqb7xOo`OBXMz2d~NWZ|}Lb)Jh2 zla1}sXru;x%x2xrPL}5>Bx^lsCu|h)zT}~ZNM%``BtTL0o}mjA{Ylft7>kTk*HCnX zWQ^I}o0k1iR`zGL^?OqgDS{$l#Y&QrWJqG$rnc@I%^^tF4f{P3v~{C711llsrm={z zs;Yj!7kw+y5z=h3of{Klw9W~EGfhMQX|1<6w?Hypm@j&zw{517Y2LVh_rP$V75T0U zt_vZ=7y)I99@)SB)>H#q+kdRBUA}VduUD?^)$QSxm5uwG=UzGc)73Z6zxwjr(gF}= zcW1J zK(LIIYtD*7!?x|Lf!3xdjn>*^o$c=3yT81=OfeLhF8VnT#dZh*T~m&S(m9f86hv=D zB+rb7MB7R9xmHC{fO=>f@gClieD)qP4O5`)Hn#V)$>$C}hq7!pH)D*9m?_RENs1&9 zAc?Va))YNe4G_eI7=@j4MNxR~B|1egEKFd{P@3ekcasY~B+jy|>vlq!~%EOC+DX3jhQN z3y}Z-3xOU6Um}1+@G#aM>3j3~A^mhA(jyw*ig0@PT|5)7fkz?zoe$wD3DJI>JnHxi zyxzC@f7!=1pogsq&%|p1%ag74m7e1G(NchX>-?U)7BFOgwI^5kZNE>7)4ua(;dLT@ zv;s3F03`ffS~w9vEVPe41J6PbdP4k9C3;f)to(lg-}OJ+O?Q;?00000NkvXXu0mjf DanH2w