From 16e73eed0ace31d2a57e0fa560ec923a8f417dbe Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 13 Oct 2024 10:52:41 +0530 Subject: [PATCH] more natgeo recipes --- recipes/icons/natgeo_kids.png | Bin 0 -> 8365 bytes recipes/icons/natgeo_traveller.png | Bin 0 -> 176 bytes recipes/natgeo_kids.recipe | 95 ++++++++++++++++++++++ recipes/natgeo_traveller.recipe | 103 ++++++++++++++++++++++++ src/calibre/web/site_parsers/natgeo.py | 57 ++++++++++--- src/calibre/web/site_parsers/nytimes.py | 3 +- 6 files changed, 244 insertions(+), 14 deletions(-) create mode 100644 recipes/icons/natgeo_kids.png create mode 100644 recipes/icons/natgeo_traveller.png create mode 100644 recipes/natgeo_kids.recipe create mode 100644 recipes/natgeo_traveller.recipe diff --git a/recipes/icons/natgeo_kids.png b/recipes/icons/natgeo_kids.png new file mode 100644 index 0000000000000000000000000000000000000000..de34804a9a69c46382f2f9cd09a1b8fa3d4c8a85 GIT binary patch literal 8365 zcmeHqRZtvYv*q9}0|a*&Tn7jc65J)Y+W>KBM)DUPTW7-d$@==CK07_zTdiSmknA&1cH z(p!Erv+@c}5Q-AT1>pN-{uJ6Iy5qYrIv`c0w`LOS9w4h!s08$FPbeR9+zaj95ZLqJdiP{|{B6=3rR;_@Ju^&zeT z#h~vH=uBfzo|wWM=HEDAy8nEC^^4Zz@o53NV_3!3BglXzBd|PFR~L472R9_q7p!*c931I7OtQd9Qw*_=Zbo{te;sKKRX2_s^H$7aPkXtiid#ujEzn8K?*ppcNO-ksbNE z33mXu;(DT~B|aAISn!ad%2HWGJNJj0{ooPKpr3>&QydUjp@V}mN6i~F^k6ki%UBda zD#HgB><#7hBn^YB3a4n4!16%wknnd&XHg@rI?9!plYX3^Wlmo_)w|fXDKEkfcbhJU zA5eYY{>Akt=#|H3s37FP_!}xTz$J-NnCg^z8G4x}&&URxiu^F7_e0ej<%<7G#5w07 zQJTp%@l*j)!ax6E9%G(qUTWTuFSADW8(l<_=GZe_WsiN&dJm7DA7~8J1;)W~6F(*5 zCPETdh(u*l&6U=q$I4PPr!~&itYof=J{8>+0gE$_d`HXvv_DpRAK1>Eo;Y`GgP(~PIdlE_#EQuj0 zza&E=PUA}Bxm2VyPqDb9rNplUToNyDnwpndIH3oxhhkbz!$p(Ape>euP*=&(gU zsx+uNt2wJH7_*AA>Q<^8SMo_@mQt1)m2FmwmKPQLsBF|OmDx{ZGf6EaFZPyiF?E+v zG3Jzq#T%5bH+PhDRC|B({_;eK(-__&+bG*T;-?Y=QUaUY)E^jg>P_ge%T^nl#0l2G zim0mn^n7e^ykpnMx3iuEEJPTo40GLbiF2Ke!|Ng(>Kw|??cTrXR_lK8e(^TFjlCuO zP4SzCLJemV+Xbfs=P&;vpCyG3#kQ%E$w{-v=z3j?V@z{wo2*<6HR(dk``CNZd-4EY zUV-n~%~_KfRT<&g%NcPEM;wkiz>^NL^~q`bh&8XK_yf>E<$?9&YyE6PWUI}$43qMn zxXwDrR%mGwG!&X!+81s6#=(72(X7^n3S#W|vMIY3Zn0nkhF05BSo@6MwdD3t?bK~K z^lxwaXTLC^4fU@T_CaP7f06F$?LSN^jIpj)4%<$ztqPAmCSk1)Z$Wz}GOBafH(JKr zr>7QS*@pS=#nh%?fBey&;5av z#xeg>K1mO1kD=f5i_O1_Sufkq8;>NoYWCCK3@f+$?b!pNzp7{Rx}ttdN~ ziMUZX$C$(6sNsU)(~eG?PELcC8P0nCt4Ff8v6&Q(_DS0*^~VEN;|Gyb)`N53CB9F6 z&u8eO|HW{?V9a1gf4F*gDs`$$Nuf$+9d0+qxvwgtFPkc>B4;Q|6PXv)5XTyy8QT(H z8|_DUNT5s6ihuDYfZm^#j-`+jm`lGzXO;{225&!{M*Pw1b`vs^-pjAGsW@w}(DY z@3icF*}J(Jz0*6be4)PeJG~F_SvjxWd~ONUbZS!Tt?#;Y1$G`C;Gv3X{i$!}{E z()f%vuQtPy#}Z7SQNYKJ7}pgSkETI*-hf@#Q1AKJ=5!9tecGMs_S5bDD9`A8e76m{ z{gK$T$o3-3mfq^aKJrEAZpp=14``vR%xdl62XPxyE%ZQZwX#!8qb%M3(1zL zYL2d`sh`9jNoTx5=C9Vr=6A!7N($*#+;XOQM_ZfE4~mzm&wd~LLUgyjn)DvHohkOh zUWZT8R%Uj1dY#WfCsv1?$%a%&BS>ehv5u{HS=b(%yxj}#E2j?kvmJ$bkdyBd_mt4Eule zs=azT6tn+Jnt_#FI^xJyQu(FkTo2>gJ_dYM+?!(o~56OPCN<3z<#t(M^m{0sI z8CPV_?LWR8q@RC1H_hi@;^M?G7C%CiB*E_poJ9%cqIE+k#6thSMnSI@BziCLO_cEQ z{Z&%HUtxgfCTc+=CZL4{zyhh?rjM=_-YoS0tdEShRm6<3!lP~2CQK#`( zgPm8OK@TrX0ZqMS^(47HjIDP*D3us*&F_wt5Dj~t%dg4s{joFlM{jfHa>h(ojQ^ZF zh7?T4llMCK;_b;kP23v!o6Pi@az8yw2jxB{!Tq+ZKc*n`*0q+2eSKZt=`is(BGEEz zRQl;!eB3I{Ri6&({K~X?XJI@2SLThF=`uXt*afstq;6qpkbRsN{BHhr<>cagJ;1R@ zjC)FY?C%7|-$R0zQNou_rWd`r*N!{g{=r8aVc)xjdKmSLu(lUcwg0Sz9B ziTwiOQ~zw}NglH3fM0UUZBK(u?z>Z!<(vP)z|W#`QsL7JhM|kxH!B(fls9QDI40_Q zWv=aJGE_XBf&@{G*u zQVckhTQUiW?$MF-kN{S=F_33qfEJGw>m#}at%ITx*QMGTmC_iVimR@WRi{PGMo5u3 zI1-3Bv1gsBZ@c}}kNBJ>7;{KR9eI12gx2KugF5@i&CqU^nZ1#j$4=YqalzU{KLB*6 z8Ecxlr3xyTh!qS{Kmy@Q_9GS{q^Q8wVZ!_tfrbY90@? zsrZR|3PO+#aL%~*7+dbE%5G>+3SB!%DFmxu8>$9^x$Q*rfT9Aat zN~h0@68nkhWI6b&?SyP=Hl%&L#G;pXHuob=n$YY-A37SbSjuzAMTLbqF5%DYQ|h&- zjLh`RmY?xs%5FReU}7uc6H~oML^5KX?%AbZ+Z<`#2X_w5(rP0Wt4!?nD15p()vgwM z@8^HO^8*AqoZR#b7uUnTH5+J95~p)IK#e2Qut$D~m>b_Lv% zkkRQA((U`MBr6z&ro!i8Hb_nx15y%bCmgILvAk;;ST=L8)4|57GnnAj@1fn4#j-~G zUnNW-cGYXx{Cj=#a0z>^r2|3E5$MhMw?GD=4$e*X(zSVF&#BL-{`e70ON%gQX^TaN zMda|Hz3Y^q)`AOa9LP(9ky)b*>)W%#yNOff)x?r`tG9>{L>-3wc0-s+O_hO6Khh$3 z+ZCK*unJmzX4Yt%oGbr!ZmgzX75mflCz8&FKh5k{-9@KM>Bdd%cxjCYLV1Ie4I<)5 zH5S!?zJV++U#UzHs*oJlwRX~6c3+rM4n;`80%hMmfc?=-p`^)TSKk1mIK$*Dc{$zJSkTYi$WP|X+I{NSAts6jiT;h9}}cS9vrg^Ak@ty z8vau4E=1409TK|763g7O9`q9>QfS2wF|GRK_aOadxZv%QZI)`Lnj9)vdsCd7f%&cH zb;c@_4md;v8EK$t5K4)KJf3a1)G@5FjM(Oj+~w30ejmprgjhz&YGh1o9 zbG`p%_xM=&+B084+WRVx6AP|*y)TvfA%>p%=RGhmE^JRJH{R~S&rFeQtMz!H2KmeG z4KBI1A1Iwm@IkVkRmN|wG5O7-y~e&mOVc&VHm}M=f0G8{#5Koiavi=|)u`D zd=3?&*THzdJ6Z^c7gDOg$o)nRw_Hu5B*&A)N9+ub>+yjRh?xHj3^K-9H|#4x+#0B` zADQJ*K(xw2(0qta4a(^67VyYHt8a=0bOLV8P#W+>BR#E#(2FJ>1S)WAWWFW#tjh8W7{@Q8dugQ0&(21U zyGPMcMkIK9q4LORO@t$PSDqnLPtcSflHHJs6RiiH2%m&5?A;ubYq`bM8r_!FP z&`(v$T6RkAFeTO+-$eSo#X-gHt9D|SgOXSVv3@^B)r8`e(625Yr?za%fg>cD1BeR> zqv*f)IFq~kw_&lFL_sgZ*%ZME65d2XuNWZm9^Bn=!5~4)bF%$Y_kHBX@?l^uBg^NH zzI3W3nOJ_+cJ0KC;TWI77&CL3CNRhXXQ<>5^_qy+aoK`oht?iXkP(>g5zq)S5=bIY z!V?uzay8`X6eMn+`F{BC{SoduNMU1p>Uk!1}l zR+-ahD==mssMEikm~Va~=_e6Z%FUTGqOZ=5v`3CjW|#W7maU)mQA3Bx8RYfPHf+&; zOYmFQVf&6W>yWC}lUo|hM2jn|-G6+UOsX4eAm#wSQ)CLOvS})cGzeav2Vxh~cUHTz z%Bp>$AQjNt*{G84uX3A9J?R@jmrQMAcK;UDj;*tPVzB2K_1gnqM+(#o+RfF`L@IFh zB~Rqr)$$a^)ibacoe=yU)D7X!ShJU3!VD_0u>o)xvyH`klbvNQYL3gk)sH$_Pf>P4 zb^)3{x=F1wo;o7Q7WLqHJuvF)o|MpS;a;KhMMm&VQ1I$EhSZLgu(nP&#is-91@El`9$p@eT~Zu25W5j9C#^U8eD zDPxp<4{eJ$^oZg=(5w;W{daXNj%sS`KPh3@McXALaL{F7a~h{U^bfL;DXOOw7KGj9 zrDUdru^=e*gAwI4#Dls7P9*(t!Gpg+Azg993E@uyLR>hz{`iiw7BaYjER#UA^=Ri3 zglI6$CsHOLG4r=fctm1wN%E$;(X>g1MA>%vR{rC78Z((b}RQqC?P{QGG^@SYuOlNO-Ubi!}Rx z!QC%|y2R6#;CnvPU(SZqF)?fGb{W9G1tNkI+c;kX2r;iMi5xIcy!bF%LxV-h!VD!P z@?r)!3u!;feos?NU@mr2Vu2^Cw=ogzjj?PJ{LlCQCuX`*S#thyrBmdj$O1Qv8elD? zb!5UU4-NjG#s3FN{yY;`!x&Bf19>A}i8oUq@}U%4X!kH(32;k77_@du zgB`@&>;E3LFGYw{V zaCA}CLYOJ~oCN@&BPvKsXvvRir)+$I>H|Ii1{h>Ri-tSuY|_p)rBAGq$;+|qs*O=9 z%_0wAY7R$c6yHsEBL!kx^);e~0+6Z`3AD5!RniUbQ01Gl5=8kzo{B(fqYnT~O*Ot` zLJ=Z=g$S|~vp{lefCiREWH6_&t1>c@Gz>ke1*JX+XIb1;2$m+TBJO+1QgF%a_Ve{C zL>Qa_3AX|CArm%d@zpIwx?Cu)qoT@1Rg66QNFiPUkro zNc!k%DJs$x6Jd0^dPQVvixtp22afY#fX1u_DwsuiHwcZ27d{RO?nG*}(L7>HPs}3G~FEaxwx_v35 zdP^IwsV9B^zFARbXGM|dY%Uj;b$|YPZy+{+J`GU_>_<-9g&=~_%>spqX)B{}9Hhj8 zY~5)75Rs5O#f6Y{L6qSjH$CN$0{)m@@1~o|58dG%BZ^|KRqKrOQj*b5LGSI45enR{ zAy(n|UC_$k(B4+PF?F3njvGpWNi@@?$DFLRWLzm+U9F7u5Gl-Aq^g0z4}x1}sKuo- zMYKp{h=opYDpYC|gsRIU&yf0P2@K#e2d99hxVT%A7QkQFzs!sYofcC-gAs|j;@f&K zy*SVv83B_6RA32Y+fAAM_9YU6$OKYh1BhdQ>Y)fM@(^ajAo)V8XiS1J;x6zfax2 zdN#aY$K*Wo)`e$9M^+noN7h6JbqU=f1QIIOqXo9gn)KGt%!RXlE~5vCg|NrZK}g(n zrfb-&#nE(UELVI^DIN@pzngH0`Fs34$u~xUq;+!pLtiMkYcB3C=l2Ej3=W5HKAx@F z@*j5|s`h@^94~iyThr}pD->e)K0J2o+*xtUH2?j}88eq9`2Mcq^S+wFTS3Goy7w;X_EcTuA@Fjq4_v+={30Y~LhGswVqEGAy!stF_Aa@jv-ZqOS#!7j z)mAB@;+I)vSCX;u5o?vcJTWJS)_D=7w?3q7LB|!lUS{Q?q47`CoJI}k_F)f=8ALdZ z#Z}~=^1nLn)ie#N-i?AGHxF%}2ETIYq-Es|9ch?Tqs8xiRw9s3omWplma2DlNaXl} z-2d!1$?7!zx5=cdUEiK6wQ6mH0&48$cc_|4awGG4@a`_mH6kF>J|Lslo$eYk_;O&@ z9N_P~Us>Jy{jC^bz`~0zGi()ndzXCP@bDXXFpU+PmpiuW%H0=m70^8&e9&>&^HE{6LQq%bn>C^VU~&c?kV>W?I&Jdr}x<> zc>EJZ{ziOnn^DbFC*BP12QS!bHn-8!b^bQRw!Nv}_Y|5P&%Jz-$17#xpCU37Nbo56 zpMb;eW=W^O?7jL)sfG)FLjTXdA7&c2FJpbg$Z+G_w9h9^yv@ris+iBW+FYys-Q8^Y zVO-lH5B=ks#a{Pz-3vO%i?C|e)V!D^+w2C5!Nr|O+Bblw7Ukg*0`LY-}9`O+sP0V?!|8@<{w>3-0GxW2d-@ig4 ztl-HMZC)3sh}LG;bxciyyiErmS`zgqv$3Mlp#VireFxD_vW483_=)JZKo1vkoV$*x z7ygtpjvbCsJMDG+*66`h^TxS26lHAs(I{n^zLLDey#Pl5iYmIG)%$A%OUy%lfzhXE+as7lvKng#zCAP=42 literal 0 HcmV?d00001 diff --git a/recipes/icons/natgeo_traveller.png b/recipes/icons/natgeo_traveller.png new file mode 100644 index 0000000000000000000000000000000000000000..69834ab831a38a5905bfb898a4ca89ae5f1b7bf2 GIT binary patch literal 176 zcmeAS@N?(olHy`uVBq!ia0vp^3LwnE1|*BCs=ffJa!(h>kcif|*AH?w81S%OkS|=& z#QnmL#~!=5|3wRP)uJ@E z0K0wj8FaQb6up(6o3mc^p{{@f10xHZn6X#E@blldcjX!SXZz+JiWcCz!KiG{H>oA% RZ6?qO44$rjF6*2UngHjxJvaaW literal 0 HcmV?d00001 diff --git a/recipes/natgeo_kids.recipe b/recipes/natgeo_kids.recipe new file mode 100644 index 0000000000..8884f8488d --- /dev/null +++ b/recipes/natgeo_kids.recipe @@ -0,0 +1,95 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 + +from calibre.web.feeds.news import BasicNewsRecipe + + +class NatGeo(BasicNewsRecipe): + title = 'National Geographic Kids' + description = 'The National Geographic, an American monthly magazine' + language = 'en' + encoding = 'utf8' + publisher = 'kids.nationalgeographic.com' + category = 'science, nat geo' + __author__ = 'unkn0wn' + description = 'Inspiring people to care about the planet since 1888' + timefmt = ' [%a, %d %b, %Y]' + use_embedded_content = False + remove_javascript = True + masthead_url = 'https://i.natgeofe.com/n/e76f5368-6797-4794-b7f6-8d757c79ea5c/ng-logo-2fl.png?w=600&h=600' + remove_empty_feeds = True + resolve_internal_links = True + ignore_duplicate_articles = {'title', 'url'} + + recipe_specific_options = { + 'res': { + 'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500', + 'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.', + 'default': '600', + }, + } + + @property + def natgeo_parser(self): + ans = getattr(self, '_natgeo_parser', None) + if ans is None: + from calibre.live import load_module + + self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo') + return ans + + def preprocess_raw_html(self, raw_html, url): + return self.natgeo_parser.extract_html(raw_html) + + extra_css = """ + blockquote { color:#404040; } + .byline, i { font-style:italic; color:#202020; } + .cap { font-size:small; } + img {display:block; margin:0 auto;} + .cred { font-style:italic; font-size:small; color:#404040; } + .auth, .time, .sub { font-size:small; color:#5c5c5c; } + """ + + def parse_index(self): + index = 'https://kids.nationalgeographic.com/' + sections = [ + 'Front Page', 'animals', 'history', 'science', + 'space', 'homework-help', 'crafts', + ] + feeds = [] + for sec in sections: + section = sec.capitalize() + self.log(section) + url = index + sec + if sec.startswith('Front'): + url = index + self.log('Fetching articles from ', url) + soup = self.index_to_soup(url) + articles = [] + for a in soup.findAll('a', attrs={'href': lambda x: x and '/article/' in x}): + if a.find('img') and '/games/' in a['href']: + continue + url = a['href'] + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + return feeds + + def preprocess_html(self, soup): + for h2 in soup.findAll('h2'): + h2.name = 'h4' + for img in soup.findAll('img', src=True): + res = '?w=600' + w = self.recipe_specific_options.get('res') + if w and isinstance(w, str): + res = '?w=' + w + img['src'] = img['src'] + res + return soup + + def populate_article_metadata(self, article, soup, first): + summ = soup.find(attrs={'class': 'byline'}) + if summ: + article.summary = self.tag_to_string(summ) + article.text_summary = self.tag_to_string(summ) diff --git a/recipes/natgeo_traveller.recipe b/recipes/natgeo_traveller.recipe new file mode 100644 index 0000000000..eed9e5a0c2 --- /dev/null +++ b/recipes/natgeo_traveller.recipe @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from pprint import pformat + +from calibre.web.feeds.news import BasicNewsRecipe, classes + + +class NatGeo(BasicNewsRecipe): + title = 'National Geographic Traveller' + description = 'News articles from The National Geographic Traveller, Download Monthly.' + language = 'en' + encoding = 'utf8' + publisher = 'nationalgeographic.com' + category = 'science, nat geo' + __author__ = 'unkn0wn' + description = 'Inspiring people to care about the planet since 1888' + timefmt = ' [%a, %d %b, %Y]' + no_stylesheets = True + use_embedded_content = False + remove_attributes = ['style'] + remove_javascript = False + masthead_url = 'https://i.natgeofe.com/n/e76f5368-6797-4794-b7f6-8d757c79ea5c/ng-logo-2fl.png?w=600&h=600' + remove_empty_feeds = True + resolve_internal_links = True + ignore_duplicate_articles = {'url'} + + recipe_specific_options = { + 'res': { + 'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500', + 'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.', + 'default': '600', + } + } + + @property + def natgeo_parser(self): + ans = getattr(self, '_natgeo_parser', None) + if ans is None: + from calibre.live import load_module + + self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo') + return ans + + def preprocess_raw_html(self, raw_html, url): + return self.natgeo_parser.extract_html(raw_html) + + extra_css = """ + blockquote { color:#404040; } + .byline, i { font-style:italic; color:#202020; } + .cap { font-size:small; } + img {display:block; margin:0 auto;} + .cred { font-style:italic; font-size:small; color:#404040; } + .auth, .time, .sub { font-size:small; color:#5c5c5c; } + """ + + def parse_index(self): + pages = [ + 'https://www.nationalgeographic.com/travel/topic/national-geographic-traveller-uk' + ] + + feeds = [] + + for sec in pages: + soup = self.index_to_soup(sec) + parsed = self.articles_from_soup(soup) + if parsed: + feeds += parsed + return feeds + + def articles_from_soup(self, soup): + ans = {} + for article in soup.findAll('article'): + a = article.find('a') + url = a['href'] + if url.startswith('/'): + url = 'https://www.nationalgeographic.com' + url + section = self.tag_to_string(article.find(**classes('SectionLabel'))) + if section.startswith('Paid Content'): + continue + title = self.tag_to_string( + article.find(**classes('PromoTile__Title--truncated')) + ) + articles = ans.setdefault(section, []) + articles.append({'title': title, 'url': url}) + self.log(pformat(ans)) + return list(ans.items()) + + def preprocess_html(self, soup): + for h2 in soup.findAll('h2'): + h2.name = 'h4' + for img in soup.findAll('img', src=True): + res = '?w=600' + w = self.recipe_specific_options.get('res') + if w and isinstance(w, str): + res = '?w=' + w + img['src'] = img['src'] + res + return soup + + def populate_article_metadata(self, article, soup, first): + summ = soup.find(attrs={'class': 'byline'}) + if summ: + article.summary = self.tag_to_string(summ) + article.text_summary = self.tag_to_string(summ) diff --git a/src/calibre/web/site_parsers/natgeo.py b/src/calibre/web/site_parsers/natgeo.py index 007515725d..3e60a3e96e 100644 --- a/src/calibre/web/site_parsers/natgeo.py +++ b/src/calibre/web/site_parsers/natgeo.py @@ -15,9 +15,11 @@ pprint def extract_json(raw): s = raw.find("window['__natgeo__']") script = raw[s : raw.find('', s)] - return json.loads(script[script.find('{') :].rstrip(';'))['page']['content'][ - 'prismarticle' - ] + content = json.loads(script[script.find('{') :].rstrip(';'))['page']['content'] + if content.get('prismarticle'): + return content['prismarticle'] + if content.get('article'): + return content['article'] def parse_contributors(grp): @@ -104,12 +106,37 @@ def parse_body(x): if isinstance(y, dict): yield from parse_body(y) +def parse_bdy(item): + c = item['cntnt'] + if item.get('type') == 'inline': + if c.get('cmsType') == 'listicle': + if 'title' in c: + yield '

' + escape(c['title']) + '

' + yield c['text'] + elif c.get('cmsType') == 'image': + yield from parse_lead_image(c) + elif c.get('cmsType') == 'imagegroup': + for imgs in c['images']: + yield from parse_lead_image(imgs) + elif c.get('cmsType') == 'pullquote': + if 'quote' in c: + yield '
' + c['quote'] + '
' + elif c.get('cmsType') == 'editorsNote': + if 'note' in c: + yield '
' + c['note'] + '
' + else: + if c['mrkup'].strip().startswith('<'): + yield c['mrkup'] + else: + yield '<{tag}>{markup}'.format( + tag=item['type'], markup=c['mrkup']) def parse_article(edg): sc = edg['schma'] yield '
' + escape(edg['sctn']) + '
' yield '

' + escape(sc['sclTtl']) + '

' - yield '' + if sc.get('sclDsc'): + yield '' yield '

' yield from parse_contributors(edg.get('cntrbGrp', {})) ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y') @@ -119,15 +146,19 @@ def parse_article(edg): yield '

' if edg.get('ldMda', {}).get('cmsType') == 'image': yield from parse_lead_image(edg['ldMda']) - for main in edg['prismData']['mainComponents']: - if main['name'] == 'Body': - for item in main['props']['body']: - if isinstance(item, dict): - if item.get('type', '') == 'inline': - yield ''.join(parse_inline(item)) - elif isinstance(item, list): - for line in item: - yield ''.join(parse_body(line)) + if edg.get('prismData'): + for main in edg['prismData']['mainComponents']: + if main['name'] == 'Body': + for item in main['props']['body']: + if isinstance(item, dict): + if item.get('type', '') == 'inline': + yield ''.join(parse_inline(item)) + elif isinstance(item, list): + for line in item: + yield ''.join(parse_body(line)) + elif edg.get('bdy'): + for item in edg['bdy']: + yield from parse_bdy(item) def article_parse(data): diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py index 56b1db6f1d..172088210d 100644 --- a/src/calibre/web/site_parsers/nytimes.py +++ b/src/calibre/web/site_parsers/nytimes.py @@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr from calibre.utils.iso8601 import parse_iso8601 -module_version = 10 # needed for live updates +module_version = 11 # needed for live updates pprint @@ -183,6 +183,7 @@ def parse_types(x): 'RelatedLinksBlock', 'EmailSignupBlock', 'Dropzone', + 'AudioBlock', }: yield ''.join(parse_cnt(x))