From fc8e5feabd64685f6529e4d5649de4f27d869b67 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 22 Dec 2011 19:04:37 +0530 Subject: [PATCH] ... --- recipes/icons/mlody_technik_pl.png | Bin 0 -> 2104 bytes recipes/icons/mlody_technik_pl.recipe | Bin 15086 -> 0 bytes recipes/ming_pao.recipe | 45 +++++++++++++------------- recipes/ming_pao_toronto.recipe | 45 +++++++++++++------------- recipes/ming_pao_vancouver.recipe | 45 +++++++++++++------------- 5 files changed, 66 insertions(+), 69 deletions(-) create mode 100644 recipes/icons/mlody_technik_pl.png delete mode 100644 recipes/icons/mlody_technik_pl.recipe diff --git a/recipes/icons/mlody_technik_pl.png b/recipes/icons/mlody_technik_pl.png new file mode 100644 index 0000000000000000000000000000000000000000..9529ff05113d1d42980520b3ec02306d90ead7f4 GIT binary patch literal 2104 zcmZ9Nc{Cg78pdOdbntYxK?r2qhctgVeD zPC(uNSYB8#^Hv_+5j%$_STjMe@ha-JucWt#@M*W3uyDVt=@*4 z006*1TT3&SkjX{&0g@t0kxs83s~AJQNL11@tv#>Xk)poZQle#jUC3NwUrf_#!%w4l z1sGDxsrNFo-8%ZAC`QA_@uTTwkBO-aSrgnwD}?A(JCwd&Fg7z4J|5&mprS(oP0 zL(5p+HkrGmPY{z*)N-sby(TO;fK)G%97`6)9$%r$k9fTAls5+$a>1$x-C!ki32?67 zMrcO%yu`;4xbjFGYxK((PA?3`QZ-6gQGaJ2+Qhb&ppu30_>R=FR!lH~&{s$m2rdS= zKfKRdoIBY1u;mxGwzJvjMpZnXFSv50h%OVsDh)G0@>jCt?5NQ_drMtTZY-2M7h8Qs zM_Q<8;6uY0s9XEw3%?3Z%WqI>U0^?`GPV#(&GzdDfujr10BUVsJOL!~CgKaYQ`kp> zWltUOIhlQ@4~nzke)Qw3K8(}E(j?!$j0I!SX!NVtZXbGhcjm$K^w4~!Fd&A7;DQ`} zV17;Co!iuyO(?0GXm))c^f~@(jXt;1^F98#FCkm2EcPP5#{>2=s@JWCz>j2LDh-n# z4Y_-UGfDyz850Ifk0x(GC287la@3U9fTeqN#)+}t5_Tmx`5E8G5ly*+L$QCJrsRMW z5;VPp`rs6@b{V29uCLI$OZ@dorUC&#yrf-f*TlqAagDz?Hx-h&?J1(ybDXoRn5#<@ zgUG?r{{DU(lCWs}?p!0iTfF7U@&>#) zqCX2rQY*DX?rrqSO;39?KyVhfVh#1nqkIO>Y!ZcRhWM^r;d)!p9Q>vl(j;?#Z=21| zJlNY$mr3Fm5CGhgSD+>~J9^@2;{59L`N{ICNcd#N?vK@^RDTXjRU4bh9*LUT-`_1Q z9cmb?Ox}{YM}o4xF2aH_Rkbux(S#gL=DVhylwud2Oj{xjPr$)Ld=uPTkJf(3`~Rx;uJfjpLn8;UBkqaLc4(R2d!-<8WfL1YwlxX!Q5=sbU67{)N9m^(n2mD48gx$MU2x0%FP78IO0$mMn+lhO}q$SGN`$u+fvxVnu>q zR*BgrG!KUv+#cxc>_qtH!9HYO(gyytFh#}~>um>>(v{j~NA=eIZ66HDAXzR$-~O@vpfTCw>&-c*MD5Xofc$27cP zdNXl{%&dY|+9U4E)f?6Dssq9cT9Z;z2Gw-ty`P>HMF3meO~VE{$)9^eE^40&YUmyP zakafMvHg7~_BdVL!Dyp};UQhDVQW71=}yS)Uzi9hwW`@G#gk65ReSujkK?n11N(JqNGF%}5$g0Z`irq}^i8}GSWX8Sryc35%t4y*a6|@z1 zfYajAx-1ZZ=;kKDqQZ0H!x4kEL>%cr`p(Xgx`p9uO96H$CmOk4PL00`)O5qkKe?u^ ze4!#B!-5L2aIhIdAE5*|DWdj2{HpK=t)EFwf!n?tkbOMjaZC%+eVSMxsemf5dW@Ju zWZ1J+_k7k{hr;m(GGhAT&cx=!p8^+KETuhj&+OEJRYXkJtZ8PQPmIod%Ic8YL*EHo z1XG6X4^wmM%TG1>GW#z7?AzU3a%kHb(wb=cU{guK6`qqk%YoVbF`Z;BTI9rLM zN)i#AlnbiI(z*cZ**fd~=d-cv6(h?R^CO$hI0nAwU`&K57@2CIbuMX%zwj7~d>u<+ zLe=Ecawq1$-m@mDZE6a>5vg>g0uyc+BTWCZV>mZuLibH%*1Om0#BZ`_sIv9dM8C); z*G3NJWqJIFHkC|fMntrI#ZC6~w2`fGG`Pdi#S#w04gyIh-{@0j#wdvZKq?l&T;m7d z5B2tHpOip8A-|WFUSIFd*ULeDJY7_;)4lQUe>t^vbhN>}9^w4aG|Z{4Yib&{(V}i{ zzE}NIzLXRCCewy4#H|mSbR|u6UD-azx_5;730XGhCCZCWOM>QGKj*wob;(wudzRI$ z{V*2Ut>I6-w)!(GgjV0c;RBIUb)!?V8=0!J`VWux((X<<@f{2R1oLmf_H_`Q{h<6C z>DkeLJaWf+B>_#jvx(`|!&>RdCSdia6Z^7NOTNjthyT>ue|v6UgfC`BduTHA7QQZM zp_2(Hd_tgqLXZ(5E=T}?o-P!67OHnvPtOI4Fw%t?K@q2Q4UBYkYv*|K{}YIe4h+4V c{Qm(xBe>o_fuxcc89@MGi^5nooBLh=3&|nw^8f$< literal 0 HcmV?d00001 diff --git a/recipes/icons/mlody_technik_pl.recipe b/recipes/icons/mlody_technik_pl.recipe deleted file mode 100644 index f689e69a920f4646de8501bbb092b42fa8f67d71..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15086 zcmeHOdyJG<75|pk@+w&#%knCcEtaJfB!vPSB@k&XiZp7Wv@ub%jmr{Hq)2r`sj-Qs zVx=_}`w(TfwlSLiQG~TM_7A0?Emp;Zmi|#BEi5xTkDd4KJa&)wcg~#eyED79FatiK zbCbLGe)pbx&+q*1x#!;bX4Y|PoRQ9wC5|cQIluUz<1BX^XWqR0bDiUyGi`OIME}-N zj`JNO&}3xH5NC_YXy-KFai&a=EDhDaRR*dIR2ir;P-URXK$U@O$3RU@hH9LCLg_e3 zH@oeoZ?t1%R8F5SGX`_kj>yT}kR9TAkUqvxo^9^DiqDjcH?Naz8=}tGjBG&(Ih|*S zF*+CK*R1;SEcTR-Uv(-L)pNz=aF~bd%iGI*8M%t^>-?r%vbfu7j?a@?7X1EYAYQpSJ<{{zuJ@O&$&1{Uyc&+ zcL)01HdG*sHXFY^PK6b}@>ni@*MIW(htdK3Yu0oZtUd4{zuXyP{9XPuKWjVip=~() zkhSqwvVV*D2Y$?p1b;UGgs?XSzsN%l$={9rFMAVfo6CrAiVoQezz^IN z$4~fv-v6N^lM@~Vf#3!H+tvnL>}nrmfFJ$?{F=1C*nfau?N^!IZ6@Z`gv3sR<7)j! zIr8*mT>jCf>yyq1*Xlpu8p$44%)jlMDz|G_H`{;J{%>rz`Pcb@N&QFSpR7Cjk<@=K z`V;)D?I_V^`;Xepg84s~8*%Td{KxW9#5l#ETyNhNpw+7)G(gg4^Qio*|6r_+FNI(G zVFK_sFZWlBpZBkYKa=rdqhK-q^MH?Tzx@K^0(P|v{lEWRjAqZgNTEnCGpg877W+@J zZz_K9&+Aly|J(5g_Xkgh`bmafKNX?QSAs4+u}l01d-R1Dx@oZ80}?;2*dY7Yn-}x! z+hA!qu~h1UUDiSH*lVIPuv`6q@e&`Y{{y$m ze{)ZOVZX#Av>`oqAS8Di6CYRJdd|Y{ zofCW8&e5n5KDHA!RDxe^2UoZwD%Z*pN+gt8SNv#KymlMb5qC@6n@;~NOmCg(apf=u z33I)7A6^treM)?&Q2yxLGK10Tb=@~QO~DBX{x-b#=&>q_$-ieGoGbg`t*5JUUoQnk-@m3vZqJq zP&te%HMY3jlg*SIqw{JxYD79OD`PMRY}WoUqti5QtZ*Rqf&!~( zc5bLo5p+SXa*q1sx&$?^4AZ(hLbPt3DNSDg$q21p6QYF+gXVp>*N#3s*G-C>?}-Y> zpl^D;tlzx30ct-S<=oMpFy9X+=~_f~Iadbt=y7kj1#ao{rJl+wg8KM~fEusJ~y{o%Z+lbBH_I8KtJCsJj-* zC+y$<{jiZ2IS8YE;PC)$-ySpX;&S|~{coL(8(zAp$Gj)8Q!sQ_AA4OfuzuL@wTT^n{8N(kcL$RNebHEN0sq;o zyu-nGBd`pXu`3EnG0rN2b#{YcX@?Kl}qOrmLr+par;)@Zp z$Ne;EV(;MfFVc^3EnEG0|MT>t{f*a?%wr%Vzau~!QYrh<2N_R)-yr}0-lGwgQCZl0 z=x~rT+H?=1e*Pc5wEF=+VZYg_*napQ^lFBF#I*5aCARARpEw~!`yP%F)=OkzZ-!uB zv5n~O?aR{p?VyzO056lML~|iviI}nf!PmRz8K^6wAWh4 z!ba4!)OK4IeelcW%kgeGMju}Mo*REu?%2@~-L%mB1|@&CNGWIKq7TJ5-eR6&+{(SD zw-5`)fD^1?b90mq{Ul7s{}6OBqaR~0`QqIEP+O3$n~*IyKXE4F%z9ezEU=^-GEE(Es+?!nkkaM!mDMkETt*_YM;O>@yzc;Gt*D_gLmE zdgq-a;cR*J)g*0e4b!bFg7mFNBFyvIvuW-_EZV!Thvwgacj6gZzA`b4e!=^j&IC0~ z_3jV0ew+vJPn;7gR>o=BEpeJPQ_c*`kC@rmm|#8V|G?ZBpC=fHd7vNIV5jn~C3r)^ zIXGpqp24#I7_W0^y^<3C2kyKz^l)I!(0=5l1oN-wpRGr^&{EgPmeIP(Xo+^*FIu+7 z=!Z{)Tt8O(ac)2MgCK3ZD`f6RL)))&+oe4}+Vy#;5;oz^w(J(aSp&}xu^u@8R;}{W zgz@4lSc55b8D0yn7e8yCmgv7st{p9Q;ohd_kL;Hm$6uU3de@xMfbRn`)WF{dWVo)+ z@?C(}SL%BJjJ3y)t(AR+__wW9a%)S7_s@p)L26nMaP_+%1ge$)O9nU?S%3c3Bv`7r ziuV-ewFJMbaNkpy|26w>dru*;Nbf9m+|o8V$LyFTantLcUY|5)RGj9|Ptf&qVs0Jn zH#AE7jD{4AAD5ynUx?F|&nM~ToA6mp=J>?A82Qcb{sMsV8 zX@BxypNS#3FK3zGonMtWhdJ)t5OwKqZ4J2fZ#>vT@ubNA^f2O)ci-Oe75t{hd>1@Y z#_Ktb@mk-rCrpW?#M#3<9(QuNKYVRhm`0B1J8LgucX zX8o{VbY|(blUnbmKc7mtarT*Kfxl2cu)~J=jVb0&=aAC7o8lRK-G86h27)>|LOf2_ zb>m%ebGG-AKhR6ZUrSQ^b8))ZmF9flaalvGgEx1UzI;FKqn>|EnwX '': return __Date__[0:4] else: - return self.get_dtlocal().strftime("%Y") - + return self.get_dtlocal().strftime("%Y") + def get_fetchmonth(self): if __Date__ <> '': return __Date__[4:6] @@ -268,7 +267,7 @@ class MPRecipe(BasicNewsRecipe): articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: articles = self.parse_section(url) @@ -305,7 +304,7 @@ class MPRecipe(BasicNewsRecipe): articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: # articles = self.parse_section(url) @@ -322,7 +321,7 @@ class MPRecipe(BasicNewsRecipe): articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + if __InclPremium__ == True: # parse column section articles directly from .txt files for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') @@ -330,7 +329,7 @@ class MPRecipe(BasicNewsRecipe): articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: articles = self.parse_section(url) @@ -410,7 +409,7 @@ class MPRecipe(BasicNewsRecipe): title = self.tag_to_string(i) url = 'http://life.mingpao.com/cfm/' + i.get('href', False) if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - try: + try: br.open_novisit(url) url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article current_articles.append({'title': title, 'url': url, 'description': ''}) @@ -437,7 +436,7 @@ class MPRecipe(BasicNewsRecipe): included_urls.append(url) current_articles.reverse() return current_articles - + # parse from www.mingpaovan.com def parse_section3(self, url, baseUrl): self.get_fetchdate() @@ -559,7 +558,7 @@ class MPRecipe(BasicNewsRecipe): photo = photo.replace('class="photo"', '') new_raw_html = new_raw_html + '
' + photo + '
' new_html = new_raw_html + '' - else: + else: # .txt based file splitter = re.compile(r'\n') # Match non-digits new_raw_html = 'Untitled
' @@ -622,23 +621,23 @@ class MPRecipe(BasicNewsRecipe): #raw_html = raw_html.replace(u'

\u3010', u'\u3010') if __HiResImg__ == True: # TODO: add a _ in front of an image url - if url.rfind('news.mingpao.com') > -1: + if url.rfind('news.mingpao.com') > -1: imglist = re.findall('src="?.*?jpg"', new_html) br = mechanize.Browser() br.set_handle_redirect(False) for img in imglist: gifimg = img.replace('jpg"', 'gif"') - try: + try: br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1]) new_html = new_html.replace(img, gifimg) - except: + except: # find the location of the first _ pos = img.find('_') if pos > -1: # if found, insert _ after the first _ newimg = img[0:pos] + '_' + img[pos:] new_html = new_html.replace(img, newimg) - else: + else: # if not found, insert _ after " new_html = new_html.replace(img[1:], '"_' + img[1:]) elif url.rfind('life.mingpao.com') > -1: @@ -675,7 +674,7 @@ class MPRecipe(BasicNewsRecipe): #print 'Use hi-res img', newimg new_html = new_html.replace(img, newimg) return new_html - + def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] @@ -684,7 +683,7 @@ class MPRecipe(BasicNewsRecipe): for item in soup.findAll(stype=True): del item['absmiddle'] return soup - + def populate_article_metadata(self, article, soup, first): # thumbnails shouldn't be available if using hi-res images if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'): @@ -699,7 +698,7 @@ class MPRecipe(BasicNewsRecipe): if not articlebodies: articlebodies = soup.findAll('div',attrs={'id':'newscontent01'}) if not articlebodies: - articlebodies = soup.findAll('div',attrs={'class':'content'}) + articlebodies = soup.findAll('div',attrs={'class':'content'}) if not articlebodies: articlebodies = soup.findAll('div', attrs={'id':'font'}) if articlebodies: @@ -721,12 +720,12 @@ class MPRecipe(BasicNewsRecipe): # display a simple text #article.summary = article.text_summary = u'\u66f4\u591a......' # display word counts - counts = 0 + counts = 0 articlebodies = soup.findAll('div',attrs={'id':'newscontent'}) if not articlebodies: articlebodies = soup.findAll('div',attrs={'id':'newscontent01'}) if not articlebodies: - articlebodies = soup.findAll('div',attrs={'class':'content'}) + articlebodies = soup.findAll('div',attrs={'class':'content'}) if not articlebodies: articlebodies = soup.findAll('div', attrs={'id':'font'}) if articlebodies: @@ -908,5 +907,5 @@ class MPRecipe(BasicNewsRecipe): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): opf.render(opf_file, ncx_file) - + diff --git a/recipes/ming_pao_toronto.recipe b/recipes/ming_pao_toronto.recipe index 739a808aba..84001d3952 100644 --- a/recipes/ming_pao_toronto.recipe +++ b/recipes/ming_pao_toronto.recipe @@ -29,14 +29,14 @@ __Date__ = '' ''' Change Log: 2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away - from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day + from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. 2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010' 2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt 2011/10/19: fix a bug in txt source parsing 2011/10/17: disable fetching of premium content, also improved txt source parsing 2011/10/04: option to get hi-res photos for the articles -2011/09/21: fetching "column" section is made optional. +2011/09/21: fetching "column" section is made optional. 2011/09/18: parse "column" section stuff from source text file directly. 2011/09/07: disable "column" section as it is no longer offered free. 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source @@ -60,7 +60,6 @@ Change Log: 2010/10/31: skip repeated articles in section pages ''' -from calibre import (browser, iswindows, __appname__, force_unicode, preferred_encoding, as_unicode) from calibre.utils.date import now as nowf import os, datetime, re, mechanize from calibre.web.feeds.recipes import BasicNewsRecipe @@ -204,13 +203,13 @@ class MPRecipe(BasicNewsRecipe): return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8] else: return self.get_dtlocal().strftime("%Y-%m-%d") - + def get_fetchyear(self): if __Date__ <> '': return __Date__[0:4] else: - return self.get_dtlocal().strftime("%Y") - + return self.get_dtlocal().strftime("%Y") + def get_fetchmonth(self): if __Date__ <> '': return __Date__[4:6] @@ -268,7 +267,7 @@ class MPRecipe(BasicNewsRecipe): articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: articles = self.parse_section(url) @@ -305,7 +304,7 @@ class MPRecipe(BasicNewsRecipe): articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: # articles = self.parse_section(url) @@ -322,7 +321,7 @@ class MPRecipe(BasicNewsRecipe): articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + if __InclPremium__ == True: # parse column section articles directly from .txt files for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') @@ -330,7 +329,7 @@ class MPRecipe(BasicNewsRecipe): articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: articles = self.parse_section(url) @@ -410,7 +409,7 @@ class MPRecipe(BasicNewsRecipe): title = self.tag_to_string(i) url = 'http://life.mingpao.com/cfm/' + i.get('href', False) if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - try: + try: br.open_novisit(url) url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article current_articles.append({'title': title, 'url': url, 'description': ''}) @@ -437,7 +436,7 @@ class MPRecipe(BasicNewsRecipe): included_urls.append(url) current_articles.reverse() return current_articles - + # parse from www.mingpaovan.com def parse_section3(self, url, baseUrl): self.get_fetchdate() @@ -559,7 +558,7 @@ class MPRecipe(BasicNewsRecipe): photo = photo.replace('class="photo"', '') new_raw_html = new_raw_html + '

' + photo + '
' new_html = new_raw_html + '' - else: + else: # .txt based file splitter = re.compile(r'\n') # Match non-digits new_raw_html = 'Untitled
' @@ -622,23 +621,23 @@ class MPRecipe(BasicNewsRecipe): #raw_html = raw_html.replace(u'

\u3010', u'\u3010') if __HiResImg__ == True: # TODO: add a _ in front of an image url - if url.rfind('news.mingpao.com') > -1: + if url.rfind('news.mingpao.com') > -1: imglist = re.findall('src="?.*?jpg"', new_html) br = mechanize.Browser() br.set_handle_redirect(False) for img in imglist: gifimg = img.replace('jpg"', 'gif"') - try: + try: br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1]) new_html = new_html.replace(img, gifimg) - except: + except: # find the location of the first _ pos = img.find('_') if pos > -1: # if found, insert _ after the first _ newimg = img[0:pos] + '_' + img[pos:] new_html = new_html.replace(img, newimg) - else: + else: # if not found, insert _ after " new_html = new_html.replace(img[1:], '"_' + img[1:]) elif url.rfind('life.mingpao.com') > -1: @@ -675,7 +674,7 @@ class MPRecipe(BasicNewsRecipe): #print 'Use hi-res img', newimg new_html = new_html.replace(img, newimg) return new_html - + def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] @@ -684,7 +683,7 @@ class MPRecipe(BasicNewsRecipe): for item in soup.findAll(stype=True): del item['absmiddle'] return soup - + def populate_article_metadata(self, article, soup, first): # thumbnails shouldn't be available if using hi-res images if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'): @@ -699,7 +698,7 @@ class MPRecipe(BasicNewsRecipe): if not articlebodies: articlebodies = soup.findAll('div',attrs={'id':'newscontent01'}) if not articlebodies: - articlebodies = soup.findAll('div',attrs={'class':'content'}) + articlebodies = soup.findAll('div',attrs={'class':'content'}) if not articlebodies: articlebodies = soup.findAll('div', attrs={'id':'font'}) if articlebodies: @@ -721,12 +720,12 @@ class MPRecipe(BasicNewsRecipe): # display a simple text #article.summary = article.text_summary = u'\u66f4\u591a......' # display word counts - counts = 0 + counts = 0 articlebodies = soup.findAll('div',attrs={'id':'newscontent'}) if not articlebodies: articlebodies = soup.findAll('div',attrs={'id':'newscontent01'}) if not articlebodies: - articlebodies = soup.findAll('div',attrs={'class':'content'}) + articlebodies = soup.findAll('div',attrs={'class':'content'}) if not articlebodies: articlebodies = soup.findAll('div', attrs={'id':'font'}) if articlebodies: @@ -908,5 +907,5 @@ class MPRecipe(BasicNewsRecipe): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): opf.render(opf_file, ncx_file) - + diff --git a/recipes/ming_pao_vancouver.recipe b/recipes/ming_pao_vancouver.recipe index 687d830db9..8dc2c78cb7 100644 --- a/recipes/ming_pao_vancouver.recipe +++ b/recipes/ming_pao_vancouver.recipe @@ -29,14 +29,14 @@ __Date__ = '' ''' Change Log: 2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away - from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day + from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. 2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010' 2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt 2011/10/19: fix a bug in txt source parsing 2011/10/17: disable fetching of premium content, also improved txt source parsing 2011/10/04: option to get hi-res photos for the articles -2011/09/21: fetching "column" section is made optional. +2011/09/21: fetching "column" section is made optional. 2011/09/18: parse "column" section stuff from source text file directly. 2011/09/07: disable "column" section as it is no longer offered free. 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source @@ -60,7 +60,6 @@ Change Log: 2010/10/31: skip repeated articles in section pages ''' -from calibre import (browser, iswindows, __appname__, force_unicode, preferred_encoding, as_unicode) from calibre.utils.date import now as nowf import os, datetime, re, mechanize from calibre.web.feeds.recipes import BasicNewsRecipe @@ -204,13 +203,13 @@ class MPRecipe(BasicNewsRecipe): return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8] else: return self.get_dtlocal().strftime("%Y-%m-%d") - + def get_fetchyear(self): if __Date__ <> '': return __Date__[0:4] else: - return self.get_dtlocal().strftime("%Y") - + return self.get_dtlocal().strftime("%Y") + def get_fetchmonth(self): if __Date__ <> '': return __Date__[4:6] @@ -268,7 +267,7 @@ class MPRecipe(BasicNewsRecipe): articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: articles = self.parse_section(url) @@ -305,7 +304,7 @@ class MPRecipe(BasicNewsRecipe): articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: # articles = self.parse_section(url) @@ -322,7 +321,7 @@ class MPRecipe(BasicNewsRecipe): articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + if __InclPremium__ == True: # parse column section articles directly from .txt files for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') @@ -330,7 +329,7 @@ class MPRecipe(BasicNewsRecipe): articles = self.parse_section2_txt(url, keystr) if articles: feeds.append((title, articles)) - + for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: articles = self.parse_section(url) @@ -410,7 +409,7 @@ class MPRecipe(BasicNewsRecipe): title = self.tag_to_string(i) url = 'http://life.mingpao.com/cfm/' + i.get('href', False) if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - try: + try: br.open_novisit(url) url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article current_articles.append({'title': title, 'url': url, 'description': ''}) @@ -437,7 +436,7 @@ class MPRecipe(BasicNewsRecipe): included_urls.append(url) current_articles.reverse() return current_articles - + # parse from www.mingpaovan.com def parse_section3(self, url, baseUrl): self.get_fetchdate() @@ -559,7 +558,7 @@ class MPRecipe(BasicNewsRecipe): photo = photo.replace('class="photo"', '') new_raw_html = new_raw_html + '

' + photo + '
' new_html = new_raw_html + '' - else: + else: # .txt based file splitter = re.compile(r'\n') # Match non-digits new_raw_html = 'Untitled
' @@ -622,23 +621,23 @@ class MPRecipe(BasicNewsRecipe): #raw_html = raw_html.replace(u'

\u3010', u'\u3010') if __HiResImg__ == True: # TODO: add a _ in front of an image url - if url.rfind('news.mingpao.com') > -1: + if url.rfind('news.mingpao.com') > -1: imglist = re.findall('src="?.*?jpg"', new_html) br = mechanize.Browser() br.set_handle_redirect(False) for img in imglist: gifimg = img.replace('jpg"', 'gif"') - try: + try: br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1]) new_html = new_html.replace(img, gifimg) - except: + except: # find the location of the first _ pos = img.find('_') if pos > -1: # if found, insert _ after the first _ newimg = img[0:pos] + '_' + img[pos:] new_html = new_html.replace(img, newimg) - else: + else: # if not found, insert _ after " new_html = new_html.replace(img[1:], '"_' + img[1:]) elif url.rfind('life.mingpao.com') > -1: @@ -675,7 +674,7 @@ class MPRecipe(BasicNewsRecipe): #print 'Use hi-res img', newimg new_html = new_html.replace(img, newimg) return new_html - + def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] @@ -684,7 +683,7 @@ class MPRecipe(BasicNewsRecipe): for item in soup.findAll(stype=True): del item['absmiddle'] return soup - + def populate_article_metadata(self, article, soup, first): # thumbnails shouldn't be available if using hi-res images if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'): @@ -699,7 +698,7 @@ class MPRecipe(BasicNewsRecipe): if not articlebodies: articlebodies = soup.findAll('div',attrs={'id':'newscontent01'}) if not articlebodies: - articlebodies = soup.findAll('div',attrs={'class':'content'}) + articlebodies = soup.findAll('div',attrs={'class':'content'}) if not articlebodies: articlebodies = soup.findAll('div', attrs={'id':'font'}) if articlebodies: @@ -721,12 +720,12 @@ class MPRecipe(BasicNewsRecipe): # display a simple text #article.summary = article.text_summary = u'\u66f4\u591a......' # display word counts - counts = 0 + counts = 0 articlebodies = soup.findAll('div',attrs={'id':'newscontent'}) if not articlebodies: articlebodies = soup.findAll('div',attrs={'id':'newscontent01'}) if not articlebodies: - articlebodies = soup.findAll('div',attrs={'class':'content'}) + articlebodies = soup.findAll('div',attrs={'class':'content'}) if not articlebodies: articlebodies = soup.findAll('div', attrs={'id':'font'}) if articlebodies: @@ -908,5 +907,5 @@ class MPRecipe(BasicNewsRecipe): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): opf.render(opf_file, ncx_file) - +