From c2e3683843d28014b8cc0a64ceff691806b7dd0c Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 2 Feb 2010 17:52:18 -0500 Subject: [PATCH 1/5] Fix bug #4777: Typo in regex for eReader PDB chapter index generation. --- src/calibre/ebooks/pdb/ereader/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index a6ee16db15..c13353745e 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -42,7 +42,7 @@ class Writer(FormatWriter): pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') text, text_sizes = self._text(pml) - chapter_index = self._index_item(r'(?s)\\C(?P[0-4)="(?P.+?)"', pml) + chapter_index = self._index_item(r'(?s)\\C(?P[0-4])="(?P.+?)"', pml) chapter_index += self.index_item(r'(?s)\\X(?P[0-4])(?P.+?)\\X[0-4]', pml) chapter_index += self.index_item(r'(?s)\\x(?P.+?)\\x', pml) link_index = self._index_item(r'(?s)\\Q="(?P.+?)"', pml) From 9ea276be209aee48f0927191d5bedf5378eb70af Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Feb 2010 10:22:00 -0700 Subject: [PATCH 2/5] Fix #4779 (Wall Street Journal (Free Content)) --- resources/recipes/the_gazette.recipe | 22 ----------------- resources/recipes/wsj_free.recipe | 2 +- src/calibre/ebooks/pdf/reflow.py | 35 ++++++++++++++++++++++------ 3 files changed, 29 insertions(+), 30 deletions(-) delete mode 100644 resources/recipes/the_gazette.recipe diff --git a/resources/recipes/the_gazette.recipe b/resources/recipes/the_gazette.recipe deleted file mode 100644 index 19afff986e..0000000000 --- a/resources/recipes/the_gazette.recipe +++ /dev/null @@ -1,22 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - -class The_Gazette(BasicNewsRecipe): - - cover_url = 'file:///D:/Documents/Pictures/Covers/The_Gazette.jpg' - title = u'The Gazette' - __author__ = 'Jerry Clapperton' - description = 'Montreal news in English' - language = 'en_CA' - - oldest_article = 7 - max_articles_per_feed = 20 - use_embedded_content = False - remove_javascript = True - no_stylesheets = True - encoding = 'utf-8' - - keep_only_tags = [dict(name='div', attrs={'id':['storyheader','page1']})] - - extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}' - - feeds = [(u'News', u'http://feeds.canada.com/canwest/F297'), (u'Opinion', u'http://feeds.canada.com/canwest/F7383'), (u'Arts', u'http://feeds.canada.com/canwest/F7366'), (u'Life', u'http://rss.canada.com/get/?F6934'), (u'Business', u'http://feeds.canada.com/canwest/F6939'), (u'Travel', u'http://rss.canada.com/get/?F6938'), (u'Health', u'http://feeds.canada.com/canwest/F7397'), (u'Technology', u'http://feeds.canada.com/canwest/F7411')] diff --git a/resources/recipes/wsj_free.recipe b/resources/recipes/wsj_free.recipe index b190f43849..e29bfe3dde 100644 --- a/resources/recipes/wsj_free.recipe +++ b/resources/recipes/wsj_free.recipe @@ -215,7 +215,7 @@ class WSJ(BasicNewsRecipe): # first, check if there is an h3 tag which provides a section name stag = divtag.find('h3') if stag: - if stag.parent['class'] == 'dynamic': + if stag.parent.get('class', '') == 'dynamic': # a carousel of articles is too complex to extract a section name # for each article, so we'll just call the section "Carousel" section_name = 'Carousel' diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 9f98147032..552af1590f 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -262,7 +262,6 @@ class Region(object): max_lines = max(max_lines, len(c)) return max_lines - @property def is_small(self): return self.line_count < 3 @@ -438,9 +437,8 @@ class Page(object): # absorb into a neighboring region (prefer the one with number of cols # closer to the avg number of cols in the set, if equal use larger # region) - # merge contiguous regions that can contain each other - '''absorbed = set([]) found = True + absorbed = set([]) while found: found = False for i, region in enumerate(self.regions): @@ -452,10 +450,33 @@ class Page(object): regions.append(self.regions[j]) else: break - prev = None if i == 0 else i-1 - next = j if self.regions[j] not in regions else None - ''' - pass + prev_region = None if i == 0 else i-1 + next_region = j if self.regions[j] not in regions else None + if prev_region is None and next_region is not None: + absorb_into = next_region + elif next_region is None and prev_region is not None: + absorb_into = prev_region + elif prev_region is None and next_region is None: + if len(regions) > 1: + absorb_into = regions[0] + regions = regions[1:] + else: + absorb_into = None + else: + absorb_into = prev_region + if next_region.line_count >= prev_region.line_count: + avg_column_count = sum([len(r.columns) for r in + regions])/float(len(regions)) + if next_region.line_count > prev_region.line_count \ + or abs(avg_column_count - len(prev_region.columns)) \ + > abs(avg_column_count - len(next_region.columns)): + absorb_into = next_region + if absorb_into is not None: + absorb_into.absorb_region(regions) + absorbed.update(regions) + i = j + for region in absorbed: + self.regions.remove(region) From 4ecab6bc9ee483ddeddb77b8681635b5ab9918e6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Feb 2010 10:54:23 -0700 Subject: [PATCH 3/5] New recipes for Gizmodo, News Straits Times, Read It Later, TidBits by Darko Miletic --- resources/images/news/gizmodo.png | Bin 0 -> 640 bytes resources/images/news/newsstraitstimes.png | Bin 0 -> 816 bytes resources/images/news/readitlater.png | Bin 0 -> 810 bytes resources/images/news/tidbits.png | Bin 0 -> 783 bytes resources/recipes/gizmodo.recipe | 40 +++++++++++++ resources/recipes/newsstraitstimes.recipe | 35 +++++++++++ resources/recipes/readitlater.recipe | 64 +++++++++++++++++++++ resources/recipes/tidbits.recipe | 53 +++++++++++++++++ 8 files changed, 192 insertions(+) create mode 100644 resources/images/news/gizmodo.png create mode 100644 resources/images/news/newsstraitstimes.png create mode 100644 resources/images/news/readitlater.png create mode 100644 resources/images/news/tidbits.png create mode 100644 resources/recipes/gizmodo.recipe create mode 100644 resources/recipes/newsstraitstimes.recipe create mode 100644 resources/recipes/readitlater.recipe create mode 100644 resources/recipes/tidbits.recipe diff --git a/resources/images/news/gizmodo.png b/resources/images/news/gizmodo.png new file mode 100644 index 0000000000000000000000000000000000000000..8f2e6f002b7719ac70fb67d31b6f5b6785d2c140 GIT binary patch literal 640 zcmeAS@N?(olHy`uVBq!ia0vp^3LwnE1|*BCs=fdzwj^(N7l!{JxM1({$v_d#0*}aI zAngIhZYQ(tK!Rljj_E*J0gT&!&6&%? z2M}o7@qdy2X8D_R5z#R2RZUt7s}^Y~Bsd&sU<4{-Wb?n(G_z;R?cC6<##}SKbt1p* zK7Re$=ex_--=EK4dRTXhp6#mubDNqS`#BFhKCXVxH&fw&DtF|Q?b%mEj=h{@c_sbU zj1!iNryP=Ue#z~%zar7f%0XJ#f48z``j#MxwI`HJ6fd=OT$GC6k;$-$BmUdmo%%Aj zLpzPzbY>>sZ3xmo-$UcZ*N+IWCGg^MliL*^1$8a`S$b5mw)y>v;FF%`8m3a z93>`lTL5EEwZt`|BqgyV)hf9t6-Y4{85kPq8W`ys7>5`dSeckv8CdEXm|Gbb+`K)J p14Tn_eoAIqC9(zs3oBDYD-#QdhPhGT^vI!P9L3o)?1`d;<)|(n~`^Keq0(` z>V8*1$-^-GpgGqej{dFH7e4WGEKzzD(C#Pe@>0&FV~Hb&tA6_;g&WS-GR=2)EKyB$ znw2+8@6oQ!n~gueHgNlH^JUKb=cm8F6%6t^Y3FwH{6~An8H>N)*jd4vDasfyanhf+ zyE$w-E;NhW-{YK{uWV2uS62ot0A#w+A>W7MZsT*vT(i!}Io7DgOZ(>Ep3c zPH!z6cTYRG=Rd=rx;585*F4wPw7uSb!DLCM5|h-A+8x=F8+81eG_DKo^85Ki<+UwCW|YTmHqC_*E=f&q7i`P>(lsxCG1CfP_X2J01Ml5K!WSR23Q)Q#B4>BZufD;7 z;q9SRFScKdC;EGn`+Nf?S@cc_^76$lhazt3;%$bt2X3wtt_C#~;-Pj+^Qr+?UF5S4&ktm@u+0)9(Ix{1~LBuDx zkEj2_)w`}%Q`Pp*ejJjo|1v}>G*nZ>C#-(WrSfg_fA$wYIcu{qqUYuJ%RIe2i&Kp3 z4SAX?lp3c@3a~gcvqI+Ot|vv-BD&qbGNu4UwPVHX^aJE=Ph6ZPHetaPm#m1$$Y*b_ zh?H&gP)VvV%9Xxdwjo#gdMV?I6)TRcFr9GeXrYYA{6{lqoK3qKaeu+Kbz+JvyVgy< zd+XNcFr}6U4jM<^Dcqi;*ubp=O`b1!gQ!R0gC`m~y zNwrEYN(E93Mh1okx&}tN2F4+V2396!R;Gr!2If`<28*s-%V9OXW$Ob!!_F3|gj4+SMZPM}KYU<8@OWZHPH!5ous3_T=GT z?-^+v0SzgdpC=zb|7lJB^$Si5w&pRJG6Z?$@wvW`t837&5i#e<&)v>b zE^#OZc$wa*T6FP*GQ<4y$Kx)hsqA!IbKXHJd%<)=B~GVnQlIU=X-?~ZoVm78=C{Oy z@4q!8g(k}eoh&=kC)X0~ux%#GU&cjSP3?3nuH4ODTbMK3?RtxVg=xjto2i<%&6~G# zA7T&dD|n=I-A%vX>7GV`MTN7JSeO`>Ou5l_K)-9rFW*HU<65+oOGNVxJkLa`%w3rj z-Y;`wTB*??K?jeMRRcIV8Tdm)> z16D6uZ5G0BV9n-vWOjCMSVKibq2A3_$+s=DSl2P#b*oFAr7yXC<+m-&YbE`sO6;_A z(-3kDKgIEwCwhPQjGtT16ivI<@T;xVExU(T{lyDe2aDevYXe&%|NfdcU0yXT)I2v? zSm8nFot-g>FOFF}yQz}FIFWJFOq0vS(K%v$@>!f7+oFB$$f;hNt7L5I`SC(?s^pUN zv`XXr2fKcWELhnY&c3(8Rxdx^Cbi7&_}<-mj*%&^sypX4o5d`Zvj2ERWJ=JUdCfj$ zjd#|y&kdUK_LKQ1*MsZ#h+hwVA-s2$^FPK@T3QA+^$|_Lw5D3(8c~vxSdwa$T$Bo= z7>o=I4Rj5RbPbF{3=OPI%&m;fbq&m|3=CA5+r&^b||EMpFO) literal 0 HcmV?d00001 diff --git a/resources/images/news/tidbits.png b/resources/images/news/tidbits.png new file mode 100644 index 0000000000000000000000000000000000000000..e64d71ec6887f57c6f66464bb90208ffe33978f9 GIT binary patch literal 783 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*U87?>OZ&!)pF42;O_6J(C3ljy)|1v9;Bzn6ZbT}F{HNh$Q zc+J$bSt)AQU2|V;y0LjrL7d+C-!coQ1j!^kah%5ZWo^`v=F_KnWG-qxab{@qpA}M9 zw&U-W^pB;ztvef5MrUr?cl6sbC5(8dK~VocSdbuT6_9tZ5GFdGcvvf6^D)(@GO~9vb(En_t_UIljh7% z5x*F=*;D4YMW6Gv?LYS2Sr(zQ=(*b@i<>-!GRYFWi!=^x%XRaAS`a(^&uuM%1+S&1 zr9P~bTNcT@Aa4EQ>{)G(Ei{%21bulR(bo8SSHv~G>G$ueWOpkbTyDgzUtjpE&4r^; zuf^edDZ>%_lBH8#?CMi1@4BU<&Gl1lWk~Cr87gnL7sQI!?l0w?|JJ}P)xc!RlMB{I zSO3<{iGF*Z+4u4Xn|>d)!(X-vGT5DK|NXyl0@LYDK)1LkF*?W`|DXQo(jPX4vSr;j z_O|Y9kFWf^$VH}Ha=(c2=Zrbm+Kzv&UM=#;_flQY?fit9Vn?Uu?va_ZJNPE|k3AwE z9(#9wEt0$(IA`aJ$z^Pjwed?sw1U=~Z_wdBzRggk)mUVbhsrmmsI_8Ov!=Q&W^~Y+ zx@i6Se!u0;OM?`7oR>^Fv43mUthU#?su{gB7ilcI{q~#91ry$-B84*F?=iLr*!urt zXm`DxsP)vV9GIX~OI#yLQW8s2t&)pUffR$0fuVt}fsw9(afqRTm5Hg9k)f`Exs`!| tvs$tsiiX_$l+3hBWDN!uRz}8Fra=854SVte<$)R)JYD@<);T3K0RR{HNE-kE literal 0 HcmV?d00001 diff --git a/resources/recipes/gizmodo.recipe b/resources/recipes/gizmodo.recipe new file mode 100644 index 0000000000..6f6e6ae0cf --- /dev/null +++ b/resources/recipes/gizmodo.recipe @@ -0,0 +1,40 @@ + +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +gizmodo.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Gizmodo(BasicNewsRecipe): + title = 'Gizmodo' + __author__ = 'Darko Miletic' + description = "Gizmodo, the gadget guide. So much in love with shiny new toys, it's unnatural." + publisher = 'gizmodo.com' + category = 'news, IT, Internet, gadgets' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = True + language = 'en' + masthead_url = 'http://cache.gawkerassets.com/assets/gizmodo.com/img/logo.png' + extra_css = ' body{font-family: "Lucida Grande",Helvetica,Arial,sans-serif} img{margin-bottom: 1em} ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_attributes = ['width','height'] + remove_tags = [dict(name='div',attrs={'class':'feedflare'})] + remove_tags_after = dict(name='div',attrs={'class':'feedflare'}) + + feeds = [(u'Articles', u'http://feeds.gawker.com/gizmodo/full')] + + def preprocess_html(self, soup): + return self.adeify_images(soup) + diff --git a/resources/recipes/newsstraitstimes.recipe b/resources/recipes/newsstraitstimes.recipe new file mode 100644 index 0000000000..ebbaca1a0e --- /dev/null +++ b/resources/recipes/newsstraitstimes.recipe @@ -0,0 +1,35 @@ + +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +www.nst.com.my +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Newstraitstimes(BasicNewsRecipe): + title = 'New Straits Times from Malaysia' + __author__ = 'Darko Miletic' + description = 'Learning Curve, Sunday People, New Straits Times from Malaysia' + publisher = 'nst.com.my' + category = 'news, politics, Malaysia' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'cp1252' + use_embedded_content = False + language = 'en' + masthead_url = 'http://www.nst.com.my/Current_News/NST/Images/new-nstonline.jpg' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_tags = [dict(name=['link','table'])] + keep_only_tags = dict(name='div',attrs={'id':'haidah'}) + + feeds = [(u'Articles', u'http://www.nst.com.my/rss/allSec')] + diff --git a/resources/recipes/readitlater.recipe b/resources/recipes/readitlater.recipe new file mode 100644 index 0000000000..4bd8fc2bd6 --- /dev/null +++ b/resources/recipes/readitlater.recipe @@ -0,0 +1,64 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +readitlaterlist.com +''' + +from calibre import strftime +from calibre.web.feeds.news import BasicNewsRecipe + +class Readitlater(BasicNewsRecipe): + title = 'Read It Later' + __author__ = 'Darko Miletic' + description = '''Personalized news feeds. Go to readitlaterlist.com to + setup up your news. Fill in your account + username, and optionally you can add password.''' + publisher = 'readitlater.com' + category = 'news, custom' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + needs_subscription = True + INDEX = u'http://readitlaterlist.com' + LOGIN = INDEX + u'/l' + + + feeds = [(u'Unread articles' , INDEX + u'/unread')] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None: + br.open(self.LOGIN) + br.select_form(nr=0) + br['feed_id'] = self.username + if self.password is not None: + br['password'] = self.password + br.submit() + return br + + def parse_index(self): + totalfeeds = [] + lfeeds = self.get_feeds() + for feedobj in lfeeds: + feedtitle, feedurl = feedobj + self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) + articles = [] + soup = self.index_to_soup(feedurl) + ritem = soup.find('ul',attrs={'id':'list'}) + for item in ritem.findAll('li'): + description = '' + atag = item.find('a',attrs={'class':'text'}) + if atag and atag.has_key('href'): + url = self.INDEX + atag['href'] + title = self.tag_to_string(item.div) + date = strftime(self.timefmt) + articles.append({ + 'title' :title + ,'date' :date + ,'url' :url + ,'description':description + }) + totalfeeds.append((feedtitle, articles)) + return totalfeeds + diff --git a/resources/recipes/tidbits.recipe b/resources/recipes/tidbits.recipe new file mode 100644 index 0000000000..702c65e9e4 --- /dev/null +++ b/resources/recipes/tidbits.recipe @@ -0,0 +1,53 @@ + +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +db.tidbits.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class TidBITS(BasicNewsRecipe): + title = 'TidBITS: Mac News for the Rest of Us' + __author__ = 'Darko Miletic' + description = 'Insightful news, reviews, and analysis of the Macintosh and Internet worlds' + publisher = 'TidBITS Publishing Inc.' + category = 'news, Apple, Macintosh, IT, Internet' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = True + language = 'en' + remove_empty_feeds = True + masthead_url = 'http://db.tidbits.com/images/tblogo9.gif' + extra_css = ' body{font-family: Georgia,"Times New Roman",Times,serif} ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_attributes = ['width','height'] + remove_tags = [dict(name='small')] + remove_tags_after = dict(name='small') + + feeds = [ + (u'Business Apps' , u'http://db.tidbits.com/feeds/business.rss' ) + ,(u'Entertainment' , u'http://db.tidbits.com/feeds/entertainment.rss') + ,(u'External Links' , u'http://db.tidbits.com/feeds/links.rss' ) + ,(u'Home Mac' , u'http://db.tidbits.com/feeds/home.rss' ) + ,(u'Inside TidBITS' , u'http://db.tidbits.com/feeds/inside.rss' ) + ,(u'iPod & iPhone' , u'http://db.tidbits.com/feeds/ipod-iphone.rss' ) + ,(u'Just for Fun' , u'http://db.tidbits.com/feeds/fun.rss' ) + ,(u'Macs & Mac OS X' , u'http://db.tidbits.com/feeds/macs.rss' ) + ,(u'Media Creation' , u'http://db.tidbits.com/feeds/creative.rss' ) + ,(u'Networking & Communications', u'http://db.tidbits.com/feeds/net.rss' ) + ,(u'Opinion & Editorial' , u'http://db.tidbits.com/feeds/opinion.rss' ) + ,(u'Support & Problem Solving' , u'http://db.tidbits.com/feeds/support.rss' ) + ,(u'Safe Computing' , u'http://db.tidbits.com/feeds/security.rss' ) + ,(u'Tech News' , u'http://db.tidbits.com/feeds/tech.rss' ) + ,(u'Software Watchlist' , u'http://db.tidbits.com/feeds/watchlist.rss' ) + ] From 3d1ef6e56499bd24cbca3fb263dfa9580d5b3f9a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Feb 2010 16:14:25 -0700 Subject: [PATCH 4/5] Fix #4786 (Updated recipe for Pagina 12) --- resources/recipes/pagina12.recipe | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/resources/recipes/pagina12.recipe b/resources/recipes/pagina12.recipe index 2fb433dc82..c9801cb359 100644 --- a/resources/recipes/pagina12.recipe +++ b/resources/recipes/pagina12.recipe @@ -5,9 +5,10 @@ __copyright__ = '2008-2010, Darko Miletic ' pagina12.com.ar ''' -import time +import re, time from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag class Pagina12(BasicNewsRecipe): title = 'Pagina - 12' @@ -22,7 +23,8 @@ class Pagina12(BasicNewsRecipe): use_embedded_content = False language = 'es' remove_empty_feeds = True - extra_css = ' body{font-family: sans-serif} ' + masthead_url = 'http://www.pagina12.com.ar/commons/imgs/logo-home.gif' + extra_css = ' body{font-family: Arial,Helvetica,sans-serif } h2{color: #028CCD} img{margin-bottom: 0.4em} .epigrafe{font-size: x-small; background-color: #EBEAE5; color: #565144 } .intro{font-size: 1.1em} ' conversion_options = { 'comment' : description @@ -32,7 +34,7 @@ class Pagina12(BasicNewsRecipe): } remove_tags = [dict(name='div', attrs={'id':['volver','logo','logo_suple','fin','permalink']})] - + feeds = [ (u'Edicion impresa', u'http://www.pagina12.com.ar/diario/rss/principal.xml' ) @@ -52,7 +54,11 @@ class Pagina12(BasicNewsRecipe): return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/') def get_cover_url(self): - imgnames = ['tapan.jpg','tapagn.jpg','tapan_gr.jpg','tapagn.jpg','tapagn.jpg','tapan.jpg','tapagn.jpg'] - weekday = time.localtime().tm_wday - return strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/') + imgnames[weekday] - + rawc = self.index_to_soup('http://www.pagina12.com.ar/diario/principal/diario/index.html',True) + rawc2 = re.sub(r'PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN','PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"',rawc) + soup = BeautifulSoup(rawc2,fromEncoding=self.encoding,smartQuotesTo=None) + for image in soup.findAll('img',alt=True): + if image['alt'].startswith('Tapa de la fecha'): + return image['src'] + return None + \ No newline at end of file From 008fab308d68003341cdd3152b50629115afc15e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Feb 2010 17:03:04 -0700 Subject: [PATCH 5/5] Updated recipe for The New Republic --- resources/recipes/the_new_republic.recipe | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/resources/recipes/the_new_republic.recipe b/resources/recipes/the_new_republic.recipe index 482dba1af0..59ccef3607 100644 --- a/resources/recipes/the_new_republic.recipe +++ b/resources/recipes/the_new_republic.recipe @@ -9,6 +9,7 @@ class The_New_Republic(BasicNewsRecipe): oldest_article = 7 max_articles_per_feed = 100 + no_stylesheets = True remove_tags = [ dict(name='div', attrs={'class':['print-logo', 'print-site_name', 'img-left', 'print-source_url']}), @@ -21,14 +22,15 @@ class The_New_Republic(BasicNewsRecipe): ('Economy', 'http://www.tnr.com/rss/articles/Economy'), ('Environment and Energy', 'http://www.tnr.com/rss/articles/Environment-%2526-Energy'), ('Health Care', 'http://www.tnr.com/rss/articles/Health-Care'), - ('Urban Policy', 'http://www.tnr.com/rss/articles/Urban-Policy'), + ('Metro Policy', 'http://www.tnr.com/rss/articles/Metro-Policy'), ('World', 'http://www.tnr.com/rss/articles/World'), ('Film', 'http://www.tnr.com/rss/articles/Film'), ('Books', 'http://www.tnr.com/rss/articles/books'), + ('The Book', 'http://www.tnr.com/rss/book'), + ('Jonathan Chait', 'http://www.tnr.com/rss/blogs/Jonathan-Chait'), ('The Plank', 'http://www.tnr.com/rss/blogs/The-Plank'), ('The Treatment', 'http://www.tnr.com/rss/blogs/The-Treatment'), ('The Spine', 'http://www.tnr.com/rss/blogs/The-Spine'), - ('The Stash', 'http://www.tnr.com/rss/blogs/The-Stash'), ('The Vine', 'http://www.tnr.com/rss/blogs/The-Vine'), ('The Avenue', 'http://www.tnr.com/rss/blogs/The-Avenue'), ('William Galston', 'http://www.tnr.com/rss/blogs/William-Galston'), @@ -40,3 +42,4 @@ class The_New_Republic(BasicNewsRecipe): def print_version(self, url): return url.replace('http://www.tnr.com/', 'http://www.tnr.com/print/') +