From 99892dc98fb6dac8f1f0fcdac64917162db3efee Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 25 Feb 2012 10:06:12 +0530 Subject: [PATCH] Update Mediapart and rue89 --- recipes/icons/mediapart.png | Bin 0 -> 382 bytes recipes/icons/rue89.png | Bin 0 -> 1261 bytes recipes/mediapart.recipe | 26 ++++++++------ recipes/rue89.recipe | 66 ++++++++++++++++++++---------------- 4 files changed, 53 insertions(+), 39 deletions(-) create mode 100644 recipes/icons/mediapart.png create mode 100644 recipes/icons/rue89.png diff --git a/recipes/icons/mediapart.png b/recipes/icons/mediapart.png new file mode 100644 index 0000000000000000000000000000000000000000..ab489d3db7f7ccef0d7ea8b9b06ebe8f782ce6be GIT binary patch literal 382 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!73?$#)eFPFP2=EDU{r~^nyLUhWNZkd}ckk{7 zGI#Ip0E_-Vb7s}<-Q^&;yJyavnR)ljs!Wq-=Ye_|lf2zsI6vN8`Uc41Ebxdd2GSm2 z>~=ES4#;Trba4#fxSpI4VpPU!7$%fZ5NMpgUc;{_@j^^u?B3eq>ndz#U%a}zdwc%e zBNHy~ljhLsGrY5!&2ZcP)C+fc*``aVIlJgchB>>G8D_XNzgX+bGg(n-8qZH}$C4}N zZ0h&l?k*3H4@{h4nSF8J^(gIhBZJCT~pbVa_elF{r5}E*np_Equ literal 0 HcmV?d00001 diff --git a/recipes/icons/rue89.png b/recipes/icons/rue89.png new file mode 100644 index 0000000000000000000000000000000000000000..55c52bc488abb05186f106e169556affcf67bf59 GIT binary patch literal 1261 zcmY+^dr*>D6aer~Gcdzf;$&ql+QTRHfr3b>L~3ekX!xMiTr?BQ2*V-ERqQco`4~$! z)<-Gjn9E{HTH9vnu9hjGm9>~8SgwlN3M?bKWx2Qh+uzJNXXehFnKNh3+#*`!ZcB@` z77zqkhK2-2gKG0NY!D#tP1)!S3WB+3ci_@DE8^$hf|nJ?Ll`GO4fz(qy+`~Z$jl=& za3?*ar!6KReBVLz7uC$^0SIeO-NQr*g`pIxgF_hgX#B-xEq0`#avMa1g9De=MX@X> zFgwd^t*CEGqgR%rmGNeM=c~0Um&FAR z_vX)CT@bY6RQF=rk%b1M5o<6QOg%<*&&N0qsw&Cb?SrDw%x=J6iOMzR35&1|F0#v;Lirf;Gnm%4qPf08VSt}J6~Cr)?0)`|%8&n)AB-I9l!A#?rXNdUFc>E80=m24ea`fjf8t{6n(my(f)Q83 z#ZyyLEXrlki=HyyV+^Ho$~&2%Ke;3C$9zK?3H2)D3gGA?4|Fac&FQzNvCGF^W&CC} zbT?B;Os>42uoWqs&~sXza-479yRppX+{nmCV(BI3=}!U=KIY`ZBrRicupwc#eM`4` z-rAhXX0x{?SB85^+gn>(&FEsVOhf0C+q&~gKmWWj#3<=8v*qc(xf@G&b#r>$(2w#D z-#2^rhw0GZ%T=hP4y)>GRwxt-x+`H-B0^pojYcB$tiQkCRQiluJCoO;9ylOj6?Kc_ z_)UlLS{3IlCldG)o?n2NR!L6*3s%!AcmjYTwv$LCC^aWz`cG^pEQ}A*kIPnXgG433 z9nOc7OV>6)50Hi2L`8OuMs_vbd+p|rAKN4nNr^^zuVL?Xzm3CcYco@mQmt0~Xnm3H zSE`dBtiI67I;ADP*!8-4czBpB(*iEcN(`AC{hC>>sW!Jv$;`~Ogd>b2FL3@FEKKC` zOF(=5-Om&983o_}9T~}AwmRdpb_Kk#_$GDBk1??8aH6XvrURYbwq zKu1q!XJ^|lEc!ZHT`*B@MO!SVpx?AaSI}J1rs#UHSiJHU{k>iuL5b0-9-ctjoU5s+ zS`)#em>7XZF`yY8eYrU( zszMw$<0Yx`lu0+~845W#IL<=x(vimQ*;7|#eul%k=7bMXWn6fG}sr8-kPcFBT@j|FM}H6wgilo_mO#l643a2oK}%SQwARd(&}V zWG@1lK=godWElRi3MT!pA(NfTO3V4$fWyKBES^Be5y@UYWSHOq<3R%+G3f#tpwOVm Kz$R+)ng0OQj8T68 literal 0 HcmV?d00001 diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe index a5bc4e96f9..0c9bbb4b01 100644 --- a/recipes/mediapart.recipe +++ b/recipes/mediapart.recipe @@ -1,11 +1,13 @@ __license__ = 'GPL v3' -__copyright__ = '2009, Mathieu Godlewski ; 2010, 2011, Louis Gesbert ' +__copyright__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert ' ''' Mediapart ''' +__author__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert ' + import re -from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.web.feeds.news import BasicNewsRecipe class Mediapart(BasicNewsRecipe): @@ -15,8 +17,9 @@ class Mediapart(BasicNewsRecipe): oldest_article = 7 language = 'fr' needs_subscription = True - max_articles_per_feed = 50 + + use_embedded_content = False no_stylesheets = True cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg' @@ -27,14 +30,9 @@ class Mediapart(BasicNewsRecipe): # -- print-version - preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in - [ - (r'', lambda match : '

'+match.group(1)+'

'), - (r'\'', lambda match: '’') - ] - ] + conversion_options = { 'smarten_punctuation' : True } - remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ] + remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ] def print_version(self, url): raw = self.browser.open(url).read() @@ -55,3 +53,11 @@ class Mediapart(BasicNewsRecipe): br['pass'] = self.password br.submit() return br + + def preprocess_html(self, soup): + for title in soup.findAll('p', {'class':'titre_page'}): + title.name = 'h3' + for legend in soup.findAll('span', {'class':'legend'}): + legend.insert(0, Tag(soup, 'br', [])) + legend.name = 'small' + return soup diff --git a/recipes/rue89.recipe b/recipes/rue89.recipe index 51cf8f6b98..c49712dc32 100644 --- a/recipes/rue89.recipe +++ b/recipes/rue89.recipe @@ -1,10 +1,10 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Louis Gesbert ' +__copyright__ = '2010-2012, Louis Gesbert ' ''' Rue89 ''' -__author__ = '2010, Louis Gesbert ' +__author__ = '2010-2012, Louis Gesbert ' import re from calibre.ebooks.BeautifulSoup import Tag @@ -17,37 +17,45 @@ class Rue89(BasicNewsRecipe): title = u'Rue89' language = 'fr' oldest_article = 7 - max_articles_per_feed = 50 + max_articles_per_feed = 12 - feeds = [(u'La Une', u'http://www.rue89.com/homepage/feed')] + use_embedded_content = False + + # From http://www.rue89.com/les-flux-rss-de-rue89 + feeds = [ + (u'La Une', u'http://www.rue89.com/feed'), + (u'Rue69', u'http://www.rue89.com/rue69/feed'), + (u'Eco', u'http://www.rue89.com/rue89-eco/feed'), + (u'Planète', u'http://www.rue89.com/rue89-planete/feed'), + (u'Sport', u'http://www.rue89.com/rue89-sport/feed'), + (u'Culture', u'http://www.rue89.com/culture/feed'), + (u'Hi-tech', u'http://www.rue89.com/hi-tech/feed'), + (u'Media', u'http://www.rue89.com/medias/feed'), + (u'Monde', u'http://www.rue89.com/monde/feed'), + (u'Politique', u'http://www.rue89.com/politique/feed'), + (u'Societe', u'http://www.rue89.com/societe/feed'), + ] + + # Follow redirection from feedsportal.com + def get_article_url(self,article): + return self.browser.open_novisit(article.link).geturl() + + def print_version(self, url): + return url + '?imprimer=1' no_stylesheets = True - preprocess_regexps = [ - (re.compile(r'<(/?)h2>', re.IGNORECASE|re.DOTALL), - lambda match : '<'+match.group(1)+'h3>'), - (re.compile(r'', re.IGNORECASE|re.DOTALL), - lambda match : '

'+match.group(1)+'

'), - (re.compile(r']+src="[^"]*/numeros/(\d+)[^0-9.">]*.gif"[^>]*/>', re.IGNORECASE|re.DOTALL), - lambda match : ''+match.group(1)+''), - (re.compile(r'\''), lambda match: '’'), - ] + conversion_options = { 'smarten_punctuation' : True } - def preprocess_html(self,soup): - body = Tag(soup, 'body') - title = soup.find('h1', {'class':'title'}) - content = soup.find('div', {'class':'content'}) - soup.body.replaceWith(body) - body.insert(0, title) - body.insert(1, content) - return soup + keep_only_tags = [ + dict(name='div', attrs={'id':'article'}), + ] - remove_tags = [ #dict(name='div', attrs={'class':'print-source_url'}), - #dict(name='div', attrs={'class':'print-links'}), - #dict(name='img', attrs={'class':'print-logo'}), - dict(name='div', attrs={'class':'content_top'}), - dict(name='div', attrs={'id':'sidebar-left'}), ] + remove_tags_after = [ + dict(name='div', attrs={'id':'plus_loin'}), + ] -# -- print-version has poor quality on this website, better do the conversion ourselves -# def print_version(self, url): -# return re.sub('^.*-([0-9]+)$', 'http://www.rue89.com/print/\\1',url) + remove_tags = [ + dict(name='div', attrs={'id':'article_tools'}), + dict(name='div', attrs={'id':'plus_loin'}), + ]