From 7abf29c5ba73cdbcd06cf75579139d1db669aa72 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 13 Sep 2011 09:49:13 -0600
Subject: [PATCH] Fix #848900 (Updated recipe for The Japan Times)

---
 recipes/icons/japan_times.png |  Bin 0 -> 1264 bytes
 recipes/japan_times.recipe    |   67 ++++++++++++++++++++++++++--------
 2 files changed, 51 insertions(+), 16 deletions(-)
 create mode 100644 recipes/icons/japan_times.png

diff --git a/recipes/icons/japan_times.png b/recipes/icons/japan_times.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b2ac895725ec55d4328a964f6c3f70f4076f111
GIT binary patch
literal 1264
zcmeIw|5MTj7zgkNycVOlqwIE@xy97zo6-l+^v-oV^y39ra_UOg4}NKDUXxZ%@6)
zIOvRn?l_ReLr*;1j)yx5a5n+^6X4ISFhGa{sKPB83(i9|3b
f+Z2GN$@fWrWjyjzzhTI2)GdNFkyiSe%9(NvLM8QWj3s^0pEw*;vusAw1oo1cNmqQ
z!`doKrKi3c_!4~~IHWE`-N(?NutHITDSp6IBJ7a}(}=LqI&Az3Hrs&BHDJz`W?9=M
ztF+bF)jHqR>XJzx_y0aT(B)Bdc^@gg8l_LIR%^9dy7heEJ{UX0y%C&c2D5!RPI`^9}BDdflr$5pI{?<6iLjd_h0HxVRV!
g_eSNXlZ3Rgonf7f&z?~sBAAr!!LpcXex1TdU5vUP($E9?%H)Ybe
x!Rw{IkZfPGBDQhePfwp(-wP2mJ0RY#Vu{VBZKql{ZDrn8~aSvC6XtBK%U6ihmTwp
w13uod0psh+EiL#!CD|_Tu`a5Q#=(FsH39!!s((&e$mPF`!$Mh;@|JtYn120txscSyw3`B
y}hse$znyyQDN)Jw7uaa<-NW{-C!Y*9;a|!!R2`c1n)V5ymb!KAPKXE69%V3?kV;UX#7+JwZyw_#jEx
dA>HPoE~#3{XA8psnlZ@gEsJ>GKtL?E}J{^ZjeC4iR2b`)rQr#;$`vC+ZiR*{{h9F
BuS5U<

literal 0
HcmV?d00001

diff --git a/recipes/japan_times.recipe b/recipes/japan_times.recipe
index bb83b16f1e..229d5e4035 100644
--- a/recipes/japan_times.recipe
+++ b/recipes/japan_times.recipe
@@ -1,7 +1,5 @@
-#!/usr/bin/env python
-
 __license__   = 'GPL v3'
-__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 japantimes.co.jp
 '''
@@ -9,24 +7,61 @@ japantimes.co.jp
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class JapanTimes(BasicNewsRecipe):
-    title                 = u'The Japan Times'
+    title                 = 'The Japan Times'
     __author__            = 'Darko Miletic'
-    description           = 'News from Japan'
-    language              = 'en'
-
-    oldest_article        = 7
-    max_articles_per_feed = 100
+    description           = "Daily news and features on Japan from the most widely read English-language newspaper in Japan. Coverage includes national news, business news, sports news, commentary and features on living in Japan, entertainment, the arts, education and more."
+    language              = 'en_JP'
+    category              = 'news, politics, japan'
+    publisher             = 'The Japan Times'
+    oldest_article        = 5
+    max_articles_per_feed = 150
     no_stylesheets        = True
     use_embedded_content  = False
+    encoding              = 'utf8'
+    publication_type      = 'newspaper'
+    masthead_url          = 'http://search.japantimes.co.jp/images/header_title.gif'
+    extra_css             = 'body{font-family: Geneva,Arial,Helvetica,sans-serif}'
 
-    keep_only_tags    = [ dict(name='div', attrs={'id':'searchresult'}) ]
-    remove_tags_after = [ dict(name='div', attrs={'id':'mainbody' }) ]
+    conversion_options = {
+                           'comment'          : description
+                         , 'tags'             : category
+                         , 'publisher'        : publisher
+                         , 'language'         : language
+                         , 'linearize_tables' : True
+                         }
+
+
+    keep_only_tags = [dict(name='div', attrs={'id':'printresult'})]
     remove_tags = [
-                    dict(name='div' , attrs={'id':'ads' })
-                   ,dict(name='table', attrs={'width':470})
+                    dict(name=['iframe','meta','link','embed','object','base'])
+                   ,dict(attrs={'id':'searchfooter'})
                   ]
+    feeds = [(u'The Japan Times', u'http://feeds.feedburner.com/japantimes')]
+    remove_attributes = ['border']
 
+    def get_article_url(self, article):
+        rurl = BasicNewsRecipe.get_article_url(self, article)
+        return rurl.partition('?')[0]
 
-    feeds = [
-             (u'The Japan Times', u'http://feedproxy.google.com/japantimes')
-            ]
\ No newline at end of file
+    def print_version(self, url):
+        return url.replace('/cgi-bin/','/print/')
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll('img'):
+            if not item.has_key('alt'):
+                item['alt'] = 'image'
+        for item in soup.findAll('photo'):
+            item.name = 'div'
+        for item in soup.head.findAll('paragraph'):
+            item.extract()
+        for item in soup.findAll('wwfilename'):
+            item.extract()
+        for item in soup.findAll('jtcategory'):
+            item.extract()
+        for item in soup.findAll('nomooter'):
+            item.extract()
+        for item in soup.body.findAll('paragraph'):
+            item.name = 'p'
+        return soup
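
Note (illustration, not part of the patch): the two new URL hooks carry most of
this update. get_article_url() strips the feed-tracker query string from the
FeedBurner links, and print_version() rewrites the article path from /cgi-bin/
to /print/ so the recipe fetches the print-friendly page that the new
'printresult' keep_only_tags selector expects. A minimal standalone sketch of
that flow; the sample URL is hypothetical, modeled on japantimes.co.jp article
links of the time:

    # Standalone sketch of the URL handling in the updated recipe.
    # The sample URL below is hypothetical, for illustration only.

    def strip_query(url):
        # What get_article_url() does after the base class resolves the
        # feed item: keep everything before the first '?'.
        return url.partition('?')[0]

    def to_print_version(url):
        # What print_version() does: swap the CGI path for the print path.
        return url.replace('/cgi-bin/', '/print/')

    url = 'http://search.japantimes.co.jp/cgi-bin/nn20110913a1.html?utm_source=feedburner'
    clean = strip_query(url)
    print(clean)
    # http://search.japantimes.co.jp/cgi-bin/nn20110913a1.html
    print(to_print_version(clean))
    # http://search.japantimes.co.jp/print/nn20110913a1.html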