From 8f829e6de6eae65f29128a9a17ad9eb4236a7eb3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 24 Apr 2019 11:21:47 +0530
Subject: [PATCH] Update Global Times

Fixes #1825937 [can't fetch news of globaltimes](https://bugs.launchpad.net/calibre/+bug/1825937)
---
 recipes/globaltimes.recipe | 71 ++++++++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 25 deletions(-)

diff --git a/recipes/globaltimes.recipe b/recipes/globaltimes.recipe
index f5e8d3db45..2c1025267d 100644
--- a/recipes/globaltimes.recipe
+++ b/recipes/globaltimes.recipe
@@ -2,46 +2,67 @@ from calibre.web.feeds.news import BasicNewsRecipe
 import re
 
 
-class globaltimes(BasicNewsRecipe):
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(
+        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
+    )
+
+
+class GlobalTimes(BasicNewsRecipe):
     title = u'Global Times'
-    __author__ = 'malfi'
-    language = 'zh'
+    __author__ = 'Jose Ortiz'  # lui1 at mobileread.com
+    language = 'en_CN'
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
-    cover_url = 'http://enhimg2.huanqiu.com/images/logo.png'
-    language = 'en'
-    keep_only_tags = []
-    keep_only_tags.append(dict(name='div', attrs={'id': 'content'}))
-    remove_tags = []
-    remove_tags.append(dict(name='div', attrs={'class': 'location'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'contentpage'}))
-    remove_tags.append(dict(name='li', attrs={'id': 'pl'}))
+    keep_only_tags = [classes('article-title article-source row-content')]
     extra_css = '''
-    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
-    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
+        .article-title {
+            font-family:Arial,Helvetica,sans-serif;
+            font-weight:bold;font-size:large;
+        }
+
+        .article-source, .row-content {
+            font-family:Arial,Helvetica,sans-serif;
+            font-size:small;
+        }
     '''
 
     def parse_index(self):
         catnames = {}
-        catnames["http://china.globaltimes.cn/chinanews/"] = "China Politics"
-        catnames["http://china.globaltimes.cn/diplomacy/"] = "China Diplomacy"
-        catnames["http://military.globaltimes.cn/china/"] = "China Military"
-        catnames["http://business.globaltimes.cn/china-economy/"] = "China Economy"
-        catnames["http://world.globaltimes.cn/asia-pacific/"] = "Asia Pacific"
+        catnames["http://www.globaltimes.cn/china/politics/"] = "China Politics"
+        catnames["http://www.globaltimes.cn/china/diplomacy/"] = "China Diplomacy"
+        catnames["http://www.globaltimes.cn/china/military/"] = "China Military"
+        catnames["http://www.globaltimes.cn/business/economy/"] = "China Economy"
+        catnames["http://www.globaltimes.cn/world/asia-pacific/"] = "Asia Pacific"
         feeds = []
         for cat in catnames.keys():
            articles = []
             soup = self.index_to_soup(cat)
-            for a in soup.findAll('a', attrs={'href': re.compile(cat + "201[0-9]-[0-1][0-9]/[0-9][0-9][0-9][0-9][0-9][0-9].html")}):
-                url = a['href'].strip()
-                myarticle = ({'title': self.tag_to_string(
-                    a), 'url': url, 'description': '', 'date': ''})
-                self.log("found %s" % url)
+            for a in soup.findAll(
+                'a',
+                attrs={
+                    'href':
+                    re.compile(
+                        r'https?://www.globaltimes.cn/content/[0-9]{4,10}[.]shtml'
+                    )
+                }
+            ):
+                url = a['href'].strip(
+                )  # Typical url http://www.globaltimes.cn/content/5555555.shtml
+                title = self.tag_to_string(a).strip()
+                if not title:
+                    continue
+                myarticle = ({
+                    'title': title,
+                    'url': url,
+                    'description': '',
+                    'date': ''
+                })
+                self.log("found '%s'" % title)
                 articles.append(myarticle)
                 self.log("Adding URL %s\n" % url)
             if articles: