From 7535f5862712a54e4dd3ae54132f2a3060229306 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 18 Jan 2010 09:46:52 -0700
Subject: [PATCH] New recipe for The Yemen Times by kwetal

---
NOTE(review): rebuilt from a copy of this patch in which line breaks were
collapsed and markup inside string literals was stripped. The HTML literals
in parse_index(), preprocess_html() and getFreshSoup() are reconstructed from
how the code uses them - confirm against the upstream commit. The author
e-mail address is missing from the From: header, and the blob id on the
index line below still refers to the original 125-line file. The context
line spacing in the localization.py hunk could not be verified either.

 resources/recipes/yementimes.recipe | 166 ++++++++++++++++++++++++++++
 src/calibre/utils/localization.py   |   1 +
 2 files changed, 167 insertions(+)
 create mode 100644 resources/recipes/yementimes.recipe

diff --git a/resources/recipes/yementimes.recipe b/resources/recipes/yementimes.recipe
new file mode 100644
index 0000000000..426c9a748c
--- /dev/null
+++ b/resources/recipes/yementimes.recipe
@@ -0,0 +1,166 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+
+class YemenTimesRecipe(BasicNewsRecipe):
+    '''
+    Recipe for the Yemen Times (yementimes.com).
+
+    The section pages listed in ``feeds`` are scraped by parse_index() and
+    each downloaded article is rebuilt by preprocess_html() into a small
+    document holding only the headline, lead image, date, byline and story.
+    '''
+
+    __license__ = 'GPL v3'
+    __author__ = 'kwetal'
+    language = 'en_YE'
+    country = 'YE'
+    version = 1
+
+    title = u'Yemen Times'
+    publisher = u'yementimes.com'
+    category = u'News, Opinion, Yemen'
+    description = u'Award winning weekly from Yemen, promoting press freedom, professional journalism and the defense of human rights.'
+
+    oldest_article = 7
+    max_articles_per_feed = 100
+    use_embedded_content = False
+    encoding = 'utf-8'
+
+    remove_empty_feeds = True
+    no_stylesheets = True
+    remove_javascript = True
+
+    keep_only_tags = []
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'ctl00_ContentPlaceHolder1_MAINNEWS0_Panel1',
+                                                      'class': 'DMAIN2'}))
+    remove_attributes = ['style']
+
+    # Base URL; the relative links found on the section pages are appended to it.
+    INDEX = 'http://www.yementimes.com/'
+    # (section title, section page URL) pairs, scraped by parse_index().
+    feeds = []
+    feeds.append((u'Our Viewpoint', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=6&pnm=OUR%20VIEWPOINT'))
+    feeds.append((u'Local News', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=3&pnm=Local%20news'))
+    feeds.append((u'Their News', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=80&pnm=Their%20News'))
+    feeds.append((u'Report', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=8&pnm=report'))
+    feeds.append((u'Health', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=51&pnm=health'))
+    feeds.append((u'Interview', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=77&pnm=interview'))
+    feeds.append((u'Opinion', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=7&pnm=opinion'))
+    feeds.append((u'Business', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=5&pnm=business'))
+    feeds.append((u'Op-Ed', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=81&pnm=Op-Ed'))
+    feeds.append((u'Culture', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=75&pnm=Culture'))
+    feeds.append((u'Readers View', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=4&pnm=Readers%20View'))
+    feeds.append((u'Variety', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=9&pnm=Variety'))
+    feeds.append((u'Education', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=57&pnm=Education'))
+
+    extra_css = '''
+            body {font-family:verdana, arial, helvetica, geneva, sans-serif;}
+            div.yemen_byline {font-size: medium; font-weight: bold;}
+            div.yemen_date {font-size: small; color: #666666; margin-bottom: 0.6em;}
+            .yemen_caption {font-size: x-small; font-style: italic; color: #696969;}
+            '''
+
+    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
+                          'publisher': publisher, 'linearize_tables': True}
+
+    def get_browser(self):
+        '''Return the base class browser with gzip handling switched on.'''
+        # NOTE(review): called on the class without an instance; this only works
+        # if BasicNewsRecipe.get_browser is a classmethod - confirm.
+        br = BasicNewsRecipe.get_browser()
+        br.set_handle_gzip(True)
+
+        return br
+
+    def parse_index(self):
+        '''
+        Build the article index from the section pages in ``feeds``.
+
+        Returns a list of (section title, articles) tuples, where each article
+        is a dict with the keys title, date, url and description.
+        '''
+        answer = []
+        for feed_title, feed in self.feeds:
+            soup = self.index_to_soup(feed)
+
+            # The article links are the list items of the table after the newsbox div.
+            # A section page without that layout yields no articles instead of
+            # aborting the whole download.
+            newsbox = soup.find('div', 'newsbox')
+            main = None
+            if newsbox is not None:
+                main = newsbox.findNextSibling('table')
+
+            articles = []
+            if main is not None:
+                for li in main.findAll('li'):
+                    # Skip list items that do not link to an article.
+                    if li.a is None or not li.a.get('href'):
+                        continue
+                    title = self.tag_to_string(li.a)
+                    url = self.INDEX + li.a['href']
+                    articles.append({'title': title, 'date': None, 'url': url, 'description': '<br />&nbsp;'})
+
+            answer.append((feed_title, articles))
+
+        return answer
+
+    def preprocess_html(self, soup):
+        '''
+        Rebuild the article page as a minimal document.
+
+        The headline (DVMTIT), the first image of the DVTOP div after it, the
+        date and byline (DVTIT) and the story (DVDET) are moved from ``soup``
+        into a fresh soup, which is returned. Missing parts are skipped.
+        '''
+        freshSoup = self.getFreshSoup(soup)
+
+        headline = soup.find('div', attrs = {'id': 'DVMTIT'})
+        if headline:
+            div = headline.findNext('div', attrs = {'id': 'DVTOP'})
+            img = None
+            if div:
+                img = div.find('img')
+
+            headline.name = 'h1'
+            freshSoup.body.append(headline)
+            if img is not None:
+                freshSoup.body.append(img)
+
+        byline = soup.find('div', attrs = {'id': 'DVTIT'})
+        if byline:
+            # Bind date even when the byline carries no date span; otherwise the
+            # check further down raises a NameError.
+            date = None
+            date_el = byline.find('span')
+            if date_el:
+                pub_date = self.tag_to_string(date_el)
+                date = Tag(soup, 'div', attrs = [('class', 'yemen_date')])
+                date.append(pub_date)
+                # Take the date out so it is not repeated in the author text.
+                date_el.extract()
+
+            raw = '<br />'.join(['%s' % (part) for part in byline.findAll(text = True)])
+            author = BeautifulSoup('<div class="yemen_byline">' + raw + '</div>')
+
+            if date is not None:
+                freshSoup.body.append(date)
+            freshSoup.body.append(author)
+
+        story = soup.find('div', attrs = {'id': 'DVDET'})
+        if story:
+            for table in story.findAll('table'):
+                # Tables that contain an image get the caption style from extra_css.
+                if table.find('img'):
+                    table['class'] = 'yemen_caption'
+
+            freshSoup.body.append(story)
+
+        return freshSoup
+
+    def getFreshSoup(self, oldSoup):
+        '''Return an empty html document that carries over the title of oldSoup.'''
+        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
+        if oldSoup.head is not None and oldSoup.head.title:
+            freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title))
+        return freshSoup
diff --git a/src/calibre/utils/localization.py b/src/calibre/utils/localization.py
index 1ade012b1f..90f86a8368 100644
--- a/src/calibre/utils/localization.py
+++ b/src/calibre/utils/localization.py
@@ -104,6 +104,7 @@ _extra_lang_codes = {
         'en_CY' : _('English (Cyprus)'),
         'en_PK' : _('English (Pakistan)'),
         'en_SG' : _('English (Singapore)'),
+        'en_YE' : _('English (Yemen)'),
         'de_AT' : _('German (AT)'),
         'nl' : _('Dutch (NL)'),
         'nl_BE' : _('Dutch (BE)'),