diff --git a/recipes/satmagazine.recipe b/recipes/satmagazine.recipe new file mode 100644 index 0000000000..3e4b1e1b19 --- /dev/null +++ b/recipes/satmagazine.recipe @@ -0,0 +1,155 @@ +#!/usr/bin/env python +## +## Title: SatMagazine +## +## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html +## +## Written: Feb 2012 +## Last Edited: Mar 2012 +## + +# Feb 2012: Initial release + +__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html' + +''' +satmagazine.com +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class SatMagazine(BasicNewsRecipe): + + title = u'SatMagazine' + description = u'North American Satellite Markets...' + publisher = 'Satnews Publishers' + publication_type = 'magazine' + INDEX = 'http://www.satmagazine.com/cgi-bin/display_edition.cgi' + __author__ = 'kiavash' + + language = 'en' + asciiize = True + timeout = 120 + simultaneous_downloads = 2 + + # Flattens all the tables to make it compatible with Nook + conversion_options = {'linearize_tables' : True} + + keep_only_tags = [dict(name='span', attrs={'class':'story'})] + + no_stylesheets = True + remove_javascript = True + + remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan', + 'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ] + + # Specify extra CSS - overrides ALL other CSS (IE. Added last). + extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \ + .introduction, .first { font-weight: bold; } \ + .cross-head { font-weight: bold; font-size: 125%; } \ + .cap, .caption { display: block; font-size: 80%; font-style: italic; } \ + .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \ + .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \ + .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \ + font-size: 80%; font-style: italic; margin: 1px auto; } \ + .story-date, .published { font-size: 80%; } \ + table { width: 100%; } \ + td img { display: block; margin: 5px auto; } \ + ul { padding-top: 10px; } \ + ol { padding-top: 10px; } \ + li { padding-top: 5px; padding-bottom: 5px; } \ + h1 { font-size: 175%; font-weight: bold; } \ + h2 { font-size: 150%; font-weight: bold; } \ + h3 { font-size: 125%; font-weight: bold; } \ + h4, h5, h6 { font-size: 100%; font-weight: bold; }' + + # Remove the line breaks, href links and float left/right and picture width/height. + preprocess_regexps = [(re.compile(r'', re.IGNORECASE), lambda m: ''), + (re.compile(r'', re.IGNORECASE), lambda m: ''), + (re.compile(r''), lambda h1: ''), + (re.compile(r''), lambda h2: ''), + (re.compile(r'float:.*?'), lambda h3: ''), + (re.compile(r'width:.*?px'), lambda h4: ''), + (re.compile(r'height:.*?px'), lambda h5: '') + ] + + def parse_index(self): + + article_info = [] + feeds = [] + + soup = self.index_to_soup(self.INDEX) + + # Find Cover image + cover = soup.find('img', src=True, alt='Cover Image') + if cover is not None: + self.cover_url = cover['src'] + self.log('Found Cover image:', self.cover_url) + + soup = soup.find('div', attrs={'id':'middlecontent'}) # main part of the site that has the articles + + #Find the Magazine date + ts = soup.find('span', attrs={'class':'master_heading'}) # contains the string with the date + ds = ' '.join(self.tag_to_string(ts).strip().split()[:2]) + self.log('Found Current Issue:', ds) + self.timefmt = ' [%s]'%ds + + #sections = soup.findAll('span', attrs={'class':'upper_heading'}) + + articles = soup.findAll('span', attrs={'class':'heading'}) + + descriptions = soup.findAll('span', attrs={'class':'story'}) + + title_number = 0 + + # Goes thru all the articles one by one and sort them out + for article in articles: + + title = self.tag_to_string(article) + url = article.find('a').get('href') + + self.log('\tFound article:', title, 'at', url) + desc = self.tag_to_string(descriptions[title_number]) + #self.log('\t\t', desc) + + article_info.append({'title':title, 'url':url, 'description':desc, + 'date':self.timefmt}) + + title_number = title_number + 1 + + if article_info: + feeds.append((self.title, article_info)) + + return feeds + + def preprocess_html(self, soup): + + # Finds all the images + for figure in soup.findAll('img', attrs = {'src' : True}): + + # if the image is an ad then remove it. + if (figure['alt'].find('_ad_') >=0) or (figure['alt'].find('_snipe_') >=0): + del figure['src'] + del figure['alt'] + del figure['border'] + del figure['hspace'] + del figure['vspace'] + del figure['align'] + del figure['size'] + figure.name = 'font' + continue + + figure['style'] = 'display:block' # adds /n before and after the image + + # Makes the title standing out + for title in soup.findAll('b'): + title.name = 'h3' + + # Removes all unrelated links + for link in soup.findAll('a', attrs = {'href': True}): + link.name = 'font' + del link['href'] + del link['target'] + + return soup