diff --git a/recipes/microwave_and_rf.recipe b/recipes/microwave_and_rf.recipe new file mode 100644 index 0000000000..e3eee9dab1 --- /dev/null +++ b/recipes/microwave_and_rf.recipe @@ -0,0 +1,217 @@ +#!/usr/bin/env python +## +## Title: Microwave and RF +## +## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html + +# Feb 2012: Initial release + +__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html' +''' +mwrf.com +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.utils.magick import Image + +class Microwave_and_RF(BasicNewsRecipe): + + Convert_Grayscale = False # Convert images to gray scale or not + + # Add sections that want to be excluded from the magazine + exclude_sections = [] + + # Add sections that want to be included from the magazine + include_sections = [] + + title = u'Microwave and RF' + __author__ = 'kiavash' + description = u'Microwave and RF Montly Magazine' + publisher = 'Penton Media, Inc.' + publication_type = 'magazine' + site = 'http://mwrf.com' + + language = 'en' + asciiize = True + timeout = 120 + simultaneous_downloads = 1 # very peaky site! + + # Main article is inside this tag + keep_only_tags = [dict(name='table', attrs={'id':'prtContent'})] + + no_stylesheets = True + remove_javascript = True + + # Flattens all the tables to make it compatible with Nook + conversion_options = {'linearize_tables' : True} + + remove_tags = [ + dict(name='span', attrs={'class':'body12'}), + ] + + remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan', + 'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ] + + # Specify extra CSS - overrides ALL other CSS (IE. Added last). + extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \ + .introduction, .first { font-weight: bold; } \ + .cross-head { font-weight: bold; font-size: 125%; } \ + .cap, .caption { display: block; font-size: 80%; font-style: italic; } \ + .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \ + .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \ + .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \ + font-size: 80%; font-style: italic; margin: 1px auto; } \ + .story-date, .published { font-size: 80%; } \ + table { width: 100%; } \ + td img { display: block; margin: 5px auto; } \ + ul { padding-top: 10px; } \ + ol { padding-top: 10px; } \ + li { padding-top: 5px; padding-bottom: 5px; } \ + h1 { font-size: 175%; font-weight: bold; } \ + h2 { font-size: 150%; font-weight: bold; } \ + h3 { font-size: 125%; font-weight: bold; } \ + h4, h5, h6 { font-size: 100%; font-weight: bold; }' + + # Remove the line breaks and float left/right and picture width/height. + preprocess_regexps = [(re.compile(r'', re.IGNORECASE), lambda m: ''), + (re.compile(r'', re.IGNORECASE), lambda m: ''), + (re.compile(r'float:.*?'), lambda m: ''), + (re.compile(r'width:.*?px'), lambda m: ''), + (re.compile(r'height:.*?px'), lambda m: '') + ] + + + def print_version(self, url): + url = re.sub(r'.html', '', url) + url = re.sub('/ArticleID/.*?/', '/Print.cfm?ArticleID=', url) + return url + + # Need to change the user agent to avoid potential download errors + def get_browser(self, *args, **kwargs): + from calibre import browser + kwargs['user_agent'] = 'Mozilla/5.0 (Windows NT 5.1; rv:10.0) Gecko/20100101 Firefox/10.0' + return browser(*args, **kwargs) + + + def parse_index(self): + + # Fetches the main page of Microwave and RF + soup = self.index_to_soup(self.site) + + # Searches the site for Issue ID link then returns the href address + # pointing to the latest issue + latest_issue = soup.find('a', attrs={'href':lambda x: x and 'IssueID' in x}).get('href') + + # Fetches the index page for of the latest issue + soup = self.index_to_soup(latest_issue) + + # Finds the main section of the page containing cover, issue date and + # TOC + ts = soup.find('div', attrs={'id':'columnContainer'}) + + # Finds the issue date + ds = ' '.join(self.tag_to_string(ts.find('span', attrs={'class':'CurrentIssueSectionHead'})).strip().split()[-2:]).capitalize() + self.log('Found Current Issue:', ds) + self.timefmt = ' [%s]'%ds + + # Finds the cover image + cover = ts.find('img', src = lambda x: x and 'Cover' in x) + if cover is not None: + self.cover_url = self.site + cover['src'] + self.log('Found Cover image:', self.cover_url) + + feeds = [] + article_info = [] + + # Finds all the articles (tiles and links) + articles = ts.findAll('a', attrs={'class':'commonArticleTitle'}) + + # Finds all the descriptions + descriptions = ts.findAll('span', attrs={'class':'commonCopy'}) + + # Find all the sections + sections = ts.findAll('span', attrs={'class':'kicker'}) + + title_number = 0 + + # Goes thru all the articles one by one and sort them out + for section in sections: + title_number = title_number + 1 + + # Removes the unwanted sections + if self.tag_to_string(section) in self.exclude_sections: + continue + + # Only includes the wanted sections + if self.include_sections: + if self.tag_to_string(section) not in self.include_sections: + continue + + + title = self.tag_to_string(articles[title_number]) + url = articles[title_number].get('href') + if url.startswith('/'): + url = self.site + url + + self.log('\tFound article:', title, 'at', url) + desc = self.tag_to_string(descriptions[title_number]) + self.log('\t\t', desc) + + article_info.append({'title':title, 'url':url, 'description':desc, + 'date':self.timefmt}) + + if article_info: + feeds.append((self.title, article_info)) + + #self.log(feeds) + return feeds + + def postprocess_html(self, soup, first): + if self.Convert_Grayscale: + #process all the images + for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): + iurl = tag['src'] + img = Image() + img.open(iurl) + if img < 0: + raise RuntimeError('Out of memory') + img.type = "GrayscaleType" + img.save(iurl) + return soup + + def preprocess_html(self, soup): + + # Includes all the figures inside the final ebook + # Finds all the jpg links + for figure in soup.findAll('a', attrs = {'href' : lambda x: x and 'jpg' in x}): + + # makes sure that the link points to the absolute web address + if figure['href'].startswith('/'): + figure['href'] = self.site + figure['href'] + + figure.name = 'img' # converts the links to img + figure['src'] = figure['href'] # with the same address as href + figure['style'] = 'display:block' # adds /n before and after the image + del figure['href'] + del figure['target'] + + # Makes the title standing out + for title in soup.findAll('a', attrs = {'class': 'commonSectionTitle'}): + title.name = 'h1' + del title['href'] + del title['target'] + + # Makes the section name more visible + for section_name in soup.findAll('a', attrs = {'class': 'kicker2'}): + section_name.name = 'h5' + del section_name['href'] + del section_name['target'] + + # Removes all unrelated links + for link in soup.findAll('a', attrs = {'href': True}): + link.name = 'font' + del link['href'] + del link['target'] + + return soup