diff --git a/recipes/microwave_and_rf.recipe b/recipes/microwave_and_rf.recipe
deleted file mode 100644
index 3cdf6e5acc..0000000000
--- a/recipes/microwave_and_rf.recipe
+++ /dev/null
@@ -1,224 +0,0 @@
-#!/usr/bin/env python
-##
-## Title: Microwave and RF
-##
-## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
-
-# Feb 2012: Initial release
-
-__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
-'''
-mwrf.com
-'''
-
-import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.utils.magick import Image
-
-class Microwaves_and_RF(BasicNewsRecipe):
-
-    Convert_Grayscale = False  # Convert images to grayscale or not
-
-    # Add sections that should be excluded from the magazine
-    exclude_sections = []
-
-    # Add sections that should be included in the magazine
-    include_sections = []
-
-    title = u'Microwaves and RF'
-    __author__ = u'kiavash'
-    description = u'Microwaves and RF Monthly Magazine'
-    publisher = 'Penton Media, Inc.'
-    publication_type = 'magazine'
-    site = 'http://mwrf.com'
-
-    language = 'en'
-    asciiize = True
-    timeout = 120
-    simultaneous_downloads = 1  # very picky site!
-
-    # Main article is inside this tag
-    keep_only_tags = [dict(name='table', attrs={'id': 'prtContent'})]
-
-    no_stylesheets = True
-    remove_javascript = True
-
-    # Flattens all the tables to make it compatible with Nook
-    conversion_options = {'linearize_tables': True}
-
-    remove_tags = [
-        dict(name='span', attrs={'class': 'body12'}),
-    ]
-
-    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
-                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
-
-    # Specify extra CSS - overrides ALL other CSS (i.e. added last).
-    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
-                 .introduction, .first { font-weight: bold; } \
-                 .cross-head { font-weight: bold; font-size: 125%; } \
-                 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
-                 .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
-                 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
-                 .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
-                 font-size: 80%; font-style: italic; margin: 1px auto; } \
-                 .story-date, .published { font-size: 80%; } \
-                 table { width: 100%; } \
-                 td img { display: block; margin: 5px auto; } \
-                 ul { padding-top: 10px; } \
-                 ol { padding-top: 10px; } \
-                 li { padding-top: 5px; padding-bottom: 5px; } \
-                 h1 { font-size: 175%; font-weight: bold; } \
-                 h2 { font-size: 150%; font-weight: bold; } \
-                 h3 { font-size: 125%; font-weight: bold; } \
-                 h4, h5, h6 { font-size: 100%; font-weight: bold; }'
-
-    # Remove the line breaks and float left/right and picture width/height.
-    preprocess_regexps = [(re.compile(r'<br[^>]*>', re.IGNORECASE), lambda m: ''),
-                          (re.compile(r'</br>', re.IGNORECASE), lambda m: ''),
-                          (re.compile(r'float:.*?'), lambda m: ''),
-                          (re.compile(r'width:.*?px'), lambda m: ''),
-                          (re.compile(r'height:.*?px'), lambda m: '')
-                          ]
-
-
-    def print_version(self, url):
-        url = re.sub(r'.html', '', url)
-        url = re.sub('/ArticleID/.*?/', '/Print.cfm?ArticleID=', url)
-        return url
-
-    # Need to change the user agent to avoid potential download errors
-    def get_browser(self, *args, **kwargs):
-        from calibre import browser
-        kwargs['user_agent'] = 'Mozilla/5.0 (Windows NT 5.1; rv:10.0) Gecko/20100101 Firefox/10.0'
-        return browser(*args, **kwargs)
-
-
-    def parse_index(self):
-
-        # Fetches the main page of Microwaves and RF
-        soup = self.index_to_soup(self.site)
-
-        # First page has the ad, let's find the redirect address.
-        url = soup.find('span', attrs={'class': 'commonCopy'}).find('a').get('href')
-        if url.startswith('/'):
-            url = self.site + url
-
-        soup = self.index_to_soup(url)
-
-        # Searches the site for the Issue ID link, then returns the href
-        # pointing to the latest issue
-        latest_issue = soup.find('a', attrs={'href': lambda x: x and 'IssueID' in x}).get('href')
-
-        # Fetches the index page of the latest issue
-        soup = self.index_to_soup(latest_issue)
-
-        # Finds the main section of the page containing cover, issue date and
-        # TOC
-        ts = soup.find('div', attrs={'id': 'columnContainer'})
-
-        # Finds the issue date
-        ds = ' '.join(self.tag_to_string(ts.find('span', attrs={'class': 'CurrentIssueSectionHead'})).strip().split()[-2:]).capitalize()
-        self.log('Found Current Issue:', ds)
-        self.timefmt = ' [%s]' % ds
-
-        # Finds the cover image
-        cover = ts.find('img', src=lambda x: x and 'Cover' in x)
-        if cover is not None:
-            self.cover_url = self.site + cover['src']
-            self.log('Found Cover image:', self.cover_url)
-
-        feeds = []
-        article_info = []
-
-        # Finds all the articles (titles and links)
-        articles = ts.findAll('a', attrs={'class': 'commonArticleTitle'})
-
-        # Finds all the descriptions
-        descriptions = ts.findAll('span', attrs={'class': 'commonCopy'})
-
-        # Finds all the sections
-        sections = ts.findAll('span', attrs={'class': 'kicker'})
-
-        title_number = 0
-
-        # Goes through all the articles one by one and sorts them out
-        for section in sections:
-            title_number = title_number + 1
-
-            # Removes the unwanted sections
-            if self.tag_to_string(section) in self.exclude_sections:
-                continue
-
-            # Only includes the wanted sections
-            if self.include_sections:
-                if self.tag_to_string(section) not in self.include_sections:
-                    continue
-
-            title = self.tag_to_string(articles[title_number])
-            url = articles[title_number].get('href')
-            if url.startswith('/'):
-                url = self.site + url
-
-            self.log('\tFound article:', title, 'at', url)
-            desc = self.tag_to_string(descriptions[title_number])
-            self.log('\t\t', desc)
-
-            article_info.append({'title': title, 'url': url, 'description': desc,
-                                 'date': self.timefmt})
-
-        if article_info:
-            feeds.append((self.title, article_info))
-
-        # self.log(feeds)
-        return feeds
-
-    def postprocess_html(self, soup, first):
-        if self.Convert_Grayscale:
-            # process all the images
-            for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
-                iurl = tag['src']
-                img = Image()
-                img.open(iurl)
-                if img < 0:
-                    raise RuntimeError('Out of memory')
-                img.type = "GrayscaleType"
-                img.save(iurl)
-        return soup
-
-    def preprocess_html(self, soup):
-
-        # Includes all the figures inside the final ebook
-        # Finds all the jpg links
-        for figure in soup.findAll('a', attrs={'href': lambda x: x and 'jpg' in x}):
-
-            # makes sure that the link points to the absolute web address
-            if figure['href'].startswith('/'):
-                figure['href'] = self.site + figure['href']
-
-            figure.name = 'img'  # converts the links to img
-            figure['src'] = figure['href']  # with the same address as href
-            figure['style'] = 'display:block'  # adds \n before and after the image
-            del figure['href']
-            del figure['target']
-
-        # Makes the title stand out
-        for title in soup.findAll('a', attrs={'class': 'commonSectionTitle'}):
-            title.name = 'h1'
-            del title['href']
-            del title['target']
-
-        # Makes the section name more visible
-        for section_name in soup.findAll('a', attrs={'class': 'kicker2'}):
-            section_name.name = 'h5'
-            del section_name['href']
-            del section_name['target']
-
-        # Removes all unrelated links
-        for link in soup.findAll('a', attrs={'href': True}):
-            link.name = 'font'
-            del link['href']
-            del link['target']
-
-        return soup
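
Note: the print_version() hook in the deleted recipe derived the printer-friendly
page with two regex substitutions. A minimal standalone sketch of that rewrite,
run against a hypothetical article URL (the real mwrf.com path layout is an
assumption here, not taken from the site):

    import re

    def print_version(url):
        url = re.sub(r'.html', '', url)                                # drop the trailing '.html'
        url = re.sub('/ArticleID/.*?/', '/Print.cfm?ArticleID=', url)  # switch to the print view
        return url

    print(print_version('http://mwrf.com/Articles/ArticleID/12345/12345.html'))
    # -> http://mwrf.com/Articles/Print.cfm?ArticleID=12345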
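
Note: the grayscale path in postprocess_html() relied on calibre.utils.magick,
which no longer ships with calibre. A rough sketch of the same idea using Pillow
instead (an assumption for illustration; a current recipe would go through
calibre's own image helpers):

    from PIL import Image

    def convert_to_grayscale(path):
        # Open the downloaded image and overwrite it in place as grayscale.
        img = Image.open(path)
        img.convert('L').save(path)  # 'L' is Pillow's 8-bit grayscale mode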