calibre/recipes/mwjournal.recipe

#!/usr/bin/env  python
##
## Title:        Microwave Journal
## Contact:      Kiavash (use Mobile Read)
##
## License:      GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright:    Kiavash
##
## Written:      Jan 2012
## Last Edited:  Feb 2012
##

# Feb 2012: New Recipe compatible with the MWJournal 2.0 website

# Module-level metadata for this recipe.
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'Kiavash'
# Spelled the same way as in the file header and in __copyright__.
__author__ = 'Kiavash'

'''
microwavejournal.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image

class MWJournal(BasicNewsRecipe):
    """Recipe that downloads the current issue of Microwave Journal.

    The issue index is read from ``INDEX``; every article is fetched through
    its print view (see ``print_version``).  Set ``Convert_Grayscale`` to
    ``True`` to have the article images converted to gray scale.
    """

    title            = u'Microwave Journal'
    description      = u'Microwave Journal Monthly Magazine'
    publisher        = 'Horizon House'
    publication_type = 'magazine'
    INDEX            = 'http://www.microwavejournal.com/publications/'

    language = 'en'
    timeout = 30

    Convert_Grayscale = False  # Convert images to gray scale or not

    # Only the article record itself is kept from each downloaded page.
    keep_only_tags = [dict(name='div', attrs={'class': 'record'})]
    no_stylesheets = True
    remove_javascript = True
    remove_tags = [
        dict(name='font', attrs={'class': 'footer'}),  # remove footer fonts
    ]

    # Presentational attributes that are stripped from every tag.
    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']

    # Specify extra CSS - overrides ALL other CSS (IE. Added last).
    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
                 .introduction, .first { font-weight: bold; } \
                 .cross-head { font-weight: bold; font-size: 125%; } \
                 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
                 .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
                 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
                    .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
                    font-size: 80%; font-style: italic; margin: 1px auto; } \
                 .story-date, .published { font-size: 80%; } \
                 table { width: 100%; } \
                 td img { display: block; margin: 5px auto; } \
                 ul { padding-top: 10px; } \
                 ol { padding-top: 10px; } \
                 li { padding-top: 5px; padding-bottom: 5px; } \
                 h1 { font-size: 175%; font-weight: bold; } \
                 h2 { font-size: 150%; font-weight: bold; } \
                 h3 { font-size: 125%; font-weight: bold; } \
                 h4, h5, h6 { font-size: 100%; font-weight: bold; }'

    # Remove the line breaks, href links and float left/right and picture width/height.
    preprocess_regexps = [
        (re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
        # [^>]* keeps the match inside a single tag; a greedy '.*' would run
        # to the last '/>' on the line and swallow real content.
        (re.compile(r'<br[ ]*clear[^>]*/>', re.IGNORECASE), lambda m: ''),
        # Only anchor tags: '<a' must be followed by whitespace or '>', so
        # <abbr>, <article>, <aside>, ... are left alone.
        (re.compile(r'<a(?:\s[^>]*)?>'), lambda m: ''),
        (re.compile(r'</a>'), lambda m: ''),
        # Drop the whole declaration (property, value and ';'); a trailing
        # lazy '.*?' matches nothing and would leave the value behind.
        (re.compile(r'float:\s*[\w-]+(?:\s*!important)?\s*;?'), lambda m: ''),
        (re.compile(r'width:.*?px'), lambda m: ''),
        (re.compile(r'height:.*?px'), lambda m: ''),
    ]

    def _absolute_url(self, url):
        """Return *url* with the site's scheme and host added when missing.

        Root-relative ('/x') and protocol-relative ('//host/x') URLs are
        completed; anything else is returned unchanged.
        """
        if url.startswith('//'):
            return 'http:' + url
        if url.startswith('/'):
            return 'http://www.microwavejournal.com' + url
        return url

    def print_version(self, url):
        """Return the print-view URL for the article at *url*."""
        return url.replace('/articles/', '/articles/print/')

    def parse_index(self):
        """Build the list of feeds for the current issue.

        Sets ``self.timefmt`` to the issue title and ``self.cover_url`` to the
        cover image when they are found on the index page.

        Returns a list of ``(section_title, articles)`` tuples, where each
        article is a dict with the keys 'title', 'url', 'description' and
        'date'.  Sections without any usable article are omitted.
        """
        soup = self.index_to_soup(self.INDEX)

        issue = soup.find('div', attrs={'class': 'box1 article publications-show'})
        if issue is None:
            # Missing issue header: carry on without issue title and cover
            # instead of failing the whole download.
            self.log.warn('Could not find the current issue header on', self.INDEX)
        else:
            issue_title = self.tag_to_string(issue.find('h2'))
            self.log('Found Current Issue:', issue_title)
            self.timefmt = ' [%s]' % issue_title

            cover = issue.find('img', src=True)
            if cover is not None:
                self.cover_url = self._absolute_url(cover['src'])
                self.log('Found Cover image:', self.cover_url)

        feeds = []
        sections = soup.find('div', attrs={'class': 'box2 publication'})
        if sections is None:
            self.log.warn('Could not find the article sections on', self.INDEX)
            return feeds

        seen_titles = set()  # This is used to remove duplicate articles
        for section in sections.findAll('div', attrs={'class': 'records'}):
            section_title = self.tag_to_string(section.find('h3'))
            self.log('Found section:', section_title)
            articles = []
            for post in section.findAll('div', attrs={'class': 'record'}):
                title = self.tag_to_string(post.find('h2'))
                if 'The MWJ Puzzler' in title:  # Let's get rid of the useless Puzzler!
                    continue
                if title in seen_titles:
                    continue

                link = post.find('a', href=True)
                if link is None:
                    # Nothing to download for this record; skip it without
                    # marking the title as seen.
                    self.log.warn('\tSkipping article without a link:', title)
                    continue
                seen_titles.add(title)
                url = self._absolute_url(link['href'])
                self.log('\tFound article:', title, 'at', url)

                # The abstract block and its paragraph are both optional.
                desc = None
                abstract = post.find('div', attrs={'class': 'abstract'})
                p = abstract.find('p') if abstract is not None else None
                if p is not None:
                    desc = self.tag_to_string(p)
                    self.log('\t\t', desc)

                articles.append({'title': title, 'url': url, 'description': desc,
                                 'date': self.timefmt})
            if articles:
                feeds.append((section_title, articles))
        return feeds

    def postprocess_html(self, soup, first):
        """Convert every image to gray scale when ``Convert_Grayscale`` is set.

        Each <img> that has a src attribute is opened from that src, switched
        to "GrayscaleType" and saved back to the same location.  The soup is
        returned unchanged otherwise.
        """
        if self.Convert_Grayscale:
            # process all the images
            for tag in soup.findAll('img', src=True):
                iurl = tag['src']
                img = Image()
                img.open(iurl)
                img.type = "GrayscaleType"
                img.save(iurl)
        return soup