calibre/recipes/mwjournal.recipe

#!/usr/bin/env python
##
# Title:        Microwave Journal
# Contact:      Kiavash (use Mobile Read)
##
# License:      GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright:    Kiavash
##
# Written:      Jan 2012
# Last Edited:  Feb 2012
##

# Feb 2012: New Recipe compatible with the MWJournal 2.0 website

# Module-level recipe metadata.
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'Kiavash'
# Spelled the same way as __copyright__ and the contact name in the file
# header (previously misspelled 'Kaivash').
__author__ = 'Kiavash'

'''
microwavejournal.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe


class MWJournal(BasicNewsRecipe):
    """Recipe for the current issue of Microwave Journal.

    ``parse_index`` reads the publication page at ``INDEX``: every
    ``div.records`` block found there becomes one feed section and every
    ``div.record`` inside it one article.  ``print_version`` rewrites article
    URLs to their ``/articles/print/`` variant.
    """

    title = u'Microwave Journal'
    description = u'Microwave Journal Monthly Magazine'
    publisher = 'Horizon House'
    publication_type = 'magazine'
    # Site root, used to turn site-relative links into absolute URLs.
    BASE_URL = 'http://www.microwavejournal.com'
    INDEX = BASE_URL + '/publications/'

    language = 'en'
    timeout = 30

    Convert_Grayscale = False  # Convert images to gray scale or not

    keep_only_tags = [dict(name='div', attrs={'class': 'record'})]
    no_stylesheets = True
    remove_javascript = True
    remove_tags = [
        dict(name='font', attrs={'class': 'footer'}),    # remove fonts
    ]

    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']

    # Specify extra CSS - overrides ALL other CSS (IE. Added last).
    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
                 .introduction, .first { font-weight: bold; } \
                 .cross-head { font-weight: bold; font-size: 125%; } \
                 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
                 .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
                 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
                    .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
                    font-size: 80%; font-style: italic; margin: 1px auto; } \
                 .story-date, .published { font-size: 80%; } \
                 table { width: 100%; } \
                 td img { display: block; margin: 5px auto; } \
                 ul { padding-top: 10px; } \
                 ol { padding-top: 10px; } \
                 li { padding-top: 5px; padding-bottom: 5px; } \
                 h1 { font-size: 175%; font-weight: bold; } \
                 h2 { font-size: 150%; font-weight: bold; } \
                 h3 { font-size: 125%; font-weight: bold; } \
                 h4, h5, h6 { font-size: 100%; font-weight: bold; }'

    # Remove the line breaks, href links and float left/right and picture
    # width/height.
    preprocess_regexps = [
        (re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
        # [^>]* keeps the match inside the one <br clear=...> tag; a greedy
        # '.*' would run on to the last '/>' on the same line and delete
        # whatever markup sits in between.
        (re.compile(r'<br[ ]*clear[^>]*/>', re.IGNORECASE), lambda m: ''),
        # Match anchor tags only: '<a' must be followed by whitespace or
        # '>', so <abbr>, <address>, <article>, <aside> are left intact.
        (re.compile(r'<a(?:\s[^>]*)?>'), lambda m: ''),
        (re.compile(r'</a>'), lambda m: ''),
        # Remove the whole declaration, value included.  A trailing lazy
        # '.*?' matches nothing, which left the bare value behind.
        (re.compile(r'float:[^;"\']*;?'), lambda m: ''),
        (re.compile(r'width:.*?px'), lambda m: ''),
        (re.compile(r'height:.*?px'), lambda m: ''),
    ]

    def print_version(self, url):
        """Return the print URL for an article URL.

        URLs that already point at ``/articles/print/`` are returned
        unchanged so the rewrite is not applied twice.
        """
        if '/articles/print/' in url:
            return url
        return url.replace('/articles/', '/articles/print/')

    def _absolute_url(self, url):
        """Prefix site-relative URLs (starting with '/') with ``BASE_URL``."""
        if url.startswith('/'):
            return self.BASE_URL + url
        return url

    def parse_index(self):
        """Build the feed list for the current issue.

        Sets ``self.timefmt`` from the issue heading and ``self.cover_url``
        from the first image in the issue box, when those are present.

        Returns:
            A list of ``(section_title, articles)`` tuples, where each article
            is a dict with ``title``, ``url``, ``description`` and ``date``.

        Raises:
            ValueError: if the article listing block is not on the index page.
        """
        soup = self.index_to_soup(self.INDEX)

        issue_box = soup.find(
            'div', attrs={'class': 'box1 article publications-show'})
        if issue_box is None:
            # Without this box there is no issue name or cover; the article
            # listing may still be usable, so keep going.
            self.log('Could not find the current issue box on', self.INDEX)
        else:
            ds = self.tag_to_string(issue_box.find('h2'))
            self.log('Found Current Issue:', ds)
            self.timefmt = ' [%s]' % ds

            cover = issue_box.find('img', src=True)
            if cover is not None:
                self.cover_url = self._absolute_url(cover['src'])
                self.log('Found Cover image:', self.cover_url)

        sections = soup.find('div', attrs={'class': 'box2 publication'})
        if sections is None:
            raise ValueError(
                'Could not find the article listing on ' + self.INDEX)

        feeds = []
        seen_titles = set()  # This is used to remove duplicate articles
        for section in sections.findAll('div', attrs={'class': 'records'}):
            section_title = self.tag_to_string(section.find('h3'))
            self.log('Found section:', section_title)
            articles = []
            for post in section.findAll('div', attrs={'class': 'record'}):
                title = self.tag_to_string(post.find('h2'))
                # Let's get rid of the useless Puzzler!
                if 'The MWJ Puzzler' in title:
                    continue
                if title in seen_titles:
                    continue
                link = post.find('a', href=True)
                if link is None:
                    # Nothing to download; do not mark the title as seen so a
                    # later record with the same title and a link still counts.
                    self.log('\tSkipping article without a link:', title)
                    continue
                seen_titles.add(title)
                url = self._absolute_url(link['href'])
                self.log('\tFound article:', title, 'at', url)

                # The abstract block and its paragraph are both optional.
                desc = None
                abstract = post.find('div', attrs={'class': 'abstract'})
                p = abstract.find('p') if abstract is not None else None
                if p is not None:
                    desc = self.tag_to_string(p)
                    self.log('\t\t', desc)
                articles.append({'title': title, 'url': url, 'description': desc,
                                 'date': self.timefmt})
            if articles:
                feeds.append((section_title, articles))
        return feeds