calibre/recipes/mwjournal.recipe
2012-02-04 08:59:28 +05:30

142 lines
5.9 KiB
Python

#!/usr/bin/env python
##
## Title: Microwave Journal
## Contact: Kiavash (use Mobile Read)
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: Kiavash
##
## Written: Jan 2012
## Last Edited: Feb 2012
##
# Feb 2012: New Recipe compatible with the MWJournal 2.0 website
# Module-level metadata for this recipe.
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'Kiavash'
# Spelling fixed ('Kaivash' -> 'Kiavash') to agree with __copyright__ and the
# contact line in the file header.
__author__ = 'Kiavash'
'''
microwavejournal.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image
class MWJournal(BasicNewsRecipe):
    '''
    Recipe for the current issue of Microwave Journal.

    parse_index() scrapes the issue page at INDEX and returns one feed per
    section; every article URL is rewritten to its print version by
    print_version(). Setting Convert_Grayscale to True additionally converts
    the article images to gray scale in postprocess_html().
    '''

    title = u'Microwave Journal'
    description = u'Microwave Journal Monthly Magazine'
    publisher = 'Horizon House'
    publication_type = 'magazine'
    # Prefix used to turn site-relative links ('/...') into absolute URLs.
    SITE_ROOT = 'http://www.microwavejournal.com'
    INDEX = 'http://www.microwavejournal.com/publications/'
    language = 'en'
    timeout = 30

    Convert_Grayscale = False  # Convert images to gray scale or not

    keep_only_tags = [dict(name='div', attrs={'class': 'record'})]
    no_stylesheets = True
    remove_javascript = True
    remove_tags = [
        dict(name='font', attrs={'class': 'footer'}),  # remove fonts
    ]
    remove_attributes = [
        'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
        'valign', 'vspace', 'hspace', 'alt', 'width', 'height',
    ]

    # Specify extra CSS - overrides ALL other CSS (IE. Added last).
    # Built from adjacent literals; the resulting string is unchanged.
    extra_css = (
        'body { font-family: verdana, helvetica, sans-serif; } '
        '.introduction, .first { font-weight: bold; } '
        '.cross-head { font-weight: bold; font-size: 125%; } '
        '.cap, .caption { display: block; font-size: 80%; font-style: italic; } '
        '.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } '
        '.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, '
        '.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; '
        'font-size: 80%; font-style: italic; margin: 1px auto; } '
        '.story-date, .published { font-size: 80%; } '
        'table { width: 100%; } '
        'td img { display: block; margin: 5px auto; } '
        'ul { padding-top: 10px; } '
        'ol { padding-top: 10px; } '
        'li { padding-top: 5px; padding-bottom: 5px; } '
        'h1 { font-size: 175%; font-weight: bold; } '
        'h2 { font-size: 150%; font-weight: bold; } '
        'h3 { font-size: 125%; font-weight: bold; } '
        'h4, h5, h6 { font-size: 100%; font-weight: bold; }'
    )

    # Remove the line breaks, href links and float left/right and picture
    # width/height.
    preprocess_regexps = [
        (re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
        # [^>]* keeps the match inside one tag; the former greedy '.*' could
        # run on to the last '/>' of the line and delete real content.
        (re.compile(r'<br[ ]*clear[^>]*/>', re.IGNORECASE), lambda m: ''),
        # \b restricts the match to anchors; '<a.*?>' also stripped the
        # opening tag of <abbr>, <area>, <address>, ...
        (re.compile(r'<a\b[^>]*>'), lambda m: ''),
        (re.compile(r'</a>'), lambda m: ''),
        # Drop the whole declaration. The former 'float:.*?' ended in a lazy
        # quantifier, so it removed only 'float:' and left the value behind.
        (re.compile(r'float:\s*[\w-]+\s*;?'), lambda m: ''),
        (re.compile(r'width:.*?px'), lambda m: ''),
        (re.compile(r'height:.*?px'), lambda m: ''),
    ]

    def print_version(self, url):
        '''Return the print-version URL that corresponds to article *url*.'''
        return url.replace('/articles/', '/articles/print/')

    def parse_index(self):
        '''
        Scrape the issue page and build the feeds of the current issue.

        Side effects: sets self.timefmt to the issue name and, when a cover
        image is present, self.cover_url.

        Returns a list of (section_title, articles) tuples; each article is a
        dict with the keys 'title', 'url', 'description' and 'date'. Sections
        without usable articles are omitted.

        Raises ValueError when the page lacks the issue or section container.
        '''
        soup = self.index_to_soup(self.INDEX)

        issue = soup.find('div', attrs={'class': 'box1 article publications-show'})
        if issue is None:
            raise ValueError('Current issue block not found at ' + self.INDEX)
        issue_name = self.tag_to_string(issue.find('h2'))
        self.log('Found Current Issue:', issue_name)
        self.timefmt = ' [%s]' % issue_name

        cover = issue.find('img', src=True)
        if cover is not None:
            cover_src = cover['src']
            # Same rule as for article links: only site-relative paths get
            # the site prefix, so an absolute src is not mangled.
            if cover_src.startswith('/'):
                cover_src = self.SITE_ROOT + cover_src
            self.cover_url = cover_src
            self.log('Found Cover image:', self.cover_url)

        sections = soup.find('div', attrs={'class': 'box2 publication'})
        if sections is None:
            raise ValueError('Section list not found at ' + self.INDEX)

        feeds = []
        seen_titles = set()  # This is used to remove duplicate articles
        for section in sections.findAll('div', attrs={'class': 'records'}):
            section_title = self.tag_to_string(section.find('h3'))
            self.log('Found section:', section_title)
            articles = []
            for post in section.findAll('div', attrs={'class': 'record'}):
                title = self.tag_to_string(post.find('h2'))
                # Let's get rid of the useless Puzzler!
                if 'The MWJ Puzzler' in title:
                    continue
                if title in seen_titles:
                    continue

                link = post.find('a', href=True)
                if link is None:
                    # A record without a link cannot be downloaded; skip it
                    # instead of aborting the whole issue.
                    self.log('\tSkipping article without a link:', title)
                    continue
                # Remember the title only once the record is known to be usable.
                seen_titles.add(title)

                url = link['href']
                if url.startswith('/'):
                    url = self.SITE_ROOT + url
                self.log('\tFound article:', title, 'at', url)

                # The abstract is optional; the description stays None without it.
                desc = None
                abstract = post.find('div', attrs={'class': 'abstract'})
                summary = abstract.find('p') if abstract is not None else None
                if summary is not None:
                    desc = self.tag_to_string(summary)
                    self.log('\t\t', desc)

                articles.append({'title': title, 'url': url,
                                 'description': desc, 'date': self.timefmt})
            if articles:
                feeds.append((section_title, articles))
        return feeds

    def postprocess_html(self, soup, first):
        '''
        Convert every image referenced by *soup* to gray scale, in place.

        Does nothing unless Convert_Grayscale is True. *first* is accepted
        for interface compatibility and is not used. Returns *soup*.
        '''
        if self.Convert_Grayscale:
            # process all the images
            # src=True selects only <img> tags that carry a src attribute,
            # the same filter parse_index uses for the cover image.
            for tag in soup.findAll('img', src=True):
                iurl = tag['src']
                img = Image()
                img.open(iurl)
                # The former "if img < 0" guard compared the Image object
                # itself with an int, which cannot signal a failed load, so
                # it was dropped.
                # NOTE(review): confirm Image defines no ordering against ints.
                img.type = "GrayscaleType"
                img.save(iurl)
        return soup