mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
parent
05d555d4ed
commit
2ed06d43db
@ -6,7 +6,7 @@
|
|||||||
## Copyright: Kiavash
|
## Copyright: Kiavash
|
||||||
##
|
##
|
||||||
## Written: Jan 2012
|
## Written: Jan 2012
|
||||||
## Last Edited: 2012-01-07
|
## Last Edited: Jan 2012
|
||||||
##
|
##
|
||||||
|
|
||||||
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
||||||
@ -18,14 +18,16 @@ Microwave Journal Monthly Magazine
|
|||||||
You need to sign up (free) and get username/password.
|
You need to sign up (free) and get username/password.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import re # Import the regular expressions module.
|
import re # Import the regular expressions module.
|
||||||
from calibre.ptempfile import TemporaryFile # we need this for saving to a temp file
|
from calibre.ptempfile import TemporaryFile # we need this for saving to a temp file
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class MWJournal(BasicNewsRecipe):
|
class MWJournal(BasicNewsRecipe):
|
||||||
# Title to use for the ebook.
|
# Title to use for the ebook.
|
||||||
title = u'Microwave Journal'
|
title = u'Microwave Journal'
|
||||||
__author__ = 'Kiavash'
|
__author__ = 'Kiavash'
|
||||||
|
language = 'en'
|
||||||
|
|
||||||
#A brief description for the ebook.
|
#A brief description for the ebook.
|
||||||
description = u'Microwave Journal web site ebook created using rss feeds.'
|
description = u'Microwave Journal web site ebook created using rss feeds.'
|
||||||
@ -33,9 +35,8 @@ class MWJournal(BasicNewsRecipe):
|
|||||||
# Set publisher and publication type.
|
# Set publisher and publication type.
|
||||||
publisher = 'Horizon House'
|
publisher = 'Horizon House'
|
||||||
publication_type = 'magazine'
|
publication_type = 'magazine'
|
||||||
language = 'en'
|
|
||||||
|
|
||||||
oldest_article = 31 # monthly published magazine. Some months are 31 days!
|
oldest_article = 31 # monthly published magazine. Some months are 31 days!
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
auto_cleanup = True
|
auto_cleanup = True
|
||||||
@ -44,43 +45,50 @@ class MWJournal(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
|
|
||||||
needs_subscription = True # oh yeah... we need to login btw.
|
asciiize = True # Converts all none ascii characters to their ascii equivalents
|
||||||
|
|
||||||
|
needs_subscription = True # oh yeah... we need to login btw.
|
||||||
|
|
||||||
# Timeout for fetching files from the server in seconds. The default of 120 seconds, seems somewhat excessive.
|
# Timeout for fetching files from the server in seconds. The default of 120 seconds, seems somewhat excessive.
|
||||||
timeout = 30
|
timeout = 30
|
||||||
|
|
||||||
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
|
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
|
||||||
|
|
||||||
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
|
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
|
||||||
.introduction, .first { font-weight: bold; } \
|
.introduction, .first { font-weight: bold; } \
|
||||||
.cross-head { font-weight: bold; font-size: 125%; } \
|
.cross-head { font-weight: bold; font-size: 125%; } \
|
||||||
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
|
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
|
||||||
.cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \
|
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
|
||||||
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
|
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
|
||||||
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
|
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
|
||||||
text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \
|
font-size: 80%; font-style: italic; margin: 1px auto; } \
|
||||||
.story-date, .published { font-size: 80%; } \
|
.story-date, .published { font-size: 80%; } \
|
||||||
table { width: 100%; } \
|
table { width: 100%; } \
|
||||||
td img { display: block; margin: 5px auto; } \
|
td img { display: block; margin: 5px auto; } \
|
||||||
ul { padding-top: 10px; } \
|
ul { padding-top: 10px; } \
|
||||||
ol { padding-top: 10px; } \
|
ol { padding-top: 10px; } \
|
||||||
li { padding-top: 5px; padding-bottom: 5px; } \
|
li { padding-top: 5px; padding-bottom: 5px; } \
|
||||||
h1 { text-align: center; font-size: 175%; font-weight: bold; } \
|
h1 { font-size: 175%; font-weight: bold; } \
|
||||||
h2 { text-align: center; font-size: 150%; font-weight: bold; } \
|
h2 { font-size: 150%; font-weight: bold; } \
|
||||||
h3 { text-align: center; font-size: 125%; font-weight: bold; } \
|
h3 { font-size: 125%; font-weight: bold; } \
|
||||||
h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }'
|
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name='div', attrs={'class':'boxadzonearea350'}), # Removes banner ads
|
dict(name='div', attrs={'class':'boxadzonearea350'}), # Removes banner ads
|
||||||
dict(name='font', attrs={'class':'footer'}), # remove fonts if you do like your fonts more! Comment out to use website's fonts
|
dict(name='font', attrs={'class':'footer'}), # remove fonts if you do like your fonts more! Comment out to use website's fonts
|
||||||
|
dict(name='div', attrs={'class':'newsarticlead'})
|
||||||
]
|
]
|
||||||
|
|
||||||
# Remove various tag attributes to improve the look of the ebook pages.
|
# Remove various tag attributes to improve the look of the ebook pages.
|
||||||
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
|
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
|
||||||
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
|
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
|
||||||
|
|
||||||
# Remove the line breaks,
|
# Remove the line breaks as well as href links. Books don't have links generally speaking
|
||||||
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
|
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
|
||||||
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: '')]
|
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
|
||||||
|
(re.compile(r'<a.*?>'), lambda h1: ''),
|
||||||
|
(re.compile(r'</a>'), lambda h2: '')
|
||||||
|
]
|
||||||
|
|
||||||
# Select the feeds that you are interested.
|
# Select the feeds that you are interested.
|
||||||
feeds = [
|
feeds = [
|
||||||
@ -96,19 +104,20 @@ class MWJournal(BasicNewsRecipe):
|
|||||||
# The function is adapted from the Economist recipe
|
# The function is adapted from the Economist recipe
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
cover_url = None
|
cover_url = None
|
||||||
cover_page_location = 'http://www.mwjournal.com/Journal/' # Cover image is located on this page
|
cover_page_location = 'http://www.mwjournal.com/Journal/' # Cover image is located on this page
|
||||||
soup = self.index_to_soup(cover_page_location)
|
soup = self.index_to_soup(cover_page_location)
|
||||||
cover_item = soup.find('img',attrs={'src':lambda x: x and '/IssueImg/3_MWJ_CurrIss_CoverImg' in x}) # There are three files named cover, we want the highest resolution which is the 3rd image. So we look for the pattern. Remember that the name of the cover image changes every month so we cannot search for the complete name. Instead we are searching for the pattern
|
cover_item = soup.find('img',attrs={'src':lambda x: x and '/IssueImg/3_MWJ_CurrIss_CoverImg' in x}) # There are three files named cover, we want the highest resolution which is the 3rd image. So we look for the pattern. Remember that the name of the cover image changes every month so we cannot search for the complete name. Instead we are searching for the pattern
|
||||||
if cover_item:
|
if cover_item:
|
||||||
cover_url = 'http://www.mwjournal.com' + cover_item['src'].strip() # yeah! we found it. Let's fetch the image file and pass it as cover to calibre
|
cover_url = 'http://www.mwjournal.com' + cover_item['src'].strip() # yeah! we found it. Let's fetch the image file and pass it as cover to calibre
|
||||||
return cover_url
|
return cover_url
|
||||||
|
|
||||||
def print_version(self, url):
|
def print_version(self, url):
|
||||||
'''
|
if url.find('/Journal/article.asp?HH_ID=') >= 0:
|
||||||
this function uses the print version of the article. Replaces the URL with its print version and fetch that page instead.
|
return self.browser.open_novisit(url).geturl().replace('/Journal/article.asp?HH_ID=', '/Journal/Print.asp?Id=')
|
||||||
'''
|
elif url.find('/News/article.asp?HH_ID=') >= 0:
|
||||||
return url.replace('http://mwjournal.com/Journal/article.asp?HH_ID=', 'http://mwjournal.com/Journal/Print.asp?Id=')
|
return self.browser.open_novisit(url).geturl().replace('/News/article.asp?HH_ID=', '/Journal/Print.asp?Id=')
|
||||||
|
elif url.find('/Resources/TechLib.asp?HH_ID=') >= 0:
|
||||||
|
return self.browser.open_novisit(url).geturl().replace('/Resources/TechLib.asp?HH_ID=', '/Resources/PrintRessource.asp?Id=')
|
||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
'''
|
'''
|
||||||
@ -118,9 +127,9 @@ class MWJournal(BasicNewsRecipe):
|
|||||||
if self.username is not None and self.password is not None:
|
if self.username is not None and self.password is not None:
|
||||||
url = ('http://www.omeda.com/cgi-win/mwjreg.cgi?m=login') # main login page.
|
url = ('http://www.omeda.com/cgi-win/mwjreg.cgi?m=login') # main login page.
|
||||||
br.open(url) # fetch the 1st login page
|
br.open(url) # fetch the 1st login page
|
||||||
br.select_form('login') # finds the login form
|
br.select_form('login') # finds the login form
|
||||||
br['EMAIL_ADDRESS'] = self.username # fills the username
|
br['EMAIL_ADDRESS'] = self.username # fills the username
|
||||||
br['PASSWORD'] = self.password # fills the password
|
br['PASSWORD'] = self.password # fills the password
|
||||||
raw = br.submit().read() # submit the form and read the 2nd login form
|
raw = br.submit().read() # submit the form and read the 2nd login form
|
||||||
# save it to an htm temp file (from ESPN recipe written by Kovid Goyal kovid@kovidgoyal.net
|
# save it to an htm temp file (from ESPN recipe written by Kovid Goyal kovid@kovidgoyal.net
|
||||||
with TemporaryFile(suffix='.htm') as fname:
|
with TemporaryFile(suffix='.htm') as fname:
|
||||||
@ -128,7 +137,7 @@ class MWJournal(BasicNewsRecipe):
|
|||||||
f.write(raw)
|
f.write(raw)
|
||||||
br.open_local_file(fname)
|
br.open_local_file(fname)
|
||||||
br.select_form(nr=0) # finds submit on the 2nd form
|
br.select_form(nr=0) # finds submit on the 2nd form
|
||||||
didwelogin = br.submit().read() # submit it and read the return html
|
didwelogin = br.submit().read() # submit it and read the return html
|
||||||
if 'Welcome ' not in didwelogin: # did it login successfully? Is Username/password correct?
|
if 'Welcome ' not in didwelogin: # did it login successfully? Is Username/password correct?
|
||||||
raise Exception('Failed to login, are you sure your username and password are correct?')
|
raise Exception('Failed to login, are you sure your username and password are correct?')
|
||||||
#login is done
|
#login is done
|
||||||
|
Loading…
x
Reference in New Issue
Block a user