sync to John's branch

Tomasz Długosz 2012-03-16 22:48:26 +01:00
commit e69f33eed6
615 changed files with 327061 additions and 209429 deletions
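The files below are calibre news-download "recipes": small Python classes that subclass BasicNewsRecipe and declare what to fetch and how to clean it up. As a reading aid, here is a minimal sketch of the pattern the diffs in this commit repeat over and over (the class name, title and feed URL are placeholders, not taken from the commit):

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleRecipe(BasicNewsRecipe):
    title                 = u'Example Feed'  # shown in the calibre library
    __author__            = 'example'
    language              = 'en'
    oldest_article        = 7     # skip items older than 7 days
    max_articles_per_feed = 100
    no_stylesheets        = True  # drop the site's own CSS
    auto_cleanup          = True  # let calibre extract the main article body
    feeds = [(u'Example', u'http://example.com/rss.xml')]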

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,50 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
'''
www.almasryalyoum.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class TheDailyNewsEG(BasicNewsRecipe):
title = u'al-masry al-youm'
__author__ = 'Omm Mishmishah'
description = 'Independent News from Egypt'
masthead_url = 'http://www.almasryalyoum.com/sites/default/files/img/english_logo.png'
cover_url = 'http://www.almasryalyoum.com/sites/default/files/img/english_logo.png'
auto_cleanup = True
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = False
#delay = 1
use_embedded_content = False
encoding = 'utf8'
publisher = 'Independent News Egypt'
category = 'News, Egypt, World'
language = 'en_EG'
publication_type = 'newsportal'
# preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
#Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google)
preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': False
}
keep_only_tags = [dict(attrs={'class':['article section']})]
remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
'inline-content story left', 'inline-content map left contracted', 'published',
'story-map', 'statepromo', 'topics', ]})]
remove_attributes = ['width','height']
feeds = [(u'English News', u'http://www.almasryalyoum.com/en/rss_feed_term/113/rss.xml'),
(u'News Features', u'http://www.almasryalyoum.com/en/rss_feed_term/115/rss.xml'),
(u'Culture', u'http://www.almasryalyoum.com/en/rss_feed_term/133/rss.xml'),
(u'Cinema', u'http://www.almasryalyoum.com/en/rss_feed_term/134/rss.xml')
]

@@ -0,0 +1,18 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
from calibre.web.feeds.news import BasicNewsRecipe
class AlbertMohlersBlog(BasicNewsRecipe):
title = u'Albert Mohler\'s Blog'
__author__ = 'Peter Grungi'
language = 'en'
oldest_article = 90
max_articles_per_feed = 10
auto_cleanup = True
cover_url = 'http://www.albertmohler.com/wp-content/themes/albert-mohler-v5/img/logo-am-lg.gif'
publisher = 'Albert Mohler'
language = 'en'
author = 'Albert Mohler'
feeds = [(u'Albert Mohler\'s Blog', u'http://feeds.feedburner.com/AlbertMohlersBlog?format=xml')]

@@ -10,11 +10,11 @@ class Alternet(BasicNewsRecipe):
category = 'News, Magazine'
description = 'News magazine and online community'
feeds = [
(u'Front Page', u'http://feeds.feedblitz.com/alternet'),
(u'Breaking News', u'http://feeds.feedblitz.com/alternet_breaking_news'),
(u'Top Ten Campaigns', u'http://feeds.feedblitz.com/alternet_top_10_campaigns'),
(u'Special Coverage Areas', u'http://feeds.feedblitz.com/alternet_coverage')
]
(u'Front Page', u'http://feeds.feedblitz.com/alternet'),
(u'Breaking News', u'http://feeds.feedblitz.com/alternet_breaking_news'),
(u'Top Ten Campaigns', u'http://feeds.feedblitz.com/alternet_top_10_campaigns'),
(u'Special Coverage Areas', u'http://feeds.feedblitz.com/alternet_coverage')
]
remove_attributes = ['width', 'align','cellspacing']
remove_javascript = True
use_embedded_content = False
@@ -36,3 +36,5 @@ class Alternet(BasicNewsRecipe):
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name
conversion_options = {'linearize_tables': True}

@@ -11,7 +11,6 @@ class AssociatedPress(BasicNewsRecipe):
language = 'en'
no_stylesheets = True
max_articles_per_feed = 15
html2lrf_options = ['--force-page-break-before-tag="chapter"']
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in

@@ -0,0 +1,51 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.asianreviewofbooks.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class AsianReviewOfBooks(BasicNewsRecipe):
title = 'The Asian Review of Books'
__author__ = 'Darko Miletic'
description = 'In addition to reviewing books about or of relevance to Asia, the Asian Review of Books also features long-format essays by leading Asian writers and thinkers, providing an unparalleled forum for discussion of key contemporary issues by Asians for Asia and a vehicle of intellectual depth and breadth where leading thinkers can write on the books, arts and ideas of the day. Widely quoted and referenced, with an archive of more than one thousand book reviews, it is the only web resource dedicated to Asian books. And now, with the addition of the new premium content, the Asian Review of Books is a must-read publication.'
publisher = 'The Asian Review of Books'
category = 'literature, books, reviews, Asia'
oldest_article = 30
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
language = 'en_HK'
publication_type = 'magazine'
masthead_url = 'http://www.asianreviewofbooks.com/new/images/mob_arb.png'
extra_css = """
body{font-family: serif}
.big {font-size: xx-large}
.bold {font-weight: bold}
.italic {font-style: italic}
.small {font-size: small}
img {display: block}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [dict(name=['object','script','iframe','embed'])]
remove_attributes = ['style', 'onclick']
feeds = [(u'Articles' , u'http://www.asianreviewofbooks.com/new/rss.php')]
def print_version(self, url):
root, sep, artid = url.rpartition('?ID=')
return root + 'getarticle.php?articleID=' + artid + '&stats=web'
def preprocess_raw_html(self, raw, url):
return '<html><head><title>title</title></head><body>' + raw + '</body></html>'
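# Illustration of the print_version() rewrite above, with a made-up article ID
# (the URL shape is inferred from the code, not checked against the site):
#   url = 'http://www.asianreviewofbooks.com/new/?ID=1234'
#   root, sep, artid = url.rpartition('?ID=')
#   # root == 'http://www.asianreviewofbooks.com/new/', artid == '1234'
#   # result: 'http://www.asianreviewofbooks.com/new/getarticle.php?articleID=1234&stats=web'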

@@ -0,0 +1,16 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1327747616(BasicNewsRecipe):
title = u'Beppe Grillo'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Beppe Grillo', u'http://feeds.feedburner.com/beppegrillo/atom')]
description = 'Blog of the famous comedian and politician Beppe Grillo - v1.00 (28, January 2012)'
__author__ = 'faber1971'
language = 'it'

@@ -0,0 +1,44 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Birmingham post'
description = 'News for Birmingham UK'
timefmt = ''
__author__ = 'Dave Asbury'
cover_url = 'http://1.bp.blogspot.com/_GwWyq5eGw9M/S9BHPHxW55I/AAAAAAAAB6Q/iGCWl0egGzg/s320/Birmingham+post+Lite+front.JPG'
oldest_article = 1
max_articles_per_feed = 20
remove_empty_feeds = True
remove_javascript = True
auto_cleanup = True
language = 'en_GB'
masthead_url = 'http://www.pressgazette.co.uk/Pictures/web/t/c/g/birmingham_post.jpg'
keep_only_tags = [
#dict(name='h1',attrs={'id' : 'article-headline'}),
#dict(attrs={'class':['article-meta-author','article-meta-date','article main','art-o art-align-center otm-1 ']}),
#dict(name='p')
#dict(attrs={'id' : 'three-col'})
]
remove_tags = [
# dict(name='div',attrs={'class' : 'span-33 last header-links'})
]
feeds = [
#(u'News',u'http://www.birminghampost.net/news/rss.xml'),
(u'Local News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'),
(u'UK News', u'http://www.birminghampost.net/news/uk-news/rss.xml'),
(u'Sports',u'http://www.birminghampost.net/midlands-birmingham-sport/rss.xml'),
(u'Bloggs & Comments',u'http://www.birminghampost.net/comment/rss.xml')
]
extra_css = '''
body {font: medium sans-serif;}
h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
span{ font-size:9.5px; font-weight:bold;font-style:italic}
p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''

@@ -1,6 +1,6 @@
__license__ = 'GPL v3'
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
blic.rs
'''
@@ -73,7 +73,10 @@ class Blic(BasicNewsRecipe):
def print_version(self, url):
return url + '/print'
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup
def get_cover_url(self):
soup = self.index_to_soup('http://www.blic.rs/')
alink = soup.find('a', attrs={'id':'blic_naslovna_print'})
if alink:
return 'http://www.blic.rs' + alink['href']
return None

@@ -1,95 +0,0 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
borba.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Borba(BasicNewsRecipe):
title = 'Borba Online'
__author__ = 'Darko Miletic'
description = 'Dnevne novine Borba Online'
publisher = 'IP Novine Borba'
category = 'news, politics, Serbia'
language = 'sr'
lang = _('sr-Latn-RS')
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
cover_url = 'http://www.borba.rs/images/stories/novine/naslovna_v.jpg'
INDEX = u'http://www.borba.rs/'
extra_css = ' @font-face {font-family: "serif1"; src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .contentheading{font-size: x-large; font-weight: bold} .createdate{font-size: small; font-weight: bold} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'class':'main'})]
remove_tags_after = dict(name='div',attrs={'id':'written_comments_title'})
remove_tags = [
dict(name=['object','link','iframe','base','img'])
,dict(name='div',attrs={'id':'written_comments_title'})
]
feeds = [
(u'Najnovije vesti', u'http://www.borba.rs/content/blogsection/28/105/')
,(u'Prvi plan' , u'http://www.borba.rs/content/blogsection/4/92/' )
,(u'Dogadjaji' , u'http://www.borba.rs/content/blogsection/21/83/' )
,(u'Ekonomija' , u'http://www.borba.rs/content/blogsection/5/35/' )
,(u'Komentari' , u'http://www.borba.rs/content/blogsection/23/94/' )
,(u'Svet' , u'http://www.borba.rs/content/blogsection/7/36/' )
,(u'Sport' , u'http://www.borba.rs/content/blogsection/6/37/' )
,(u'Fama' , u'http://www.borba.rs/content/blogsection/25/89/' )
,(u'B2 Dodatak' , u'http://www.borba.rs/content/blogsection/30/116/')
]
def preprocess_html(self, soup):
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
return soup
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('a', attrs={'class':'contentpagetitle'}):
url = item['href']
title = self.tag_to_string(item)
articles.append({
'title' :title
,'date' :''
,'url' :url
,'description':''
})
totalfeeds.append((feedtitle, articles))
return totalfeeds

@@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@@ -6,45 +7,76 @@ __license__ = 'GPL v3'
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Calgary Herald
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
title = u'Calgary Herald'
url_prefix = 'http://www.calgaryherald.com'
description = u'News from Calgary, AB'
fp_tag = 'CAN_CH'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
__author__ = 'Nick Redding'
encoding = 'latin1'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
@@ -64,14 +96,80 @@ class CanWestPaper(BasicNewsRecipe):
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
#delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
def get_cover_url(self):
from datetime import timedelta, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover
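# Worked example of the Newseum cover URL built above (values illustrative):
# with fp_tag = 'CAN_CH' on the 16th of the month the first attempt is
#   http://webmedia.newseum.org/newseum-multimedia/dfp/jpg16/lg/CAN_CH.jpg
# and on failure the except branch walks back one day at a time, giving up
# after a week.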
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","‘",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","’",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","–",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","—",fixed)
fixed = re.sub("&#x2019;","’",fixed)
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&amp;' with '&'
massaged = re.sub("&amp;","&", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):
if first:
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
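# Note on strip_anchors() above: text links are unwrapped in place
# ('<a href="...">some text</a>' becomes 'some text') while anchors that
# wrap an image (a.img is not None) are left untouched.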
def preprocess_html(self, soup):
return self.strip_anchors(soup)
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
@@ -98,9 +196,7 @@ class CanWestPaper(BasicNewsRecipe):
atag = h1tag.find('a',href=True)
if not atag:
continue
url = atag['href']
if not url.startswith('http:'):
url = self.url_prefix+'/news/todays-paper/'+atag['href']
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)

@@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1328971305(BasicNewsRecipe):
title = u'Catholic Daily Readings'
language = 'en'
__author__ = 'adoucette'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Daily Readings - USCCB', u'http://www.usccb.org/bible/readings/rss/'), (u'Daily Reflection - One Bread One Body', u'http://www.presentationministries.com/general/rss.asp'), (u'Mass Readings - Universalis', u'http://www.universalis.com/atommass3.xml'), (u'Saint Of The Day - CNA', u'http://feeds.feedburner.com/catholicnewsagency/saintoftheday')]

@@ -77,8 +77,18 @@ class ChicagoTribune(BasicNewsRecipe):
def get_article_url(self, article):
print article.get('feedburner_origlink', article.get('guid', article.get('link')))
return article.get('feedburner_origlink', article.get('guid', article.get('link')))
url = article.get('feedburner_origlink', article.get('guid', article.get('link')))
if url.endswith('?track=rss'):
url = url.partition('?')[0]
return url
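# Effect of the cleanup above (URL is illustrative, not from the feed):
# 'http://www.chicagotribune.com/news/local/foo.story?track=rss'
#     -> 'http://www.chicagotribune.com/news/local/foo.story'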
def skip_ad_pages(self, soup):
text = soup.find(text='click here to continue to article')
if text:
a = text.parent
url = a.get('href')
if url:
return self.index_to_soup(url, raw=True)
def postprocess_html(self, soup, first_fetch):
# Remove the navigation bar. It was kept until now to be able to follow

@@ -1,38 +1,89 @@
#!/usr/bin/env python
##
## Title: Common Dreams
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Feb 2012: Cleaned up the output to have only the main article
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
'''
commondreams.org
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class CommonDreams(BasicNewsRecipe):
# Identify the recipe
title = u'Common Dreams'
description = u'Progressive news and views'
description = u'Breaking News & Views for the Progressive Community.'
cover_url = 'https://s3.amazonaws.com/s3.commondreams.org/images/common-dreams.png'
__author__ = u'XanthanGum'
language = 'en'
# Format the text
extra_css = '''
body{font-family:verdana,arial,helvetica,geneva,sans-serif ;}
h1{font-size: xx-large;}
h2{font-size: large;}
'''
# Pick no article older than seven days and limit the number of articles per feed to 100
oldest_article = 7
max_articles_per_feed = 100
# Remove everything before the article
no_stylesheets = True
remove_javascript = True
remove_tags_before = dict(name = 'div', attrs = {'id':'node-header'})
# Flattens all the tables to make it compatible with Nook
conversion_options = {'linearize_tables' : True}
# Remove everything after the article
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
# Specify extra CSS - overrides ALL other CSS (i.e. added last).
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
.introduction, .first { font-weight: bold; } \
.cross-head { font-weight: bold; font-size: 125%; } \
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
font-size: 80%; font-style: italic; margin: 1px auto; } \
.story-date, .published { font-size: 80%; } \
table { width: 100%; } \
td img { display: block; margin: 5px auto; } \
ul { padding-top: 10px; } \
ol { padding-top: 10px; } \
li { padding-top: 5px; padding-bottom: 5px; } \
h1 { font-size: 175%; font-weight: bold; } \
h2 { font-size: 150%; font-weight: bold; } \
h3 { font-size: 125%; font-weight: bold; } \
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
# Remove the line breaks and float left/right and picture width/height.
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'float:.*?'), lambda m: ''),
(re.compile(r'width:.*?px'), lambda m: ''),
(re.compile(r'height:.*?px'), lambda m: ''),
(re.compile(r'<a.*?>'), lambda m: ''),
(re.compile(r'</a>'), lambda m: ''),
]
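# Net effect of the two <a> regexps above (markup is illustrative):
# '<a href="/story/123">Read the full story</a>' -> 'Read the full story'
# i.e. anchor tags are stripped but their link text is preserved.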
# Main article is inside this tag
keep_only_tags = [
dict(name='div', attrs={'id':lambda x: x and 'node-' in x}),
]
remove_tags = [
dict(name='div', attrs={'class':'node-links clear-block'}), # remove Share options
]
remove_tags_after = dict(name = 'div', attrs = {'class':'copyright-info'})
# Identify the news feeds
feeds = [(u'Headlines', u'http://www.commondreams.org/feed/headlines_rss'),
(u'Further News Articles', u'http://www.commondreams.org/feed/further_rss'),
(u'Views', u'http://www.commondreams.org/feed/views_rss'),
(u'Progressive Newswire', u'http://www.commondreams.org/feed/newswire_rss')]
feeds = [(u'Headlines', u'https://www.commondreams.org/feed/headlines_rss'),
(u'Further News Articles', u'https://www.commondreams.org/feed/further_rss'),
(u'Views', u'https://www.commondreams.org/feed/views_rss'),
(u'Progressive Newswire', u'https://www.commondreams.org/feed/newswire_rss')]
def print_version(self, url):
url = url + '?print'
return url

@@ -0,0 +1,71 @@
#!/usr/bin/env python
##
## Title: Consortium News
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Feb 2012: Initial release
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
'''
consortiumnews.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class ConsortiumNews(BasicNewsRecipe):
title = u'Consortium News'
publisher = 'Copyright © 2012 Consortiumnews. All Rights Reserved.'
language = 'en'
__author__ = 'kiavash'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
remove_javascript = True
conversion_options = {'linearize_tables' : True} # Flattens all the tables to make it compatible with Nook
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
# Specify extra CSS - overrides ALL other CSS (i.e. added last).
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
.introduction, .first { font-weight: bold; } \
.cross-head { font-weight: bold; font-size: 125%; } \
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
font-size: 80%; font-style: italic; margin: 1px auto; } \
.story-date, .published { font-size: 80%; } \
table { width: 100%; } \
td img { display: block; margin: 5px auto; } \
ul { padding-top: 10px; } \
ol { padding-top: 10px; } \
li { padding-top: 5px; padding-bottom: 5px; } \
h1 { font-size: 175%; font-weight: bold; } \
h2 { font-size: 150%; font-weight: bold; } \
h3 { font-size: 125%; font-weight: bold; } \
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
# Remove the line breaks and float left/right and picture width/height.
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'float:.*?'), lambda m: ''),
(re.compile(r'width:.*?px'), lambda m: ''),
(re.compile(r'height:.*?px'), lambda m: ''),
(re.compile(r'<a.*?>'), lambda m: ''),
(re.compile(r'</a>'), lambda m: ''),
]
# Main article is inside this tag
keep_only_tags = [dict(name='div', attrs={'id':lambda x: x and 'post-' in x})]
remove_tags = [
dict(name='div', attrs={'class':'sociable'}), # remove 'Share this Article'
dict(name='p', attrs={'class':'tags'}), # remove 'Tags: ... '
]
feeds = [(u'Consortium News', u'http://feeds.feedburner.com/Consortiumnewscom')]

@@ -7,6 +7,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
description = 'Fashion, beauty and Gossip for women from COSMOPOLITAN -UK'
__author__ = 'Dave Asbury'
#last update 21/12/11
# greyscale code by Starson
cover_url = 'http://www.cosmopolitan.magazine.co.uk/files/4613/2085/8988/Cosmo_Cover3.jpg'
no_stylesheets = True
@@ -31,8 +32,9 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
dict(name='div',attrs={'class' : ['blogInfo','viral_toolbar','comment_number','prevEntry nav']}),
dict(name='div',attrs={'class' : 'blog_module_about_the_authors'}),
dict(attrs={'id': ['breadcrumbs','comment','related_links_list','right_rail','content_sec_fb_more','content_sec_mostpopularstories','content-sec_fb_frame_viewfb_bot']}),
dict(attrs={'class' : ['read_liked_that_header','fb_back_next_area']})
]
dict(attrs={'class' : ['read_liked_that_header','fb_back_next_area']}),
dict(name='li',attrs={'class' : 'thumb'})
]
feeds = [
(u'Love & Sex', u'http://www.cosmopolitan.co.uk/love-sex/rss/'), (u'Men', u'http://cosmopolitan.co.uk/men/rss/'), (u'Fashion', u'http://cosmopolitan.co.uk/fashion/rss/'), (u'Hair & Beauty', u'http://cosmopolitan.co.uk/beauty-hair/rss/'), (u'LifeStyle', u'http://cosmopolitan.co.uk/lifestyle/rss/'), (u'Cosmo On Campus', u'http://cosmopolitan.co.uk/campus/rss/'), (u'Celebrity Gossip', u'http://cosmopolitan.co.uk/celebrity-gossip/rss/')]
@@ -48,4 +50,3 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
img.type = "GrayscaleType"
img.save(iurl)
return soup

@@ -0,0 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'Countryfile.com'
cover_url = 'http://www.buysubscriptions.com/static_content/the-immediate/en/images/covers/CFIL_maxi.jpg'
__author__ = 'Dave Asbury'
description = 'The official website of Countryfile Magazine'
# last updated 29/1/12
language = 'en_GB'
oldest_article = 30
max_articles_per_feed = 25
remove_empty_feeds = True
no_stylesheets = True
auto_cleanup = True
#articles_are_obfuscated = True
remove_tags = [
# dict(attrs={'class' : ['player']}),
]
feeds = [
(u'Homepage', u'http://www.countryfile.com/rss/home'),
(u'Country News', u'http://www.countryfile.com/rss/news'),
(u'Countryside', u'http://www.countryfile.com/rss/countryside'),
]

@@ -5,7 +5,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
description = 'News as provided by The Daily Mirror -UK'
__author__ = 'Dave Asbury'
# last updated 30/10/11
# last updated 11/2/12
language = 'en_GB'
cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
@@ -13,45 +13,65 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
oldest_article = 2
max_articles_per_feed = 30
oldest_article = 1
max_articles_per_feed = 5
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
extra_css = '''
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''
auto_cleanup = True
#conversion_options = { 'linearize_tables' : True }
keep_only_tags = [
dict(name='div',attrs={'id' : 'body-content'})
]
remove_tags_after = [dict (name='div',attrs={'class' : 'related'})]
#keep_only_tags = [
# dict(name='h1'),
# dict(name='div',attrs={'id' : 'body-content'}),
#dict(name='div',atts={'class' : 'article-body'}),
#dict(attrs={'class' : ['article-attr','byline append-1','published']}),
#dict(name='p'),
# ]
#remove_tags_after = [dict (name='div',attrs={'class' : 'related'})]
remove_tags = [
dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}),
dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}),
dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}),
dict(name='div',attrs={'class' : 'span-12 last sl-others addthis_toolbox addthis_default_style'})
dict(name='title'),
dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),
# dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}),
#dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}),
#dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}),
]
# preprocess_regexps = [
#(re.compile(r'<dl class="q-search">.*?</dl>', re.IGNORECASE | re.DOTALL), lambda match: '')]
preprocess_regexps = [
(re.compile(r'<dl class="q-search">.*?</dl>', re.IGNORECASE | re.DOTALL), lambda match: '')]
(re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')]
preprocess_regexps = [
(re.compile(r'Advertisement >>', re.IGNORECASE | re.DOTALL), lambda match: '')]
#preprocess_regexps = [
#(re.compile(r'Sponsored Links', re.IGNORECASE | re.DOTALL), lambda match: '')]
feeds = [
(u'News', u'http://www.mirror.co.uk/news/rss.xml')
,(u'Tech News', u'http://www.mirror.co.uk/news/technology/rss.xml')
,(u'Weird World','http://www.mirror.co.uk/news/weird-world/rss.xml')
,(u'Film Gossip','http://www.mirror.co.uk/celebs/film/rss.xml')
,(u'Music News','http://www.mirror.co.uk/celebs/music/rss.xml')
,(u'Celebs and Tv Gossip','http://www.mirror.co.uk/celebs/tv/rss.xml')
,(u'Sport','http://www.mirror.co.uk/sport/rss.xml')
,(u'Life Style','http://www.mirror.co.uk/life-style/rss.xml')
,(u'Advice','http://www.mirror.co.uk/advice/rss.xml')
,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
(u'UK News', u'http://feed43.com/0287771688643868.xml')
,(u'Tech News', u'http://feed43.com/2455520588350501.xml')
,(u'Weird World','http://feed43.com/0863800333634654.xml')
,(u'Sport','http://feed43.com/7713243036546130.xml')
,(u'Sport : Boxing ','http://feed43.com/0414732220804255.xml')
,(u'Sport : Rugby Union','http://feed43.com/4710138762362383.xml')
,(u'Sport : Other','http://feed43.com/4501416886323415.xml')
,(u'TV and Film','http://feed43.com/5238302853765104.xml')
,(u'Celebs','http://feed43.com/8770061048844683.xml')
,(u'Life Style : Family','http://feed43.com/4356170742410338.xml')
,(u'Travel','http://feed43.com/1436576006476607.xml')
# example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
]
extra_css = '''
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
h1{ font-size:18px;}
img { display:block}
'''

@@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1324913694(BasicNewsRecipe):
title = u'Derin Dusunce'
language = 'tr'
__author__ = 'asalet_r'
oldest_article = 7
max_articles_per_feed = 20
auto_cleanup = True
feeds = [(u'Derin D\xfc\u015f\xfcnce', u'http://www.derindusunce.org/feed/')]

@@ -0,0 +1,21 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
from calibre.web.feeds.news import BasicNewsRecipe
class DesiringGodEnglish(BasicNewsRecipe):
title = u'Desiring God'
__author__ = 'Peter Grungi'
language = 'en'
cover_url = 'http://cdn0.desiringgod.org/images/layout/breadcrumbs_dg_mark.png'
masthead_url = 'http://cdn0.desiringgod.org/images/layout/breadcrumbs_dg_mark.png'
language = 'en'
oldest_article = 7
max_articles_per_feed = 50
auto_cleanup = True
publisher = 'Desiring God Ministries'
author = 'Desiring God Ministries'
feeds = [(u'Desiring God Blog', u'http://feeds.feedburner.com/DGBlog?format=xml')]

@@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1324736687(BasicNewsRecipe):
title = u'D\xfcnya Bizim'
language = 'tr'
__author__ = 'asalet_r'
oldest_article = 7
max_articles_per_feed = 10
auto_cleanup = True
feeds = [(u'Aktif \u0130mamlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=31'), (u'Ayr\u0131nt\u0131 Defteri', u'http://dunyabizim.com/servisler/rss.php?kategoriID=58'), (u'Baba Kitaplar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=4'), (u'Bu da Oldu', u'http://dunyabizim.com/servisler/rss.php?kategoriID=32'), (u'\xc7-al\u0131nt\u0131 Yaz\u0131lar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=33'), (u'Dar\xfclmedya', u'http://dunyabizim.com/servisler/rss.php?kategoriID=49'), (u'Gidenler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=59'), (u'G\xfczel Mekanlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=43'), (u'\u0130yi Haberler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=18'), (u'\u0130yi M\xfczikler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=2'), (u'Kalite Dergiler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=3'), (u'Konu\u015fa Konu\u015fa', u'http://dunyabizim.com/servisler/rss.php?kategoriID=24'), (u'M\xfcstesta G\xfczeller', u'http://dunyabizim.com/servisler/rss.php?kategoriID=65'), (u'O \u015eimdi Nerede?', u'http://dunyabizim.com/servisler/rss.php?kategoriID=52'), (u'Olsa Ke\u015fke', u'http://dunyabizim.com/servisler/rss.php?kategoriID=34'), (u'Orada Ne Oldu?', u'http://dunyabizim.com/servisler/rss.php?kategoriID=38'), (u'\xd6nemli Adamlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=1'), (u'Polemik', u'http://dunyabizim.com/servisler/rss.php?kategoriID=39'), (u'Sinema', u'http://dunyabizim.com/servisler/rss.php?kategoriID=23'), (u'Yalan Haber', u'http://dunyabizim.com/servisler/rss.php?kategoriID=40'), (u'Yeni \u015eeyler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=57'), (u'Zekeriya Sofras\u0131', u'http://dunyabizim.com/servisler/rss.php?kategoriID=60')]

@@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1321194347(BasicNewsRecipe):
title = u'D\xfcnya B\xfclteni'
language = 'tr'
__author__ = 'asalet_r'
oldest_article = 7
max_articles_per_feed = 50
auto_cleanup = True
feeds = [(u'Tarih Dosyas\u0131', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=157'), (u'R\xf6portaj', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=153'), (u'Makale-Yorum', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=174'), (u'K\xfclt\xfcr-Sanat', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=66'), (u'Hayat\u0131n \u0130\xe7inden', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=200'), (u'Haber Analiz', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=123'), (u'Gezi-\u0130zlenim', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=90'), (u'Aile Sa\u011fl\u0131k E\u011fitim', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=75')]

@@ -0,0 +1,46 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
'''
Fetch echo-online.de
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class Echo_Online(BasicNewsRecipe):
title = u'Echo Online' # 2011-12-28 AGe
description = '-Echo Online-'
publisher = 'Echo Online GmbH'
category = 'News, Germany'
__author__ = 'Armin Geller' # 2011-12-28 AGe
language = 'de'
lang = 'de-DE'
encoding = 'iso-8859-1'
timefmt = ' [%a, %d %b %Y]'
oldest_article = 7
max_articles_per_feed = 50 # 2011-12-28 AGe
no_stylesheets = True
auto_cleanup = True
remove_javascript = True
feeds = [
(u'Topnews', u'http://www.echo-online.de/storage/rss/rss/topnews.xml'),
(u'Darmstadt', u'http://www.echo-online.de/rss/darmstadt.xml'),
(u'Darmstadt-Dieburg', u'http://www.echo-online.de/rss/darmstadtdieburg.xml'),
(u'Kreis Gro\xdf-Gerau', u'http://www.echo-online.de/rss/kreisgrossgerau.xml'),
(u'R\xfcsselsheim', u'http://www.echo-online.de/rss/ruesselsheim.xml'),
(u'Kreis Bergstra\xdfe', u'http://www.echo-online.de/rss/bergstrasse.xml'),
(u'Odenwaldkreis', u'http://www.echo-online.de/rss/odenwald.xml'),
(u'SV 98', u'http://www.echo-online.de/rss/sv98.xml'),
(u'Kino', u'http://www.echo-online.de/rss/kino.xml'),
(u'Ausstellungen', u'http://www.echo-online.de/rss/ausstellungen.xml'),
(u'Ausflug & Reise', u'http://www.echo-online.de/rss/ausflugreise.xml'),
]
def print_version(self, url):
return self.browser.open_novisit(url).geturl() + '?_FRAME=33&_FORMAT=PRINT'
remove_tags = [dict(name='div', attrs={'class':["header", "name"]}),]
auto_cleanup_keep = '//div[@class="bild_gross w270"]'
cover_url = 'http://adcounter.darmstaedter-echo.de/webdav/files/config/gui/images/Zeitungsfaecher.gif'

@@ -0,0 +1,50 @@
__license__ = 'GPL v3'
__copyright__ = '2012 Levien van Zon <levien@zonnetjes.net>'
'''
Fetch Edge.org conversations
'''
from calibre.web.feeds.news import BasicNewsRecipe
class EdgeConversationRSS(BasicNewsRecipe):
title = u'Edge.org Conversations'
__author__ = 'levien'
language = 'en'
description = '''Edge.org offers "open-minded, free ranging, intellectually
playful ... an unadorned pleasure in curiosity, a collective expression of
wonder at the living and inanimate world ... an ongoing and thrilling
colloquium.'''
oldest_article = 60
max_articles_per_feed = 100
no_stylesheets = True
keep_only_tags = [dict(name='div', attrs={'class':'HomeLeftPannel IMGCTRL'}) ]
remove_tags = [
dict(name='div',attrs={'class':'Logo'})
]
feeds = [(u'Edge RSS', u'http://edge.org/feeds/')]
def print_version(self, url):
return url.replace('conversation/', 'conversation.php?cid=')
def parse_feeds(self):
# Call parent's method.
feeds = BasicNewsRecipe.parse_feeds(self)
# Loop through all feeds.
for feed in feeds:
# Loop through all articles in feed.
for article in feed.articles[:]:
# Remove anything that is not a conversation, and remove PDF files as well...
if not ('CONVERSATION' in article.title):
feed.articles.remove(article)
elif 'pdf' in article.url:
feed.articles.remove(article)
return feeds
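# Note: the loops above iterate over a copy of the list (feed.articles[:])
# so that feed.articles.remove(...) is safe while iterating.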

@@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@@ -6,45 +7,72 @@ __license__ = 'GPL v3'
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Edmonton Journal
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
title = u'Edmonton Journal'
url_prefix = 'http://www.edmontonjournal.com'
description = u'News from Edmonton, AB'
fp_tag = 'CAN_EJ'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
@@ -68,14 +96,80 @@ class CanWestPaper(BasicNewsRecipe):
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
#delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
def get_cover_url(self):
from datetime import timedelta, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","‘",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","’",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","–",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","—",fixed)
fixed = re.sub("&#x2019;","’",fixed)
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&amp;' with '&'
massaged = re.sub("&amp;","&", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):
if first:
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
def preprocess_html(self, soup):
return self.strip_anchors(soup)
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

@@ -0,0 +1,58 @@
################################################################################
#Description: http://es.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2012.01.20. - V1.2
################################################################################
from calibre.web.feeds.recipes import BasicNewsRecipe
class elet_es_irodalom(BasicNewsRecipe):
title = u'\u00c9let \u00e9s Irodalom'
__author__ = 'Bigpapa'
oldest_article = 7
max_articles_per_feed = 30 # Maximum number of articles per feed stored in the generated e-book.
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'iso-8859-2'
category = 'Cikkek'
language = 'hu'
publication_type = 'newsportal'
extra_css = '.doc_title { font: bold 30px } .doc_author {font: bold 14px} '
needs_subscription = 'optional'
masthead_url = 'http://www.es.hu/images/logo.jpg'
timefmt = ' [%Y %b %d, %a]'
# Do not put your login details here in the code; you supply them when you actually download the recipe.
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.es.hu/')
br.select_form(name='userfrmlogin')
br['cusername'] = self.username
br['cpassword'] = self.password
br.submit()
return br
keep_only_tags = [
dict(name='div', attrs={'class':['doc_author', 'doc_title', 'doc']})
]
remove_tags = [
dict(name='a', attrs={'target':['_TOP']}),
dict(name='div', attrs={'style':['float: right; margin-left: 5px; margin-bottom: 5px;', 'float: right; margin-left: 5px; margin-bottom: 5px;']}),
]
feeds = [
(u'Publicisztika', 'http://www.feed43.com/4684235031168504.xml'),
(u'Interj\xfa', 'http://www.feed43.com/4032465460040618.xml'),
(u'Visszhang', 'http://www.feed43.com/3727375706873086.xml'),
(u'P\xe1ratlan oldal', 'http://www.feed43.com/2525784782475057.xml'),
(u'Feuilleton', 'http://www.feed43.com/7216025082703073.xml'),
(u'Pr\xf3za', 'http://www.feed43.com/8760248802326384.xml'),
(u'Vers', 'http://www.feed43.com/1737324675134275.xml'),
(u'K\xf6nyvkritika', 'http://www.feed43.com/1281156550717082.xml'),
(u'M\u0171b\xedr\xe1lat', 'http://www.feed43.com/1851854623681044.xml')
]
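# Usage note (an assumption about calibre's command line, not part of this
# recipe): credentials for a needs_subscription recipe are supplied when the
# download is run, e.g.
#   ebook-convert elet_es_irodalom.recipe out.epub --username USER --password PASS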

@@ -20,7 +20,7 @@ class ESPN(BasicNewsRecipe):
use_embedded_content = False
remove_javascript = True
needs_subscription = True
needs_subscription = 'optional'
encoding= 'ISO-8859-1'
remove_tags_before = dict(name='font', attrs={'class':'date'})
@@ -75,32 +75,30 @@ class ESPN(BasicNewsRecipe):
return soup
def get_browser(self):
br = BasicNewsRecipe.get_browser()
br.set_handle_refresh(False)
url = ('https://r.espn.go.com/members/v3_1/login')
raw = br.open(url).read()
raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw)
with TemporaryFile(suffix='.htm') as fname:
with open(fname, 'wb') as f:
f.write(raw)
br.open_local_file(fname)
if self.username and self.password:
br.set_handle_refresh(False)
url = ('https://r.espn.go.com/members/v3_1/login')
raw = br.open(url).read()
raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw)
with TemporaryFile(suffix='.htm') as fname:
with open(fname, 'wb') as f:
f.write(raw)
br.open_local_file(fname)
br.form = br.forms().next()
br.form.find_control(name='username', type='text').value = self.username
br.form['password'] = self.password
br.submit().read()
br.open('http://espn.go.com').read()
br.set_handle_refresh(True)
br.form = br.forms().next()
br.form.find_control(name='username', type='text').value = self.username
br.form['password'] = self.password
br.submit().read()
br.open('http://espn.go.com').read()
br.set_handle_refresh(True)
return br
def get_article_url(self, article):
return article.get('guid', None)
def print_version(self, url):
if 'eticket' in url:
return url.partition('&')[0].replace('story?', 'print?')
match = re.search(r'story\?(id=\d+)', url)

recipes/fhm_uk.recipe
@@ -0,0 +1,34 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'FHM UK'
description = 'Good News for Men'
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
__author__ = 'Dave Asbury'
# last updated 27/1/12
language = 'en_GB'
oldest_article = 28
max_articles_per_feed = 12
remove_empty_feeds = True
no_stylesheets = True
#auto_cleanup = True
#articles_are_obfuscated = True
keep_only_tags = [
dict(name='h1'),
dict(name='img',attrs={'id' : 'ctl00_Body_imgMainImage'}),
dict(name='div',attrs={'id' : ['articleLeft']}),
dict(name='div',attrs={'class' : ['imagesCenterArticle','containerCenterArticle','articleBody']}),
]
#remove_tags = [
#dict(attrs={'class' : ['player']}),
#]
feeds = [
(u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
(u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
(u'Gaming',u'http://feed43.com/0755006465351035.xml'),
]

@@ -3,10 +3,17 @@ import re
from calibre.ptempfile import PersistentTemporaryFile
class ForeignAffairsRecipe(BasicNewsRecipe):
''' there are three modifications:
1) fetch issue cover
2) toggle ignore premium articles
3) extract proper section names, i.e. "Comments", "Essay"
by Chen Wei weichen302@gmx.com, 2012-02-05'''
__license__ = 'GPL v3'
__author__ = 'kwetal'
language = 'en'
version = 1
version = 1.01
title = u'Foreign Affairs (Subscription or (free) Registration)'
publisher = u'Council on Foreign Relations'
@@ -17,6 +24,9 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
remove_javascript = True
INDEX = 'http://www.foreignaffairs.com'
FRONTPAGE = 'http://www.foreignaffairs.com/magazine'
INCLUDE_PREMIUM = False
remove_tags = []
remove_tags.append(dict(name = 'base'))
@@ -37,6 +47,12 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
temp_files = []
articles_are_obfuscated = True
def get_cover_url(self):
soup = self.index_to_soup(self.FRONTPAGE)
div = soup.find('div', attrs={'class':'inthemag-issuebuy-cover'})
img_url = div.find('img')['src']
return self.INDEX + img_url
def get_obfuscated_article(self, url):
br = self.get_browser()
br.open(url)
@@ -50,57 +66,46 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
return self.temp_files[-1].name
def parse_index(self):
soup = self.index_to_soup('http://www.foreignaffairs.com/magazine')
articles = []
answer = []
content = soup.find('div', attrs = {'class': 'center-wrapper'})
if content:
for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
tag = div.find('div', attrs = {'class': 'views-field-title'})
if tag:
a = tag.find('a')
if a:
title = self.tag_to_string(a)
url = self.INDEX + a['href']
author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
tag = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
# If they ever fix their markup, this will break :-(
summary = self.tag_to_string(tag.findNextSibling('p'))
description = author + '<br/>' + summary
articles.append({'title': title, 'date': None, 'url': url, 'description': description})
else:
continue
else:
continue
answer.append(('Magazine', articles))
ul = content.find('ul')
if ul:
soup = self.index_to_soup(self.FRONTPAGE)
sec_start = soup.findAll('div', attrs={'class':'panel-separator'})
for sec in sec_start:
content = sec.nextSibling
if content:
section = self.tag_to_string(content.find('h2'))
articles = []
for li in ul.findAll('li'):
tag = li.find('div', attrs = {'class': 'views-field-title'})
if tag:
a = tag.find('a')
if a:
title = self.tag_to_string(a)
url = self.INDEX + a['href']
description = ''
tag = li.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
if tag:
description = self.tag_to_string(tag)
articles.append({'title': title, 'date': None, 'url': url, 'description': description})
else:
continue
tags = []
for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
tags.append(div)
for li in content.findAll('li'):
tags.append(li)
for div in tags:
title = url = description = author = None
if self.INCLUDE_PREMIUM:
found_premium = False
else:
continue
answer.append(('Letters to the Editor', articles))
found_premium = div.findAll('span', attrs={'class':
'premium-icon'})
if not found_premium:
tag = div.find('div', attrs={'class': 'views-field-title'})
if tag:
a = tag.find('a')
if a:
title = self.tag_to_string(a)
url = self.INDEX + a['href']
author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
tag_summary = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
description = self.tag_to_string(tag_summary)
articles.append({'title':title, 'date':None, 'url':url,
'description':description, 'author':author})
if articles:
answer.append((section, articles))
return answer
def preprocess_html(self, soup):

@@ -1,4 +1,3 @@
from calibre.web.feeds.news import BasicNewsRecipe
class GlasgowHerald(BasicNewsRecipe):
@@ -9,12 +8,16 @@ class GlasgowHerald(BasicNewsRecipe):
language = 'en_GB'
__author__ = 'Kovid Goyal'
use_embedded_content = False
keep_only_tags = [dict(attrs={'class':'article'})]
remove_tags = [
dict(id=['pic-nav']),
dict(attrs={'class':['comments-top']})
]
no_stylesheets = True
auto_cleanup = True
#keep_only_tags = [dict(attrs={'class':'article'})]
#remove_tags = [
#dict(id=['pic-nav']),
#dict(attrs={'class':['comments-top']})
#]
feeds = [
@@ -26,4 +29,3 @@ class GlasgowHerald(BasicNewsRecipe):
u'http://www.heraldscotland.com/cmlink/1.768',),
(u'Columnists', u'http://www.heraldscotland.com/cmlink/1.658574')]

recipes/goal.recipe
@@ -0,0 +1,13 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1325677767(BasicNewsRecipe):
title = u'Goal'
oldest_article = 1
language = 'it'
max_articles_per_feed = 100
auto_cleanup = True
remove_tags_after = [dict(id='article_content')]
feeds = [(u'Goal', u'http://www.goal.com/it/feeds/news?fmt=rss')]
__author__ = 'faber1971'
description = 'Sports news from Italy'

recipes/grantland.recipe
@@ -0,0 +1,76 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class GrantLand(BasicNewsRecipe):
title = u"Grantland"
description = 'Writings on Sports & Pop Culture'
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = True
# auto_cleanup is too aggressive sometimes and we end up with blank articles
auto_cleanup = False
timefmt = ' [%a, %d %b %Y]'
oldest_article = 90
cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg'
masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg'
INDEX = 'http://www.grantland.com'
CATEGORIES = [
# comment out second line if you don't want older articles
# (user friendly name, url suffix, max number of articles to load)
('Today in Grantland','',20),
('In Case You Missed It','incaseyoumissedit',35),
]
remove_tags = [
{'name':['style','aside','nav','footer','script']},
{'name':'h1','text':'Grantland'},
{'id':['header','col-right']},
{'class':['connect_widget']},
{'name':'section','class':re.compile(r'\b(ad|module)\b')},
]
preprocess_regexps = [
# remove blog banners
(re.compile(r'<a href="/blog/(?:(?!</a>).)+</a>', re.DOTALL|re.IGNORECASE), lambda m: ''),
]
def parse_index(self):
feeds = []
seen_urls = set([])
for category in self.CATEGORIES:
(cat_name, tag, max_articles) = category
self.log('Reading category:', cat_name)
articles = []
page = "%s/%s" % (self.INDEX, tag)
soup = self.index_to_soup(page)
main = soup.find('div',id='col-main')
if main is None:
main = soup
for tag in main.findAll('a', href=re.compile(r'(story|post)/_/id/\d+')):
url = tag['href']
if url in seen_urls:
continue
title = tag.string
# blank title probably means <a href=".."><img /></a>. skip
if not title:
continue
self.log('\tFound article:', title)
self.log('\t', url)
articles.append({'title':title,'url':url})
seen_urls.add(url)
if len(articles) >= max_articles:
break
if articles:
feeds.append((cat_name, articles))
return feeds

recipes/haksoz.recipe Normal file
View File

@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1324739199(BasicNewsRecipe):
title = u'Haks\xf6z'
oldest_article = 7
max_articles_per_feed = 20
auto_cleanup = True
language = 'tr'
__author__ = 'asalet_r'
feeds = [(u'Haks\xf6z', u'http://www.haksozhaber.net/rss/')]

View File

@ -0,0 +1,58 @@
from calibre.web.feeds.news import BasicNewsRecipe
'''
Hamilton Spectator Calibre Recipe
'''
class HamiltonSpectator(BasicNewsRecipe):
title = u'Hamilton Spectator'
oldest_article = 2
max_articles_per_feed = 100
auto_cleanup = True
__author__ = u'Eric Coolman'
publisher = u'thespec.com'
description = u'Ontario Canada Newspaper'
category = u'News, Ontario, Canada'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
language = 'en_CA'
encoding = 'utf-8'
feeds = [
(u'Top Stories',u'http://www.thespec.com/rss?query=/&assetType=Article'),
(u'All News',u'http://www.thespec.com/rss?query=/news&assetType=Article'),
(u'Local',u'http://www.thespec.com/rss?query=/local&assetType=Article'),
(u'Ontario',u'http://www.thespec.com/rss?query=/ontario&assetType=Article'),
(u'Canada',u'http://www.thespec.com/rss?query=/canada&assetType=Article'),
(u'World News',u'http://www.thespec.com/rss?query=/world&assetType=Article'),
(u'Business',u'http://www.thespec.com/rss?query=/business&assetType=Article'),
(u'Crime',u'http://www.thespec.com/rss?query=/crime&assetType=Article'),
(u'All Sports',u'http://www.thespec.com/rss?query=/sports&assetType=Article'),
(u'Ticats',u'http://www.thespec.com/rss?query=/sports/ticats&assetType=Article'),
(u'Bulldogs',u'http://www.thespec.com/rss?query=/sports/bulldogs&assetType=Article'),
(u'High School Sports',u'http://www.thespec.com/rss?query=/sports/highschools&assetType=Article'),
(u'Local Sports',u'http://www.thespec.com/rss?query=/sports/local&assetType=Article'),
(u'What\'s On',u'http://www.thespec.com/rss?query=/whatson&assetType=Article'),
(u'Arts and Entertainment',u'http://www.thespec.com/rss?query=/whatson/artsentertainment&assetType=Article'),
(u'Books',u'http://www.thespec.com/rss?query=/whatson/books&assetType=Article'),
(u'Movies',u'http://www.thespec.com/rss?query=/whatson/movies&assetType=Article'),
(u'Music',u'http://www.thespec.com/rss?query=/whatson/music&assetType=Article'),
(u'Restaurant Reviews',u'http://www.thespec.com/rss?query=/whatson/restaurants&assetType=Article'),
(u'Opinion',u'http://www.thespec.com/rss?query=/opinion&assetType=Article'),
(u'Opinion Columns',u'http://www.thespec.com/rss?query=/opinion/columns&assetType=Article'),
(u'Cartoons',u'http://www.thespec.com/rss?query=/opinion/cartoons&assetType=Article'),
(u'Letters',u'http://www.thespec.com/rss?query=/opinion/letters&assetType=Article'),
(u'Editorial',u'http://www.thespec.com/rss?query=/opinion/editorial&assetType=Article'),
(u'Community',u'http://www.thespec.com/rss?query=/community&assetType=Article'),
(u'Education',u'http://www.thespec.com/rss?query=/community/education&assetType=Article'),
(u'Faith',u'http://www.thespec.com/rss?query=/community/faith&assetType=Article'),
(u'Contests',u'http://www.thespec.com/rss?query=/community/contests&assetType=Article'),
(u'Living',u'http://www.thespec.com/rss?query=/living&assetType=Article'),
(u'Food',u'http://www.thespec.com/rss?query=/living/food&assetType=Article'),
(u'Health and Fitness',u'http://www.thespec.com/rss?query=/living/healthfitness&assetType=Article'),
(u'Your Home',u'http://www.thespec.com/rss?query=/living/home&assetType=Article'),
(u'Travel',u'http://www.thespec.com/rss?query=/living/travel&assetType=Article'),
(u'Family and Parenting',u'http://www.thespec.com/rss?query=/living/familyparenting&assetType=Article'),
(u'Style',u'http://www.thespec.com/rss?query=/living/style&assetType=Article')
]

View File

@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
'''
Fetch High Country News
'''
from calibre.web.feeds.news import BasicNewsRecipe
class HighCountryNews(BasicNewsRecipe):
title = u'High Country News'
description = u'News from the American West'
__author__ = 'Armin Geller' # 2012-01-31
publisher = 'High Country News'
timefmt = ' [%a, %d %b %Y]'
language = 'en-Us'
encoding = 'UTF-8'
publication_type = 'newspaper'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
auto_cleanup = True
remove_javascript = True
use_embedded_content = False
masthead_url = 'http://www.hcn.org/logo.jpg' # 2012-01-31 AGe add
cover_source = 'http://www.hcn.org' # 2012-01-31 AGe add
def get_cover_url(self): # 2012-01-31 AGe add
cover_source_soup = self.index_to_soup(self.cover_source)
preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'})
return preview_image_div.div.img['src']
feeds = [
(u'Most recent', u'http://feeds.feedburner.com/hcn/most-recent'),
(u'Current Issue', u'http://feeds.feedburner.com/hcn/current-issue'),
(u'Writers on the Range', u'http://feeds.feedburner.com/hcn/wotr'),
(u'High Country Views', u'http://feeds.feedburner.com/hcn/HighCountryViews'),
]
def print_version(self, url):
return url + '/print_view'

View File

@ -1,4 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe
import urllib, re
class HindustanTimes(BasicNewsRecipe):
title = u'Hindustan Times'
@ -26,4 +27,24 @@ class HindustanTimes(BasicNewsRecipe):
'http://feeds.hindustantimes.com/HT-Homepage-LifestyleNews'),
]
def get_article_url(self, article):
'''
HT uses a variant of the feedportal RSS ad display mechanism
'''
try:
s = article.summary
return urllib.unquote(
re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
except:
pass
url = BasicNewsRecipe.get_article_url(self, article)
res = self.browser.open_novisit(url)
url = res.geturl().split('/')[-2]
encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
'0D': '?', '0E': '-', '0N': '.com', '0L': 'http://', '0S':
'www.'}
for k, v in encoding.iteritems():
url = url.replace(k, v)
return url
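A minimal sketch of the slug decoding above, using a made-up feedsportal-style slug (the real slug comes from res.geturl(); the sample URL and its decoded form are illustrative only):
# Illustration only: decode a hypothetical feedsportal slug with the table above.
slug = '0L0Shindustantimes0N0Cnews0Carticle10Bhtml'
encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
            '0D': '?', '0E': '-', '0N': '.com', '0L': 'http://', '0S': 'www.'}
for k, v in encoding.iteritems():
    slug = slug.replace(k, v)
print slug  # -> http://www.hindustantimes.com/news/article1.html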

View File

@ -1,44 +1,58 @@
# -*- coding: utf-8 -*-
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
################################################################################
#Description: http://hvg.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2011.12.20. - V1.1
################################################################################
class HVG(BasicNewsRecipe):
title = 'HVG.HU'
__author__ = u'István Papp'
description = u'Friss hírek a HVG-től'
timefmt = ' [%Y. %b. %d., %a.]'
oldest_article = 4
language = 'hu'
from calibre.web.feeds.news import BasicNewsRecipe
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
publisher = 'HVG Online'
category = u'news, hírek, hvg'
extra_css = 'body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
remove_tags_before = dict(id='pg-content')
remove_javascript = True
remove_empty_feeds = True
class hvg(BasicNewsRecipe):
title = u'HVG'
__author__ = 'Bigpapa'
language = 'hu'
oldest_article = 5 # Maximum age, in days, of the oldest article to fetch.
max_articles_per_feed = 5 # Maximum number of articles stored per feed in the generated e-book.
no_stylesheets = True
encoding = 'utf8'
extra_css = ' h2 { font:bold 28px} '
feeds = [
(u'Itthon', u'http://hvg.hu/rss/itthon')
,(u'Világ', u'http://hvg.hu/rss/vilag')
,(u'Gazdaság', u'http://hvg.hu/rss/gazdasag')
,(u'IT | Tudomány', u'http://hvg.hu/rss/tudomany')
,(u'Panoráma', u'http://hvg.hu/rss/Panorama')
,(u'Karrier', u'http://hvg.hu/rss/karrier')
,(u'Gasztronómia', u'http://hvg.hu/rss/gasztronomia')
,(u'Helyi érték', u'http://hvg.hu/rss/helyiertek')
,(u'Kultúra', u'http://hvg.hu/rss/kultura')
,(u'Cégautó', u'http://hvg.hu/rss/cegauto')
,(u'Vállalkozó szellem', u'http://hvg.hu/rss/kkv')
,(u'Egészség', u'http://hvg.hu/rss/egeszseg')
,(u'Vélemény', u'http://hvg.hu/rss/velemeny')
,(u'Sport', u'http://hvg.hu/rss/sport')
]
remove_attributes = ['style','font', 'href']
def print_version(self, url):
return url.replace ('#rss', '/print')
keep_only_tags = [
dict(name='div', attrs={'id':['pg-content']})
]
remove_tags = [
dict(name='div', attrs={'class':['box articlemenu', 'bannergoogle468', 'boxcontainer left', 'boxcontainer', 'commentbox']}),
dict(name='table', attrs={'class':['banner2', 'monocle']}),
dict(name='div', attrs={'id':['connect_widget_4cf63ca849ddf4577922632', 'sharetip', 'upprev_box']}),
dict(name='div', attrs={'style':['float: right; margin-bottom: 5px;', 'display: none;']}),
dict(name='h3', attrs={'class':['hthree']}),
dict(name='ul', attrs={'class':['defaultul']}),
dict(name='form', attrs={'id':['commentForm']}),
dict(name='h6', attrs={'class':['hthree']}),
dict(name='h6', attrs={'class':['more2']}),
dict(name='img', attrs={'class':['framed']}),
dict(name='td', attrs={'class':['greyboxbody','embedvideobody','embedvideofooter','embedvideobottom']}),
]
feeds = [
# (u'\xd6sszes', 'http://hvg.hu/rss'),
(u'Itthon', 'http://hvg.hu/rss/itthon'),
(u'Vil\xe1g', 'http://hvg.hu/rss/vilag'),
(u'Gazdas\xe1g', 'http://hvg.hu/rss/gazdasag'),
(u'Tudom\xe1ny', 'http://hvg.hu/rss/tudomany'),
(u'Panor\xe1ma', 'http://hvg.hu/rss/panorama'),
(u'Karrier', 'http://hvg.hu/rss/karrier'),
(u'Gasztron\xf3mia', 'http://hvg.hu/rss/gasztronomia'),
(u'Helyi \xe9rt\xe9k', 'http://hvg.hu/rss/helyiertek'),
(u'Kult\xfara', 'http://hvg.hu/rss/kultura'),
(u'C\xe9gaut\xf3', 'http://hvg.hu/rss/cegauto'),
(u'V\xe1llalkoz\xf3 szellem', 'http://hvg.hu/rss/kkv'),
(u'Eg\xe9szs\xe9g', 'http://hvg.hu/rss/egeszseg'),
(u'V\xe9lem\xe9ny', 'http://hvg.hu/rss/velemeny'),
(u'Sport', 'http://hvg.hu/rss/sport')
]

(Binary recipe icon files changed; image data not shown. New icons added include recipes/icons/moneynews.png and recipes/icons/rionegro.png; several other icons were added, updated, or removed.)

View File

@ -0,0 +1,68 @@
# encoding: utf-8 -*-
__license__ = 'GPL v3'
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
__copyright__ = 'Josemi Liébana'
__version__ = 'v0.1'
__date__ = '5 January 2012'
'''
www.ideal.es
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Ideal(BasicNewsRecipe):
title = u'Ideal (Edición Almería)'
__author__ = u'Josemi Liébana'
description = u'Noticias de Almería y el resto del mundo'
publisher = 'Ideal'
category = u'News, Politics, Spain, Almería'
publication_type = 'Newspaper'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
language = 'es'
remove_empty_feeds = True
masthead_url = u'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
cover_url = u'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
extra_css = u' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [
dict(attrs={'id':'title'})
,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
]
remove_tags = [dict(name='ul')]
remove_attributes = ['width','height']
feeds = [
(u'Última Hora' , u'http://www.ideal.es/almeria/rss/feeds/ultima.xml' )
,(u'Portada' , u'http://www.ideal.es/almeria/portada.xml' )
,(u'Local' , u'http://www.ideal.es/almeria/rss/feeds/granada.xml' )
,(u'Deportes' , u'http://www.ideal.es/almeria/rss/feeds/deportes.xml' )
,(u'Sociedad' , u'http://www.ideal.es/almeria/rss/feeds/sociedad.xml' )
,(u'Cultura' , u'http://www.ideal.es/almeria/rss/feeds/cultura.xml' )
,(u'Economía' , u'http://www.ideal.es/almeria/rss/feeds/economia.xml' )
,(u'Costa' , u'http://www.ideal.es/almeria/rss/feeds/costa.xml' )
,(u'Puerta Purchena' , u'http://www.ideal.es/almeria/rss/feeds/puerta_purchena.xml' )
,(u'Andalucía' , u'http://www.ideal.es/almeria/rss/feeds/andalucia.xml' )
,(u'España' , u'http://www.ideal.es/almeria/rss/feeds/espana.xml' )
,(u'Mundo' , u'http://www.ideal.es/almeria/rss/feeds/internacional.xml' )
,(u'Vivir' , u'http://www.ideal.es/almeria/rss/feeds/vivir.xml' )
,(u'Opinión' , u'http://www.ideal.es/almeria/rss/feeds/opinion.xml' )
,(u'Televisión' , u'http://www.ideal.es/almeria/rss/feeds/television.xml' )
,(u'Contraportada' , u'http://www.ideal.es/almeria/rss/feeds/contraportada.xml' )
]

View File

@ -0,0 +1,69 @@
# encoding: utf-8 -*-
__license__ = 'GPL v3'
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
__copyright__ = 'Josemi Liébana'
__version__ = 'v0.1'
__date__ = '5 January 2012'
'''
www.ideal.es
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Ideal(BasicNewsRecipe):
title = u'Ideal (Edición Granada)'
__author__ = u'Josemi Liébana'
description = u'Noticias de Granada y el resto del mundo'
publisher = 'Ideal'
category = 'News, Politics, Spain, Granada'
publication_type = 'Newspaper'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
language = 'es'
remove_empty_feeds = True
masthead_url = 'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
cover_url = 'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [
dict(attrs={'id':'title'})
,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
]
remove_tags = [dict(name='ul')]
remove_attributes = ['width','height']
feeds = [
(u'Última Hora' , u'http://www.ideal.es/granada/rss/feeds/ultima.xml' )
,(u'Portada' , u'http://www.ideal.es/granada/portada.xml' )
,(u'Local' , u'http://www.ideal.es/granada/rss/feeds/granada.xml' )
,(u'Deportes' , u'http://www.ideal.es/granada/rss/feeds/deportes.xml' )
,(u'Sociedad' , u'http://www.ideal.es/granada/rss/feeds/sociedad.xml' )
,(u'Cultura' , u'http://www.ideal.es/granada/rss/feeds/cultura.xml' )
,(u'Economía' , u'http://www.ideal.es/granada/rss/feeds/economia.xml' )
,(u'Costa' , u'http://www.ideal.es/granada/rss/feeds/costa.xml' )
,(u'La Carrera' , u'http://www.ideal.es/granada/rss/feeds/la_carrera.xml' )
,(u'Puerta Real' , u'http://www.ideal.es/granada/rss/feeds/puerta_real.xml' )
,(u'Andalucía' , u'http://www.ideal.es/granada/rss/feeds/andalucia.xml' )
,(u'España' , u'http://www.ideal.es/granada/rss/feeds/espana.xml' )
,(u'Mundo' , u'http://www.ideal.es/granada/rss/feeds/internacional.xml' )
,(u'Vivir' , u'http://www.ideal.es/granada/rss/feeds/vivir.xml' )
,(u'Opinión' , u'http://www.ideal.es/granada/rss/feeds/opinion.xml' )
,(u'Televisión' , u'http://www.ideal.es/granada/rss/feeds/television.xml' )
,(u'Contraportada' , u'http://www.ideal.es/granada/rss/feeds/contraportada.xml' )
]

recipes/ideal_jaen.recipe Normal file
View File

@ -0,0 +1,67 @@
# encoding: utf-8 -*-
__license__ = 'GPL v3'
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
__copyright__ = 'Josemi Liébana'
__version__ = 'v0.1'
__date__ = '5 January 2012'
'''
www.ideal.es
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Ideal(BasicNewsRecipe):
title = u'Ideal (Edición Jaén)'
__author__ = u'Josemi Liébana'
description = u'Noticias de Jaén y el resto del mundo'
publisher = 'Ideal'
category = u'News, Politics, Spain, Jaén'
publication_type = 'Newspaper'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
language = 'es'
remove_empty_feeds = True
masthead_url = 'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
cover_url = 'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [
dict(attrs={'id':'title'})
,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
]
remove_tags = [dict(name='ul')]
remove_attributes = ['width','height']
feeds = [
(u'Última Hora' , u'http://www.ideal.es/jaen/rss/feeds/ultima.xml' )
,(u'Portada' , u'http://www.ideal.es/jaen/portada.xml' )
,(u'Local' , u'http://www.ideal.es/jaen/rss/feeds/granada.xml' )
,(u'Deportes' , u'http://www.ideal.es/jaen/rss/feeds/deportes.xml' )
,(u'Sociedad' , u'http://www.ideal.es/jaen/rss/feeds/sociedad.xml' )
,(u'Cultura' , u'http://www.ideal.es/jaen/rss/feeds/cultura.xml' )
,(u'Economía' , u'http://www.ideal.es/jaen/rss/feeds/economia.xml' )
,(u'Costa' , u'http://www.ideal.es/jaen/rss/feeds/costa.xml' )
,(u'Andalucía' , u'http://www.ideal.es/jaen/rss/feeds/andalucia.xml' )
,(u'España' , u'http://www.ideal.es/jaen/rss/feeds/espana.xml' )
,(u'Mundo' , u'http://www.ideal.es/jaen/rss/feeds/internacional.xml' )
,(u'Vivir' , u'http://www.ideal.es/jaen/rss/feeds/vivir.xml' )
,(u'Opinión' , u'http://www.ideal.es/jaen/rss/feeds/opinion.xml' )
,(u'Televisión' , u'http://www.ideal.es/jaen/rss/feeds/television.xml' )
,(u'Contraportada' , u'http://www.ideal.es/jaen/rss/feeds/contraportada.xml' )
]

View File

@ -1,63 +1,30 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Derry FitzGerald'
'''
iht.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
class NYTimesGlobal(BasicNewsRecipe):
title = u'NY Times Global'
language = 'en'
__author__ = 'Krittika Goyal'
oldest_article = 1 #days
max_articles_per_feed = 25
use_embedded_content = False
class InternationalHeraldTribune(BasicNewsRecipe):
title = u'The International Herald Tribune'
__author__ = 'Derry FitzGerald'
language = 'en'
oldest_article = 1
max_articles_per_feed = 30
no_stylesheets = True
auto_cleanup = True
remove_tags = [dict(name='div', attrs={'class':['footer','header']}),
dict(name=['form'])]
preprocess_regexps = [
(re.compile(r'<!-- webtrends.*', re.DOTALL),
lambda m:'</body></html>')
]
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
remove_empty_feeds = True
feeds = [
(u'Frontpage', u'http://www.iht.com/rss/frontpage.xml'),
(u'Business', u'http://www.iht.com/rss/business.xml'),
(u'Americas', u'http://www.iht.com/rss/america.xml'),
(u'Europe', u'http://www.iht.com/rss/europe.xml'),
(u'Asia', u'http://www.iht.com/rss/asia.xml'),
(u'Africa and Middle East', u'http://www.iht.com/rss/africa.xml'),
(u'Opinion', u'http://www.iht.com/rss/opinion.xml'),
(u'Technology', u'http://www.iht.com/rss/technology.xml'),
(u'Health and Science', u'http://www.iht.com/rss/healthscience.xml'),
(u'Sports', u'http://www.iht.com/rss/sports.xml'),
(u'Culture', u'http://www.iht.com/rss/arts.xml'),
(u'Style and Design', u'http://www.iht.com/rss/style.xml'),
(u'Travel', u'http://www.iht.com/rss/travel.xml'),
(u'At Home Abroad', u'http://www.iht.com/rss/athome.xml'),
(u'Your Money', u'http://www.iht.com/rss/yourmoney.xml'),
(u'Properties', u'http://www.iht.com/rss/properties.xml')
]
temp_files = []
articles_are_obfuscated = True
masthead_url = 'http://graphics8.nytimes.com/images/misc/iht-masthead-logo.gif'
def get_obfuscated_article(self, url):
br = self.get_browser()
br.open(url)
response1 = br.follow_link(url_regex=re.compile(r'.*pagewanted=print.*'))
html = response1.read()
self.temp_files.append(PersistentTemporaryFile('_iht.html'))
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name
('NYTimes',
'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml'),
('NYTimes global',
'http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml'),
('World',
'http://www.nytimes.com/services/xml/rss/nyt/World.xml'),
('U.S.',
'http://www.nytimes.com/services/xml/rss/nyt/US.xml'),
('Business',
'http://feeds.nytimes.com/nyt/rss/Business'),
('Sports',
'http://www.nytimes.com/services/xml/rss/nyt/Sports.xml'),
('Technology',
'http://feeds.nytimes.com/nyt/rss/Technology'),
]

recipes/iktibas.recipe Normal file
View File

@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1324739406(BasicNewsRecipe):
title = u'\u0130ktibas'
language = 'tr'
__author__ = 'asalet_r'
oldest_article = 7
max_articles_per_feed = 20
auto_cleanup = True
feeds = [(u'\u0130ktibas', u'http://www.iktibasdergisi.com/rss/rss.xml')]

recipes/ilmanifesto.recipe Normal file
View File

@ -0,0 +1,110 @@
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
MANIFESTO_BASEURL = 'http://www.ilmanifesto.it/'
class IlManifesto(BasicNewsRecipe):
title = 'Il Manifesto'
__author__ = 'Giacomo Lacava'
description = 'quotidiano comunista - ultima edizione html disponibile'
publication_type = 'newspaper'
publisher = 'il manifesto coop. editrice a r.l.'
language = 'it'
oldest_article = 2
max_articles_per_feed = 100
delay = 1
no_stylesheets = True
simultaneous_downloads = 5
timeout = 30
auto_cleanup = True
remove_tags = [dict(name='div', attrs={'class':'column_1 float_left'})]
remove_tags_before = dict(name='div',attrs={'class':'column_2 float_right'})
remove_tags_after = dict(id='myPrintArea')
manifesto_index = None
manifesto_datestr = None
def _set_manifesto_index(self):
if self.manifesto_index == None:
startUrl = MANIFESTO_BASEURL + 'area-abbonati/in-edicola/'
startSoup = self.index_to_soup(startUrl)
lastEdition = startSoup.findAll('div',id='accordion_inedicola')[1].find('a')['href']
del(startSoup)
self.manifesto_index = MANIFESTO_BASEURL + lastEdition
urlsplit = lastEdition.split('/')
self.manifesto_datestr = urlsplit[-1]
if urlsplit[-1] == '':
self.manifesto_datestr = urlsplit[-2]
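# Illustration (hypothetical edition URL): if lastEdition were
# 'area-abbonati/in-edicola/20120316/', the trailing slash makes
# urlsplit[-1] == '', so manifesto_datestr falls back to urlsplit[-2],
# '20120316', and get_cover_url() below would build
# '.../fileadmin/archivi/in_edicola/20120316primapagina.gif'.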
def get_cover_url(self):
self._set_manifesto_index()
url = MANIFESTO_BASEURL + 'fileadmin/archivi/in_edicola/%sprimapagina.gif' % self.manifesto_datestr
return url
def parse_index(self):
self._set_manifesto_index()
soup = self.index_to_soup(self.manifesto_index)
feedLinks = soup.find('div',id='accordion_inedicola').findAll('a')
result = []
for feed in feedLinks:
articles = []
feedName = feed.find('h2').string
feedUrl = MANIFESTO_BASEURL + feed['href']
feedSoup = self.index_to_soup(feedUrl)
indexRoot = feedSoup.find('div',attrs={'class':'column1'})
for div in indexRoot.findAll('div',attrs={'class':'strumenti1_inedicola'}):
artLink = div.find('a')
if artLink is None: continue # empty div
title = artLink.string
url = MANIFESTO_BASEURL + artLink['href']
description = ''
descNode = div.find('div',attrs={'class':'text_12'})
if descNode is not None:
description = descNode.string
author = ''
authNode = div.find('div',attrs={'class':'firma'})
if authNode is not None:
author = authNode.string
articleText = ''
article = {
'title':title,
'url':url,
'date': strftime('%d %B %Y'),
'description': description,
'content': articleText,
'author': author
}
articles.append(article)
result.append((feedName,articles))
return result
def extract_readable_article(self, html, url):
bs = BeautifulSoup(html)
col1 = bs.find('div',attrs={'class':'column1'})
content = col1.find('div',attrs={'class':'bodytext'})
title = bs.find(id='titolo_articolo').string
author = col1.find('span',attrs={'class':'firma'})
subtitle = ''
subNode = col1.findPrevious('div',attrs={'class':'occhiello_rosso'})
if subNode is not None:
subtitle = subNode
summary = ''
sommNode = bs.find('div',attrs={'class':'sommario'})
if sommNode is not None:
summary = sommNode
template = "<html><head><title>%(title)s</title></head><body><h1>%(title)s</h1><h2>%(subtitle)s</h2><h3>%(author)s</h3><div style='font-size: x-large;'>%(summary)s</div><div>%(content)s</div></body></html>"
del(bs)
return template % dict(title=title,subtitle=subtitle,author=author,summary=summary,content=content)

View File

@ -1,16 +1,20 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1234144423(BasicNewsRecipe):
title = u'Indianapolis Star'
oldest_article = 5
language = 'en'
class IndianapolisStar(BasicNewsRecipe):
title = u'Indianapolis Star'
oldest_article = 10
auto_cleanup = True
language = 'en'
__author__ = 'Owen Kelly'
max_articles_per_feed = 100
cover_url = u'http://www2.indystar.com/frontpage/images/today.jpg'
feeds = [(u'Community Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LOCAL&template=rss'),
(u'News Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS&template=rss'),
(u'Business Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=BUSINESS&template=rss'),
(u'Politics and Government', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS05&template=rss'),
(u'Lifestyle Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LIVING&template=rss&mime=XML'),
(u'Opinion Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=OPINION&template=rss&mime=XML')
]
__author__ = 'Owen Kelly'
max_articles_per_feed = 100
cover_url = u'http://www2.indystar.com/frontpage/images/today.jpg'
feeds = [(u'Community Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LOCAL&template=rss&mime=XML'), (u'News Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS&template=rss&mime=XML'), (u'Business Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=BUSINESS&template=rss&mime=XML'), (u'Sports Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=SPORTS&template=rss&mime=XML'), (u'Lifestyle Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LIVING&template=rss&mime=XML'), (u'Opinion Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=OPINION&template=rss&mime=XML')]
def print_version(self, url):
return url + '&template=printart'
def print_version(self, url):
return url + '&template=printart'

View File

@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1324158549(BasicNewsRecipe):
title = u'izdiham.com'
language = 'tr'
__author__ = 'asalet_r'
oldest_article = 7
max_articles_per_feed = 20
auto_cleanup = True
feeds = [(u'\u0130zdiham', u'http://www.izdiham.com/index.php/feed')]

recipes/klip_me.recipe Normal file
View File

@ -0,0 +1,72 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1299694372(BasicNewsRecipe):
title = u'Klipme'
__author__ = 'Ken Sun'
publisher = 'Klip.me'
category = 'info, custom, Klip.me'
oldest_article = 365
max_articles_per_feed = 100
no_stylesheets = True
remove_javascript = True
remove_tags = [
dict(name='div', attrs={'id':'text_controls_toggle'})
,dict(name='script')
,dict(name='div', attrs={'id':'text_controls'})
,dict(name='div', attrs={'id':'editing_controls'})
,dict(name='div', attrs={'class':'bar bottom'})
]
use_embedded_content = False
needs_subscription = True
INDEX = u'http://www.klip.me'
LOGIN = INDEX + u'/fav/signin?callback=/fav'
feeds = [
(u'Klip.me unread', u'http://www.klip.me/fav'),
(u'Klip.me starred', u'http://www.klip.me/fav?s=starred')
]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None:
br.open(self.LOGIN)
br.select_form(nr=0)
br['Email'] = self.username
if self.password is not None:
br['Passwd'] = self.password
br.submit()
return br
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, 'Fetching feed'+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('table',attrs={'class':['item','item new']}):
atag = item.a
if atag and atag.has_key('href'):
url = atag['href']
articles.append({
'url' :url
})
totalfeeds.append((feedtitle, articles))
return totalfeeds
def print_version(self, url):
return 'http://www.klip.me' + url
def populate_article_metadata(self, article, soup, first):
article.title = soup.find('title').contents[0].strip()
def postprocess_html(self, soup, first_fetch):
for link_tag in soup.findAll(attrs={"id" : "story"}):
link_tag.insert(0,'<h1>'+soup.find('title').contents[0].strip()+'</h1>')
print link_tag
return soup

View File

@ -1,79 +1,79 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Attis <attis@attis.one.pl>'
__copyright__ = '2011 Attis <attis@attis.one.pl>, 2012 Tomasz Długosz <tomek3d@gmail.com>'
__version__ = 'v. 0.1'
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class KopalniaWiedzy(BasicNewsRecipe):
title = u'Kopalnia Wiedzy'
publisher = u'Kopalnia Wiedzy'
description = u'Ciekawostki ze świata nauki i techniki'
encoding = 'utf-8'
__author__ = 'Attis'
language = 'pl'
oldest_article = 7
max_articles_per_feed = 100
INDEX = u'http://kopalniawiedzy.pl/'
remove_javascript = True
no_stylesheets = True
title = u'Kopalnia Wiedzy'
publisher = u'Kopalnia Wiedzy'
description = u'Ciekawostki ze świata nauki i techniki'
encoding = 'utf-8'
__author__ = 'Attis & Tomasz Długosz'
language = 'pl'
oldest_article = 7
max_articles_per_feed = 100
INDEX = u'http://kopalniawiedzy.pl/'
remove_javascript = True
no_stylesheets = True
remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'} }, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}]
remove_tags_after = dict(attrs={'class':'ad-square'})
keep_only_tags = [dict(name="div", attrs={'id':'articleContent'})]
extra_css = '.topimage {margin-top: 30px}'
remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'}}, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}, {'name':'div', 'attrs': {'class':'article-time-and-cat'}}, {'name':'p', 'attrs': {'class':'tags'}}]
remove_tags_after = dict(attrs={'class':'ad-square'})
keep_only_tags = [dict(name="div", attrs={'class':'article-text text-small'})]
extra_css = '.topimage {margin-top: 30px}'
preprocess_regexps = [
(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
lambda match: '<img class="topimage" ' + match.group(1) + '>' ),
(re.compile(u'<br /><br />'),
lambda match: '<br\/>')
]
preprocess_regexps = [
(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
lambda match: '<img class="topimage" ' + match.group(1) + '>' ),
(re.compile(u'<br /><br />'),
lambda match: '<br\/>')
]
feeds = [
(u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
(u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
(u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
(u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
(u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
(u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss')
]
feeds = [
(u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
(u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
(u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
(u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
(u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
(u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss')
]
def is_link_wanted(self, url, tag):
return tag['class'] == 'next'
def is_link_wanted(self, url, tag):
return tag['class'] == 'next'
def remove_beyond(self, tag, next):
while tag is not None and getattr(tag, 'name', None) != 'body':
after = getattr(tag, next)
while after is not None:
ns = getattr(tag, next)
after.extract()
after = ns
tag = tag.parent
def remove_beyond(self, tag, next):
while tag is not None and getattr(tag, 'name', None) != 'body':
after = getattr(tag, next)
while after is not None:
ns = getattr(tag, next)
after.extract()
after = ns
tag = tag.parent
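# A sketch of remove_beyond with made-up markup: given
# <body><div><p class="pages"/><p>junk</p></div><div>ads</div></body>
# and tag pointing at the paginator <p class="pages"/>, every nextSibling
# of tag and of each ancestor up to <body> is extracted, so <p>junk</p>
# and <div>ads</div> are dropped while content before the paginator stays.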
def append_page(self, soup, appendtag, position):
pager = soup.find('a',attrs={'class':'next'})
if pager:
nexturl = self.INDEX + pager['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'id':'articleContent'})
def append_page(self, soup, appendtag, position):
pager = soup.find('a',attrs={'class':'next'})
if pager:
nexturl = self.INDEX + pager['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'id':'articleContent'})
tag = texttag.find(attrs={'class':'pages'})
self.remove_beyond(tag, 'nextSibling')
tag = texttag.find(attrs={'class':'pages'})
self.remove_beyond(tag, 'nextSibling')
newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
appendtag.insert(position,texttag)
appendtag.insert(position,texttag)
def preprocess_html(self, soup):
self.append_page(soup, soup.body, 3)
def preprocess_html(self, soup):
self.append_page(soup, soup.body, 3)
for item in soup.findAll('div',attrs={'class':'pages'}):
item.extract()
for item in soup.findAll('div',attrs={'class':'pages'}):
item.extract()
for item in soup.findAll('p', attrs={'class':'wykop'}):
item.extract()
for item in soup.findAll('p', attrs={'class':'wykop'}):
item.extract()
return soup
return soup

View File

@ -13,9 +13,10 @@ class Kurier(BasicNewsRecipe):
publisher = 'KURIER'
category = 'news, politics, Austria'
oldest_article = 2
max_articles_per_feed = 200
max_articles_per_feed = 100
timeout = 30
encoding = None
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
language = 'de_AT'
remove_empty_feeds = True
@ -29,9 +30,11 @@ class Kurier(BasicNewsRecipe):
, 'language' : language
}
remove_tags = [dict(attrs={'class':['functionsleiste','functions','social_positionierung','contenttabs','drucken','versenden','leserbrief','kommentieren','addthis_button']})]
remove_tags = [ dict(attrs={'id':['artikel_expand_symbol2','imgzoom_close2']}),
dict(attrs={'class':['linkextern','functionsleiste','functions','social_positionierung','contenttabs','drucken','versenden','leserbrief','kommentieren','addthis_button']})
]
keep_only_tags = [dict(attrs={'id':'content'})]
remove_tags_after = dict(attrs={'id':'author'})
remove_tags_after = [dict(attrs={'id':'author'})]
remove_attributes = ['width','height']
feeds = [
@ -41,7 +44,7 @@ class Kurier(BasicNewsRecipe):
,(u'Kultur' , u'http://kurier.at/rss/kultur_kultur_rss.xml' )
,(u'Freizeit' , u'http://kurier.at/rss/freizeit_freizeit_rss.xml' )
,(u'Wetter' , u'http://kurier.at/rss/oewetter_rss.xml' )
,(u'Verkehr' , u'http://kurier.at/rss/verkehr_rss.xml' )
,(u'Sport' , u'http://kurier.at/newsfeed/detail/sport_rss.xml' )
]
def preprocess_html(self, soup):

View File

@ -1,10 +1,9 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.la-razon.com
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class LaRazon_Bol(BasicNewsRecipe):
@ -16,19 +15,17 @@ class LaRazon_Bol(BasicNewsRecipe):
oldest_article = 1
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'cp1252'
encoding = 'utf8'
use_embedded_content = False
language = 'es_BO'
publication_type = 'newspaper'
delay = 1
remove_empty_feeds = True
cover_url = strftime('http://www.la-razon.com/portadas/%Y%m%d_LaRazon.jpg')
masthead_url = 'http://www.la-razon.com/imagenes/logo.jpg'
extra_css = """ body{font-family: Arial,Helvetica,sans-serif }
img{margin-bottom: 0.4em}
.noticia-titulo{font-family: Georgia,"Times New Roman",Times,serif}
.lead{font-weight: bold; font-size: 0.8em}
"""
masthead_url = 'http://www.la-razon.com/static/LRZRazon/images/lrz-logo.png'
extra_css = """ body{font-family: Georgia,"Times New Roman",Times,serif}
img{margin-bottom: 0.4em; display: block}
.meta{font-size: small; font-family: Arial,Helvetica,sans-serif}
"""
INDEX = 'http://www.la-razon.com/'
conversion_options = {
'comment' : description
@ -37,28 +34,37 @@ class LaRazon_Bol(BasicNewsRecipe):
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'class':['noticia-titulo','noticia-desarrollo']})]
remove_tags = [dict(name=['meta','link','form','iframe','embed','object'])]
keep_only_tags = [dict(name='div', attrs={'class':['pg-hd', 'pg-bd']})]
remove_tags = [
dict(name=['meta','link','form','iframe','embed','object'])
,dict(name='div', attrs={'class':'bd'})
]
remove_attributes = ['width','height']
feeds = [
(u'Editorial' , u'http://www.la-razon.com/rss_editorial.php' )
,(u'Opinión' , u'http://www.la-razon.com/rss_opinion.php' )
,(u'Nacional' , u'http://www.la-razon.com/rss_nacional.php' )
,(u'Economia' , u'http://www.la-razon.com/rss_economia.php' )
,(u'Ciudades' , u'http://www.la-razon.com/rss_ciudades.php' )
,(u'Sociedad' , u'http://www.la-razon.com/rss_sociedad.php' )
,(u'Mundo' , u'http://www.la-razon.com/rss_sociedad.php' )
,(u'La Revista' , u'http://www.la-razon.com/rss_larevista.php' )
,(u'Sociales' , u'http://www.la-razon.com/rss_sociales.php' )
,(u'Mia' , u'http://www.la-razon.com/rss_mia.php' )
,(u'Marcas' , u'http://www.la-razon.com/rss_marcas.php' )
,(u'Escape' , u'http://www.la-razon.com/rss_escape.php' )
,(u'El Financiero' , u'http://www.la-razon.com/rss_financiero.php')
,(u'Tendencias' , u'http://www.la-razon.com/rss_tendencias.php')
(u'Editorial' , u'http://www.la-razon.com/rss/opinion/editorial/' )
,(u'Nacional' , u'http://www.la-razon.com/rss/nacional/' )
,(u'Economia' , u'http://www.la-razon.com/rss/economia/' )
,(u'Ciudades' , u'http://www.la-razon.com/rss/ciudades/' )
,(u'Sociedad' , u'http://www.la-razon.com/rss/sociedad/' )
,(u'Mundo' , u'http://www.la-razon.com/rss/mundo/' )
,(u'La Revista' , u'http://www.la-razon.com/rss/la_revista/' )
,(u'Sociales' , u'http://www.la-razon.com/rss/sociales/' )
,(u'Mia' , u'http://www.la-razon.com/rss/suplementos/mia/' )
,(u'Marcas' , u'http://www.la-razon.com/rss/marcas/' )
,(u'Escape' , u'http://www.la-razon.com/rss/suplementos/escape/' )
,(u'El Financiero' , u'http://www.la-razon.com/rss/suplementos/financiero/')
,(u'Tendencias' , u'http://www.la-razon.com/rss/suplementos/tendencias/')
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup
def get_cover_url(self):
soup = self.index_to_soup(self.INDEX)
lightbox = soup.find('div', attrs = {'class' : 'lightbox lightbox-frontpage'})
return lightbox.img['src']

recipes/la_voce.recipe Normal file
View File

@ -0,0 +1,15 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1324114228(BasicNewsRecipe):
title = u'La Voce'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
masthead_url = 'http://www.lavoce.info/binary/la_voce/testata/lavoce.1184661635.gif'
feeds = [(u'La Voce', u'http://www.lavoce.info/feed_rss.php?id_feed=1')]
__author__ = 'faber1971'
description = 'Italian website on Economy - v1.01 (17, December 2011)'
language = 'it'

View File

@ -1,8 +1,8 @@
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini'
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
__author__ = 'Lorenzo Vigentini and Olivier Daigle'
__copyright__ = '2012, Lorenzo Vigentini <l.vigentini at gmail.com>, Olivier Daigle <odaigle _at nuvucameras __dot__ com>'
__version__ = 'v1.01'
__date__ = '14, January 2010'
__date__ = '12, February 2012'
__description__ = 'Canadian Paper '
'''
@ -18,7 +18,7 @@ class ledevoir(BasicNewsRecipe):
description = 'Canadian Paper. A subscription is optional, with it you get more content'
cover_url = 'http://www.ledevoir.com/images/ul/graphiques/logo_devoir.gif'
title = u'Le Devoir'
title = u'Le Devoir '
publisher = 'leDevoir.com'
category = 'News, finance, economy, politics'
@ -26,11 +26,15 @@ class ledevoir(BasicNewsRecipe):
encoding = 'utf-8'
timefmt = '[%a, %d %b, %Y]'
max_articles_per_feed = 50
oldest_article = 1
max_articles_per_feed = 200
use_embedded_content = False
recursion = 10
needs_subscription = 'optional'
filterDuplicates = False
url_list = []
remove_javascript = True
no_stylesheets = True
@ -38,7 +42,7 @@ class ledevoir(BasicNewsRecipe):
keep_only_tags = [
dict(name='div', attrs={'id':'article'}),
dict(name='ul', attrs={'id':'ariane'})
dict(name='div', attrs={'id':'colonne_principale'})
]
remove_tags = [
@ -51,7 +55,7 @@ class ledevoir(BasicNewsRecipe):
feeds = [
(u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'),
(u'Edition complete', 'http://feeds2.feedburner.com/fluxdudevoir'),
(u'Édition complete', 'http://feeds2.feedburner.com/fluxdudevoir'),
(u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'),
(u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'),
(u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'),
@ -61,7 +65,7 @@ class ledevoir(BasicNewsRecipe):
(u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'),
(u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'),
(u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'),
(u'Loisirs', 'http://www.ledevoir.com/rss/section/loisirs.xml?id=50')
(u'Art de vivre', 'http://www.ledevoir.com/rss/section/art-de-vivre.xml?id=50')
]
extra_css = '''
@ -85,8 +89,16 @@ class ledevoir(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.ledevoir.com')
br.select_form(nr=1)
br['login[courriel]'] = self.username
br['login[password]'] = self.password
br.select_form(nr=0)
br['login_popup[courriel]'] = self.username
br['login_popup[password]'] = self.password
br.submit()
return br
def print_version(self, url):
if self.filterDuplicates:
if url in self.url_list:
return
self.url_list.append(url)
return url

recipes/lega_nerd.recipe Normal file
View File

@ -0,0 +1,14 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1326135232(BasicNewsRecipe):
title = u'Lega Nerd'
description = 'nerd / geek culture, pc, comics, music, culture'
language = 'it'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Lega Nerd', u'http://feeds.feedburner.com/LegaNerd')]
__author__ = 'faber1971'
__version__ = 'v1.0'
__date__ = '9, January 2012'

View File

@ -0,0 +1,103 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2012, Rémi Vanicat <vanicat at debian.org>'
'''
liberation.fr
'''
# The cleaning is from the Liberation recipe, by Darko Miletic
from calibre.web.feeds.news import BasicNewsRecipe
class Liberation(BasicNewsRecipe):
title = u'Libération: Édition abonnés'
__author__ = 'Rémi Vanicat'
description = u'Actualités'
category = 'Actualités, France, Monde'
language = 'fr'
needs_subscription = True
use_embedded_content = False
no_stylesheets = True
remove_empty_feeds = True
extra_css = '''
h1, h2, h3 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
p.subtitle {font-size:xx-small; font-family:Arial,Helvetica,sans-serif;}
h4, h5, h2.rubrique, {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.ref, .date, .author, .legende {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.mna-body, entry-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
'''
keep_only_tags = [
dict(name='div', attrs={'class':'article'})
,dict(name='div', attrs={'class':'text-article m-bot-s1'})
,dict(name='div', attrs={'class':'entry'})
,dict(name='div', attrs={'class':'col_contenu'})
]
remove_tags_after = [
dict(name='div',attrs={'class':['object-content text text-item', 'object-content', 'entry-content', 'col01', 'bloc_article_01']})
,dict(name='p',attrs={'class':['chapo']})
,dict(id='_twitter_facebook')
]
remove_tags = [
dict(name='iframe')
,dict(name='a', attrs={'class':'lnk-comments'})
,dict(name='div', attrs={'class':'toolbox'})
,dict(name='ul', attrs={'class':'share-box'})
,dict(name='ul', attrs={'class':'tool-box'})
,dict(name='ul', attrs={'class':'rub'})
,dict(name='p',attrs={'class':['chapo']})
,dict(name='p',attrs={'class':['tag']})
,dict(name='div',attrs={'class':['blokLies']})
,dict(name='div',attrs={'class':['alire']})
,dict(id='_twitter_facebook')
]
index = 'http://www.liberation.fr/abonnes/'
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.liberation.fr/jogger/login/')
br.select_form(nr=0)
br['email'] = self.username
br['password'] = self.password
br.submit()
return br
def parse_index(self):
soup=self.index_to_soup(self.index)
content = soup.find('div', { 'class':'block-content' })
articles = []
cat_articles = []
for tag in content.findAll(recursive=False):
if(tag['class']=='headrest headrest-basic-rounded'):
cat_articles = []
articles.append((tag.find('h5').contents[0],cat_articles))
else:
title = tag.find('h3').contents[0]
url = tag.find('a')['href']
print(url)
description = tag.find('p', {'class':'subtitle'}).contents[0]
article = {
'title': title,
'url': url,
'description': description,
'content': ''
}
cat_articles.append(article)
return articles
# Local Variables:
# mode: python
# End:

View File

@ -1,41 +1,26 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.livemint.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class LiveMint(BasicNewsRecipe):
title = u'Livemint'
__author__ = 'Darko Miletic'
description = 'The Wall Street Journal'
publisher = 'The Wall Street Journal'
category = 'news, games, adventure, technology'
language = 'en'
title = u'Live Mint'
language = 'en_IN'
__author__ = 'Krittika Goyal'
#encoding = 'cp1252'
oldest_article = 1 #days
max_articles_per_feed = 25
use_embedded_content = True
oldest_article = 15
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
extra_css = ' #dvArtheadline{font-size: x-large} #dvArtAbstract{font-size: large} '
no_stylesheets = True
auto_cleanup = True
keep_only_tags = [dict(name='div', attrs={'class':'innercontent'})]
remove_tags = [dict(name=['object','link','embed','form','iframe'])]
feeds = [
('Latest News',
'http://www.livemint.com/StoryRss.aspx?LN=Latestnews'),
('Gallery',
'http://www.livemint.com/GalleryRssfeed.aspx'),
('Top Stories',
'http://www.livemint.com/StoryRss.aspx?ts=Topstories'),
('Banking',
'http://www.livemint.com/StoryRss.aspx?Id=104'),
]
feeds = [(u'Articles', u'http://www.livemint.com/SectionRssfeed.aspx?Mid=1')]
def print_version(self, url):
link = url
msoup = self.index_to_soup(link)
mlink = msoup.find(attrs={'id':'ctl00_bodyplaceholdercontent_cntlArtTool_printUrl'})
if mlink:
link = 'http://www.livemint.com/Articles/' + mlink['href'].rpartition('/Articles/')[2]
return link
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@ -0,0 +1,25 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
from calibre.web.feeds.news import BasicNewsRecipe
class LivingStonesPastorsBlog(BasicNewsRecipe):
title = u'Living Stones Pastors Blog'
__author__ = 'Peter Grungi'
language = 'en'
oldest_article = 90
max_articles_per_feed = 10
auto_cleanup = True
cover_url = 'http://blogs.livingstonesreno.com/wp-content/uploads/2011/08/blogBGRD_norepeat.jpg'
masthead_url = 'http://www.livingstonesreno.com/podcast/LSpodcastnew.jpg'
publisher = 'Living Stones Church of Reno, NV'
language = 'en'
author = 'Living Stones Church of Reno, NV'
feeds = [(u'LS Blog', u'http://blogs.livingstonesreno.com/feed?utm_source=calibre&utm_medium=rss')]
def full_version(self, url):
import re
newurl = re.sub(r'\?.*','',url)
return newurl
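# Illustration (made-up URL): the regex drops the query string, so
# 'http://blogs.livingstonesreno.com/2012/post?utm_medium=rss' becomes
# 'http://blogs.livingstonesreno.com/2012/post'.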

View File

@ -41,7 +41,7 @@ class LosTiempos_Bol(BasicNewsRecipe):
keep_only_tags = [dict(name='div', attrs={'id':'articulo'})]
remove_tags = [
dict(name=['meta','link','form','iframe','embed','object','hr'])
,dict(attrs={'class':['caja_fonts sin_border_bot','pub']})
,dict(attrs={'class':['caja_fonts sin_border_bot','pub','twitter-share-button']})
]
remove_attributes = ['width','height']

View File

@ -14,8 +14,11 @@ class WeeklyLWN(BasicNewsRecipe):
description = 'Weekly summary of what has happened in the free software world.'
__author__ = 'Davide Cavalca'
language = 'en'
site_url = 'http://lwn.net'
cover_url = 'http://lwn.net/images/lcorner.png'
extra_css = 'pre,code,samp,kbd,tt { font-size: 80% }\nblockquote {margin-left:0 }\n* { color: black }\n'
cover_url = site_url + '/images/lcorner.png'
#masthead_url = 'http://lwn.net/images/lcorner.png'
publication_type = 'magazine'
@ -43,11 +46,29 @@ class WeeklyLWN(BasicNewsRecipe):
br.submit()
return br
def print_version(self, url):
# Strip off anchor
url = url.split('#')[0]
# Prepend site_url
if url[0:len(self.site_url)] != self.site_url:
url = self.site_url + url
# Append printable URL parameter
print_param = '?format=printable'
if url[-len(print_param):] != print_param:
url += print_param
#import sys
#print >>sys.stderr, "*** print_version(url):", url
return url
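# Illustration (made-up article id): print_version('/Articles/123456/#Comments')
# strips the anchor, prepends site_url and appends the print parameter,
# giving 'http://lwn.net/Articles/123456/?format=printable'; a URL that is
# already absolute and already printable passes through unchanged.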
def parse_index(self):
if self.username is not None and self.password is not None:
index_url = 'http://lwn.net/current/bigpage?format=printable'
index_url = self.print_version('/current/bigpage')
else:
index_url = 'http://lwn.net/free/bigpage?format=printable'
index_url = self.print_version('/free/bigpage')
soup = self.index_to_soup(index_url)
body = soup.body
@ -56,19 +77,19 @@ class WeeklyLWN(BasicNewsRecipe):
url_re = re.compile('^/Articles/')
while True:
tag_title = body.findNext(name='p', attrs={'class':'SummaryHL'})
tag_title = body.findNext(attrs={'class':'SummaryHL'})
if tag_title == None:
break
tag_section = tag_title.findPrevious(name='p', attrs={'class':'Cat1HL'})
tag_section = tag_title.findPrevious(attrs={'class':'Cat1HL'})
if tag_section == None:
section = 'Front Page'
else:
section = tag_section.string
tag_section2 = tag_title.findPrevious(name='p', attrs={'class':'Cat2HL'})
tag_section2 = tag_title.findPrevious(attrs={'class':'Cat2HL'})
if tag_section2 != None:
if tag_section2.findPrevious(name='p', attrs={'class':'Cat1HL'}) == tag_section:
if tag_section2.findPrevious(attrs={'class':'Cat1HL'}) == tag_section:
section = "%s: %s" %(section, tag_section2.string)
if section not in articles.keys():
@ -94,9 +115,10 @@ class WeeklyLWN(BasicNewsRecipe):
if tag_url == None:
break
article = dict(
title=self.tag_to_string(tag_title),
url= 'http://lwn.net' + tag_url['href'].split('#')[0] + '?format=printable',
url=tag_url['href'],
description='', content='', date='')
articles[section].append(article)

recipes/macity.recipe Normal file
View File

@ -0,0 +1,23 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1325766771(BasicNewsRecipe):
title = u'Macity'
language = 'it'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
def get_article_url(self, article):
link = BasicNewsRecipe.get_article_url(self, article)
if link.split('/')[-1]=="story01.htm":
link=link.split('/')[-2]
a=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'L' , 'N' , 'S' ]
b=['0', '.', '/', '?', '-', '=', '&', '_', 'http://', '.com', 'www.']
for i in range(0,len(a)):
link=link.replace('0'+a[-i],b[-i])
return link
feeds = [(u'Macity', u'http://www.macitynet.it.feedsportal.com/c/33714/f/599513/index.rss')]
__author__ = 'faber1971'
description = 'Apple and hi-tech news'
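For reference, the paired a/b lists above implement the same feedsportal-style decoding as the Hindustan Times recipe earlier in this commit; a minimal sketch with a made-up slug:
# Illustration only: '0' + a[i] decodes to b[i].
link = '0L0Smacitynet0Bit0Cnews'
a = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'L', 'N', 'S']
b = ['0', '.', '/', '?', '-', '=', '&', '_', 'http://', '.com', 'www.']
for i in range(0, len(a)):
    link = link.replace('0' + a[-i], b[-i])
print link  # -> http://www.macitynet.it/news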

View File

@ -0,0 +1,16 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1327062445(BasicNewsRecipe):
title = u'Marketing Magazine'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
remove_javascript = True
masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg'
feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]
__author__ = 'faber1971'
description = 'Collection of Italian marketing websites - v1.00 (28, January 2012)'
language = 'it'

View File

@ -38,18 +38,23 @@ except:
removed keep_only tags
Version 1.8 26-11-2011
added remove tag: article-slideshow
Version 1.9 31-1-2012
removed some leftover debug settings
extended timeout from 2 to 10
changed oldest article from 10 to 1.2
changed max articles from 15 to 25
'''
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro Nieuws NL'
oldest_article = 10
max_articles_per_feed = 15
oldest_article = 1.2
max_articles_per_feed = 25
__author__ = u'DrMerry'
description = u'Metro Nederland'
language = u'nl'
simultaneous_downloads = 5
simultaneous_downloads = 3
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
timeout = 2
timeout = 10
center_navbar = True
timefmt = ' [%A, %d %b %Y]'
no_stylesheets = True

View File

@ -0,0 +1,217 @@
#!/usr/bin/env python
##
## Title: Microwave and RF
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Feb 2012: Initial release
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
'''
mwrf.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image
class Microwave_and_RF(BasicNewsRecipe):
Convert_Grayscale = False # Convert images to gray scale or not
# Add sections that want to be excluded from the magazine
exclude_sections = []
# Add sections that want to be included from the magazine
include_sections = []
title = u'Microwave and RF'
__author__ = 'kiavash'
description = u'Microwave and RF Monthly Magazine'
publisher = 'Penton Media, Inc.'
publication_type = 'magazine'
site = 'http://mwrf.com'
language = 'en'
asciiize = True
timeout = 120
simultaneous_downloads = 1 # very peaky site!
# Main article is inside this tag
keep_only_tags = [dict(name='table', attrs={'id':'prtContent'})]
no_stylesheets = True
remove_javascript = True
# Flattens all the tables to make it compatible with Nook
conversion_options = {'linearize_tables' : True}
remove_tags = [
dict(name='span', attrs={'class':'body12'}),
]
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
.introduction, .first { font-weight: bold; } \
.cross-head { font-weight: bold; font-size: 125%; } \
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
font-size: 80%; font-style: italic; margin: 1px auto; } \
.story-date, .published { font-size: 80%; } \
table { width: 100%; } \
td img { display: block; margin: 5px auto; } \
ul { padding-top: 10px; } \
ol { padding-top: 10px; } \
li { padding-top: 5px; padding-bottom: 5px; } \
h1 { font-size: 175%; font-weight: bold; } \
h2 { font-size: 150%; font-weight: bold; } \
h3 { font-size: 125%; font-weight: bold; } \
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
# Remove the line breaks and float left/right and picture width/height.
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'float:.*?'), lambda m: ''),
(re.compile(r'width:.*?px'), lambda m: ''),
(re.compile(r'height:.*?px'), lambda m: '')
]
def print_version(self, url):
url = re.sub(r'.html', '', url)
url = re.sub('/ArticleID/.*?/', '/Print.cfm?ArticleID=', url)
return url
# Need to change the user agent to avoid potential download errors
def get_browser(self, *args, **kwargs):
from calibre import browser
kwargs['user_agent'] = 'Mozilla/5.0 (Windows NT 5.1; rv:10.0) Gecko/20100101 Firefox/10.0'
return browser(*args, **kwargs)
def parse_index(self):
# Fetches the main page of Microwave and RF
soup = self.index_to_soup(self.site)
# Searches the site for Issue ID link then returns the href address
# pointing to the latest issue
latest_issue = soup.find('a', attrs={'href':lambda x: x and 'IssueID' in x}).get('href')
# Fetches the index page of the latest issue
soup = self.index_to_soup(latest_issue)
# Finds the main section of the page containing cover, issue date and
# TOC
ts = soup.find('div', attrs={'id':'columnContainer'})
# Finds the issue date
ds = ' '.join(self.tag_to_string(ts.find('span', attrs={'class':'CurrentIssueSectionHead'})).strip().split()[-2:]).capitalize()
self.log('Found Current Issue:', ds)
self.timefmt = ' [%s]'%ds
# Finds the cover image
cover = ts.find('img', src = lambda x: x and 'Cover' in x)
if cover is not None:
self.cover_url = self.site + cover['src']
self.log('Found Cover image:', self.cover_url)
feeds = []
article_info = []
# Finds all the articles (titles and links)
articles = ts.findAll('a', attrs={'class':'commonArticleTitle'})
# Finds all the descriptions
descriptions = ts.findAll('span', attrs={'class':'commonCopy'})
# Find all the sections
sections = ts.findAll('span', attrs={'class':'kicker'})
title_number = 0
# Goes through all the articles one by one and sorts them out
for section in sections:
title_number = title_number + 1
# Removes the unwanted sections
if self.tag_to_string(section) in self.exclude_sections:
continue
# Only includes the wanted sections
if self.include_sections:
if self.tag_to_string(section) not in self.include_sections:
continue
title = self.tag_to_string(articles[title_number])
url = articles[title_number].get('href')
if url.startswith('/'):
url = self.site + url
self.log('\tFound article:', title, 'at', url)
desc = self.tag_to_string(descriptions[title_number])
self.log('\t\t', desc)
article_info.append({'title':title, 'url':url, 'description':desc,
'date':self.timefmt})
if article_info:
feeds.append((self.title, article_info))
#self.log(feeds)
return feeds
def postprocess_html(self, soup, first):
if self.Convert_Grayscale:
#process all the images
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
if img < 0:
raise RuntimeError('Out of memory')
img.type = "GrayscaleType"
img.save(iurl)
return soup
def preprocess_html(self, soup):
# Includes all the figures inside the final ebook
# Finds all the jpg links
for figure in soup.findAll('a', attrs = {'href' : lambda x: x and 'jpg' in x}):
# makes sure that the link points to the absolute web address
if figure['href'].startswith('/'):
figure['href'] = self.site + figure['href']
figure.name = 'img' # converts the links to img
figure['src'] = figure['href'] # with the same address as href
figure['style'] = 'display:block' # forces a line break before and after the image
del figure['href']
del figure['target']
# Makes the title stand out
for title in soup.findAll('a', attrs = {'class': 'commonSectionTitle'}):
title.name = 'h1'
del title['href']
del title['target']
# Makes the section name more visible
for section_name in soup.findAll('a', attrs = {'class': 'kicker2'}):
section_name.name = 'h5'
del section_name['href']
del section_name['target']
# Removes all unrelated links
for link in soup.findAll('a', attrs = {'href': True}):
link.name = 'font'
del link['href']
del link['target']
return soup
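# A minimal standalone sketch (not part of the recipe above) showing what
# print_version() does to an article URL. The sample path is hypothetical;
# only the '.html' strip and the '/ArticleID/<id>/' ->
# '/Print.cfm?ArticleID=<id>' rewrite are taken from the recipe.
if __name__ == '__main__':
    import re
    sample = 'http://mwrf.com/Articles/ArticleID/12345/12345.html'  # hypothetical URL
    sample = re.sub(r'.html', '', sample)   # drop the trailing .html
    sample = re.sub('/ArticleID/.*?/', '/Print.cfm?ArticleID=', sample)
    print(sample)  # http://mwrf.com/Articles/Print.cfm?ArticleID=12345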

View File

@ -10,6 +10,10 @@ __MakePeriodical__ = True
__UseChineseTitle__ = False
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) It is to disable premium content (Default: False)
@ -24,6 +28,9 @@ __Date__ = ''
'''
Change Log:
2011/12/18: update the overridden create_opf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_opf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does not start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
@ -53,6 +60,7 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''
from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
@ -60,11 +68,15 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang
# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
if __Region__ == 'Hong Kong':
title = 'Ming Pao - Hong Kong'
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
@ -109,7 +121,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: "</b>")
]
elif __Region__ == 'Vancouver':
title = 'Ming Pao - Vancouver'
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
else:
title = 'Ming Pao - Vancouver'
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
category = 'Chinese, News, Vancouver'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -127,7 +142,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: ''),
]
elif __Region__ == 'Toronto':
title = 'Ming Pao - Toronto'
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = 'Ming Pao - Toronto'
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
category = 'Chinese, News, Toronto'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -161,9 +179,9 @@ class MPRecipe(BasicNewsRecipe):
def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong':
# convert UTC to local hk time - at HKT 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
# convert UTC to local hk time - at HKT 4.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
elif __Region__ == 'Vancouver':
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@ -186,6 +204,18 @@ class MPRecipe(BasicNewsRecipe):
else:
return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self):
if __Date__ <> '':
return __Date__[0:4]
else:
return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self):
if __Date__ <> '':
return __Date__[4:6]
else:
return self.get_dtlocal().strftime("%m")
def get_fetchday(self):
if __Date__ <> '':
return __Date__[6:8]
@ -654,77 +684,153 @@ class MPRecipe(BasicNewsRecipe):
del item['absmiddle']
return soup
def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])
try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return
# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
if __UseChineseTitle__ == True:
if __Region__ == 'Hong Kong':
title = u'\u660e\u5831 (\u9999\u6e2f)'
elif __Region__ == 'Vancouver':
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
elif __Region__ == 'Toronto':
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = self.short_title()
# if not generating a periodical, force date to apply in title
if __MakePeriodical__ == False:
title = self.short_title()
# change 1: allow our own flag to tell if a periodical is to be generated
# also use a custom date instead of the current time
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = title + ' ' + self.get_fetchformatteddate()
if True:
mi = MetaInformation(title, [self.publisher])
mi.publisher = self.publisher
mi.author_sort = self.publisher
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.timestamp = nowf()
mi.timestamp = self.get_dtlocal()
mi.comments = self.description
if not isinstance(mi.comments, unicode):
mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.pubdate = nowf()
mi.pubdate = self.get_dtlocal()
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
# end of change 1
# change 2: __appname__ replaced by newspaper publisher
__appname__ = self.publisher
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
# change 4: in the following, all the nowf() are changed to adjusted time
# This one doesn't matter
mi.timestamp = nowf()
# change 5: skip listing the articles
#article_titles, aseen = [], set()
#for f in feeds:
# for a in f:
# if a.title and a.title not in aseen:
# aseen.add(a.title)
# article_titles.append(force_unicode(a.title, 'utf-8'))
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
#mi.comments = self.description
#if not isinstance(mi.comments, unicode):
# mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
# now appears to need the time field to be later than 12.00 noon as well
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
def feed_index(num, parent):
f = feeds[num]
@ -739,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
desc = None
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po, author=auth, description=desc)
parent.add_item('%sindex.html'%adir, None,
a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
@ -762,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, self.publisher, prefix=prefix,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
@ -785,7 +894,7 @@ class MPRecipe(BasicNewsRecipe):
if not desc:
desc = None
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
f.title, play_order=po, description=desc, author=auth))
f.title, play_order=po, description=desc, author=auth))
else:
entries.append('feed_%d/index.html'%0)
@ -799,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)
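# A minimal standalone sketch (not part of the recipe above) of how a
# hypothetical __Date__ override flows into the publication date set in
# create_opf(): the YYYYMMDD string is sliced by the get_fetch*() helpers,
# and the pub time is pinned at 12:30 because the Kindle appears to need a
# time later than 12.00 noon.
if __name__ == '__main__':
    import datetime
    date_override = '20111218'  # hypothetical value for __Date__
    year, month, day = date_override[0:4], date_override[4:6], date_override[6:8]
    print(year + '-' + month + '-' + day)  # what get_fetchformatteddate() returns
    pubdate = datetime.datetime(int(year), int(month), int(day), 12, 30, 0)
    print(pubdate)  # 2011-12-18 12:30:00, assigned to mi.pubdate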

View File

@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Toronto'
# Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False".
# please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True
# Turn below to true if your device supports display of CJK titles
# Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False
# Set it to False if you want to skip images
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) It is to disable premium content (Default: False)
__InclPremium__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = True
# (HK only) Turn below to True if you wish hi-res images (Default: False)
__HiResImg__ = False
# Override the date returned by the program if specifying a YYYYMMDD below
__Date__ = ''
'''
Change Log:
2011/12/18: update the overridden create_opf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_opf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does not start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@ -37,30 +60,38 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''
import os, datetime, re
from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang
# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
if __Region__ == 'Hong Kong':
title = 'Ming Pao - Hong Kong'
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
dict(attrs={'class':['heading']}), # for heading from txt
dict(attrs={'id':['newscontent']}), # entertainment and column page content
dict(attrs={'id':['newscontent01','newscontent02']}),
dict(attrs={'class':['content']}), # for content from txt
dict(attrs={'class':['photo']}),
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
dict(attrs={'class':['images']}) # for images from txt
]
if __KeepImages__:
remove_tags = [dict(name='style'),
@ -90,7 +121,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: "</b>")
]
elif __Region__ == 'Vancouver':
title = 'Ming Pao - Vancouver'
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
else:
title = 'Ming Pao - Vancouver'
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
category = 'Chinese, News, Vancouver'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -108,7 +142,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: ''),
]
elif __Region__ == 'Toronto':
title = 'Ming Pao - Toronto'
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = 'Ming Pao - Toronto'
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
category = 'Chinese, News, Toronto'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -139,49 +176,12 @@ class MPRecipe(BasicNewsRecipe):
conversion_options = {'linearize_tables':True}
timefmt = ''
def image_url_processor(cls, baseurl, url):
# trick: break the url at the first occurrence of a digit, add an additional
# '_' at the front
# not working, may need to move this to preprocess_html() method
# minIdx = 10000
# i0 = url.find('0')
# if i0 >= 0 and i0 < minIdx:
# minIdx = i0
# i1 = url.find('1')
# if i1 >= 0 and i1 < minIdx:
# minIdx = i1
# i2 = url.find('2')
# if i2 >= 0 and i2 < minIdx:
# minIdx = i2
# i3 = url.find('3')
# if i3 >= 0 and i0 < minIdx:
# minIdx = i3
# i4 = url.find('4')
# if i4 >= 0 and i4 < minIdx:
# minIdx = i4
# i5 = url.find('5')
# if i5 >= 0 and i5 < minIdx:
# minIdx = i5
# i6 = url.find('6')
# if i6 >= 0 and i6 < minIdx:
# minIdx = i6
# i7 = url.find('7')
# if i7 >= 0 and i7 < minIdx:
# minIdx = i7
# i8 = url.find('8')
# if i8 >= 0 and i8 < minIdx:
# minIdx = i8
# i9 = url.find('9')
# if i9 >= 0 and i9 < minIdx:
# minIdx = i9
return url
def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong':
# convert UTC to local hk time - at HKT 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
# convert UTC to local hk time - at HKT 4.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
elif __Region__ == 'Vancouver':
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@ -193,13 +193,34 @@ class MPRecipe(BasicNewsRecipe):
return dt_local
def get_fetchdate(self):
return self.get_dtlocal().strftime("%Y%m%d")
if __Date__ <> '':
return __Date__
else:
return self.get_dtlocal().strftime("%Y%m%d")
def get_fetchformatteddate(self):
return self.get_dtlocal().strftime("%Y-%m-%d")
if __Date__ <> '':
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
else:
return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self):
if __Date__ <> '':
return __Date__[0:4]
else:
return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self):
if __Date__ <> '':
return __Date__[4:6]
else:
return self.get_dtlocal().strftime("%m")
def get_fetchday(self):
return self.get_dtlocal().strftime("%d")
if __Date__ <> '':
return __Date__[6:8]
else:
return self.get_dtlocal().strftime("%d")
def get_cover_url(self):
if __Region__ == 'Hong Kong':
@ -230,12 +251,23 @@ class MPRecipe(BasicNewsRecipe):
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
articles = self.parse_section2(url, keystr)
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
if __InclPremium__ == True:
articles = self.parse_section2_txt(url, keystr)
else:
articles = self.parse_section2(url, keystr)
if articles:
feeds.append((title, articles))
if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url)
@ -244,15 +276,16 @@ class MPRecipe(BasicNewsRecipe):
else:
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))
# special- editorial
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
if ed_articles:
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
#ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
#if ed_articles:
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@ -263,20 +296,39 @@ class MPRecipe(BasicNewsRecipe):
# special - finance
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
if fin_articles:
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
#fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
#if fin_articles:
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
articles = self.parse_section(url)
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))
# special - entertainment
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
if ent_articles:
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
#ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
#if ent_articles:
# feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@ -284,11 +336,6 @@ class MPRecipe(BasicNewsRecipe):
if articles:
feeds.append((title, articles))
# special- columns
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
if col_articles:
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
elif __Region__ == 'Vancouver':
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@ -332,6 +379,16 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(a)
url = a.get('href', False)
url = 'http://news.mingpao.com/' + dateStr + '/' +url
# replace the url to the print-friendly version
if __ParsePFF__ == True:
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
url = re.sub('%2F.*%2F', '/', url)
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
url = url.replace('%2Etxt', '_print.htm')
url = url.replace('%5F', '_')
else:
url = url.replace('.htm', '_print.htm')
if url not in included_urls and url.rfind('Redirect') == -1:
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
included_urls.append(url)
@ -340,6 +397,8 @@ class MPRecipe(BasicNewsRecipe):
# parse from life.mingpao.com
def parse_section2(self, url, keystr):
br = mechanize.Browser()
br.set_handle_redirect(False)
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
@ -350,7 +409,29 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
try:
br.open_novisit(url)
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
except:
print 'skipping a premium article'
current_articles.reverse()
return current_articles
# parse from text file of life.mingpao.com
def parse_section2_txt(self, url, keystr):
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
current_articles = []
included_urls = []
for i in a:
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
current_articles.reverse()
@ -438,6 +519,162 @@ class MPRecipe(BasicNewsRecipe):
current_articles.reverse()
return current_articles
# preprocess those .txt and javascript based files
def preprocess_raw_html(self, raw_html, url):
new_html = raw_html
if url.rfind('ftp') <> -1 or url.rfind('_print.htm') <> -1:
if url.rfind('_print.htm') <> -1:
# javascript based file
splitter = re.compile(r'\n')
new_raw_html = '<html><head><title>Untitled</title></head>'
new_raw_html = new_raw_html + '<body>'
for item in splitter.split(raw_html):
if item.startswith('var heading1 ='):
heading = item.replace('var heading1 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
new_raw_html = new_raw_html + '<div class="heading">' + heading
if item.startswith('var heading2 ='):
heading = item.replace('var heading2 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
if heading <> '':
new_raw_html = new_raw_html + '<br>' + heading + '</div>'
else:
new_raw_html = new_raw_html + '</div>'
if item.startswith('var content ='):
content = item.replace("var content = ", '')
content = content.replace('\'', '')
content = content.replace(';', '')
new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
if item.startswith('var photocontent ='):
photo = item.replace('var photocontent = \'', '')
photo = photo.replace('\'', '')
photo = photo.replace(';', '')
photo = photo.replace('<tr>', '')
photo = photo.replace('<td>', '')
photo = photo.replace('</tr>', '')
photo = photo.replace('</td>', '<br>')
photo = photo.replace('class="photo"', '')
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
new_html = new_raw_html + '</body></html>'
else:
# .txt based file
splitter = re.compile(r'\n') # split the raw text into lines
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_img_txt = False
title_started = False
title_break_reached = False
met_article_start_char = False
for item in splitter.split(raw_html):
item = item.strip()
# if title already reached but break between title and content not yet found, record title_break_reached
if title_started == True and title_break_reached == False and item == '':
title_break_reached = True
# if title reached and title_break_reached and met_article_start_char == False and item is not empty
# start content
elif title_started == True and title_break_reached == True and met_article_start_char == False:
if item <> '':
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
#if item.startswith(u'\u3010'):
# met_article_start_char = True
# new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith("=@"):
print 'skip movie link'
elif item.startswith("=?"):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
elif item.startswith('=='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[2:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
elif item.startswith('='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[1:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
else:
if next_is_img_txt == False and met_article_start_char == False:
if item <> '':
if title_started == False:
#print 'Title started at ', item
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '<p>\n'
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
new_html = new_raw_html + '</div></body></html>'
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
if __HiResImg__ == True:
# TODO: add a _ in front of an image url
if url.rfind('news.mingpao.com') > -1:
imglist = re.findall('src="?.*?jpg"', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
for img in imglist:
gifimg = img.replace('jpg"', 'gif"')
try:
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
# find the location of the first _
pos = img.find('_')
if pos > -1:
# if found, insert _ after the first _
newimg = img[0:pos] + '_' + img[pos:]
new_html = new_html.replace(img, newimg)
else:
# if not found, insert _ after "
new_html = new_html.replace(img[1:], '"_' + img[1:])
elif url.rfind('life.mingpao.com') > -1:
imglist = re.findall('src=\'?.*?jpg\'', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
#print 'Img list: ', imglist, '\n'
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg\'', 'gif\'')
try:
gifurl = re.sub(r'dailynews.*txt', '', url)
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.rfind('/')
newimg = img[0:pos+1] + '_' + img[pos+1:]
new_html = new_html.replace(img, newimg)
# repeat with src quoted by double quotes, for text parsed from src txt
imglist = re.findall('src="?.*?jpg"', new_html)
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg"', 'gif"')
try:
#print 'url', url
pos = url.rfind('/')
gifurl = url[:pos+1]
#print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.find('"')
newimg = img[0:pos+1] + '_' + img[pos+1:]
#print 'Use hi-res img', newimg
new_html = new_html.replace(img, newimg)
return new_html
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
@ -447,77 +684,153 @@ class MPRecipe(BasicNewsRecipe):
del item['absmiddle']
return soup
def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])
try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return
# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
if __UseChineseTitle__ == True:
if __Region__ == 'Hong Kong':
title = u'\u660e\u5831 (\u9999\u6e2f)'
elif __Region__ == 'Vancouver':
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
elif __Region__ == 'Toronto':
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = self.short_title()
# if not generating a periodical, force date to apply in title
if __MakePeriodical__ == False:
title = self.short_title()
# change 1: allow our own flag to tell if a periodical is to be generated
# also use a custom date instead of the current time
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = title + ' ' + self.get_fetchformatteddate()
if True:
mi = MetaInformation(title, [self.publisher])
mi.publisher = self.publisher
mi.author_sort = self.publisher
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.timestamp = nowf()
mi.timestamp = self.get_dtlocal()
mi.comments = self.description
if not isinstance(mi.comments, unicode):
mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.pubdate = nowf()
mi.pubdate = self.get_dtlocal()
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
# end of change 1
# change 2: __appname__ replaced by newspaper publisher
__appname__ = self.publisher
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
# change 4: in the following, all the nowf() are changed to adjusted time
# This one doesn't matter
mi.timestamp = nowf()
# change 5: skip listing the articles
#article_titles, aseen = [], set()
#for f in feeds:
# for a in f:
# if a.title and a.title not in aseen:
# aseen.add(a.title)
# article_titles.append(force_unicode(a.title, 'utf-8'))
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
#mi.comments = self.description
#if not isinstance(mi.comments, unicode):
# mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
# now appears to need the time field to be later than 12.00 noon as well
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
def feed_index(num, parent):
f = feeds[num]
@ -532,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
desc = None
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po, author=auth, description=desc)
parent.add_item('%sindex.html'%adir, None,
a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
@ -555,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, self.publisher, prefix=prefix,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
@ -578,7 +894,7 @@ class MPRecipe(BasicNewsRecipe):
if not desc:
desc = None
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
f.title, play_order=po, description=desc, author=auth))
f.title, play_order=po, description=desc, author=auth))
else:
entries.append('feed_%d/index.html'%0)
@ -592,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)
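# A minimal standalone sketch (not part of the recipe above) of the hi-res
# fallback in preprocess_raw_html(): the recipe first probes for a .gif
# variant of each image and, when that fetch fails, inserts a '_' before the
# jpg file name. The src value below is hypothetical.
if __name__ == '__main__':
    img = "src='20111218/photo1.jpg'"  # hypothetical src attribute from the page
    pos = img.rfind('/')
    newimg = img[0:pos + 1] + '_' + img[pos + 1:]
    print(newimg)  # src='20111218/_photo1.jpg'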

View File

@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Vancouver'
# Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False".
# please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True
# Turn below to true if your device supports display of CJK titles
# Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False
# Set it to False if you want to skip images
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) It is to disable premium content (Default: False)
__InclPremium__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = True
# (HK only) Turn below to True if you wish hi-res images (Default: False)
__HiResImg__ = False
# Override the date returned by the program if specifying a YYYYMMDD below
__Date__ = ''
'''
Change Log:
2011/12/18: update the overridden create_opf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_opf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does not start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@ -37,30 +60,38 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''
import os, datetime, re
from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang
# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
if __Region__ == 'Hong Kong':
title = 'Ming Pao - Hong Kong'
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
dict(attrs={'class':['heading']}), # for heading from txt
dict(attrs={'id':['newscontent']}), # entertainment and column page content
dict(attrs={'id':['newscontent01','newscontent02']}),
dict(attrs={'class':['content']}), # for content from txt
dict(attrs={'class':['photo']}),
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
dict(attrs={'class':['images']}) # for images from txt
]
if __KeepImages__:
remove_tags = [dict(name='style'),
@ -90,7 +121,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: "</b>")
]
elif __Region__ == 'Vancouver':
title = 'Ming Pao - Vancouver'
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
else:
title = 'Ming Pao - Vancouver'
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
category = 'Chinese, News, Vancouver'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -108,7 +142,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: ''),
]
elif __Region__ == 'Toronto':
title = 'Ming Pao - Toronto'
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = 'Ming Pao - Toronto'
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
category = 'Chinese, News, Toronto'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -139,49 +176,12 @@ class MPRecipe(BasicNewsRecipe):
conversion_options = {'linearize_tables':True}
timefmt = ''
def image_url_processor(cls, baseurl, url):
# trick: break the url at the first occurrence of a digit, add an additional
# '_' at the front
# not working, may need to move this to preprocess_html() method
# minIdx = 10000
# i0 = url.find('0')
# if i0 >= 0 and i0 < minIdx:
# minIdx = i0
# i1 = url.find('1')
# if i1 >= 0 and i1 < minIdx:
# minIdx = i1
# i2 = url.find('2')
# if i2 >= 0 and i2 < minIdx:
# minIdx = i2
# i3 = url.find('3')
        # if i3 >= 0 and i3 < minIdx:
# minIdx = i3
# i4 = url.find('4')
# if i4 >= 0 and i4 < minIdx:
# minIdx = i4
# i5 = url.find('5')
# if i5 >= 0 and i5 < minIdx:
# minIdx = i5
# i6 = url.find('6')
# if i6 >= 0 and i6 < minIdx:
# minIdx = i6
# i7 = url.find('7')
# if i7 >= 0 and i7 < minIdx:
# minIdx = i7
# i8 = url.find('8')
# if i8 >= 0 and i8 < minIdx:
# minIdx = i8
# i9 = url.find('9')
# if i9 >= 0 and i9 < minIdx:
# minIdx = i9
return url
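        # A minimal sketch (illustrative only, not wired in) of what the commented-out
        # search above seems to aim for: find the first digit with a single regex
        # instead of ten find() calls, then prefix it with '_':
        #
        #     m = re.search(r'\d', url)
        #     if m is not None:
        #         url = url[:m.start()] + '_' + url[m.start():]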
def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong':
# convert UTC to local hk time - at HKT 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
# convert UTC to local hk time - at HKT 4.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
elif __Region__ == 'Vancouver':
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@ -193,13 +193,34 @@ class MPRecipe(BasicNewsRecipe):
return dt_local
def get_fetchdate(self):
return self.get_dtlocal().strftime("%Y%m%d")
        if __Date__ != '':
return __Date__
else:
return self.get_dtlocal().strftime("%Y%m%d")
def get_fetchformatteddate(self):
return self.get_dtlocal().strftime("%Y-%m-%d")
        if __Date__ != '':
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
else:
return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self):
        if __Date__ != '':
return __Date__[0:4]
else:
return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self):
        if __Date__ != '':
return __Date__[4:6]
else:
return self.get_dtlocal().strftime("%m")
def get_fetchday(self):
return self.get_dtlocal().strftime("%d")
        if __Date__ != '':
return __Date__[6:8]
else:
return self.get_dtlocal().strftime("%d")
def get_cover_url(self):
if __Region__ == 'Hong Kong':
@ -230,12 +251,23 @@ class MPRecipe(BasicNewsRecipe):
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
articles = self.parse_section2(url, keystr)
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
if __InclPremium__ == True:
articles = self.parse_section2_txt(url, keystr)
else:
articles = self.parse_section2(url, keystr)
if articles:
feeds.append((title, articles))
if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url)
@ -244,15 +276,16 @@ class MPRecipe(BasicNewsRecipe):
else:
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))
# special- editorial
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
if ed_articles:
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
#ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
#if ed_articles:
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@ -263,20 +296,39 @@ class MPRecipe(BasicNewsRecipe):
# special - finance
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
if fin_articles:
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
#fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
#if fin_articles:
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
articles = self.parse_section(url)
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))
# special - entertainment
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
if ent_articles:
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
#ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
#if ent_articles:
# feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@ -284,11 +336,6 @@ class MPRecipe(BasicNewsRecipe):
if articles:
feeds.append((title, articles))
# special- columns
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
if col_articles:
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
elif __Region__ == 'Vancouver':
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@ -332,6 +379,16 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(a)
url = a.get('href', False)
url = 'http://news.mingpao.com/' + dateStr + '/' +url
# replace the url to the print-friendly version
if __ParsePFF__ == True:
                    if url.rfind('Redirect') != -1 and __InclPremium__ == True:
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
url = re.sub('%2F.*%2F', '/', url)
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
url = url.replace('%2Etxt', '_print.htm')
url = url.replace('%5F', '_')
else:
url = url.replace('.htm', '_print.htm')
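                        # e.g. a regular listing link ending in 'ga11a.htm' (hypothetical
                        # name) becomes 'ga11a_print.htm', the print-friendly page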
if url not in included_urls and url.rfind('Redirect') == -1:
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
included_urls.append(url)
@ -340,6 +397,8 @@ class MPRecipe(BasicNewsRecipe):
# parse from life.mingpao.com
def parse_section2(self, url, keystr):
br = mechanize.Browser()
br.set_handle_redirect(False)
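        # with redirect handling off, mechanize raises on the 3xx response that
        # premium articles answer with, so the try/except below can skip them
        # instead of silently following the redirect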
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
@ -350,7 +409,29 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
try:
br.open_novisit(url)
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
except:
print 'skipping a premium article'
current_articles.reverse()
return current_articles
# parse from text file of life.mingpao.com
def parse_section2_txt(self, url, keystr):
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
current_articles = []
included_urls = []
for i in a:
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
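                # e.g. (hypothetical file name) '.../cfm/dailynews3.cfm?File=20120316/nalxx.txt'
                # is fetched as '.../ftp/Life3/20120316/nalxx.txt', the raw text behind the page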
current_articles.reverse()
@ -438,6 +519,162 @@ class MPRecipe(BasicNewsRecipe):
current_articles.reverse()
return current_articles
# preprocess those .txt and javascript based files
def preprocess_raw_html(self, raw_html, url):
new_html = raw_html
        if url.rfind('ftp') != -1 or url.rfind('_print.htm') != -1:
            if url.rfind('_print.htm') != -1:
# javascript based file
splitter = re.compile(r'\n')
new_raw_html = '<html><head><title>Untitled</title></head>'
new_raw_html = new_raw_html + '<body>'
for item in splitter.split(raw_html):
if item.startswith('var heading1 ='):
heading = item.replace('var heading1 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
new_raw_html = new_raw_html + '<div class="heading">' + heading
if item.startswith('var heading2 ='):
heading = item.replace('var heading2 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
                        if heading != '':
new_raw_html = new_raw_html + '<br>' + heading + '</div>'
else:
new_raw_html = new_raw_html + '</div>'
if item.startswith('var content ='):
content = item.replace("var content = ", '')
content = content.replace('\'', '')
content = content.replace(';', '')
new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
if item.startswith('var photocontent ='):
photo = item.replace('var photocontent = \'', '')
photo = photo.replace('\'', '')
photo = photo.replace(';', '')
photo = photo.replace('<tr>', '')
photo = photo.replace('<td>', '')
photo = photo.replace('</tr>', '')
photo = photo.replace('</td>', '<br>')
photo = photo.replace('class="photo"', '')
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
new_html = new_raw_html + '</body></html>'
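                # for illustration, a hypothetical input line
                #     var heading1 = 'Some headline';
                # comes out of the loop above as
                #     <div class="heading">Some headline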
else:
# .txt based file
                splitter = re.compile(r'\n')  # split the raw text on newlines
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_img_txt = False
title_started = False
title_break_reached = False
met_article_start_char = False
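                # layout of the .txt files, as implied by the parsing below: the first
                # non-empty line is the title, a blank line separates title from body,
                # '=?name' references a .gif image, '==name' and '=name' a .jpg,
                # and '=@' marks a movie link that is skipped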
for item in splitter.split(raw_html):
item = item.strip()
# if title already reached but break between title and content not yet found, record title_break_reached
if title_started == True and title_break_reached == False and item == '':
title_break_reached = True
# if title reached and title_break_reached and met_article_start_char == False and item is not empty
# start content
elif title_started == True and title_break_reached == True and met_article_start_char == False:
                        if item != '':
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
#if item.startswith(u'\u3010'):
# met_article_start_char = True
# new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith("=@"):
print 'skip movie link'
elif item.startswith("=?"):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
elif item.startswith('=='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[2:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
elif item.startswith('='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[1:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
else:
if next_is_img_txt == False and met_article_start_char == False:
                            if item != '':
if title_started == False:
#print 'Title started at ', item
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '<p>\n'
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
new_html = new_raw_html + '</div></body></html>'
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
if __HiResImg__ == True:
# TODO: add a _ in front of an image url
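            # the probing below first tries a .gif sibling of each .jpg; if that
            # fails it assumes, based on the site's naming scheme, that a hi-res
            # copy exists under the same name with a '_' prefix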
if url.rfind('news.mingpao.com') > -1:
imglist = re.findall('src="?.*?jpg"', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
for img in imglist:
gifimg = img.replace('jpg"', 'gif"')
try:
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
# find the location of the first _
pos = img.find('_')
if pos > -1:
# if found, insert _ after the first _
newimg = img[0:pos] + '_' + img[pos:]
new_html = new_html.replace(img, newimg)
else:
# if not found, insert _ after "
new_html = new_html.replace(img[1:], '"_' + img[1:])
elif url.rfind('life.mingpao.com') > -1:
imglist = re.findall('src=\'?.*?jpg\'', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
#print 'Img list: ', imglist, '\n'
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg\'', 'gif\'')
try:
gifurl = re.sub(r'dailynews.*txt', '', url)
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.rfind('/')
newimg = img[0:pos+1] + '_' + img[pos+1:]
new_html = new_html.replace(img, newimg)
# repeat with src quoted by double quotes, for text parsed from src txt
imglist = re.findall('src="?.*?jpg"', new_html)
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg"', 'gif"')
try:
#print 'url', url
pos = url.rfind('/')
gifurl = url[:pos+1]
#print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.find('"')
newimg = img[0:pos+1] + '_' + img[pos+1:]
#print 'Use hi-res img', newimg
new_html = new_html.replace(img, newimg)
return new_html
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
@ -447,77 +684,153 @@ class MPRecipe(BasicNewsRecipe):
del item['absmiddle']
return soup
def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])
try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return
# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
if __UseChineseTitle__ == True:
if __Region__ == 'Hong Kong':
title = u'\u660e\u5831 (\u9999\u6e2f)'
elif __Region__ == 'Vancouver':
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
elif __Region__ == 'Toronto':
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = self.short_title()
# if not generating a periodical, force date to apply in title
if __MakePeriodical__ == False:
title = self.short_title()
# change 1: allow our own flag to tell if a periodical is to be generated
# also use customed date instead of current time
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = title + ' ' + self.get_fetchformatteddate()
if True:
mi = MetaInformation(title, [self.publisher])
mi.publisher = self.publisher
mi.author_sort = self.publisher
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.timestamp = nowf()
mi.timestamp = self.get_dtlocal()
mi.comments = self.description
if not isinstance(mi.comments, unicode):
mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.pubdate = nowf()
mi.pubdate = self.get_dtlocal()
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
# end of change 1
# change 2: __appname__ replaced by newspaper publisher
__appname__ = self.publisher
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
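            # e.g. with hypothetical values publication_type = 'newspaper' and
            # short title 'Ming Pao - Hong Kong', this yields
            # 'periodical:newspaper:Ming Pao - Hong Kong'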
# change 4: in the following, all the nowf() are changed to adjusted time
# This one doesn't matter
mi.timestamp = nowf()
# change 5: skip listing the articles
#article_titles, aseen = [], set()
#for f in feeds:
# for a in f:
# if a.title and a.title not in aseen:
# aseen.add(a.title)
# article_titles.append(force_unicode(a.title, 'utf-8'))
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
#mi.comments = self.description
#if not isinstance(mi.comments, unicode):
# mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
# now appears to need the time field to be > 12.00noon as well
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
def feed_index(num, parent):
f = feeds[num]
@ -532,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
desc = None
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po, author=auth, description=desc)
parent.add_item('%sindex.html'%adir, None,
a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
@ -555,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
                        prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, self.publisher, prefix=prefix,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
@ -578,7 +894,7 @@ class MPRecipe(BasicNewsRecipe):
if not desc:
desc = None
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
f.title, play_order=po, description=desc, author=auth))
f.title, play_order=po, description=desc, author=auth))
else:
entries.append('feed_%d/index.html'%0)
@ -592,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)

76
recipes/money_pl.recipe Normal file
View File

@ -0,0 +1,76 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class FocusRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = u'intromatyk <intromatyk@gmail.com>'
language = 'pl'
version = 1
title = u'Money.pl'
category = u'News'
description = u'Informacje finansowe z kraju i ze świata. Aktualne i archiwalne: notowania giełdowe, kursy walut, wskaźniki gospodarcze.'
    remove_empty_feeds = True
    no_stylesheets = True
    oldest_article = 1
    max_articles_per_feed = 100000
    recursions = 0
    remove_javascript = True
simultaneous_downloads = 2
    r = re.compile(r'.*(?P<url>http://(www.money.pl)|(rss.feedsportal.com/c)/.*\.html?).*')
keep_only_tags =[]
keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'artykul'}))
remove_tags = [dict(name='ul', attrs={'class':'socialStuff'})]
extra_css = '''
body {font-family: Arial,Helvetica,sans-serif ;}
h1{text-align: left;}
h2{font-size: medium; font-weight: bold;}
p.lead {font-weight: bold; text-align: left;}
.authordate {font-size: small; color: #696969;}
.fot{font-size: x-small; color: #666666;}
'''
feeds = [
('Wiadomosci z kraju', 'http://money.pl.feedsportal.com/c/33900/f/612847/index.rss'),
('Wiadomosci ze swiata', 'http://money.pl.feedsportal.com/c/33900/f/612848/index.rss'),
('Gospodarka', 'http://money.pl.feedsportal.com/c/33900/f/612849/index.rss'),
('Waluty', 'http://money.pl.feedsportal.com/c/33900/f/612850/index.rss'),
('Gielda', 'http://money.pl.feedsportal.com/c/33900/f/612851/index.rss'),
('Banki', 'http://money.pl.feedsportal.com/c/33900/f/612852/index.rss'),
('Fundusze', 'http://money.pl.feedsportal.com/c/33900/f/612853/index.rss'),
('Emerytury', 'http://money.pl.feedsportal.com/c/33900/f/612854/index.rss'),
('Podatki', 'http://money.pl.feedsportal.com/c/33900/f/612855/index.rss'),
('Ubezpieczenia', 'http://money.pl.feedsportal.com/c/33900/f/612856/index.rss'),
('Poradniki', 'http://money.pl.feedsportal.com/c/33900/f/612857/index.rss'),
('Raporty', 'http://money.pl.feedsportal.com/c/33900/f/612858/index.rss'),
('Motoryzacja', 'http://money.pl.feedsportal.com/c/33900/f/612859/index.rss'),
('Manager', 'http://money.pl.feedsportal.com/c/33900/f/612860/index.rss'),
('Dla firm', 'http://money.pl.feedsportal.com/c/33900/f/612861/index.rss'),
('Prawo', 'http://money.pl.feedsportal.com/c/33900/f/612862/index.rss'),
('Nieruchomosci', 'http://money.pl.feedsportal.com/c/33900/f/612863/index.rss'),
('Praca', 'http://money.pl.feedsportal.com/c/33900/f/612864/index.rss'),
]
def print_version(self, url):
        if url.count('money.pl.feedsportal.com'):
            u = url.find('0Cartykul0C')
            u = 'http://www.m.money.pl/wiadomosci/artykul/' + url[u + 21:]
            u = u.replace('0C', '/')
            u = u.replace('A', '')
            u = u.replace('0E', '-')
            u = u.replace('0P', ';')
            u = u.replace('0H', ',')
            u = u.replace('0B', '.')
            u = u.replace(',0,', ',-1,')
            u = u.replace('0Tutm0Isource0Frss0Gutm0Imedium0Frss0Gutm0Icampaign0Frss/story01.htm', '')
else:
u = url.replace('/nc/1','/do-druku/1')
return u
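    # The replaces above decode feedsportal's two-character escapes: '0C' -> '/',
    # '0E' -> '-', '0P' -> ';', '0H' -> ',', '0B' -> '.', with the 'A' padding
    # stripped so '0A' decodes back to '0'; the long '0Tutm...' replace appears to
    # drop the utm tracking suffix. Non-feedsportal links just switch '/nc/1' to
    # the print path '/do-druku/1'.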

View File

@ -1,9 +1,7 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
moneynews.newsmax.com
www.moneynews.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
@ -12,40 +10,40 @@ class MoneyNews(BasicNewsRecipe):
title = 'Moneynews.com'
__author__ = 'Darko Miletic'
description = 'Financial news worldwide'
publisher = 'moneynews.com'
language = 'en'
publisher = 'Newsmax.com'
language = 'en'
category = 'news, finances, USA, business'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
encoding = 'utf8'
extra_css = 'img{display: block} body{font-family: Arial, Helvetica, sans-serif}'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
, 'linearize_tables' : True
}
feeds = [
(u'Street Talk' , u'http://moneynews.newsmax.com/xml/streettalk.xml' )
,(u'Finance News' , u'http://moneynews.newsmax.com/xml/FinanceNews.xml' )
,(u'Economy' , u'http://moneynews.newsmax.com/xml/economy.xml' )
,(u'Companies' , u'http://moneynews.newsmax.com/xml/companies.xml' )
,(u'Markets' , u'http://moneynews.newsmax.com/xml/Markets.xml' )
,(u'Investing & Analysis' , u'http://moneynews.newsmax.com/xml/investing.xml' )
(u'Street Talk' , u'http://www.moneynews.com/rss/StreetTalk/8.xml' )
,(u'Finance News' , u'http://www.moneynews.com/rss/FinanceNews/4.xml' )
,(u'Economy' , u'http://www.moneynews.com/rss/Economy/2.xml' )
,(u'Companies' , u'http://www.moneynews.com/rss/Companies/6.xml' )
,(u'Markets' , u'http://www.moneynews.com/rss/Markets/7.xml' )
,(u'Investing & Analysis' , u'http://www.moneynews.com/rss/InvestingAnalysis/17.xml')
]
keep_only_tags = [dict(name='table', attrs={'class':'copy'})]
keep_only_tags = [dict(name='div', attrs={'class':'copy'})]
remove_tags = [
dict(name='td' , attrs={'id':'article_fontsize'})
,dict(name='table', attrs={'id':'toolbox' })
,dict(name='tr' , attrs={'id':'noprint3' })
dict(attrs={'class':['MsoNormal', 'MsoNoSpacing']}),
dict(name=['object','link','embed','form','meta'])
]
def print_version(self, url):
nodeid = url.rpartition('/')[2]
return 'http://www.moneynews.com/PrintTemplate?nodeid=' + nodeid

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@ -6,15 +7,72 @@ __license__ = 'GPL v3'
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Montreal Gazette
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following four lines for the Montreal Gazette
title = u'Montreal Gazette'
url_prefix = 'http://www.montrealgazette.com'
description = u'News from Montreal, QC'
fp_tag = 'CAN_MG'
language = 'en_CA'
@ -38,14 +96,81 @@ class CanWestPaper(BasicNewsRecipe):
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
    def preprocess_html(self,soup):
        #delete empty id attributes--they screw up the TOC for unknown reasons
        divtags = soup.findAll('div',attrs={'id':''})
        if divtags:
            for div in divtags:
                del(div['id'])
        return soup
def get_cover_url(self):
from datetime import timedelta, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover
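    # the Newseum front-page image is keyed by day of month; if today's scan is
    # not up yet, the loop above walks back a day at a time and gives up with no
    # cover after a week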
    def fixChars(self,string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91","‘",string)
        # Replace rsquo (\x92)
        fixed = re.sub("\x92","’",fixed)
        # Replace ldquo (\x93)
        fixed = re.sub("\x93","“",fixed)
        # Replace rdquo (\x94)
        fixed = re.sub("\x94","”",fixed)
        # Replace ndash (\x96)
        fixed = re.sub("\x96","–",fixed)
        # Replace mdash (\x97)
        fixed = re.sub("\x97","—",fixed)
        fixed = re.sub("&#x2019;","’",fixed)
        return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&amp;' with '&'
            massaged = re.sub("&amp;","&", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):
if first:
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
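    # anchors wrapping an image are kept (the image is the content); text-only
    # links are replaced by their plain text, presumably to avoid underlined
    # links cluttering the e-book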
def preprocess_html(self, soup):
return self.strip_anchors(soup)
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

View File

@ -0,0 +1,59 @@
from calibre.web.feeds.news import BasicNewsRecipe
class MumbaiMirror(BasicNewsRecipe):
title = u'Mumbai Mirror'
oldest_article = 2
max_articles_per_feed = 100
__author__ = 'Krittika Goyal'
    description = 'Daily newspaper from Mumbai'
language = 'en_IN'
category = 'News, Mumbai, India'
remove_javascript = True
use_embedded_content = False
auto_cleanup = True
no_stylesheets = True
#encoding = 'GB2312'
conversion_options = {'linearize_tables':True}
feeds = [
('Cover Story',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=latest'),
('City Diary',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=citydiary'),
('Columnists',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=mmcolumnists'),
('Mumbai, The City',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=city'),
('Nation',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=nation'),
('Top Stories',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=topstories'),
('Business',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=business'),
('World',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=world'),
        ('Chai Time',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=chaitime'),
('Technology',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=technology'),
('Entertainment',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=entertainment'),
('Style',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=style'),
('Ask the Sexpert',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=askthesexpert'),
('Television',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=television'),
('Lifestyle',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=lifestyle'),
('Sports',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=sports'),
('Travel: Travelers Diary',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=travellersdiaries'),
('Travel: Domestic',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=traveldomestic'),
('Travel: International',
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=travelinternational')
]

141
recipes/mwjournal.recipe Normal file
View File

@ -0,0 +1,141 @@
#!/usr/bin/env python
##
## Title: Microwave Journal
## Contact: Kiavash (use Mobile Read)
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: Kiavash
##
## Written: Jan 2012
## Last Edited: Feb 2012
##
# Feb 2012: New Recipe compatible with the MWJournal 2.0 website
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'Kiavash'
__author__ = 'Kiavash'
'''
microwavejournal.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image
class MWJournal(BasicNewsRecipe):
title = u'Microwave Journal'
description = u'Microwave Journal Monthly Magazine'
publisher = 'Horizon House'
publication_type = 'magazine'
INDEX = 'http://www.microwavejournal.com/publications/'
language = 'en'
timeout = 30
Convert_Grayscale = False # Convert images to gray scale or not
keep_only_tags = [dict(name='div', attrs={'class':'record'})]
no_stylesheets = True
remove_javascript = True
remove_tags = [
dict(name='font', attrs={'class':'footer'}), # remove fonts
]
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
.introduction, .first { font-weight: bold; } \
.cross-head { font-weight: bold; font-size: 125%; } \
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
font-size: 80%; font-style: italic; margin: 1px auto; } \
.story-date, .published { font-size: 80%; } \
table { width: 100%; } \
td img { display: block; margin: 5px auto; } \
ul { padding-top: 10px; } \
ol { padding-top: 10px; } \
li { padding-top: 5px; padding-bottom: 5px; } \
h1 { font-size: 175%; font-weight: bold; } \
h2 { font-size: 150%; font-weight: bold; } \
h3 { font-size: 125%; font-weight: bold; } \
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
# Remove the line breaks, href links and float left/right and picture width/height.
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'<a.*?>'), lambda h1: ''),
(re.compile(r'</a>'), lambda h2: ''),
(re.compile(r'float:.*?'), lambda h3: ''),
(re.compile(r'width:.*?px'), lambda h4: ''),
(re.compile(r'height:.*?px'), lambda h5: '')
]
def print_version(self, url):
return url.replace('/articles/', '/articles/print/')
def parse_index(self):
articles = []
soup = self.index_to_soup(self.INDEX)
ts = soup.find('div', attrs={'class':'box1 article publications-show'})
ds = self.tag_to_string(ts.find('h2'))
self.log('Found Current Issue:', ds)
self.timefmt = ' [%s]'%ds
cover = ts.find('img', src=True)
if cover is not None:
self.cover_url = 'http://www.microwavejournal.com' + cover['src']
self.log('Found Cover image:', self.cover_url)
feeds = []
        seen_titles = set([]) # This is used to remove duplicate articles
sections = soup.find('div', attrs={'class':'box2 publication'})
for section in sections.findAll('div', attrs={'class':'records'}):
section_title = self.tag_to_string(section.find('h3'))
self.log('Found section:', section_title)
articles = []
for post in section.findAll('div', attrs={'class':'record'}):
h = post.find('h2')
title = self.tag_to_string(h)
if title.find('The MWJ Puzzler') >=0: #Let's get rid of the useless Puzzler!
continue
if title in seen_titles:
continue
seen_titles.add(title)
a = post.find('a', href=True)
url = a['href']
if url.startswith('/'):
url = 'http://www.microwavejournal.com'+url
abstract = post.find('div', attrs={'class':'abstract'})
p = abstract.find('p')
desc = None
self.log('\tFound article:', title, 'at', url)
if p is not None:
desc = self.tag_to_string(p)
self.log('\t\t', desc)
articles.append({'title':title, 'url':url, 'description':desc,
'date':self.timefmt})
if articles:
feeds.append((section_title, articles))
return feeds
def postprocess_html(self, soup, first):
if self.Convert_Grayscale:
#process all the images
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
if img < 0:
raise RuntimeError('Out of memory')
img.type = "GrayscaleType"
img.save(iurl)
return soup

View File

@ -1,16 +1,35 @@
__license__ = 'GPL v3'
__copyright__ = '2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
##
## Title: Microwave Journal RSS recipe
## Contact: AprilHare, Darko Miletic <darko.miletic at gmail.com>
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: 2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>
##
## Written: 2008
## Last Edited: Jan 2012
##
'''
01-19-2012: Added GrayScale Image conversion and Duplicate article removal
'''
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = '2008-2012, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
__version__ = 'v0.5.0'
__date__ = '2012-01-19'
__author__ = 'Darko Miletic'
'''
newscientist.com
'''
import re
import urllib
from calibre.utils.magick import Image
from calibre.web.feeds.news import BasicNewsRecipe
class NewScientist(BasicNewsRecipe):
title = 'New Scientist - Online News w. subscription'
__author__ = 'Darko Miletic'
description = 'Science news and science articles from New Scientist.'
language = 'en'
publisher = 'Reed Business Information Ltd.'
@ -39,6 +58,15 @@ class NewScientist(BasicNewsRecipe):
keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]
    # Whether to omit duplicates of articles (typically arising when articles are indexed in
    # more than one section). If True, only the first occurrence will be downloaded.
filterDuplicates = True
# Whether to convert images to grayscale for eInk readers.
Convert_Grayscale = False
    url_list = [] # This list is used to check if an article has already been included.
def get_browser(self):
br = BasicNewsRecipe.get_browser()
br.open('http://www.newscientist.com/')
@ -80,6 +108,10 @@ class NewScientist(BasicNewsRecipe):
return article.get('guid', None)
def print_version(self, url):
if self.filterDuplicates:
if url in self.url_list:
return
self.url_list.append(url)
return url + '?full=true&print=true'
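    # returning nothing for an already-seen URL makes that download fail fast,
    # which appears to be how the recipe drops articles indexed in more than
    # one section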
def preprocess_html(self, soup):
@ -101,3 +133,16 @@ class NewScientist(BasicNewsRecipe):
tg.replaceWith(tstr)
return soup
# Converts images to Gray Scale
def postprocess_html(self, soup, first):
if self.Convert_Grayscale:
#process all the images
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
if img < 0:
raise RuntimeError('Out of memory')
img.type = "GrayscaleType"
img.save(iurl)
return soup

View File

@ -1,14 +1,25 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
##
## Title: New Journal of Physics
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: Chema Cort\xe9s
##
## Written: Jan 2011
## Last Edited: Jan 2012 - by Kiavash
##
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = u'Chema Cort\xe9s - 2011-01-05'
__version__ = 'v0.01'
__date__ = '2011-01-05'
__version__ = 'v0.5.0'
__date__ = '2012-01-13'
'''
njp.org
'''
import re # Import the regular expressions module.
from calibre.web.feeds.news import BasicNewsRecipe
class NewJournalOfPhysics(BasicNewsRecipe):
@ -19,14 +30,60 @@ class NewJournalOfPhysics(BasicNewsRecipe):
category = 'physics, journal, science'
language = 'en'
oldest_article = 30
max_articles_per_feed = 100
keep_only_tags = [dict(id=['fulltextContainer'])]
no_stylesheets=True
use_embedded_content=False
feeds = [(u'Latest Papers', u'http://iopscience.iop.org/1367-2630/?rss=1')]
cover_url = 'http://images.iop.org/journals_icons/Info/1367-2630/cover.gif'
oldest_article = 7
max_articles_per_feed = 30
timeout = 30
no_stylesheets = True
use_embedded_content = False
remove_javascript = True
remove_empty_feeds = True
    asciiize = True # Converts all non-ASCII characters to their ASCII equivalents
keep_only_tags = [
dict(id=['articleEvoContainer']),
]
remove_tags = [
        dict(name='div', attrs={'class':'affiliations'}), # Removes Show Affiliations
        dict(name='div', attrs={'class':'abst-icon-links'}), # Removes Tags and PDF export
        dict(name='p', attrs={'class':'studyimage'}), # remove study image
dict(name='a', attrs={'class':'icon powerpoint'}), # remove Export to PowerPoint Slide
dict(name='a', attrs={'title':'CrossRef'}), # remove CrossRef icon
dict(name='a', attrs={'title':'PubMed'}), # remove PubMed icon
dict(name='a', attrs={'e4f5426941':'true'}), # remove cross ref image
dict(name='img', attrs={'src':''}), # remove empty image
dict(name='a', attrs={'class':'closeChap'}), # remove 'Close'
dict(name='ul', attrs={'class':'breadcrumbs'}), # remove Top breadcrumbs
]
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
.introduction, .first { font-weight: bold; } \
.cross-head { font-weight: bold; font-size: 125%; } \
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
font-size: 80%; font-style: italic; margin: 1px auto; } \
.story-date, .published { font-size: 80%; } \
table { width: 100%; } \
td img { display: block; margin: 5px auto; } \
ul { padding-top: 10px; } \
ol { padding-top: 10px; } \
li { padding-top: 5px; padding-bottom: 5px; } \
h1 { font-size: 175%; font-weight: bold; } \
h2 { font-size: 150%; font-weight: bold; } \
h3 { font-size: 125%; font-weight: bold; } \
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
# Remove the line breaks.
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
]
def print_version(self, url):
return url+"/fulltext"
return url+"/article"

54
recipes/nol.recipe Normal file
View File

@ -0,0 +1,54 @@
################################################################################
#Description: http://nol.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2011.12.18. - V1.1
################################################################################
from calibre.web.feeds.recipes import BasicNewsRecipe
class NOL(BasicNewsRecipe):
title = u'NOL'
__author__ = 'Bigpapa'
oldest_article = 5
    max_articles_per_feed = 5 # Maximum number of articles per feed to store in the generated e-book.
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'utf8'
language = 'hu'
publication_type = 'newsportal'
conversion_options ={
'linearize_tables' : True,
}
keep_only_tags = [
dict(name='table', attrs={'class':['article-box']})
]
remove_tags = [
dict(name='div', attrs={'class':['h','ad-container-outer','tags noborder','ad-container-inner','image-container-lead','tags','related-container']}),
dict(name='h4'),
dict(name='tfoot'),
dict(name='td', attrs={'class':['foot']}),
dict(name='span', attrs={'class':['image-container-caption']}),
]
feeds = [
# (u'V\xe1logat\xe1s', 'http://nol.hu/feed/valogatas.rss'),
(u'Belf\xf6ld', 'http://nol.hu/feed/belfold.rss'),
(u'K\xfclf\xf6ld', 'http://nol.hu/feed/kulfold.rss'),
(u'Gazdas\xe1g', 'http://nol.hu/feed/gazdasag.rss'),
(u'V\xe9lem\xe9ny', 'http://nol.hu/feed/velemeny.rss'),
(u'Kult\xfara', 'http://nol.hu/feed/kult.rss'),
(u'Tud/Tech', 'http://nol.hu/feed/tud-tech.rss'),
(u'Sport', 'http://nol.hu/feed/sport.rss'),
(u'Noller', 'http://nol.hu/feed/noller.rss'),
(u'Mozaik', 'http://nol.hu/feed/mozaik.rss'),
(u'Utaz\xe1s', 'http://nol.hu/feed/utazas.rss'),
(u'Aut\xf3', 'http://nol.hu/feed/auto.rss'),
(u'Voks', 'http://nol.hu/feed/voks.rss'),
]

View File

@ -0,0 +1,100 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
novine.novilist.hr
'''
import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class NoviList_hr(BasicNewsRecipe):
title = 'Novi List'
__author__ = 'Darko Miletic'
description = 'Vijesti iz Hrvatske'
publisher = 'NOVI LIST d.d.'
category = 'Novi list, politika, hrvatski dnevnik, Novine, Hrvatska, Croatia, News, newspaper, Hrvatski,Primorje, dnevni list, Rijeka'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'cp1250'
use_embedded_content = False
language = 'hr'
remove_empty_feeds = True
publication_type = 'newspaper'
needs_subscription = True
masthead_url = 'http://novine.novilist.hr/images/system/novilist-logo.jpg'
index = 'http://novine.novilist.hr/'
extra_css = """
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
body{font-family: Geneva,Arial,Helvetica,Swiss,sans1,sans-serif }
img{display:block; margin-bottom: 0.4em; margin-top: 0.4em}
.nadnaslov,.podnaslov{font-size: small; display: block; margin-bottom: 1em}
.naslov{font-size: x-large; color: maroon; font-weight: bold; display: block; margin-bottom: 1em;}
p{display: block}
"""
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
, 'linearize_tables' : True
}
keep_only_tags = [
dict(name='td', attrs={'class':['nadnaslov', 'naslov', 'podnaslov']}),
dict(name='font', attrs={'face':'Geneva,Arial,Helvetica,Swiss'})
]
remove_tags = [dict(name=['meta', 'link', 'iframe', 'embed', 'object'])]
remove_attributes=['border', 'lang', 'size', 'face', 'bgcolor']
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open(self.index + 'loginnow.asp')
br.select_form(nr=0)
br['username'] = self.username
br['password'] = self.password
br.submit()
return br
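    # the subscription form is the first on loginnow.asp; mechanize fills the
    # username and password fields by name and submits once, before any feed
    # or article is fetched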
def parse_index(self):
articles = []
count = 0
soup = self.index_to_soup(self.index)
#cover url
for alink in soup.findAll('a'):
if alink['href'].startswith('images/clanci/DOC_'):
self.cover_url = self.index + alink['href']
#feeds
for item in soup.findAll('td',attrs={'class':'tocrubrika'}):
count = count +1
if self.test and count > 2:
return articles
aitem = item.a
section = self.tag_to_string(aitem)
feedlink = self.index + aitem['href']
feedpage = self.index_to_soup(feedlink)
self.report_progress(0, _('Fetching feed')+' %s...'%(section))
inarts = []
for alink in feedpage.findAll('a',attrs={'class':'naslovlinkdesno'}):
url = self.index + alink['href']
inarts.append({
'title' :self.tag_to_string(alink)
,'date' :strftime(self.timefmt)
,'url' :url
,'description':''
})
if self.remove_empty_feeds:
if inarts:
articles.append((section,inarts))
else:
articles.append((section,inarts))
return articles
def print_version(self, url):
return url.replace('?WCI=Rubrike&','?WCI=Pretrazivac&')

View File

@ -0,0 +1,49 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.novilist.hr
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class NoviList_Portal_hr(BasicNewsRecipe):
title = 'Novi List - online portal'
__author__ = 'Darko Miletic'
description = 'Portal Novog Lista'
publisher = 'NOVI LIST d.d.'
category = 'Novi list, politika, hrvatski dnevnik, Novine, Hrvatska, Croatia, News, newspaper, Hrvatski,Primorje, dnevni list, Rijeka'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'hr'
publication_type = 'newsportal'
masthead_url = 'http://www.novilist.hr/design/novilist/images/logo-print.gif'
extra_css = """
body{font-family: Geneva,Arial,Helvetica,Swiss,sans-serif }
h1{font-family: Georgia,serif}
img{display:block; margin-bottom: 0.4em; margin-top: 0.4em}
"""
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
, 'linearize_tables' : True
}
keep_only_tags = [dict(name='div', attrs={'id':'content'})]
remove_tags = [dict(name=['meta', 'link', 'iframe', 'embed', 'object'])]
remove_attributes=['border', 'lang']
feeds = [(u'Vijesti', u'http://www.novilist.hr/rss/feed/sve.xml')]
def print_version(self, url):
return url.replace('http://www.novilist.hr/','http://www.novilist.hr/layout/set/print/')

View File

@ -0,0 +1,26 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1329123365(BasicNewsRecipe):
title = u'Novinite.bg'
__author__ = 'M3 Web'
description = 'Real time provider of the latest news from Bulgaria and the world'
category = 'Business, Politics, Society, Sports, Crime, Lifestyle, World, Health'
oldest_article = 7
max_articles_per_feed = 6
language = 'bg'
encoding = 'windows-1251'
no_stylesheets = True
remove_javascript = True
keep_only_tags = [dict(name='div', attrs={'id':'content'})]
    remove_tags = [dict(name='div', attrs={'id':'text_options'}),
                   dict(name='div', attrs={'id':'social_shares_top'})]
remove_tags_after = dict(id='textsize')
feeds = [(u'Business', u'http://novinite.bg/rss.php?category_id=1'),
(u'Politics', u'http://novinite.bg/rss.php?category_id=2'),
(u'Society', u'http://novinite.bg/rss.php?category_id=3'),
(u'Sport', u'http://novinite.bg/rss.php?category_id=4'),
(u'Crime', u'http://novinite.bg/rss.php?category_id=5'),
(u'Lifestyle', u'http://novinite.bg/rss.php?category_id=6'),
(u'Health', u'http://novinite.bg/rss.php?category_id=7'),
(u'Other', u'http://novinite.bg/rss.php?category_id=10'),
(u'World', u'http://novinite.bg/rss.php?category_id=9')]

View File

@ -325,7 +325,8 @@ class NYTimes(BasicNewsRecipe):
'''
def get_the_soup(docEncoding, url_or_raw, raw=False) :
if re.match(r'\w+://', url_or_raw):
f = self.browser.open(url_or_raw)
br = self.clone_browser(self.browser)
f = br.open_novisit(url_or_raw)
_raw = f.read()
f.close()
if not _raw:

View File

@ -364,7 +364,8 @@ class NYTimes(BasicNewsRecipe):
'''
def get_the_soup(docEncoding, url_or_raw, raw=False) :
if re.match(r'\w+://', url_or_raw):
f = self.browser.open(url_or_raw)
br = self.clone_browser(self.browser)
f = br.open_novisit(url_or_raw)
_raw = f.read()
f.close()
if not _raw:

21
recipes/onda_rock.recipe Normal file
View File

@ -0,0 +1,21 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1328535130(BasicNewsRecipe):
title = u'Onda Rock'
__author__ = 'faber1971'
description = 'Italian rock webzine'
language = 'it'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = False
remove_tags = [
dict(name='div', attrs={'id':['boxHeader','boxlinks_med','footer','boxinterviste','box_special_med','boxdiscografia_head','path']}),
dict(name='div', attrs={'align':'left'}),
dict(name='div', attrs={'style':'text-align: center'}),
]
no_stylesheets = True
feeds = [(u'Onda Rock', u'http://www.ondarock.it/feed.php')]
masthead_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/71135_45820579767_4993043_n.jpg'

77
recipes/opinion_bo.recipe Normal file
View File

@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2011, Piet van Oostrum <piet@vanoostrum.org>'
'''
www.opinion.com.bo
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class Opinion_Bol(BasicNewsRecipe):
title = u'Opinión - Bolivia'
__author__ = 'Piet van Oostrum'
description = u'Opinión diario de circulación nacional, Cochabamba, Bolivia'
publisher = 'Coboce Ltda - Editora Opinión'
category = 'news, politics, Bolivia'
version = 1
oldest_article = 1
max_articles_per_feed = 20
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = 'es_BO'
publication_type = 'newspaper'
delay = 1
remove_empty_feeds = True
cover_url = strftime('http://www.opinion.com.bo/opinion/articulos/%Y/%m%d/fotos/portada_650.jpg')
masthead_url = 'http://opinion.com.bo/opinion/articulos/imagenes/logo_opinion.gif'
extra_css = """body{font-family: Helvetica,Arial,sans-serif}
.seccion_encabezado_nota_inte{font-size: 1.1em;
font-weight: bold;}
.autor_nota_inte{color: #999999; font-size: 0.8em;
margin-bottom: 0.5em; text-align: right;}
.pie{font-size: 0.8em;}"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'class':'columna_izq_nota_intererior'})]
remove_tags = [dict(name=['meta','link','form','iframe','embed','object','style']),
dict(name='div', attrs={'class':'ocultar'})]
remove_attributes = ['width','height']
feeds = [
(u'El País' , u'http://www.opinion.com.bo/opinion/rss/el_pais_rss.xml' )
,(u'Cochabamba' , u'http://www.opinion.com.bo/opinion/rss/cochabamba_rss.xml' )
,(u'Economía' , u'http://www.opinion.com.bo/opinion/rss/economia_rss.xml' )
,(u'Cultura' , u'http://www.opinion.com.bo/opinion/rss/cultura_rss.xml' )
,(u'Mundo' , u'http://www.opinion.com.bo/opinion/rss/mundo_rss.xml' )
,(u'Ciencia y Tecnología', u'http://www.opinion.com.bo/opinion/rss/ciencia_tecnologia_rss.xml' )
,(u'Policial' , u'http://www.opinion.com.bo/opinion/rss/policial_rss.xml' )
,(u'Editorial' , u'http://www.opinion.com.bo/opinion/rss/editorial_rss.xml' )
,(u'Subeditorial' , u'http://www.opinion.com.bo/opinion/rss/subeditorial_rss.xml' )
,(u'Opinión' , u'http://www.opinion.com.bo/opinion/rss/opinion_rss.xml' )
,(u'Deportes' , u'http://www.opinion.com.bo/opinion/rss/deportes_rss.xml')
,(u'Vida de hoy' , u'http://www.opinion.com.bo/opinion/rss/vidadehoy_rss.xml' )
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup
# Keep only articles published today
# maybe should take timezone into account
today = strftime('/%Y/%m%d/')
def get_article_url(self, article):
link = article.link
if self.today in link:
return link
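
get_article_url above, together with the today attribute, drops any feed item whose URL lacks today's /YYYY/MMDD/ stamp (the comment flags that the server's timezone is not considered). A standalone sketch of the same filter with a hypothetical link:

# Standalone sketch of the date filter above; the link is hypothetical.
import time

today = time.strftime('/%Y/%m%d/')  # e.g. '/2012/0316/'

def keep_if_today(link):
    return link if today in link else None

print(keep_if_today('http://www.opinion.com.bo' + today + 'noticias/ejemplo.html'))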

View File

@ -0,0 +1,197 @@
# Talking Points is not grabbing everything.
# The look is right, but only the last one added?
import re
import time
from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
# strip ads and graphics
# Current Column lacks a title.
# Talking Points Memo - shorten title - Remove year and Bill's name
# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12
class OReillyPremium(BasicNewsRecipe):
title = u'OReilly Premium'
__author__ = 'TMcN'
language = 'en'
description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
auto_cleanup = True
encoding = 'utf8'
needs_subscription = True
no_stylesheets = True
oldest_article = 20
remove_javascript = True
remove_tags = [dict(name='img', attrs={})]
# Don't recurse into linked pages
recursions = 0
max_articles_per_feed = 2000
debugMessages = True
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
br.select_form(name='login')
br['formEmailField'] = self.username
br['formPasswordField'] = self.password
br.submit()
return br
# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
def extractPrintURL(self, baseURL, pageURL, printString):
tagURL = pageURL
soup = self.index_to_soup(pageURL)
printText = None
if soup :
printText = soup.find('a', text=printString)
else :
print("Failed to find Print string "+printString+ " in "+pageURL)
if printText:
tag = printText.parent
tagURL = baseURL+tag['href']
return tagURL
def stripBadChars(self, inString) :
return inString.replace("\'", "")
def parseGeneric(self, baseURL):
# Does a generic parsing of the articles. There are six categories (0-5)
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
# NoSpin and TV are generic
fullReturn = []
for i in range(len(self.catList)) :
articleList = []
soup = self.index_to_soup(self.catList[i][1])
# Set defaults
description = 'None'
pubdate = time.strftime('%a, %d %b')
# Problem: 0-2 create many in an array
# 3-5 create one.
# So no for-div for 3-5
if i < 3 :
for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
if self.debugMessages :
print(div)
if i == 1:
a = div.find('a', href=True)
else :
a = div
if self.debugMessages :
print(a)
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
if not a:
continue
# url = baseURL+re.sub(r'\?.*', '', a['href'])
url = baseURL+a['href']
if i < 2 :
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = self.tag_to_string(a, use_alt=True).strip()
elif i == 2 :
# Daily Briefs
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = div.contents[0]
if self.debugMessages :
print(title+" @ "+url)
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
elif i == 3 : # Stratfor
a = soup.find('a', self.catList[i][3])
if a is None :
continue
url = baseURL+a['href']
title = self.tag_to_string(a, use_alt=True).strip()
# Get Stratfor contents so we can get the real title.
stratSoup = self.index_to_soup(url)
title = stratSoup.html.head.title.string
stratIndex = title.find('Stratfor.com:', 0)
if (stratIndex > -1) :
title = title[stratIndex+14:-1]
# Look for first blogBody <td class="blogBody"
# Changed 12 Jan 2012 - new page format
#stratBlogTable = stratSoup.find('td', {'class':['blogBody']}).findParent('table')
#stratBody = stratSoup.find('td', {'class':['blogBody']})
elif i == 4 : # Talking Points
topDate = soup.find("td", "blogBody")
if not topDate :
print("Failed to find date in Talking Points")
# This page has the contents in double-wrapped tables!
myTable = topDate.findParents('table')[0]
if myTable is not None:
upOneTable = myTable.findParents('table')[0]
if upOneTable is not None:
upTwo = upOneTable.findParents('table')[0]
if upTwo is None:
continue
# Now navigate rows of upTwo
if self.debugMessages :
print("Entering rows")
for rows in upTwo.findChildren("tr", recursive=False):
# Inside top level table, each row is an article
rowTable = rows.find("table")
articleTable = rowTable.find("table")
# This looks wrong.
articleTable = rows.find("tr")
# The middle table is just for formatting the article buffer... but this means we can skip the inner table.
blogDate = articleTable.find("a","blogDate").contents[0]
# Skip to second blogBody for this.
blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
url = baseURL+re.sub(r'\?.*', '', blogURL)
title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
if self.debugMessages :
print("Talking Points Memo title "+title+" at url: "+url)
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
else : # Current Column
titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
if titleSpan is None :
continue
title = titleSpan.contents[0]
url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
if i == 3 or i == 5 :
if self.debugMessages :
print(self.catList[i][0]+" Title:"+title+" at url: "+url)
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
self.catList[i][3] = articleList
fullReturn.append((self.catList[i][0], articleList))
return fullReturn
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
# 'title' : article title,
# 'url' : URL of print version,
# 'date' : The publication date of the article as a string,
# 'description' : A summary of the article
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
# this is used instead of BasicNewsRecipe.parse_feeds().
def parse_index(self):
# Parse the page into Python Soup
baseURL = "https://www.billoreilly.com"
return self.parseGeneric(baseURL)
def preprocess_html(self, soup):
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
content = refresh.get('content').partition('=')[2]
raw = self.browser.open('https://www.billoreilly.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
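
preprocess_html above follows a meta-refresh redirect by splitting the tag's content attribute at the first '=' and fetching the remainder. A sketch of the extraction step on a canned snippet, using calibre's bundled BeautifulSoup as imported at the top of this recipe:

# Sketch of pulling the redirect target out of a meta-refresh tag.
# The HTML snippet is canned for illustration.
from calibre.ebooks.BeautifulSoup import BeautifulSoup

html = '<html><head><meta http-equiv="refresh" content="0; url=/blog?categoryID=12"></head></html>'
soup = BeautifulSoup(html)
refresh = soup.find('meta', {'http-equiv': 'refresh'})
target = refresh.get('content').partition('=')[2]
print(target)  # -> /blog?categoryID=12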

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@ -6,20 +7,72 @@ __license__ = 'GPL v3'
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Ottawa Citizen
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following four lines for the Ottawa Citizen
title = u'Ottawa Citizen'
url_prefix = 'http://www.ottawacitizen.com'
description = u'News from Ottawa, ON'
fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
@ -43,14 +96,80 @@ class CanWestPaper(BasicNewsRecipe):
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
#delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def get_cover_url(self):
from datetime import timedelta, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","‘",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","’",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","–",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","—",fixed)
fixed = re.sub("&#x2019;","’",fixed)
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&","&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):
if first:
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
def preprocess_html(self, soup):
return self.strip_anchors(soup)
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
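
The get_cover_url logic above asks the Newseum front-page service for today's image and, if that fails, walks back one day at a time for up to a week. A standalone sketch of the fallback loop, with urllib2 standing in for the recipe's browser and CAN_OC taken from the recipe:

# Standalone sketch of the day-by-day cover fallback in get_cover_url above.
# urllib2 is a stand-in for the recipe's mechanize browser.
import urllib2
from datetime import date, timedelta

def find_cover(fp_tag='CAN_OC', max_back=7):
    for back in range(max_back):
        day = (date.today() - timedelta(days=back)).day
        url = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
               + str(day) + '/lg/' + fp_tag + '.jpg')
        try:
            urllib2.urlopen(url)   # raises on 404 etc.
            return url
        except Exception:
            continue
    return None  # cover unavailable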

View File

@ -1,12 +1,10 @@
__license__ = 'GPL v3'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
pagina12.com.ar
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Pagina12(BasicNewsRecipe):
title = 'Pagina - 12'
@ -66,9 +64,7 @@ class Pagina12(BasicNewsRecipe):
return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/')
def get_cover_url(self):
rawc = self.index_to_soup('http://www.pagina12.com.ar/diario/principal/diario/index.html',True)
rawc2 = re.sub(r'PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN','PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"',rawc)
soup = BeautifulSoup(rawc2,fromEncoding=self.encoding,smartQuotesTo=None)
soup = self.index_to_soup('http://www.pagina12.com.ar/diario/principal/diario/index.html')
for image in soup.findAll('img',alt=True):
if image['alt'].startswith('Tapa de la fecha'):
return image['src']

14
recipes/pambianco.recipe Normal file
View File

@ -0,0 +1,14 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1326135591(BasicNewsRecipe):
title = u'Pambianco'
description = 'fashion magazine for professional people'
language = 'it'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Pambianco', u'http://feeds.feedburner.com/pambianconews/YGXu')]
__author__ = 'faber1971'
__version__ = 'v1.0'
__date__ = '9, January 2011'

View File

@ -1,10 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
import os, time
class AdvancedUserRecipe1277129332(BasicNewsRecipe):
title = u'People Daily - China'
title = u'人民日报'
oldest_article = 2
max_articles_per_feed = 100
__author__ = 'rty'
__author__ = 'zzh'
publisher = 'people.com.cn'
description = 'People Daily Newspaper'
@ -14,21 +15,65 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe):
use_embedded_content = False
no_stylesheets = True
encoding = 'GB2312'
language = 'zh'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.people.com.cn/img/2010wb/images/logo.gif'
feeds = [(u'\u56fd\u5185\u65b0\u95fb', u'http://www.people.com.cn/rss/politics.xml'),
(u'\u56fd\u9645\u65b0\u95fb', u'http://www.people.com.cn/rss/world.xml'),
(u'\u7ecf\u6d4e\u65b0\u95fb', u'http://www.people.com.cn/rss/finance.xml'),
(u'\u4f53\u80b2\u65b0\u95fb', u'http://www.people.com.cn/rss/sports.xml'),
(u'\u53f0\u6e7e\u65b0\u95fb', u'http://www.people.com.cn/rss/haixia.xml')]
feeds = [
(u'时政', u'http://www.people.com.cn/rss/politics.xml'),
(u'国际', u'http://www.people.com.cn/rss/world.xml'),
(u'经济', u'http://www.people.com.cn/rss/finance.xml'),
(u'体育', u'http://www.people.com.cn/rss/sports.xml'),
(u'教育', u'http://www.people.com.cn/rss/edu.xml'),
(u'文化', u'http://www.people.com.cn/rss/culture.xml'),
(u'社会', u'http://www.people.com.cn/rss/society.xml'),
(u'传媒', u'http://www.people.com.cn/rss/media.xml'),
(u'娱乐', u'http://www.people.com.cn/rss/ent.xml'),
# (u'汽车', u'http://www.people.com.cn/rss/auto.xml'),
(u'海峡两岸', u'http://www.people.com.cn/rss/haixia.xml'),
# (u'IT频道', u'http://www.people.com.cn/rss/it.xml'),
# (u'环保', u'http://www.people.com.cn/rss/env.xml'),
# (u'科技', u'http://www.people.com.cn/rss/scitech.xml'),
# (u'新农村', u'http://www.people.com.cn/rss/nc.xml'),
# (u'天气频道', u'http://www.people.com.cn/rss/weather.xml'),
(u'生活提示', u'http://www.people.com.cn/rss/life.xml'),
(u'卫生', u'http://www.people.com.cn/rss/medicine.xml'),
# (u'人口', u'http://www.people.com.cn/rss/npmpc.xml'),
# (u'读书', u'http://www.people.com.cn/rss/booker.xml'),
# (u'食品', u'http://www.people.com.cn/rss/shipin.xml'),
# (u'女性新闻', u'http://www.people.com.cn/rss/women.xml'),
# (u'游戏', u'http://www.people.com.cn/rss/game.xml'),
# (u'家电频道', u'http://www.people.com.cn/rss/homea.xml'),
# (u'房产', u'http://www.people.com.cn/rss/house.xml'),
# (u'健康', u'http://www.people.com.cn/rss/health.xml'),
# (u'科学发展观', u'http://www.people.com.cn/rss/kxfz.xml'),
# (u'知识产权', u'http://www.people.com.cn/rss/ip.xml'),
# (u'高层动态', u'http://www.people.com.cn/rss/64094.xml'),
# (u'党的各项工作', u'http://www.people.com.cn/rss/64107.xml'),
# (u'党建聚焦', u'http://www.people.com.cn/rss/64101.xml'),
# (u'机关党建', u'http://www.people.com.cn/rss/117094.xml'),
# (u'事业党建', u'http://www.people.com.cn/rss/117095.xml'),
# (u'国企党建', u'http://www.people.com.cn/rss/117096.xml'),
# (u'非公党建', u'http://www.people.com.cn/rss/117097.xml'),
# (u'社区党建', u'http://www.people.com.cn/rss/117098.xml'),
# (u'高校党建', u'http://www.people.com.cn/rss/117099.xml'),
# (u'农村党建', u'http://www.people.com.cn/rss/117100.xml'),
# (u'军队党建', u'http://www.people.com.cn/rss/117101.xml'),
# (u'时代先锋', u'http://www.people.com.cn/rss/78693.xml'),
# (u'网友声音', u'http://www.people.com.cn/rss/64103.xml'),
# (u'反腐倡廉', u'http://www.people.com.cn/rss/64371.xml'),
# (u'综合报道', u'http://www.people.com.cn/rss/64387.xml'),
# (u'中国人大新闻', u'http://www.people.com.cn/rss/14576.xml'),
# (u'中国政协新闻', u'http://www.people.com.cn/rss/34948.xml'),
]
keep_only_tags = [
dict(name='div', attrs={'class':'left_content'}),
dict(name='div', attrs={'class':'text_c'}),
]
remove_tags = [
dict(name='table', attrs={'class':'title'}),
dict(name='div', attrs={'class':'tools'}),
]
remove_tags_after = [
dict(name='table', attrs={'class':'bianji'}),
dict(name='div', attrs={'id':'p_content'}),
]
def append_page(self, soup, appendtag, position):
@ -36,7 +81,7 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe):
if pager:
nexturl = self.INDEX + pager.a['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'class':'left_content'})
texttag = soup2.find('div', attrs={'class':'text_c'})
#for it in texttag.findAll(style=True):
# del it['style']
newpos = len(texttag.contents)
@ -44,9 +89,15 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe):
texttag.extract()
appendtag.insert(position,texttag)
def skip_ad_pages(self, soup):
if ('advertisement' in soup.find('title').string.lower()):
href = soup.find('a').get('href')
return self.browser.open(href).read().decode('GB2312', 'ignore')
else:
return None
def preprocess_html(self, soup):
mtag = '<meta http-equiv="content-type" content="text/html;charset=GB2312" />\n<meta http-equiv="content-language" content="utf-8" />'
mtag = '<meta http-equiv="content-type" content="text/html;charset=GB2312" />\n<meta http-equiv="content-language" content="GB2312" />'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
@ -55,3 +106,19 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe):
#if pager:
# pager.extract()
return soup
def get_cover_url(self):
cover = None
os.environ['TZ'] = 'Asia/Shanghai'
time.tzset()
year = time.strftime('%Y')
month = time.strftime('%m')
day = time.strftime('%d')
cover = 'http://paper.people.com.cn/rmrb/page/'+year+'-'+month+'/'+day+'/01/RMRB'+year+month+day+'B001_b.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
self.log("\nCover unavailable: " + cover)
cover = None
return cover
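
get_cover_url above pins the process to Shanghai time before formatting the date, since the paper's front page flips on local Chinese time. A standalone sketch of the URL construction, assuming the same paper.people.com.cn path layout:

# Standalone sketch of the dated cover URL built above.
import os, time

os.environ['TZ'] = 'Asia/Shanghai'  # the front page flips on Shanghai time
time.tzset()                        # POSIX only; not available on Windows
y, m, d = time.strftime('%Y'), time.strftime('%m'), time.strftime('%d')
cover = ('http://paper.people.com.cn/rmrb/page/' + y + '-' + m + '/' + d +
         '/01/RMRB' + y + m + d + 'B001_b.jpg')
print(cover)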

View File

@ -1,18 +1,18 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
pescanik.net
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Pescanik(BasicNewsRecipe):
title = 'Pescanik'
title = 'Peščanik'
__author__ = 'Darko Miletic'
description = 'Pescanik'
publisher = 'Pescanik'
description = 'Peščanik je udruženje građana osnovano 2006. godine. Glavni proizvod Peščanika je radio emisija koja je emitovana na Radiju B92 od 02.02.2000. do 16.06.2011, a od septembra 2011. se emituje na osam radio stanica u Srbiji, Crnoj Gori i BiH'
publisher = 'Peščanik'
category = 'news, politics, Serbia'
oldest_article = 10
max_articles_per_feed = 100
@ -21,7 +21,12 @@ class Pescanik(BasicNewsRecipe):
encoding = 'utf-8'
language = 'sr'
publication_type = 'newsportal'
extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body{font-family: Arial,"Lucida Grande",Tahoma,Verdana,sans1,sans-serif} .contentheading{font-size: x-large; font-weight: bold} .small{font-size: small} .createdate{font-size: x-small; font-weight: bold} '
masthead_url = 'http://pescanik.net/wp-content/uploads/2011/10/logo1.png'
extra_css = """
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
body{font-family: Verdana,Arial,Tahoma,sans1,sans-serif}
#BlogTitle{font-size: xx-large; font-weight: bold}
"""
conversion_options = {
'comment' : description
@ -32,29 +37,12 @@ class Pescanik(BasicNewsRecipe):
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_attributes = ['valign','colspan','width','height','align','alt']
remove_tags = [dict(name=['object','link','meta','script'])]
keep_only_tags = [
dict(attrs={'class':['contentheading','small','createdate']})
,dict(name='td', attrs={'valign':'top','colspan':'2'})
]
feeds = [(u'Pescanik Online', u'http://www.pescanik.net/index.php?option=com_rd_rss&id=12')]
remove_tags = [dict(name=['object','link','meta','script','iframe','embed'])]
keep_only_tags = [dict(attrs={'id':['BlogTitle','BlogDate','BlogContent']})]
feeds = [
(u'Autori' , u'http://pescanik.net/category/autori/feed/'),
(u'Prevodi', u'http://pescanik.net/category/prevodi/feed/')
]
def print_version(self, url):
nurl = url.replace('/index.php','/index2.php')
return nurl + '&pop=1&page=0'
def preprocess_html(self, soup):
st = soup.findAll('td')
for it in st:
it.name='p'
for pt in soup.findAll('img'):
brtag = Tag(soup,'br')
brtag2 = Tag(soup,'br')
pt.append(brtag)
pt.append(brtag2)
return soup
return url + 'print/'

View File

@ -33,3 +33,6 @@ class BasicUserRecipe1314970845(BasicNewsRecipe):
(u'Obituaries', u'http://www.philly.com/inquirer_obituaries.rss')
]
def print_version(self, url):
return url + '?viewAll=y'

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__author__ = 'Darko Spasovski'
__license__ = 'GPL v3'
@ -7,7 +8,6 @@ __copyright__ = '2011, Darko Spasovski <darko.spasovski at gmail.com>'
'''
www.plusinfo.mk
'''
from calibre.web.feeds.news import BasicNewsRecipe
class PlusInfo(BasicNewsRecipe):
@ -27,8 +27,11 @@ class PlusInfo(BasicNewsRecipe):
oldest_article = 1
max_articles_per_feed = 100
keep_only_tags = [dict(name='div', attrs={'class': 'vest'})]
remove_tags = [dict(name='div', attrs={'class':['komentari_holder', 'objava']})]
remove_tags = []
remove_tags.append(dict(name='div', attrs={'class':['komentari_holder', 'objava', 'koment']}))
remove_tags.append(dict(name='ul', attrs={'class':['vest_meni']}))
remove_tags.append(dict(name='a', attrs={'name': ['fb_share']}))
keep_only_tags = [dict(name='div', attrs={'class': 'vest1'})]
feeds = [(u'Македонија', u'http://www.plusinfo.mk/rss/makedonija'),
(u'Бизнис', u'http://www.plusinfo.mk/rss/biznis'),

View File

@ -0,0 +1,79 @@
#!/usr/bin/env python
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
__license__ = 'GPL v3'
'''
calibre recipe for prospectmagazine.co.uk (subscription)
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class ProspectMagUK(BasicNewsRecipe):
title = u'Prospect Magazine'
description = 'A general-interest publication offering analysis and commentary about politics, news and business.'
__author__ = 'barty, duluoz'
timefmt = ' [%d %B %Y]'
no_stylesheets = True
publication_type = 'magazine'
masthead_url = 'http://www.prospectmagazine.co.uk/wp-content/themes/prospect/images/titleMain.jpg'
category = 'news, UK'
language = 'en_GB'
max_articles_per_feed = 100
auto_cleanup = True
needs_subscription = True
auto_cleanup_keep = '//div[@class="lead_image"]'
remove_tags = [{'class':['shareinpost','postutils','postinfo']}]
INDEX = 'http://www.prospectmagazine.co.uk/current-issue'
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.prospectmagazine.co.uk/wp-login.php')
br.select_form(name='loginform')
br['log'] = self.username
br['pwd'] = self.password
br.submit()
return br
def parse_index(self):
soup = self.index_to_soup(self.INDEX)
#div = soup.find('h1',text=re.compile(r'Issue \d+'))
#fname = self.tag_to_string( div) if div is not None else 'Current Issue'
div = soup.find('div', id='cover_image')
if div is not None:
img = div.find('img', src=True)
if img is not None:
src = img['src']
if src.startswith('/'):
src = 'http://www.prospectmagazine.co.uk' + src
self.cover_url = src
feeds = []
# loop through sections
for sect in soup.findAll('div',attrs={'class':'sectionheading'}):
fname = self.tag_to_string( sect).replace('>','').strip()
self.log('Found section', fname)
articles = []
# note: can't just find siblings with class='post' because that will also
# grab all the articles belonging to the sections that follow.
for item in sect.findNextSiblings('div',attrs={'class':True}):
if 'post' not in item['class']: break
a = item.find('a', href=True)
if a is None: continue
url = a['href']
title = self.tag_to_string(a)
p = item.find('p')
desc = self.tag_to_string( p) if p is not None else ''
art = {'title':title, 'description':desc,'date':' ', 'url':url}
p = item.find(attrs={'class':re.compile('author')})
self.log('\tFound article:', title, '::', url)
if p is not None:
art['author'] = self.tag_to_string( p).strip()
articles.append(art)
feeds.append((fname, articles))
return feeds
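
As the comment in parse_index notes, a flat search for class='post' would pull in articles from every later section, so the loop walks the heading's following siblings and stops at the first div whose class lacks 'post'. A canned-HTML sketch of that stop condition, again with calibre's bundled BeautifulSoup:

# Sketch of the sibling walk above on canned, illustrative markup.
from calibre.ebooks.BeautifulSoup import BeautifulSoup

html = ('<div class="sectionheading">Politics</div>'
        '<div class="post">A</div><div class="post">B</div>'
        '<div class="sectionheading">Arts</div>'
        '<div class="post">C</div>')
soup = BeautifulSoup(html)
sect = soup.find('div', attrs={'class': 'sectionheading'})
posts = []
for item in sect.findNextSiblings('div', attrs={'class': True}):
    if 'post' not in item['class']:
        break                       # reached the next section heading
    posts.append(item.string)
print(posts)  # -> [u'A', u'B']; C belongs to the next section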

View File

@ -1,30 +1,36 @@
"""
readitlaterlist.com
"""
__license__ = 'GPL v3'
__copyright__ = '''
2010, Darko Miletic <darko.miletic at gmail.com>
2011, Przemyslaw Kryger <pkryger at gmail.com>
'''
'''
readitlaterlist.com
2012, tBunnyMan <Wag That Tail At Me dot com>
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class Readitlater(BasicNewsRecipe):
title = 'Read It Later'
__author__ = 'Darko Miletic, Przemyslaw Kryger'
description = '''Personalized news feeds. Go to readitlaterlist.com to
setup up your news. Fill in your account
username, and optionally you can add password.'''
publisher = 'readitlater.com'
title = 'ReadItLater'
__author__ = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan'
description = '''Personalized news feeds. Go to readitlaterlist.com to set \
up your news. This version displays pages of articles from \
oldest to newest, with max & minimum counts, and marks articles \
read after downloading.'''
publisher = 'readitlaterlist.com'
category = 'news, custom'
oldest_article = 7
max_articles_per_feed = 100
max_articles_per_feed = 50
minimum_articles = 1
no_stylesheets = True
use_embedded_content = False
needs_subscription = True
INDEX = u'http://readitlaterlist.com'
LOGIN = INDEX + u'/l'
readList = []
def get_browser(self):
br = BasicNewsRecipe.get_browser()
@ -33,41 +39,46 @@ class Readitlater(BasicNewsRecipe):
br.select_form(nr=0)
br['feed_id'] = self.username
if self.password is not None:
br['password'] = self.password
br['password'] = self.password
br.submit()
return br
def get_feeds(self):
self.report_progress(0, ('Fetching list of feeds...'))
self.report_progress(0, ('Fetching list of pages...'))
lfeeds = []
i = 1
feedurl = self.INDEX + u'/unread/1'
while True:
title = u'Unread articles, page ' + str(i)
lfeeds.append((title, feedurl))
self.report_progress(0, ('Got ') + str(i) + (' feeds'))
lfeeds.insert(0, (title, feedurl))
self.report_progress(0, ('Got ') + str(i) + (' pages'))
i += 1
soup = self.index_to_soup(feedurl)
ritem = soup.find('a',attrs={'id':'next', 'class':'active'})
ritem = soup.find('a', attrs={'id':'next', 'class':'active'})
if ritem is None:
break
feedurl = self.INDEX + ritem['href']
if self.test:
return lfeeds[:2]
return lfeeds
def parse_index(self):
totalfeeds = []
articlesToGrab = self.max_articles_per_feed
lfeeds = self.get_feeds()
for feedobj in lfeeds:
if articlesToGrab < 1:
break
feedtitle, feedurl = feedobj
self.report_progress(0, ('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
ritem = soup.find('ul',attrs={'id':'list'})
for item in ritem.findAll('li'):
ritem = soup.find('ul', attrs={'id':'list'})
for item in reversed(ritem.findAll('li')):
if articlesToGrab < 1:
break
else:
articlesToGrab -= 1
description = ''
atag = item.find('a',attrs={'class':'text'})
atag = item.find('a', attrs={'class':'text'})
if atag and atag.has_key('href'):
url = self.INDEX + atag['href']
title = self.tag_to_string(item.div)
@ -78,6 +89,20 @@ class Readitlater(BasicNewsRecipe):
,'url' :url
,'description':description
})
readLink = item.find('a', attrs={'class':'check'})['href']
self.readList.append(readLink)
totalfeeds.append((feedtitle, articles))
if len(self.readList) < self.minimum_articles:
raise Exception("Not enough articles in RIL! Change minimum_articles or add more.")
return totalfeeds
def mark_as_read(self, markList):
br = self.get_browser()
for link in markList:
url = self.INDEX + link
br.open(url)
def cleanup(self):
self.mark_as_read(self.readList)
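
get_feeds above discovers pages by following the a#next link until it disappears, and inserts each page at the front of the list so the oldest saved articles are processed first. A compact sketch of that loop shape; fetch() is a hypothetical stand-in for self.index_to_soup():

# Sketch of the pagination loop in get_feeds above.
# fetch() is a hypothetical stand-in for self.index_to_soup().
def collect_pages(fetch, index='http://readitlaterlist.com'):
    pages = []
    url, i = index + '/unread/1', 1
    while True:
        pages.insert(0, ('Unread articles, page %d' % i, url))
        soup = fetch(url)
        nxt = soup.find('a', attrs={'id': 'next', 'class': 'active'})
        if nxt is None:
            break
        url = index + nxt['href']
        i += 1
    return pages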

Some files were not shown because too many files have changed in this diff