sync to John's branch
 Changelog.old.yaml | 4210
 Changelog.yaml     | 4769
recipes/al_masry_al_youm.recipe | 50 (new file)
@@ -0,0 +1,50 @@
+__license__ = 'GPL v3'
+__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
+'''
+abc.net.au/news
+'''
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class TheDailyNewsEG(BasicNewsRecipe):
+    title = u'al-masry al-youm'
+    __author__ = 'Omm Mishmishah'
+    description = 'Independent News from Egypt'
+    masthead_url = 'http://www.almasryalyoum.com/sites/default/files/img/english_logo.png'
+    cover_url = 'http://www.almasryalyoum.com/sites/default/files/img/english_logo.png'
+
+    auto_cleanup = True
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = False
+    #delay = 1
+    use_embedded_content = False
+    encoding = 'utf8'
+    publisher = 'Independent News Egypt'
+    category = 'News, Egypt, World'
+    language = 'en_EG'
+    publication_type = 'newsportal'
+    # preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
+    #Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google)
+    preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
+    conversion_options = {
+        'comments' : description
+        ,'tags' : category
+        ,'language' : language
+        ,'publisher' : publisher
+        ,'linearize_tables': False
+    }
+
+    keep_only_tags = [dict(attrs={'class':['article section']})]
+
+    remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
+        'inline-content story left', 'inline-content map left contracted', 'published',
+        'story-map', 'statepromo', 'topics', ]})]
+
+    remove_attributes = ['width','height']
+
+    feeds = [(u'English News', u'http://www.almasryalyoum.com/en/rss_feed_term/113/rss.xml'),
+             (u'News Features', u'http://www.almasryalyoum.com/en/rss_feed_term/115/rss.xml'),
+             (u'Culture', u'http://www.almasryalyoum.com/en/rss_feed_term/133/rss.xml'),
+             (u'Cinema', u'http://www.almasryalyoum.com/en/rss_feed_term/134/rss.xml')
+    ]
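Note on the recipe above: preprocess_regexps entries are (compiled pattern, callback) pairs that calibre runs over each article's raw HTML, re.sub-style, before parsing. A minimal standalone sketch of that mechanism using the map-link rule above (the sample HTML is invented):

    import re

    rules = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>',
                         re.DOTALL), lambda m: '')]
    html = '<p>Story text</p><a class="inline-caption" href="http://maps.google.com/x">map</a>'
    for pattern, callback in rules:
        html = pattern.sub(callback, html)  # callback receives the match object
    print(html)  # -> '<p>Story text</p>'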
recipes/albert_mohler.recipe | 18 (new file)
@@ -0,0 +1,18 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AlbertMohlersBlog(BasicNewsRecipe):
+    title = u'Albert Mohler\'s Blog'
+    __author__ = 'Peter Grungi'
+    language = 'en'
+    oldest_article = 90
+    max_articles_per_feed = 10
+    auto_cleanup = True
+    cover_url = 'http://www.albertmohler.com/wp-content/themes/albert-mohler-v5/img/logo-am-lg.gif'
+    publisher = 'Albert Mohler'
+    language = 'en'
+    author = 'Albert Mohler'
+
+    feeds = [(u'Albert Mohler\'s Blog', u'http://feeds.feedburner.com/AlbertMohlersBlog?format=xml')]
@@ -36,3 +36,5 @@ class Alternet(BasicNewsRecipe):
         self.temp_files[-1].write(html)
         self.temp_files[-1].close()
         return self.temp_files[-1].name
+
+    conversion_options = {'linearize_tables': True}
@@ -11,7 +11,6 @@ class AssociatedPress(BasicNewsRecipe):
     language = 'en'
     no_stylesheets = True
     max_articles_per_feed = 15
-    html2lrf_options = ['--force-page-break-before-tag="chapter"']


     preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
recipes/asianreviewofbooks.recipe | 51 (new file)
@@ -0,0 +1,51 @@
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
+'''
+www.asianreviewofbooks.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AsianReviewOfBooks(BasicNewsRecipe):
+    title = 'The Asian Review of Books'
+    __author__ = 'Darko Miletic'
+    description = 'In addition to reviewing books about or of relevance to Asia, the Asian Review of Books also features long-format essays by leading Asian writers and thinkers, to providing an unparalleled forum for discussion of key contemporary issues by Asians for Asia and a vehicle of intellectual depth and breadth where leading thinkers can write on the books, arts and ideas of the day. Widely quoted and referenced, with an archive of more than one thousand book reviews, it is the only web resource dedicated to Asian books. And now, with the addition of the new premium content, the Asian Review of Books, is a must-read publication.'
+    publisher = 'The Asian Review of Books'
+    category = 'literature, books, reviews, Asia'
+    oldest_article = 30
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    encoding = 'cp1252'
+    language = 'en_HK'
+    publication_type = 'magazine'
+    masthead_url = 'http://www.asianreviewofbooks.com/new/images/mob_arb.png'
+    extra_css = """
+        body{font-family: serif}
+        .big {font-size: xx-large}
+        .bold {font-weight: bold}
+        .italic {font-style: italic}
+        .small {font-size: small}
+        img {display: block}
+    """
+
+    conversion_options = {
+        'comment' : description
+        , 'tags' : category
+        , 'publisher' : publisher
+        , 'language' : language
+    }
+
+
+    remove_tags = [dict(name=['object','script','iframe','embed'])]
+    remove_attributes = ['style', 'onclick']
+    feeds = [(u'Articles' , u'http://www.asianreviewofbooks.com/new/rss.php')]
+
+    def print_version(self, url):
+        root, sep, artid = url.rpartition('?ID=')
+        return root + 'getarticle.php?articleID=' + artid + '&stats=web'
+
+    def preprocess_raw_html(self, raw, url):
+        return '<html><head><title>title</title></head><body>' + raw + '</body></html>'
+
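Note: print_version in the recipe above maps an article URL onto the site's print endpoint with str.rpartition. Worked through on a hypothetical article ID:

    url = 'http://www.asianreviewofbooks.com/new/?ID=1234'  # hypothetical ID
    root, sep, artid = url.rpartition('?ID=')
    # root = 'http://www.asianreviewofbooks.com/new/', artid = '1234'
    print(root + 'getarticle.php?articleID=' + artid + '&stats=web')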
recipes/beppe_grillo.recipe | 16 (new file)
@@ -0,0 +1,16 @@
+__license__ = 'GPL v3'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1327747616(BasicNewsRecipe):
+    title = u'Beppe Grillo'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    auto_cleanup = True
+
+    feeds = [(u'Beppe Grillo', u'http://feeds.feedburner.com/beppegrillo/atom')]
+    description = 'Blog of the famous comedian and politician Beppe Grillo - v1.00 (28, January 2012)'
+    __author__ = 'faber1971'
+
+    language = 'it'
+
recipes/birmingham_post.recipe | 44 (new file)
@@ -0,0 +1,44 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+class AdvancedUserRecipe1306097511(BasicNewsRecipe):
+    title = u'Birmingham post'
+    description = 'News for Birmingham UK'
+    timefmt = ''
+    __author__ = 'Dave Asbury'
+    cover_url = 'http://1.bp.blogspot.com/_GwWyq5eGw9M/S9BHPHxW55I/AAAAAAAAB6Q/iGCWl0egGzg/s320/Birmingham+post+Lite+front.JPG'
+    oldest_article = 1
+    max_articles_per_feed = 20
+    remove_empty_feeds = True
+    remove_javascript = True
+    auto_cleanup = True
+    language = 'en_GB'
+
+
+    masthead_url = 'http://www.pressgazette.co.uk/Pictures/web/t/c/g/birmingham_post.jpg'
+
+
+    keep_only_tags = [
+        #dict(name='h1',attrs={'id' : 'article-headline'}),
+        #dict(attrs={'class':['article-meta-author','article-meta-date','article main','art-o art-align-center otm-1 ']}),
+        #dict(name='p')
+        #dict(attrs={'id' : 'three-col'})
+    ]
+    remove_tags = [
+        # dict(name='div',attrs={'class' : 'span-33 last header-links'})
+
+    ]
+    feeds = [
+        #(u'News',u'http://www.birminghampost.net/news/rss.xml'),
+        (u'Local News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'),
+        (u'UK News', u'http://www.birminghampost.net/news/uk-news/rss.xml'),
+        (u'Sports',u'http://www.birminghampost.net/midlands-birmingham-sport/rss.xml'),
+        (u'Bloggs & Comments',u'http://www.birminghampost.net/comment/rss.xml')
+
+    ]
+    extra_css = '''
+        body {font: sans-serif medium;}'
+        h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
+        h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
+        span{ font-size:9.5px; font-weight:bold;font-style:italic}
+        p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
+
+    '''
@@ -1,6 +1,6 @@

 __license__ = 'GPL v3'
-__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
 '''
 blic.rs
 '''
@@ -73,7 +73,10 @@ class Blic(BasicNewsRecipe):
     def print_version(self, url):
         return url + '/print'

-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+    def get_cover_url(self):
+        soup = self.index_to_soup('http://www.blic.rs/')
+        alink = soup.find('a', attrs={'id':'blic_naslovna_print'})
+        if alink:
+            return 'http://www.blic.rs' + alink['href']
+        return None
+
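Note: the new get_cover_url scrapes the Blic homepage for the print-edition cover link and returns None when it is absent, so the download proceeds without a cover. A rough standalone sketch of the same lookup, with bs4 standing in for calibre's index_to_soup and an invented HTML snippet:

    from bs4 import BeautifulSoup

    html = '<a id="blic_naslovna_print" href="/pdf/naslovna.jpg">PDF</a>'  # hypothetical
    soup = BeautifulSoup(html, 'html.parser')
    alink = soup.find('a', attrs={'id': 'blic_naslovna_print'})
    print('http://www.blic.rs' + alink['href'] if alink else None)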
@@ -1,95 +0,0 @@
-#!/usr/bin/env python
-
-__license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
-
-'''
-borba.rs
-'''
-
-import re
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class Borba(BasicNewsRecipe):
-    title = 'Borba Online'
-    __author__ = 'Darko Miletic'
-    description = 'Dnevne novine Borba Online'
-    publisher = 'IP Novine Borba'
-    category = 'news, politics, Serbia'
-    language = 'sr'
-
-    lang = _('sr-Latn-RS')
-    oldest_article = 2
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    encoding = 'utf-8'
-    use_embedded_content = False
-    cover_url = 'http://www.borba.rs/images/stories/novine/naslovna_v.jpg'
-    INDEX = u'http://www.borba.rs/'
-    extra_css = ' @font-face {font-family: "serif1"; src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .contentheading{font-size: x-large; font-weight: bold} .createdate{font-size: small; font-weight: bold} '
-
-    conversion_options = {
-        'comment' : description
-        , 'tags' : category
-        , 'publisher' : publisher
-        , 'language' : lang
-        , 'pretty_print' : True
-    }
-
-    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
-
-    keep_only_tags = [dict(name='div', attrs={'class':'main'})]
-
-    remove_tags_after = dict(name='div',attrs={'id':'written_comments_title'})
-
-    remove_tags = [
-        dict(name=['object','link','iframe','base','img'])
-        ,dict(name='div',attrs={'id':'written_comments_title'})
-    ]
-
-    feeds = [
-        (u'Najnovije vesti', u'http://www.borba.rs/content/blogsection/28/105/')
-        ,(u'Prvi plan' , u'http://www.borba.rs/content/blogsection/4/92/' )
-        ,(u'Dogadjaji' , u'http://www.borba.rs/content/blogsection/21/83/' )
-        ,(u'Ekonomija' , u'http://www.borba.rs/content/blogsection/5/35/' )
-        ,(u'Komentari' , u'http://www.borba.rs/content/blogsection/23/94/' )
-        ,(u'Svet' , u'http://www.borba.rs/content/blogsection/7/36/' )
-        ,(u'Sport' , u'http://www.borba.rs/content/blogsection/6/37/' )
-        ,(u'Fama' , u'http://www.borba.rs/content/blogsection/25/89/' )
-        ,(u'B2 Dodatak' , u'http://www.borba.rs/content/blogsection/30/116/')
-    ]
-
-    def preprocess_html(self, soup):
-        attribs = [ 'style','font','valign'
-                    ,'colspan','width','height'
-                    ,'rowspan','summary','align'
-                    ,'cellspacing','cellpadding'
-                    ,'frames','rules','border'
-                  ]
-        for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
-            item.name = 'div'
-            for attrib in attribs:
-                if item.has_key(attrib):
-                    del item[attrib]
-        return soup
-
-    def parse_index(self):
-        totalfeeds = []
-        lfeeds = self.get_feeds()
-        for feedobj in lfeeds:
-            feedtitle, feedurl = feedobj
-            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
-            articles = []
-            soup = self.index_to_soup(feedurl)
-            for item in soup.findAll('a', attrs={'class':'contentpagetitle'}):
-                url = item['href']
-                title = self.tag_to_string(item)
-                articles.append({
-                    'title' :title
-                    ,'date' :''
-                    ,'url' :url
-                    ,'description':''
-                })
-            totalfeeds.append((feedtitle, articles))
-        return totalfeeds
-
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-

 __license__ = 'GPL v3'

@@ -6,45 +7,76 @@ __license__ = 'GPL v3'
 www.canada.com
 '''

-from calibre.web.feeds.recipes import BasicNewsRecipe
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup


 class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following three lines for the Calgary Herald
+    # un-comment the following four lines for the Victoria Times Colonist
+##    title = u'Victoria Times Colonist'
+##    url_prefix = 'http://www.timescolonist.com'
+##    description = u'News from Victoria, BC'
+##    fp_tag = 'CAN_TC'
+
+    # un-comment the following four lines for the Vancouver Province
+##    title = u'Vancouver Province'
+##    url_prefix = 'http://www.theprovince.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VP'
+
+    # un-comment the following four lines for the Vancouver Sun
+##    title = u'Vancouver Sun'
+##    url_prefix = 'http://www.vancouversun.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VS'
+
+    # un-comment the following four lines for the Edmonton Journal
+##    title = u'Edmonton Journal'
+##    url_prefix = 'http://www.edmontonjournal.com'
+##    description = u'News from Edmonton, AB'
+##    fp_tag = 'CAN_EJ'
+
+    # un-comment the following four lines for the Calgary Herald
     title = u'Calgary Herald'
     url_prefix = 'http://www.calgaryherald.com'
     description = u'News from Calgary, AB'
+    fp_tag = 'CAN_CH'

-    # un-comment the following three lines for the Regina Leader-Post
-    #title = u'Regina Leader-Post'
-    #url_prefix = 'http://www.leaderpost.com'
-    #description = u'News from Regina, SK'
+    # un-comment the following four lines for the Regina Leader-Post
+##    title = u'Regina Leader-Post'
+##    url_prefix = 'http://www.leaderpost.com'
+##    description = u'News from Regina, SK'
+##    fp_tag = ''

-    # un-comment the following three lines for the Saskatoon Star-Phoenix
-    #title = u'Saskatoon Star-Phoenix'
-    #url_prefix = 'http://www.thestarphoenix.com'
-    #description = u'News from Saskatoon, SK'
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
+##    title = u'Saskatoon Star-Phoenix'
+##    url_prefix = 'http://www.thestarphoenix.com'
+##    description = u'News from Saskatoon, SK'
+##    fp_tag = ''

-    # un-comment the following three lines for the Windsor Star
-    #title = u'Windsor Star'
-    #url_prefix = 'http://www.windsorstar.com'
-    #description = u'News from Windsor, ON'
+    # un-comment the following four lines for the Windsor Star
+##    title = u'Windsor Star'
+##    url_prefix = 'http://www.windsorstar.com'
+##    description = u'News from Windsor, ON'
+##    fp_tag = 'CAN_'

-    # un-comment the following three lines for the Ottawa Citizen
-    #title = u'Ottawa Citizen'
-    #url_prefix = 'http://www.ottawacitizen.com'
-    #description = u'News from Ottawa, ON'
+    # un-comment the following four lines for the Ottawa Citizen
+##    title = u'Ottawa Citizen'
+##    url_prefix = 'http://www.ottawacitizen.com'
+##    description = u'News from Ottawa, ON'
+##    fp_tag = 'CAN_OC'

-    # un-comment the following three lines for the Montreal Gazette
-    #title = u'Montreal Gazette'
-    #url_prefix = 'http://www.montrealgazette.com'
-    #description = u'News from Montreal, QC'
+    # un-comment the following four lines for the Montreal Gazette
+##    title = u'Montreal Gazette'
+##    url_prefix = 'http://www.montrealgazette.com'
+##    description = u'News from Montreal, QC'
+##    fp_tag = 'CAN_MG'


     language = 'en_CA'
     __author__ = 'Nick Redding'
-    encoding = 'latin1'
     no_stylesheets = True
     timefmt = ' [%b %d]'
     extra_css = '''
@@ -64,14 +96,80 @@ class CanWestPaper(BasicNewsRecipe):
         dict(name='div', attrs={'class':'rule_grey_solid'}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

-    def preprocess_html(self,soup):
-        #delete iempty id attributes--they screw up the TOC for unknow reasons
-        divtags = soup.findAll('div',attrs={'id':''})
-        if divtags:
-            for div in divtags:
-                del(div['id'])
+    def get_cover_url(self):
+        from datetime import timedelta, date
+        if self.fp_tag=='':
+            return None
+        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
+        br = BasicNewsRecipe.get_browser()
+        daysback=1
+        try:
+            br.open(cover)
+        except:
+            while daysback<7:
+                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
+                br = BasicNewsRecipe.get_browser()
+                try:
+                    br.open(cover)
+                except:
+                    daysback = daysback+1
+                    continue
+                break
+        if daysback==7:
+            self.log("\nCover unavailable")
+            cover = None
+        return cover
+
+    def fixChars(self,string):
+        # Replace lsquo (\x91)
+        fixed = re.sub("\x91","‘",string)
+        # Replace rsquo (\x92)
+        fixed = re.sub("\x92","’",fixed)
+        # Replace ldquo (\x93)
+        fixed = re.sub("\x93","“",fixed)
+        # Replace rdquo (\x94)
+        fixed = re.sub("\x94","”",fixed)
+        # Replace ndash (\x96)
+        fixed = re.sub("\x96","–",fixed)
+        # Replace mdash (\x97)
+        fixed = re.sub("\x97","—",fixed)
+        fixed = re.sub("â€™","’",fixed)
+        return fixed
+
+    def massageNCXText(self, description):
+        # Kindle TOC descriptions won't render certain characters
+        if description:
+            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+            # Replace '&amp;' with '&'
+            massaged = re.sub("&amp;","&", massaged)
+            return self.fixChars(massaged)
+        else:
+            return description
+
+    def populate_article_metadata(self, article, soup, first):
+        if first:
+            picdiv = soup.find('body').find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+            xtitle = article.text_summary.strip()
+            if len(xtitle) == 0:
+                desc = soup.find('meta',attrs={'property':'og:description'})
+                if desc is not None:
+                    article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
         return soup
+
+    def preprocess_html(self, soup):
+        return self.strip_anchors(soup)
+
+

     def parse_index(self):
         soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
@@ -98,8 +196,6 @@ class CanWestPaper(BasicNewsRecipe):
             atag = h1tag.find('a',href=True)
             if not atag:
                 continue
-            url = atag['href']
-            if not url.startswith('http:'):
             url = self.url_prefix+'/news/todays-paper/'+atag['href']
             #self.log("Section %s" % key)
             #self.log("url %s" % url)
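Note: get_cover_url above assumes the Newseum serves each paper's front page at a URL keyed by day-of-month plus the recipe's fp_tag, and walks back up to seven days when today's image fails to open. The URL scheme it depends on, sketched:

    from datetime import date, timedelta

    fp_tag = 'CAN_CH'  # Calgary Herald, per the recipe above
    for daysback in range(7):
        day = (date.today() - timedelta(days=daysback)).day
        print('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
              + str(day) + '/lg/' + fp_tag + '.jpg')  # first URL that opens wins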
recipes/catholic_daily_readings.recipe | 11 (new file)
@@ -0,0 +1,11 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class BasicUserRecipe1328971305(BasicNewsRecipe):
+    title = u'Catholic Daily Readings'
+    language = 'en'
+    __author__ = 'adoucette'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    auto_cleanup = True
+
+    feeds = [(u'Daily Readings - USCCB', u'http://www.usccb.org/bible/readings/rss/'), (u'Daily Reflection - One Bread One Body', u'http://www.presentationministries.com/general/rss.asp'), (u'Mass Readings - Universalis', u'http://www.universalis.com/atommass3.xml'), (u'Saint Of The Day - CNA', u'http://feeds.feedburner.com/catholicnewsagency/saintoftheday')]
@@ -77,8 +77,18 @@ class ChicagoTribune(BasicNewsRecipe):


     def get_article_url(self, article):
-        print article.get('feedburner_origlink', article.get('guid', article.get('link')))
-        return article.get('feedburner_origlink', article.get('guid', article.get('link')))
+        url = article.get('feedburner_origlink', article.get('guid', article.get('link')))
+        if url.endswith('?track=rss'):
+            url = url.partition('?')[0]
+        return url
+
+    def skip_ad_pages(self, soup):
+        text = soup.find(text='click here to continue to article')
+        if text:
+            a = text.parent
+            url = a.get('href')
+            if url:
+                return self.index_to_soup(url, raw=True)
+
     def postprocess_html(self, soup, first_fetch):
         # Remove the navigation bar. It was kept until now to be able to follow
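Note: skip_ad_pages is a calibre hook; returning markup replaces the fetched page, returning None keeps it. The version added above detects the Tribune's interstitial by its link text and re-fetches the real article, with raw=True asking index_to_soup for unparsed HTML. The shape of the hook, as a sketch:

    def skip_ad_pages(self, soup):
        text = soup.find(text='click here to continue to article')
        if text is not None and text.parent.get('href'):
            return self.index_to_soup(text.parent.get('href'), raw=True)
        return None  # a normal article page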
@@ -1,38 +1,89 @@
+#!/usr/bin/env python
+##
+## Title: Common Dreams
+##
+## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
+
+# Feb 2012: Cleaned up the output to have only the main article
+
+__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
+'''
+commondreams.org
+'''
+
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class CommonDreams(BasicNewsRecipe):
     # Identify the recipe

     title = u'Common Dreams'
-    description = u'Progressive news and views'
+    description = u'Breaking News & Views for the Progressive Community.'
+    cover_url = 'https://s3.amazonaws.com/s3.commondreams.org/images/common-dreams.png'
     __author__ = u'XanthanGum'
     language = 'en'

-    # Format the text
-    extra_css = '''
-        body{font-family:verdana,arial,helvetica,geneva,sans-serif ;}
-        h1{font-size: xx-large;}
-        h2{font-size: large;}
-    '''
-
-    # Pick no article older than seven days and limit the number of articles per feed to 100
     oldest_article = 7
     max_articles_per_feed = 100

-    # Remove everything before the article
-    remove_tags_before = dict(name = 'div', attrs = {'id':'node-header'})
-
-    # Remove everything after the article
-    remove_tags_after = dict(name = 'div', attrs = {'class':'copyright-info'})
+    no_stylesheets = True
+    remove_javascript = True
+
+    # Flattens all the tables to make it compatible with Nook
+    conversion_options = {'linearize_tables' : True}
+
+    remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
+                          'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
+
+    # Specify extra CSS - overrides ALL other CSS (IE. Added last).
+    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
+                 .introduction, .first { font-weight: bold; } \
+                 .cross-head { font-weight: bold; font-size: 125%; } \
+                 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
+                 .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
+                 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
+                 .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
+                 font-size: 80%; font-style: italic; margin: 1px auto; } \
+                 .story-date, .published { font-size: 80%; } \
+                 table { width: 100%; } \
+                 td img { display: block; margin: 5px auto; } \
+                 ul { padding-top: 10px; } \
+                 ol { padding-top: 10px; } \
+                 li { padding-top: 5px; padding-bottom: 5px; } \
+                 h1 { font-size: 175%; font-weight: bold; } \
+                 h2 { font-size: 150%; font-weight: bold; } \
+                 h3 { font-size: 125%; font-weight: bold; } \
+                 h4, h5, h6 { font-size: 100%; font-weight: bold; }'
+
+    # Remove the line breaks and float left/right and picture width/height.
+    preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
+                          (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
+                          (re.compile(r'float:.*?'), lambda m: ''),
+                          (re.compile(r'width:.*?px'), lambda m: ''),
+                          (re.compile(r'height:.*?px'), lambda m: ''),
+                          (re.compile(r'<a.*?>'), lambda m: ''),
+                          (re.compile(r'</a>'), lambda m: ''),
+                         ]
+
+    # Main article is inside this tag
+    keep_only_tags = [
+        dict(name='div', attrs={'id':lambda x: x and 'node-' in x}),
+    ]
+
+    remove_tags = [
+        dict(name='div', attrs={'class':'node-links clear-block'}), # remove Share options
+    ]

     # Identify the news feeds
-    feeds = [(u'Headlines', u'http://www.commondreams.org/feed/headlines_rss'),
-             (u'Further News Articles', u'http://www.commondreams.org/feed/further_rss'),
-             (u'Views', u'http://www.commondreams.org/feed/views_rss'),
-             (u'Progressive Newswire', u'http://www.commondreams.org/feed/newswire_rss')]
+    feeds = [(u'Headlines', u'https://www.commondreams.org/feed/headlines_rss'),
+             (u'Further News Articles', u'https://www.commondreams.org/feed/further_rss'),
+             (u'Views', u'https://www.commondreams.org/feed/views_rss'),
+             (u'Progressive Newswire', u'https://www.commondreams.org/feed/newswire_rss')]
+
+    def print_version(self, url):
+        url = url + '?print'
+        return url
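Note: two of the preprocess_regexps above strip every <a ...> opening tag and </a> closing tag while leaving the link text in place. Checking the pair on a made-up fragment:

    import re

    html = 'See <a href="/x">this story</a> too.'
    html = re.sub(r'<a.*?>', '', html)
    html = re.sub(r'</a>', '', html)
    print(html)  # -> 'See this story too.'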
recipes/consortium_news.recipe | 71 (new file)
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+##
+## Title: Consortium News
+##
+## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
+
+# Feb 2012: Initial release
+__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
+'''
+consortiumnews.com
+'''
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ConsortiumNews(BasicNewsRecipe):
+
+    title = u'Consortium News'
+    publisher = 'Copyright © 2012 Consortiumnews. All Rights Reserved.'
+    language = 'en'
+    __author__ = 'kiavash'
+
+    oldest_article = 7
+    max_articles_per_feed = 100
+
+    no_stylesheets = True
+    remove_javascript = True
+
+    conversion_options = {'linearize_tables' : True}  # Flattens all the tables to make it compatible with Nook
+
+    remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
+                          'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
+
+    # Specify extra CSS - overrides ALL other CSS (IE. Added last).
+    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
+                 .introduction, .first { font-weight: bold; } \
+                 .cross-head { font-weight: bold; font-size: 125%; } \
+                 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
+                 .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
+                 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
+                 .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
+                 font-size: 80%; font-style: italic; margin: 1px auto; } \
+                 .story-date, .published { font-size: 80%; } \
+                 table { width: 100%; } \
+                 td img { display: block; margin: 5px auto; } \
+                 ul { padding-top: 10px; } \
+                 ol { padding-top: 10px; } \
+                 li { padding-top: 5px; padding-bottom: 5px; } \
+                 h1 { font-size: 175%; font-weight: bold; } \
+                 h2 { font-size: 150%; font-weight: bold; } \
+                 h3 { font-size: 125%; font-weight: bold; } \
+                 h4, h5, h6 { font-size: 100%; font-weight: bold; }'
+
+    # Remove the line breaks and float left/right and picture width/height.
+    preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
+                          (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
+                          (re.compile(r'float:.*?'), lambda m: ''),
+                          (re.compile(r'width:.*?px'), lambda m: ''),
+                          (re.compile(r'height:.*?px'), lambda m: ''),
+                          (re.compile(r'<a.*?>'), lambda h1: ''),
+                          (re.compile(r'</a>'), lambda h2: ''),
+                         ]
+
+    # Main article is inside this tag
+    keep_only_tags = [dict(name='div', attrs={'id':lambda x: x and 'post-' in x})]
+
+    remove_tags = [
+        dict(name='div', attrs={'class':'sociable'}), # remove 'Share this Article'
+        dict(name='p', attrs={'class':'tags'}), # remove 'Tags: ... '
+    ]
+
+    feeds = [(u'Consortium News', u'http://feeds.feedburner.com/Consortiumnewscom')]
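Note: the keep_only_tags entry above uses a callable attribute matcher; BeautifulSoup accepts a function as an attrs value and keeps a tag when the function returns true for that attribute. The `x and` guard matters because divs with no id are passed None:

    matcher = lambda x: x and 'post-' in x
    print(matcher('post-1234'))  # True
    print(matcher(None))         # False - id-less tags are rejected, not crashed on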
@@ -7,6 +7,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
     description = 'Fashion, beauty and Gossip for women from COSMOPOLITAN -UK'

     __author__ = 'Dave Asbury'
+    #last update 21/12/11
     # greyscale code by Starson
     cover_url = 'http://www.cosmopolitan.magazine.co.uk/files/4613/2085/8988/Cosmo_Cover3.jpg'
     no_stylesheets = True
@@ -31,7 +32,8 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
         dict(name='div',attrs={'class' : ['blogInfo','viral_toolbar','comment_number','prevEntry nav']}),
         dict(name='div',attrs={'class' : 'blog_module_about_the_authors'}),
         dict(attrs={'id': ['breadcrumbs','comment','related_links_list','right_rail','content_sec_fb_more','content_sec_mostpopularstories','content-sec_fb_frame_viewfb_bot']}),
-        dict(attrs={'class' : ['read_liked_that_header','fb_back_next_area']})
+        dict(attrs={'class' : ['read_liked_that_header','fb_back_next_area']}),
+        dict(name='li',attrs={'class' : 'thumb'})
     ]

     feeds = [
@@ -48,4 +50,3 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
             img.type = "GrayscaleType"
             img.save(iurl)
         return soup
-
recipes/countryfile.recipe | 25 (new file)
@@ -0,0 +1,25 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1325006965(BasicNewsRecipe):
+    title = u'Countryfile.com'
+    cover_url = 'http://www.buysubscriptions.com/static_content/the-immediate/en/images/covers/CFIL_maxi.jpg'
+    __author__ = 'Dave Asbury'
+    description = 'The official website of Countryfile Magazine'
+    # last updated 29/1/12
+    language = 'en_GB'
+    oldest_article = 30
+    max_articles_per_feed = 25
+    remove_empty_feeds = True
+    no_stylesheets = True
+    auto_cleanup = True
+    #articles_are_obfuscated = True
+
+    remove_tags = [
+        # dict(attrs={'class' : ['player']}),
+
+    ]
+    feeds = [
+        (u'Homepage', u'http://www.countryfile.com/rss/home'),
+        (u'Country News', u'http://www.countryfile.com/rss/news'),
+        (u'Countryside', u'http://www.countryfile.com/rss/countryside'),
+    ]
@@ -5,7 +5,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     description = 'News as provide by The Daily Mirror -UK'

     __author__ = 'Dave Asbury'
-    # last updated 30/10/11
+    # last updated 11/2/12
     language = 'en_GB'

     cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
@@ -13,45 +13,65 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'


-    oldest_article = 2
-    max_articles_per_feed = 30
+    oldest_article = 1
+    max_articles_per_feed = 5
     remove_empty_feeds = True
     remove_javascript = True
     no_stylesheets = True
-    extra_css = '''
-        body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
-    '''
-
-    keep_only_tags = [
-        dict(name='div',attrs={'id' : 'body-content'})
-    ]
-
-    remove_tags_after = [dict (name='div',attrs={'class' : 'related'})]
+    auto_cleanup = True
+    #conversion_options = { 'linearize_tables' : True }
+
+    #keep_only_tags = [
+    #    dict(name='h1'),
+    #    dict(name='div',attrs={'id' : 'body-content'}),
+    #dict(name='div',atts={'class' : 'article-body'}),
+    #dict(attrs={'class' : ['article-attr','byline append-1','published']}),
+    #dict(name='p'),
+    #    ]
+
+    #remove_tags_after = [dict (name='div',attrs={'class' : 'related'})]

     remove_tags = [
-        dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}),
-        dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}),
-        dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}),
-        dict(name='div',attrs={'class' : 'span-12 last sl-others addthis_toolbox addthis_default_style'})
+        dict(name='title'),
+        dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),
+        # dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}),
+        #dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}),
+        #dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}),
     ]

+    # preprocess_regexps = [
+    #(re.compile(r'<dl class="q-search">.*?</dl>', re.IGNORECASE | re.DOTALL), lambda match: '')]
     preprocess_regexps = [
-        (re.compile(r'<dl class="q-search">.*?</dl>', re.IGNORECASE | re.DOTALL), lambda match: '')]
+        (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')]
+
+    preprocess_regexps = [
+        (re.compile(r'Advertisement >>', re.IGNORECASE | re.DOTALL), lambda match: '')]
+
+    #preprocess_regexps = [
+    #(re.compile(r'Sponsored Links', re.IGNORECASE | re.DOTALL), lambda match: '')]

     feeds = [

-        (u'News', u'http://www.mirror.co.uk/news/rss.xml')
-        ,(u'Tech News', u'http://www.mirror.co.uk/news/technology/rss.xml')
-        ,(u'Weird World','http://www.mirror.co.uk/news/weird-world/rss.xml')
-        ,(u'Film Gossip','http://www.mirror.co.uk/celebs/film/rss.xml')
-        ,(u'Music News','http://www.mirror.co.uk/celebs/music/rss.xml')
-        ,(u'Celebs and Tv Gossip','http://www.mirror.co.uk/celebs/tv/rss.xml')
-        ,(u'Sport','http://www.mirror.co.uk/sport/rss.xml')
-        ,(u'Life Style','http://www.mirror.co.uk/life-style/rss.xml')
-        ,(u'Advice','http://www.mirror.co.uk/advice/rss.xml')
-        ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
+        (u'UK News', u'http://feed43.com/0287771688643868.xml')
+        ,(u'Tech News', u'http://feed43.com/2455520588350501.xml')
+        ,(u'Weird World','http://feed43.com/0863800333634654.xml')
+        ,(u'Sport','http://feed43.com/7713243036546130.xml')
+        ,(u'Sport : Boxing ','http://feed43.com/0414732220804255.xml')
+        ,(u'Sport : Rugby Union','http://feed43.com/4710138762362383.xml')
+        ,(u'Sport : Other','http://feed43.com/4501416886323415.xml')
+        ,(u'TV and Film','http://feed43.com/5238302853765104.xml')
+        ,(u'Celebs','http://feed43.com/8770061048844683.xml')
+        ,(u'Life Style : Family','http://feed43.com/4356170742410338.xml')
+        ,(u'Travel','http://feed43.com/1436576006476607.xml')
+
+
     # example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')

     ]
+    extra_css = '''
+        body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
+        h1{ font-size:18px;}
+        img { display:block}
+    '''
+
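Note: the Daily Mirror recipe now binds preprocess_regexps twice at class level, so only the 'Advertisement >>' rule survives; the '- mirror.co.uk' rule a few lines earlier is discarded when the class body finishes executing:

    class Demo(object):
        rules = ['- mirror.co.uk']
        rules = ['Advertisement >>']  # rebinds the name; the first list is lost

    print(Demo.rules)  # -> ['Advertisement >>']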
recipes/derin_dusunce.recipe | 11 (new file)
@@ -0,0 +1,11 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class BasicUserRecipe1324913694(BasicNewsRecipe):
+    title = u'Derin Dusunce'
+    language = 'tr'
+    __author__ = 'asalet_r'
+    oldest_article = 7
+    max_articles_per_feed = 20
+    auto_cleanup = True
+
+    feeds = [(u'Derin D\xfc\u015f\xfcnce', u'http://www.derindusunce.org/feed/')]
recipes/desiring_god.recipe | 21 (new file)
@@ -0,0 +1,21 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class DesiringGodEnglish(BasicNewsRecipe):
+    title = u'Desiring God'
+    __author__ = 'Peter Grungi'
+    language = 'en'
+
+    cover_url = 'http://cdn0.desiringgod.org/images/layout/breadcrumbs_dg_mark.png'
+    masthead_url = 'http://cdn0.desiringgod.org/images/layout/breadcrumbs_dg_mark.png'
+    language = 'en'
+    oldest_article = 7
+    max_articles_per_feed = 50
+    auto_cleanup = True
+    publisher = 'Desiring God Ministries'
+    author = 'Desiring God Ministries'
+
+    feeds = [(u'Desiring God Blog', u'http://feeds.feedburner.com/DGBlog?format=xml')]
recipes/dunya_bizim.recipe | 12 (new file)
@@ -0,0 +1,12 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class BasicUserRecipe1324736687(BasicNewsRecipe):
+    title = u'D\xfcnya Bizim'
+    language = 'tr'
+    __author__ = 'asalet_r'
+
+    oldest_article = 7
+    max_articles_per_feed = 10
+    auto_cleanup = True
+
+    feeds = [(u'Aktif \u0130mamlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=31'), (u'Ayr\u0131nt\u0131 Defteri', u'http://dunyabizim.com/servisler/rss.php?kategoriID=58'), (u'Baba Kitaplar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=4'), (u'Bu da Oldu', u'http://dunyabizim.com/servisler/rss.php?kategoriID=32'), (u'\xc7-al\u0131nt\u0131 Yaz\u0131lar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=33'), (u'Dar\xfclmedya', u'http://dunyabizim.com/servisler/rss.php?kategoriID=49'), (u'Gidenler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=59'), (u'G\xfczel Mekanlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=43'), (u'\u0130yi Haberler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=18'), (u'\u0130yi M\xfczikler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=2'), (u'Kalite Dergiler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=3'), (u'Konu\u015fa Konu\u015fa', u'http://dunyabizim.com/servisler/rss.php?kategoriID=24'), (u'M\xfcstesta G\xfczeller', u'http://dunyabizim.com/servisler/rss.php?kategoriID=65'), (u'O \u015eimdi Nerede?', u'http://dunyabizim.com/servisler/rss.php?kategoriID=52'), (u'Olsa Ke\u015fke', u'http://dunyabizim.com/servisler/rss.php?kategoriID=34'), (u'Orada Ne Oldu?', u'http://dunyabizim.com/servisler/rss.php?kategoriID=38'), (u'\xd6nemli Adamlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=1'), (u'Polemik', u'http://dunyabizim.com/servisler/rss.php?kategoriID=39'), (u'Sinema', u'http://dunyabizim.com/servisler/rss.php?kategoriID=23'), (u'Yalan Haber', u'http://dunyabizim.com/servisler/rss.php?kategoriID=40'), (u'Yeni \u015eeyler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=57'), (u'Zekeriya Sofras\u0131', u'http://dunyabizim.com/servisler/rss.php?kategoriID=60')]
recipes/dunya_bulteni.recipe | 12 (new file)
@@ -0,0 +1,12 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class BasicUserRecipe1321194347(BasicNewsRecipe):
+    title = u'D\xfcnya B\xfclteni'
+    language = 'tr'
+    __author__ = 'asalet_r'
+
+    oldest_article = 7
+    max_articles_per_feed = 50
+    auto_cleanup = True
+
+    feeds = [(u'Tarih Dosyas\u0131', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=157'), (u'R\xf6portaj', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=153'), (u'Makale-Yorum', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=174'), (u'K\xfclt\xfcr-Sanat', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=66'), (u'Hayat\u0131n \u0130\xe7inden', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=200'), (u'Haber Analiz', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=123'), (u'Gezi-\u0130zlenim', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=90'), (u'Aile Sa\u011fl\u0131k E\u011fitim', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=75')]
recipes/echo_online.recipe | 46 (new file)
@@ -0,0 +1,46 @@
+__license__ = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
+'''
+Fetch echo-online.de
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+class Echo_Online(BasicNewsRecipe):
+    title = u'Echo Online' # 2011-12-28 AGe
+    description = '-Echo Online-'
+    publisher = 'Echo Online GmbH'
+    category = 'News, Germany'
+    __author__ = 'Armin Geller' # 2011-12-28 AGe
+    language = 'de'
+    lang = 'de-DE'
+    encoding = 'iso-8859-1'
+    timefmt = ' [%a, %d %b %Y]'
+
+    oldest_article = 7
+    max_articles_per_feed = 50 # 2011-12-28 AGe
+    no_stylesheets = True
+    auto_cleanup = True
+    remove_javascript = True
+
+    feeds = [
+        (u'Topnews', u'http://www.echo-online.de/storage/rss/rss/topnews.xml'),
+        (u'Darmstadt', u'http://www.echo-online.de/rss/darmstadt.xml'),
+        (u'Darmstadt-Dieburg', u'http://www.echo-online.de/rss/darmstadtdieburg.xml'),
+        (u'Kreis Gro\xdf-Gerau', u'http://www.echo-online.de/rss/kreisgrossgerau.xml'),
+        (u'R\xfcsselsheim', u'http://www.echo-online.de/rss/ruesselsheim.xml'),
+        (u'Kreis Bergstra\xdfe', u'http://www.echo-online.de/rss/bergstrasse.xml'),
+        (u'Odenwaldkreis', u'http://www.echo-online.de/rss/odenwald.xml'),
+        (u'SV 98', u'http://www.echo-online.de/rss/sv98.xml'),
+        (u'Kino', u'http://www.echo-online.de/rss/kino.xml'),
+        (u'Ausstellungen', u'http://www.echo-online.de/rss/ausstellungen.xml'),
+        (u'Ausflug & Reise', u'http://www.echo-online.de/rss/ausflugreise.xml'),
+    ]
+
+    def print_version(self, url):
+        return self.browser.open_novisit(url).geturl() + '?_FRAME=33&_FORMAT=PRINT'
+
+    remove_tags = [dict(name='div', attrs={'class':["header", "name"]}),]
+    auto_cleanup_keep = '//div[@class="bild_gross w270"]'
+
+    cover_url = 'http://adcounter.darmstaedter-echo.de/webdav/files/config/gui/images/Zeitungsfaecher.gif'
+
recipes/edge_conversations.recipe | 50 (new file)
@@ -0,0 +1,50 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012 Levien van Zon <levien@zonnetjes.net>'
+
+'''
+Fetch Edge.org conversations
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class EdgeConversationRSS(BasicNewsRecipe):
+    title = u'Edge.org Conversations'
+    __author__ = 'levien'
+    language = 'en'
+    description = '''Edge.org offers "open-minded, free ranging, intellectually
+    playful ... an unadorned pleasure in curiosity, a collective expression of
+    wonder at the living and inanimate world ... an ongoing and thrilling
+    colloquium.'''
+    oldest_article = 60
+    max_articles_per_feed = 100
+    no_stylesheets = True
+
+    keep_only_tags = [dict(name='div', attrs={'class':'HomeLeftPannel IMGCTRL'}) ]
+    remove_tags = [
+        dict(name='div',attrs={'class':'Logo'})
+    ]
+
+    feeds = [(u'Edge RSS', u'http://edge.org/feeds/')]
+
+    def print_version(self, url):
+        return url.replace('conversation/', 'conversation.php?cid=')
+
+    def parse_feeds(self):
+
+        # Call parent's method.
+        feeds = BasicNewsRecipe.parse_feeds(self)
+
+        # Loop through all feeds.
+        for feed in feeds:
+
+            # Loop through all articles in feed.
+            for article in feed.articles[:]:
+
+                # Remove anything that is not a conversation, and remove PDF files as well...
+
+                if not ('CONVERSATION' in article.title):
+                    feed.articles.remove(article)
+                elif 'pdf' in article.url:
+                    feed.articles.remove(article)
+
+        return feeds
+
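Note: parse_feeds above iterates over feed.articles[:] - a copy - while removing from feed.articles itself; iterating the list directly during removal would skip the element after each hit. The difference in miniature:

    items = ['CONVERSATION: A', 'paper.pdf', 'CONVERSATION: B']
    for it in items[:]:          # iterate a copy
        if 'pdf' in it:
            items.remove(it)
    print(items)  # -> ['CONVERSATION: A', 'CONVERSATION: B']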
@ -1,4 +1,5 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
|
|
||||||
@ -6,45 +7,72 @@ __license__ = 'GPL v3'
|
|||||||
www.canada.com
|
www.canada.com
|
||||||
'''
|
'''
|
||||||
|
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
import re
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||||
|
|
||||||
|
|
||||||
class CanWestPaper(BasicNewsRecipe):
|
class CanWestPaper(BasicNewsRecipe):
|
||||||
|
|
||||||
# un-comment the following three lines for the Edmonton Journal
|
# un-comment the following four lines for the Victoria Times Colonist
|
||||||
|
## title = u'Victoria Times Colonist'
|
||||||
|
## url_prefix = 'http://www.timescolonist.com'
|
||||||
|
## description = u'News from Victoria, BC'
|
||||||
|
## fp_tag = 'CAN_TC'
|
||||||
|
|
||||||
|
# un-comment the following four lines for the Vancouver Province
|
||||||
|
## title = u'Vancouver Province'
|
||||||
|
## url_prefix = 'http://www.theprovince.com'
|
||||||
|
## description = u'News from Vancouver, BC'
|
||||||
|
## fp_tag = 'CAN_VP'
|
||||||
|
|
||||||
|
# un-comment the following four lines for the Vancouver Sun
|
||||||
|
## title = u'Vancouver Sun'
|
||||||
|
+    ## url_prefix = 'http://www.vancouversun.com'
+    ## description = u'News from Vancouver, BC'
+    ## fp_tag = 'CAN_VS'

+    # un-comment the following four lines for the Edmonton Journal
     title = u'Edmonton Journal'
     url_prefix = 'http://www.edmontonjournal.com'
     description = u'News from Edmonton, AB'
+    fp_tag = 'CAN_EJ'

-    # un-comment the following three lines for the Calgary Herald
+    # un-comment the following four lines for the Calgary Herald
-    #title = u'Calgary Herald'
+    ## title = u'Calgary Herald'
-    #url_prefix = 'http://www.calgaryherald.com'
+    ## url_prefix = 'http://www.calgaryherald.com'
-    #description = u'News from Calgary, AB'
+    ## description = u'News from Calgary, AB'
+    ## fp_tag = 'CAN_CH'

-    # un-comment the following three lines for the Regina Leader-Post
+    # un-comment the following four lines for the Regina Leader-Post
-    #title = u'Regina Leader-Post'
+    ## title = u'Regina Leader-Post'
-    #url_prefix = 'http://www.leaderpost.com'
+    ## url_prefix = 'http://www.leaderpost.com'
-    #description = u'News from Regina, SK'
+    ## description = u'News from Regina, SK'
+    ## fp_tag = ''

-    # un-comment the following three lines for the Saskatoon Star-Phoenix
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
-    #title = u'Saskatoon Star-Phoenix'
+    ## title = u'Saskatoon Star-Phoenix'
-    #url_prefix = 'http://www.thestarphoenix.com'
+    ## url_prefix = 'http://www.thestarphoenix.com'
-    #description = u'News from Saskatoon, SK'
+    ## description = u'News from Saskatoon, SK'
+    ## fp_tag = ''

-    # un-comment the following three lines for the Windsor Star
+    # un-comment the following four lines for the Windsor Star
-    #title = u'Windsor Star'
+    ## title = u'Windsor Star'
-    #url_prefix = 'http://www.windsorstar.com'
+    ## url_prefix = 'http://www.windsorstar.com'
-    #description = u'News from Windsor, ON'
+    ## description = u'News from Windsor, ON'
+    ## fp_tag = 'CAN_'

-    # un-comment the following three lines for the Ottawa Citizen
+    # un-comment the following four lines for the Ottawa Citizen
-    #title = u'Ottawa Citizen'
+    ## title = u'Ottawa Citizen'
-    #url_prefix = 'http://www.ottawacitizen.com'
+    ## url_prefix = 'http://www.ottawacitizen.com'
-    #description = u'News from Ottawa, ON'
+    ## description = u'News from Ottawa, ON'
+    ## fp_tag = 'CAN_OC'

-    # un-comment the following three lines for the Montreal Gazette
+    # un-comment the following four lines for the Montreal Gazette
-    #title = u'Montreal Gazette'
+    ## title = u'Montreal Gazette'
-    #url_prefix = 'http://www.montrealgazette.com'
+    ## url_prefix = 'http://www.montrealgazette.com'
-    #description = u'News from Montreal, QC'
+    ## description = u'News from Montreal, QC'
+    ## fp_tag = 'CAN_MG'

     language = 'en_CA'

@ -68,14 +96,80 @@ class CanWestPaper(BasicNewsRecipe):

         dict(name='div', attrs={'class':'rule_grey_solid'}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

-    def preprocess_html(self,soup):
-        #delete iempty id attributes--they screw up the TOC for unknow reasons
-        divtags = soup.findAll('div',attrs={'id':''})
-        if divtags:
-            for div in divtags:
-                del(div['id'])
+    def get_cover_url(self):
+        from datetime import timedelta, date
+        if self.fp_tag=='':
+            return None
+        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
+        br = BasicNewsRecipe.get_browser()
+        daysback=1
+        try:
+            br.open(cover)
+        except:
+            while daysback<7:
+                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
+                br = BasicNewsRecipe.get_browser()
+                try:
+                    br.open(cover)
+                except:
+                    daysback = daysback+1
+                    continue
+                break
+        if daysback==7:
+            self.log("\nCover unavailable")
+            cover = None
+        return cover
+
+    def fixChars(self,string):
+        # Replace lsquo (\x91)
+        fixed = re.sub("\x91","‘",string)
+        # Replace rsquo (\x92)
+        fixed = re.sub("\x92","’",fixed)
+        # Replace ldquo (\x93)
+        fixed = re.sub("\x93","“",fixed)
+        # Replace rdquo (\x94)
+        fixed = re.sub("\x94","”",fixed)
+        # Replace ndash (\x96)
+        fixed = re.sub("\x96","–",fixed)
+        # Replace mdash (\x97)
+        fixed = re.sub("\x97","—",fixed)
+        fixed = re.sub("’","’",fixed)
+        return fixed
+
+    def massageNCXText(self, description):
+        # Kindle TOC descriptions won't render certain characters
+        if description:
+            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+            # Replace '&' with '&'
+            massaged = re.sub("&","&", massaged)
+            return self.fixChars(massaged)
+        else:
+            return description
+
+    def populate_article_metadata(self, article, soup, first):
+        if first:
+            picdiv = soup.find('body').find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+            xtitle = article.text_summary.strip()
+            if len(xtitle) == 0:
+                desc = soup.find('meta',attrs={'property':'og:description'})
+                if desc is not None:
+                    article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
         return soup

+    def preprocess_html(self, soup):
+        return self.strip_anchors(soup)
+
     def parse_index(self):
         soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
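A note on the get_cover_url change above: Newseum front-page images are keyed only by day of month, so the recipe probes today's image and then walks back up to six days before giving up. A minimal standalone sketch of the same probe, using only the standard library (the 'CAN_EJ' tag is taken from the diff; everything else is illustrative):

    # Standalone sketch of the Newseum front-page probe in get_cover_url.
    # Only the day of month appears in the URL, hence the seven-day cap.
    import urllib2
    from datetime import date, timedelta

    def probe_cover(fp_tag):
        for daysback in range(7):
            day = (date.today() - timedelta(days=daysback)).day
            url = ('http://webmedia.newseum.org/newseum-multimedia/'
                   'dfp/jpg%d/lg/%s.jpg' % (day, fp_tag))
            try:
                urllib2.urlopen(url)  # any HTTP error falls through to the next day
                return url
            except urllib2.URLError:
                continue
        return None  # cover unavailable, as the recipe logs

    print probe_cover('CAN_EJ')  # Edmonton Journal tag from the diff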
58
recipes/elet_es_irodalom.recipe
Normal file
@ -0,0 +1,58 @@
################################################################################
#Description: http://es.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2012.01.20. - V1.2
################################################################################

from calibre.web.feeds.recipes import BasicNewsRecipe

class elet_es_irodalom(BasicNewsRecipe):
    title = u'\u00c9let \u00e9s Irodalom'
    __author__ = 'Bigpapa'
    oldest_article = 7
    max_articles_per_feed = 30 # Maximum number of articles stored per feed in the e-book.
    no_stylesheets = True
    #delay = 1
    use_embedded_content = False
    encoding = 'iso-8859-2'
    category = 'Cikkek'
    language = 'hu'
    publication_type = 'newsportal'
    extra_css = '.doc_title { font: bold 30px } .doc_author {font: bold 14px} '
    needs_subscription = 'optional'

    masthead_url = 'http://www.es.hu/images/logo.jpg'
    timefmt = ' [%Y %b %d, %a]'

    # Do not hard-code the login credentials here; you supply them when downloading.
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.es.hu/')
            br.select_form(name='userfrmlogin')
            br['cusername'] = self.username
            br['cpassword'] = self.password
            br.submit()
        return br

    keep_only_tags = [
        dict(name='div', attrs={'class':['doc_author', 'doc_title', 'doc']})
    ]

    remove_tags = [
        dict(name='a', attrs={'target':['_TOP']}),
        dict(name='div', attrs={'style':['float: right; margin-left: 5px; margin-bottom: 5px;', 'float: right; margin-left: 5px; margin-bottom: 5px;']}),
    ]

    feeds = [
        (u'Publicisztika', 'http://www.feed43.com/4684235031168504.xml'),
        (u'Interj\xfa', 'http://www.feed43.com/4032465460040618.xml'),
        (u'Visszhang', 'http://www.feed43.com/3727375706873086.xml'),
        (u'P\xe1ratlan oldal', 'http://www.feed43.com/2525784782475057.xml'),
        (u'Feuilleton', 'http://www.feed43.com/7216025082703073.xml'),
        (u'Pr\xf3za', 'http://www.feed43.com/8760248802326384.xml'),
        (u'Vers', 'http://www.feed43.com/1737324675134275.xml'),
        (u'K\xf6nyvkritika', 'http://www.feed43.com/1281156550717082.xml'),
        (u'M\u0171b\xedr\xe1lat', 'http://www.feed43.com/1851854623681044.xml')
    ]
@ -20,7 +20,7 @@ class ESPN(BasicNewsRecipe):
     use_embedded_content = False
     remove_javascript = True
-    needs_subscription = True
+    needs_subscription = 'optional'
     encoding= 'ISO-8859-1'

     remove_tags_before = dict(name='font', attrs={'class':'date'})

@ -75,10 +75,9 @@ class ESPN(BasicNewsRecipe):

         return soup

-
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
+        if self.username and self.password:
         br.set_handle_refresh(False)
         url = ('https://r.espn.go.com/members/v3_1/login')
         raw = br.open(url).read()

@ -100,7 +99,6 @@ class ESPN(BasicNewsRecipe):

         return article.get('guid', None)

     def print_version(self, url):
-
         if 'eticket' in url:
             return url.partition('&')[0].replace('story?', 'print?')
         match = re.search(r'story\?(id=\d+)', url)
34
recipes/fhm_uk.recipe
Normal file
@ -0,0 +1,34 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    title = u'FHM UK'
    description = 'Good News for Men'
    cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
    masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
    __author__ = 'Dave Asbury'
    # last updated 27/1/12
    language = 'en_GB'
    oldest_article = 28
    max_articles_per_feed = 12
    remove_empty_feeds = True
    no_stylesheets = True
    #auto_cleanup = True
    #articles_are_obfuscated = True
    keep_only_tags = [
        dict(name='h1'),
        dict(name='img',attrs={'id' : 'ctl00_Body_imgMainImage'}),
        dict(name='div',attrs={'id' : ['articleLeft']}),
        dict(name='div',attrs={'class' : ['imagesCenterArticle','containerCenterArticle','articleBody']}),
    ]

    #remove_tags = [
    #    dict(attrs={'class' : ['player']}),
    #]

    feeds = [
        (u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
        (u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
        (u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
        (u'Gaming',u'http://feed43.com/0755006465351035.xml'),
    ]
@ -3,10 +3,17 @@ import re
 from calibre.ptempfile import PersistentTemporaryFile

 class ForeignAffairsRecipe(BasicNewsRecipe):
+    ''' there are three modifications:
+    1) fetch issue cover
+    2) toggle ignore premium articles
+    3) extract proper section names, ie. "Comments", "Essay"
+
+    by Chen Wei weichen302@gmx.com, 2012-02-05'''
+
     __license__ = 'GPL v3'
     __author__ = 'kwetal'
     language = 'en'
-    version = 1
+    version = 1.01

     title = u'Foreign Affairs (Subcription or (free) Registration)'
     publisher = u'Council on Foreign Relations'

@ -17,6 +24,9 @@ class ForeignAffairsRecipe(BasicNewsRecipe):

     remove_javascript = True

     INDEX = 'http://www.foreignaffairs.com'
+    FRONTPAGE = 'http://www.foreignaffairs.com/magazine'
+    INCLUDE_PREMIUM = False

     remove_tags = []
     remove_tags.append(dict(name = 'base'))

@ -37,6 +47,12 @@ class ForeignAffairsRecipe(BasicNewsRecipe):

     temp_files = []
     articles_are_obfuscated = True

+    def get_cover_url(self):
+        soup = self.index_to_soup(self.FRONTPAGE)
+        div = soup.find('div', attrs={'class':'inthemag-issuebuy-cover'})
+        img_url = div.find('img')['src']
+        return self.INDEX + img_url
+
     def get_obfuscated_article(self, url):
         br = self.get_browser()
         br.open(url)

@ -50,57 +66,46 @@ class ForeignAffairsRecipe(BasicNewsRecipe):

         return self.temp_files[-1].name

     def parse_index(self):
-        soup = self.index_to_soup('http://www.foreignaffairs.com/magazine')
-        articles = []
         answer = []
+        soup = self.index_to_soup(self.FRONTPAGE)
+        sec_start = soup.findAll('div', attrs={'class':'panel-separator'})
+        for sec in sec_start:
+            content = sec.nextSibling
-        content = soup.find('div', attrs = {'class': 'center-wrapper'})
         if content:
+            section = self.tag_to_string(content.find('h2'))
-            for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
-                tag = div.find('div', attrs = {'class': 'views-field-title'})
-                if tag:
-                    a = tag.find('a')
-                    if a:
-                        title = self.tag_to_string(a)
-                        url = self.INDEX + a['href']
-                        author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
-                        tag = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
-                        # If they ever fix their markup, this will break :-(
-                        summary = self.tag_to_string(tag.findNextSibling('p'))
-                        description = author + '<br/>' + summary
-                        articles.append({'title': title, 'date': None, 'url': url, 'description': description})
-                    else:
-                        continue
-                else:
-                    continue
-            answer.append(('Magazine', articles))
-            ul = content.find('ul')
-            if ul:
                 articles = []
-                for li in ul.findAll('li'):
-                    tag = li.find('div', attrs = {'class': 'views-field-title'})
+                tags = []
+                for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
+                    tags.append(div)
+                for li in content.findAll('li'):
+                    tags.append(li)
+
+                for div in tags:
+                    title = url = description = author = None
+
+                    if self.INCLUDE_PREMIUM:
+                        found_premium = False
+                    else:
+                        found_premium = div.findAll('span', attrs={'class':
+                            'premium-icon'})
+                    if not found_premium:
+                        tag = div.find('div', attrs={'class': 'views-field-title'})
+
                         if tag:
                             a = tag.find('a')
                             if a:
                                 title = self.tag_to_string(a)
                                 url = self.INDEX + a['href']
-                        description = ''
-                        tag = li.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
-                        if tag:
-                            description = self.tag_to_string(tag)
-                        articles.append({'title': title, 'date': None, 'url': url, 'description': description})
-                    else:
-                        continue
-                else:
-                    continue
-
-            answer.append(('Letters to the Editor', articles))
+                                author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
+                                tag_summary = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
+                                description = self.tag_to_string(tag_summary)
+                                articles.append({'title':title, 'date':None, 'url':url,
+                                    'description':description, 'author':author})
+                if articles:
+                    answer.append((section, articles))

         return answer

     def preprocess_html(self, soup):
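For context on the rewrite above: calibre expects parse_index to return a list of (section title, article list) tuples, one per section, which is exactly what the new per-section loop builds. A sketch of the expected shape, with placeholder values rather than real Foreign Affairs content:

    # Shape of the value parse_index hands back to calibre.
    # All field values below are placeholders.
    def sample_parse_index_result():
        articles = [{
            'title': 'Sample article',
            'url': 'http://www.foreignaffairs.com/articles/example',
            'date': None,              # this recipe leaves dates unset
            'description': 'One-line summary shown in the TOC',
            'author': 'A. Writer',     # field the rewrite starts filling in
        }]
        return [('Essay', articles)]   # one tuple per section

    print sample_parse_index_result()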
@ -1,4 +1,3 @@
-
 from calibre.web.feeds.news import BasicNewsRecipe

 class GlasgowHerald(BasicNewsRecipe):

@ -9,12 +8,16 @@ class GlasgowHerald(BasicNewsRecipe):

     language = 'en_GB'

     __author__ = 'Kovid Goyal'
+    use_embedded_content = False

-    keep_only_tags = [dict(attrs={'class':'article'})]
-    remove_tags = [
-        dict(id=['pic-nav']),
-        dict(attrs={'class':['comments-top']})
-    ]
+    no_stylesheets = True
+    auto_cleanup = True
+
+    #keep_only_tags = [dict(attrs={'class':'article'})]
+    #remove_tags = [
+    #    dict(id=['pic-nav']),
+    #    dict(attrs={'class':['comments-top']})
+    #]

     feeds = [

@ -26,4 +29,3 @@ class GlasgowHerald(BasicNewsRecipe):

         u'http://www.heraldscotland.com/cmlink/1.768',),
     (u'Columnists', u'http://www.heraldscotland.com/cmlink/1.658574')]
-
13
recipes/goal.recipe
Normal file
@ -0,0 +1,13 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1325677767(BasicNewsRecipe):
    title = u'Goal'
    oldest_article = 1
    language = 'it'
    max_articles_per_feed = 100
    auto_cleanup = True
    remove_tags_after = [dict(id='article_content')]
    feeds = [(u'Goal', u'http://www.goal.com/it/feeds/news?fmt=rss')]
    __author__ = 'faber1971'
    description = 'Sports news from Italy'
76
recipes/grantland.recipe
Normal file
@ -0,0 +1,76 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class GrantLand(BasicNewsRecipe):
    title = u"Grantland"
    description = 'Writings on Sports & Pop Culture'
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = True
    # auto_cleanup is too aggressive sometimes and we end up with blank articles
    auto_cleanup = False
    timefmt = ' [%a, %d %b %Y]'
    oldest_article = 90

    cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg'
    masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg'

    INDEX = 'http://www.grantland.com'
    CATEGORIES = [
        # comment out second line if you don't want older articles
        # (user friendly name, url suffix, max number of articles to load)
        ('Today in Grantland','',20),
        ('In Case You Missed It','incaseyoumissedit',35),
    ]

    remove_tags = [
        {'name':['style','aside','nav','footer','script']},
        {'name':'h1','text':'Grantland'},
        {'id':['header','col-right']},
        {'class':['connect_widget']},
        {'name':'section','class':re.compile(r'\b(ad|module)\b')},
    ]

    preprocess_regexps = [
        # remove blog banners
        (re.compile(r'<a href="/blog/(?:(?!</a>).)+</a>', re.DOTALL|re.IGNORECASE), lambda m: ''),
    ]

    def parse_index(self):
        feeds = []
        seen_urls = set([])

        for category in self.CATEGORIES:

            (cat_name, tag, max_articles) = category
            self.log('Reading category:', cat_name)
            articles = []

            page = "%s/%s" % (self.INDEX, tag)
            soup = self.index_to_soup(page)

            main = soup.find('div',id='col-main')
            if main is None:
                main = soup

            for tag in main.findAll('a', href=re.compile(r'(story|post)/_/id/\d+')):
                url = tag['href']
                if url in seen_urls:
                    continue
                title = tag.string
                # blank title probably means <a href=".."><img /></a>. skip
                if not title:
                    continue
                self.log('\tFound article:', title)
                self.log('\t', url)
                articles.append({'title':title,'url':url})
                seen_urls.add(url)

                if len(articles) >= max_articles:
                    break

            if articles:
                feeds.append((cat_name, articles))

        return feeds
11
recipes/haksoz.recipe
Normal file
@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe

class BasicUserRecipe1324739199(BasicNewsRecipe):
    title = u'Haks\xf6z'
    oldest_article = 7
    max_articles_per_feed = 20
    auto_cleanup = True
    language = 'tr'
    __author__ = 'asalet_r'

    feeds = [(u'Haks\xf6z', u'http://www.haksozhaber.net/rss/')]
58
recipes/hamilton_spectator.recipe
Normal file
@ -0,0 +1,58 @@
from calibre.web.feeds.news import BasicNewsRecipe

'''
Hamilton Spectator Calibre Recipe
'''
class HamiltonSpectator(BasicNewsRecipe):
    title = u'Hamilton Spectator'
    oldest_article = 2
    max_articles_per_feed = 100
    auto_cleanup = True
    __author__ = u'Eric Coolman'
    publisher = u'thespec.com'
    description = u'Ontario Canada Newspaper'
    category = u'News, Ontario, Canada'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'en_CA'
    encoding = 'utf-8'

    feeds = [
        (u'Top Stories',u'http://www.thespec.com/rss?query=/&assetType=Article'),
        (u'All News',u'http://www.thespec.com/rss?query=/news&assetType=Article'),
        (u'Local',u'http://www.thespec.com/rss?query=/local&assetType=Article'),
        (u'Ontario',u'http://www.thespec.com/rss?query=/ontario&assetType=Article'),
        (u'Canada',u'http://www.thespec.com/rss?query=/canada&assetType=Article'),
        (u'World News',u'http://www.thespec.com/rss?query=/world&assetType=Article'),
        (u'Business',u'http://www.thespec.com/rss?query=/business&assetType=Article'),
        (u'Crime',u'http://www.thespec.com/rss?query=/crime&assetType=Article'),
        (u'All Sports',u'http://www.thespec.com/rss?query=/sports&assetType=Article'),
        (u'Ticats',u'http://www.thespec.com/rss?query=/sports/ticats&assetType=Article'),
        (u'Bulldogs',u'http://www.thespec.com/rss?query=/sports/bulldogs&assetType=Article'),
        (u'High School Sports',u'http://www.thespec.com/rss?query=/sports/highschools&assetType=Article'),
        (u'Local Sports',u'http://www.thespec.com/rss?query=/sports/local&assetType=Article'),
        (u'What''s On',u'http://www.thespec.com/rss?query=/whatson&assetType=Article'),
        (u'Arts and Entertainment',u'http://www.thespec.com/rss?query=/whatson/artsentertainment&assetType=Article'),
        (u'Books',u'http://www.thespec.com/rss?query=/whatson/books&assetType=Article'),
        (u'Movies',u'http://www.thespec.com/rss?query=/whatson/movies&assetType=Article'),
        (u'Music',u'http://www.thespec.com/rss?query=/whatson/music&assetType=Article'),
        (u'Restaurant Reviews',u'http://www.thespec.com/rss?query=/whatson/restaurants&assetType=Article'),
        (u'Opinion',u'http://www.thespec.com/rss?query=/opinion&assetType=Article'),
        (u'Opinion Columns',u'http://www.thespec.com/rss?query=/opinion/columns&assetType=Article'),
        (u'Cartoons',u'http://www.thespec.com/rss?query=/opinion/cartoons&assetType=Article'),
        (u'Letters',u'http://www.thespec.com/rss?query=/opinion/letters&assetType=Article'),
        (u'Editorial',u'http://www.thespec.com/rss?query=/opinion/editorial&assetType=Article'),
        (u'Community',u'http://www.thespec.com/rss?query=/community&assetType=Article'),
        (u'Education',u'http://www.thespec.com/rss?query=/community/education&assetType=Article'),
        (u'Faith',u'http://www.thespec.com/rss?query=/community/faith&assetType=Article'),
        (u'Contests',u'http://www.thespec.com/rss?query=/community/contests&assetType=Article'),
        (u'Living',u'http://www.thespec.com/rss?query=/living&assetType=Article'),
        (u'Food',u'http://www.thespec.com/rss?query=/living/food&assetType=Article'),
        (u'Health and Fitness',u'http://www.thespec.com/rss?query=/living/healthfitness&assetType=Article'),
        (u'Your Home',u'http://www.thespec.com/rss?query=/living/home&assetType=Article'),
        (u'Travel',u'http://www.thespec.com/rss?query=/living/travel&assetType=Article'),
        (u'Family and Parenting',u'http://www.thespec.com/rss?query=/living/familyparenting&assetType=Article'),
        (u'Style',u'http://www.thespec.com/rss?query=/living/style&assetType=Article')
    ]
43
recipes/high_country_news.recipe
Normal file
@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'

'''
Fetch High Country News
'''
from calibre.web.feeds.news import BasicNewsRecipe
class HighCountryNews(BasicNewsRecipe):

    title = u'High Country News'
    description = u'News from the American West'
    __author__ = 'Armin Geller' # 2012-01-31
    publisher = 'High Country News'
    timefmt = ' [%a, %d %b %Y]'
    language = 'en-Us'
    encoding = 'UTF-8'
    publication_type = 'newspaper'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    auto_cleanup = True
    remove_javascript = True
    use_embedded_content = False
    masthead_url = 'http://www.hcn.org/logo.jpg' # 2012-01-31 AGe add
    cover_source = 'http://www.hcn.org' # 2012-01-31 AGe add

    def get_cover_url(self): # 2012-01-31 AGe add
        cover_source_soup = self.index_to_soup(self.cover_source)
        preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'})
        return preview_image_div.div.img['src']

    feeds = [
        (u'Most recent', u'http://feeds.feedburner.com/hcn/most-recent'),
        (u'Current Issue', u'http://feeds.feedburner.com/hcn/current-issue'),

        (u'Writers on the Range', u'http://feeds.feedburner.com/hcn/wotr'),
        (u'High Country Views', u'http://feeds.feedburner.com/hcn/HighCountryViews'),
    ]

    def print_version(self, url):
        return url + '/print_view'
@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+import urllib, re

 class HindustanTimes(BasicNewsRecipe):
     title = u'Hindustan Times'

@ -26,4 +27,24 @@ class HindustanTimes(BasicNewsRecipe):

         'http://feeds.hindustantimes.com/HT-Homepage-LifestyleNews'),
     ]

+    def get_article_url(self, article):
+        '''
+        HT uses a variant of the feedportal RSS ad display mechanism
+        '''
+        try:
+            s = article.summary
+            return urllib.unquote(
+                re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
+        except:
+            pass
+        url = BasicNewsRecipe.get_article_url(self, article)
+        res = self.browser.open_novisit(url)
+        url = res.geturl().split('/')[-2]
+        encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
+                '0D': '?', '0E': '-', '0N': '.com', '0L': 'http://', '0S':
+                'www.'}
+        for k, v in encoding.iteritems():
+            url = url.replace(k, v)
+        return url
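The get_article_url addition above decodes feedportal-style URLs, in which two-character '0X' codes stand for URL punctuation. A quick demonstration of the substitution table on a made-up encoded segment (the mapping is copied from the diff; the sample string is invented):

    # The '0X' decoding table from the HindustanTimes change, applied to a
    # fabricated segment for illustration.
    encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
                '0D': '?', '0E': '-', '0N': '.com', '0L': 'http://',
                '0S': 'www.'}

    segment = '0L0Shindustantimes0N0Cstory0Enews'  # hypothetical feed segment
    url = segment
    for k, v in encoding.iteritems():
        url = url.replace(k, v)
    print url  # http://www.hindustantimes.com/story-news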
@ -1,44 +1,58 @@
-# -*- coding: utf-8 -*-
+################################################################################
-import re
+#Description: http://hvg.hu/ RSS channel
-from calibre.web.feeds.recipes import BasicNewsRecipe
+#Author: Bigpapa (bigpapabig@hotmail.com)
+#Date: 2011.12.20. - V1.1
+################################################################################

-class HVG(BasicNewsRecipe):
+from calibre.web.feeds.news import BasicNewsRecipe
-    title = 'HVG.HU'
-    __author__ = u'István Papp'
+class hvg(BasicNewsRecipe):
-    description = u'Friss hírek a HVG-től'
+    title = u'HVG'
-    timefmt = ' [%Y. %b. %d., %a.]'
+    __author__ = 'Bigpapa'
-    oldest_article = 4
     language = 'hu'
+    oldest_article = 5 # How many days back to fetch the oldest article.
-    max_articles_per_feed = 100
+    max_articles_per_feed = 5 # Maximum number of articles stored per feed in the e-book.
     no_stylesheets = True
-    use_embedded_content = False
     encoding = 'utf8'
-    publisher = 'HVG Online'
+    extra_css = ' h2 { font:bold 28px} '
-    category = u'news, hírek, hvg'
-    extra_css = 'body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
-    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
-    remove_tags_before = dict(id='pg-content')
-    remove_javascript = True
-    remove_empty_feeds = True

-    feeds = [
+    remove_attributes = ['style','font', 'href']
-        (u'Itthon', u'http://hvg.hu/rss/itthon')
+    keep_only_tags = [
-        ,(u'Világ', u'http://hvg.hu/rss/vilag')
+        dict(name='div', attrs={'id':['pg-content']})
-        ,(u'Gazdaság', u'http://hvg.hu/rss/gazdasag')
-        ,(u'IT | Tudomány', u'http://hvg.hu/rss/tudomany')
-        ,(u'Panoráma', u'http://hvg.hu/rss/Panorama')
-        ,(u'Karrier', u'http://hvg.hu/rss/karrier')
-        ,(u'Gasztronómia', u'http://hvg.hu/rss/gasztronomia')
-        ,(u'Helyi érték', u'http://hvg.hu/rss/helyiertek')
-        ,(u'Kultúra', u'http://hvg.hu/rss/kultura')
-        ,(u'Cégautó', u'http://hvg.hu/rss/cegauto')
-        ,(u'Vállalkozó szellem', u'http://hvg.hu/rss/kkv')
-        ,(u'Egészség', u'http://hvg.hu/rss/egeszseg')
-        ,(u'Vélemény', u'http://hvg.hu/rss/velemeny')
-        ,(u'Sport', u'http://hvg.hu/rss/sport')
-    ]
+    ]

-    def print_version(self, url):
+    remove_tags = [
-        return url.replace ('#rss', '/print')
+        dict(name='div', attrs={'class':['box articlemenu', 'bannergoogle468', 'boxcontainer left', 'boxcontainer', 'commentbox']}),
+        dict(name='table', attrs={'class':['banner2', 'monocle']}),
+        dict(name='div', attrs={'id':['connect_widget_4cf63ca849ddf4577922632', 'sharetip', 'upprev_box']}),
+        dict(name='div', attrs={'style':['float: right; margin-bottom: 5px;', 'display: none;']}),
+        dict(name='h3', attrs={'class':['hthree']}),
+        dict(name='ul', attrs={'class':['defaultul']}),
+        dict(name='form', attrs={'id':['commentForm']}),
+        dict(name='h6', attrs={'class':['hthree']}),
+        dict(name='h6', attrs={'class':['more2']}),
+        dict(name='img', attrs={'class':['framed']}),
+        dict(name='td', attrs={'class':['greyboxbody','embedvideobody','embedvideofooter','embedvideobottom']}),
+    ]
+
+    feeds = [
+        # (u'\xd6sszes', 'http://hvg.hu/rss'),
+        (u'Itthon', 'http://hvg.hu/rss/itthon'),
+        (u'Vil\xe1g', 'http://hvg.hu/rss/vilag'),
+        (u'Gazdas\xe1g', 'http://hvg.hu/rss/gazdasag'),
+        (u'Tudom\xe1ny', 'http://hvg.hu/rss/tudomany'),
+        (u'Panor\xe1ma', 'http://hvg.hu/rss/panorama'),
+        (u'Karrier', 'http://hvg.hu/rss/karrier'),
+        (u'Gasztron\xf3mia', 'http://hvg.hu/rss/gasztronomia'),
+        (u'Helyi \xe9rt\xe9k', 'http://hvg.hu/rss/helyiertek'),
+        (u'Kult\xfara', 'http://hvg.hu/rss/kultura'),
+        (u'C\xe9gaut\xf3', 'http://hvg.hu/rss/cegauto'),
+        (u'V\xe1llalkoz\xf3 szellem', 'http://hvg.hu/rss/kkv'),
+        (u'Eg\xe9szs\xe9g', 'http://hvg.hu/rss/egeszseg'),
+        (u'V\xe9lem\xe9ny', 'http://hvg.hu/rss/velemeny'),
+        (u'Sport', 'http://hvg.hu/rss/sport')
+    ]
BIN  recipes/icons/asianreviewofbooks.png   (new file, 906 B)
BIN  (unnamed modified icon: 712 B before and after)
BIN  recipes/icons/mlody_technik_pl.png     (new file, 2.1 KiB)
BIN  (unnamed modified icon: was 15 KiB)
BIN  recipes/icons/moneynews.png            (new file, 914 B)
BIN  recipes/icons/novilist_novine_hr.png   (new file, 241 B)
BIN  recipes/icons/novilist_portal_hr.png   (new file, 944 B)
BIN  (unnamed modified icon: 1.1 KiB before, 289 B after)
BIN  recipes/icons/rionegro.png             (new file, 817 B)
BIN  recipes/icons/samanyolu_haber.png      (new file, 968 B)
68
recipes/ideal_almeria.recipe
Normal file
@ -0,0 +1,68 @@
# encoding: utf-8 -*-

__license__ = 'GPL v3'
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
__copyright__ = 'Josemi Liébana'
__version__ = 'v0.1'
__date__ = '5 January 2012'

'''
www.ideal.es
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Ideal(BasicNewsRecipe):
    title = u'Ideal (Edición Almería)'
    __author__ = u'Josemi Liébana'
    description = u'Noticias de Almería y el resto del mundo'
    publisher = 'Ideal'
    category = u'News, Politics, Spain, Almería'
    publication_type = 'Newspaper'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'es'
    remove_empty_feeds = True
    masthead_url = u'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
    cover_url = u'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
    extra_css = u' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher' : publisher
        , 'language' : language
    }

    keep_only_tags = [
        dict(attrs={'id':'title'})
        ,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
    ]

    remove_tags = [dict(name='ul')]

    remove_attributes = ['width','height']

    feeds = [
        (u'Última Hora', u'http://www.ideal.es/almeria/rss/feeds/ultima.xml')
        ,(u'Portada', u'http://www.ideal.es/almeria/portada.xml')
        ,(u'Local', u'http://www.ideal.es/almeria/rss/feeds/granada.xml')
        ,(u'Deportes', u'http://www.ideal.es/almeria/rss/feeds/deportes.xml')
        ,(u'Sociedad', u'http://www.ideal.es/almeria/rss/feeds/sociedad.xml')
        ,(u'Cultura', u'http://www.ideal.es/almeria/rss/feeds/cultura.xml')
        ,(u'Economía', u'http://www.ideal.es/almeria/rss/feeds/economia.xml')
        ,(u'Costa', u'http://www.ideal.es/almeria/rss/feeds/costa.xml')
        ,(u'Puerta Purchena', u'http://www.ideal.es/almeria/rss/feeds/puerta_purchena.xml')
        ,(u'Andalucía', u'http://www.ideal.es/almeria/rss/feeds/andalucia.xml')
        ,(u'España', u'http://www.ideal.es/almeria/rss/feeds/espana.xml')
        ,(u'Mundo', u'http://www.ideal.es/almeria/rss/feeds/internacional.xml')
        ,(u'Vivir', u'http://www.ideal.es/almeria/rss/feeds/vivir.xml')
        ,(u'Opinión', u'http://www.ideal.es/almeria/rss/feeds/opinion.xml')
        ,(u'Televisión', u'http://www.ideal.es/almeria/rss/feeds/television.xml')
        ,(u'Contraportada', u'http://www.ideal.es/almeria/rss/feeds/contraportada.xml')
    ]
69
recipes/ideal_granada.recipe
Normal file
@ -0,0 +1,69 @@
# encoding: utf-8 -*-

__license__ = 'GPL v3'
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
__copyright__ = 'Josemi Liébana'
__version__ = 'v0.1'
__date__ = '5 January 2012'

'''
www.ideal.es
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Ideal(BasicNewsRecipe):
    title = u'Ideal (Edición Granada)'
    __author__ = u'Josemi Liébana'
    description = u'Noticias de Granada y el resto del mundo'
    publisher = 'Ideal'
    category = 'News, Politics, Spain, Granada'
    publication_type = 'Newspaper'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'es'
    remove_empty_feeds = True
    masthead_url = 'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
    cover_url = 'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher' : publisher
        , 'language' : language
    }

    keep_only_tags = [
        dict(attrs={'id':'title'})
        ,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
    ]

    remove_tags = [dict(name='ul')]

    remove_attributes = ['width','height']

    feeds = [
        (u'Última Hora', u'http://www.ideal.es/granada/rss/feeds/ultima.xml')
        ,(u'Portada', u'http://www.ideal.es/granada/portada.xml')
        ,(u'Local', u'http://www.ideal.es/granada/rss/feeds/granada.xml')
        ,(u'Deportes', u'http://www.ideal.es/granada/rss/feeds/deportes.xml')
        ,(u'Sociedad', u'http://www.ideal.es/granada/rss/feeds/sociedad.xml')
        ,(u'Cultura', u'http://www.ideal.es/granada/rss/feeds/cultura.xml')
        ,(u'Economía', u'http://www.ideal.es/granada/rss/feeds/economia.xml')
        ,(u'Costa', u'http://www.ideal.es/granada/rss/feeds/costa.xml')
        ,(u'La Carrera', u'http://www.ideal.es/granada/rss/feeds/la_carrera.xml')
        ,(u'Puerta Real', u'http://www.ideal.es/granada/rss/feeds/puerta_real.xml')
        ,(u'Andalucía', u'http://www.ideal.es/granada/rss/feeds/andalucia.xml')
        ,(u'España', u'http://www.ideal.es/granada/rss/feeds/espana.xml')
        ,(u'Mundo', u'http://www.ideal.es/granada/rss/feeds/internacional.xml')
        ,(u'Vivir', u'http://www.ideal.es/granada/rss/feeds/vivir.xml')
        ,(u'Opinión', u'http://www.ideal.es/granada/rss/feeds/opinion.xml')
        ,(u'Televisión', u'http://www.ideal.es/granada/rss/feeds/television.xml')
        ,(u'Contraportada', u'http://www.ideal.es/granada/rss/feeds/contraportada.xml')
    ]
67
recipes/ideal_jaen.recipe
Normal file
@ -0,0 +1,67 @@
# encoding: utf-8 -*-

__license__ = 'GPL v3'
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
__copyright__ = 'Josemi Liébana'
__version__ = 'v0.1'
__date__ = '5 January 2012'

'''
www.ideal.es
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Ideal(BasicNewsRecipe):
    title = u'Ideal (Edición Jaén)'
    __author__ = u'Josemi Liébana'
    description = u'Noticias de Jaén y el resto del mundo'
    publisher = 'Ideal'
    category = u'News, Politics, Spain, Jaén'
    publication_type = 'Newspaper'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'es'
    remove_empty_feeds = True
    masthead_url = 'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
    cover_url = 'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher' : publisher
        , 'language' : language
    }

    keep_only_tags = [
        dict(attrs={'id':'title'})
        ,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
    ]

    remove_tags = [dict(name='ul')]

    remove_attributes = ['width','height']

    feeds = [
        (u'Última Hora', u'http://www.ideal.es/jaen/rss/feeds/ultima.xml')
        ,(u'Portada', u'http://www.ideal.es/jaen/portada.xml')
        ,(u'Local', u'http://www.ideal.es/jaen/rss/feeds/granada.xml')
        ,(u'Deportes', u'http://www.ideal.es/jaen/rss/feeds/deportes.xml')
        ,(u'Sociedad', u'http://www.ideal.es/jaen/rss/feeds/sociedad.xml')
        ,(u'Cultura', u'http://www.ideal.es/jaen/rss/feeds/cultura.xml')
        ,(u'Economía', u'http://www.ideal.es/jaen/rss/feeds/economia.xml')
        ,(u'Costa', u'http://www.ideal.es/jaen/rss/feeds/costa.xml')
        ,(u'Andalucía', u'http://www.ideal.es/jaen/rss/feeds/andalucia.xml')
        ,(u'España', u'http://www.ideal.es/jaen/rss/feeds/espana.xml')
        ,(u'Mundo', u'http://www.ideal.es/jaen/rss/feeds/internacional.xml')
        ,(u'Vivir', u'http://www.ideal.es/jaen/rss/feeds/vivir.xml')
        ,(u'Opinión', u'http://www.ideal.es/jaen/rss/feeds/opinion.xml')
        ,(u'Televisión', u'http://www.ideal.es/jaen/rss/feeds/television.xml')
        ,(u'Contraportada', u'http://www.ideal.es/jaen/rss/feeds/contraportada.xml')
    ]
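The three Ideal recipes above are identical apart from the city segment in their feed URLs (almeria, granada, jaen) and a couple of edition-only sections; note that all three point their 'Local' feed at granada.xml as committed. A hypothetical helper, not part of the commit, shows how the shared feed list could be generated from one table:

    # Hypothetical refactoring sketch: build an Ideal edition's common feeds
    # from the city slug. Edition-only sections (e.g. 'Puerta Purchena') would
    # still be appended by hand, and accented display names restored.
    COMMON = ['ultima', 'deportes', 'sociedad', 'cultura', 'economia',
              'costa', 'andalucia', 'espana', 'internacional', 'vivir',
              'opinion', 'television', 'contraportada']

    def ideal_feeds(city):
        base = 'http://www.ideal.es/%s/rss/feeds/%s.xml'
        feeds = [(u'Portada', u'http://www.ideal.es/%s/portada.xml' % city)]
        feeds += [(sec.capitalize(), base % (city, sec)) for sec in COMMON]
        return feeds

    print ideal_feeds('granada')[:3]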
@ -1,63 +1,30 @@
-__license__ = 'GPL v3'
-__copyright__ = '2008, Derry FitzGerald'
-'''
-iht.com
-'''
-import re
-
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ptempfile import PersistentTemporaryFile

+class NYTimesGlobal(BasicNewsRecipe):
-class InternationalHeraldTribune(BasicNewsRecipe):
+    title = u'NY Times Global'
-    title = u'The International Herald Tribune'
-    __author__ = 'Derry FitzGerald'
     language = 'en'
+    __author__ = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    use_embedded_content = False
-    oldest_article = 1
-    max_articles_per_feed = 30
     no_stylesheets = True
+    auto_cleanup = True

-    remove_tags = [dict(name='div', attrs={'class':['footer','header']}),
-        dict(name=['form'])]
-    preprocess_regexps = [
-        (re.compile(r'<!-- webtrends.*', re.DOTALL),
-        lambda m:'</body></html>')
-        ]
-    extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
-    remove_empty_feeds = True

     feeds = [
-        (u'Frontpage', u'http://www.iht.com/rss/frontpage.xml'),
+        ('NYTimes',
-        (u'Business', u'http://www.iht.com/rss/business.xml'),
+            'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml'),
-        (u'Americas', u'http://www.iht.com/rss/america.xml'),
+        ('NYTimes global',
-        (u'Europe', u'http://www.iht.com/rss/europe.xml'),
+            'http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml'),
-        (u'Asia', u'http://www.iht.com/rss/asia.xml'),
+        ('World',
-        (u'Africa and Middle East', u'http://www.iht.com/rss/africa.xml'),
+            'http://www.nytimes.com/services/xml/rss/nyt/World.xml'),
-        (u'Opinion', u'http://www.iht.com/rss/opinion.xml'),
+        ('U.S.',
-        (u'Technology', u'http://www.iht.com/rss/technology.xml'),
+            'http://www.nytimes.com/services/xml/rss/nyt/US.xml'),
-        (u'Health and Science', u'http://www.iht.com/rss/healthscience.xml'),
+        ('Business',
-        (u'Sports', u'http://www.iht.com/rss/sports.xml'),
+            'http://feeds.nytimes.com/nyt/rss/Business'),
-        (u'Culture', u'http://www.iht.com/rss/arts.xml'),
+        ('Sports',
-        (u'Style and Design', u'http://www.iht.com/rss/style.xml'),
+            'http://www.nytimes.com/services/xml/rss/nyt/Sports.xml'),
-        (u'Travel', u'http://www.iht.com/rss/travel.xml'),
+        ('Technology',
-        (u'At Home Abroad', u'http://www.iht.com/rss/athome.xml'),
+            'http://feeds.nytimes.com/nyt/rss/Technology'),
-        (u'Your Money', u'http://www.iht.com/rss/yourmoney.xml'),
-        (u'Properties', u'http://www.iht.com/rss/properties.xml')
     ]
-    temp_files = []
-    articles_are_obfuscated = True
-
-    masthead_url = 'http://graphics8.nytimes.com/images/misc/iht-masthead-logo.gif'
-
-    def get_obfuscated_article(self, url):
-        br = self.get_browser()
-        br.open(url)
-        response1 = br.follow_link(url_regex=re.compile(r'.*pagewanted=print.*'))
-        html = response1.read()
-
-        self.temp_files.append(PersistentTemporaryFile('_iht.html'))
-        self.temp_files[-1].write(html)
-        self.temp_files[-1].close()
-        return self.temp_files[-1].name
12
recipes/iktibas.recipe
Normal file
@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe

class BasicUserRecipe1324739406(BasicNewsRecipe):
    title = u'\u0130ktibas'
    language = 'tr'
    __author__ = 'asalet_r'

    oldest_article = 7
    max_articles_per_feed = 20
    auto_cleanup = True

    feeds = [(u'\u0130ktibas', u'http://www.iktibasdergisi.com/rss/rss.xml')]
110
recipes/ilmanifesto.recipe
Normal file
@ -0,0 +1,110 @@
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

MANIFESTO_BASEURL = 'http://www.ilmanifesto.it/'

class IlManifesto(BasicNewsRecipe):
    title = 'Il Manifesto'
    __author__ = 'Giacomo Lacava'
    description = 'quotidiano comunista - ultima edizione html disponibile'
    publication_type = 'newspaper'
    publisher = 'il manifesto coop. editrice a r.l.'
    language = 'it'

    oldest_article = 2
    max_articles_per_feed = 100
    delay = 1
    no_stylesheets = True
    simultaneous_downloads = 5
    timeout = 30
    auto_cleanup = True
    remove_tags = [dict(name='div', attrs={'class':'column_1 float_left'})]
    remove_tags_before = dict(name='div',attrs={'class':'column_2 float_right'})
    remove_tags_after = dict(id='myPrintArea')

    manifesto_index = None
    manifesto_datestr = None

    def _set_manifesto_index(self):
        if self.manifesto_index == None:
            startUrl = MANIFESTO_BASEURL + 'area-abbonati/in-edicola/'
            startSoup = self.index_to_soup(startUrl)
            lastEdition = startSoup.findAll('div',id='accordion_inedicola')[1].find('a')['href']
            del(startSoup)
            self.manifesto_index = MANIFESTO_BASEURL + lastEdition
            urlsplit = lastEdition.split('/')
            self.manifesto_datestr = urlsplit[-1]
            if urlsplit[-1] == '':
                self.manifesto_datestr = urlsplit[-2]

    def get_cover_url(self):
        self._set_manifesto_index()
        url = MANIFESTO_BASEURL + 'fileadmin/archivi/in_edicola/%sprimapagina.gif' % self.manifesto_datestr
        return url

    def parse_index(self):
        self._set_manifesto_index()
        soup = self.index_to_soup(self.manifesto_index)
        feedLinks = soup.find('div',id='accordion_inedicola').findAll('a')
        result = []
        for feed in feedLinks:
            articles = []
            feedName = feed.find('h2').string
            feedUrl = MANIFESTO_BASEURL + feed['href']
            feedSoup = self.index_to_soup(feedUrl)
            indexRoot = feedSoup.find('div',attrs={'class':'column1'})
            for div in indexRoot.findAll('div',attrs={'class':'strumenti1_inedicola'}):
                artLink = div.find('a')
                if artLink is None: continue # empty div
                title = artLink.string
                url = MANIFESTO_BASEURL + artLink['href']

                description = ''
                descNode = div.find('div',attrs={'class':'text_12'})
                if descNode is not None:
                    description = descNode.string

                author = ''
                authNode = div.find('div',attrs={'class':'firma'})
                if authNode is not None:
                    author = authNode.string

                articleText = ''
                article = {
                    'title':title,
                    'url':url,
                    'date': strftime('%d %B %Y'),
                    'description': description,
                    'content': articleText,
                    'author': author
                }
                articles.append(article)
            result.append((feedName,articles))
        return result

    def extract_readable_article(self, html, url):

        bs = BeautifulSoup(html)
        col1 = bs.find('div',attrs={'class':'column1'})

        content = col1.find('div',attrs={'class':'bodytext'})
        title = bs.find(id='titolo_articolo').string
        author = col1.find('span',attrs={'class':'firma'})
        subtitle = ''
        subNode = col1.findPrevious('div',attrs={'class':'occhiello_rosso'})
        if subNode is not None:
            subtitle = subNode
        summary = ''
        sommNode = bs.find('div',attrs={'class':'sommario'})
        if sommNode is not None:
            summary = sommNode

        template = "<html><head><title>%(title)s</title></head><body><h1>%(title)s</h1><h2>%(subtitle)s</h2><h3>%(author)s</h3><div style='font-size: x-large;'>%(summary)s</div><div>%(content)s</div></body></html>"
        del(bs)
        return template % dict(title=title,subtitle=subtitle,author=author,summary=summary,content=content)
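One subtlety in _set_manifesto_index above: the edition date string is taken from the last path segment of the edition URL, falling back one segment when the href ends in a slash. A short illustration with a made-up edition path:

    # Trailing-slash guard from _set_manifesto_index; the path is invented.
    lastEdition = 'area-abbonati/in-edicola/20120205/'  # hypothetical href
    urlsplit = lastEdition.split('/')
    datestr = urlsplit[-1]
    if urlsplit[-1] == '':
        datestr = urlsplit[-2]  # trailing slash: take the previous segment
    print datestr  # 20120205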
@ -1,16 +1,20 @@
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.recipes import BasicNewsRecipe

-class AdvancedUserRecipe1234144423(BasicNewsRecipe):
+class IndianapolisStar(BasicNewsRecipe):
     title = u'Indianapolis Star'
-    oldest_article = 5
+    oldest_article = 10
+    auto_cleanup = True
     language = 'en'

     __author__ = 'Owen Kelly'
     max_articles_per_feed = 100

     cover_url = u'http://www2.indystar.com/frontpage/images/today.jpg'

-    feeds = [(u'Community Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LOCAL&template=rss&mime=XML'), (u'News Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS&template=rss&mime=XML'), (u'Business Headlines', u'http://www..indystar.com/apps/pbcs.dll/section?Category=BUSINESS&template=rss&mime=XML'), (u'Sports Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=SPORTS&template=rss&mime=XML'), (u'Lifestyle Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LIVING&template=rss&mime=XML'), (u'Opinion Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=OPINION&template=rss&mime=XML')]
+    feeds = [(u'Community Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LOCAL&template=rss'),
+        (u'News Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS&template=rss'),
+        (u'Business Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=BUSINESS&template=rss'),
+        (u'Politics and Government', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS05&template=rss'),
+        (u'Lifestyle Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LIVING&template=rss&mime=XML'),
+        (u'Opinion Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=OPINION&template=rss&mime=XML')
+        ]

     def print_version(self, url):
         return url + '&template=printart'
12
recipes/izdiham.com.recipe
Normal file
@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe

class BasicUserRecipe1324158549(BasicNewsRecipe):
    title = u'izdiham.com'
    language = 'tr'
    __author__ = 'asalet_r'

    oldest_article = 7
    max_articles_per_feed = 20
    auto_cleanup = True

    feeds = [(u'\u0130zdiham', u'http://www.izdiham.com/index.php/feed')]
|
72
recipes/klip_me.recipe
Normal file
@ -0,0 +1,72 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1299694372(BasicNewsRecipe):
    title = u'Klipme'
    __author__ = 'Ken Sun'
    publisher = 'Klip.me'
    category = 'info, custom, Klip.me'
    oldest_article = 365
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    remove_tags = [
        dict(name='div', attrs={'id':'text_controls_toggle'})
        ,dict(name='script')
        ,dict(name='div', attrs={'id':'text_controls'})
        ,dict(name='div', attrs={'id':'editing_controls'})
        ,dict(name='div', attrs={'class':'bar bottom'})
    ]
    use_embedded_content = False
    needs_subscription = True
    INDEX = u'http://www.klip.me'
    LOGIN = INDEX + u'/fav/signin?callback=/fav'

    feeds = [
        (u'Klip.me unread', u'http://www.klip.me/fav'),
        (u'Klip.me started', u'http://www.klip.me/fav?s=starred')
    ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None:
            br.open(self.LOGIN)
            br.select_form(nr=0)
            br['Email'] = self.username
            if self.password is not None:
                br['Passwd'] = self.password
            br.submit()
        return br

    def parse_index(self):
        totalfeeds = []
        lfeeds = self.get_feeds()
        for feedobj in lfeeds:
            feedtitle, feedurl = feedobj
            self.report_progress(0, 'Fetching feed'+' %s...'%(feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for item in soup.findAll('table', attrs={'class':['item','item new']}):
                atag = item.a
                if atag and atag.has_key('href'):
                    url = atag['href']
                    articles.append({
                        'url': url
                    })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds

    def print_version(self, url):
        return 'http://www.klip.me' + url

    def populate_article_metadata(self, article, soup, first):
        article.title = soup.find('title').contents[0].strip()

    def postprocess_html(self, soup, first_fetch):
        for link_tag in soup.findAll(attrs={"id": "story"}):
            link_tag.insert(0, '<h1>' + soup.find('title').contents[0].strip() + '</h1>')
            print link_tag

        return soup
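A note on the Klip.me recipe above: its parse_index() must hand calibre a list of (feed title, article list) tuples, where each article is a dict (the recipe fills in only 'url', leaving the title to populate_article_metadata). A minimal sketch of that return shape; the clip URL below is a made-up placeholder, not real Klip.me data:

    # Shape of the structure parse_index() returns; the URL is hypothetical.
    def parse_index_sketch():
        articles = [{'title': 'First clip', 'url': 'http://www.klip.me/a/1',
                     'date': '', 'description': '', 'content': ''}]
        return [('Klip.me unread', articles)]  # one tuple per feed

    print(parse_index_sketch())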
@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__copyright__ = '2011, Attis <attis@attis.one.pl>'
+__copyright__ = '2011 Attis <attis@attis.one.pl>, 2012 Tomasz Długosz <tomek3d@gmail.com>'
 __version__ = 'v. 0.1'

 import re
@ -10,7 +10,7 @@ class KopalniaWiedzy(BasicNewsRecipe):
     publisher = u'Kopalnia Wiedzy'
     description = u'Ciekawostki ze świata nauki i techniki'
     encoding = 'utf-8'
-    __author__ = 'Attis'
+    __author__ = 'Attis & Tomasz Długosz'
     language = 'pl'
     oldest_article = 7
     max_articles_per_feed = 100
@ -18,9 +18,9 @@ class KopalniaWiedzy(BasicNewsRecipe):
     remove_javascript = True
     no_stylesheets = True

-    remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'} }, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}]
+    remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'}}, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}, {'name':'div', 'attrs': {'class':'article-time-and-cat'}}, {'name':'p', 'attrs': {'class':'tags'}}]
     remove_tags_after = dict(attrs={'class':'ad-square'})
-    keep_only_tags = [dict(name="div", attrs={'id':'articleContent'})]
+    keep_only_tags = [dict(name="div", attrs={'class':'article-text text-small'})]
     extra_css = '.topimage {margin-top: 30px}'

     preprocess_regexps = [
@ -13,9 +13,10 @@ class Kurier(BasicNewsRecipe):
     publisher = 'KURIER'
     category = 'news, politics, Austria'
     oldest_article = 2
-    max_articles_per_feed = 200
+    max_articles_per_feed = 100
+    timeout = 30
+    encoding = None
     no_stylesheets = True
-    encoding = 'cp1252'
     use_embedded_content = False
     language = 'de_AT'
     remove_empty_feeds = True
@ -29,9 +30,11 @@ class Kurier(BasicNewsRecipe):
     , 'language' : language
     }

-    remove_tags = [dict(attrs={'class':['functionsleiste','functions','social_positionierung','contenttabs','drucken','versenden','leserbrief','kommentieren','addthis_button']})]
+    remove_tags = [ dict(attrs={'id':['artikel_expand_symbol2','imgzoom_close2']}),
+                    dict(attrs={'class':['linkextern','functionsleiste','functions','social_positionierung','contenttabs','drucken','versenden','leserbrief','kommentieren','addthis_button']})
+                  ]
     keep_only_tags = [dict(attrs={'id':'content'})]
-    remove_tags_after = dict(attrs={'id':'author'})
+    remove_tags_after = [dict(attrs={'id':'author'})]
     remove_attributes = ['width','height']

     feeds = [
@ -41,7 +44,7 @@ class Kurier(BasicNewsRecipe):
     ,(u'Kultur'   , u'http://kurier.at/rss/kultur_kultur_rss.xml')
     ,(u'Freizeit' , u'http://kurier.at/rss/freizeit_freizeit_rss.xml')
     ,(u'Wetter'   , u'http://kurier.at/rss/oewetter_rss.xml')
-    ,(u'Verkehr'  , u'http://kurier.at/rss/verkehr_rss.xml')
+    ,(u'Sport'    , u'http://kurier.at/newsfeed/detail/sport_rss.xml')
     ]

     def preprocess_html(self, soup):
@ -1,10 +1,9 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.la-razon.com
 '''

-from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe

 class LaRazon_Bol(BasicNewsRecipe):
@ -16,19 +15,17 @@ class LaRazon_Bol(BasicNewsRecipe):
     oldest_article = 1
     max_articles_per_feed = 200
     no_stylesheets = True
-    encoding = 'cp1252'
+    encoding = 'utf8'
     use_embedded_content = False
     language = 'es_BO'
     publication_type = 'newspaper'
-    delay = 1
     remove_empty_feeds = True
-    cover_url = strftime('http://www.la-razon.com/portadas/%Y%m%d_LaRazon.jpg')
-    masthead_url = 'http://www.la-razon.com/imagenes/logo.jpg'
-    extra_css = """ body{font-family: Arial,Helvetica,sans-serif }
-                    img{margin-bottom: 0.4em}
-                    .noticia-titulo{font-family: Georgia,"Times New Roman",Times,serif}
-                    .lead{font-weight: bold; font-size: 0.8em}
-                """
+    masthead_url = 'http://www.la-razon.com/static/LRZRazon/images/lrz-logo.png'
+    extra_css = """ body{font-family: Georgia,"Times New Roman",Times,serif}
+                    img{margin-bottom: 0.4em; display: block}
+                    .meta{font-size: small; font-family: Arial,Helvetica,sans-serif}
+                """
+    INDEX = 'http://www.la-razon.com/'

     conversion_options = {
         'comment' : description
@ -37,28 +34,37 @@ class LaRazon_Bol(BasicNewsRecipe):
     , 'language' : language
     }

-    keep_only_tags = [dict(name='div', attrs={'class':['noticia-titulo','noticia-desarrollo']})]
+    keep_only_tags = [dict(name='div', attrs={'class':['pg-hd', 'pg-bd']})]
-    remove_tags = [dict(name=['meta','link','form','iframe','embed','object'])]
+    remove_tags = [
+        dict(name=['meta','link','form','iframe','embed','object'])
+        ,dict(name='div', attrs={'class':'bd'})
+    ]
     remove_attributes = ['width','height']

     feeds = [
-        (u'Editorial' , u'http://www.la-razon.com/rss_editorial.php')
-        ,(u'Opinión' , u'http://www.la-razon.com/rss_opinion.php')
-        ,(u'Nacional' , u'http://www.la-razon.com/rss_nacional.php')
-        ,(u'Economia' , u'http://www.la-razon.com/rss_economia.php')
-        ,(u'Ciudades' , u'http://www.la-razon.com/rss_ciudades.php')
-        ,(u'Sociedad' , u'http://www.la-razon.com/rss_sociedad.php')
-        ,(u'Mundo' , u'http://www.la-razon.com/rss_sociedad.php')
-        ,(u'La Revista' , u'http://www.la-razon.com/rss_larevista.php')
-        ,(u'Sociales' , u'http://www.la-razon.com/rss_sociales.php')
-        ,(u'Mia' , u'http://www.la-razon.com/rss_mia.php')
-        ,(u'Marcas' , u'http://www.la-razon.com/rss_marcas.php')
-        ,(u'Escape' , u'http://www.la-razon.com/rss_escape.php')
-        ,(u'El Financiero' , u'http://www.la-razon.com/rss_financiero.php')
-        ,(u'Tendencias' , u'http://www.la-razon.com/rss_tendencias.php')
+        (u'Editorial' , u'http://www.la-razon.com/rss/opinion/editorial/')
+        ,(u'Nacional' , u'http://www.la-razon.com/rss/nacional/')
+        ,(u'Economia' , u'http://www.la-razon.com/rss/economia/')
+        ,(u'Ciudades' , u'http://www.la-razon.com/rss/ciudades/')
+        ,(u'Sociedad' , u'http://www.la-razon.com/rss/sociedad/')
+        ,(u'Mundo' , u'http://www.la-razon.com/rss/mundo/')
+        ,(u'La Revista' , u'http://www.la-razon.com/rss/la_revista/')
+        ,(u'Sociales' , u'http://www.la-razon.com/rss/sociales/')
+        ,(u'Mia' , u'http://www.la-razon.com/rss/suplementos/mia/')
+        ,(u'Marcas' , u'http://www.la-razon.com/rss/marcas/')
+        ,(u'Escape' , u'http://www.la-razon.com/rss/suplementos/escape/')
+        ,(u'El Financiero' , u'http://www.la-razon.com/rss/suplementos/financiero/')
+        ,(u'Tendencias' , u'http://www.la-razon.com/rss/suplementos/tendencias/')
     ]

     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
         return soup
+
+    def get_cover_url(self):
+        soup = self.index_to_soup(self.INDEX)
+        lightbox = soup.find('div', attrs = {'class' : 'lightbox lightbox-frontpage'})
+        return lightbox.img['src']
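The rewritten La Razon recipe above stops computing a dated portada URL and instead scrapes the front page for the cover in get_cover_url(). A minimal offline sketch of the same lookup, run against a fabricated HTML snippet (the image URL is invented for illustration):

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    html = '''<div class="lightbox lightbox-frontpage">
                <img src="http://www.la-razon.com/fabricated/portada.jpg"/>
              </div>'''
    soup = BeautifulSoup(html)
    # Same lookup as get_cover_url(): find the front-page lightbox, take its img.
    lightbox = soup.find('div', attrs={'class': 'lightbox lightbox-frontpage'})
    print(lightbox.img['src'])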
15
recipes/la_voce.recipe
Normal file
@ -0,0 +1,15 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1324114228(BasicNewsRecipe):
    title = u'La Voce'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    masthead_url = 'http://www.lavoce.info/binary/la_voce/testata/lavoce.1184661635.gif'
    feeds = [(u'La Voce', u'http://www.lavoce.info/feed_rss.php?id_feed=1')]
    __author__ = 'faber1971'
    description = 'Italian website on Economy - v1.01 (17, December 2011)'
    language = 'it'
@ -1,8 +1,8 @@
 __license__ = 'GPL v3'
-__author__ = 'Lorenzo Vigentini'
-__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
+__author__ = 'Lorenzo Vigentini and Olivier Daigle'
+__copyright__ = '2012, Lorenzo Vigentini <l.vigentini at gmail.com>, Olivier Daigle <odaigle _at nuvucameras __dot__ com>'
 __version__ = 'v1.01'
-__date__ = '14, January 2010'
+__date__ = '12, February 2012'
 __description__ = 'Canadian Paper '

 '''
@ -26,11 +26,15 @@ class ledevoir(BasicNewsRecipe):
     encoding = 'utf-8'
     timefmt = '[%a, %d %b, %Y]'

-    max_articles_per_feed = 50
+    oldest_article = 1
+    max_articles_per_feed = 200
     use_embedded_content = False
     recursion = 10
     needs_subscription = 'optional'

+    filterDuplicates = False
+    url_list = []
+
     remove_javascript = True
     no_stylesheets = True

@ -38,7 +42,7 @@ class ledevoir(BasicNewsRecipe):

     keep_only_tags = [
         dict(name='div', attrs={'id':'article'}),
-        dict(name='ul', attrs={'id':'ariane'})
+        dict(name='div', attrs={'id':'colonne_principale'})
     ]

     remove_tags = [
@ -51,7 +55,7 @@ class ledevoir(BasicNewsRecipe):

     feeds = [
         (u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'),
-        (u'Edition complete', 'http://feeds2.feedburner.com/fluxdudevoir'),
+        (u'Édition complete', 'http://feeds2.feedburner.com/fluxdudevoir'),
         (u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'),
         (u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'),
         (u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'),
@ -61,7 +65,7 @@ class ledevoir(BasicNewsRecipe):
         (u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'),
         (u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'),
         (u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'),
-        (u'Loisirs', 'http://www.ledevoir.com/rss/section/loisirs.xml?id=50')
+        (u'Art de vivre', 'http://www.ledevoir.com/rss/section/art-de-vivre.xml?id=50')
     ]

     extra_css = '''
@ -85,8 +89,16 @@ class ledevoir(BasicNewsRecipe):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
             br.open('http://www.ledevoir.com')
-            br.select_form(nr=1)
-            br['login[courriel]'] = self.username
-            br['login[password]'] = self.password
+            br.select_form(nr=0)
+            br['login_popup[courriel]'] = self.username
+            br['login_popup[password]'] = self.password
             br.submit()
         return br
+
+    def print_version(self, url):
+        if self.filterDuplicates:
+            if url in self.url_list:
+                return
+            self.url_list.append(url)
+        return url
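The new print_version() in the Le Devoir recipe above doubles as a duplicate filter: with filterDuplicates enabled, a URL that was already seen falls through to a bare return (None), which makes calibre skip that article. The same idea in isolation, assuming nothing beyond the standard library:

    # Standalone sketch of the de-duplication in print_version() above.
    seen = []

    def dedup(url, filter_duplicates=True):
        if filter_duplicates:
            if url in seen:
                return None  # the recipe's bare 'return'; the article is dropped
            seen.append(url)
        return url

    print(dedup('http://www.ledevoir.com/x'))  # first sighting: the URL
    print(dedup('http://www.ledevoir.com/x'))  # duplicate: None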
14
recipes/lega_nerd.recipe
Normal file
@ -0,0 +1,14 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1326135232(BasicNewsRecipe):
    title = u'Lega Nerd'
    description = 'nerd / geek culture, pc, comics, music, culture'
    language = 'it'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    feeds = [(u'Lega Nerd', u'http://feeds.feedburner.com/LegaNerd')]
    __author__ = 'faber1971'
    __version__ = 'v1.0'
    __date__ = '9, January 2011'
103
recipes/liberation_sub.recipe
Normal file
@ -0,0 +1,103 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__ = 'GPL v3'
__copyright__ = '2012, Rémi Vanicat <vanicat at debian.org>'
'''
liberation.fr
'''
# The cleanning is from the Liberation recipe, by Darko Miletic

from calibre.web.feeds.news import BasicNewsRecipe

class Liberation(BasicNewsRecipe):

    title = u'Libération: Édition abonnés'
    __author__ = 'Rémi Vanicat'
    description = u'Actualités'
    category = 'Actualités, France, Monde'
    language = 'fr'
    needs_subscription = True

    use_embedded_content = False
    no_stylesheets = True
    remove_empty_feeds = True

    extra_css = '''
    h1, h2, h3 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
    p.subtitle {font-size:xx-small; font-family:Arial,Helvetica,sans-serif;}
    h4, h5, h2.rubrique, {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
    .ref, .date, .author, .legende {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
    .mna-body, entry-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
    '''

    keep_only_tags = [
        dict(name='div', attrs={'class':'article'})
        ,dict(name='div', attrs={'class':'text-article m-bot-s1'})
        ,dict(name='div', attrs={'class':'entry'})
        ,dict(name='div', attrs={'class':'col_contenu'})
    ]

    remove_tags_after = [
        dict(name='div',attrs={'class':['object-content text text-item', 'object-content', 'entry-content', 'col01', 'bloc_article_01']})
        ,dict(name='p',attrs={'class':['chapo']})
        ,dict(id='_twitter_facebook')
    ]

    remove_tags = [
        dict(name='iframe')
        ,dict(name='a', attrs={'class':'lnk-comments'})
        ,dict(name='div', attrs={'class':'toolbox'})
        ,dict(name='ul', attrs={'class':'share-box'})
        ,dict(name='ul', attrs={'class':'tool-box'})
        ,dict(name='ul', attrs={'class':'rub'})
        ,dict(name='p',attrs={'class':['chapo']})
        ,dict(name='p',attrs={'class':['tag']})
        ,dict(name='div',attrs={'class':['blokLies']})
        ,dict(name='div',attrs={'class':['alire']})
        ,dict(id='_twitter_facebook')
    ]

    index = 'http://www.liberation.fr/abonnes/'

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.liberation.fr/jogger/login/')
            br.select_form(nr=0)
            br['email'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        soup=self.index_to_soup(self.index)

        content = soup.find('div', { 'class':'block-content' })

        articles = []
        cat_articles = []

        for tag in content.findAll(recursive=False):
            if(tag['class']=='headrest headrest-basic-rounded'):
                cat_articles = []
                articles.append((tag.find('h5').contents[0],cat_articles))
            else:
                title = tag.find('h3').contents[0]
                url = tag.find('a')['href']
                print(url)
                descripion = tag.find('p',{ 'class':'subtitle' }).contents[0]
                article = {
                    'title': title,
                    'url': url,
                    'descripion': descripion,
                    'content': ''
                }
                cat_articles.append(article)
        return articles



# Local Variables:
# mode: python
# End:
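parse_index() in the Libération recipe above relies on page ordering: a 'headrest' header div opens a new section, and every following sibling is appended to that section's list. Appending to cat_articles after it has been stored in articles works because both names reference the same list object. A compact sketch of that grouping pattern, on made-up stand-in rows:

    # Grouping pattern used by parse_index() above: a header opens a new
    # bucket; later items land in the most recently opened bucket.
    rows = ['HEADER:Monde', 'article-a', 'article-b', 'HEADER:France', 'article-c']

    sections = []
    bucket = []
    for row in rows:
        if row.startswith('HEADER:'):
            bucket = []                         # new list object...
            sections.append((row[7:], bucket))  # ...shared with the sections entry
        else:
            bucket.append(row)

    print(sections)  # [('Monde', ['article-a', 'article-b']), ('France', ['article-c'])]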
@ -1,41 +1,26 @@
-#!/usr/bin/env python
-
-__license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
-'''
-www.livemint.com
-'''
-
 from calibre.web.feeds.news import BasicNewsRecipe

 class LiveMint(BasicNewsRecipe):
-    title = u'Livemint'
-    __author__ = 'Darko Miletic'
-    description = 'The Wall Street Journal'
-    publisher = 'The Wall Street Journal'
-    category = 'news, games, adventure, technology'
-    language = 'en'
-
-    oldest_article = 15
-    max_articles_per_feed = 100
+    title = u'Live Mint'
+    language = 'en_IN'
+    __author__ = 'Krittika Goyal'
+    #encoding = 'cp1252'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    use_embedded_content = True
+
     no_stylesheets = True
-    encoding = 'utf-8'
-    use_embedded_content = False
-    extra_css = ' #dvArtheadline{font-size: x-large} #dvArtAbstract{font-size: large} '
-
-    keep_only_tags = [dict(name='div', attrs={'class':'innercontent'})]
-
-    remove_tags = [dict(name=['object','link','embed','form','iframe'])]
-
-    feeds = [(u'Articles', u'http://www.livemint.com/SectionRssfeed.aspx?Mid=1')]
-
-    def print_version(self, url):
-        link = url
-        msoup = self.index_to_soup(link)
-        mlink = msoup.find(attrs={'id':'ctl00_bodyplaceholdercontent_cntlArtTool_printUrl'})
-        if mlink:
-            link = 'http://www.livemint.com/Articles/' + mlink['href'].rpartition('/Articles/')[2]
-        return link
-
-    def preprocess_html(self, soup):
-        return self.adeify_images(soup)
+    auto_cleanup = True
+
+    feeds = [
+        ('Latest News',
+         'http://www.livemint.com/StoryRss.aspx?LN=Latestnews'),
+        ('Gallery',
+         'http://www.livemint.com/GalleryRssfeed.aspx'),
+        ('Top Stories',
+         'http://www.livemint.com/StoryRss.aspx?ts=Topstories'),
+        ('Banking',
+         'http://www.livemint.com/StoryRss.aspx?Id=104'),
+    ]
25
recipes/living_stones.recipe
Normal file
@ -0,0 +1,25 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'

from calibre.web.feeds.news import BasicNewsRecipe

class LivingStonesPastorsBlog(BasicNewsRecipe):
    title = u'Living Stones Pastors Blog'
    __author__ = 'Peter Grungi'
    language = 'en'

    oldest_article = 90
    max_articles_per_feed = 10
    auto_cleanup = True
    cover_url = 'http://blogs.livingstonesreno.com/wp-content/uploads/2011/08/blogBGRD_norepeat.jpg'
    masthead_url = 'http://www.livingstonesreno.com/podcast/LSpodcastnew.jpg'
    publisher = 'Living Stones Church of Reno, NV'
    language = 'en'
    author = 'Living Stones Church of Reno, NV'

    feeds = [(u'LS Blog', u'http://blogs.livingstonesreno.com/feed?utm_source=calibre&utm_medium=rss')]

    def full_version(self, url):
        import re
        newurl = re.sub(r'\?.*','',url)
        return newurl
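full_version() in the recipe above simply drops everything from the first '?' onward, shedding the utm tracking parameters that the feed URL carries. The substitution on its own:

    import re

    # Query-string stripping as in full_version() above.
    url = 'http://blogs.livingstonesreno.com/feed?utm_source=calibre&utm_medium=rss'
    print(re.sub(r'\?.*', '', url))  # -> http://blogs.livingstonesreno.com/feed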
@ -41,7 +41,7 @@ class LosTiempos_Bol(BasicNewsRecipe):
     keep_only_tags = [dict(name='div', attrs={'id':'articulo'})]
     remove_tags = [
         dict(name=['meta','link','form','iframe','embed','object','hr'])
-        ,dict(attrs={'class':['caja_fonts sin_border_bot','pub']})
+        ,dict(attrs={'class':['caja_fonts sin_border_bot','pub','twitter-share-button']})
     ]
     remove_attributes = ['width','height']

@ -14,8 +14,11 @@ class WeeklyLWN(BasicNewsRecipe):
     description = 'Weekly summary of what has happened in the free software world.'
     __author__ = 'Davide Cavalca'
     language = 'en'
+    site_url = 'http://lwn.net'

-    cover_url = 'http://lwn.net/images/lcorner.png'
+    extra_css = 'pre,code,samp,kbd,tt { font-size: 80% }\nblockquote {margin-left:0 }\n* { color: black }\n'
+
+    cover_url = site_url + '/images/lcorner.png'
     #masthead_url = 'http://lwn.net/images/lcorner.png'
     publication_type = 'magazine'

@ -43,11 +46,29 @@ class WeeklyLWN(BasicNewsRecipe):
             br.submit()
         return br

+    def print_version(self, url):
+
+        # Strip off anchor
+        url = url.split('#')[0]
+
+        # Prepend site_url
+        if url[0:len(self.site_url)] != self.site_url:
+            url = self.site_url + url
+
+        # Append printable URL parameter
+        print_param = '?format=printable'
+        if url[-len(print_param):] != print_param:
+            url += print_param
+
+        #import sys
+        #print >>sys.stderr, "*** print_version(url):", url
+
+        return url
+
     def parse_index(self):
         if self.username is not None and self.password is not None:
-            index_url = 'http://lwn.net/current/bigpage?format=printable'
+            index_url = self.print_version('/current/bigpage')
         else:
-            index_url = 'http://lwn.net/free/bigpage?format=printable'
+            index_url = self.print_version('/free/bigpage')
         soup = self.index_to_soup(index_url)
         body = soup.body

@ -56,19 +77,19 @@ class WeeklyLWN(BasicNewsRecipe):
         url_re = re.compile('^/Articles/')

         while True:
-            tag_title = body.findNext(name='p', attrs={'class':'SummaryHL'})
+            tag_title = body.findNext(attrs={'class':'SummaryHL'})
             if tag_title == None:
                 break

-            tag_section = tag_title.findPrevious(name='p', attrs={'class':'Cat1HL'})
+            tag_section = tag_title.findPrevious(attrs={'class':'Cat1HL'})
             if tag_section == None:
                 section = 'Front Page'
             else:
                 section = tag_section.string

-            tag_section2 = tag_title.findPrevious(name='p', attrs={'class':'Cat2HL'})
+            tag_section2 = tag_title.findPrevious(attrs={'class':'Cat2HL'})
             if tag_section2 != None:
-                if tag_section2.findPrevious(name='p', attrs={'class':'Cat1HL'}) == tag_section:
+                if tag_section2.findPrevious(attrs={'class':'Cat1HL'}) == tag_section:
                     section = "%s: %s" %(section, tag_section2.string)

             if section not in articles.keys():
@ -94,9 +115,10 @@ class WeeklyLWN(BasicNewsRecipe):
             if tag_url == None:
                 break

+
             article = dict(
                 title=self.tag_to_string(tag_title),
-                url= 'http://lwn.net' + tag_url['href'].split('#')[0] + '?format=printable',
+                url=tag_url['href'],
                 description='', content='', date='')
             articles[section].append(article)

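The refactored print_version() in the LWN recipe above is written to be idempotent: it strips any anchor, prepends site_url only when missing, and appends ?format=printable only when missing, so parse_index() can feed it bare paths like '/current/bigpage'. A standalone sketch of the same normalization (startswith/endswith stand in for the recipe's slice comparisons):

    # Standalone version of the URL normalization in print_version() above.
    SITE = 'http://lwn.net'
    PARAM = '?format=printable'

    def printable(url):
        url = url.split('#')[0]          # strip anchor
        if not url.startswith(SITE):     # prepend the site for bare paths
            url = SITE + url
        if not url.endswith(PARAM):      # append the printable parameter once
            url += PARAM
        return url

    print(printable('/current/bigpage'))             # http://lwn.net/current/bigpage?format=printable
    print(printable(printable('/current/bigpage')))  # unchanged on a second pass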
23
recipes/macity.recipe
Normal file
@ -0,0 +1,23 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1325766771(BasicNewsRecipe):
    title = u'Macity'
    language = 'it'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    def get_article_url(self, article):
        link = BasicNewsRecipe.get_article_url(self, article)
        if link.split('/')[-1]=="story01.htm":
            link=link.split('/')[-2]
            a=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'L' , 'N' , 'S' ]
            b=['0', '.', '/', '?', '-', '=', '&', '_', 'http://', '.com', 'www.']
            for i in range(0,len(a)):
                link=link.replace('0'+a[-i],b[-i])
        return link

    feeds = [(u'Macity', u'http://www.macitynet.it.feedsportal.com/c/33714/f/599513/index.rss')]
    __author__ = 'faber1971'
    description = 'Apple and hi-tech news'
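The get_article_url() override above undoes FeedsPortal's URL escaping, where literal characters are replaced by '0'-prefixed letter codes ('0A' for '0', '0B' for '.', '0C' for '/', and so on). Because the loop indexes a[-i] and b[-i], the multi-character codes near the end of the tables are substituted first. A standalone rendering of the same table-driven decode; the encoded sample string is fabricated for illustration:

    # Table-driven decode used by get_article_url() above.
    a = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'L', 'N', 'S']
    b = ['0', '.', '/', '?', '-', '=', '&', '_', 'http://', '.com', 'www.']

    def decode(link):
        # a[-i]/b[-i]: '0L' -> 'http://' and '0S' -> 'www.' are replaced
        # before the single-character codes like '0B' -> '.'.
        for i in range(0, len(a)):
            link = link.replace('0' + a[-i], b[-i])
        return link

    print(decode('0L0Smacitynet0Bit0Cnews'))  # -> http://www.macitynet.it/news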
16
recipes/marketing_magazine.recipe
Normal file
@ -0,0 +1,16 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1327062445(BasicNewsRecipe):
    title = u'Marketing Magazine'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    remove_javascript = True
    masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg'
    feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]
    __author__ = 'faber1971'
    description = 'Collection of Italian marketing websites - v1.00 (28, January 2012)'
    language = 'it'
@ -38,18 +38,23 @@ except:
     removed keep_only tags
 Version 1.8 26-11-2022
     added remove tag: article-slideshow
+Version 1.9 31-1-2012
+    removed some left debug settings
+    extended timeout from 2 to 10
+    changed oldest article from 10 to 1.2
+    changed max articles from 15 to 25
 '''

 class AdvancedUserRecipe1306097511(BasicNewsRecipe):
     title = u'Metro Nieuws NL'
-    oldest_article = 10
-    max_articles_per_feed = 15
+    oldest_article = 1.2
+    max_articles_per_feed = 25
     __author__ = u'DrMerry'
     description = u'Metro Nederland'
     language = u'nl'
-    simultaneous_downloads = 5
+    simultaneous_downloads = 3
     masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
-    timeout = 2
+    timeout = 10
     center_navbar = True
     timefmt = ' [%A, %d %b %Y]'
     no_stylesheets = True
217
recipes/microwave_and_rf.recipe
Normal file
@ -0,0 +1,217 @@
#!/usr/bin/env python
##
## Title:        Microwave and RF
##
## License:      GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html

# Feb 2012: Initial release

__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
'''
mwrf.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image

class Microwave_and_RF(BasicNewsRecipe):

    Convert_Grayscale = False  # Convert images to gray scale or not

    # Add sections that want to be excluded from the magazine
    exclude_sections = []

    # Add sections that want to be included from the magazine
    include_sections = []

    title = u'Microwave and RF'
    __author__ = 'kiavash'
    description = u'Microwave and RF Montly Magazine'
    publisher = 'Penton Media, Inc.'
    publication_type = 'magazine'
    site = 'http://mwrf.com'

    language = 'en'
    asciiize = True
    timeout = 120
    simultaneous_downloads = 1  # very peaky site!

    # Main article is inside this tag
    keep_only_tags = [dict(name='table', attrs={'id':'prtContent'})]

    no_stylesheets = True
    remove_javascript = True

    # Flattens all the tables to make it compatible with Nook
    conversion_options = {'linearize_tables' : True}

    remove_tags = [
        dict(name='span', attrs={'class':'body12'}),
    ]

    remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
                          'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]

    # Specify extra CSS - overrides ALL other CSS (IE. Added last).
    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
                 .introduction, .first { font-weight: bold; } \
                 .cross-head { font-weight: bold; font-size: 125%; } \
                 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
                 .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
                 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
                 .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
                 font-size: 80%; font-style: italic; margin: 1px auto; } \
                 .story-date, .published { font-size: 80%; } \
                 table { width: 100%; } \
                 td img { display: block; margin: 5px auto; } \
                 ul { padding-top: 10px; } \
                 ol { padding-top: 10px; } \
                 li { padding-top: 5px; padding-bottom: 5px; } \
                 h1 { font-size: 175%; font-weight: bold; } \
                 h2 { font-size: 150%; font-weight: bold; } \
                 h3 { font-size: 125%; font-weight: bold; } \
                 h4, h5, h6 { font-size: 100%; font-weight: bold; }'

    # Remove the line breaks and float left/right and picture width/height.
    preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
                          (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
                          (re.compile(r'float:.*?'), lambda m: ''),
                          (re.compile(r'width:.*?px'), lambda m: ''),
                          (re.compile(r'height:.*?px'), lambda m: '')
                         ]

    def print_version(self, url):
        url = re.sub(r'.html', '', url)
        url = re.sub('/ArticleID/.*?/', '/Print.cfm?ArticleID=', url)
        return url

    # Need to change the user agent to avoid potential download errors
    def get_browser(self, *args, **kwargs):
        from calibre import browser
        kwargs['user_agent'] = 'Mozilla/5.0 (Windows NT 5.1; rv:10.0) Gecko/20100101 Firefox/10.0'
        return browser(*args, **kwargs)

    def parse_index(self):

        # Fetches the main page of Microwave and RF
        soup = self.index_to_soup(self.site)

        # Searches the site for Issue ID link then returns the href address
        # pointing to the latest issue
        latest_issue = soup.find('a', attrs={'href':lambda x: x and 'IssueID' in x}).get('href')

        # Fetches the index page for of the latest issue
        soup = self.index_to_soup(latest_issue)

        # Finds the main section of the page containing cover, issue date and
        # TOC
        ts = soup.find('div', attrs={'id':'columnContainer'})

        # Finds the issue date
        ds = ' '.join(self.tag_to_string(ts.find('span', attrs={'class':'CurrentIssueSectionHead'})).strip().split()[-2:]).capitalize()
        self.log('Found Current Issue:', ds)
        self.timefmt = ' [%s]'%ds

        # Finds the cover image
        cover = ts.find('img', src = lambda x: x and 'Cover' in x)
        if cover is not None:
            self.cover_url = self.site + cover['src']
            self.log('Found Cover image:', self.cover_url)

        feeds = []
        article_info = []

        # Finds all the articles (tiles and links)
        articles = ts.findAll('a', attrs={'class':'commonArticleTitle'})

        # Finds all the descriptions
        descriptions = ts.findAll('span', attrs={'class':'commonCopy'})

        # Find all the sections
        sections = ts.findAll('span', attrs={'class':'kicker'})

        title_number = 0

        # Goes thru all the articles one by one and sort them out
        for section in sections:
            title_number = title_number + 1

            # Removes the unwanted sections
            if self.tag_to_string(section) in self.exclude_sections:
                continue

            # Only includes the wanted sections
            if self.include_sections:
                if self.tag_to_string(section) not in self.include_sections:
                    continue

            title = self.tag_to_string(articles[title_number])
            url = articles[title_number].get('href')
            if url.startswith('/'):
                url = self.site + url

            self.log('\tFound article:', title, 'at', url)
            desc = self.tag_to_string(descriptions[title_number])
            self.log('\t\t', desc)

            article_info.append({'title':title, 'url':url, 'description':desc,
                                 'date':self.timefmt})

        if article_info:
            feeds.append((self.title, article_info))

        #self.log(feeds)
        return feeds

    def postprocess_html(self, soup, first):
        if self.Convert_Grayscale:
            #process all the images
            for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
                iurl = tag['src']
                img = Image()
                img.open(iurl)
                if img < 0:
                    raise RuntimeError('Out of memory')
                img.type = "GrayscaleType"
                img.save(iurl)
        return soup

    def preprocess_html(self, soup):

        # Includes all the figures inside the final ebook
        # Finds all the jpg links
        for figure in soup.findAll('a', attrs = {'href' : lambda x: x and 'jpg' in x}):

            # makes sure that the link points to the absolute web address
            if figure['href'].startswith('/'):
                figure['href'] = self.site + figure['href']

            figure.name = 'img'  # converts the links to img
            figure['src'] = figure['href']  # with the same address as href
            figure['style'] = 'display:block'  # adds /n before and after the image
            del figure['href']
            del figure['target']

        # Makes the title standing out
        for title in soup.findAll('a', attrs = {'class': 'commonSectionTitle'}):
            title.name = 'h1'
            del title['href']
            del title['target']

        # Makes the section name more visible
        for section_name in soup.findAll('a', attrs = {'class': 'kicker2'}):
            section_name.name = 'h5'
            del section_name['href']
            del section_name['target']

        # Removes all unrelated links
        for link in soup.findAll('a', attrs = {'href': True}):
            link.name = 'font'
            del link['href']
            del link['target']

        return soup
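print_version() in the Microwave and RF recipe above maps a normal article URL onto the site's Print.cfm endpoint with two substitutions: the '.html' suffix is dropped, then the '/ArticleID/<n>/' path segment is rewritten to a query parameter. A quick check of that rewrite on a fabricated URL of the shape the code expects:

    import re

    # The two rewrites done by print_version() above.
    url = 'http://mwrf.com/Articles/ArticleID/12345/12345.html'
    url = re.sub(r'.html', '', url)                                # drop the .html suffix
    url = re.sub('/ArticleID/.*?/', '/Print.cfm?ArticleID=', url)  # path segment -> query arg
    print(url)  # http://mwrf.com/Articles/Print.cfm?ArticleID=12345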
@ -10,6 +10,10 @@ __MakePeriodical__ = True
|
|||||||
__UseChineseTitle__ = False
|
__UseChineseTitle__ = False
|
||||||
# Set it to False if you want to skip images (Default: True)
|
# Set it to False if you want to skip images (Default: True)
|
||||||
__KeepImages__ = True
|
__KeepImages__ = True
|
||||||
|
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
|
||||||
|
__IncludeSummary__ = False
|
||||||
|
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
|
||||||
|
__IncludeThumbnails__ = True
|
||||||
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
|
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
|
||||||
__UseLife__ = True
|
__UseLife__ = True
|
||||||
# (HK only) It is to disable premium content (Default: False)
|
# (HK only) It is to disable premium content (Default: False)
|
||||||
@ -24,6 +28,9 @@ __Date__ = ''
|
|||||||
|
|
||||||
'''
|
'''
|
||||||
Change Log:
|
Change Log:
|
||||||
|
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
|
||||||
|
from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
|
||||||
|
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
|
||||||
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
|
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
|
||||||
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
|
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
|
||||||
2011/10/19: fix a bug in txt source parsing
|
2011/10/19: fix a bug in txt source parsing
|
||||||
@ -53,6 +60,7 @@ Change Log:
|
|||||||
2010/10/31: skip repeated articles in section pages
|
2010/10/31: skip repeated articles in section pages
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
from calibre.utils.date import now as nowf
|
||||||
import os, datetime, re, mechanize
|
import os, datetime, re, mechanize
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from contextlib import nested
|
from contextlib import nested
|
||||||
@ -60,10 +68,14 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
|||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.ebooks.metadata.toc import TOC
|
from calibre.ebooks.metadata.toc import TOC
|
||||||
from calibre.ebooks.metadata import MetaInformation
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
|
from calibre.utils.localization import canonicalize_lang
|
||||||
|
|
||||||
# MAIN CLASS
|
# MAIN CLASS
|
||||||
class MPRecipe(BasicNewsRecipe):
|
class MPRecipe(BasicNewsRecipe):
|
||||||
if __Region__ == 'Hong Kong':
|
if __Region__ == 'Hong Kong':
|
||||||
|
if __UseChineseTitle__ == True:
|
||||||
|
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||||
|
else:
|
||||||
title = 'Ming Pao - Hong Kong'
|
title = 'Ming Pao - Hong Kong'
|
||||||
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
||||||
category = 'Chinese, News, Hong Kong'
|
category = 'Chinese, News, Hong Kong'
|
||||||
@ -109,6 +121,9 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
lambda match: "</b>")
|
lambda match: "</b>")
|
||||||
]
|
]
|
||||||
elif __Region__ == 'Vancouver':
|
elif __Region__ == 'Vancouver':
|
||||||
|
if __UseChineseTitle__ == True:
|
||||||
|
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
|
||||||
|
else:
|
||||||
title = 'Ming Pao - Vancouver'
|
title = 'Ming Pao - Vancouver'
|
||||||
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
|
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
|
||||||
category = 'Chinese, News, Vancouver'
|
category = 'Chinese, News, Vancouver'
|
||||||
@ -127,6 +142,9 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
lambda match: ''),
|
lambda match: ''),
|
||||||
]
|
]
|
||||||
elif __Region__ == 'Toronto':
|
elif __Region__ == 'Toronto':
|
||||||
|
if __UseChineseTitle__ == True:
|
||||||
|
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
|
||||||
|
else:
|
||||||
title = 'Ming Pao - Toronto'
|
title = 'Ming Pao - Toronto'
|
||||||
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
|
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
|
||||||
category = 'Chinese, News, Toronto'
|
category = 'Chinese, News, Toronto'
|
||||||
@ -161,9 +179,9 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
def get_dtlocal(self):
|
def get_dtlocal(self):
|
||||||
dt_utc = datetime.datetime.utcnow()
|
dt_utc = datetime.datetime.utcnow()
|
||||||
if __Region__ == 'Hong Kong':
|
if __Region__ == 'Hong Kong':
|
||||||
# convert UTC to local hk time - at HKT 5.30am, all news are available
|
# convert UTC to local hk time - at HKT 4.30am, all news are available
|
||||||
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
|
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
|
||||||
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
|
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
|
||||||
elif __Region__ == 'Vancouver':
|
elif __Region__ == 'Vancouver':
|
||||||
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
|
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
|
||||||
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
|
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
|
||||||
@ -186,6 +204,18 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
else:
|
else:
|
||||||
return self.get_dtlocal().strftime("%Y-%m-%d")
|
return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||||
|
|
||||||
|
def get_fetchyear(self):
|
||||||
|
if __Date__ <> '':
|
||||||
|
return __Date__[0:4]
|
||||||
|
else:
|
||||||
|
return self.get_dtlocal().strftime("%Y")
|
||||||
|
|
||||||
|
def get_fetchmonth(self):
|
||||||
|
if __Date__ <> '':
|
||||||
|
return __Date__[4:6]
|
||||||
|
else:
|
||||||
|
return self.get_dtlocal().strftime("%m")
|
||||||
|
|
||||||
def get_fetchday(self):
|
def get_fetchday(self):
|
||||||
if __Date__ <> '':
|
if __Date__ <> '':
|
||||||
return __Date__[6:8]
|
return __Date__[6:8]
|
||||||
@ -654,38 +684,112 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
del item['absmiddle']
|
del item['absmiddle']
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
def populate_article_metadata(self, article, soup, first):
|
||||||
|
# thumbnails shouldn't be available if using hi-res images
|
||||||
|
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
|
||||||
|
img = soup.find('img')
|
||||||
|
if img is not None:
|
||||||
|
self.add_toc_thumbnail(article, img['src'])
|
||||||
|
|
||||||
|
try:
|
||||||
|
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
|
||||||
|
# look for content
|
||||||
|
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
|
||||||
|
if not articlebodies:
|
||||||
|
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
|
||||||
|
if not articlebodies:
|
||||||
|
articlebodies = soup.findAll('div',attrs={'class':'content'})
|
||||||
|
if not articlebodies:
|
||||||
|
articlebodies = soup.findAll('div', attrs={'id':'font'})
|
||||||
|
if articlebodies:
|
||||||
|
for articlebody in articlebodies:
|
||||||
|
if articlebody:
|
||||||
|
# the text may or may not be enclosed in <p></p> tag
|
||||||
|
paras = articlebody.findAll('p')
|
||||||
|
if not paras:
|
||||||
|
paras = articlebody
|
||||||
|
textFound = False
|
||||||
|
for p in paras:
|
||||||
|
if not textFound:
|
||||||
|
summary_candidate = self.tag_to_string(p).strip()
|
||||||
|
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
|
||||||
|
if len(summary_candidate) > 0:
|
||||||
|
article.summary = article.text_summary = summary_candidate
|
||||||
|
textFound = True
|
||||||
|
else:
|
||||||
|
# display a simple text
|
||||||
|
#article.summary = article.text_summary = u'\u66f4\u591a......'
|
||||||
|
# display word counts
|
||||||
|
counts = 0
|
||||||
|
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
|
||||||
|
if not articlebodies:
|
||||||
|
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
|
||||||
|
if not articlebodies:
|
||||||
|
articlebodies = soup.findAll('div',attrs={'class':'content'})
|
||||||
|
if not articlebodies:
|
||||||
|
articlebodies = soup.findAll('div', attrs={'id':'font'})
|
||||||
|
if articlebodies:
|
||||||
|
for articlebody in articlebodies:
|
||||||
|
# the text may or may not be enclosed in <p></p> tag
|
||||||
|
paras = articlebody.findAll('p')
|
||||||
|
if not paras:
|
||||||
|
paras = articlebody
|
||||||
|
for p in paras:
|
||||||
|
summary_candidate = self.tag_to_string(p).strip()
|
||||||
|
counts += len(summary_candidate)
|
||||||
|
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
|
||||||
|
except:
|
||||||
|
self.log("Error creating article descriptions")
|
||||||
|
return
|
||||||
|
|
||||||
|
# override from the one in version 0.8.31
|
||||||
def create_opf(self, feeds, dir=None):
|
def create_opf(self, feeds, dir=None):
|
||||||
if dir is None:
|
if dir is None:
|
||||||
dir = self.output_dir
|
dir = self.output_dir
|
||||||
if __UseChineseTitle__ == True:
|
|
||||||
if __Region__ == 'Hong Kong':
|
|
||||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
|
||||||
elif __Region__ == 'Vancouver':
|
|
||||||
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
|
|
||||||
elif __Region__ == 'Toronto':
|
|
||||||
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
|
|
||||||
else:
|
|
||||||
title = self.short_title()
|
title = self.short_title()
|
||||||
# if not generating a periodical, force date to apply in title
|
# change 1: allow our own flag to tell if a periodical is to be generated
|
||||||
if __MakePeriodical__ == False:
|
# also use customed date instead of current time
|
||||||
|
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
|
||||||
title = title + ' ' + self.get_fetchformatteddate()
|
title = title + ' ' + self.get_fetchformatteddate()
|
||||||
if True:
|
# end of change 1
|
||||||
mi = MetaInformation(title, [self.publisher])
|
# change 2: __appname__ replaced by newspaper publisher
|
||||||
mi.publisher = self.publisher
|
__appname__ = self.publisher
|
||||||
mi.author_sort = self.publisher
|
mi = MetaInformation(title, [__appname__])
|
||||||
|
mi.publisher = __appname__
|
||||||
|
mi.author_sort = __appname__
|
||||||
|
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
|
||||||
if __MakePeriodical__ == True:
|
if __MakePeriodical__ == True:
|
||||||
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||||
else:
|
else:
|
||||||
mi.publication_type = self.publication_type+':'+self.short_title()
|
mi.publication_type = self.publication_type+':'+self.short_title()
|
||||||
#mi.timestamp = nowf()
|
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||||
mi.timestamp = self.get_dtlocal()
|
# change 4: in the following, all the nowf() are changed to adjusted time
|
||||||
mi.comments = self.description
|
# This one doesn't matter
|
||||||
if not isinstance(mi.comments, unicode):
|
mi.timestamp = nowf()
|
||||||
mi.comments = mi.comments.decode('utf-8', 'replace')
|
# change 5: skip listing the articles
|
||||||
|
#article_titles, aseen = [], set()
|
||||||
|
#for f in feeds:
|
||||||
|
# for a in f:
|
||||||
|
# if a.title and a.title not in aseen:
|
||||||
|
# aseen.add(a.title)
|
||||||
|
# article_titles.append(force_unicode(a.title, 'utf-8'))
|
||||||
|
|
||||||
|
#mi.comments = self.description
|
||||||
|
#if not isinstance(mi.comments, unicode):
|
||||||
|
# mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||||
|
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
|
||||||
|
# '\n\n'.join(article_titles))
|
||||||
|
|
||||||
|
language = canonicalize_lang(self.language)
|
||||||
|
if language is not None:
|
||||||
|
mi.language = language
|
||||||
|
# This one affects the pub date shown in kindle title
|
||||||
#mi.pubdate = nowf()
|
#mi.pubdate = nowf()
|
||||||
mi.pubdate = self.get_dtlocal()
|
# now appears to need the time field to be > 12.00noon as well
|
||||||
|
+            mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
         opf_path = os.path.join(dir, 'index.opf')
         ncx_path = os.path.join(dir, 'index.ncx')

         opf = OPFCreator(dir, mi)
         # Add mastheadImage entry to <guide> section
         mp = getattr(self, 'masthead_path', None)
@@ -721,11 +825,13 @@ class MPRecipe(BasicNewsRecipe):
                 mani.id = 'ncx'
             if mani.path.endswith('mastheadImage.jpg'):
                 mani.id = 'masthead-image'

         entries = ['index.html']
         toc = TOC(base_path=dir)
         self.play_order_counter = 0
         self.play_order_map = {}


         def feed_index(num, parent):
             f = feeds[num]
             for j, a in enumerate(f):
@@ -739,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
                         desc = None
                     else:
                         desc = self.description_limiter(desc)
+                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                     entries.append('%sindex.html'%adir)
                     po = self.play_order_map.get(entries[-1], None)
                     if po is None:
                         self.play_order_counter += 1
                         po = self.play_order_counter
-                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
-                            play_order=po, author=auth, description=desc)
+                    parent.add_item('%sindex.html'%adir, None,
+                            a.title if a.title else _('Untitled Article'),
+                            play_order=po, author=auth,
+                            description=desc, toc_thumbnail=tt)
                     last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                     for sp in a.sub_pages:
                         prefix = os.path.commonprefix([opf_path, sp])
@@ -762,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
                         prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                         templ = self.navbar.generate(True, num, j, len(f),
                                         not self.has_single_feed,
-                                        a.orig_url, self.publisher, prefix=prefix,
+                                        a.orig_url, __appname__, prefix=prefix,
                                         center=self.center_navbar)
                         elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                         body.insert(len(body.contents), elem)
@@ -799,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
         with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
             opf.render(opf_file, ncx_file)

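The pubdate change above pins the periodical's publication date to the fetch date, with the time component pushed past noon so the device does not roll the issue back to the previous day. A standalone sketch of the same construction (standard library only; the helper name is illustrative, not part of the recipe):

import datetime

def pinned_pubdate(yyyymmdd):
    # yyyymmdd: 'YYYYMMDD' string, e.g. '20111218'; force the time to
    # 12:30 so the pub date shown on the device matches the fetch date.
    return datetime.datetime(int(yyyymmdd[0:4]), int(yyyymmdd[4:6]),
                             int(yyyymmdd[6:8]), 12, 30, 0)

print(pinned_pubdate('20111218'))  # 2011-12-18 12:30:00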
recipes/ming_pao_toronto.recipe
@@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
 # Region - Hong Kong, Vancouver, Toronto
 __Region__ = 'Toronto'
 # Users of Kindle 3 with limited system-level CJK support
-# please replace the following "True" with "False".
+# please replace the following "True" with "False". (Default: True)
 __MakePeriodical__ = True
-# Turn below to true if your device supports display of CJK titles
+# Turn below to True if your device supports display of CJK titles (Default: False)
 __UseChineseTitle__ = False
-# Set it to False if you want to skip images
+# Set it to False if you want to skip images (Default: True)
 __KeepImages__ = True
-# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
+# Set it to True if you want to include a summary in Kindle's article view (Default: False)
+__IncludeSummary__ = False
+# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
+__IncludeThumbnails__ = True
+# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
 __UseLife__ = True
+# (HK only) It is to disable premium content (Default: False)
+__InclPremium__ = False
+# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
+__ParsePFF__ = True
+# (HK only) Turn below to True if you wish hi-res images (Default: False)
+__HiResImg__ = False
+# Override the date returned by the program if specifying a YYYYMMDD below
+__Date__ = ''

 '''
 Change Log:
+2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
+            from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
+            download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
+2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
+2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
+2011/10/19: fix a bug in txt source parsing
+2011/10/17: disable fetching of premium content, also improved txt source parsing
+2011/10/04: option to get hi-res photos for the articles
+2011/09/21: fetching "column" section is made optional.
+2011/09/18: parse "column" section stuff from source text file directly.
+2011/09/07: disable "column" section as it is no longer offered free.
 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
             provide options to remove all images in the file
 2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@@ -37,30 +60,38 @@ Change Log:
 2010/10/31: skip repeated articles in section pages
 '''

-import os, datetime, re
+from calibre.utils.date import now as nowf
+import os, datetime, re, mechanize
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.localization import canonicalize_lang

 # MAIN CLASS
 class MPRecipe(BasicNewsRecipe):
     if __Region__ == 'Hong Kong':
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u9999\u6e2f)'
+        else:
             title = 'Ming Pao - Hong Kong'
         description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
         category = 'Chinese, News, Hong Kong'
-        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
         masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
         keep_only_tags = [dict(name='h1'),
                           dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                           dict(name='font', attrs={'color':['AA0000']}), # for column articles title
+                          dict(attrs={'class':['heading']}), # for heading from txt
                           dict(attrs={'id':['newscontent']}), # entertainment and column page content
                           dict(attrs={'id':['newscontent01','newscontent02']}),
+                          dict(attrs={'class':['content']}), # for content from txt
                           dict(attrs={'class':['photo']}),
                           dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
-                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
+                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
+                          dict(attrs={'class':['images']}) # for images from txt
                          ]
         if __KeepImages__:
             remove_tags = [dict(name='style'),
@@ -90,6 +121,9 @@ class MPRecipe(BasicNewsRecipe):
                     lambda match: "</b>")
                    ]
     elif __Region__ == 'Vancouver':
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
+        else:
             title = 'Ming Pao - Vancouver'
         description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
         category = 'Chinese, News, Vancouver'
@@ -108,6 +142,9 @@ class MPRecipe(BasicNewsRecipe):
                     lambda match: ''),
                    ]
     elif __Region__ == 'Toronto':
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u591a\u502b\u591a)'
+        else:
             title = 'Ming Pao - Toronto'
         description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
         category = 'Chinese, News, Toronto'
@@ -139,49 +176,12 @@ class MPRecipe(BasicNewsRecipe):
     conversion_options = {'linearize_tables':True}
     timefmt = ''

-    def image_url_processor(cls, baseurl, url):
-        # trick: break the url at the first occurance of digit, add an additional
-        # '_' at the front
-        # not working, may need to move this to preprocess_html() method
-        # minIdx = 10000
-        # i0 = url.find('0')
-        # if i0 >= 0 and i0 < minIdx:
-        #     minIdx = i0
-        # i1 = url.find('1')
-        # if i1 >= 0 and i1 < minIdx:
-        #     minIdx = i1
-        # i2 = url.find('2')
-        # if i2 >= 0 and i2 < minIdx:
-        #     minIdx = i2
-        # i3 = url.find('3')
-        # if i3 >= 0 and i0 < minIdx:
-        #     minIdx = i3
-        # i4 = url.find('4')
-        # if i4 >= 0 and i4 < minIdx:
-        #     minIdx = i4
-        # i5 = url.find('5')
-        # if i5 >= 0 and i5 < minIdx:
-        #     minIdx = i5
-        # i6 = url.find('6')
-        # if i6 >= 0 and i6 < minIdx:
-        #     minIdx = i6
-        # i7 = url.find('7')
-        # if i7 >= 0 and i7 < minIdx:
-        #     minIdx = i7
-        # i8 = url.find('8')
-        # if i8 >= 0 and i8 < minIdx:
-        #     minIdx = i8
-        # i9 = url.find('9')
-        # if i9 >= 0 and i9 < minIdx:
-        #     minIdx = i9
-        return url

     def get_dtlocal(self):
         dt_utc = datetime.datetime.utcnow()
         if __Region__ == 'Hong Kong':
-            # convert UTC to local hk time - at HKT 5.30am, all news are available
-            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
-            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
+            # convert UTC to local hk time - at HKT 4.30am, all news are available
+            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
+            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
         elif __Region__ == 'Vancouver':
             # convert UTC to local Vancouver time - at PST time 5.30am, all news are available
             dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@@ -193,12 +193,33 @@ class MPRecipe(BasicNewsRecipe):
         return dt_local

     def get_fetchdate(self):
+        if __Date__ <> '':
+            return __Date__
+        else:
             return self.get_dtlocal().strftime("%Y%m%d")

     def get_fetchformatteddate(self):
+        if __Date__ <> '':
+            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
+        else:
             return self.get_dtlocal().strftime("%Y-%m-%d")

+    def get_fetchyear(self):
+        if __Date__ <> '':
+            return __Date__[0:4]
+        else:
+            return self.get_dtlocal().strftime("%Y")
+
+    def get_fetchmonth(self):
+        if __Date__ <> '':
+            return __Date__[4:6]
+        else:
+            return self.get_dtlocal().strftime("%m")
+
     def get_fetchday(self):
+        if __Date__ <> '':
+            return __Date__[6:8]
+        else:
             return self.get_dtlocal().strftime("%d")

     def get_cover_url(self):
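All the get_fetch* helpers added above share one pattern: honour the manual __Date__ override ('YYYYMMDD') when it is non-empty, otherwise derive the value from the time-shifted local clock. A compact sketch of that pattern (hypothetical names, standard library only; the 4.5-hour shift mirrors the HKT 4.30am comment above):

import datetime

OVERRIDE_DATE = ''  # e.g. '20111218' to force a fetch date

def fetch_date(fmt='%Y%m%d'):
    # Prefer the override; else shift UTC to HK time minus the 4.5h
    # embargo so a new day only starts once all news is available.
    if OVERRIDE_DATE != '':
        d = datetime.datetime.strptime(OVERRIDE_DATE, '%Y%m%d')
    else:
        d = (datetime.datetime.utcnow()
             + datetime.timedelta(hours=8) - datetime.timedelta(hours=4.5))
    return d.strftime(fmt)

print(fetch_date())            # '20111218'-style
print(fetch_date('%Y-%m-%d'))  # '2011-12-18'-style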
@@ -230,12 +251,23 @@ class MPRecipe(BasicNewsRecipe):
                               (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
                               (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
                               (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
-                              (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
-                              (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
+                              (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
+                             ]:
+                if __InclPremium__ == True:
+                    articles = self.parse_section2_txt(url, keystr)
+                else:
                     articles = self.parse_section2(url, keystr)
                 if articles:
                     feeds.append((title, articles))

+            if __InclPremium__ == True:
+                # parse column section articles directly from .txt files
+                for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
+                                          ]:
+                    articles = self.parse_section2_txt(url, keystr)
+                    if articles:
+                        feeds.append((title, articles))
+
             for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                                (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                 articles = self.parse_section(url)
@@ -244,15 +276,16 @@ class MPRecipe(BasicNewsRecipe):
         else:
             for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                                (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
-                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
+                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
+                               (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
                 articles = self.parse_section(url)
                 if articles:
                     feeds.append((title, articles))

             # special- editorial
-            ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
-            if ed_articles:
-                feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
+            #ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
+            #if ed_articles:
+            #    feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

             for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                                (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@@ -263,20 +296,39 @@ class MPRecipe(BasicNewsRecipe):

             # special - finance
             #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
-            fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
-            if fin_articles:
-                feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+            #fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
+            #if fin_articles:
+            #    feeds.append((u'\u7d93\u6fdf Finance', fin_articles))

-            for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
-                               (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
-                articles = self.parse_section(url)
+            for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
+                articles = self.parse_section2_txt(url, keystr)
                 if articles:
                     feeds.append((title, articles))

+            #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+            #                   (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
+            #    articles = self.parse_section(url)
+            #    if articles:
+            #        feeds.append((title, articles))
+
             # special - entertainment
-            ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
-            if ent_articles:
-                feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
+            #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+            #if ent_articles:
+            #    feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))

+            for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
+                                      ]:
+                articles = self.parse_section2_txt(url, keystr)
+                if articles:
+                    feeds.append((title, articles))
+
+            if __InclPremium__ == True:
+                # parse column section articles directly from .txt files
+                for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
+                                          ]:
+                    articles = self.parse_section2_txt(url, keystr)
+                    if articles:
+                        feeds.append((title, articles))
+
             for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                                (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@@ -284,11 +336,6 @@ class MPRecipe(BasicNewsRecipe):
                 if articles:
                     feeds.append((title, articles))

-            # special- columns
-            col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
-            if col_articles:
-                feeds.append((u'\u5c08\u6b04 Columns', col_articles))
-
         elif __Region__ == 'Vancouver':
             for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
                                (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@@ -332,6 +379,16 @@ class MPRecipe(BasicNewsRecipe):
                 title = self.tag_to_string(a)
                 url = a.get('href', False)
                 url = 'http://news.mingpao.com/' + dateStr + '/' +url
+                # replace the url to the print-friendly version
+                if __ParsePFF__ == True:
+                    if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
+                        url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
+                        url = re.sub('%2F.*%2F', '/', url)
+                        title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
+                        url = url.replace('%2Etxt', '_print.htm')
+                        url = url.replace('%5F', '_')
+                    else:
+                        url = url.replace('.htm', '_print.htm')
                 if url not in included_urls and url.rfind('Redirect') == -1:
                     current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
                     included_urls.append(url)
@@ -340,6 +397,8 @@ class MPRecipe(BasicNewsRecipe):

     # parse from life.mingpao.com
     def parse_section2(self, url, keystr):
+        br = mechanize.Browser()
+        br.set_handle_redirect(False)
         self.get_fetchdate()
         soup = self.index_to_soup(url)
         a = soup.findAll('a', href=True)
@@ -350,9 +409,31 @@ class MPRecipe(BasicNewsRecipe):
             title = self.tag_to_string(i)
             url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
             if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
+                try:
+                    br.open_novisit(url)
                     url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
                     current_articles.append({'title': title, 'url': url, 'description': ''})
                     included_urls.append(url)
+                except:
+                    print 'skipping a premium article'
+        current_articles.reverse()
+        return current_articles
+
+    # parse from text file of life.mingpao.com
+    def parse_section2_txt(self, url, keystr):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
+                url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
         current_articles.reverse()
         return current_articles

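parse_section2() above now probes each candidate URL with a non-visiting request before queueing it, so articles that have moved behind the paywall are skipped instead of aborting the download. A hedged sketch of that probe, assuming the mechanize package is available; the URL below is a placeholder:

import mechanize

def is_fetchable(url):
    # Probe without recording browser history; with redirects disabled,
    # a paywall redirect (or any HTTP error) raises and means "skip".
    br = mechanize.Browser()
    br.set_handle_redirect(False)
    try:
        br.open_novisit(url)
        return True
    except Exception:
        return False

# Placeholder URL, for illustration only:
print(is_fetchable('http://life.mingpao.com/cfm/dailynews3a.cfm?File=example.txt'))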
@@ -438,6 +519,162 @@ class MPRecipe(BasicNewsRecipe):
         current_articles.reverse()
         return current_articles

+    # preprocess those .txt and javascript based files
+    def preprocess_raw_html(self, raw_html, url):
+        new_html = raw_html
+        if url.rfind('ftp') <> -1 or url.rfind('_print.htm') <> -1:
+            if url.rfind('_print.htm') <> -1:
+                # javascript based file
+                splitter = re.compile(r'\n')
+                new_raw_html = '<html><head><title>Untitled</title></head>'
+                new_raw_html = new_raw_html + '<body>'
+                for item in splitter.split(raw_html):
+                    if item.startswith('var heading1 ='):
+                        heading = item.replace('var heading1 = \'', '')
+                        heading = heading.replace('\'', '')
+                        heading = heading.replace(';', '')
+                        new_raw_html = new_raw_html + '<div class="heading">' + heading
+                    if item.startswith('var heading2 ='):
+                        heading = item.replace('var heading2 = \'', '')
+                        heading = heading.replace('\'', '')
+                        heading = heading.replace(';', '')
+                        if heading <> '':
+                            new_raw_html = new_raw_html + '<br>' + heading + '</div>'
+                        else:
+                            new_raw_html = new_raw_html + '</div>'
+                    if item.startswith('var content ='):
+                        content = item.replace("var content = ", '')
+                        content = content.replace('\'', '')
+                        content = content.replace(';', '')
+                        new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
+                    if item.startswith('var photocontent ='):
+                        photo = item.replace('var photocontent = \'', '')
+                        photo = photo.replace('\'', '')
+                        photo = photo.replace(';', '')
+                        photo = photo.replace('<tr>', '')
+                        photo = photo.replace('<td>', '')
+                        photo = photo.replace('</tr>', '')
+                        photo = photo.replace('</td>', '<br>')
+                        photo = photo.replace('class="photo"', '')
+                        new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
+                new_html = new_raw_html + '</body></html>'
+            else:
+                # .txt based file
+                splitter = re.compile(r'\n') # Match non-digits
+                new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
+                next_is_img_txt = False
+                title_started = False
+                title_break_reached = False
+                met_article_start_char = False
+                for item in splitter.split(raw_html):
+                    item = item.strip()
+                    # if title already reached but break between title and content not yet found, record title_break_reached
+                    if title_started == True and title_break_reached == False and item == '':
+                        title_break_reached = True
+                    # if title reached and title_break_reached and met_article_start_char == False and item is not empty
+                    # start content
+                    elif title_started == True and title_break_reached == True and met_article_start_char == False:
+                        if item <> '':
+                            met_article_start_char = True
+                            new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
+                            #if item.startswith(u'\u3010'):
+                            #    met_article_start_char = True
+                            #    new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
+                    else:
+                        if next_is_img_txt == False:
+                            if item.startswith("=@"):
+                                print 'skip movie link'
+                            elif item.startswith("=?"):
+                                next_is_img_txt = True
+                                new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
+                            elif item.startswith('=='):
+                                next_is_img_txt = True
+                                if False:
+                                    # TODO: check existence of .gif first
+                                    newimg = '_' + item[2:].strip() + '.jpg'
+                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
+                                else:
+                                    new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
+                            elif item.startswith('='):
+                                next_is_img_txt = True
+                                if False:
+                                    # TODO: check existence of .gif first
+                                    newimg = '_' + item[1:].strip() + '.jpg'
+                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
+                                else:
+                                    new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
+                            else:
+                                if next_is_img_txt == False and met_article_start_char == False:
+                                    if item <> '':
+                                        if title_started == False:
+                                            #print 'Title started at ', item
+                                            new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
+                                            title_started = True
+                                        else:
+                                            new_raw_html = new_raw_html + item + '\n'
+                                else:
+                                    new_raw_html = new_raw_html + item + '<p>\n'
+                        else:
+                            next_is_img_txt = False
+                            new_raw_html = new_raw_html + item + '\n'
+                new_html = new_raw_html + '</div></body></html>'
+        #raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
+        if __HiResImg__ == True:
+            # TODO: add a _ in front of an image url
+            if url.rfind('news.mingpao.com') > -1:
+                imglist = re.findall('src="?.*?jpg"', new_html)
+                br = mechanize.Browser()
+                br.set_handle_redirect(False)
+                for img in imglist:
+                    gifimg = img.replace('jpg"', 'gif"')
+                    try:
+                        br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
+                        new_html = new_html.replace(img, gifimg)
+                    except:
+                        # find the location of the first _
+                        pos = img.find('_')
+                        if pos > -1:
+                            # if found, insert _ after the first _
+                            newimg = img[0:pos] + '_' + img[pos:]
+                            new_html = new_html.replace(img, newimg)
+                        else:
+                            # if not found, insert _ after "
+                            new_html = new_html.replace(img[1:], '"_' + img[1:])
+            elif url.rfind('life.mingpao.com') > -1:
+                imglist = re.findall('src=\'?.*?jpg\'', new_html)
+                br = mechanize.Browser()
+                br.set_handle_redirect(False)
+                #print 'Img list: ', imglist, '\n'
+                for img in imglist:
+                    #print 'Found img: ', img
+                    gifimg = img.replace('jpg\'', 'gif\'')
+                    try:
+                        gifurl = re.sub(r'dailynews.*txt', '', url)
+                        br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
+                        new_html = new_html.replace(img, gifimg)
+                    except:
+                        pos = img.rfind('/')
+                        newimg = img[0:pos+1] + '_' + img[pos+1:]
+                        new_html = new_html.replace(img, newimg)
+                # repeat with src quoted by double quotes, for text parsed from src txt
+                imglist = re.findall('src="?.*?jpg"', new_html)
+                for img in imglist:
+                    #print 'Found img: ', img
+                    gifimg = img.replace('jpg"', 'gif"')
+                    try:
+                        #print 'url', url
+                        pos = url.rfind('/')
+                        gifurl = url[:pos+1]
+                        #print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
+                        br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
+                        new_html = new_html.replace(img, gifimg)
+                    except:
+                        pos = img.find('"')
+                        newimg = img[0:pos+1] + '_' + img[pos+1:]
+                        #print 'Use hi-res img', newimg
+                        new_html = new_html.replace(img, newimg)
+        return new_html

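The __HiResImg__ branch just added works per image reference: it first tries whether a .gif with the same stem exists on the server (caption graphics), and only if that fails rewrites the .jpg name with an extra underscore, which is where the site keeps its high-resolution copies. A reduced sketch of that decision (hypothetical helper name; mechanize assumed available; placeholder URL and file names):

import mechanize

def hi_res_src(base_url, img_name):
    # Prefer a same-stem .gif when the server has one; otherwise fall
    # back to the '_'-prefixed hi-res .jpg naming convention.
    br = mechanize.Browser()
    br.set_handle_redirect(False)
    gif_name = img_name.replace('.jpg', '.gif')
    try:
        br.open_novisit(base_url + gif_name)
        return gif_name
    except Exception:
        return '_' + img_name

# Placeholder values, for illustration only:
print(hi_res_src('http://life.mingpao.com/ftp/Life3/', 'photo01.jpg'))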
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
@@ -447,38 +684,112 @@ class MPRecipe(BasicNewsRecipe):
             del item['absmiddle']
         return soup

+    def populate_article_metadata(self, article, soup, first):
+        # thumbnails shouldn't be available if using hi-res images
+        if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
+            img = soup.find('img')
+            if img is not None:
+                self.add_toc_thumbnail(article, img['src'])
+
+        try:
+            if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
+                # look for content
+                articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'class':'content'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div', attrs={'id':'font'})
+                if articlebodies:
+                    for articlebody in articlebodies:
+                        if articlebody:
+                            # the text may or may not be enclosed in <p></p> tag
+                            paras = articlebody.findAll('p')
+                            if not paras:
+                                paras = articlebody
+                            textFound = False
+                            for p in paras:
+                                if not textFound:
+                                    summary_candidate = self.tag_to_string(p).strip()
+                                    summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
+                                    if len(summary_candidate) > 0:
+                                        article.summary = article.text_summary = summary_candidate
+                                        textFound = True
+            else:
+                # display a simple text
+                #article.summary = article.text_summary = u'\u66f4\u591a......'
+                # display word counts
+                counts = 0
+                articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'class':'content'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div', attrs={'id':'font'})
+                if articlebodies:
+                    for articlebody in articlebodies:
+                        # the text may or may not be enclosed in <p></p> tag
+                        paras = articlebody.findAll('p')
+                        if not paras:
+                            paras = articlebody
+                        for p in paras:
+                            summary_candidate = self.tag_to_string(p).strip()
+                            counts += len(summary_candidate)
+                        article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
+        except:
+            self.log("Error creating article descriptions")
+            return
+
+    # override from the one in version 0.8.31
     def create_opf(self, feeds, dir=None):
         if dir is None:
             dir = self.output_dir
-        if __UseChineseTitle__ == True:
-            if __Region__ == 'Hong Kong':
-                title = u'\u660e\u5831 (\u9999\u6e2f)'
-            elif __Region__ == 'Vancouver':
-                title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
-            elif __Region__ == 'Toronto':
-                title = u'\u660e\u5831 (\u591a\u502b\u591a)'
-        else:
         title = self.short_title()
-        # if not generating a periodical, force date to apply in title
-        if __MakePeriodical__ == False:
+        # change 1: allow our own flag to tell if a periodical is to be generated
+        # also use customed date instead of current time
+        if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
             title = title + ' ' + self.get_fetchformatteddate()
-        if True:
-            mi = MetaInformation(title, [self.publisher])
-            mi.publisher = self.publisher
-            mi.author_sort = self.publisher
+        # end of change 1
+        # change 2: __appname__ replaced by newspaper publisher
+        __appname__ = self.publisher
+        mi = MetaInformation(title, [__appname__])
+        mi.publisher = __appname__
+        mi.author_sort = __appname__
+        # change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
         if __MakePeriodical__ == True:
             mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
         else:
             mi.publication_type = self.publication_type+':'+self.short_title()
-        #mi.timestamp = nowf()
-        mi.timestamp = self.get_dtlocal()
-        mi.comments = self.description
-        if not isinstance(mi.comments, unicode):
-            mi.comments = mi.comments.decode('utf-8', 'replace')
+        #mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        # change 4: in the following, all the nowf() are changed to adjusted time
+        # This one doesn't matter
+        mi.timestamp = nowf()
+        # change 5: skip listing the articles
+        #article_titles, aseen = [], set()
+        #for f in feeds:
+        #    for a in f:
+        #        if a.title and a.title not in aseen:
+        #            aseen.add(a.title)
+        #            article_titles.append(force_unicode(a.title, 'utf-8'))

+        #mi.comments = self.description
+        #if not isinstance(mi.comments, unicode):
+        #    mi.comments = mi.comments.decode('utf-8', 'replace')
+        #mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
+        #    '\n\n'.join(article_titles))

+        language = canonicalize_lang(self.language)
+        if language is not None:
+            mi.language = language
+        # This one affects the pub date shown in kindle title
         #mi.pubdate = nowf()
-        mi.pubdate = self.get_dtlocal()
+        # now appears to need the time field to be > 12.00noon as well
+        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
         opf_path = os.path.join(dir, 'index.opf')
         ncx_path = os.path.join(dir, 'index.ncx')

         opf = OPFCreator(dir, mi)
         # Add mastheadImage entry to <guide> section
         mp = getattr(self, 'masthead_path', None)
@@ -514,11 +825,13 @@ class MPRecipe(BasicNewsRecipe):
                 mani.id = 'ncx'
             if mani.path.endswith('mastheadImage.jpg'):
                 mani.id = 'masthead-image'

         entries = ['index.html']
         toc = TOC(base_path=dir)
         self.play_order_counter = 0
         self.play_order_map = {}


         def feed_index(num, parent):
             f = feeds[num]
             for j, a in enumerate(f):
@@ -532,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
                         desc = None
                     else:
                         desc = self.description_limiter(desc)
+                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                     entries.append('%sindex.html'%adir)
                     po = self.play_order_map.get(entries[-1], None)
                     if po is None:
                         self.play_order_counter += 1
                         po = self.play_order_counter
-                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
-                            play_order=po, author=auth, description=desc)
+                    parent.add_item('%sindex.html'%adir, None,
+                            a.title if a.title else _('Untitled Article'),
+                            play_order=po, author=auth,
+                            description=desc, toc_thumbnail=tt)
                     last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                     for sp in a.sub_pages:
                         prefix = os.path.commonprefix([opf_path, sp])
@@ -555,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
                         prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                         templ = self.navbar.generate(True, num, j, len(f),
                                         not self.has_single_feed,
-                                        a.orig_url, self.publisher, prefix=prefix,
+                                        a.orig_url, __appname__, prefix=prefix,
                                         center=self.center_navbar)
                         elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                         body.insert(len(body.contents), elem)
@@ -592,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
         with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
             opf.render(opf_file, ncx_file)

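populate_article_metadata(), added in the diff above, registers the first image of an article as its TOC thumbnail and, when __IncludeSummary__ is set, synthesizes a summary from the first non-empty paragraph of the content div (falling back to a character count otherwise). A reduced sketch of the summary half, written against plain BeautifulSoup rather than calibre's bundled soup so it can run standalone:

from bs4 import BeautifulSoup  # assumption: plain bs4 instead of calibre's wrapper

def first_paragraph_summary(html, lead_in=u'\u3010\u660e\u5831\u5c08\u8a0a\u3011'):
    # First non-empty <p> inside the content div, with the boilerplate
    # agency lead-in stripped, becomes the article-view summary.
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find('div', attrs={'class': 'content'})
    if body is None:
        return None
    for p in body.find_all('p'):
        text = p.get_text().strip().replace(lead_in, '', 1)
        if text:
            return text
    return None

print(first_paragraph_summary(
    u'<div class="content"><p></p><p>First real paragraph.</p></div>'))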
recipes/ming_pao_vancouver.recipe
@@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
 # Region - Hong Kong, Vancouver, Toronto
 __Region__ = 'Vancouver'
 # Users of Kindle 3 with limited system-level CJK support
-# please replace the following "True" with "False".
+# please replace the following "True" with "False". (Default: True)
 __MakePeriodical__ = True
-# Turn below to true if your device supports display of CJK titles
+# Turn below to True if your device supports display of CJK titles (Default: False)
 __UseChineseTitle__ = False
-# Set it to False if you want to skip images
+# Set it to False if you want to skip images (Default: True)
 __KeepImages__ = True
-# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
+# Set it to True if you want to include a summary in Kindle's article view (Default: False)
+__IncludeSummary__ = False
+# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
+__IncludeThumbnails__ = True
+# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
 __UseLife__ = True
+# (HK only) It is to disable premium content (Default: False)
+__InclPremium__ = False
+# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
+__ParsePFF__ = True
+# (HK only) Turn below to True if you wish hi-res images (Default: False)
+__HiResImg__ = False
+# Override the date returned by the program if specifying a YYYYMMDD below
+__Date__ = ''

 '''
 Change Log:
+2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
+            from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
+            download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
+2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
+2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
+2011/10/19: fix a bug in txt source parsing
+2011/10/17: disable fetching of premium content, also improved txt source parsing
+2011/10/04: option to get hi-res photos for the articles
+2011/09/21: fetching "column" section is made optional.
+2011/09/18: parse "column" section stuff from source text file directly.
+2011/09/07: disable "column" section as it is no longer offered free.
 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
             provide options to remove all images in the file
 2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@@ -37,30 +60,38 @@ Change Log:
 2010/10/31: skip repeated articles in section pages
 '''

-import os, datetime, re
+from calibre.utils.date import now as nowf
+import os, datetime, re, mechanize
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.localization import canonicalize_lang

 # MAIN CLASS
 class MPRecipe(BasicNewsRecipe):
     if __Region__ == 'Hong Kong':
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u9999\u6e2f)'
+        else:
             title = 'Ming Pao - Hong Kong'
         description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
         category = 'Chinese, News, Hong Kong'
-        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
         masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
         keep_only_tags = [dict(name='h1'),
                           dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                           dict(name='font', attrs={'color':['AA0000']}), # for column articles title
+                          dict(attrs={'class':['heading']}), # for heading from txt
                           dict(attrs={'id':['newscontent']}), # entertainment and column page content
                           dict(attrs={'id':['newscontent01','newscontent02']}),
+                          dict(attrs={'class':['content']}), # for content from txt
                           dict(attrs={'class':['photo']}),
                           dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
-                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
+                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
+                          dict(attrs={'class':['images']}) # for images from txt
                          ]
         if __KeepImages__:
             remove_tags = [dict(name='style'),
@@ -90,6 +121,9 @@ class MPRecipe(BasicNewsRecipe):
                     lambda match: "</b>")
                    ]
     elif __Region__ == 'Vancouver':
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
+        else:
             title = 'Ming Pao - Vancouver'
         description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
         category = 'Chinese, News, Vancouver'
@@ -108,6 +142,9 @@ class MPRecipe(BasicNewsRecipe):
                     lambda match: ''),
                    ]
     elif __Region__ == 'Toronto':
+        if __UseChineseTitle__ == True:
+            title = u'\u660e\u5831 (\u591a\u502b\u591a)'
+        else:
             title = 'Ming Pao - Toronto'
         description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
         category = 'Chinese, News, Toronto'
@@ -139,49 +176,12 @@ class MPRecipe(BasicNewsRecipe):
     conversion_options = {'linearize_tables':True}
     timefmt = ''

-    def image_url_processor(cls, baseurl, url):
-        # trick: break the url at the first occurance of digit, add an additional
-        # '_' at the front
-        # not working, may need to move this to preprocess_html() method
-        # minIdx = 10000
-        # i0 = url.find('0')
-        # if i0 >= 0 and i0 < minIdx:
-        #     minIdx = i0
-        # i1 = url.find('1')
-        # if i1 >= 0 and i1 < minIdx:
-        #     minIdx = i1
-        # i2 = url.find('2')
-        # if i2 >= 0 and i2 < minIdx:
-        #     minIdx = i2
-        # i3 = url.find('3')
-        # if i3 >= 0 and i0 < minIdx:
-        #     minIdx = i3
-        # i4 = url.find('4')
-        # if i4 >= 0 and i4 < minIdx:
-        #     minIdx = i4
-        # i5 = url.find('5')
-        # if i5 >= 0 and i5 < minIdx:
-        #     minIdx = i5
-        # i6 = url.find('6')
-        # if i6 >= 0 and i6 < minIdx:
-        #     minIdx = i6
-        # i7 = url.find('7')
-        # if i7 >= 0 and i7 < minIdx:
-        #     minIdx = i7
-        # i8 = url.find('8')
-        # if i8 >= 0 and i8 < minIdx:
-        #     minIdx = i8
-        # i9 = url.find('9')
-        # if i9 >= 0 and i9 < minIdx:
-        #     minIdx = i9
-        return url

     def get_dtlocal(self):
         dt_utc = datetime.datetime.utcnow()
         if __Region__ == 'Hong Kong':
-            # convert UTC to local hk time - at HKT 5.30am, all news are available
-            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
-            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
+            # convert UTC to local hk time - at HKT 4.30am, all news are available
+            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
+            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
         elif __Region__ == 'Vancouver':
             # convert UTC to local Vancouver time - at PST time 5.30am, all news are available
             dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@@ -193,12 +193,33 @@ class MPRecipe(BasicNewsRecipe):
         return dt_local

     def get_fetchdate(self):
+        if __Date__ <> '':
+            return __Date__
+        else:
             return self.get_dtlocal().strftime("%Y%m%d")

     def get_fetchformatteddate(self):
+        if __Date__ <> '':
+            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
+        else:
             return self.get_dtlocal().strftime("%Y-%m-%d")

+    def get_fetchyear(self):
+        if __Date__ <> '':
+            return __Date__[0:4]
+        else:
+            return self.get_dtlocal().strftime("%Y")
+
+    def get_fetchmonth(self):
+        if __Date__ <> '':
+            return __Date__[4:6]
+        else:
+            return self.get_dtlocal().strftime("%m")
+
     def get_fetchday(self):
+        if __Date__ <> '':
+            return __Date__[6:8]
+        else:
             return self.get_dtlocal().strftime("%d")

     def get_cover_url(self):
@ -230,12 +251,23 @@ class MPRecipe(BasicNewsRecipe):
|
|||||||
                                        (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
                                        (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
                                        (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
-                                       (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
-                                       (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
+                                       (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
+                                      ]:
+                if __InclPremium__ == True:
+                    articles = self.parse_section2_txt(url, keystr)
+                else:
                     articles = self.parse_section2(url, keystr)
                 if articles:
                     feeds.append((title, articles))

+            if __InclPremium__ == True:
+                # parse column section articles directly from .txt files
+                for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
+                                          ]:
+                    articles = self.parse_section2_txt(url, keystr)
+                    if articles:
+                        feeds.append((title, articles))
+
             for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                                (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                 articles = self.parse_section(url)
@@ -244,15 +276,16 @@ class MPRecipe(BasicNewsRecipe):
         else:
             for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                                (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
-                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
+                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
+                               (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
                 articles = self.parse_section(url)
                 if articles:
                     feeds.append((title, articles))

             # special- editorial
-            ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
-            if ed_articles:
-                feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
+            #ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
+            #if ed_articles:
+            #    feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

             for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                                (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@@ -263,20 +296,39 @@ class MPRecipe(BasicNewsRecipe):

             # special - finance
             #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
-            fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
-            if fin_articles:
-                feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+            #fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
+            #if fin_articles:
+            #    feeds.append((u'\u7d93\u6fdf Finance', fin_articles))

-            for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
-                               (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
-                articles = self.parse_section(url)
+            for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
+                articles = self.parse_section2_txt(url, keystr)
                 if articles:
                     feeds.append((title, articles))

+            #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+            #                   (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
+            #    articles = self.parse_section(url)
+            #    if articles:
+            #        feeds.append((title, articles))

             # special - entertainment
-            ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
-            if ent_articles:
-                feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
+            #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+            #if ent_articles:
+            #    feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))

+            for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
+                                      ]:
+                articles = self.parse_section2_txt(url, keystr)
+                if articles:
+                    feeds.append((title, articles))

+            if __InclPremium__ == True:
+                # parse column section articles directly from .txt files
+                for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
+                                          ]:
+                    articles = self.parse_section2_txt(url, keystr)
+                    if articles:
+                        feeds.append((title, articles))

             for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                                (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@@ -284,11 +336,6 @@ class MPRecipe(BasicNewsRecipe):
                 if articles:
                     feeds.append((title, articles))

-
-            # special- columns
-            col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
-            if col_articles:
-                feeds.append((u'\u5c08\u6b04 Columns', col_articles))
         elif __Region__ == 'Vancouver':
             for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
                                (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@@ -332,6 +379,16 @@ class MPRecipe(BasicNewsRecipe):
                 title = self.tag_to_string(a)
                 url = a.get('href', False)
                 url = 'http://news.mingpao.com/' + dateStr + '/' +url
+                # replace the url to the print-friendly version
+                if __ParsePFF__ == True:
+                    if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
+                        url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
+                        url = re.sub('%2F.*%2F', '/', url)
+                        title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
+                        url = url.replace('%2Etxt', '_print.htm')
+                        url = url.replace('%5F', '_')
+                    else:
+                        url = url.replace('.htm', '_print.htm')
                 if url not in included_urls and url.rfind('Redirect') == -1:
                     current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
                     included_urls.append(url)
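Editor's note: a standalone sketch of the premium-link rewrite added above, run on a made-up `Redirect` URL of the shape the code expects (the real links differ in detail):

    import re

    dateStr = '20120215'  # hypothetical fetch date
    url = 'http://news.mingpao.com/20120215/Redirect.cfm?File=20120215%2Fga%2Fgaa1%5F1%2Etxt'

    url = re.sub(dateStr + '.*' + dateStr, dateStr, url)  # drop everything between the two dates
    url = re.sub('%2F.*%2F', '/', url)                    # collapse the escaped directory part
    url = url.replace('%2Etxt', '_print.htm')             # point at the print-friendly page
    url = url.replace('%5F', '_')                         # unescape underscores
    # url == 'http://news.mingpao.com/20120215/gaa1_1_print.htm'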
@@ -340,6 +397,8 @@ class MPRecipe(BasicNewsRecipe):

     # parse from life.mingpao.com
     def parse_section2(self, url, keystr):
+        br = mechanize.Browser()
+        br.set_handle_redirect(False)
         self.get_fetchdate()
         soup = self.index_to_soup(url)
         a = soup.findAll('a', href=True)
@@ -350,9 +409,31 @@ class MPRecipe(BasicNewsRecipe):
             title = self.tag_to_string(i)
             url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
             if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
+                try:
+                    br.open_novisit(url)
                     url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
                     current_articles.append({'title': title, 'url': url, 'description': ''})
                     included_urls.append(url)
+                except:
+                    print 'skipping a premium article'
+        current_articles.reverse()
+        return current_articles
+
+    # parse from text file of life.mingpao.com
+    def parse_section2_txt(self, url, keystr):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
+                url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
         current_articles.reverse()
         return current_articles

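Editor's note: the `open_novisit` probe in the new `parse_section2` works because premium articles answer with a redirect to a pay page; with redirect handling switched off, that response surfaces as an exception. A minimal standalone sketch of the idea (plain `mechanize` shown here; the recipe itself calls calibre's patched browser):

    import mechanize

    br = mechanize.Browser()
    br.set_handle_redirect(False)   # a 3xx answer now raises instead of being followed

    def is_free_article(url):
        try:
            br.open(url)            # premium links redirect, so this raises
            return True
        except Exception:
            return False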
@@ -438,6 +519,162 @@ class MPRecipe(BasicNewsRecipe):
         current_articles.reverse()
         return current_articles

+    # preprocess those .txt and javascript based files
+    def preprocess_raw_html(self, raw_html, url):
+        new_html = raw_html
+        if url.rfind('ftp') <> -1 or url.rfind('_print.htm') <> -1:
+            if url.rfind('_print.htm') <> -1:
+                # javascript based file
+                splitter = re.compile(r'\n')
+                new_raw_html = '<html><head><title>Untitled</title></head>'
+                new_raw_html = new_raw_html + '<body>'
+                for item in splitter.split(raw_html):
+                    if item.startswith('var heading1 ='):
+                        heading = item.replace('var heading1 = \'', '')
+                        heading = heading.replace('\'', '')
+                        heading = heading.replace(';', '')
+                        new_raw_html = new_raw_html + '<div class="heading">' + heading
+                    if item.startswith('var heading2 ='):
+                        heading = item.replace('var heading2 = \'', '')
+                        heading = heading.replace('\'', '')
+                        heading = heading.replace(';', '')
+                        if heading <> '':
+                            new_raw_html = new_raw_html + '<br>' + heading + '</div>'
+                        else:
+                            new_raw_html = new_raw_html + '</div>'
+                    if item.startswith('var content ='):
+                        content = item.replace("var content = ", '')
+                        content = content.replace('\'', '')
+                        content = content.replace(';', '')
+                        new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
+                    if item.startswith('var photocontent ='):
+                        photo = item.replace('var photocontent = \'', '')
+                        photo = photo.replace('\'', '')
+                        photo = photo.replace(';', '')
+                        photo = photo.replace('<tr>', '')
+                        photo = photo.replace('<td>', '')
+                        photo = photo.replace('</tr>', '')
+                        photo = photo.replace('</td>', '<br>')
+                        photo = photo.replace('class="photo"', '')
+                        new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
+                new_html = new_raw_html + '</body></html>'
+            else:
+                # .txt based file
+                splitter = re.compile(r'\n') # Match non-digits
+                new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
+                next_is_img_txt = False
+                title_started = False
+                title_break_reached = False
+                met_article_start_char = False
+                for item in splitter.split(raw_html):
+                    item = item.strip()
+                    # if title already reached but break between title and content not yet found, record title_break_reached
+                    if title_started == True and title_break_reached == False and item == '':
+                        title_break_reached = True
+                    # if title reached and title_break_reached and met_article_start_char == False and item is not empty
+                    # start content
+                    elif title_started == True and title_break_reached == True and met_article_start_char == False:
+                        if item <> '':
+                            met_article_start_char = True
+                            new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
+                        #if item.startswith(u'\u3010'):
+                        #    met_article_start_char = True
+                        #    new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
+                    else:
+                        if next_is_img_txt == False:
+                            if item.startswith("=@"):
+                                print 'skip movie link'
+                            elif item.startswith("=?"):
+                                next_is_img_txt = True
+                                new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
+                            elif item.startswith('=='):
+                                next_is_img_txt = True
+                                if False:
+                                    # TODO: check existence of .gif first
+                                    newimg = '_' + item[2:].strip() + '.jpg'
+                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
+                                else:
+                                    new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
+                            elif item.startswith('='):
+                                next_is_img_txt = True
+                                if False:
+                                    # TODO: check existence of .gif first
+                                    newimg = '_' + item[1:].strip() + '.jpg'
+                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
+                                else:
+                                    new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
+                            else:
+                                if next_is_img_txt == False and met_article_start_char == False:
+                                    if item <> '':
+                                        if title_started == False:
+                                            #print 'Title started at ', item
+                                            new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
+                                            title_started = True
+                                        else:
+                                            new_raw_html = new_raw_html + item + '\n'
+                                    else:
+                                        new_raw_html = new_raw_html + item + '<p>\n'
+                        else:
+                            next_is_img_txt = False
+                            new_raw_html = new_raw_html + item + '\n'
+                new_html = new_raw_html + '</div></body></html>'
+        #raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
+        if __HiResImg__ == True:
+            # TODO: add a _ in front of an image url
+            if url.rfind('news.mingpao.com') > -1:
+                imglist = re.findall('src="?.*?jpg"', new_html)
+                br = mechanize.Browser()
+                br.set_handle_redirect(False)
+                for img in imglist:
+                    gifimg = img.replace('jpg"', 'gif"')
+                    try:
+                        br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
+                        new_html = new_html.replace(img, gifimg)
+                    except:
+                        # find the location of the first _
+                        pos = img.find('_')
+                        if pos > -1:
+                            # if found, insert _ after the first _
+                            newimg = img[0:pos] + '_' + img[pos:]
+                            new_html = new_html.replace(img, newimg)
+                        else:
+                            # if not found, insert _ after "
+                            new_html = new_html.replace(img[1:], '"_' + img[1:])
+            elif url.rfind('life.mingpao.com') > -1:
+                imglist = re.findall('src=\'?.*?jpg\'', new_html)
+                br = mechanize.Browser()
+                br.set_handle_redirect(False)
+                #print 'Img list: ', imglist, '\n'
+                for img in imglist:
+                    #print 'Found img: ', img
+                    gifimg = img.replace('jpg\'', 'gif\'')
+                    try:
+                        gifurl = re.sub(r'dailynews.*txt', '', url)
+                        br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
+                        new_html = new_html.replace(img, gifimg)
+                    except:
+                        pos = img.rfind('/')
+                        newimg = img[0:pos+1] + '_' + img[pos+1:]
+                        new_html = new_html.replace(img, newimg)
+                # repeat with src quoted by double quotes, for text parsed from src txt
+                imglist = re.findall('src="?.*?jpg"', new_html)
+                for img in imglist:
+                    #print 'Found img: ', img
+                    gifimg = img.replace('jpg"', 'gif"')
+                    try:
+                        #print 'url', url
+                        pos = url.rfind('/')
+                        gifurl = url[:pos+1]
+                        #print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
+                        br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
+                        new_html = new_html.replace(img, gifimg)
+                    except:
+                        pos = img.find('"')
+                        newimg = img[0:pos+1] + '_' + img[pos+1:]
+                        #print 'Use hi-res img', newimg
+                        new_html = new_html.replace(img, newimg)
+        return new_html

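Editor's note: the underscore trick in the `__HiResImg__` branch above, in isolation: the site appears to keep a larger copy of each picture under the same name with a leading underscore, so after a failed `.gif` probe the `src` is re-pointed at the underscored `.jpg`. A sketch with a made-up snippet:

    import re

    new_html = '<img src="20120215/photo1.jpg">'   # hypothetical article markup
    for img in re.findall('src="?.*?jpg"', new_html):
        pos = img.rfind('/')
        newimg = img[0:pos+1] + '_' + img[pos+1:]  # src="20120215/_photo1.jpg"
        new_html = new_html.replace(img, newimg)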
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
@@ -447,38 +684,112 @@ class MPRecipe(BasicNewsRecipe):
             del item['absmiddle']
         return soup

|
+    def populate_article_metadata(self, article, soup, first):
+        # thumbnails shouldn't be available if using hi-res images
+        if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
+            img = soup.find('img')
+            if img is not None:
+                self.add_toc_thumbnail(article, img['src'])
+
+        try:
+            if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
+                # look for content
+                articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'class':'content'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div', attrs={'id':'font'})
+                if articlebodies:
+                    for articlebody in articlebodies:
+                        if articlebody:
+                            # the text may or may not be enclosed in <p></p> tag
+                            paras = articlebody.findAll('p')
+                            if not paras:
+                                paras = articlebody
+                            textFound = False
+                            for p in paras:
+                                if not textFound:
+                                    summary_candidate = self.tag_to_string(p).strip()
+                                    summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
+                                    if len(summary_candidate) > 0:
+                                        article.summary = article.text_summary = summary_candidate
+                                        textFound = True
+            else:
+                # display a simple text
+                #article.summary = article.text_summary = u'\u66f4\u591a......'
+                # display word counts
+                counts = 0
+                articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div',attrs={'class':'content'})
+                if not articlebodies:
+                    articlebodies = soup.findAll('div', attrs={'id':'font'})
+                if articlebodies:
+                    for articlebody in articlebodies:
+                        # the text may or may not be enclosed in <p></p> tag
+                        paras = articlebody.findAll('p')
+                        if not paras:
+                            paras = articlebody
+                        for p in paras:
+                            summary_candidate = self.tag_to_string(p).strip()
+                            counts += len(summary_candidate)
+                article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
+        except:
+            self.log("Error creating article descriptions")
+            return
+
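Editor's note: the fallback branch above publishes only a character count as the article description; the formatting it builds, in isolation (sample text hypothetical):

    paras = [u'first paragraph', u'second one']          # hypothetical extracted text
    counts = sum(len(p.strip()) for p in paras)
    summary = u'\uff08' + str(counts) + u'\u5b57\uff09'  # renders as （25字）, i.e. "(25 characters)"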
|
+    # override from the one in version 0.8.31
     def create_opf(self, feeds, dir=None):
         if dir is None:
             dir = self.output_dir
-        if __UseChineseTitle__ == True:
-            if __Region__ == 'Hong Kong':
-                title = u'\u660e\u5831 (\u9999\u6e2f)'
-            elif __Region__ == 'Vancouver':
-                title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
-            elif __Region__ == 'Toronto':
-                title = u'\u660e\u5831 (\u591a\u502b\u591a)'
-        else:
         title = self.short_title()
-        # if not generating a periodical, force date to apply in title
-        if __MakePeriodical__ == False:
+        # change 1: allow our own flag to tell if a periodical is to be generated
+        # also use custom date instead of current time
+        if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
             title = title + ' ' + self.get_fetchformatteddate()
-        if True:
-            mi = MetaInformation(title, [self.publisher])
-            mi.publisher = self.publisher
-            mi.author_sort = self.publisher
+        # end of change 1
+        # change 2: __appname__ replaced by newspaper publisher
+        __appname__ = self.publisher
+        mi = MetaInformation(title, [__appname__])
+        mi.publisher = __appname__
+        mi.author_sort = __appname__
+        # change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
         if __MakePeriodical__ == True:
             mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
         else:
             mi.publication_type = self.publication_type+':'+self.short_title()
-        #mi.timestamp = nowf()
-        mi.timestamp = self.get_dtlocal()
-        mi.comments = self.description
-        if not isinstance(mi.comments, unicode):
-            mi.comments = mi.comments.decode('utf-8', 'replace')
+        #mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        # change 4: in the following, all the nowf() are changed to adjusted time
+        # This one doesn't matter
+        mi.timestamp = nowf()
+        # change 5: skip listing the articles
+        #article_titles, aseen = [], set()
+        #for f in feeds:
+        #    for a in f:
+        #        if a.title and a.title not in aseen:
+        #            aseen.add(a.title)
+        #            article_titles.append(force_unicode(a.title, 'utf-8'))
+
+        #mi.comments = self.description
+        #if not isinstance(mi.comments, unicode):
+        #    mi.comments = mi.comments.decode('utf-8', 'replace')
+        #mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
+        #                '\n\n'.join(article_titles))
+
+        language = canonicalize_lang(self.language)
+        if language is not None:
+            mi.language = language
+        # This one affects the pub date shown in kindle title
         #mi.pubdate = nowf()
-        mi.pubdate = self.get_dtlocal()
+        # now appears to need the time field to be > 12.00noon as well
+        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
         opf_path = os.path.join(dir, 'index.opf')
         ncx_path = os.path.join(dir, 'index.ncx')

         opf = OPFCreator(dir, mi)
         # Add mastheadImage entry to <guide> section
         mp = getattr(self, 'masthead_path', None)
@@ -514,11 +825,13 @@ class MPRecipe(BasicNewsRecipe):
                 mani.id = 'ncx'
             if mani.path.endswith('mastheadImage.jpg'):
                 mani.id = 'masthead-image'

         entries = ['index.html']
         toc = TOC(base_path=dir)
         self.play_order_counter = 0
         self.play_order_map = {}
+
+
         def feed_index(num, parent):
             f = feeds[num]
             for j, a in enumerate(f):
@@ -532,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
                     desc = None
                 else:
                     desc = self.description_limiter(desc)
+                tt = a.toc_thumbnail if a.toc_thumbnail else None
                 entries.append('%sindex.html'%adir)
                 po = self.play_order_map.get(entries[-1], None)
                 if po is None:
                     self.play_order_counter += 1
                     po = self.play_order_counter
-                parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
-                    play_order=po, author=auth, description=desc)
+                parent.add_item('%sindex.html'%adir, None,
+                    a.title if a.title else _('Untitled Article'),
+                    play_order=po, author=auth,
+                    description=desc, toc_thumbnail=tt)
                 last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                 for sp in a.sub_pages:
                     prefix = os.path.commonprefix([opf_path, sp])
@@ -555,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
                 prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                 templ = self.navbar.generate(True, num, j, len(f),
                     not self.has_single_feed,
-                    a.orig_url, self.publisher, prefix=prefix,
+                    a.orig_url, __appname__, prefix=prefix,
                     center=self.center_navbar)
                 elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                 body.insert(len(body.contents), elem)
@@ -592,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
         with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
             opf.render(opf_file, ncx_file)

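Editor's note: the noon-anchored `mi.pubdate` built in `create_opf` above can be checked on its own (a sketch; the date parts stand in for the recipe's `get_fetchyear`/`get_fetchmonth`/`get_fetchday` helpers):

    import datetime

    year, month, day = '2012', '02', '15'   # hypothetical fetch-date parts
    pubdate = datetime.datetime(int(year), int(month), int(day), 12, 30, 0)
    # 12:30 is past noon, so the device shows the intended publication day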
|
76
recipes/money_pl.recipe
Normal file
@@ -0,0 +1,76 @@
|
import re
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class FocusRecipe(BasicNewsRecipe):
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__author__ = u'intromatyk <intromatyk@gmail.com>'
|
||||||
|
language = 'pl'
|
||||||
|
version = 1
|
||||||
|
|
||||||
|
title = u'Money.pl'
|
||||||
|
category = u'News'
|
||||||
|
description = u'Informacje finansowe z kraju i ze świata. Aktualne i archiwalne: notowania giełdowe, kursy walut, wskaźniki gospodarcze.'
|
||||||
|
remove_empty_feeds= True
|
||||||
|
no_stylesheets=True
|
||||||
|
oldest_article = 1
|
||||||
|
max_articles_per_feed = 100000
|
||||||
|
recursions = 0
|
||||||
|
|
||||||
|
no_stylesheets = True
|
||||||
|
remove_javascript = True
|
||||||
|
|
||||||
|
simultaneous_downloads = 2
|
||||||
|
|
||||||
|
r = re.compile('.*(?P<url>http:\/\/(www.money.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')
|
||||||
|
keep_only_tags =[]
|
||||||
|
keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'artykul'}))
|
||||||
|
remove_tags = [dict(name='ul', attrs={'class':'socialStuff'})]
|
||||||
|
|
||||||
|
extra_css = '''
|
||||||
|
body {font-family: Arial,Helvetica,sans-serif ;}
|
||||||
|
h1{text-align: left;}
|
||||||
|
h2{font-size: medium; font-weight: bold;}
|
||||||
|
p.lead {font-weight: bold; text-align: left;}
|
||||||
|
.authordate {font-size: small; color: #696969;}
|
||||||
|
.fot{font-size: x-small; color: #666666;}
|
||||||
|
'''
|
||||||
|
|
||||||
|
feeds = [
|
||||||
|
('Wiadomosci z kraju', 'http://money.pl.feedsportal.com/c/33900/f/612847/index.rss'),
|
||||||
|
('Wiadomosci ze swiata', 'http://money.pl.feedsportal.com/c/33900/f/612848/index.rss'),
|
||||||
|
('Gospodarka', 'http://money.pl.feedsportal.com/c/33900/f/612849/index.rss'),
|
||||||
|
('Waluty', 'http://money.pl.feedsportal.com/c/33900/f/612850/index.rss'),
|
||||||
|
('Gielda', 'http://money.pl.feedsportal.com/c/33900/f/612851/index.rss'),
|
||||||
|
('Banki', 'http://money.pl.feedsportal.com/c/33900/f/612852/index.rss'),
|
||||||
|
('Fundusze', 'http://money.pl.feedsportal.com/c/33900/f/612853/index.rss'),
|
||||||
|
('Emerytury', 'http://money.pl.feedsportal.com/c/33900/f/612854/index.rss'),
|
||||||
|
('Podatki', 'http://money.pl.feedsportal.com/c/33900/f/612855/index.rss'),
|
||||||
|
('Ubezpieczenia', 'http://money.pl.feedsportal.com/c/33900/f/612856/index.rss'),
|
||||||
|
('Poradniki', 'http://money.pl.feedsportal.com/c/33900/f/612857/index.rss'),
|
||||||
|
('Raporty', 'http://money.pl.feedsportal.com/c/33900/f/612858/index.rss'),
|
||||||
|
('Motoryzacja', 'http://money.pl.feedsportal.com/c/33900/f/612859/index.rss'),
|
||||||
|
('Manager', 'http://money.pl.feedsportal.com/c/33900/f/612860/index.rss'),
|
||||||
|
('Dla firm', 'http://money.pl.feedsportal.com/c/33900/f/612861/index.rss'),
|
||||||
|
('Prawo', 'http://money.pl.feedsportal.com/c/33900/f/612862/index.rss'),
|
||||||
|
('Nieruchomosci', 'http://money.pl.feedsportal.com/c/33900/f/612863/index.rss'),
|
||||||
|
('Praca', 'http://money.pl.feedsportal.com/c/33900/f/612864/index.rss'),
|
||||||
|
|
||||||
|
|
||||||
|
]
|
||||||
|
|
||||||
|
def print_version(self, url):
|
||||||
|
if url.count ('money.pl.feedsportal.com'):
|
||||||
|
u = url.find('0Cartykul0C')
|
||||||
|
u = 'http://www.m.money.pl/wiadomosci/artykul/' + url[u + 21:]
|
||||||
|
u = u.replace('0C', '/')
|
||||||
|
u = u.replace('A', '')
|
||||||
|
u = u.replace ('0E','-')
|
||||||
|
u = u.replace ('0P',';')
|
||||||
|
u = u.replace ('0H',',')
|
||||||
|
u = u.replace ('0B','.')
|
||||||
|
u = u.replace (',0,',',-1,')
|
||||||
|
u = u.replace('0Tutm0Isource0Frss0Gutm0Imedium0Frss0Gutm0Icampaign0Frss/story01.htm', '')
|
||||||
|
else:
|
||||||
|
u = url.replace('/nc/1','/do-druku/1')
|
||||||
|
return u
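Editor's note: the replacement chain above is the usual feedsportal URL decoding: two-character escapes stand for path punctuation ('0C' is '/', '0B' is '.', and the stray 'A's, once removed, turn '0A' back into a literal '0'). A sketch on a made-up fragment:

    fragment = 'podatki0Cpit0E20A120Bhtml'  # hypothetical encoded path
    u = fragment.replace('0C', '/')         # '0C' encodes '/'
    u = u.replace('A', '')                  # '0A' encodes a literal '0'
    u = u.replace('0E', '-')                # '0E' encodes '-'
    u = u.replace('0B', '.')                # '0B' encodes '.'
    # u == 'podatki/pit-2012.html'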
|
@@ -1,9 +1,7 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
-moneynews.newsmax.com
+www.moneynews.com
 '''

 from calibre.web.feeds.news import BasicNewsRecipe
@@ -12,40 +10,40 @@ class MoneyNews(BasicNewsRecipe):
     title = 'Moneynews.com'
     __author__ = 'Darko Miletic'
     description = 'Financial news worldwide'
-    publisher = 'moneynews.com'
+    publisher = 'Newsmax.com'
     language = 'en'

     category = 'news, finances, USA, business'
     oldest_article = 2
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
-    encoding = 'cp1252'
+    encoding = 'utf8'
+    extra_css = 'img{display: block} body{font-family: Arial, Helvetica, sans-serif}'

-    html2lrf_options = [
-                          '--comment', description
-                        , '--category', category
-                        , '--publisher', publisher
-                        , '--ignore-tables'
-                        ]
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
+    conversion_options = {
+                           'comment'          : description
+                         , 'tags'             : category
+                         , 'publisher'        : publisher
+                         , 'language'         : language
+                         , 'linearize_tables' : True
+                         }

     feeds = [
-              (u'Street Talk'          , u'http://moneynews.newsmax.com/xml/streettalk.xml'  )
-             ,(u'Finance News'         , u'http://moneynews.newsmax.com/xml/FinanceNews.xml' )
-             ,(u'Economy'              , u'http://moneynews.newsmax.com/xml/economy.xml'     )
-             ,(u'Companies'            , u'http://moneynews.newsmax.com/xml/companies.xml'   )
-             ,(u'Markets'              , u'http://moneynews.newsmax.com/xml/Markets.xml'     )
-             ,(u'Investing & Analysis' , u'http://moneynews.newsmax.com/xml/investing.xml'   )
+              (u'Street Talk'          , u'http://www.moneynews.com/rss/StreetTalk/8.xml'        )
+             ,(u'Finance News'         , u'http://www.moneynews.com/rss/FinanceNews/4.xml'       )
+             ,(u'Economy'              , u'http://www.moneynews.com/rss/Economy/2.xml'           )
+             ,(u'Companies'            , u'http://www.moneynews.com/rss/Companies/6.xml'         )
+             ,(u'Markets'              , u'http://www.moneynews.com/rss/Markets/7.xml'           )
+             ,(u'Investing & Analysis' , u'http://www.moneynews.com/rss/InvestingAnalysis/17.xml')
             ]

-    keep_only_tags = [dict(name='table', attrs={'class':'copy'})]
+    keep_only_tags = [dict(name='div', attrs={'class':'copy'})]

     remove_tags = [
-                    dict(name='td'   , attrs={'id':'article_fontsize'})
-                   ,dict(name='table', attrs={'id':'toolbox'         })
-                   ,dict(name='tr'   , attrs={'id':'noprint3'        })
+                    dict(attrs={'class':['MsoNormal', 'MsoNoSpacing']}),
+                    dict(name=['object','link','embed','form','meta'])
                   ]

+    def print_version(self, url):
+        nodeid = url.rpartition('/')[2]
+        return 'http://www.moneynews.com/PrintTemplate?nodeid=' + nodeid
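Editor's note: `rpartition('/')[2]` in the new `print_version` is simply "everything after the last slash"; e.g. (hypothetical article URL):

    url = 'http://www.moneynews.com/StreetTalk/some-story/id/429056'  # hypothetical
    nodeid = url.rpartition('/')[2]        # '429056'
    print_url = 'http://www.moneynews.com/PrintTemplate?nodeid=' + nodeid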
|
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-

 __license__ = 'GPL v3'

@@ -6,15 +7,72 @@ __license__ = 'GPL v3'
 www.canada.com
 '''

-from calibre.web.feeds.recipes import BasicNewsRecipe
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup


 class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following three lines for the Montreal Gazette
+    # un-comment the following four lines for the Victoria Times Colonist
+##    title = u'Victoria Times Colonist'
+##    url_prefix = 'http://www.timescolonist.com'
+##    description = u'News from Victoria, BC'
+##    fp_tag = 'CAN_TC'
+
+    # un-comment the following four lines for the Vancouver Province
+##    title = u'Vancouver Province'
+##    url_prefix = 'http://www.theprovince.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VP'
+
+    # un-comment the following four lines for the Vancouver Sun
+##    title = u'Vancouver Sun'
+##    url_prefix = 'http://www.vancouversun.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VS'
+
+    # un-comment the following four lines for the Edmonton Journal
+##    title = u'Edmonton Journal'
+##    url_prefix = 'http://www.edmontonjournal.com'
+##    description = u'News from Edmonton, AB'
+##    fp_tag = 'CAN_EJ'
+
+    # un-comment the following four lines for the Calgary Herald
+##    title = u'Calgary Herald'
+##    url_prefix = 'http://www.calgaryherald.com'
+##    description = u'News from Calgary, AB'
+##    fp_tag = 'CAN_CH'
+
+    # un-comment the following four lines for the Regina Leader-Post
+##    title = u'Regina Leader-Post'
+##    url_prefix = 'http://www.leaderpost.com'
+##    description = u'News from Regina, SK'
+##    fp_tag = ''
+
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
+##    title = u'Saskatoon Star-Phoenix'
+##    url_prefix = 'http://www.thestarphoenix.com'
+##    description = u'News from Saskatoon, SK'
+##    fp_tag = ''
+
+    # un-comment the following four lines for the Windsor Star
+##    title = u'Windsor Star'
+##    url_prefix = 'http://www.windsorstar.com'
+##    description = u'News from Windsor, ON'
+##    fp_tag = 'CAN_'
+
+    # un-comment the following four lines for the Ottawa Citizen
+##    title = u'Ottawa Citizen'
+##    url_prefix = 'http://www.ottawacitizen.com'
+##    description = u'News from Ottawa, ON'
+##    fp_tag = 'CAN_OC'
+
+    # un-comment the following four lines for the Montreal Gazette
     title = u'Montreal Gazette'
     url_prefix = 'http://www.montrealgazette.com'
     description = u'News from Montreal, QC'
+    fp_tag = 'CAN_MG'


     language = 'en_CA'
@@ -38,14 +96,81 @@ class CanWestPaper(BasicNewsRecipe):
                        dict(name='div', attrs={'class':'rule_grey_solid'}),
                        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

-    def preprocess_html(self,soup):
-        #delete iempty id attributes--they screw up the TOC for unknow reasons
-        divtags = soup.findAll('div',attrs={'id':''})
-        if divtags:
-            for div in divtags:
-                del(div['id'])
+    def get_cover_url(self):
+        from datetime import timedelta, date
+        if self.fp_tag=='':
+            return None
+        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
+        br = BasicNewsRecipe.get_browser()
+        daysback=1
+        try:
+            br.open(cover)
+        except:
+            while daysback<7:
+                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
+                br = BasicNewsRecipe.get_browser()
+                try:
+                    br.open(cover)
+                except:
+                    daysback = daysback+1
+                    continue
+                break
+        if daysback==7:
+            self.log("\nCover unavailable")
+            cover = None
+        return cover
+
+    def fixChars(self,string):
+        # Replace lsquo (\x91)
+        fixed = re.sub("\x91","‘",string)
+        # Replace rsquo (\x92)
+        fixed = re.sub("\x92","’",fixed)
+        # Replace ldquo (\x93)
+        fixed = re.sub("\x93","“",fixed)
+        # Replace rdquo (\x94)
+        fixed = re.sub("\x94","”",fixed)
+        # Replace ndash (\x96)
+        fixed = re.sub("\x96","–",fixed)
+        # Replace mdash (\x97)
+        fixed = re.sub("\x97","—",fixed)
+        fixed = re.sub("&#x2019;","’",fixed)
+        return fixed
+
+    def massageNCXText(self, description):
+        # Kindle TOC descriptions won't render certain characters
+        if description:
+            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+            # Replace '&amp;' with '&'
+            massaged = re.sub("&amp;","&", massaged)
+            return self.fixChars(massaged)
+        else:
+            return description
+
+    def populate_article_metadata(self, article, soup, first):
+        if first:
+            picdiv = soup.find('body').find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+            xtitle = article.text_summary.strip()
+            if len(xtitle) == 0:
+                desc = soup.find('meta',attrs={'property':'og:description'})
+                if desc is not None:
+                    article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
         return soup

+    def preprocess_html(self, soup):
+        return self.strip_anchors(soup)
+
+
     def parse_index(self):
         soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
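Editor's note: the `\x91`-`\x97` bytes handled by `fixChars` above are cp1252 "smart punctuation" leaking through as raw control characters; the same mapping written as a table (an equivalent sketch, not the recipe's code):

    CP1252_PUNCT = {
        u'\x91': u'\u2018', u'\x92': u'\u2019',   # single quotes
        u'\x93': u'\u201c', u'\x94': u'\u201d',   # double quotes
        u'\x96': u'\u2013', u'\x97': u'\u2014',   # en and em dash
    }

    def fix_chars(text):
        for bad, good in CP1252_PUNCT.items():
            text = text.replace(bad, good)
        return text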
|
59
recipes/mumbai_mirror.recipe
Normal file
@@ -0,0 +1,59 @@
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class MumbaiMirror(BasicNewsRecipe):
|
||||||
|
title = u'Mumbai Mirror'
|
||||||
|
oldest_article = 2
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
__author__ = 'Krittika Goyal'
|
||||||
|
|
||||||
|
description = 'People Daily Newspaper'
|
||||||
|
language = 'en_IN'
|
||||||
|
category = 'News, Mumbai, India'
|
||||||
|
remove_javascript = True
|
||||||
|
use_embedded_content = False
|
||||||
|
auto_cleanup = True
|
||||||
|
no_stylesheets = True
|
||||||
|
#encoding = 'GB2312'
|
||||||
|
conversion_options = {'linearize_tables':True}
|
||||||
|
|
||||||
|
|
||||||
|
feeds = [
|
||||||
|
('Cover Story',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=latest'),
|
||||||
|
('City Diary',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=citydiary'),
|
||||||
|
('Columnists',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=mmcolumnists'),
|
||||||
|
('Mumbai, The City',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=city'),
|
||||||
|
('Nation',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=nation'),
|
||||||
|
('Top Stories',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=topstories'),
|
||||||
|
('Business',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=business'),
|
||||||
|
('World',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=world'),
|
||||||
|
(' Chai Time',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=chaitime'),
|
||||||
|
('Technology',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=technology'),
|
||||||
|
('Entertainment',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=entertainment'),
|
||||||
|
('Style',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=style'),
|
||||||
|
('Ask the Sexpert',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=askthesexpert'),
|
||||||
|
('Television',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=television'),
|
||||||
|
('Lifestyle',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=lifestyle'),
|
||||||
|
('Sports',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=sports'),
|
||||||
|
('Travel: Travelers Diary',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=travellersdiaries'),
|
||||||
|
('Travel: Domestic',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=traveldomestic'),
|
||||||
|
('Travel: International',
|
||||||
|
'http://www.mumbaimirror.com/rssfeeds.aspx?feed=travelinternational')
|
||||||
|
]
|
141
recipes/mwjournal.recipe
Normal file
@@ -0,0 +1,141 @@
|
#!/usr/bin/env python
|
||||||
|
##
|
||||||
|
## Title: Microwave Journal
|
||||||
|
## Contact: Kiavash (use Mobile Read)
|
||||||
|
##
|
||||||
|
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
||||||
|
## Copyright: Kiavash
|
||||||
|
##
|
||||||
|
## Written: Jan 2012
|
||||||
|
## Last Edited: Feb 2012
|
||||||
|
##
|
||||||
|
|
||||||
|
# Feb 2012: New Recipe compatible with the MWJournal 2.0 website
|
||||||
|
|
||||||
|
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
||||||
|
__copyright__ = 'Kiavash'
|
||||||
|
__author__ = 'Kiavash'
|
||||||
|
|
||||||
|
'''
|
||||||
|
microwavejournal.com
|
||||||
|
'''
|
||||||
|
|
||||||
|
import re
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from calibre.utils.magick import Image
|
||||||
|
|
||||||
|
class MWJournal(BasicNewsRecipe):
|
||||||
|
|
||||||
|
title = u'Microwave Journal'
|
||||||
|
description = u'Microwave Journal Monthly Magazine'
|
||||||
|
publisher = 'Horizon House'
|
||||||
|
publication_type = 'magazine'
|
||||||
|
INDEX = 'http://www.microwavejournal.com/publications/'
|
||||||
|
|
||||||
|
language = 'en'
|
||||||
|
timeout = 30
|
||||||
|
|
||||||
|
Convert_Grayscale = False # Convert images to gray scale or not
|
||||||
|
|
||||||
|
keep_only_tags = [dict(name='div', attrs={'class':'record'})]
|
||||||
|
no_stylesheets = True
|
||||||
|
remove_javascript = True
|
||||||
|
remove_tags = [
|
||||||
|
dict(name='font', attrs={'class':'footer'}), # remove fonts
|
||||||
|
]
|
||||||
|
|
||||||
|
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
|
||||||
|
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
|
||||||
|
|
||||||
|
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
|
||||||
|
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
|
||||||
|
.introduction, .first { font-weight: bold; } \
|
||||||
|
.cross-head { font-weight: bold; font-size: 125%; } \
|
||||||
|
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
|
||||||
|
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
|
||||||
|
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
|
||||||
|
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
|
||||||
|
font-size: 80%; font-style: italic; margin: 1px auto; } \
|
||||||
|
.story-date, .published { font-size: 80%; } \
|
||||||
|
table { width: 100%; } \
|
||||||
|
td img { display: block; margin: 5px auto; } \
|
||||||
|
ul { padding-top: 10px; } \
|
||||||
|
ol { padding-top: 10px; } \
|
||||||
|
li { padding-top: 5px; padding-bottom: 5px; } \
|
||||||
|
h1 { font-size: 175%; font-weight: bold; } \
|
||||||
|
h2 { font-size: 150%; font-weight: bold; } \
|
||||||
|
h3 { font-size: 125%; font-weight: bold; } \
|
||||||
|
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
|
||||||
|
|
||||||
|
# Remove the line breaks, href links and float left/right and picture width/height.
|
||||||
|
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
|
||||||
|
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
|
||||||
|
(re.compile(r'<a.*?>'), lambda h1: ''),
|
||||||
|
(re.compile(r'</a>'), lambda h2: ''),
|
||||||
|
(re.compile(r'float:.*?'), lambda h3: ''),
|
||||||
|
(re.compile(r'width:.*?px'), lambda h4: ''),
|
||||||
|
(re.compile(r'height:.*?px'), lambda h5: '')
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def print_version(self, url):
|
||||||
|
return url.replace('/articles/', '/articles/print/')
|
||||||
|
|
||||||
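Editor's note: the print URL mapping above is a single substring swap on the article path; e.g. (hypothetical URL):

    url = 'http://www.microwavejournal.com/articles/16123-some-article'  # hypothetical
    print_url = url.replace('/articles/', '/articles/print/')
    # 'http://www.microwavejournal.com/articles/print/16123-some-article'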
|
def parse_index(self):
|
||||||
|
articles = []
|
||||||
|
|
||||||
|
soup = self.index_to_soup(self.INDEX)
|
||||||
|
ts = soup.find('div', attrs={'class':'box1 article publications-show'})
|
||||||
|
ds = self.tag_to_string(ts.find('h2'))
|
||||||
|
self.log('Found Current Issue:', ds)
|
||||||
|
self.timefmt = ' [%s]'%ds
|
||||||
|
|
||||||
|
cover = ts.find('img', src=True)
|
||||||
|
if cover is not None:
|
||||||
|
self.cover_url = 'http://www.microwavejournal.com' + cover['src']
|
||||||
|
self.log('Found Cover image:', self.cover_url)
|
||||||
|
|
||||||
|
feeds = []
|
||||||
|
seen_titles = set([]) # This is used to remove duplicate articles
|
||||||
|
sections = soup.find('div', attrs={'class':'box2 publication'})
|
||||||
|
for section in sections.findAll('div', attrs={'class':'records'}):
|
||||||
|
section_title = self.tag_to_string(section.find('h3'))
|
||||||
|
self.log('Found section:', section_title)
|
||||||
|
articles = []
|
||||||
|
for post in section.findAll('div', attrs={'class':'record'}):
|
||||||
|
h = post.find('h2')
|
||||||
|
title = self.tag_to_string(h)
|
||||||
|
if title.find('The MWJ Puzzler') >=0: #Let's get rid of the useless Puzzler!
|
||||||
|
continue
|
||||||
|
if title in seen_titles:
|
||||||
|
continue
|
||||||
|
seen_titles.add(title)
|
||||||
|
a = post.find('a', href=True)
|
||||||
|
url = a['href']
|
||||||
|
if url.startswith('/'):
|
||||||
|
url = 'http://www.microwavejournal.com'+url
|
||||||
|
abstract = post.find('div', attrs={'class':'abstract'})
|
||||||
|
p = abstract.find('p')
|
||||||
|
desc = None
|
||||||
|
self.log('\tFound article:', title, 'at', url)
|
||||||
|
if p is not None:
|
||||||
|
desc = self.tag_to_string(p)
|
||||||
|
self.log('\t\t', desc)
|
||||||
|
articles.append({'title':title, 'url':url, 'description':desc,
|
||||||
|
'date':self.timefmt})
|
||||||
|
if articles:
|
||||||
|
feeds.append((section_title, articles))
|
||||||
|
return feeds
|
||||||
|
|
||||||
|
def postprocess_html(self, soup, first):
|
||||||
|
if self.Convert_Grayscale:
|
||||||
|
#process all the images
|
||||||
|
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
|
||||||
|
iurl = tag['src']
|
||||||
|
img = Image()
|
||||||
|
img.open(iurl)
|
||||||
|
if img < 0:
|
||||||
|
raise RuntimeError('Out of memory')
|
||||||
|
img.type = "GrayscaleType"
|
||||||
|
img.save(iurl)
|
||||||
|
return soup
|
@@ -1,16 +1,35 @@
-__license__ = 'GPL v3'
-__copyright__ = '2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
+##
+## Title: New Scientist RSS recipe
+## Contact: AprilHare, Darko Miletic <darko.miletic at gmail.com>
+##
+## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
+## Copyright: 2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>
+##
+## Written: 2008
+## Last Edited: Jan 2012
+##
+
+'''
+01-19-2012: Added GrayScale Image conversion and duplicate article removal
+'''
+
+__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
+__copyright__ = '2008-2012, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
+__version__ = 'v0.5.0'
+__date__ = '2012-01-19'
+__author__ = 'Darko Miletic'

 '''
 newscientist.com
 '''

 import re
 import urllib
+from calibre.utils.magick import Image
 from calibre.web.feeds.news import BasicNewsRecipe

 class NewScientist(BasicNewsRecipe):
     title = 'New Scientist - Online News w. subscription'
-    __author__ = 'Darko Miletic'
     description = 'Science news and science articles from New Scientist.'
     language = 'en'
     publisher = 'Reed Business Information Ltd.'
@@ -39,6 +58,15 @@ class NewScientist(BasicNewsRecipe):

     keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]

+    # Whether to omit duplicates of articles (typically arising when articles are indexed in
+    # more than one section). If True, only the first occurrence will be downloaded.
+    filterDuplicates = True
+
+    # Whether to convert images to grayscale for eInk readers.
+    Convert_Grayscale = False
+
+    url_list = []  # This list is used to check if an article had already been included.
+
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         br.open('http://www.newscientist.com/')
@@ -80,6 +108,10 @@ class NewScientist(BasicNewsRecipe):
         return article.get('guid', None)

     def print_version(self, url):
+        if self.filterDuplicates:
+            if url in self.url_list:
+                return
+            self.url_list.append(url)
         return url + '?full=true&print=true'

     def preprocess_html(self, soup):
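Editor's note: the added branch relies on `print_version` returning `None` for an already-seen URL, which makes the downloader skip that copy of the article; the mechanism in isolation:

    url_list = []

    def dedupe_print_version(url):
        if url in url_list:
            return None                        # a None URL drops the duplicate
        url_list.append(url)
        return url + '?full=true&print=true'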
@@ -101,3 +133,16 @@ class NewScientist(BasicNewsRecipe):
             tg.replaceWith(tstr)
         return soup

+    # Converts images to Gray Scale
+    def postprocess_html(self, soup, first):
+        if self.Convert_Grayscale:
+            # process all the images
+            for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+                iurl = tag['src']
+                img = Image()
+                img.open(iurl)
+                if img < 0:
+                    raise RuntimeError('Out of memory')
+                img.type = "GrayscaleType"
+                img.save(iurl)
+        return soup
@@ -1,14 +1,25 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

-__license__ = 'GPL v3'
+##
+## Title: New Journal of Physics
+## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
+## Copyright: Chema Cort\xe9s
+##
+## Written: Jan 2011
+## Last Edited: Jan 2012 - by Kiavash
+##
+
+__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
 __copyright__ = u'Chema Cort\xe9s - 2011-01-05'
-__version__ = 'v0.01'
-__date__ = '2011-01-05'
+__version__ = 'v0.5.0'
+__date__ = '2012-01-13'

 '''
 njp.org
 '''

+import re  # Import the regular expressions module.
 from calibre.web.feeds.news import BasicNewsRecipe

 class NewJournalOfPhysics(BasicNewsRecipe):
@ -19,14 +30,60 @@ class NewJournalOfPhysics(BasicNewsRecipe):
|
|||||||
category = 'physics, journal, science'
|
category = 'physics, journal, science'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
|
|
||||||
oldest_article = 30
|
|
||||||
max_articles_per_feed = 100
|
|
||||||
|
|
||||||
keep_only_tags = [dict(id=['fulltextContainer'])]
|
|
||||||
no_stylesheets=True
|
|
||||||
use_embedded_content=False
|
|
||||||
|
|
||||||
feeds = [(u'Latest Papers', u'http://iopscience.iop.org/1367-2630/?rss=1')]
|
feeds = [(u'Latest Papers', u'http://iopscience.iop.org/1367-2630/?rss=1')]
|
||||||
|
|
||||||
|
cover_url = 'http://images.iop.org/journals_icons/Info/1367-2630/cover.gif'
|
||||||
|
|
||||||
|
oldest_article = 7
|
||||||
|
max_articles_per_feed = 30
|
||||||
|
timeout = 30
|
||||||
|
|
||||||
|
no_stylesheets = True
|
||||||
|
use_embedded_content = False
|
||||||
|
remove_javascript = True
|
||||||
|
remove_empty_feeds = True
|
||||||
|
asciiize = True # Converts all none ascii characters to their ascii equivalents
|
||||||
|
|
||||||
|
keep_only_tags = [
|
||||||
|
dict(id=['articleEvoContainer']),
|
||||||
|
]
|
||||||
|
|
||||||
|
remove_tags = [
|
||||||
|
dict(name='div', attrs={'class':'affiliations'}), # Removes Shoow Affiliations
|
||||||
|
dict(name='div', attrs={'class':'abst-icon-links'}), # Removes Tags and PDF export
|
||||||
|
dict(name='p', attrs={'class':'studyimage'}), # remove Studay image
|
||||||
|
dict(name='a', attrs={'class':'icon powerpoint'}), # remove Export to PowerPoint Slide
|
||||||
|
dict(name='a', attrs={'title':'CrossRef'}), # remove CrossRef icon
|
||||||
|
dict(name='a', attrs={'title':'PubMed'}), # remove PubMed icon
|
||||||
|
dict(name='a', attrs={'e4f5426941':'true'}), # remove cross ref image
|
||||||
|
dict(name='img', attrs={'src':''}), # remove empty image
|
||||||
|
dict(name='a', attrs={'class':'closeChap'}), # remove 'Close'
|
||||||
|
dict(name='ul', attrs={'class':'breadcrumbs'}), # remove Top breadcrumbs
|
||||||
|
]
|
||||||
|
|
||||||
|
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
|
||||||
|
.introduction, .first { font-weight: bold; } \
|
||||||
|
.cross-head { font-weight: bold; font-size: 125%; } \
|
||||||
|
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
|
||||||
|
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
|
||||||
|
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
|
||||||
|
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
|
||||||
|
font-size: 80%; font-style: italic; margin: 1px auto; } \
|
||||||
|
.story-date, .published { font-size: 80%; } \
|
||||||
|
table { width: 100%; } \
|
||||||
|
td img { display: block; margin: 5px auto; } \
|
||||||
|
ul { padding-top: 10px; } \
|
||||||
|
ol { padding-top: 10px; } \
|
||||||
|
li { padding-top: 5px; padding-bottom: 5px; } \
|
||||||
|
h1 { font-size: 175%; font-weight: bold; } \
|
||||||
|
h2 { font-size: 150%; font-weight: bold; } \
|
||||||
|
h3 { font-size: 125%; font-weight: bold; } \
|
||||||
|
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
|
||||||
|
|
||||||
|
# Remove the line breaks.
|
||||||
|
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
|
||||||
|
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
|
||||||
|
]
|
||||||
|
|
||||||
def print_version(self, url):
|
def print_version(self, url):
|
||||||
return url+"/fulltext"
|
return url+"/article"
|
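How a preprocess_regexps entry like the one added above behaves, as a standalone sketch (feed name illustrative): each (pattern, callback) pair is applied to the raw HTML before parsing, and the callback's return value replaces every match.

import re
from calibre.web.feeds.news import BasicNewsRecipe

class StripBreaksSketch(BasicNewsRecipe):
    title = 'Strip breaks sketch'
    feeds = [('Example', 'http://example.com/rss.xml')]

    # Drop plain and "clear" <br> variants from the raw page source.
    preprocess_regexps = [
        (re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
        (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
    ]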
54
recipes/nol.recipe
Normal file
@ -0,0 +1,54 @@
################################################################################
#Description: http://nol.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2011.12.18. - V1.1
################################################################################

from calibre.web.feeds.recipes import BasicNewsRecipe

class NOL(BasicNewsRecipe):
    title = u'NOL'
    __author__ = 'Bigpapa'
    oldest_article = 5
    max_articles_per_feed = 5  # Maximum number of articles to keep per feed in the e-book.
    no_stylesheets = True
    #delay = 1
    use_embedded_content = False
    encoding = 'utf8'
    language = 'hu'
    publication_type = 'newsportal'

    conversion_options = {
        'linearize_tables' : True,
    }

    keep_only_tags = [
        dict(name='table', attrs={'class':['article-box']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['h','ad-container-outer','tags noborder','ad-container-inner','image-container-lead','tags','related-container']}),
        dict(name='h4'),
        dict(name='tfoot'),
        dict(name='td', attrs={'class':['foot']}),
        dict(name='span', attrs={'class':['image-container-caption']}),
    ]

    feeds = [
        # (u'V\xe1logat\xe1s', 'http://nol.hu/feed/valogatas.rss'),
        (u'Belf\xf6ld', 'http://nol.hu/feed/belfold.rss'),
        (u'K\xfclf\xf6ld', 'http://nol.hu/feed/kulfold.rss'),
        (u'Gazdas\xe1g', 'http://nol.hu/feed/gazdasag.rss'),
        (u'V\xe9lem\xe9ny', 'http://nol.hu/feed/velemeny.rss'),
        (u'Kult\xfara', 'http://nol.hu/feed/kult.rss'),
        (u'Tud/Tech', 'http://nol.hu/feed/tud-tech.rss'),
        (u'Sport', 'http://nol.hu/feed/sport.rss'),
        (u'Noller', 'http://nol.hu/feed/noller.rss'),
        (u'Mozaik', 'http://nol.hu/feed/mozaik.rss'),
        (u'Utaz\xe1s', 'http://nol.hu/feed/utazas.rss'),
        (u'Aut\xf3', 'http://nol.hu/feed/auto.rss'),
        (u'Voks', 'http://nol.hu/feed/voks.rss'),
    ]
100
recipes/novilist_novine_hr.recipe
Normal file
@ -0,0 +1,100 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
novine.novilist.hr
'''

import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe

class NoviList_hr(BasicNewsRecipe):
    title = 'Novi List'
    __author__ = 'Darko Miletic'
    description = 'Vijesti iz Hrvatske'
    publisher = 'NOVI LIST d.d.'
    category = 'Novi list, politika, hrvatski dnevnik, Novine, Hrvatska, Croatia, News, newspaper, Hrvatski,Primorje, dnevni list, Rijeka'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1250'
    use_embedded_content = False
    language = 'hr'
    remove_empty_feeds = True
    publication_type = 'newspaper'
    needs_subscription = True
    masthead_url = 'http://novine.novilist.hr/images/system/novilist-logo.jpg'
    index = 'http://novine.novilist.hr/'
    extra_css = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Geneva,Arial,Helvetica,Swiss,sans1,sans-serif }
        img{display:block; margin-bottom: 0.4em; margin-top: 0.4em}
        .nadnaslov,.podnaslov{font-size: small; display: block; margin-bottom: 1em}
        .naslov{font-size: x-large; color: maroon; font-weight: bold; display: block; margin-bottom: 1em;}
        p{display: block}
    """

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment'           : description,
        'tags'              : category,
        'publisher'         : publisher,
        'language'          : language,
        'linearize_tables'  : True
    }

    keep_only_tags = [
        dict(name='td', attrs={'class':['nadnaslov', 'naslov', 'podnaslov']}),
        dict(name='font', attrs={'face':'Geneva,Arial,Helvetica,Swiss'})
    ]

    remove_tags = [dict(name=['meta', 'link', 'iframe', 'embed', 'object'])]
    remove_attributes = ['border', 'lang', 'size', 'face', 'bgcolor']

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open(self.index + 'loginnow.asp')
            br.select_form(nr=0)
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        articles = []
        count = 0
        soup = self.index_to_soup(self.index)
        #cover url
        for alink in soup.findAll('a'):
            if alink['href'].startswith('images/clanci/DOC_'):
                self.cover_url = self.index + alink['href']
        #feeds
        for item in soup.findAll('td', attrs={'class':'tocrubrika'}):
            count = count + 1
            if self.test and count > 2:
                return articles
            aitem = item.a
            section = self.tag_to_string(aitem)
            feedlink = self.index + aitem['href']
            feedpage = self.index_to_soup(feedlink)
            self.report_progress(0, _('Fetching feed') + ' %s...' % (section))
            inarts = []
            for alink in feedpage.findAll('a', attrs={'class':'naslovlinkdesno'}):
                url = self.index + alink['href']
                inarts.append({
                    'title'       : self.tag_to_string(alink),
                    'date'        : strftime(self.timefmt),
                    'url'         : url,
                    'description' : ''
                })
            if self.remove_empty_feeds:
                if inarts:
                    articles.append((section, inarts))
            else:
                articles.append((section, inarts))
        return articles

    def print_version(self, url):
        return url.replace('?WCI=Rubrike&', '?WCI=Pretrazivac&')
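The subscription handshake used by this recipe, reduced to a reusable sketch. The login URL and field names below are illustrative; real sites differ, and the recipe above shows the concrete values for novine.novilist.hr:

from calibre.web.feeds.news import BasicNewsRecipe

class LoginSketch(BasicNewsRecipe):
    title = 'Login sketch'
    needs_subscription = True
    feeds = [('Example', 'http://example.com/rss.xml')]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        # Credentials are only set when the user configured the recipe
        # as a subscription recipe in calibre.
        if self.username is not None and self.password is not None:
            br.open('http://example.com/login')
            br.select_form(nr=0)          # first form on the page
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br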
49
recipes/novilist_portal_hr.recipe
Normal file
@ -0,0 +1,49 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.novilist.hr
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class NoviList_Portal_hr(BasicNewsRecipe):
    title = 'Novi List - online portal'
    __author__ = 'Darko Miletic'
    description = 'Portal Novog Lista'
    publisher = 'NOVI LIST d.d.'
    category = 'Novi list, politika, hrvatski dnevnik, Novine, Hrvatska, Croatia, News, newspaper, Hrvatski,Primorje, dnevni list, Rijeka'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'hr'
    publication_type = 'newsportal'
    masthead_url = 'http://www.novilist.hr/design/novilist/images/logo-print.gif'
    extra_css = """
        body{font-family: Geneva,Arial,Helvetica,Swiss,sans-serif }
        h1{font-family: Georgia,serif}
        img{display:block; margin-bottom: 0.4em; margin-top: 0.4em}
    """

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment'           : description,
        'tags'              : category,
        'publisher'         : publisher,
        'language'          : language,
        'linearize_tables'  : True
    }

    keep_only_tags = [dict(name='div', attrs={'id':'content'})]

    remove_tags = [dict(name=['meta', 'link', 'iframe', 'embed', 'object'])]
    remove_attributes = ['border', 'lang']

    feeds = [(u'Vijesti', u'http://www.novilist.hr/rss/feed/sve.xml')]

    def print_version(self, url):
        return url.replace('http://www.novilist.hr/', 'http://www.novilist.hr/layout/set/print/')
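The print trick this recipe relies on, generalized: many CMS-driven sites expose a cleaner printer layout behind a URL rewrite, and returning the rewritten URL from print_version() makes calibre fetch that page instead. The /layout/set/print/ prefix is taken from the recipe itself (an eZ Publish convention); the class wrapper is illustrative:

from calibre.web.feeds.news import BasicNewsRecipe

class PrintLayoutSketch(BasicNewsRecipe):
    title = 'Print layout sketch'
    feeds = [(u'Vijesti', u'http://www.novilist.hr/rss/feed/sve.xml')]

    def print_version(self, url):
        # Rewrite the article URL to the site's print layout.
        return url.replace('http://www.novilist.hr/',
                           'http://www.novilist.hr/layout/set/print/')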
26
recipes/novinite_bg.recipe
Normal file
@ -0,0 +1,26 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1329123365(BasicNewsRecipe):
    title = u'Novinite.bg'
    __author__ = 'M3 Web'
    description = 'Real time provider of the latest news from Bulgaria and the world'
    category = 'Business, Politics, Society, Sports, Crime, Lifestyle, World, Health'
    oldest_article = 7
    max_articles_per_feed = 6
    language = 'bg'
    encoding = 'windows-1251'
    no_stylesheets = True
    remove_javascript = True
    keep_only_tags = [dict(name='div', attrs={'id':'content'})]
    # Both divs in one list; assigning remove_tags twice would keep only the second list.
    remove_tags = [dict(name='div', attrs={'id':'text_options'}),
                   dict(name='div', attrs={'id':'social_shares_top'})]
    remove_tags_after = dict(id='textsize')
    feeds = [(u'Business', u'http://novinite.bg/rss.php?category_id=1'),
             (u'Politics', u'http://novinite.bg/rss.php?category_id=2'),
             (u'Society', u'http://novinite.bg/rss.php?category_id=3'),
             (u'Sport', u'http://novinite.bg/rss.php?category_id=4'),
             (u'Crime', u'http://novinite.bg/rss.php?category_id=5'),
             (u'Lifestyle', u'http://novinite.bg/rss.php?category_id=6'),
             (u'Health', u'http://novinite.bg/rss.php?category_id=7'),
             (u'Other', u'http://novinite.bg/rss.php?category_id=10'),
             (u'World', u'http://novinite.bg/rss.php?category_id=9')]
@ -325,7 +325,8 @@ class NYTimes(BasicNewsRecipe):
        '''
        def get_the_soup(docEncoding, url_or_raw, raw=False) :
            if re.match(r'\w+://', url_or_raw):
-                f = self.browser.open(url_or_raw)
+                br = self.clone_browser(self.browser)
+                f = br.open_novisit(url_or_raw)
                _raw = f.read()
                f.close()
                if not _raw:
@ -364,7 +364,8 @@ class NYTimes(BasicNewsRecipe):
        '''
        def get_the_soup(docEncoding, url_or_raw, raw=False) :
            if re.match(r'\w+://', url_or_raw):
-                f = self.browser.open(url_or_raw)
+                br = self.clone_browser(self.browser)
+                f = br.open_novisit(url_or_raw)
                _raw = f.read()
                f.close()
                if not _raw:
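The browser change above, restated as a reduced sketch; the helper name and the rationale in the comments are mine, not part of the diff. Cloning the recipe's browser gives each fetch its own handle with the same cookies, and open_novisit fetches without updating the shared browser's history:

def fetch_raw(recipe, url):
    # Independent copy of the recipe's mechanize browser (same cookies and
    # headers), so this fetch does not disturb the shared instance.
    br = recipe.clone_browser(recipe.browser)
    f = br.open_novisit(url)  # open without recording the page as visited
    try:
        return f.read()
    finally:
        f.close()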
21
recipes/onda_rock.recipe
Normal file
@ -0,0 +1,21 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1328535130(BasicNewsRecipe):
    title = u'Onda Rock'
    __author__ = 'faber1971'
    description = 'Italian rock webzine'
    language = 'it'

    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = False
    remove_tags = [
        dict(name='div', attrs={'id':['boxHeader','boxlinks_med','footer','boxinterviste','box_special_med','boxdiscografia_head','path']}),
        dict(name='div', attrs={'align':'left'}),
        dict(name='div', attrs={'style':'text-align: center'}),
    ]
    no_stylesheets = True
    feeds = [(u'Onda Rock', u'http://www.ondarock.it/feed.php')]
    masthead_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/71135_45820579767_4993043_n.jpg'
77
recipes/opinion_bo.recipe
Normal file
@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2011, Piet van Oostrum <piet@vanoostrum.org>'
'''
www.opinion.com.bo
'''

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe

class Opinion_Bol(BasicNewsRecipe):
    title = u'Opinión - Bolivia'
    __author__ = 'Piet van Oostrum'
    description = u'Opinión diario de circulación nacional, Cochabamba, Bolivia'
    publisher = 'Coboce Ltda - Editora Opinión'
    category = 'news, politics, Bolivia'
    version = 1
    oldest_article = 1
    max_articles_per_feed = 20
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'es_BO'
    publication_type = 'newspaper'
    delay = 1
    remove_empty_feeds = True

    cover_url = strftime('http://www.opinion.com.bo/opinion/articulos/%Y/%m%d/fotos/portada_650.jpg')
    masthead_url = 'http://opinion.com.bo/opinion/articulos/imagenes/logo_opinion.gif'
    extra_css = """body{font-family: Helvetica,Arial,sans-serif}
                   .seccion_encabezado_nota_inte{font-size: 1.1em;
                   font-weight: bold;}
                   .autor_nota_inte{color: #999999; font-size: 0.8em;
                   margin-bottom: 0.5em; text-align: right;}
                   .pie{font-size: 0.8em;}"""

    conversion_options = {
        'comment'   : description,
        'tags'      : category,
        'publisher' : publisher,
        'language'  : language
    }

    keep_only_tags = [dict(name='div', attrs={'class':'columna_izq_nota_intererior'})]

    remove_tags = [dict(name=['meta','link','form','iframe','embed','object','style']),
                   dict(name='div', attrs={'class':'ocultar'})]
    remove_attributes = ['width','height']

    feeds = [
        (u'El País'               , u'http://www.opinion.com.bo/opinion/rss/el_pais_rss.xml')
        ,(u'Cochabamba'           , u'http://www.opinion.com.bo/opinion/rss/cochabamba_rss.xml')
        ,(u'Economía'             , u'http://www.opinion.com.bo/opinion/rss/economia_rss.xml')
        ,(u'Cultura'              , u'http://www.opinion.com.bo/opinion/rss/cultura_rss.xml')
        ,(u'Mundo'                , u'http://www.opinion.com.bo/opinion/rss/mundo_rss.xml')
        ,(u'Ciencia y Tecnología' , u'http://www.opinion.com.bo/opinion/rss/ciencia_tecnologia_rss.xml')
        ,(u'Policial'             , u'http://www.opinion.com.bo/opinion/rss/policial_rss.xml')
        ,(u'Editorial'            , u'http://www.opinion.com.bo/opinion/rss/editorial_rss.xml')
        ,(u'Subeditorial'         , u'http://www.opinion.com.bo/opinion/rss/subeditorial_rss.xml')
        ,(u'Opinión'              , u'http://www.opinion.com.bo/opinion/rss/opinion_rss.xml')
        ,(u'Deportes'             , u'http://www.opinion.com.bo/opinion/rss/deportes_rss.xml')
        ,(u'Vida de hoy'          , u'http://www.opinion.com.bo/opinion/rss/vidadehoy_rss.xml')
    ]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup

    # Keep only today's articles
    # (maybe should take the timezone into account)

    today = strftime('/%Y/%m%d/')

    def get_article_url(self, article):
        link = article.link
        if self.today in link:
            return link
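The date filter above in one self-contained piece; the class name and feed are illustrative, while the URL fragment convention comes from the recipe. get_article_url() returning None drops an article, so only links containing today's /YYYY/MMDD/ segment survive:

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe

class TodayOnlySketch(BasicNewsRecipe):
    title = 'Today-only sketch'
    feeds = [('Example', 'http://example.com/rss.xml')]

    # Built once at class-definition time, e.g. '/2012/0113/'.
    today = strftime('/%Y/%m%d/')

    def get_article_url(self, article):
        link = article.link
        if self.today in link:
            return link
        # Fall through: returning None skips articles from other days.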
197
recipes/oreilly_premium.recipe
Normal file
@ -0,0 +1,197 @@
# Talking Points is not grabbing everything.
# The look is right, but only the last one added?
import re
import time
from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
# strip ads and graphics
# Current Column lacks a title.
# Talking Points Memo - shorten title - remove year and Bill's name
# The newsletter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12

class OReillyPremium(BasicNewsRecipe):
    title = u'OReilly Premium'
    __author__ = 'TMcN'
    language = 'en'
    description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
    cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
    auto_cleanup = True
    encoding = 'utf8'
    needs_subscription = True
    no_stylesheets = True
    oldest_article = 20
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't go down
    recursions = 0
    max_articles_per_feed = 2000

    debugMessages = True

    # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
    catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
                ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
                ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
                ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
                ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
                ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
              ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
            br.select_form(name='login')
            br['formEmailField'] = self.username
            br['formPasswordField'] = self.password
            br.submit()
        return br

    # Returns the best-guess print url.
    # The second parameter (pageURL) is returned if nothing is found.
    def extractPrintURL(self, baseURL, pageURL, printString):
        tagURL = pageURL
        printText = None  # guard: the soup lookup below may fail
        soup = self.index_to_soup(pageURL)
        if soup :
            printText = soup.find('a', text=printString)
        else :
            print("Failed to find Print string "+printString+ " in "+pageURL)
        if printText:
            tag = printText.parent
            tagURL = baseURL+tag['href']
        return tagURL

    def stripBadChars(self, inString) :
        return inString.replace("\'", "")

    def parseGeneric(self, baseURL):
        # Does a generic parsing of the articles. There are six categories (0-5)
        # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
        # NoSpin and TV are generic
        fullReturn = []
        for i in range(len(self.catList)) :
            articleList = []
            soup = self.index_to_soup(self.catList[i][1])
            # Set defaults
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            # Problem: 0-2 create many in an array
            # 3-5 create one.
            # So no for-div for 3-5

            if i < 3 :
                for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
                    print(div)
                    if i == 1:
                        a = div.find('a', href=True)
                    else :
                        a = div
                    print(a)
                    summary = div.find(True, attrs={'class':'summary'})
                    if summary:
                        description = self.tag_to_string(summary, use_alt=False)
                    if not a:
                        continue
                    # url = baseURL+re.sub(r'\?.*', '', a['href'])
                    url = baseURL+a['href']
                    if i < 2 :
                        url = self.extractPrintURL(baseURL, url, "Print this entry")
                        title = self.tag_to_string(a, use_alt=True).strip()
                    elif i == 2 :
                        # Daily Briefs
                        url = self.extractPrintURL(baseURL, url, "Print this entry")
                        title = div.contents[0]
                    if self.debugMessages :
                        print(title+" @ "+url)
                    articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))

            elif i == 3 : # Stratfor
                a = soup.find('a', self.catList[i][3])
                if a is None :
                    continue
                url = baseURL+a['href']
                title = self.tag_to_string(a, use_alt=True).strip()
                # Get Stratfor contents so we can get the real title.
                stratSoup = self.index_to_soup(url)
                title = stratSoup.html.head.title.string
                stratIndex = title.find('Stratfor.com:', 0)
                if (stratIndex > -1) :
                    title = title[stratIndex+14:-1]
                # Look for first blogBody <td class="blogBody"
                # Changed 12 Jan 2012 - new page format
                #stratBlogTable = stratSoup.find('td', {'class':['blogBody']}).findParent('table')
                #stratBody = stratSoup.find('td', {'class':['blogBody']})
            elif i == 4 : # Talking Points
                topDate = soup.find("td", "blogBody")
                if not topDate :
                    print("Failed to find date in Talking Points")
                # This page has the contents in double-wrapped tables!
                myTable = topDate.findParents('table')[0]
                if myTable is not None:
                    upOneTable = myTable.findParents('table')[0]
                    if upOneTable is not None:
                        upTwo = upOneTable.findParents('table')[0]
                        if upTwo is None:
                            continue
                        # Now navigate rows of upTwo
                        if self.debugMessages :
                            print("Entering rows")
                        for rows in upTwo.findChildren("tr", recursive=False):
                            # Inside top level table, each row is an article
                            rowTable = rows.find("table")
                            articleTable = rowTable.find("table")
                            # This looks wrong.
                            articleTable = rows.find("tr")
                            # The middle table is just for formatting the article buffer... but this means we can skip the inner table.
                            blogDate = articleTable.find("a","blogDate").contents[0]
                            # Skip to second blogBody for this.
                            blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
                            blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
                            url = baseURL+re.sub(r'\?.*', '', blogURL)
                            title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
                            if self.debugMessages :
                                print("Talking Points Memo title "+title+" at url: "+url)
                            pubdate = time.strftime('%a, %d %b')
                            articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
            else : # Current Column
                titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
                if titleSpan is None :
                    continue
                title = titleSpan.contents[0]
                url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
            if i == 3 or i == 5 :
                if self.debugMessages :
                    print(self.catList[i][0]+" Title:"+title+" at url: "+url)
                summary = div.find(True, attrs={'class':'summary'})
                if summary:
                    description = self.tag_to_string(summary, use_alt=False)
                articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
            self.catList[i][3] = articleList
            fullReturn.append((self.catList[i][0], articleList))
        return fullReturn

    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # returns a list of tuple ('feed title', list of articles)
    # {
    # 'title'       : article title,
    # 'url'         : URL of print version,
    # 'date'        : The publication date of the article as a string,
    # 'description' : A summary of the article
    # 'content'     : The full article (can be an empty string). This is used by FullContentProfile
    # }
    # this is used instead of BasicNewsRecipe.parse_feeds().
    def parse_index(self):
        # Parse the page into Python Soup
        baseURL = "https://www.billoreilly.com"
        return self.parseGeneric(baseURL)

    def preprocess_html(self, soup):
        refresh = soup.find('meta', {'http-equiv':'refresh'})
        if refresh is None:
            return soup
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open('https://www.billoreilly.com'+content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))
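The meta-refresh handling at the end of that recipe, as a free-standing sketch. The cp1252 decode and the base URL mirror the recipe; the helper name is mine. When a server answers with a <meta http-equiv="refresh"> redirect page, the target is followed manually and re-parsed:

from calibre.ebooks.BeautifulSoup import BeautifulSoup

def follow_refresh(recipe, soup, base='https://www.billoreilly.com'):
    refresh = soup.find('meta', {'http-equiv': 'refresh'})
    if refresh is None:
        return soup                      # no redirect, keep the page as-is
    # content looks like "0;url=/some/path"; take everything after '='
    target = refresh.get('content').partition('=')[2]
    raw = recipe.browser.open(base + target).read()
    return BeautifulSoup(raw.decode('cp1252', 'replace'))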
@ -1,4 +1,5 @@
#!/usr/bin/env python
+# -*- coding: utf-8 -*-

__license__ = 'GPL v3'

@ -6,20 +7,72 @@ __license__ = 'GPL v3'
www.canada.com
'''

-from calibre.web.feeds.recipes import BasicNewsRecipe
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup


class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following three lines for the Ottawa Citizen
+    # un-comment the following four lines for the Victoria Times Colonist
+##    title = u'Victoria Times Colonist'
+##    url_prefix = 'http://www.timescolonist.com'
+##    description = u'News from Victoria, BC'
+##    fp_tag = 'CAN_TC'
+
+    # un-comment the following four lines for the Vancouver Province
+##    title = u'Vancouver Province'
+##    url_prefix = 'http://www.theprovince.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VP'
+
+    # un-comment the following four lines for the Vancouver Sun
+##    title = u'Vancouver Sun'
+##    url_prefix = 'http://www.vancouversun.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VS'
+
+    # un-comment the following four lines for the Edmonton Journal
+##    title = u'Edmonton Journal'
+##    url_prefix = 'http://www.edmontonjournal.com'
+##    description = u'News from Edmonton, AB'
+##    fp_tag = 'CAN_EJ'
+
+    # un-comment the following four lines for the Calgary Herald
+##    title = u'Calgary Herald'
+##    url_prefix = 'http://www.calgaryherald.com'
+##    description = u'News from Calgary, AB'
+##    fp_tag = 'CAN_CH'
+
+    # un-comment the following four lines for the Regina Leader-Post
+##    title = u'Regina Leader-Post'
+##    url_prefix = 'http://www.leaderpost.com'
+##    description = u'News from Regina, SK'
+##    fp_tag = ''
+
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
+##    title = u'Saskatoon Star-Phoenix'
+##    url_prefix = 'http://www.thestarphoenix.com'
+##    description = u'News from Saskatoon, SK'
+##    fp_tag = ''
+
+    # un-comment the following four lines for the Windsor Star
+##    title = u'Windsor Star'
+##    url_prefix = 'http://www.windsorstar.com'
+##    description = u'News from Windsor, ON'
+##    fp_tag = 'CAN_'
+
+    # un-comment the following four lines for the Ottawa Citizen
    title = u'Ottawa Citizen'
    url_prefix = 'http://www.ottawacitizen.com'
    description = u'News from Ottawa, ON'
+    fp_tag = 'CAN_OC'

-    # un-comment the following three lines for the Montreal Gazette
-    #title = u'Montreal Gazette'
-    #url_prefix = 'http://www.montrealgazette.com'
-    #description = u'News from Montreal, QC'
+    # un-comment the following four lines for the Montreal Gazette
+##    title = u'Montreal Gazette'
+##    url_prefix = 'http://www.montrealgazette.com'
+##    description = u'News from Montreal, QC'
+##    fp_tag = 'CAN_MG'


    language = 'en_CA'
@ -43,14 +96,80 @@ class CanWestPaper(BasicNewsRecipe):
        dict(name='div', attrs={'class':'rule_grey_solid'}),
        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

-    def preprocess_html(self,soup):
-        # delete empty id attributes--they screw up the TOC for unknown reasons
-        divtags = soup.findAll('div',attrs={'id':''})
-        if divtags:
-            for div in divtags:
-                del(div['id'])
+    def get_cover_url(self):
+        from datetime import timedelta, date
+        if self.fp_tag=='':
+            return None
+        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
+        br = BasicNewsRecipe.get_browser()
+        daysback=1
+        try:
+            br.open(cover)
+        except:
+            while daysback<7:
+                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
+                br = BasicNewsRecipe.get_browser()
+                try:
+                    br.open(cover)
+                except:
+                    daysback = daysback+1
+                    continue
+                break
+        if daysback==7:
+            self.log("\nCover unavailable")
+            cover = None
+        return cover
+
+    def fixChars(self,string):
+        # Replace lsquo (\x91)
+        fixed = re.sub("\x91","‘",string)
+        # Replace rsquo (\x92)
+        fixed = re.sub("\x92","’",fixed)
+        # Replace ldquo (\x93)
+        fixed = re.sub("\x93","“",fixed)
+        # Replace rdquo (\x94)
+        fixed = re.sub("\x94","”",fixed)
+        # Replace ndash (\x96)
+        fixed = re.sub("\x96","–",fixed)
+        # Replace mdash (\x97)
+        fixed = re.sub("\x97","—",fixed)
+        fixed = re.sub("&#x2019;","’",fixed)
+        return fixed
+
+    def massageNCXText(self, description):
+        # Kindle TOC descriptions won't render certain characters
+        if description:
+            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+            # Replace '&#038;' with '&'
+            massaged = re.sub("&#038;","&", massaged)
+            return self.fixChars(massaged)
+        else:
+            return description
+
+    def populate_article_metadata(self, article, soup, first):
+        if first:
+            picdiv = soup.find('body').find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+        xtitle = article.text_summary.strip()
+        if len(xtitle) == 0:
+            desc = soup.find('meta',attrs={'property':'og:description'})
+            if desc is not None:
+                article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup

+    def preprocess_html(self, soup):
+        return self.strip_anchors(soup)
+
    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
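The Newseum cover lookup added above, restated as a sketch: Newseum hosts front-page scans keyed by day of month and a per-paper tag, and when today's scan is missing the recipe walks back up to a week. The URL layout is taken from the recipe itself; the function wrapper is illustrative.

from datetime import date, timedelta
from calibre.web.feeds.news import BasicNewsRecipe

def newseum_cover(fp_tag):
    if not fp_tag:
        return None                     # paper has no known Newseum tag
    br = BasicNewsRecipe.get_browser()
    for daysback in range(7):
        day = (date.today() - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/'
                 'dfp/jpg%d/lg/%s.jpg' % (day, fp_tag))
        try:
            br.open(cover)
            return cover                # found a front page that exists
        except Exception:
            continue                    # try the previous day
    return None                         # nothing within the last week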
@ -1,12 +1,10 @@
__license__ = 'GPL v3'
-__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
pagina12.com.ar
'''

-import re
from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup

class Pagina12(BasicNewsRecipe):
    title = 'Pagina - 12'
@ -66,9 +64,7 @@ class Pagina12(BasicNewsRecipe):
        return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/')

    def get_cover_url(self):
-        rawc  = self.index_to_soup('http://www.pagina12.com.ar/diario/principal/diario/index.html',True)
-        rawc2 = re.sub(r'PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN','PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"',rawc)
-        soup = BeautifulSoup(rawc2,fromEncoding=self.encoding,smartQuotesTo=None)
+        soup = self.index_to_soup('http://www.pagina12.com.ar/diario/principal/diario/index.html')
        for image in soup.findAll('img',alt=True):
            if image['alt'].startswith('Tapa de la fecha'):
                return image['src']
14
recipes/pambianco.recipe
Normal file
@ -0,0 +1,14 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1326135591(BasicNewsRecipe):
    title = u'Pambianco'
    description = 'fashion magazine for professional people'
    language = 'it'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    feeds = [(u'Pambianco', u'http://feeds.feedburner.com/pambianconews/YGXu')]
    __author__ = 'faber1971'
    __version__ = 'v1.0'
    __date__ = '9, January 2011'
@ -1,10 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
+import os, time

class AdvancedUserRecipe1277129332(BasicNewsRecipe):
-    title = u'People Daily - China'
+    title = u'人民日报'
    oldest_article = 2
    max_articles_per_feed = 100
-    __author__ = 'rty'
+    __author__ = 'zzh'

    pubisher = 'people.com.cn'
    description = 'People Daily Newspaper'
@ -14,21 +15,65 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe):
    use_embedded_content = False
    no_stylesheets = True
    encoding = 'GB2312'
+    language = 'zh'
    conversion_options = {'linearize_tables':True}
+    masthead_url = 'http://www.people.com.cn/img/2010wb/images/logo.gif'

-    feeds = [(u'\u56fd\u5185\u65b0\u95fb', u'http://www.people.com.cn/rss/politics.xml'),
-             (u'\u56fd\u9645\u65b0\u95fb', u'http://www.people.com.cn/rss/world.xml'),
-             (u'\u7ecf\u6d4e\u65b0\u95fb', u'http://www.people.com.cn/rss/finance.xml'),
-             (u'\u4f53\u80b2\u65b0\u95fb', u'http://www.people.com.cn/rss/sports.xml'),
-             (u'\u53f0\u6e7e\u65b0\u95fb', u'http://www.people.com.cn/rss/haixia.xml')]
+    feeds = [
+        (u'时政', u'http://www.people.com.cn/rss/politics.xml'),
+        (u'国际', u'http://www.people.com.cn/rss/world.xml'),
+        (u'经济', u'http://www.people.com.cn/rss/finance.xml'),
+        (u'体育', u'http://www.people.com.cn/rss/sports.xml'),
+        (u'教育', u'http://www.people.com.cn/rss/edu.xml'),
+        (u'文化', u'http://www.people.com.cn/rss/culture.xml'),
+        (u'社会', u'http://www.people.com.cn/rss/society.xml'),
+        (u'传媒', u'http://www.people.com.cn/rss/media.xml'),
+        (u'娱乐', u'http://www.people.com.cn/rss/ent.xml'),
+#        (u'汽车', u'http://www.people.com.cn/rss/auto.xml'),
+        (u'海峡两岸', u'http://www.people.com.cn/rss/haixia.xml'),
+#        (u'IT频道', u'http://www.people.com.cn/rss/it.xml'),
+#        (u'环保', u'http://www.people.com.cn/rss/env.xml'),
+#        (u'科技', u'http://www.people.com.cn/rss/scitech.xml'),
+#        (u'新农村', u'http://www.people.com.cn/rss/nc.xml'),
+#        (u'天气频道', u'http://www.people.com.cn/rss/weather.xml'),
+        (u'生活提示', u'http://www.people.com.cn/rss/life.xml'),
+        (u'卫生', u'http://www.people.com.cn/rss/medicine.xml'),
+#        (u'人口', u'http://www.people.com.cn/rss/npmpc.xml'),
+#        (u'读书', u'http://www.people.com.cn/rss/booker.xml'),
+#        (u'食品', u'http://www.people.com.cn/rss/shipin.xml'),
+#        (u'女性新闻', u'http://www.people.com.cn/rss/women.xml'),
+#        (u'游戏', u'http://www.people.com.cn/rss/game.xml'),
+#        (u'家电频道', u'http://www.people.com.cn/rss/homea.xml'),
+#        (u'房产', u'http://www.people.com.cn/rss/house.xml'),
+#        (u'健康', u'http://www.people.com.cn/rss/health.xml'),
+#        (u'科学发展观', u'http://www.people.com.cn/rss/kxfz.xml'),
+#        (u'知识产权', u'http://www.people.com.cn/rss/ip.xml'),
+#        (u'高层动态', u'http://www.people.com.cn/rss/64094.xml'),
+#        (u'党的各项工作', u'http://www.people.com.cn/rss/64107.xml'),
+#        (u'党建聚焦', u'http://www.people.com.cn/rss/64101.xml'),
+#        (u'机关党建', u'http://www.people.com.cn/rss/117094.xml'),
+#        (u'事业党建', u'http://www.people.com.cn/rss/117095.xml'),
+#        (u'国企党建', u'http://www.people.com.cn/rss/117096.xml'),
+#        (u'非公党建', u'http://www.people.com.cn/rss/117097.xml'),
+#        (u'社区党建', u'http://www.people.com.cn/rss/117098.xml'),
+#        (u'高校党建', u'http://www.people.com.cn/rss/117099.xml'),
+#        (u'农村党建', u'http://www.people.com.cn/rss/117100.xml'),
+#        (u'军队党建', u'http://www.people.com.cn/rss/117101.xml'),
+#        (u'时代先锋', u'http://www.people.com.cn/rss/78693.xml'),
+#        (u'网友声音', u'http://www.people.com.cn/rss/64103.xml'),
+#        (u'反腐倡廉', u'http://www.people.com.cn/rss/64371.xml'),
+#        (u'综合报道', u'http://www.people.com.cn/rss/64387.xml'),
+#        (u'中国人大新闻', u'http://www.people.com.cn/rss/14576.xml'),
+#        (u'中国政协新闻', u'http://www.people.com.cn/rss/34948.xml'),
+    ]
    keep_only_tags = [
-        dict(name='div', attrs={'class':'left_content'}),
+        dict(name='div', attrs={'class':'text_c'}),
    ]
    remove_tags = [
-        dict(name='table', attrs={'class':'title'}),
+        dict(name='div', attrs={'class':'tools'}),
    ]
    remove_tags_after = [
-        dict(name='table', attrs={'class':'bianji'}),
+        dict(name='div', attrs={'id':'p_content'}),
    ]

    def append_page(self, soup, appendtag, position):
@ -36,7 +81,7 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe):
        if pager:
            nexturl = self.INDEX + pager.a['href']
            soup2 = self.index_to_soup(nexturl)
-            texttag = soup2.find('div', attrs={'class':'left_content'})
+            texttag = soup2.find('div', attrs={'class':'text_c'})
            #for it in texttag.findAll(style=True):
            #    del it['style']
            newpos = len(texttag.contents)
@ -44,9 +89,15 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe):
            texttag.extract()
            appendtag.insert(position,texttag)

+    def skip_ad_pages(self, soup):
+        if ('advertisement' in soup.find('title').string.lower()):
+            href = soup.find('a').get('href')
+            return self.browser.open(href).read().decode('GB2312', 'ignore')
+        else:
+            return None
+
    def preprocess_html(self, soup):
-        mtag = '<meta http-equiv="content-type" content="text/html;charset=GB2312" />\n<meta http-equiv="content-language" content="utf-8" />'
+        mtag = '<meta http-equiv="content-type" content="text/html;charset=GB2312" />\n<meta http-equiv="content-language" content="GB2312" />'
        soup.head.insert(0,mtag)
        for item in soup.findAll(style=True):
            del item['form']
@ -55,3 +106,19 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe):
        #if pager:
        #    pager.extract()
        return soup
+
+    def get_cover_url(self):
+        cover = None
+        os.environ['TZ'] = 'Asia/Shanghai'
+        time.tzset()
+        year = time.strftime('%Y')
+        month = time.strftime('%m')
+        day = time.strftime('%d')
+        cover = 'http://paper.people.com.cn/rmrb/page/'+year+'-'+month+'/'+day+'/01/RMRB'+year+month+day+'B001_b.jpg'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover)
+        except:
+            self.log("\nCover unavailable: " + cover)
+            cover = None
+        return cover
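The dated cover URL added above in isolation: the timezone is pinned to Asia/Shanghai so the date matches the print edition before the page-one image URL is built and probed. A sketch under those assumptions (function name is mine; the URL pattern comes from the recipe):

import os, time
from calibre.web.feeds.news import BasicNewsRecipe

def rmrb_cover_url():
    os.environ['TZ'] = 'Asia/Shanghai'
    time.tzset()                        # POSIX only, as in the recipe
    y, m, d = time.strftime('%Y'), time.strftime('%m'), time.strftime('%d')
    cover = ('http://paper.people.com.cn/rmrb/page/%s-%s/%s/01/RMRB%s%s%sB001_b.jpg'
             % (y, m, d, y, m, d))
    br = BasicNewsRecipe.get_browser()
    try:
        br.open(cover)                  # probe: does today's scan exist yet?
    except Exception:
        return None
    return cover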
@ -1,18 +1,18 @@
+# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
-__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
pescanik.net
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag

class Pescanik(BasicNewsRecipe):
-    title = 'Pescanik'
+    title = 'Peščanik'
    __author__ = 'Darko Miletic'
-    description = 'Pescanik'
+    description = 'Peščanik je udruženje građana osnovano 2006. godine. Glavni proizvod Peščanika je radio emisija koja je emitovana na Radiju B92 od 02.02.2000. do 16.06.2011, a od septembra 2011. se emituje na osam radio stanica u Srbiji, Crnoj Gori i BiH'
-    publisher = 'Pescanik'
+    publisher = 'Peščanik'
    category = 'news, politics, Serbia'
    oldest_article = 10
    max_articles_per_feed = 100
@ -21,7 +21,12 @@ class Pescanik(BasicNewsRecipe):
    encoding = 'utf-8'
    language = 'sr'
    publication_type = 'newsportal'
-    extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body{font-family: Arial,"Lucida Grande",Tahoma,Verdana,sans1,sans-serif} .contentheading{font-size: x-large; font-weight: bold} .small{font-size: small} .createdate{font-size: x-small; font-weight: bold} '
+    masthead_url = 'http://pescanik.net/wp-content/uploads/2011/10/logo1.png'
+    extra_css = """
+        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
+        body{font-family: Verdana,Arial,Tahoma,sans1,sans-serif}
+        #BlogTitle{font-size: xx-large; font-weight: bold}
+    """

    conversion_options = {
        'comment' : description
@ -32,29 +37,12 @@ class Pescanik(BasicNewsRecipe):

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
-    remove_attributes = ['valign','colspan','width','height','align','alt']
-
-    remove_tags = [dict(name=['object','link','meta','script'])]
-
-    keep_only_tags = [
-        dict(attrs={'class':['contentheading','small','createdate']})
-        ,dict(name='td', attrs={'valign':'top','colspan':'2'})
-    ]
-
-    feeds = [(u'Pescanik Online', u'http://www.pescanik.net/index.php?option=com_rd_rss&id=12')]
+    remove_tags = [dict(name=['object','link','meta','script','iframe','embed'])]
+    keep_only_tags = [dict(attrs={'id':['BlogTitle','BlogDate','BlogContent']})]
+    feeds = [
+        (u'Autori' , u'http://pescanik.net/category/autori/feed/'),
+        (u'Prevodi', u'http://pescanik.net/category/prevodi/feed/')
+    ]

    def print_version(self, url):
-        nurl = url.replace('/index.php','/index2.php')
-        return nurl + '&pop=1&page=0'
-
-    def preprocess_html(self, soup):
-        st = soup.findAll('td')
-        for it in st:
-            it.name='p'
-        for pt in soup.findAll('img'):
-            brtag  = Tag(soup,'br')
-            brtag2 = Tag(soup,'br')
-            pt.append(brtag)
-            pt.append(brtag2)
-        return soup
+        return url + 'print/'
@ -33,3 +33,6 @@ class BasicUserRecipe1314970845(BasicNewsRecipe):
        (u'Obituaries', u'http://www.philly.com/inquirer_obituaries.rss')
    ]

+    def print_version(self, url):
+        return url + '?viewAll=y'
@ -1,4 +1,5 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
|
||||||
__author__ = 'Darko Spasovski'
|
__author__ = 'Darko Spasovski'
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
@ -7,7 +8,6 @@ __copyright__ = '2011, Darko Spasovski <darko.spasovski at gmail.com>'
|
|||||||
'''
|
'''
|
||||||
www.plusinfo.mk
|
www.plusinfo.mk
|
||||||
'''
|
'''
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class PlusInfo(BasicNewsRecipe):
|
class PlusInfo(BasicNewsRecipe):
|
||||||
@ -27,8 +27,11 @@ class PlusInfo(BasicNewsRecipe):
|
|||||||
oldest_article = 1
|
oldest_article = 1
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'class': 'vest'})]
|
remove_tags = []
|
||||||
remove_tags = [dict(name='div', attrs={'class':['komentari_holder', 'objava']})]
|
remove_tags.append(dict(name='div', attrs={'class':['komentari_holder', 'objava', 'koment']}))
|
||||||
|
remove_tags.append(dict(name='ul', attrs={'class':['vest_meni']}))
|
||||||
|
remove_tags.append(dict(name='a', attrs={'name': ['fb_share']}))
|
||||||
|
keep_only_tags = [dict(name='div', attrs={'class': 'vest1'})]
|
||||||
|
|
||||||
feeds = [(u'Македонија', u'http://www.plusinfo.mk/rss/makedonija'),
|
feeds = [(u'Македонија', u'http://www.plusinfo.mk/rss/makedonija'),
|
||||||
(u'Бизнис', u'http://www.plusinfo.mk/rss/biznis'),
|
(u'Бизнис', u'http://www.plusinfo.mk/rss/biznis'),
|
||||||
|
79
recipes/prospectmaguk.recipe
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
|
||||||
|
'''
|
||||||
|
calibre recipe for prospectmagazine.co.uk (subscription)
|
||||||
|
'''
|
||||||
|
|
||||||
|
import re
|
||||||
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
|
|
||||||
|
class ProspectMagUK(BasicNewsRecipe):
|
||||||
|
title = u'Prospect Magazine'
|
||||||
|
description = 'A general-interest publication offering analysis and commentary about politics, news and business.'
|
||||||
|
__author__ = 'barty, duluoz'
|
||||||
|
timefmt = ' [%d %B %Y]'
|
||||||
|
no_stylesheets = True
|
||||||
|
publication_type = 'magazine'
|
||||||
|
masthead_url = 'http://www.prospectmagazine.co.uk/wp-content/themes/prospect/images/titleMain.jpg'
|
||||||
|
category = 'news, UK'
|
||||||
|
language = 'en_GB'
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
auto_cleanup = True
|
||||||
|
needs_subscription = True
|
||||||
|
|
||||||
|
auto_cleanup_keep = '//div[@class="lead_image"]'
|
||||||
|
remove_tags = [{'class':['shareinpost','postutils','postinfo']}]
|
||||||
|
|
||||||
|
INDEX = 'http://www.prospectmagazine.co.uk/current-issue'
|
||||||
|
|
||||||
|
def get_browser(self):
|
||||||
|
br = BasicNewsRecipe.get_browser()
|
||||||
|
if self.username is not None and self.password is not None:
|
||||||
|
br.open('http://www.prospectmagazine.co.uk/wp-login.php')
|
||||||
|
br.select_form(name='loginform')
|
||||||
|
br['log'] = self.username
|
||||||
|
br['pwd'] = self.password
|
||||||
|
br.submit()
|
||||||
|
return br
|
||||||
|
|
||||||
|
def parse_index(self):
|
||||||
|
soup = self.index_to_soup(self.INDEX)
|
||||||
|
#div = soup.find('h1',text=re.compile(r'Issue \d+'))
|
||||||
|
#fname = self.tag_to_string( div) if div is not None else 'Current Issue'
|
||||||
|
div = soup.find('div', id='cover_image')
|
||||||
|
if div is not None:
|
||||||
|
img = div.find('img', src=True)
|
||||||
|
if img is not None:
|
||||||
|
src = img['src']
|
||||||
|
if src.startswith('/'):
|
||||||
|
src = 'http://www.prospectmagazine.co.uk' + src
|
||||||
|
self.cover_url = src
|
||||||
|
feeds = []
|
||||||
|
# loop through sections
|
||||||
|
for sect in soup.findAll('div',attrs={'class':'sectionheading'}):
|
||||||
|
fname = self.tag_to_string( sect).replace('>','').strip()
|
||||||
|
self.log('Found section', fname)
|
||||||
|
articles = []
|
||||||
|
|
||||||
|
# note: can't just find siblings with class='post' because that will also
|
||||||
|
# grab all the articles belonging to the sections that follow.
|
||||||
|
for item in sect.findNextSiblings('div',attrs={'class':True}):
|
||||||
|
if not 'post' in item['class']: break
|
||||||
|
a = item.find('a', href=True)
|
||||||
|
if a is None: continue
|
||||||
|
url = a['href']
|
||||||
|
title = self.tag_to_string(a)
|
||||||
|
p = item.find('p')
|
||||||
|
desc = self.tag_to_string( p) if p is not None else ''
|
||||||
|
art = {'title':title, 'description':desc,'date':' ', 'url':url}
|
||||||
|
p = item.find(attrs={'class':re.compile('author')})
|
||||||
|
self.log('\tFound article:', title, '::', url)
|
||||||
|
if p is not None:
|
||||||
|
art['author'] = self.tag_to_string( p).strip()
|
||||||
|
articles.append(art)
|
||||||
|
|
||||||
|
feeds.append((fname, articles))
|
||||||
|
return feeds
|
@ -1,30 +1,36 @@
+"""
+readitlaterlist.com
+"""
 __license__ = 'GPL v3'
 __copyright__ = '''
 2010, Darko Miletic <darko.miletic at gmail.com>
 2011, Przemyslaw Kryger <pkryger at gmail.com>
+2012, tBunnyMan <Wag That Tail At Me dot com>
 '''
-'''
-readitlaterlist.com
-'''

 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe


 class Readitlater(BasicNewsRecipe):
     title = 'ReadItLater'
-    __author__ = 'Darko Miletic, Przemyslaw Kryger'
+    __author__ = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan'
-    description = '''Personalized news feeds. Go to readitlaterlist.com to
-setup up your news. Fill in your account
-username, and optionally you can add password.'''
-    publisher = 'readitlater.com'
+    description = '''Personalized news feeds. Go to readitlaterlist.com to set \
+up your news. This version displays pages of articles from \
+oldest to newest, with max & minimum counts, and marks articles \
+read after downloading.'''
+    publisher = 'readitlaterlist.com'
     category = 'news, custom'
     oldest_article = 7
-    max_articles_per_feed = 100
+    max_articles_per_feed = 50
+    minimum_articles = 1
     no_stylesheets = True
     use_embedded_content = False
     needs_subscription = True
     INDEX = u'http://readitlaterlist.com'
     LOGIN = INDEX + u'/l'
+    readList = []

     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -38,34 +44,39 @@ class Readitlater(BasicNewsRecipe):
         return br

     def get_feeds(self):
-        self.report_progress(0, ('Fetching list of feeds...'))
+        self.report_progress(0, ('Fetching list of pages...'))
         lfeeds = []
         i = 1
         feedurl = self.INDEX + u'/unread/1'
         while True:
             title = u'Unread articles, page ' + str(i)
-            lfeeds.append((title, feedurl))
-            self.report_progress(0, ('Got ') + str(i) + (' feeds'))
+            lfeeds.insert(0, (title, feedurl))
+            self.report_progress(0, ('Got ') + str(i) + (' pages'))
             i += 1
             soup = self.index_to_soup(feedurl)
             ritem = soup.find('a', attrs={'id':'next', 'class':'active'})
             if ritem is None:
                 break
             feedurl = self.INDEX + ritem['href']
-        if self.test:
-            return lfeeds[:2]
         return lfeeds

     def parse_index(self):
         totalfeeds = []
+        articlesToGrab = self.max_articles_per_feed
         lfeeds = self.get_feeds()
         for feedobj in lfeeds:
+            if articlesToGrab < 1:
+                break
             feedtitle, feedurl = feedobj
             self.report_progress(0, ('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
             articles = []
             soup = self.index_to_soup(feedurl)
             ritem = soup.find('ul', attrs={'id':'list'})
-            for item in ritem.findAll('li'):
+            for item in reversed(ritem.findAll('li')):
+                if articlesToGrab < 1:
+                    break
+                else:
+                    articlesToGrab -= 1
                 description = ''
                 atag = item.find('a', attrs={'class':'text'})
                 if atag and atag.has_key('href'):
@@ -78,6 +89,20 @@ class Readitlater(BasicNewsRecipe):
                      ,'url' :url
                      ,'description':description
                      })
+                readLink = item.find('a', attrs={'class':'check'})['href']
+                self.readList.append(readLink)
             totalfeeds.append((feedtitle, articles))
+        if len(self.readList) < self.minimum_articles:
+            raise Exception("Not enough articles in RIL! Change minimum_articles or add more.")
         return totalfeeds

+    def mark_as_read(self, markList):
+        br = self.get_browser()
+        for link in markList:
+            url = self.INDEX + link
+            response = br.open(url)
+            response
+
+    def cleanup(self):
+        self.mark_as_read(self.readList)
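The subtle part of this change is the ordering: Read It Later serves unread items newest-first, so get_feeds prepends each page it discovers with lfeeds.insert(0, ...) and parse_index walks each page's item list in reverse, and the two inversions together yield articles oldest-to-newest. A small self-contained sketch of that double inversion, using made-up page data in place of the live site:

# Made-up stand-in for the paginated unread list: page 1 holds the
# newest items, and items within a page also run newest-first.
site_pages = {
    1: ['newest', 'newer'],
    2: ['older', 'oldest'],
}

feeds = []
for page in sorted(site_pages):
    # Prepend, as get_feeds() does, so the last (oldest) page sorts first.
    feeds.insert(0, ('Unread articles, page %d' % page, site_pages[page]))

articles = []
for _, items in feeds:
    for item in reversed(items):  # reverse within a page, as parse_index() does
        articles.append(item)

print(articles)  # ['oldest', 'older', 'newer', 'newest']

The rest of the change hangs off this ordering: each parsed item's 'check' link is collected in readList, cleanup() opens those links via mark_as_read() so the downloaded articles are marked read on the site, and parse_index raises an exception when fewer than minimum_articles unread articles were found, aborting the run instead of producing a nearly empty ebook.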