Merge
179
Changelog.yaml
@ -19,6 +19,185 @@
|
||||
# new recipes:
|
||||
# - title:
|
||||
|
||||
- version: 0.8.22
|
||||
date: 2011-10-14
|
||||
|
||||
new features:
|
||||
- title: "Input plugin for OCR-ed DJVU files (i.e. .djvu files that contain text. Only the text is converted)"
|
||||
type: major
|
||||
|
||||
- title: "Driver for the SONY PRS T1"
|
||||
|
||||
- title: "Add a 'Back' button to the metadata download dialog while downloading covers, so that you can go back and select a different match if you don't like the covers, instead of having to re-do the entire download."
|
||||
tickets: [855055]
|
||||
|
||||
- title: "Add an option in Preferences->Saving to disk to not show files in file browser after saving to disk"
|
||||
|
||||
- title: "Get Books: Add the amazon.fr store. Remove leading 'by' from author names. Fix encoding issues with non English titles/names"
|
||||
|
||||
- title: "Driver for Onyx BOOX A61S/X61S"
|
||||
tickets: [872741]
|
||||
|
||||
- title: "Kobo: Add support for uploading new covers to the device without converting the ePub. You can just resend the book to have the cover updated"
|
||||
|
||||
- title: "Make it a little harder to ignore the fact that there are multiple toolbars when customizing toolbars"
|
||||
tickets: [864589]
|
||||
|
||||
bug fixes:
|
||||
- title: "MOBI Input: Remove invalid tags of the form <xyz: >"
|
||||
tickets: [872883]
|
||||
|
||||
- title: "calibredb add_format does not refresh running calibre instance"
|
||||
tickets: [872961]
|
||||
|
||||
- title: "Conversion pipeline: Translate <font face> to CSS font-family"
|
||||
tickets: [871388]
|
||||
|
||||
- title: "When sending email add a Date: header so that amavis does not consider the emails to be spam"
|
||||
|
||||
- title: "Fix for the problem where setting the restriction to an empty current search clears the restriction box but does not clear the restriction."
|
||||
tickets: [871921]
|
||||
|
||||
- title: "Fix generation of column coloring rules for date/time columns"
|
||||
|
||||
- title: "Fix plugboard problem where customizations to formats accepted by a device were ignored."
|
||||
|
||||
- title: "Enable adding of various actions to the toolbar when device is connected (they had been erroneously marked as being non-addable)"
|
||||
|
||||
- title: "Fixable content in library check is not hidden after repair"
|
||||
tickets: [864096]
|
||||
|
||||
- title: "Catalog generation: Handle a corrupted thumbnail cache."
|
||||
|
||||
- title: "Do not error out when user clicks stop selected job with no job selected."
|
||||
tickets: [863766]
|
||||
|
||||
improved recipes:
|
||||
- automatiseringgids
|
||||
- CNET
|
||||
- Geek and Poke
|
||||
- Gosc Niedzielny
|
||||
- Dilbert
|
||||
- Economist
|
||||
- Ming Pao
|
||||
- Metro UK
|
||||
- Heise Online
|
||||
- FAZ.net
|
||||
- Houston Chronicle
|
||||
- Slate
|
||||
- Descopera
|
||||
|
||||
new recipes:
|
||||
- title: WoW Insider
|
||||
author: Krittika Goyal
|
||||
|
||||
- title: Merco Press and Penguin news
|
||||
author: Russell Phillips
|
||||
|
||||
- title: Defense News
|
||||
author: Darko Miletic
|
||||
|
||||
- title: Revista Piaui
|
||||
author: Eduardo Simoes
|
||||
|
||||
- title: Dark Horizons
|
||||
author: Jaded
|
||||
|
||||
- title: Various Polish news sources
|
||||
author: fenuks
|
||||
|
||||
|
||||
- version: 0.8.21
|
||||
date: 2011-09-30
|
||||
|
||||
new features:
|
||||
- title: "A Tips and Tricks blog at http://blog.calibre-ebook.com to introduce less well known calibre features in a simple way"
|
||||
|
||||
- title: "News download: Add list of articles in the downloaded issue to the comments metadata of the generated ebook. Makes it possible to search for a particular article in the calibre library."
|
||||
tickets: [851717]
|
||||
|
||||
- title: "Toolbar buttons: You can now also right click the buttons to bring the popup of extra actions, in addition to clicking the small arrow next to the button."
|
||||
|
||||
- title: "Amazon metadata download plugin: Add option to download metadata from amazon.es"
|
||||
|
||||
- title: Driver for Vizio and iRobot A9 Android tablets
|
||||
tickets: [854408,862175]
|
||||
|
||||
- title: "When switching to/starting with a library with a corrupted database, offer the user the option of rebuilding the database instead of erroring out."
|
||||
|
||||
- title: "Template language: Add list_equals function"
|
||||
|
||||
- title: "Add a special output profile for the PocketBook 900 as it does not resize images correctly by itself"
|
||||
|
||||
bug fixes:
|
||||
- title: "Fix regression that caused PDF Output to generate very large files"
|
||||
|
||||
- title: Fix Title Sort field not being displayed in Book details panel
|
||||
|
||||
- title: Prevent renaming of languages in the Tag browser
|
||||
tickets: [860943]
|
||||
|
||||
- title: "Get books: Fix getting price from Foyles"
|
||||
|
||||
- title: "Content server: When a search matches no queries, do not show an error message"
|
||||
|
||||
- title: "ODT Input: Add workaround for ADE to fix centering of block level images when converting to EPUB"
|
||||
tickets: [859343]
|
||||
|
||||
- title: "Content server: When WSGI embedding fix handling of empty URL"
|
||||
|
||||
- title: "RTF Input: Fix spurious spaces inserted after some unicode characters"
|
||||
tickets: [851215]
|
||||
|
||||
- title: "Fix regression that broke clicking on the first letter of author names in the Tag Browser when grouped"
|
||||
tickets: [860615]
|
||||
|
||||
- title: "Fix reading metadata from filenames when the author regexp does not match anything"
|
||||
|
||||
- title: "Fix incorrect display of the month September in Finnish calibre"
|
||||
tickets: [858737]
|
||||
|
||||
- title: "Do not delete the file when the user tries to add a format to a book from a file already in the books directory"
|
||||
tickets: [856158]
|
||||
|
||||
- title: "Fix regression that broke customization of Kobo device plugin"
|
||||
|
||||
- title: "Allow user defined templates to be used in save to disk"
|
||||
|
||||
improved recipes:
|
||||
- Read It Later
|
||||
- American Spectator
|
||||
- Sydney Morning Herald
|
||||
- Chicago Tribune
|
||||
- American Prospect
|
||||
- DNA India
|
||||
- Times of India
|
||||
- Kurier
|
||||
- xkcd
|
||||
- Cnet
|
||||
|
||||
new recipes:
|
||||
- title: Various Colombian news sources
|
||||
author: BIGO-CAVA
|
||||
|
||||
- title: Gosc Niedzielny
|
||||
author: Piotr Kontek
|
||||
|
||||
- title: Leipziger Volkszeitung
|
||||
author: a.peter
|
||||
|
||||
- title: Folha de Sao Paulo (full edition)
|
||||
author: fluzao
|
||||
|
||||
- title: Den of Geek
|
||||
author: Jaded
|
||||
|
||||
- title: Republica
|
||||
author: Manish Bhattarai
|
||||
|
||||
- title: Sign on San Diego
|
||||
author: Jay Kindle
|
||||
|
||||
- version: 0.8.20
|
||||
date: 2011-09-23
|
||||
|
||||
|
@ -118,7 +118,7 @@ EBVS
|
||||
<0x 00 00 00 00>
|
||||
<0x 00 00 00 10>
|
||||
...(rest of size of DATA block)
|
||||
<0x FD EA = PAD? (ýê)>
|
||||
<0x FD EA = PAD? (ýê)>
|
||||
DATA
|
||||
<0x 4 bytes = size of <marked text (see 3rd note)> >
|
||||
<marked text (see 3rd note)>
|
||||
@ -155,7 +155,7 @@ EBVS
|
||||
<0x 00 00 00 00>
|
||||
<0x 00 00 00 10>
|
||||
...(rest of size of DATA block)
|
||||
<0x FD EA = PAD? (ýê)>
|
||||
<0x FD EA = PAD? (ýê)>
|
||||
[fi MARK || BOOKMARK]
|
||||
//-------------------------------
|
||||
[if CORRECTION]
|
||||
@ -174,7 +174,7 @@ EBVS
|
||||
<0x 00 00 00 00>
|
||||
<0x 00 00 00 10>
|
||||
...(rest of size of DATA block)
|
||||
<0x FD EA = PAD? (ýê)>
|
||||
<0x FD EA = PAD? (ýê)>
|
||||
DATA
|
||||
<0x 4 bytes = size of <marked text (see 3rd note)> >
|
||||
<marked text (see 3rd note)>
|
||||
@ -246,7 +246,7 @@ EBVS
|
||||
<0x 00 00 00 00>
|
||||
<0x 00 00 00 10>
|
||||
...(size of DATA block - 30)
|
||||
<0x FD EA = PAD? (ýê)>
|
||||
<0x FD EA = PAD? (ýê)>
|
||||
[fi DRAWING]
|
||||
//-------------------------------
|
||||
[next {NOTE,MARK,CORRECTION,DRAWING}]
|
||||
@ -308,7 +308,7 @@ EBVS
|
||||
...4
|
||||
...4
|
||||
...4
|
||||
<0x FD EA = PAD? (ýê)>
|
||||
<0x FD EA = PAD? (ýê)>
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// CATEGORY (if any)
|
||||
@ -411,4 +411,4 @@ BKMK
|
||||
// END OF FILE
|
||||
|
||||
// by idleloop@yahoo.com, v0.2.e, 12/2009
|
||||
// http://www.angelfire.com/ego2/idleloop
|
||||
// http://www.angelfire.com/ego2/idleloop
|
||||
|
@ -18,25 +18,16 @@ class TheAmericanSpectator(BasicNewsRecipe):
|
||||
use_embedded_content = False
|
||||
language = 'en'
|
||||
INDEX = 'http://spectator.org'
|
||||
|
||||
conversion_options = {
|
||||
auto_cleanup = True
|
||||
encoding = 'utf-8'
|
||||
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
,'tags' : category
|
||||
,'language' : language
|
||||
,'publisher' : publisher
|
||||
}
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':'post inner'})
|
||||
,dict(name='div', attrs={'class':'author-bio'})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='object')
|
||||
,dict(name='div', attrs={'class':['col3','post-options','social']})
|
||||
,dict(name='p' , attrs={'class':['letter-editor','meta']})
|
||||
]
|
||||
|
||||
feeds = [ (u'Articles', u'http://feeds.feedburner.com/amspecarticles')]
|
||||
|
||||
def get_cover_url(self):
|
||||
@ -48,10 +39,10 @@ class TheAmericanSpectator(BasicNewsRecipe):
|
||||
link_item2 = soup2.find('div',attrs={'class':'post inner issues'})
|
||||
cover_url = self.INDEX + link_item2.img['src']
|
||||
return cover_url
|
||||
|
||||
|
||||
def print_version(self, url):
    """Return the print-view address for *url* by appending '/print'."""
    print_suffix = '/print'
    return url + print_suffix
|
||||
|
||||
|
||||
def get_article_url(self, article):
    """Return the article's 'guid' entry, or None when the entry is absent."""
    guid = article.get('guid')
    return guid
|
||||
|
||||
|
||||
|
@ -1,26 +1,18 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AmericanProspect(BasicNewsRecipe):
|
||||
title = u'American Prospect'
|
||||
__author__ = u'Michael Heinz'
|
||||
oldest_article = 30
|
||||
language = 'en'
|
||||
max_articles_per_feed = 100
|
||||
recursions = 0
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
__author__ = u'Michael Heinz, a.peter'
|
||||
version = 2
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<body.*?<div class="pad_10L10R">', re.DOTALL|re.IGNORECASE), lambda match: '<body><div>'),
|
||||
(re.compile(r'</div>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</div></body>'),
|
||||
(re.compile('\r'),lambda match: ''),
|
||||
(re.compile(r'<!-- .+? -->', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(r'<link .+?>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(r'<script.*?</script>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(r'<noscript.*?</noscript>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(r'<meta .*?/>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
]
|
||||
oldest_article = 30
|
||||
language = 'en'
|
||||
max_articles_per_feed = 100
|
||||
recursions = 0
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'pad_10L10R'})]
|
||||
remove_tags = [dict(name='form'), dict(name='div', attrs={'class':['bkt_caption','sharebox noprint','badgebox']})]
|
||||
|
||||
feeds = [(u'Articles', u'feed://www.prospect.org/articles_rss.jsp')]
|
||||
|
||||
|
21
recipes/archeowiesci.recipe
Normal file
@ -0,0 +1,21 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
# Recipe for the archeowiesci.pl feed (language 'pl', category 'archeology').
class Archeowiesci(BasicNewsRecipe):
    title = u'Archeowiesci'
    __author__ = 'fenuks'
    category = 'archeology'
    language = 'pl'
    cover_url = 'http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    # Strip the post-rating widgets from the article body.
    remove_tags = [
        dict(name='span', attrs={'class': ['post-ratings', 'post-ratings-loading']}),
    ]
    feeds = [(u'Archeowieści', u'http://archeowiesci.pl/feed/')]

    def parse_feeds(self):
        """Return the parsed feeds without articles whose title contains 'subskrypcja'."""
        parsed = BasicNewsRecipe.parse_feeds(self)
        for feed in parsed:
            # Collect first, then remove, so the list is never mutated
            # while it is being iterated.
            unwanted = [entry for entry in feed.articles
                        if 'subskrypcja' in entry.title]
            for entry in unwanted:
                feed.articles.remove(entry)
        return parsed
|
@ -10,27 +10,15 @@ class autogids(BasicNewsRecipe):
|
||||
publisher = 'AutomatiseringGids'
|
||||
category = 'Nieuws, IT, Nederlandstalig'
|
||||
simultaneous_downloads = 5
|
||||
#delay = 1
|
||||
timefmt = ' [%A, %d %B, %Y]'
|
||||
#timefmt = ''
|
||||
timefmt = ' [%a, %d %B, %Y]'
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
remove_empty_feeds = True
|
||||
publication_type = 'newspaper'
|
||||
encoding = 'utf-8'
|
||||
cover_url = 'http://www.automatiseringgids.nl/siteimg/header_logo.gif'
|
||||
keep_only_tags = [dict(id=['content'])]
|
||||
extra_css = '.artikelheader {font-size:0.8em; color: #666;} .artikelintro {font-weight:bold} div.imgArticle {float: right; margin: 0 0em 1em 1em; display: block; position: relative; } \
|
||||
h2 { margin: 0 0 0.5em; min-height: 30px; font-size: 1.5em; letter-spacing: -0.2px; margin: 0 0 0.5em; color: black; font-weight: bold; line-height: 1.2em; padding: 4px 3px 0; }'
|
||||
cover_url = 'http://www.automatiseringgids.nl/binaries/content/gallery/ag/marketing/ag-avatar-100x50.jpg'
|
||||
keep_only_tags = [dict(name='div', attrs={'class':['content']})]
|
||||
|
||||
|
||||
|
||||
remove_tags = [dict(name='div', attrs={'id':['loginbox','reactiecollapsible','reactiebox']}),
|
||||
dict(name='div', attrs={'class':['column_a','column_c','bannerfullsize','reactieheader','reactiecollapsible','formulier','artikel_headeroptions']}),
|
||||
dict(name='ul', attrs={'class':['highlightlist']}),
|
||||
dict(name='input', attrs={'type':['button']}),
|
||||
dict(name='div', attrs={'style':['display:block; width:428px; height:30px; float:left;']}),
|
||||
]
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'(<h3>Reacties</h3>|<h2>Zie ook:</h2>|<div style=".*</div>|<a[^>]*>|</a>)', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: ''),
|
||||
|
@ -110,8 +110,10 @@ class BrandEins(BasicNewsRecipe):
|
||||
selected_issue = issue_map[selected_issue_key]
|
||||
url = selected_issue.get('href', False)
|
||||
# Get the title for the magazine - build it out of the title of the cover - take the issue and year;
|
||||
self.title = "brand eins " + selected_issue_key[4:] + "/" + selected_issue_key[0:4]
|
||||
# self.title = "brand eins " + selected_issue_key[4:] + "/" + selected_issue_key[0:4]
|
||||
# Get the alternative title for the magazine - build it out of the title of the cover - without the issue and year;
|
||||
url = 'http://brandeins.de/'+url
|
||||
self.timefmt = ' ' + selected_issue_key[4:] + '/' + selected_issue_key[:4]
|
||||
|
||||
# url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
|
||||
titles_and_articles = self.brand_eins_parse_issue(url)
|
||||
@ -163,4 +165,3 @@ class BrandEins(BasicNewsRecipe):
|
||||
current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
|
||||
titles_and_articles.append([chapter_title, current_articles])
|
||||
return titles_and_articles
|
||||
|
||||
|
@ -9,9 +9,10 @@ class CGM(BasicNewsRecipe):
|
||||
category = 'music'
|
||||
language = 'pl'
|
||||
use_embedded_content = False
|
||||
remove_empty_feeds= True
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheers=True
|
||||
extra_css = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;}'
|
||||
extra_css = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;} h2 {color:black;}'
|
||||
remove_tags_before=dict(id='mainContent')
|
||||
remove_tags_after=dict(name='div', attrs={'class':'fbContainer'})
|
||||
remove_tags=[dict(name='div', attrs={'class':'fbContainer'}),
|
||||
@ -22,10 +23,12 @@ class CGM(BasicNewsRecipe):
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
ad=soup.findAll('img')
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
ad=soup.findAll('a')
|
||||
for r in ad:
|
||||
if '/_vault/_article_photos/5841.jpg' in r['src'] or '_vault/_article_photos/5807.jpg' in r['src'] or 'article_photos/5841.jpg' in r['src'] or 'article_photos/5825.jpg' in r['src'] or '_article_photos/5920.jpg' in r['src'] or '_article_photos/5919.jpg' in r['src'] or '_article_photos/5918.jpg' in r['src'] or '_article_photos/5914.jpg' in r['src'] or '_article_photos/5911.jpg' in r['src'] or '_article_photos/5923.jpg' in r['src'] or '_article_photos/5921.jpg' in r['src']:
|
||||
ad[ad.index(r)].extract()
|
||||
if 'http://www.hustla.pl' in r['href']:
|
||||
r.extract()
|
||||
gallery=soup.find('div', attrs={'class':'galleryFlash'})
|
||||
if gallery:
|
||||
img=gallery.find('embed')
|
||||
|
@ -8,21 +8,25 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class ChicagoTribune(BasicNewsRecipe):
|
||||
|
||||
title = 'Chicago Tribune'
|
||||
__author__ = 'Kovid Goyal and Sujata Raman'
|
||||
__author__ = 'Kovid Goyal and Sujata Raman, a.peter'
|
||||
description = 'Politics, local and business news from Chicago'
|
||||
language = 'en'
|
||||
language = 'en'
|
||||
version = 2
|
||||
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
recursions = 1
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':["story","entry-asset asset hentry"]}),
|
||||
dict(name='div', attrs={'id':["pagebody","story","maincontentcontainer"]}),
|
||||
]
|
||||
remove_tags_after = [ {'class':['photo_article',]} ]
|
||||
remove_tags_after = [{'class':['photo_article',]}]
|
||||
|
||||
remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer"]},
|
||||
{'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent"]},
|
||||
match_regexps = [r'page=[0-9]+']
|
||||
|
||||
remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer",'gallery-subcontent','subFooter']},
|
||||
{'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent",'nextgen-share-tools','outbrainTools', 'google-ad-story-bottom']},
|
||||
dict(name='font',attrs={'id':["cr-other-headlines"]})]
|
||||
extra_css = '''
|
||||
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
|
||||
@ -37,7 +41,7 @@ class ChicagoTribune(BasicNewsRecipe):
|
||||
.maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
|
||||
.story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
|
||||
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
|
||||
'''
|
||||
'''
|
||||
feeds = [
|
||||
('Latest news', 'http://feeds.chicagotribune.com/chicagotribune/news/'),
|
||||
('Local news', 'http://feeds.chicagotribune.com/chicagotribune/news/local/'),
|
||||
@ -76,8 +80,12 @@ class ChicagoTribune(BasicNewsRecipe):
|
||||
print article.get('feedburner_origlink', article.get('guid', article.get('link')))
|
||||
return article.get('feedburner_origlink', article.get('guid', article.get('link')))
|
||||
|
||||
|
||||
def postprocess_html(self, soup, first_fetch):
|
||||
# Remove the navigation bar. It was kept until now to be able to follow
|
||||
# the links to further pages. But now we don't need them anymore.
|
||||
for nav in soup.findAll(attrs={'class':['toppaginate','article-nav clearfix']}):
|
||||
nav.extract()
|
||||
|
||||
for t in soup.findAll(['table', 'tr', 'td']):
|
||||
t.name = 'div'
|
||||
|
||||
@ -88,4 +96,3 @@ class ChicagoTribune(BasicNewsRecipe):
|
||||
|
||||
return soup
|
||||
|
||||
|
||||
|
@ -2,6 +2,11 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
Changelog:
|
||||
2011-09-24
|
||||
Changed cover (drMerry)
|
||||
2011-10-13
|
||||
Updated Cover (drMerry)
|
||||
news.cnet.com
|
||||
'''
|
||||
|
||||
@ -9,7 +14,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class CnetNews(BasicNewsRecipe):
|
||||
title = 'CNET News'
|
||||
__author__ = 'Darko Miletic'
|
||||
__author__ = 'Darko Miletic updated by DrMerry.'
|
||||
description = 'Tech news and business reports by CNET News. Focused on information technology, core topics include computers, hardware, software, networking, and Internet media.'
|
||||
publisher = 'CNET'
|
||||
category = 'news, IT, USA'
|
||||
@ -19,7 +24,7 @@ class CnetNews(BasicNewsRecipe):
|
||||
encoding = 'cp1252'
|
||||
use_embedded_content = False
|
||||
language = 'en'
|
||||
|
||||
cover_url = 'http://reviews.cnet.com/i/ff/wp/logo_cnet.gif'
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
, 'tags' : category
|
||||
|
@ -8,11 +8,7 @@ class DallasNews(BasicNewsRecipe):
|
||||
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
remove_tags_before = dict(name='h1')
|
||||
keep_only_tags = {'class':lambda x: x and 'article' in x}
|
||||
remove_tags = [
|
||||
{'class':['DMNSocialTools', 'article ', 'article first ', 'article premium']},
|
||||
]
|
||||
auto_cleanup = True
|
||||
|
||||
feeds = [
|
||||
('Local News',
|
||||
|
15
recipes/dark_horizons.recipe
Normal file
@ -0,0 +1,15 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
# Recipe for Dark Horizons: three Atom feeds (news, features, reviews),
# cleaned up via auto_cleanup.
class AdvancedUserRecipe1317580312(BasicNewsRecipe):
    title = u'Dark Horizons'
    language = 'en'
    __author__ = 'Jaded'
    description = 'News, images, video clips and reviews of current and upcoming blockbuster films. '
    category = 'movies, tv, news'
    oldest_article = 7
    max_articles_per_feed = 100
    cover_url = 'http://a4.sphotos.ak.fbcdn.net/hphotos-ak-ash2/164168_148419801879765_148410081880737_225532_464073_n.jpg'
    masthead_url = 'http://www.darkhorizons.com/graphics/2/logo_print.png'
    auto_cleanup = True

    # (feed title, feed URL) pairs.
    feeds = [
        (u'News', u'http://www.darkhorizons.com/feeds/news.atom'),
        (u'Features', u'http://www.darkhorizons.com/feeds/features.atom'),
        (u'Reviews', u'http://www.darkhorizons.com/feeds/reviews.atom'),
    ]
|
62
recipes/defensenews.recipe
Normal file
@ -0,0 +1,62 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.defensenews.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
# Recipe for www.defensenews.com: downloads the regional and per-branch RSS
# feeds and strips toolbars, related-links boxes and side columns.
class DefenseNews(BasicNewsRecipe):
    title                 = 'Defense News'
    __author__            = 'Darko Miletic'
    description           = 'Find late-breaking defense news from the leading defense news weekly'
    publisher             = 'Gannett Government Media Corporation'
    category              = 'defense news, defence news, defense, defence, defence budget, defence policy'
    oldest_article        = 31
    max_articles_per_feed = 200
    no_stylesheets        = True
    encoding              = 'utf8'
    use_embedded_content  = False
    language              = 'en'
    remove_empty_feeds    = True
    publication_type      = 'newspaper'
    masthead_url          = 'http://www.defensenews.com/images/logo_defensenews2.jpg'
    extra_css             = """
                               body{font-family: Arial,Helvetica,sans-serif }
                               img{margin-bottom: 0.4em; display:block}
                               .info{font-size: small; color: gray}
                            """

    # The key is 'comments' (not 'comment'), matching the other recipes that
    # pass their description through conversion_options.
    conversion_options = {
                          'comments'  : description
                        , 'tags'      : category
                        , 'publisher' : publisher
                        , 'language'  : language
                        }

    remove_tags = [
                     dict(name=['meta','link'])
                    ,dict(attrs={'class':['toolbar','related','left','right']})
                  ]
    # Plain dict(attrs=...) specs; the former chained assignment
    # (`remove_tags_before = attrs = {...}`) also created an unintended
    # `attrs` class attribute.
    remove_tags_before = dict(attrs={'class':'storyWrp'})
    remove_tags_after  = dict(attrs={'class':'middle'})

    remove_attributes = ['lang']

    # (feed title, feed URL) pairs.
    feeds = [
              (u'Europe'              , u'http://www.defensenews.com/rss/eur/')
             ,(u'Americas'            , u'http://www.defensenews.com/rss/ame/')
             ,(u'Asia & Pacific rim'  , u'http://www.defensenews.com/rss/asi/')
             ,(u'Middle east & Africa', u'http://www.defensenews.com/rss/mid/')
             ,(u'Air'                 , u'http://www.defensenews.com/rss/air/')
             ,(u'Land'                , u'http://www.defensenews.com/rss/lan/')
             ,(u'Naval'               , u'http://www.defensenews.com/rss/sea/')
            ]

    def preprocess_html(self, soup):
        """Drop inline styles and give every <img> an 'alt' attribute.

        Returns the same soup object after modifying it in place.
        """
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll('img'):
            # Only add a placeholder when the attribute is missing entirely.
            if not item.has_key('alt'):
                item['alt'] = 'image'
        return soup
|
21
recipes/den_of_geek.recipe
Normal file
@ -0,0 +1,21 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
# Recipe for Den of Geek: five RSS feeds, using the content embedded in the
# feeds (use_embedded_content = True).
class AdvancedUserRecipe1316944753(BasicNewsRecipe):
    title = u'Den of Geek'
    __author__ = 'Jaded'
    # `language` was assigned twice with the same value; one assignment suffices.
    language = 'en'
    description = 'From science fiction enthusiasts through to gaming fanatics, Den of Geek has become the one-stop UK website for people genuinely passionate about their entertainment media. Den of Geek covers popular culture but always with an edgy, UK centric slant that sets it apart from the crowd.'
    category = 'Movies, TV, Games, Comics, Cult, News, Reviews'

    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    no_stylesheets = True
    use_embedded_content = True
    publication_type = 'newsportal'
    masthead_url = 'http://www.denofgeek.com/siteimage/scale/0/0/logo.gif'
    cover_url = 'http://a5.sphotos.ak.fbcdn.net/hphotos-ak-snc6/166479_180131695357862_139191826118516_354818_4993703_n.jpg'

    # (feed title, feed URL) pairs.
    feeds = [
        (u'Movies', u'http://www.denofgeek.com/movies/rss/'),
        (u'TV', u'http://www.denofgeek.com/television/rss/'),
        (u'Comics & Books', u'http://www.denofgeek.com/comics/rss/'),
        (u'Games', u'http://www.denofgeek.com/games/rss/'),
        (u'DVD/Blu-ray', u'http://www.denofgeek.com/Reviews/rss/'),
    ]
|
@ -22,6 +22,10 @@ class Descopera(BasicNewsRecipe):
|
||||
category = 'Ziare,Reviste,Descopera'
|
||||
encoding = 'utf-8'
|
||||
cover_url = 'http://www.descopera.ro/images/header_images/logo.gif'
|
||||
use_embedded_content = False
|
||||
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
@ -30,28 +34,6 @@ class Descopera(BasicNewsRecipe):
|
||||
,'publisher' : publisher
|
||||
}
|
||||
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='h1', attrs={'style':'font-family: Arial,Helvetica,sans-serif; font-size: 18px; color: rgb(51, 51, 51); font-weight: bold; margin: 10px 0pt; clear: both; float: left;width: 610px;'})
|
||||
,dict(name='div', attrs={'style':'margin-right: 15px; margin-bottom: 15px; float: left;'})
|
||||
, dict(name='p', attrs={'id':'itemDescription'})
|
||||
,dict(name='div', attrs={'id':'itemBody'})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':['tools']})
|
||||
, dict(name='div', attrs={'class':['share']})
|
||||
, dict(name='div', attrs={'class':['category']})
|
||||
, dict(name='div', attrs={'id':['comments']})
|
||||
]
|
||||
|
||||
remove_tags_after = [
|
||||
dict(name='div', attrs={'id':'comments'})
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Feeds', u'http://www.descopera.ro/rss')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
return self.adeify_images(soup)
|
||||
|
11
recipes/diario_la_republica.recipe
Normal file
@ -0,0 +1,11 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
# Recipe for Diario La Republica: a single RSS feed, language 'es_CO',
# cleaned up via auto_cleanup.
class AdvancedUserRecipe1317341449(BasicNewsRecipe):
    title = u'Diario La Republica'
    __author__ = 'CAVALENCIA'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    language = 'es_CO'

    # (feed title, feed URL) pairs.
    feeds = [
        (u'Diario La Republica', u'http://www.larepublica.com.co/rss/larepublica.xml'),
    ]
|
@ -2,6 +2,7 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
http://www.dilbert.com
|
||||
DrMerry added cover Image 2011-11-12
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
@ -9,7 +10,7 @@ import re
|
||||
|
||||
class DilbertBig(BasicNewsRecipe):
|
||||
title = 'Dilbert'
|
||||
__author__ = 'Darko Miletic and Starson17'
|
||||
__author__ = 'Darko Miletic and Starson17 contribution of DrMerry'
|
||||
description = 'Dilbert'
|
||||
reverse_article_order = True
|
||||
oldest_article = 15
|
||||
@ -20,6 +21,7 @@ class DilbertBig(BasicNewsRecipe):
|
||||
publisher = 'UNITED FEATURE SYNDICATE, INC.'
|
||||
category = 'comic'
|
||||
language = 'en'
|
||||
cover_url = 'http://dilbert.com/mobile/mobile/dilbert.app.icon.png'
|
||||
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
|
@ -1,6 +1,3 @@
|
||||
'''
|
||||
dnaindia.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
@ -12,6 +9,10 @@ class DNAIndia(BasicNewsRecipe):
|
||||
language = 'en_IN'
|
||||
|
||||
encoding = 'cp1252'
|
||||
use_embedded_content = False
|
||||
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
|
||||
feeds = [
|
||||
('Top News', 'http://www.dnaindia.com/syndication/rss_topnews.xml'),
|
||||
@ -22,15 +23,10 @@ class DNAIndia(BasicNewsRecipe):
|
||||
('World', 'http://www.dnaindia.com/syndication/rss,catid-9.xml'),
|
||||
('Money', 'http://www.dnaindia.com/syndication/rss,catid-4.xml'),
|
||||
('Sports', 'http://www.dnaindia.com/syndication/rss,catid-6.xml'),
|
||||
('After Hours', 'http://www.dnaindia.com/syndication/rss,catid-7.xml'),
|
||||
('Digital Life', 'http://www.dnaindia.com/syndication/rss,catid-1089741.xml'),
|
||||
('After Hours', 'http://www.dnaindia.com/syndication/rss,catid-7.xml')
|
||||
]
|
||||
remove_tags = [{'id':['footer', 'lhs-col']}, {'class':['bottom', 'categoryHead',
|
||||
'article_tools']}]
|
||||
keep_only_tags = dict(id='middle-col')
|
||||
remove_tags_after=[dict(attrs={'id':'story'})]
|
||||
remove_attributes=['style']
|
||||
no_stylesheets = True
|
||||
|
||||
|
||||
|
||||
def print_version(self, url):
|
||||
match = re.search(r'newsid=(\d+)', url)
|
||||
|
@ -22,8 +22,6 @@ class Economist(BasicNewsRecipe):
|
||||
' perspective. Best downloaded on Friday mornings (GMT)')
|
||||
extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
|
||||
oldest_article = 7.0
|
||||
cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg'
|
||||
#cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
|
||||
remove_tags = [
|
||||
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
|
||||
dict(attrs={'class':['dblClkTrk', 'ec-article-info',
|
||||
@ -56,6 +54,14 @@ class Economist(BasicNewsRecipe):
|
||||
return br
|
||||
'''
|
||||
|
||||
def get_cover_url(self):
    """Return the full-size print cover URL for the current issue.

    Opens ``self.INDEX`` with the recipe's browser and takes the fifth
    '/'-separated component of the URL the browser ends up on as the issue
    identifier (presumably a date such as 2011-10-15 -- confirm against the
    live site). The identifier, with its dashes removed, is inserted into
    the cover image URL.
    """
    br = self.browser
    br.open(self.INDEX)
    issue = br.geturl().split('/')[4]
    self.log('Fetching cover for issue: %s'%issue)
    # replace() instead of translate(None, '-'): the two-argument translate
    # form exists only on Python 2 byte strings and raises TypeError for
    # unicode strings.
    issue_id = issue.replace('-', '')
    cover_url = "http://media.economist.com/sites/default/files/imagecache/print-cover-full/print-covers/%s_CNA400.jpg" % issue_id
    return cover_url
|
||||
|
||||
def parse_index(self):
|
||||
return self.economist_parse_index()
|
||||
|
||||
|
@ -22,8 +22,6 @@ class Economist(BasicNewsRecipe):
|
||||
' perspective. Best downloaded on Friday mornings (GMT)')
|
||||
extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
|
||||
oldest_article = 7.0
|
||||
cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg'
|
||||
#cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
|
||||
remove_tags = [
|
||||
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
|
||||
dict(attrs={'class':['dblClkTrk', 'ec-article-info',
|
||||
@ -40,6 +38,14 @@ class Economist(BasicNewsRecipe):
|
||||
# downloaded with connection reset by peer (104) errors.
|
||||
delay = 1
|
||||
|
||||
def get_cover_url(self):
    """Return the URL of the full-size cover for the current issue.

    Opens ``self.INDEX`` with the recipe's browser and takes the fifth
    '/'-separated component of the URL the browser ends up on as the
    issue identifier. The hyphens are removed from that identifier before
    it is substituted into the cover image URL.
    """
    br = self.browser
    br.open(self.INDEX)
    issue = br.geturl().split('/')[4]
    self.log('Fetching cover for issue: %s'%issue)
    # str.translate(None, '-') only exists for Python 2 byte strings and
    # raises TypeError for unicode; replace() behaves the same for both.
    issue_id = issue.replace('-', '')
    cover_url = "http://media.economist.com/sites/default/files/imagecache/print-cover-full/print-covers/%s_CNA400.jpg" % issue_id
    return cover_url
|
||||
|
||||
|
||||
def parse_index(self):
|
||||
try:
|
||||
|
23
recipes/eioba.recipe
Normal file
@ -0,0 +1,23 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class eioba(BasicNewsRecipe):
    """Feed-driven recipe for eioba (language ``pl``).

    Only the elements with id ``ctl0_body_Topic`` and ``articleContent``
    are kept from each downloaded page.
    """

    title = u'eioba'
    __author__ = 'fenuks'
    cover_url = 'http://www.eioba.org/lay/logo_pl_v3.png'
    language = 'pl'
    oldest_article = 7
    remove_empty_feeds = True
    max_articles_per_feed = 100
    extra_css = '#ctl0_body_Topic {font-weight: bold; font-size:30px;}'
    keep_only_tags = [dict(id=['ctl0_body_Topic', 'articleContent'])]

    # Feed titles are shown to the reader; two of them were misspelled
    # ('Wszyskie', 'Rożne') and are corrected here.
    feeds = [
        (u'Wszystkie kategorie', u'http://feeds.eioba.pl/eioba-pl-top'),
        (u'Technologia', u'http://www.eioba.pl/feed/categories/1.xml'),
        (u'Nauka', u'http://www.eioba.pl/feed/categories/12.xml'),
        (u'Finanse', u'http://www.eioba.pl/feed/categories/7.xml'),
        (u'Życie', u'http://www.eioba.pl/feed/categories/5.xml'),
        (u'Zainteresowania', u'http://www.eioba.pl/feed/categories/420.xml'),
        (u'Społeczeństwo', u'http://www.eioba.pl/feed/categories/8.xml'),
        (u'Rozrywka', u'http://www.eioba.pl/feed/categories/10.xml'),
        (u'Różne', u'http://www.eioba.pl/feed/categories/9.xml'),
    ]
|
@ -1,19 +1,21 @@
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1314326622(BasicNewsRecipe):
    """Feed-driven recipe for the English RSS feeds of Ekantipur."""

    title = u'Ekantipur'
    __author__ = 'fab4.ilam'
    language = 'en_NP'
    oldest_article = 7
    max_articles_per_feed = 25
    masthead_url = 'http://kantipur.com.np/images/ekantipur_01.jpg'
    remove_empty_feeds = True

    # Everything before #main-content and after #view-comments is dropped.
    remove_tags_before = {'id': 'main-content'}
    remove_tags_after = {'id': 'view-comments'}

    # Page furniture removed from what remains.
    remove_tags = [
        {'attrs': {'class': ['ratings', 'news-tool', 'comment',
                             'post-ur-comment', 'asideBox', 'commentsbox',
                             'related-sidebar-row related-news']}},
        {'id': ['sidebar', 'news-detail-img', 'footer-wrapper']},
        {'name': ['script']},
    ]

    feeds = [
        (u'Top Stories', u'http://www.ekantipur.com/en/rss/top-stories/'),
        (u'National', u'http://www.ekantipur.com/en/rss/national/1'),
        (u'Capital', u'http://www.ekantipur.com/en/rss/capital/7'),
        (u'Business', u'http://www.ekantipur.com/en/rss/business/3'),
        (u'World', u'http://www.ekantipur.com/en/rss/world/5'),
        (u'Sports', u'http://www.ekantipur.com/en/rss/sports/4'),
        (u'Mixed Bag', u'http://www.ekantipur.com/en/rss/mixed-bag/14'),
        (u'Health & Living', u'http://www.ekantipur.com/en/rss/health-and-living/19'),
        (u'Entertainment', u'http://www.ekantipur.com/en/rss/entertainment/6'),
    ]
|
||||
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1314326622(BasicNewsRecipe):
    """Feed-driven recipe for the English RSS feeds of Ekantipur."""

    title = u'Ekantipur'
    __author__ = 'Manish Bhattarai'
    description = 'News from the No.1 News Portal In Nepal'
    language = 'en_NP'
    oldest_article = 7
    max_articles_per_feed = 25
    masthead_url = 'http://kantipur.com.np/images/ekantipur_01.jpg'
    remove_empty_feeds = True

    # Everything before #main-content and after #view-comments is dropped.
    remove_tags_before = {'id': 'main-content'}
    remove_tags_after = {'id': 'view-comments'}

    # Page furniture removed from what remains.
    remove_tags = [
        {'attrs': {'class': ['ratings', 'news-tool', 'comment',
                             'post-ur-comment', 'asideBox', 'commentsbox',
                             'related-sidebar-row related-news']}},
        {'id': ['sidebar', 'news-detail-img', 'footer-wrapper']},
        {'name': ['script']},
    ]

    feeds = [
        (u'Top Stories', u'http://www.ekantipur.com/en/rss/top-stories/'),
        (u'National', u'http://www.ekantipur.com/en/rss/national/1'),
        (u'Capital', u'http://www.ekantipur.com/en/rss/capital/7'),
        (u'Business', u'http://www.ekantipur.com/en/rss/business/3'),
        (u'World', u'http://www.ekantipur.com/en/rss/world/5'),
        (u'Sports', u'http://www.ekantipur.com/en/rss/sports/4'),
        (u'Mixed Bag', u'http://www.ekantipur.com/en/rss/mixed-bag/14'),
        (u'Health & Living', u'http://www.ekantipur.com/en/rss/health-and-living/19'),
        (u'Entertainment', u'http://www.ekantipur.com/en/rss/entertainment/6'),
    ]
|
||||
|
||||
|
||||
|
@ -2,12 +2,10 @@
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
|
||||
|
||||
class AdvancedUserRecipe1311790237(BasicNewsRecipe):
|
||||
title = u'Periódico El Colombiano'
|
||||
language = 'es_CO'
|
||||
__author__ = 'BIGO-CAVA'
|
||||
language = 'es_CO'
|
||||
cover_url = 'http://www.elcolombiano.com/images/logoElColombiano348x46.gif'
|
||||
remove_tags_before = dict(id='contenidoArt')
|
||||
remove_tags_after = dict(id='enviaTips')
|
||||
|
54
recipes/el_espectador.recipe
Normal file
@ -0,0 +1,54 @@
|
||||
# coding=utf-8
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class ColombiaElEspectador(BasicNewsRecipe):
    """Feed-driven recipe for the newspaper El Espectador (language ``es_CO``)."""

    title = u'Periódico el Espectador'
    __author__ = 'BIGO-CAVA'
    cover_url = 'http://www.elespectador.com/sites/elespectador.com/themes/elespectador/images/logo.gif'
    language = 'es_CO'

    # The article is taken from #content up to the pagination block.
    remove_tags_before = dict(id='content')
    remove_tags_after = [dict(name='div', attrs={'class':'paginacion'})]
    remove_tags = [dict(name='div', attrs={'class':'herramientas_nota'}),
                   dict(name='div', attrs={'class':'relpauta'}),
                   dict(name='div', attrs={'class':'recursosrelacionados'}),
                   dict(name='div', attrs={'class':'nav_negocios'})]

    oldest_article = 2
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    remove_empty_feeds = True
    masthead_url = 'http://www.elespectador.com/sites/elespectador.com/themes/elespectador/images/logo.gif'
    publication_type = 'newspaper'

    extra_css = """
    p{text-align: justify; font-size: 100%}
    body{ text-align: left; font-size:100% }
    h1{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
    h3{font-family: sans-serif; font-size:100%; font-style: italic; text-align: justify; }
    """

    # The first entry previously had a trailing space in its title and a
    # leading space inside its URL; both are removed.
    feeds = [(u'Política', u'http://www.elespectador.com/noticias/politica/feed'),
             (u'Judicial', u'http://www.elespectador.com/noticias/judicial/feed'),
             (u'Paz', u'http://www.elespectador.com/noticias/paz/feed'),
             (u'Economía', u'http://www.elespectador.com/economia/feed'),
             (u'Soy Periodista', u'http://www.elespectador.com/noticias/soyperiodista/feed'),
             (u'Investigación', u'http://www.elespectador.com/noticias/investigacion/feed'),
             (u'Educación', u'http://www.elespectador.com/noticias/educacion/feed'),
             (u'Salud', u'http://www.elespectador.com/noticias/salud/feed'),
             (u'El Mundo', u'http://www.elespectador.com/noticias/elmundo/feed'),
             (u'Nacional', u'http://www.elespectador.com/noticias/nacional/feed'),
             (u'Bogotá', u'http://www.elespectador.com/noticias/bogota/feed'),
             (u'Deportes', u'http://www.elespectador.com/deportes/feed'),
             (u'Tecnología', u'http://www.elespectador.com/tecnologia/feed'),
             (u'Actualidad', u'http://www.elespectador.com/noticias/actualidad/feed'),
             (u'Opinión', u'http://www.elespectador.com/opinion/feed'),
             (u'Editorial', u'http://www.elespectador.com/opinion/editorial/feed')]
|
50
recipes/el_mundo_co.recipe
Normal file
@ -0,0 +1,50 @@
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class ColombiaElMundo02(BasicNewsRecipe):
    """Feed-driven recipe for the newspaper El Mundo (language ``es_CO``)."""

    title = u'Periódico El Mundo'
    __author__ = 'BIGO-CAVA'
    language = 'es_CO'
    cover_url = 'http://www.elmundo.com/portal/img/logo_mundo2.png'

    # The article is taken from #miga_pan up to the options box.
    remove_tags_before = dict(id='miga_pan')
    remove_tags_after = [dict(name='div', attrs={'class':'cuadro_opciones_new1'})]
    remove_tags = [dict(name='div', attrs={'class':'ruta'}),
                   dict(name='div', attrs={'class':'buscador'}),
                   dict(name='div', attrs={'class':'iconos'}),
                   dict(name='div', attrs={'class':'otros_iconos'}),
                   dict(name='div', attrs={'class':'cuadro_opciones_new1'}),
                   dict(name='div', attrs={'class':'otras_noticias'}),
                   dict(name='div', attrs={'class':'notas_relacionadas'}),
                   dict(name='div', attrs={'id':'lateral_2'})]

    oldest_article = 2
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    remove_empty_feeds = True
    masthead_url = 'http://www.elmundo.com/portal/img/logo_mundo2.png'
    publication_type = 'newspaper'

    extra_css = """
    p{text-align: justify; font-size: 100%}
    body{ text-align: left; font-size:100% }
    h1{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
    h3{font-family: sans-serif; font-size:100%; font-style: italic; text-align: justify; }
    """

    # Titles are shown to the reader: trailing spaces were stripped from
    # 'Política' and 'Nacional', and 'Mobilidad' now matches the spelling
    # used in its feed URL.
    feeds = [(u'Opinión', u'http://www.elmundo.com/images/rss/opinion.xml'),
             (u'Economía', u'http://www.elmundo.com/images/rss/noticias_economia.xml'),
             (u'Deportes', u'http://www.elmundo.com/images/rss/deportes.xml'),
             (u'Política', u'http://www.elmundo.com/images/rss/noticias_politica.xml'),
             (u'Antioquia', u'http://www.elmundo.com/images/rss/noticias_antioquia.xml'),
             (u'Nacional', u'http://www.elmundo.com/images/rss/noticias_nacional.xml'),
             (u'Internacional', u'http://www.elmundo.com/images/rss/noticias_internacional.xml'),
             (u'Servicios Públicos', u'http://www.elmundo.com/images/rss/noticias_servicios_publicos.xml'),
             (u'Infraestructura', u'http://www.elmundo.com/images/rss/noticias_infraestructura.xml'),
             (u'Movilidad', u'http://www.elmundo.com/images/rss/noticias_movilidad.xml'),
             (u'Derechos Humanos', u'http://www.elmundo.com/images/rss/noticias_derechos_humanos.xml'),
             (u'Vida', u'http://www.elmundo.com/images/rss/vida.xml'),
             (u'Cultura', u'http://www.elmundo.com/images/rss/cultura.xml')]
|
@ -2,18 +2,17 @@
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
|
||||
|
||||
|
||||
class ColombiaElTiempo02(BasicNewsRecipe):
|
||||
title = u'Periódico el Tiempo'
|
||||
language = 'es_CO'
|
||||
__author__ = 'BIGO-CAVA'
|
||||
language = 'es_CO'
|
||||
cover_url = 'http://www.eltiempo.com/media/css/images/logo_footer.png'
|
||||
remove_tags_before = dict(id='fb-root')
|
||||
#remove_tags_before = dict(id='fb-root')
|
||||
remove_tags_before = dict(id='contenidoArt')
|
||||
remove_tags_after = [dict(name='div', attrs={'class':'modulo reporte'})]
|
||||
keep_only_tags = [dict(name='div', id='contenidoArt')]
|
||||
remove_tags = [dict(name='div', attrs={'class':'social-media'}),
|
||||
dict(name='div', attrs={'class':'recomend-art'}),
|
||||
dict(name='div', attrs={'class':'caja-facebook'}),
|
||||
dict(name='div', attrs={'class':'caja-twitter'}),
|
||||
dict(name='div', attrs={'class':'caja-buzz'}),
|
||||
|
@ -19,45 +19,20 @@ class FazNet(BasicNewsRecipe):
|
||||
no_stylesheets = True
|
||||
encoding = 'utf-8'
|
||||
remove_javascript = True
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment', description
|
||||
, '--category', category
|
||||
, '--publisher', publisher
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'Article'})]
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['object','link','embed','base'])
|
||||
,dict(name='div',
|
||||
attrs={'class':['LinkBoxModulSmall','ModulVerlagsInfo',
|
||||
'ArtikelServices', 'ModulLesermeinungenFooter',
|
||||
'ModulArtikelServices', 'BoxTool Aufklappen_Grau',
|
||||
'SocialMediaUnten', ]}),
|
||||
dict(id=['KurzLinkMenu', 'ArtikelServicesMenu']),
|
||||
]
|
||||
keep_only_tags = [{'class':'FAZArtikelEinleitung'},
|
||||
{'id':'ArtikelTabContent_0'}]
|
||||
|
||||
feeds = [
|
||||
('FAZ.NET Aktuell', 'http://www.faz.net/s/RubF3CE08B362D244869BE7984590CB6AC1/Tpl~Epartner~SRss_.xml'),
|
||||
('Politik', 'http://www.faz.net/s/RubA24ECD630CAE40E483841DB7D16F4211/Tpl~Epartner~SRss_.xml'),
|
||||
('Wirtschaft', 'http://www.faz.net/s/RubC9401175958F4DE28E143E68888825F6/Tpl~Epartner~SRss_.xml'),
|
||||
('Feuilleton', 'http://www.faz.net/s/RubCC21B04EE95145B3AC877C874FB1B611/Tpl~Epartner~SRss_.xml'),
|
||||
('Sport', 'http://www.faz.net/s/Rub9F27A221597D4C39A82856B0FE79F051/Tpl~Epartner~SRss_.xml'),
|
||||
('Gesellschaft', 'http://www.faz.net/s/Rub02DBAA63F9EB43CEB421272A670A685C/Tpl~Epartner~SRss_.xml'),
|
||||
('Finanzen', 'http://www.faz.net/s/Rub4B891837ECD14082816D9E088A2D7CB4/Tpl~Epartner~SRss_.xml'),
|
||||
('Wissen', 'http://www.faz.net/s/Rub7F4BEE0E0C39429A8565089709B70C44/Tpl~Epartner~SRss_.xml'),
|
||||
('Reise', 'http://www.faz.net/s/RubE2FB5CA667054BDEA70FB3BC45F8D91C/Tpl~Epartner~SRss_.xml'),
|
||||
('Technik & Motor', 'http://www.faz.net/s/Rub01E4D53776494844A85FDF23F5707AD8/Tpl~Epartner~SRss_.xml'),
|
||||
('Beruf & Chance', 'http://www.faz.net/s/RubB1E10A8367E8446897468EDAA6EA0504/Tpl~Epartner~SRss_.xml')
|
||||
('FAZ.NET Aktuell', 'http://www.faz.net/aktuell/?rssview=1'),
|
||||
('Politik', 'http://www.faz.net/aktuell/politik/?rssview=1'),
|
||||
('Wirtschaft', 'http://www.faz.net/aktuell/wirtschaft/?rssview=1'),
|
||||
('Feuilleton', 'http://www.faz.net/aktuell/feuilleton/?rssview=1'),
|
||||
('Sport', 'http://www.faz.net/aktuell/sport/?rssview=1'),
|
||||
('Gesellschaft', 'http://www.faz.net/aktuell/gesellschaft/?rssview=1'),
|
||||
('Finanzen', 'http://www.faz.net/aktuell/finanzen/?rssview=1'),
|
||||
('Technik & Motor', 'http://www.faz.net/aktuell/technik-motor/?rssview=1'),
|
||||
('Wissen', 'http://www.faz.net/aktuell/wissen/?rssview=1'),
|
||||
('Reise', 'http://www.faz.net/aktuell/reise/?rssview=1'),
|
||||
('Beruf & Chance', 'http://www.faz.net/aktuell/beruf-chance/?rssview=1')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
    """Declare UTF-8 in the document head and strip inline presentation.

    Inserts a content-type meta declaration as the first child of
    ``<head>``, deletes the ``onload`` attribute of ``<body>`` and removes
    the ``style`` attribute from every element that has one. Returns the
    same soup object.
    """
    charset_meta = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
    soup.head.insert(0, charset_meta)
    del soup.body['onload']
    styled_nodes = soup.findAll(style=True)
    for node in styled_nodes:
        del node['style']
    return soup
|
||||
|
66
recipes/focus_pl.recipe
Normal file
@ -0,0 +1,66 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Focus_pl(BasicNewsRecipe):
    """Recipe for Focus.pl (language ``pl``).

    Articles come from RSS feeds. Each article is fetched through its
    ``do-druku/1/`` variant, multi-page articles are stitched together and
    the cover is looked up on the magazine page.
    """

    title = u'Focus.pl'
    oldest_article = 15
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    language = 'pl'
    description = 'polish scientific monthly magazine'
    category = 'magazine'
    cover_url = ''
    remove_empty_feeds = True
    no_stylesheets = True
    remove_tags_before = dict(name='div', attrs={'class':'h2 h2f'})
    remove_tags_after = dict(name='div', attrs={'class':'clear'})
    feeds = [
        (u'Wszystkie kategorie', u'http://focus.pl.feedsportal.com/c/32992/f/532692/index.rss'),
        (u'Nauka', u'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
        (u'Historia', u'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
        (u'Cywilizacja', u'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
        (u'Sport', u'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
        (u'Technika', u'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
        (u'Przyroda', u'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
        (u'Technologie', u'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
        (u'Warto wiedzieć', u'http://focus.pl.feedsportal.com/c/32992/f/532700/index.rss'),
    ]

    def skip_ad_pages(self, soup):
        """Return the raw ``do-druku/1/`` page for the first link in *soup*.

        Returns None when the page contains no anchor carrying an ``href``.
        """
        # Requiring href avoids a KeyError on a bare <a name=...> anchor.
        tag = soup.find(name='a', href=True)
        if tag is None:
            return None
        return self.index_to_soup(tag['href'] + 'do-druku/1/', raw=True)

    def append_page(self, appendtag):
        """Append the follow-up pages of a multi-page article to *appendtag*.

        Pagination links live in ``div.arrows``; the link whose text
        contains 'Następne' leads to the next page. ``div.klik-nav``
        blocks are removed from every page that is merged.
        """
        tag = appendtag.find(name='div', attrs={'class':'arrows'})
        if tag is None or tag.a is None:
            return
        nexturl = 'http://www.focus.pl/' + tag.a['href']
        for rem in appendtag.findAll(name='div', attrs={'class':'klik-nav'}):
            rem.extract()
        while nexturl:
            soup2 = self.index_to_soup(nexturl)
            nexturl = None
            pagetext = soup2.find(name='div', attrs={'class':'txt'})
            if pagetext is None:
                # Nothing to merge and no pager to follow.
                break
            tag = pagetext.find(name='div', attrs={'class':'arrows'})
            if tag is not None:
                for r in tag.findAll(name='a'):
                    # r.string is None for links with nested markup.
                    if r.string and u'Następne' in r.string:
                        nexturl = 'http://www.focus.pl/' + r['href']
            for rem in pagetext.findAll(name='div', attrs={'class':'klik-nav'}):
                rem.extract()
            appendtag.insert(len(appendtag.contents), pagetext)

    def get_cover_url(self):
        """Return the cover URL found on the magazine page.

        Falls back to the current ``cover_url`` value when the expected
        ``div`` with class ``clr fl`` or its link is missing.
        """
        soup = self.index_to_soup('http://www.focus.pl/magazyn/')
        tag = soup.find(name='div', attrs={'class':'clr fl'})
        if tag is not None and tag.a is not None:
            self.cover_url = 'http://www.focus.pl/' + tag.a['href']
        return self.cover_url

    def preprocess_html(self, soup):
        """Merge any follow-up pages into the article body and return *soup*."""
        self.append_page(soup.body)
        return soup
|
96
recipes/folhadesaopaulo_sub.recipe
Normal file
@ -0,0 +1,96 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
import re
|
||||
|
||||
class FSP(BasicNewsRecipe):
    """Recipe for the printed edition of Folha de São Paulo.

    Logs in at the UOL login page when credentials are set, then builds
    the section list by scraping the links on the ``INDEX`` page.
    """

    title = u'Folha de S\xE3o Paulo'
    __author__ = 'fluzao'
    description = (u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).'
                   u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]')
    INDEX = 'http://www1.folha.uol.com.br/fsp/indices/'
    language = 'pt'
    no_stylesheets = True
    max_articles_per_feed = 40
    remove_javascript = True
    needs_subscription = True
    remove_tags_before = dict(name='b')
    remove_tags = [dict(name='td', attrs={'align':'center'})]
    remove_attributes = ['height','width']
    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'

    # fixes the problem with the section names
    section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada',
                    'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o',
                    'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade',
                    'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio'}

    # this solves the problem with truncated content in Kindle
    conversion_options = {'linearize_tables' : True}

    # this bit removes the footer where there are links for Proximo Texto, Texto Anterior,
    # Indice e Comunicar Erros
    preprocess_regexps = [(re.compile(r'<BR><BR>Texto Anterior:.*<!--/NOTICIA-->',
                                      re.DOTALL|re.IGNORECASE), lambda match: r''),
                          (re.compile(r'<BR><BR>Próximo Texto:.*<!--/NOTICIA-->',
                                      re.DOTALL|re.IGNORECASE), lambda match: r'')]

    def get_browser(self):
        """Return a browser, logged in to UOL when credentials are set."""
        # get_browser is an instance method; the unbound call needs self.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://acesso.uol.com.br/login.html')
            # The first form on the page is used as the login form.
            br.form = next(iter(br.forms()))
            br['user'] = self.username
            br['pass'] = self.password
            # NOTE(review): the response is read but never inspected, so a
            # rejected login goes unnoticed here.
            br.submit().read()
        return br

    def parse_index(self):
        """Scrape ``INDEX`` into a list of ``(section title, articles)``.

        ``<a name=...>`` anchors start a new section and ``<a href=...>``
        links whose target starts with ``/fsp`` become articles. The first
        section collected is dropped after its second link has been kept
        as the front page, which is re-inserted as the first feed. The
        cover image URL is derived from the first article of the next
        section and stored in ``self.cover_url``.
        """
        soup = self.index_to_soup(self.INDEX)
        feeds = []
        articles = []
        section_title = "Preambulo"
        for post in soup.findAll('a'):
            # if name=True => new section
            strpost = str(post)
            if strpost.startswith('<a name'):
                if articles:
                    feeds.append((section_title, articles))
                    self.log()
                    self.log('--> new section found, creating old section feed: ', section_title)
                section_title = post['name']
                if section_title in self.section_dict:
                    section_title = self.section_dict[section_title]
                articles = []
                self.log('--> new section title: ', section_title)
            if strpost.startswith('<a href'):
                url = post['href']
                if url.startswith('/fsp'):
                    url = 'http://www1.folha.uol.com.br'+url
                    title = self.tag_to_string(post)
                    self.log()
                    self.log('--> post: ', post)
                    self.log('--> url: ', url)
                    self.log('--> title: ', title)
                    articles.append({'title':title, 'url':url})
        if articles:
            feeds.append((section_title, articles))

        # The steps below index feeds[0][1][1] and, after the deletion, the
        # following section; bail out with what was collected instead of
        # raising IndexError when the page does not have that shape.
        if len(feeds) < 2 or len(feeds[0][1]) < 2:
            self.log('--> unexpected index layout, skipping front page and cover')
            return feeds

        # keeping the front page url
        minha_capa = feeds[0][1][1]['url']

        # removing the 'Preambulo' section
        del feeds[0]

        # creating the url for the cover image
        coverurl = feeds[0][1][0]['url']
        coverurl = coverurl.replace('/opiniao/fz', '/images/cp')
        coverurl = coverurl.replace('01.htm', '.jpg')
        self.cover_url = coverurl

        # inserting the cover page as the first article (nicer for kindle users)
        feeds.insert(0,(u'primeira p\xe1gina', [{'title':u'Primeira p\xe1gina' , 'url':minha_capa}]))
        return feeds
|
@ -16,7 +16,7 @@ class FTDe(BasicNewsRecipe):
|
||||
use_embedded_content = False
|
||||
timefmt = ' [%d %b %Y]'
|
||||
language = 'de'
|
||||
max_articles_per_feed = 40
|
||||
max_articles_per_feed = 30
|
||||
no_stylesheets = True
|
||||
|
||||
remove_tags = [dict(id='navi_top'),
|
||||
@ -84,19 +84,19 @@ class FTDe(BasicNewsRecipe):
|
||||
dict(name='div', attrs={'class':'artikelsplitfaq'})]
|
||||
#remove_tags_after = [dict(name='a', attrs={'class':'more'})]
|
||||
|
||||
feeds = [ ('Finanzen', 'http://www.ftd.de/rss2/finanzen/maerkte'),
|
||||
('Meinungshungrige', 'http://www.ftd.de/rss2/meinungshungrige'),
|
||||
('Unternehmen', 'http://www.ftd.de/rss2/unternehmen'),
|
||||
('Politik', 'http://www.ftd.de/rss2/politik'),
|
||||
('Karriere_Management', 'http://www.ftd.de/rss2/karriere-management'),
|
||||
('IT_Medien', 'http://www.ftd.de/rss2/it-medien'),
|
||||
('Wissen', 'http://www.ftd.de/rss2/wissen'),
|
||||
('Sport', 'http://www.ftd.de/rss2/sport'),
|
||||
('Auto', 'http://www.ftd.de/rss2/auto'),
|
||||
('Lifestyle', 'http://www.ftd.de/rss2/lifestyle')
|
||||
|
||||
]
|
||||
feeds = [
|
||||
('Unternehmen', 'http://www.ftd.de/rss2/unternehmen'),
|
||||
('Finanzen', 'http://www.ftd.de/rss2/finanzen/maerkte'),
|
||||
('Meinungen', 'http://www.ftd.de/rss2/meinungshungrige'),
|
||||
('Politik', 'http://www.ftd.de/rss2/politik'),
|
||||
('Management & Karriere', 'http://www.ftd.de/rss2/karriere-management'),
|
||||
('IT & Medien', 'http://www.ftd.de/rss2/it-medien'),
|
||||
('Wissen', 'http://www.ftd.de/rss2/wissen'),
|
||||
('Sport', 'http://www.ftd.de/rss2/sport'),
|
||||
('Auto', 'http://www.ftd.de/rss2/auto'),
|
||||
('Lifestyle', 'http://www.ftd.de/rss2/lifestyle')
|
||||
]
|
||||
|
||||
|
||||
def print_version(self, url):
    """Return the print-view URL for *url*.

    Every ``.html`` in the URL gets ``?mode=print`` appended; a URL
    without ``.html`` is returned unchanged.
    """
    return url.replace('.html', '.html?mode=print')
|
83
recipes/gazeta_wyborcza.recipe
Normal file
@ -0,0 +1,83 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Gazeta_Wyborcza(BasicNewsRecipe):
    """Recipe for Gazeta Wyborcza (language ``pl``).

    Articles come from RSS feeds; paginated articles and photo galleries
    are merged into a single page in ``preprocess_html``.
    """

    title = u'Gazeta Wyborcza'
    __author__ = 'fenuks'
    cover_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
    language = 'pl'
    description = 'news from gazeta.pl'
    category = 'newspaper'
    INDEX = 'http://wyborcza.pl'
    remove_empty_feeds = True
    oldest_article = 3
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    remove_tags_before = dict(id='k0')
    remove_tags_after = dict(id='banP4')
    remove_tags = [dict(name='div', attrs={'class':'rel_box'}),
                   dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}),
                   dict(name='div', attrs={'id':'footer'})]
    feeds = [
        (u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'),
        (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'),
        (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'),
        (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'),
        (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'),
        (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'),
        (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'),
        (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'),
        (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'),
        (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'),
        (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'),
        (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'),
        (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'),
        (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'),
        (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'),
        (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'),
        (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'),
        (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss'),
    ]

    def skip_ad_pages(self, soup):
        """Return the raw page behind the ``a.btn`` link, or None if absent."""
        tag = soup.find(name='a', attrs={'class':'btn'})
        if tag:
            return self.index_to_soup(tag['href'], raw=True)
        return None

    def append_page(self, soup, appendtag):
        """Append the follow-up pages of a paginated article to *appendtag*.

        The pager is the ``div`` with id ``Str``; the link whose text
        contains 'następne' leads to the next page, whose ``#artykul``
        element is appended. The pager and the ``#source`` element are
        removed from *appendtag*.
        """
        links = []
        pager = soup.find('div', attrs={'id':'Str'})
        stale_pager = appendtag.find('div', attrs={'id':'Str'})
        if stale_pager is not None:
            if pager is not None:
                links = pager.findAll('a')
            stale_pager.extract()
        source = appendtag.find(id='source')
        if source:
            source.extract()

        while links:
            # link.string is None for links with nested markup. Only the
            # first match is followed, so a pager that repeats the link
            # does not fetch the same page twice.
            next_link = None
            for link in links:
                if link.string and u'następne' in link.string:
                    next_link = link
                    break
            if next_link is None:
                break
            soup2 = self.index_to_soup(self.INDEX + next_link['href'])
            pagetext = soup2.find(id='artykul')
            if pagetext is not None:
                appendtag.insert(len(appendtag.contents), pagetext)
            pager = soup2.find('div', attrs={'id':'Str'})
            links = pager.findAll('a') if pager is not None else []

    def _gallery_next_url(self, container):
        """Return the href under ``#gal_btn_next`` in *container*, or None."""
        button = container.find(id='gal_btn_next')
        if button is None or button.a is None:
            return None
        return button.a['href']

    def gallery_article(self, appendtag):
        """Append every following gallery page to *appendtag*.

        Follows the ``#gal_btn_next`` link from page to page, appending
        each page's ``#container_gal`` and removing ``#gal_navi``.
        """
        if not appendtag.find(id='container_gal'):
            return
        nexturl = self._gallery_next_url(appendtag)
        navi = appendtag.find(id='gal_navi')
        if navi:
            navi.extract()
        while nexturl:
            soup2 = self.index_to_soup(nexturl)
            pagetext = soup2.find(id='container_gal')
            if pagetext is None:
                break
            nexturl = self._gallery_next_url(pagetext)
            appendtag.insert(len(appendtag.contents), pagetext)
            navi = appendtag.find(id='gal_navi')
            if navi:
                navi.extract()

    def preprocess_html(self, soup):
        """Merge paginated text and gallery pages, then return *soup*."""
        self.append_page(soup, soup.body)
        if soup.find(id='container_gal'):
            self.gallery_article(soup.body)
        return soup

    def print_version(self, url):
        """Rewrite wyborcza.biz business URLs; return others unchanged."""
        if 'http://wyborcza.biz/biznes/' in url:
            return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020')
        return url
|
@ -1,35 +1,82 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
from calibre.utils.magick import Image, create_canvas
|
||||
|
||||
class AdvancedUserRecipe1307556816(BasicNewsRecipe):
|
||||
title = u'Geek and Poke'
|
||||
__author__ = u'DrMerry'
|
||||
description = u'Geek and Poke Cartoons'
|
||||
publisher = u'Oliver Widder'
|
||||
author = u'Oliver Widder, DrMerry (calibre-code), calibre'
|
||||
oldest_article = 31
|
||||
max_articles_per_feed = 100
|
||||
language = u'en'
|
||||
simultaneous_downloads = 5
|
||||
simultaneous_downloads = 1
|
||||
#delay = 1
|
||||
timefmt = ' [%A, %d %B, %Y]'
|
||||
timefmt = ' [%a, %d %B, %Y]'
|
||||
summary_length = -1
|
||||
no_stylesheets = True
|
||||
category = 'News.IT, Cartoon, Humor, Geek'
|
||||
use_embedded_content = False
|
||||
cover_url = 'http://geekandpoke.typepad.com/aboutcoders.jpeg'
|
||||
remove_javascript = True
|
||||
remove_empty_feeds = True
|
||||
publication_type = 'blog'
|
||||
masthead_url = None
|
||||
conversion_options = {
|
||||
'comments' : ''
|
||||
,'tags' : category
|
||||
,'language' : language
|
||||
,'publisher' : publisher
|
||||
,'author' : author
|
||||
}
|
||||
|
||||
preprocess_regexps = [ (re.compile(r'(<p> </p>|<iframe.*</iframe>|<a[^>]*>Tweet</a>|<a[^>]*>|</a>)', re.DOTALL|re.IGNORECASE),lambda match: ''),
|
||||
(re.compile(r'( | )', re.DOTALL|re.IGNORECASE),lambda match: ' '),
|
||||
(re.compile(r'<br( /)?>(<br( /)?>)+', re.DOTALL|re.IGNORECASE),lambda match: '<br>')
|
||||
]
|
||||
remove_tags_before = dict(name='p', attrs={'class':'content-nav'})
|
||||
remove_tags_after = dict(name='div', attrs={'class':'entry-content'})
|
||||
remove_tags = [dict(name='div', attrs={'class':'entry-footer'}),
|
||||
dict(name='div', attrs={'id':'alpha'}),
|
||||
dict(name='div', attrs={'id':'gamma'}),
|
||||
dict(name='iframe'),
|
||||
dict(name='p', attrs={'class':'content-nav'})]
|
||||
|
||||
extra_css = 'body, h3, p, h2, h1, div, span{margin:0px} h2.date-header {font-size: 0.7em; color:#eee;} h3.entry-header{font-size: 1.0em} div.entry-body{font-size: 0.9em}'
|
||||
filter_regexps = [(r'feedburner\.com'),
|
||||
(r'pixel.quantserve\.com'),
|
||||
(r'googlesyndication\.com'),
|
||||
(r'yimg\.com'),
|
||||
(r'scorecardresearch\.com')]
|
||||
|
||||
preprocess_regexps = [(re.compile(r'(<p>( |\s)*</p>|<a[^>]*>Tweet</a>|<a[^>]*>|</a>|<!--.*?-->|<h2[^>]*>[^<]*</h2>[^<]*)', re.DOTALL|re.IGNORECASE),lambda match: ''),
|
||||
(re.compile(r'( |\s\s)+\s*', re.DOTALL|re.IGNORECASE),lambda match: ' '),
|
||||
(re.compile(r'(<h3[^>]*>)<a[^>]>((?!</a)*)</a></h3>', re.DOTALL|re.IGNORECASE),lambda match: match.group(1) + match.group(2) + '</h3>'),
|
||||
(re.compile(r'(<img[^>]*alt="([^"]*)"[^>]*>)', re.DOTALL|re.IGNORECASE),lambda match: '<div id="merryImage"><cite>' + match.group(2) + '</cite><br>' + match.group(1) + '</div>'),
|
||||
(re.compile(r'<br( /)?>(<br( /)?>)+', re.DOTALL|re.IGNORECASE),lambda match: '<br>'),
|
||||
]
|
||||
|
||||
remove_tags_before = dict(name='h2', attrs={'class':'date-header'})
|
||||
remove_tags_after = dict(name='div', attrs={'class':'entry-body'})
|
||||
extra_css = 'body, h3, p, div, span{margin:0px; padding:0px} h3.entry-header{font-size: 0.8em} div.entry-body{font-size: 0.7em}'
|
||||
|
||||
def postprocess_html(self, soup, first):
    """Pad every image in *soup* with white space below it.

    Each ``<img>`` that has a ``src`` is opened from that path, trimmed,
    composed at the top-left of a white canvas 1.17 times the trimmed
    height, and saved back to the same path. Returns *soup*.
    """
    left = 0
    top = 0
    border_color = '#ffffff'
    height_correction = 1.17

    def is_image_with_src(t):
        return t.name.lower()=='img' and t.has_key('src')

    for tag in soup.findAll(is_image_with_src):
        iurl = tag['src']
        img = Image()
        img.open(iurl)
        img.trim(0)
        width, height = img.size
        canvas = create_canvas(width, height*height_correction,border_color)
        canvas.compose(img, left, top)
        canvas.save(iurl)
    return soup
|
||||
|
||||
feeds = [(u'Geek and Poke', u'http://feeds.feedburner.com/GeekAndPoke?format=xml')]
|
||||
feeds = ['http://feeds.feedburner.com/GeekAndPoke?format=xml']
|
||||
|
113
recipes/gosc_niedzielny.recipe
Normal file
@ -0,0 +1,113 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
import re
|
||||
|
||||
class GN(BasicNewsRecipe):
    """Recipe for the Polish weekly 'Gosc niedzielny' (gosc.pl).

    The index is built by hand in parse_index(); every article is fetched
    through get_obfuscated_article(), which rewrites the print view of the
    page into a small temporary HTML file.
    """

    # Relative href of the issue to download; set by find_last_issue().
    EDITION = 0

    __author__ = 'Piotr Kontek'
    title = u'Gość niedzielny'
    description = 'Weekly magazine'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'pl'
    remove_javascript = True
    # Temporary files created by get_obfuscated_article(); references are
    # kept here after the files are closed.
    temp_files = []
    simultaneous_downloads = 1

    articles_are_obfuscated = True

    def get_obfuscated_article(self, url):
        """Fetch *url*, rebuild a stripped-down article and return its path.

        The rebuilt HTML consists of the ``<h2>`` title, the author element,
        the lead paragraph and the class-less top-level paragraphs of the
        ``div.txt.doc_prnt_prv`` section.  Site-relative ``/files/`` image
        sources are made absolute.

        Returns the file name of a PersistentTemporaryFile holding the HTML.
        """
        br = self.get_browser()
        br.open(url)
        source = br.response().read()
        page = self.index_to_soup(source)

        main_section = page.find('div',attrs={'class':'txt doc_prnt_prv'})

        title = main_section.find('h2')
        info = main_section.find('div', attrs={'class' : 'cf doc_info'})
        authors = info.find(attrs={'class':'l'})
        article = str(main_section.find('p', attrs={'class' : 'doc_lead'}))
        first = True
        for p in main_section.findAll('p', attrs={'class':None}, recursive=False):
            # Only the very first paragraph is treated as "image + caption".
            img = p.find('img') if first else None
            if img is not None:
                article += '<p>'
                article += str(img).replace('src="/files/','src="http://www.gosc.pl/files/')
                article += '<font size="-2">'
                for s in p.findAll('span'):
                    article += self.tag_to_string(s)
                article += '</font></p>'
            else:
                article += str(p).replace('src="/files/','src="http://www.gosc.pl/files/')
            first = False

        html = unicode(title) + unicode(authors) + unicode(article)

        self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
        self.temp_files[-1].write(html)
        self.temp_files[-1].close()
        return self.temp_files[-1].name

    def find_last_issue(self):
        """Set EDITION, title and cover_url from the issue listing page.

        Walks the ``div.l.release_preview_l`` previews that contain an image
        and stops at the second one, so the values of the second preview win
        when at least two exist (otherwise those of the only one).
        """
        soup = self.index_to_soup('http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny')
        # Look for the cover image and link of the previous full issue.
        first = True
        for d in soup.findAll('div', attrs={'class':'l release_preview_l'}):
            img = d.find('img')
            if img is not None:
                a = img.parent
                self.EDITION = a['href']
                self.title = img['alt']
                self.cover_url = 'http://www.gosc.pl' + img['src']
                if not first:
                    break
                first = False

    def parse_index(self):
        """Return the list of ``(section title, articles)`` for the issue.

        The first section is the editorial; the rest are the category pages
        linked from the issue page that yield at least one article.
        """
        self.find_last_issue()
        soup = self.index_to_soup('http://www.gosc.pl' + self.EDITION)
        feeds = []
        # Editorial
        a = soup.find('div',attrs={'class':'release-wp-b'}).find('a')
        articles = [
            {'title' : self.tag_to_string(a),
             'url' : 'http://www.gosc.pl' + a['href'].replace('/doc/','/doc_pr/'),
             'date' : '',
             'description' : ''}
        ]
        feeds.append((u'Wstępniak',articles))
        # Categories
        for addr in soup.findAll('a',attrs={'href':re.compile('kategoria')}):
            # Skip the "all articles from this category" links.
            if addr.string != u'wszystkie artyku\u0142y z tej kategorii \xbb':
                main_block = self.index_to_soup('http://www.gosc.pl' + addr['href'])
                articles = list(self.find_articles(main_block))
                if articles:
                    section = addr.string
                    feeds.append((section, articles))
        return feeds

    def find_articles(self, main_block):
        """Yield an article dict for each teaser found in *main_block*.

        All ``div.prev_doc2`` teasers come first, then all
        ``div.sr-document`` ones.  Links are rewritten from ``/doc/`` to
        ``/doc_pr/``.
        """
        for css_class in ('prev_doc2', 'sr-document'):
            for a in main_block.findAll('div', attrs={'class':css_class}):
                art = a.find('a')
                yield {
                    'title' : self.tag_to_string(art),
                    'url' : 'http://www.gosc.pl' + art['href'].replace('/doc/','/doc_pr/'),
                    'date' : '',
                    'description' : ''
                }
|
||||
|
@ -9,8 +9,17 @@ class Gram_pl(BasicNewsRecipe):
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets= True
|
||||
extra_css = 'h2 {font-style: italic; font-size:20px;}'
|
||||
cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
|
||||
remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])]
|
||||
keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})]
|
||||
feeds = [(u'gram.pl - informacje', u'http://www.gram.pl/feed_news.asp'),
|
||||
(u'gram.pl - publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')]
|
||||
|
||||
def parse_feeds(self):
    """Parse the feeds as usual, then drop unwanted articles.

    An article is removed when its upper-cased title contains
    'REKLAMA SKLEP' or u'ARTYKUŁ:'.  Returns the filtered feed list
    produced by ``BasicNewsRecipe.parse_feeds``.
    """
    feeds = BasicNewsRecipe.parse_feeds(self)
    for feed in feeds:
        # Iterate over a copy because matching articles are removed in place.
        for article in feed.articles[:]:
            title = article.title.upper()
            if 'REKLAMA SKLEP' in title or u'ARTYKUŁ:' in title:
                feed.articles.remove(article)
    return feeds
|
||||
|
@ -119,10 +119,8 @@ class Guardian(BasicNewsRecipe):
|
||||
}
|
||||
|
||||
def parse_index(self):
    """Return ``[(section title, [articles...]), ...]`` for all sections.

    Sections come from ``self.find_sections()`` as ``(title, href)`` pairs;
    each section's articles are collected from ``self.find_articles(href)``.
    Errors propagate unchanged (the earlier bare ``except`` that turned
    every failure into ``NotImplementedError`` is gone).
    """
    feeds = []
    for title, href in self.find_sections():
        feeds.append((title, list(self.find_articles(href))))
    return feeds
|
||||
|
||||
|
@ -1,7 +1,9 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
|
||||
class AdvancedUserRecipe(BasicNewsRecipe):
|
||||
|
||||
title = 'Heise-online'
|
||||
title = 'heise online'
|
||||
description = 'News vom Heise-Verlag'
|
||||
__author__ = 'schuster'
|
||||
use_embedded_content = False
|
||||
@ -12,10 +14,11 @@ class AdvancedUserRecipe(BasicNewsRecipe):
|
||||
remove_empty_feeds = True
|
||||
timeout = 5
|
||||
no_stylesheets = True
|
||||
encoding = 'utf-8'
|
||||
|
||||
|
||||
remove_tags_after = dict(name ='p', attrs={'class':'editor'})
|
||||
remove_tags = [dict(id='navi_top_container'),
|
||||
remove_tags = [{'class':'navi_top_container'},
|
||||
dict(id='navi_bottom'),
|
||||
dict(id='mitte_rechts'),
|
||||
dict(id='navigation'),
|
||||
@ -25,28 +28,28 @@ class AdvancedUserRecipe(BasicNewsRecipe):
|
||||
dict(id='content_foren'),
|
||||
dict(id='seiten_navi'),
|
||||
dict(id='adbottom'),
|
||||
dict(id='sitemap')]
|
||||
dict(id='sitemap'),
|
||||
dict(name='a', href=re.compile(r'^/([a-zA-Z]+/)?')),
|
||||
]
|
||||
|
||||
feeds = [
|
||||
('Newsticker', 'http://www.heise.de/newsticker/heise.rdf'),
|
||||
('Auto', 'http://www.heise.de/autos/rss/news.rdf'),
|
||||
('Foto ', 'http://www.heise.de/foto/rss/news-atom.xml'),
|
||||
('Mac&i', 'http://www.heise.de/mac-and-i/news.rdf'),
|
||||
('Mobile ', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
|
||||
('Netz ', 'http://www.heise.de/netze/rss/netze-atom.xml'),
|
||||
('Open ', 'http://www.heise.de/open/news/news-atom.xml'),
|
||||
('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
|
||||
('Security ', 'http://www.heise.de/security/news/news-atom.xml'),
|
||||
('C`t', 'http://www.heise.de/ct/rss/artikel-atom.xml'),
|
||||
('iX', 'http://www.heise.de/ix/news/news.rdf'),
|
||||
('Mach-flott', 'http://www.heise.de/mach-flott/rss/mach-flott-atom.xml'),
|
||||
('Technology Review', 'http://www.heise.de/tr/news-atom.xml'),
|
||||
('mobil', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
|
||||
('Security', 'http://www.heise.de/security/news/news-atom.xml'),
|
||||
('Netze', 'http://www.heise.de/netze/rss/netze-atom.xml'),
|
||||
('Open Source', 'http://www.heise.de/open/news/news-atom.xml'),
|
||||
('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
|
||||
('Foto ', 'http://www.heise.de/foto/rss/news-atom.xml'),
|
||||
('Autos', 'http://www.heise.de/autos/rss/news.rdf'),
|
||||
('Mac & i', 'http://www.heise.de/mac-and-i/news.rdf'),
|
||||
('Blog: Babel-Bulletin', 'http://www.heise.de/developer/rss/babel-bulletin/blog.rdf'),
|
||||
('Blog: Der Dotnet-Doktor', 'http://www.heise.de/developer/rss/dotnet-doktor/blog.rdf'),
|
||||
('Blog: Bernds Management-Welt', 'http://www.heise.de/developer/rss/bernds-management-welt/blog.rdf'),
|
||||
('Blog: IT conversation', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
|
||||
('Blog: The World of IT', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
|
||||
('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf')
|
||||
]
|
||||
]
|
||||
|
||||
def print_version(self, url):
    """Return the print-view URL: *url* with ``?view=print`` appended."""
    return url + '?view=print'
|
||||
|
||||
|
@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1298137661(BasicNewsRecipe):
|
||||
title = u'Helsingin Sanomat'
|
||||
__author__ = 'oneillpt'
|
||||
language = 'fi'
|
||||
language = 'fi'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
@ -11,21 +11,12 @@ class AdvancedUserRecipe1298137661(BasicNewsRecipe):
|
||||
conversion_options = {
|
||||
'linearize_tables' : True
|
||||
}
|
||||
remove_tags = [
|
||||
dict(name='a', attrs={'id':'articleCommentUrl'}),
|
||||
dict(name='p', attrs={'class':'newsSummary'}),
|
||||
dict(name='div', attrs={'class':'headerTools'})
|
||||
]
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'main-content'}),
|
||||
dict(name='div', attrs={'class':'contentNewsArticle'})]
|
||||
|
||||
feeds = [(u'Uutiset - HS.fi', u'http://www.hs.fi/uutiset/rss/'), (u'Politiikka - HS.fi', u'http://www.hs.fi/politiikka/rss/'),
|
||||
feeds = [(u'Uutiset - HS.fi', u'http://www.hs.fi/uutiset/rss/')
|
||||
, (u'Politiikka - HS.fi', u'http://www.hs.fi/politiikka/rss/'),
|
||||
(u'Ulkomaat - HS.fi', u'http://www.hs.fi/ulkomaat/rss/'), (u'Kulttuuri - HS.fi', u'http://www.hs.fi/kulttuuri/rss/'),
|
||||
(u'Kirjat - HS.fi', u'http://www.hs.fi/kulttuuri/kirjat/rss/'), (u'Elokuvat - HS.fi', u'http://www.hs.fi/kulttuuri/elokuvat/rss/')
|
||||
]
|
||||
|
||||
def print_version(self, url):
    """Map an article URL to its ``http://www.hs.fi/tulosta/...`` form.

    Only the last path segment of *url* (from the final ``/`` onwards) is
    kept, and a trailing ``?ref=rss`` query is cut off before it is appended
    to the print prefix.
    """
    tail = url[url.rfind("/"):]
    marker = tail.rfind("?ref=rss")
    # marker is never 0 for a tail that starts with "/", so > 0 means "found".
    if marker > 0:
        tail = tail[:marker]
    return "http://www.hs.fi/tulosta" + tail
|
||||
|
@ -18,6 +18,7 @@ class HoustonChronicle(BasicNewsRecipe):
|
||||
|
||||
keep_only_tags = {'class':lambda x: x and ('hst-articletitle' in x or
|
||||
'hst-articletext' in x or 'hst-galleryitem' in x)}
|
||||
remove_attributes = ['xmlns']
|
||||
|
||||
feeds = [
|
||||
('News', "http://www.chron.com/rss/feed/News-270.php"),
|
||||
|
BIN
recipes/icons/archeowiesci.png
Normal file
After Width: | Height: | Size: 718 B |
BIN
recipes/icons/dark_horizons.png
Normal file
After Width: | Height: | Size: 399 B |
BIN
recipes/icons/den_of_geek.png
Normal file
After Width: | Height: | Size: 1.0 KiB |
BIN
recipes/icons/eioba.png
Normal file
After Width: | Height: | Size: 908 B |
BIN
recipes/icons/focus_pl.png
Normal file
After Width: | Height: | Size: 695 B |
BIN
recipes/icons/gazeta_wyborcza.png
Normal file
After Width: | Height: | Size: 221 B |
BIN
recipes/icons/konflikty_zbrojne.png
Normal file
After Width: | Height: | Size: 320 B |
BIN
recipes/icons/la_republica.png
Normal file
After Width: | Height: | Size: 868 B |
15
recipes/konflikty_zbrojne.recipe
Normal file
@ -0,0 +1,15 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Konflikty(BasicNewsRecipe):
    """Feed-based recipe for konflikty.pl (Polish military news)."""

    title = u'Konflikty Zbrojne'
    __author__ = 'fenuks'
    cover_url = 'http://www.konflikty.pl/images/tapety_logo.jpg'
    language = 'pl'
    description ='military news'
    category='military, history'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    # (section title, RSS url) pairs.
    feeds = [
        (u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'),
        (u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'),
        (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'),
        (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml'),
    ]
|
@ -29,7 +29,7 @@ class Kurier(BasicNewsRecipe):
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
remove_tags = [dict(attrs={'class':['contenttabs','drucken','versenden','leserbrief','kommentieren','addthis_button']})]
|
||||
remove_tags = [dict(attrs={'class':['functionsleiste','functions','social_positionierung','contenttabs','drucken','versenden','leserbrief','kommentieren','addthis_button']})]
|
||||
keep_only_tags = [dict(attrs={'id':'content'})]
|
||||
remove_tags_after = dict(attrs={'id':'author'})
|
||||
remove_attributes = ['width','height']
|
||||
|
@ -1,51 +1,55 @@
|
||||
#!/usr/bin/env python
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
|
||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
|
||||
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
|
||||
description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version'
|
||||
|
||||
'''
|
||||
http://www.repubblica.it/
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class LaRepubblica(BasicNewsRecipe):
|
||||
__author__ = 'Lorenzo Vigentini, Gabriele Marini'
|
||||
description = 'Italian daily newspaper'
|
||||
|
||||
cover_url = 'http://www.repubblica.it/images/homepage/la_repubblica_logo.gif'
|
||||
title = u'La Repubblica'
|
||||
publisher = 'Gruppo editoriale L\'Espresso'
|
||||
category = 'News, politics, culture, economy, general interest'
|
||||
|
||||
language = 'it'
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
|
||||
oldest_article = 5
|
||||
max_articles_per_feed = 100
|
||||
use_embedded_content = False
|
||||
recursion = 10
|
||||
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
title = 'La Repubblica'
|
||||
__author__ = 'Lorenzo Vigentini, Gabriele Marini, Darko Miletic'
|
||||
description = 'il quotidiano online con tutte le notizie in tempo reale. News e ultime notizie. Tutti i settori: politica, cronaca, economia, sport, esteri, scienza, tecnologia, internet, spettacoli, musica, cultura, arte, mostre, libri, dvd, vhs, concerti, cinema, attori, attrici, recensioni, chat, cucina, mappe. Le citta di Repubblica: Roma, Milano, Bologna, Firenze, Palermo, Napoli, Bari, Torino.'
|
||||
masthead_url = 'http://www.repubblica.it/static/images/homepage/2010/la-repubblica-logo-home-payoff.png'
|
||||
publisher = 'Gruppo editoriale L\'Espresso'
|
||||
category = 'News, politics, culture, economy, general interest'
|
||||
language = 'it'
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
oldest_article = 5
|
||||
encoding = 'utf8'
|
||||
use_embedded_content = False
|
||||
#recursion = 10
|
||||
no_stylesheets = True
|
||||
extra_css = """
|
||||
img{display: block}
|
||||
"""
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'.*?<head>', re.DOTALL|re.IGNORECASE), lambda match: '<head>'),
|
||||
(re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE), lambda match: '<head><title>'),
|
||||
(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE), lambda match: '</title></head>')
|
||||
]
|
||||
|
||||
def get_article_url(self, article):
    """Return the URL to download for a feed entry.

    Prefers the entry's ``id``, then its ``guid``.  When neither is present
    the entry's ``link`` is used (``None`` if that is missing too).  The
    previous fallback returned the whole *article* object, which is not a
    URL.

    *article* must be dict-like (only ``.get`` is used).
    """
    link = article.get('id', article.get('guid', None))
    if link is None:
        return article.get('link', None)
    return link
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'articolo'}),
|
||||
dict(name='div', attrs={'class':'body-text'}),
|
||||
# dict(name='div', attrs={'class':'page-content'}),
|
||||
|
||||
keep_only_tags = [
|
||||
dict(attrs={'class':'articolo'}),
|
||||
dict(attrs={'class':'body-text'}),
|
||||
dict(name='p', attrs={'class':'disclaimer clearfix'}),
|
||||
dict(name='div', attrs={'id':'contA'})
|
||||
dict(attrs={'id':'contA'})
|
||||
]
|
||||
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['object','link']),
|
||||
dict(name=['object','link','meta']),
|
||||
dict(name='span',attrs={'class':'linkindice'}),
|
||||
dict(name='div', attrs={'class':'bottom-mobile'}),
|
||||
dict(name='div', attrs={'id':['rssdiv','blocco']}),
|
||||
|
34
recipes/leipzer_volkszeitung.recipe
Normal file
@ -0,0 +1,34 @@
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
'''Calibre recipe to convert the RSS feeds of the Leipziger Volkszeitung to an ebook.'''
|
||||
|
||||
# NOTE(review): the class name looks copy-pasted from another recipe; it is
# kept unchanged so that nothing referring to it breaks.
class SportsIllustratedRecipe(BasicNewsRecipe):
    """Feed-based recipe for the Leipziger Volkszeitung online RSS feeds."""

    __author__ = 'a.peter'
    __copyright__ = 'a.peter'
    __license__ = 'GPL v3'
    language = 'de'
    description = 'Leipziger Volkszeitung Online RSS'
    version = 1
    title = u'Leipziger Volkszeitung Online RSS'
    timefmt = ' [%d.%m.%Y]'

    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    publication_type = 'newspaper'

    keep_only_tags = [dict(name='div', attrs={'class':'article'})]
    remove_tags = [dict(name='div', attrs={'class':['ARTICLE_MORE', 'clearfloat']})]

    # (section title, RSS url) pairs.
    feeds = [
        (u'Leipzig', u'http://nachrichten.lvz-online.de/rss/leipzig-rss.xml'),
        (u'Mitteldeutschland', u'http://nachrichten.lvz-online.de/rss/mitteldeutschland-rss.xml'),
        (u'Brennpunkte', u'http://nachrichten.lvz-online.de/rss/brennpunkte-rss.xml'),
        (u'Polizeiticker', u'http://nachrichten.lvz-online.de/rss/polizeiticker-rss.xml'),
        (u'Boulevard', u'http://nachrichten.lvz-online.de/rss/boulevard-rss.xml'),
        (u'Kultur', u'http://nachrichten.lvz-online.de/rss/kultur-rss.xml'),
        (u'Sport', u'http://nachrichten.lvz-online.de/rss/sport-rss.xml'),
        (u'Regionalsport', u'http://nachrichten.lvz-online.de/rss/regionalsport-rss.xml'),
        (u'Knipser', u'http://nachrichten.lvz-online.de/rss/knipser-rss.xml'),
    ]

    def get_masthead_url(self):
        """Return the fixed URL of the LVZ logo used as masthead."""
        return 'http://www.lvz-online.de/resources/themes/standard/images/global/logo.gif'
|
27
recipes/merco_press.recipe
Normal file
@ -0,0 +1,27 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class MercoPress(BasicNewsRecipe):
    """Feed-based recipe for the English edition of MercoPress."""

    title = u'Merco Press'
    description = u"Read News, Stories and Insight Analysis from Latin America and Mercosur. Politics, Economy, Business and Investments in South America."
    cover_url = 'http://en.mercopress.com/web/img/en/mercopress-logo.gif'

    __author__ = 'Russell Phillips'
    language = 'en'

    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    extra_css = 'img{padding-bottom:1ex; display:block; text-align: center;}'
    remove_tags = [dict(name='a')]

    # (section title, RSS url) pairs.  The Uruguay entry used to be a single
    # malformed string instead of a pair; its URL follows the pattern of the
    # other sections.
    feeds = [
        ('Antarctica', 'http://en.mercopress.com/rss/antarctica'),
        ('Argentina', 'http://en.mercopress.com/rss/argentina'),
        ('Brazil', 'http://en.mercopress.com/rss/brazil'),
        ('Falkland Islands', 'http://en.mercopress.com/rss/falkland-islands'),
        ('International News', 'http://en.mercopress.com/rss/international'),
        ('Latin America', 'http://en.mercopress.com/rss/latin-america'),
        ('Mercosur', 'http://en.mercopress.com/rss/mercosur'),
        ('Paraguay', 'http://en.mercopress.com/rss/paraguay'),
        ('United States', 'http://en.mercopress.com/rss/united-states'),
        ('Uruguay', 'http://en.mercopress.com/rss/uruguay'),
    ]
|
@ -5,30 +5,46 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
||||
description = 'News as provide by The Metro -UK'
|
||||
|
||||
__author__ = 'Dave Asbury'
|
||||
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
|
||||
|
||||
no_stylesheets = True
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 25
|
||||
max_articles_per_feed = 20
|
||||
remove_empty_feeds = True
|
||||
remove_javascript = True
|
||||
|
||||
preprocess_regexps = [(re.compile(r'Tweet'), lambda a : '')]
|
||||
#preprocess_regexps = [(re.compile(r'Tweet'), lambda a : '')]
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<span class="img-cap legend">', re.IGNORECASE | re.DOTALL), lambda match: '<p></p><span class="img-cap legend"> ')]
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'tweet', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
|
||||
language = 'en_GB'
|
||||
|
||||
|
||||
masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
|
||||
|
||||
extra_css = 'h2 {font: sans-serif medium;}'
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='h1'),dict(name='h2', attrs={'class':'h2'}),
|
||||
dict(attrs={'class':['img-cnt figure']}),
|
||||
dict(attrs={'class':['art-img']}),
|
||||
|
||||
dict(name='div', attrs={'class':'art-lft'})
|
||||
dict(name='div', attrs={'class':'art-lft'}),
|
||||
dict(name='p')
|
||||
]
|
||||
remove_tags = [dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
|
||||
'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r' ]}),
|
||||
dict(attrs={'class':[ 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime']})
|
||||
]
|
||||
dict(attrs={'class':[ 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime']})
|
||||
,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'})
|
||||
]
|
||||
feeds = [
|
||||
(u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')]
|
||||
|
||||
extra_css = '''
|
||||
body {font: sans-serif medium;}'
|
||||
h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
|
||||
h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
|
||||
span{ font-size:9.5px; font-weight:bold;font-style:italic}
|
||||
p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
|
||||
|
||||
'''
|
||||
|
@ -6,19 +6,24 @@ __Region__ = 'Hong Kong'
|
||||
# Users of Kindle 3 with limited system-level CJK support
|
||||
# please replace the following "True" with "False".
|
||||
__MakePeriodical__ = True
|
||||
# Turn below to true if your device supports display of CJK titles
|
||||
# Turn below to True if your device supports display of CJK titles
|
||||
__UseChineseTitle__ = False
|
||||
# Set it to False if you want to skip images
|
||||
__KeepImages__ = True
|
||||
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
|
||||
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source
|
||||
__UseLife__ = True
|
||||
# (HK only) if __UseLife__ is true, turn this on if you want to include the column section
|
||||
# (HK only) Set to True to include the column section (now premium content); False skips it
|
||||
__InclCols__ = False
|
||||
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats
|
||||
__ParsePFF__ = False
|
||||
# (HK only) Turn below to True if you wish hi-res images
|
||||
__HiResImg__ = False
|
||||
|
||||
|
||||
'''
|
||||
Change Log:
|
||||
2011/09/21: fetching "column" section is made optional. Default is False
|
||||
2011/10/04: option to get hi-res photos for the articles
|
||||
2011/09/21: fetching "column" section is made optional.
|
||||
2011/09/18: parse "column" section stuff from source text file directly.
|
||||
2011/09/07: disable "column" section as it is no longer offered free.
|
||||
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
|
||||
@ -42,7 +47,7 @@ Change Log:
|
||||
2010/10/31: skip repeated articles in section pages
|
||||
'''
|
||||
|
||||
import os, datetime, re
|
||||
import os, datetime, re, mechanize
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from contextlib import nested
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
@ -56,7 +61,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
|
||||
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
||||
keep_only_tags = [dict(name='h1'),
|
||||
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
|
||||
@ -147,43 +152,6 @@ class MPRecipe(BasicNewsRecipe):
|
||||
conversion_options = {'linearize_tables':True}
|
||||
timefmt = ''
|
||||
|
||||
def image_url_processor(cls, baseurl, url):
    """Return *url* unchanged; *baseurl* is not used.

    An earlier, commented-out experiment tried to split the URL at its first
    digit and insert an extra '_' there.  Its own comment described it as not
    working and suggested moving the logic to preprocess_html(), so the dead
    code has been dropped.
    """
    return url
|
||||
|
||||
def get_dtlocal(self):
|
||||
dt_utc = datetime.datetime.utcnow()
|
||||
if __Region__ == 'Hong Kong':
|
||||
@ -260,15 +228,16 @@ class MPRecipe(BasicNewsRecipe):
|
||||
else:
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
||||
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
|
||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
|
||||
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special- editorial
|
||||
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
||||
if ed_articles:
|
||||
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||
#ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
||||
#if ed_articles:
|
||||
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||
|
||||
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
||||
@ -279,20 +248,39 @@ class MPRecipe(BasicNewsRecipe):
|
||||
|
||||
# special - finance
|
||||
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
||||
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
||||
if fin_articles:
|
||||
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||
#fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
||||
#if fin_articles:
|
||||
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||
|
||||
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
|
||||
articles = self.parse_section2(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||
# articles = self.parse_section(url)
|
||||
# if articles:
|
||||
# feeds.append((title, articles))
|
||||
|
||||
# special - entertainment
|
||||
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||
if ent_articles:
|
||||
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||
#ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||
#if ent_articles:
|
||||
# feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||
|
||||
for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
|
||||
]:
|
||||
articles = self.parse_section2(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
if __InclCols__ == True:
|
||||
# parse column section articles directly from .txt files
|
||||
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
||||
]:
|
||||
articles = self.parse_section2_txt(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
@ -300,11 +288,6 @@ class MPRecipe(BasicNewsRecipe):
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
# special- columns
|
||||
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
|
||||
if col_articles:
|
||||
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
|
||||
elif __Region__ == 'Vancouver':
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
|
||||
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
|
||||
@ -348,6 +331,16 @@ class MPRecipe(BasicNewsRecipe):
|
||||
title = self.tag_to_string(a)
|
||||
url = a.get('href', False)
|
||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||
# replace the url to the print-friendly version
|
||||
if __ParsePFF__ == True:
|
||||
if url.rfind('Redirect') <> -1:
|
||||
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
|
||||
url = re.sub('%2F.*%2F', '/', url)
|
||||
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
|
||||
url = url.replace('%2Etxt', '_print.htm')
|
||||
url = url.replace('%5F', '_')
|
||||
else:
|
||||
url = url.replace('.htm', '_print.htm')
|
||||
if url not in included_urls and url.rfind('Redirect') == -1:
|
||||
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
||||
included_urls.append(url)
|
||||
@ -472,38 +465,119 @@ class MPRecipe(BasicNewsRecipe):
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
# preprocess those .txt based files
|
||||
# preprocess those .txt and javascript based files
|
||||
def preprocess_raw_html(self, raw_html, url):
|
||||
if url.rfind('ftp') == -1:
|
||||
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
|
||||
if __HiResImg__ == True:
|
||||
# TODO: add a _ in front of an image url
|
||||
if url.rfind('news.mingpao.com') > -1:
|
||||
imglist = re.findall('src="?.*?jpg"', raw_html)
|
||||
br = mechanize.Browser()
|
||||
br.set_handle_redirect(False)
|
||||
for img in imglist:
|
||||
gifimg = img.replace('jpg"', 'gif"')
|
||||
try:
|
||||
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
|
||||
raw_html = raw_html.replace(img, gifimg)
|
||||
except:
|
||||
# find the location of the first _
|
||||
pos = img.find('_')
|
||||
if pos > -1:
|
||||
# if found, insert _ after the first _
|
||||
newimg = img[0:pos] + '_' + img[pos:]
|
||||
raw_html = raw_html.replace(img, newimg)
|
||||
else:
|
||||
# if not found, insert _ after "
|
||||
raw_html = raw_html.replace(img[1:], '"_' + img[1:])
|
||||
elif url.rfind('life.mingpao.com') > -1:
|
||||
imglist = re.findall('src=\'?.*?jpg\'', raw_html)
|
||||
br = mechanize.Browser()
|
||||
br.set_handle_redirect(False)
|
||||
#print 'Img list: ', imglist, '\n'
|
||||
for img in imglist:
|
||||
gifimg = img.replace('jpg\'', 'gif\'')
|
||||
try:
|
||||
#print 'Original: ', url
|
||||
#print 'To append: ', "/../" + gifimg[5:len(gifimg)-1]
|
||||
gifurl = re.sub(r'dailynews.*txt', '', url)
|
||||
#print 'newurl: ', gifurl + gifimg[5:len(gifimg)-1]
|
||||
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
|
||||
#print 'URL: ', url + "/../" + gifimg[5:len(gifimg)-1]
|
||||
#br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
|
||||
raw_html = raw_html.replace(img, gifimg)
|
||||
except:
|
||||
#print 'GIF not found'
|
||||
pos = img.rfind('/')
|
||||
newimg = img[0:pos+1] + '_' + img[pos+1:]
|
||||
#print 'newimg: ', newimg
|
||||
raw_html = raw_html.replace(img, newimg)
|
||||
if url.rfind('ftp') == -1 and url.rfind('_print.htm') == -1:
|
||||
return raw_html
|
||||
else:
|
||||
splitter = re.compile(r'\n') # Match non-digits
|
||||
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
|
||||
next_is_img_txt = False
|
||||
title_started = False
|
||||
met_article_start_char = False
|
||||
for item in splitter.split(raw_html):
|
||||
if item.startswith(u'\u3010'):
|
||||
met_article_start_char = True
|
||||
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
|
||||
else:
|
||||
if next_is_img_txt == False:
|
||||
if item.startswith('='):
|
||||
next_is_img_txt = True
|
||||
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
|
||||
if url.rfind('_print.htm') <> -1:
|
||||
# javascript based file
|
||||
splitter = re.compile(r'\n')
|
||||
new_raw_html = '<html><head><title>Untitled</title></head>'
|
||||
new_raw_html = new_raw_html + '<body>'
|
||||
for item in splitter.split(raw_html):
|
||||
if item.startswith('var heading1 ='):
|
||||
heading = item.replace('var heading1 = \'', '')
|
||||
heading = heading.replace('\'', '')
|
||||
heading = heading.replace(';', '')
|
||||
new_raw_html = new_raw_html + '<div class="heading">' + heading
|
||||
if item.startswith('var heading2 ='):
|
||||
heading = item.replace('var heading2 = \'', '')
|
||||
heading = heading.replace('\'', '')
|
||||
heading = heading.replace(';', '')
|
||||
if heading <> '':
|
||||
new_raw_html = new_raw_html + '<br>' + heading + '</div>'
|
||||
else:
|
||||
if met_article_start_char == False:
|
||||
if title_started == False:
|
||||
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
|
||||
title_started = True
|
||||
else:
|
||||
new_raw_html = new_raw_html + item + '\n'
|
||||
else:
|
||||
new_raw_html = new_raw_html + item + '<p>\n'
|
||||
new_raw_html = new_raw_html + '</div>'
|
||||
if item.startswith('var content ='):
|
||||
content = item.replace("var content = ", '')
|
||||
content = content.replace('\'', '')
|
||||
content = content.replace(';', '')
|
||||
new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
|
||||
if item.startswith('var photocontent ='):
|
||||
photo = item.replace('var photocontent = \'', '')
|
||||
photo = photo.replace('\'', '')
|
||||
photo = photo.replace(';', '')
|
||||
photo = photo.replace('<tr>', '')
|
||||
photo = photo.replace('<td>', '')
|
||||
photo = photo.replace('</tr>', '')
|
||||
photo = photo.replace('</td>', '<br>')
|
||||
photo = photo.replace('class="photo"', '')
|
||||
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
|
||||
return new_raw_html + '</body></html>'
|
||||
else:
|
||||
# .txt based file
|
||||
splitter = re.compile(r'\n') # Match non-digits
|
||||
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
|
||||
next_is_img_txt = False
|
||||
title_started = False
|
||||
met_article_start_char = False
|
||||
for item in splitter.split(raw_html):
|
||||
if item.startswith(u'\u3010'):
|
||||
met_article_start_char = True
|
||||
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
|
||||
else:
|
||||
next_is_img_txt = False
|
||||
new_raw_html = new_raw_html + item + '\n'
|
||||
return new_raw_html + '</div></body></html>'
|
||||
if next_is_img_txt == False:
|
||||
if item.startswith('='):
|
||||
next_is_img_txt = True
|
||||
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
|
||||
else:
|
||||
if met_article_start_char == False:
|
||||
if title_started == False:
|
||||
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
|
||||
title_started = True
|
||||
else:
|
||||
new_raw_html = new_raw_html + item + '\n'
|
||||
else:
|
||||
new_raw_html = new_raw_html + item + '<p>\n'
|
||||
else:
|
||||
next_is_img_txt = False
|
||||
new_raw_html = new_raw_html + item + '\n'
|
||||
return new_raw_html + '</div></body></html>'
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
@ -604,7 +678,7 @@ class MPRecipe(BasicNewsRecipe):
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
|
||||
parent.add_item('%sindex.html'%adir, None, a.title if a.title else ('Untitled Article'),
|
||||
play_order=po, author=auth, description=desc)
|
||||
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
||||
for sp in a.sub_pages:
|
||||
|
15
recipes/naczytniki.recipe
Normal file
@ -0,0 +1,15 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class naczytniki(BasicNewsRecipe):
    """Recipe for naczytniki.pl, built from the site's single RSS feed ('Wpisy')."""

    # Catalogue metadata.
    title = u'naczytniki.pl'
    __author__ = 'fenuks'
    description = 'everything about e-readers'
    category = 'readers'
    language = 'pl'
    cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png'

    # Download limits.
    oldest_article = 7
    max_articles_per_feed = 100

    # Tag filters: the 'post' div is the part that is kept; the 'sociable'
    # div is used both as the cut-off marker and as a tag to remove.
    keep_only_tags = [{'name': 'div', 'attrs': {'class': 'post'}}]
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'sociable'}}
    remove_tags = [
        {'name': 'span', 'attrs': {'class': 'comments'}},
        {'name': 'div', 'attrs': {'class': 'sociable'}},
    ]

    feeds = [(u'Wpisy', u'http://naczytniki.pl/?feed=rss2')]
|
47
recipes/nowa_fantastyka.recipe
Normal file
@ -0,0 +1,47 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Nowa_Fantastyka(BasicNewsRecipe):
    """Recipe for Nowa Fantastyka (www.fantastyka.pl).

    The site is scraped directly: parse_index() visits three listing pages
    and find_articles() turns the links found on each of them into article
    entries.  The cover is looked up on a separate page by get_cover_url().
    """

    title = u'Nowa Fantastyka'
    oldest_article = 7
    __author__ = 'fenuks'
    language = 'pl'
    description = 'site for fantasy readers'
    category = 'fantasy'
    max_articles_per_feed = 100
    # Prefix for the relative hrefs / image sources found on the site.
    INDEX = 'http://www.fantastyka.pl/'
    remove_tags_before = dict(attrs={'class':'belka1-tlo-md'})
    remove_tags_after = dict(name='td', attrs={'class':'belka1-bot'})
    remove_tags = [dict(attrs={'class':'avatar2'})]
    feeds = []

    def find_articles(self, url):
        """Return the article entries linked from the listing page at *url*.

        Each entry is a dict with 'title', 'url', 'date' and 'description'
        keys; date and description are always empty strings.  An empty list
        is returned when the listing container is not present on the page.
        """
        soup = self.index_to_soup(url)
        container = soup.find(attrs={'class':'belka1-tlo-m'})
        if container is None:
            # Layout changed or the page is an error page: report it and
            # carry on instead of failing with an AttributeError.
            self.log('No article list found on %s' % url)
            return []

        articles = []
        for link in container.findAll(name='a', attrs={'class':'a-box'}):
            href = link.get('href')
            if not href:
                continue
            # .string is None when the anchor wraps nested markup; fall back
            # to the flattened text in that case.
            title = link.string or self.tag_to_string(link)
            articles.append({'title' : title,
                             'url' : self.INDEX + href,
                             'date' : '',
                             'description' : ''
                             })
        return articles

    def parse_index(self):
        """Build the (section title, articles) list for the three site sections.

        Sections for which no article was found are left out.
        """
        sections = (
            (u"Opowiadania", 'http://www.fantastyka.pl/3.html'),
            (u"Publicystyka", 'http://www.fantastyka.pl/6.html'),
            (u"Hype Park", 'http://www.fantastyka.pl/9.html'),
        )
        feeds = []
        for section_title, section_url in sections:
            articles = self.find_articles(section_url)
            if articles:
                feeds.append((section_title, articles))
        return feeds

    def get_cover_url(self):
        """Return the cover image URL taken from the site's page 1.html.

        When the 'okladka' image cannot be found, cover_url is left as it
        was and that value (None if unset) is returned.
        """
        soup = self.index_to_soup('http://www.fantastyka.pl/1.html')
        cover = soup.find(name='img', attrs={'class':'okladka'})
        src = cover.get('src') if cover is not None else None
        if src:
            self.cover_url = self.INDEX + src
        return getattr(self, 'cover_url', None)
|
17
recipes/penguin_news.recipe
Normal file
@ -0,0 +1,17 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class MercoPress(BasicNewsRecipe):
    """Recipe for Penguin News, built from the site's RSS feed."""

    # Catalogue metadata.
    title = u'Penguin News'
    __author__ = 'Russell Phillips'
    description = u"Penguin News: the Falkland Islands' only newspaper."
    language = 'en'
    cover_url = 'http://www.penguin-news.com/templates/rt_syndicate_j15/images/logo/light/logo1.png'

    # Download limits and clean-up.
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    # Extra stylesheet rule for images.
    extra_css = 'img{padding-bottom:1ex; display:block; text-align: center;}'

    feeds = [
        (u'Penguin News - Falkland Islands',
         u'http://www.penguin-news.com/index.php?format=feed&type=rss'),
    ]
|
@ -4,13 +4,13 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1311799898(BasicNewsRecipe):
|
||||
title = u'Periódico Portafolio Colombia'
|
||||
language = 'es_CO'
|
||||
__author__ = 'BIGO-CAVA'
|
||||
language = 'es_CO'
|
||||
cover_url = 'http://www.portafolio.co/sites/portafolio.co/themes/portafolio_2011/logo.png'
|
||||
remove_tags_before = dict(id='contenidoArt')
|
||||
remove_tags_after = [dict(name='div', attrs={'class':'articulo-mas'})]
|
||||
keep_only_tags = [dict(name='div', id='contenidoArt')]
|
||||
oldest_article = 1
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 100
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
@ -1,5 +1,8 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '''
|
||||
2010, Darko Miletic <darko.miletic at gmail.com>
|
||||
2011, Przemyslaw Kryger <pkryger at gmail.com>
|
||||
'''
|
||||
'''
|
||||
readitlaterlist.com
|
||||
'''
|
||||
@ -9,7 +12,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Readitlater(BasicNewsRecipe):
|
||||
title = 'Read It Later'
|
||||
__author__ = 'Darko Miletic'
|
||||
__author__ = 'Darko Miletic, Przemyslaw Kryger'
|
||||
description = '''Personalized news feeds. Go to readitlaterlist.com to
|
||||
setup up your news. Fill in your account
|
||||
username, and optionally you can add password.'''
|
||||
@ -23,9 +26,6 @@ class Readitlater(BasicNewsRecipe):
|
||||
INDEX = u'http://readitlaterlist.com'
|
||||
LOGIN = INDEX + u'/l'
|
||||
|
||||
|
||||
feeds = [(u'Unread articles' , INDEX + u'/unread')]
|
||||
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
if self.username is not None:
|
||||
@ -37,12 +37,31 @@ class Readitlater(BasicNewsRecipe):
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
def get_feeds(self):
    """Return one (title, url) feed per page of the unread list.

    Starting at INDEX + '/unread/1', the active "next" link of every page
    is followed until a page has none.  Each page becomes a feed titled
    'Unread articles, page N'.  When self.test is set only the first two
    pages are returned, and paging stops as soon as they are known.
    """
    self.report_progress(0, ('Fetching list of feeds...'))
    # Test runs keep just two feeds, so crawling further is wasted work.
    limit = 2 if self.test else None
    lfeeds = []
    visited = set()
    feedurl = self.INDEX + u'/unread/1'
    while True:
        visited.add(feedurl)
        page = len(lfeeds) + 1
        lfeeds.append((u'Unread articles, page ' + str(page), feedurl))
        self.report_progress(0, ('Got ') + str(page) + (' feeds'))
        if limit is not None and len(lfeeds) >= limit:
            break
        soup = self.index_to_soup(feedurl)
        ritem = soup.find('a', attrs={'id': 'next', 'class': 'active'})
        if ritem is None:
            break
        feedurl = self.INDEX + ritem['href']
        if feedurl in visited:
            # A "next" link pointing back to a seen page would otherwise
            # keep this loop running forever.
            break
    return lfeeds
|
||||
|
||||
def parse_index(self):
|
||||
totalfeeds = []
|
||||
lfeeds = self.get_feeds()
|
||||
for feedobj in lfeeds:
|
||||
feedtitle, feedurl = feedobj
|
||||
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
|
||||
self.report_progress(0, ('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
|
||||
articles = []
|
||||
soup = self.index_to_soup(feedurl)
|
||||
ritem = soup.find('ul',attrs={'id':'list'})
|
||||
|
14
recipes/republica.recipe
Normal file
@ -0,0 +1,14 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1316862613(BasicNewsRecipe):
    """Recipe for Republica, one RSS feed per news category."""

    # Catalogue metadata.
    title = u'Republica'
    __author__ = 'Manish Bhattarai'
    description = 'News from the Republica'
    language = 'en_NP'

    # Artwork.
    masthead_url = 'http://blog.nyayahealth.org/wp-content/uploads/2011/03/myrepublica1.gif'
    cover_url = 'http://www.myrepublica.com/repub_front.jpg'

    # Download limits and clean-up.
    oldest_article = 1
    max_articles_per_feed = 100
    auto_cleanup = True

    feeds = [
        (u'Political Affairs', u'http://www.myrepublica.com/portal/news_rss.php?news_category_id=14'),
        (u'Business & Economy', u'http://www.myrepublica.com/portal/news_rss.php?news_category_id=15'),
        (u'International', u'http://www.myrepublica.com/portal/news_rss.php?news_category_id=21'),
        (u'Social Issues', u'http://www.myrepublica.com/portal/news_rss.php?news_category_id=16'),
        (u'Sports', u'http://www.myrepublica.com/portal/news_rss.php?news_category_id=18'),
        (u'Lifestyle', u'http://www.myrepublica.com/portal/news_rss.php?news_category_id=17'),
    ]
|
||||
|
29
recipes/revista_piaui.recipe
Normal file
@ -0,0 +1,29 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class RevistaPiaui(BasicNewsRecipe):
    """Recipe for the current-issue feed of Revista piaui.

    parse_feeds() replaces every article summary with the text that follows
    the first '<br />' in the matching feed item's description, and
    populate_article_metadata() rewrites the article's <h2> heading so that
    it carries the trimmed title, that summary and the posting date.
    """

    title                 = u'Revista piau\xed'
    language              = 'pt_BR'
    __author__            = u'Eduardo Gustini Simões'
    oldest_article        = 31
    max_articles_per_feed = 50
    auto_cleanup          = True

    feeds                 = [(u'Edi\xe7\xe3o Atual', u'http://revistapiaui.estadao.com.br/feed/rss/edicao-atual.xml')]

    def parse_feeds(self):
        """Parse the feeds as usual, then fill in summaries from the raw feed.

        The raw feed is downloaded at most once, and only when there is at
        least one article.  Articles whose item or description cannot be
        located in the raw feed keep the summary they already have.
        """
        feeds = BasicNewsRecipe.parse_feeds(self)
        feed_soup = None
        for feed in feeds:
            for article in feed.articles:
                if feed_soup is None:
                    # Same document for every article, so fetch it once
                    # instead of once per article.
                    feed_soup = self.index_to_soup('http://revistapiaui.estadao.com.br/feed/rss/edicao-atual.xml')
                summary = self._summary_from_feed(feed_soup, article.title)
                if summary is not None:
                    article.summary = summary

        return feeds

    def _summary_from_feed(self, feed_soup, title):
        """Return the description text after the first '<br />', or None if not found."""
        item_title = title.partition('|')[0].rstrip()
        text_node = feed_soup.find(text=item_title)
        if text_node is None:
            return None
        try:
            # text node -> enclosing tag -> enclosing item -> its description
            description = text_node.parent.parent.description.string
        except AttributeError:
            return None
        if description is None:
            return None
        return description.partition('<br />')[2]

    def populate_article_metadata(self, article, soup, first):
        """Trim the <h2> title at '|' and append summary and posting date.

        Does nothing when the page has no <h2> or the <h2> does not hold a
        single plain string.
        """
        h2 = soup.find('h2')
        if h2 is None or h2.string is None:
            return
        h2.string.replaceWith(h2.string.partition('|')[0].rstrip())
        summary = article.summary or ''
        posted = article.localtime.strftime('%d-%m-%Y')
        h2.replaceWith(h2.prettify() + '<p><em>' + summary + '</em></p><p><em>' + ' posted at ' + posted + '</em></p>')
|
11
recipes/revista_semana.recipe
Normal file
@ -0,0 +1,11 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
|
||||
class AdvancedUserRecipe1317341570(BasicNewsRecipe):
    """Recipe for Revista Semana, built from a single RSS feed."""

    # Catalogue metadata.
    title = u'Revista Semana'
    __author__ = 'BIGO-CAVA'
    language = 'es_CO'

    # Download limits.
    oldest_article = 7
    max_articles_per_feed = 100

    feeds = [
        (u'Revista Semana', u'http://www.semana.com/rss/Semana_OnLine.xml'),
    ]
|
53
recipes/sign_on_sd.recipe
Normal file
@ -0,0 +1,53 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1315899507(BasicNewsRecipe):
    """Recipe for Sign On San Diego, one feed per section or columnist."""

    # Catalogue metadata.
    title = u'Sign On San Diego'
    __author__ = 'Jay Kindle'
    description = 'Local news stories from The San Diego Union-Tribune; breaking news, business and technology, local and national sports coverage, entertainment news and reviews.'
    publisher = 'Tribune Company'
    category = 'news, politics, USA, San Diego, California, world'
    language = 'en'
    publication_type = 'newspaper'
    masthead_url = 'http://media.signonsandiego.com/e2/sosd/images/sosd_logo.png'
    timefmt = ' [%b %d, %Y]'

    # Download limits.
    oldest_article = 2
    max_articles_per_feed = 200

    # Content handling.
    encoding = 'utf8'
    no_stylesheets = True
    use_embedded_content = False
    auto_cleanup = True
    remove_empty_feeds = True

    # NOTE(review): the 'Education' entry is the only URL outside the /rss/
    # and /news/rss2/ paths - confirm it really serves a feed.
    feeds = [
        (u'Latest News', u'http://www.signonsandiego.com/rss/headlines/'),
        (u'Local News', u'http://www.signonsandiego.com/rss/headlines/metro/'),
        (u'Business', u'http://www.signonsandiego.com/rss/headlines/business/'),
        (u'Politics', u'http://www.signonsandiego.com/rss/headlines/local/politics/'),
        (u'Border & Immigration', u'http://www.signonsandiego.com/rss/headlines/border/'),
        (u'Courts', u'http://www.signonsandiego.com/rss/headlines/courts/'),
        (u'Education', u'http://www.signonsandiego.com/news/education/'),
        (u'Sports', u'http://www.signonsandiego.com/rss/headlines/sports/'),
        (u'Chargers', u'http://www.signonsandiego.com/rss/headlines/sports/chargers/'),
        (u'Padres', u'http://www.signonsandiego.com/rss/headlines/sports/padres/'),
        (u'NFL', u'http://www.signonsandiego.com/rss/headlines/sports/nfl/'),
        (u'NBA', u'http://www.signonsandiego.com/rss/headlines/sports/nba/'),
        (u'Nick Canepa', u'http://www.signonsandiego.com/rss/authors/nick-canepa/'),
        (u'Tim Sullivan', u'http://www.signonsandiego.com/rss/authors/tim-sullivan/'),
        (u'Ruben Navarrette', u'http://www.signonsandiego.com/rss/authors/ruben-navarrette/'),
        (u'Diane Bell', u'http://www.signonsandiego.com/rss/authors/diane-bell/'),
        (u'Smart Living', u'http://www.signonsandiego.com/rss/headlines/smart-living/'),
        (u'Photos', u'http://www.signonsandiego.com/rss/photos/'),
        (u'Arts', u'http://www.signonsandiego.com/rss/headlines/night-and-day/theater-arts/'),
        (u'Books', u'http://www.signonsandiego.com/rss/headlines/lifestyle/books/'),
        (u'Currents-Passages', u'http://www.signonsandiego.com/rss/headlines/lifestyle/currents/passages/'),
        (u'Currents-Weekend', u'http://www.signonsandiego.com/news/rss2/daily/currentsweekend.xml'),
        (u'Dialog', u'http://www.signonsandiego.com/news/rss2/daily/dialog.xml'),
        (u'Home', u'http://www.signonsandiego.com/rss/headlines/home/'),
        (u'Homescape', u'http://www.signonsandiego.com/rss/headlines/lifestyle/homescape/'),
        (u'Night & Day', u'http://www.signonsandiego.com/news/rss2/daily/nightday.xml'),
        (u'Opinion', u'http://www.signonsandiego.com/rss/headlines/opinion/'),
        (u'Quest', u'http://www.signonsandiego.com/news/rss2/daily/quest.xml'),
        (u'Travel', u'http://www.signonsandiego.com/news/rss2/daily/travel.xml'),
        (u'Wheels', u'http://www.signonsandiego.com/news/rss2/daily/wheels.xml'),
    ]
|
||||
|
@ -9,285 +9,79 @@ calibre recipe for slate.com
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag
|
||||
|
||||
class Slate(BasicNewsRecipe):
|
||||
# Method variables for customizing downloads
|
||||
description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
|
||||
__author__ = 'GRiker, Sujata Raman and Nick Redding'
|
||||
max_articles_per_feed = 100
|
||||
oldest_article = 14
|
||||
recursions = 0
|
||||
delay = 0
|
||||
simultaneous_downloads = 5
|
||||
timeout = 120.0
|
||||
__author__ = 'Kovid Goyal'
|
||||
timefmt = ''
|
||||
feeds = None
|
||||
no_stylesheets = True
|
||||
encoding = None
|
||||
language = 'en'
|
||||
title = 'Slate'
|
||||
INDEX = 'http://slate.com'
|
||||
encoding = 'utf-8'
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<!--.*?-->', re.DOTALL), lambda x: ''),
|
||||
(re.compile(r'^.*?<html', re.DOTALL), lambda x:'<html'),
|
||||
(re.compile(r'<meta[^>]+?/>', re.DOTALL), lambda x:''),
|
||||
]
|
||||
remove_tags = [
|
||||
{'name':['link', 'script']},
|
||||
{'class':['share-box-flank', 'sl-crumbs', 'sl-tbar',
|
||||
'sl-chunky-tbar']},
|
||||
]
|
||||
remove_tags_after = [{'class':'sl-art-creds-cntr'}]
|
||||
keep_only_tags = {'class':'sl-body-wrapper'}
|
||||
remove_attributes = ['style']
|
||||
|
||||
slate_complete = True
|
||||
if slate_complete:
|
||||
title = 'Slate (complete)'
|
||||
else:
|
||||
title = 'Slate (weekly)'
|
||||
def print_version(self, url):
|
||||
return url.replace('.html', '.single.html')
|
||||
|
||||
# Method variables for customizing feed parsing
|
||||
summary_length = 250
|
||||
use_embedded_content = None
|
||||
|
||||
# Method variables for pre/post processing of HTML
|
||||
preprocess_regexps = [ (re.compile(r'<p><em>Disclosure: <strong>Slate</strong> is owned by the Washington Post.*</p>',
|
||||
re.DOTALL|re.IGNORECASE),
|
||||
lambda match: ''),
|
||||
(re.compile(r'<p><strong><em>Join the discussion about this story on.*</p>',
|
||||
re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '') ]
|
||||
|
||||
match_regexps = []
|
||||
|
||||
# The second entry is for 'Big Money', which comes from a different site, uses different markup
|
||||
keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body']}),
|
||||
dict(attrs={ 'id':['content']}) ]
|
||||
|
||||
# The second entry is for 'Big Money', which comes from a different site, uses different markup
|
||||
remove_tags = [dict(attrs={ 'id':['toolbox','recommend_tab','insider_ad_wrapper',
|
||||
'article_bottom_tools_cntr','fray_article_discussion','fray_article_links','bottom_sponsored_links','author_bio',
|
||||
'bizbox_links_bottom','ris_links_wrapper','BOXXLE',
|
||||
'comments_button','add_comments_button','comments-to-fray','marriott_ad',
|
||||
'article_bottom_tools','recommend_tab2','fbog_article_bottom_cntr']}),
|
||||
dict(attrs={ 'id':['content-top','service-links-bottom','hed']}) ]
|
||||
|
||||
excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
|
||||
excludedTitleKeywords = ['Gabfest','Slate V','on Twitter']
|
||||
excludedAuthorKeywords = []
|
||||
excludedContentKeywords = ['http://twitter.com/Slate']
|
||||
|
||||
extra_css = '''
|
||||
.h1_subhead{font-family:Arial; font-size:small; }
|
||||
h1{font-family:Verdana; font-size:large; }
|
||||
.byline {font-family:Georgia; margin-bottom: 0px; }
|
||||
.dateline {font-family:Arial; font-size: smaller; height: 0pt;}
|
||||
.imagewrapper {font-family:Verdana;font-size:x-small; }
|
||||
.source {font-family:Verdana; font-size:x-small;}
|
||||
.credit {font-family:Verdana; font-size: smaller;}
|
||||
#article_body {font-family:Verdana; }
|
||||
#content {font-family:Arial; }
|
||||
.caption{font-family:Verdana;font-style:italic; font-size:x-small;}
|
||||
h3{font-family:Arial; font-size:small}
|
||||
'''
|
||||
|
||||
# Local variables to extend class
|
||||
baseURL = 'http://slate.com'
|
||||
section_dates = []
|
||||
|
||||
# class extension methods
|
||||
def tag_to_strings(self, tag):
|
||||
if not tag:
|
||||
return ''
|
||||
if isinstance(tag, basestring):
|
||||
return tag
|
||||
strings = []
|
||||
for item in tag.contents:
|
||||
if isinstance(item, (NavigableString, CData)):
|
||||
strings.append(item.string)
|
||||
elif isinstance(item, Tag):
|
||||
res = self.tag_to_string(item,use_alt=False)
|
||||
if res:
|
||||
strings.append(res)
|
||||
return strings
|
||||
|
||||
def extract_named_sections(self):
|
||||
soup = self.index_to_soup( self.baseURL )
|
||||
soup_nav_bar = soup.find(True, attrs={'id':'nav'})
|
||||
briefing_nav = soup.find('li')
|
||||
briefing_url = briefing_nav.a['href']
|
||||
for section_nav in soup_nav_bar.findAll('li'):
|
||||
section_name = self.tag_to_string(section_nav,use_alt=False)
|
||||
self.section_dates.append(section_name)
|
||||
|
||||
soup = self.index_to_soup(briefing_url)
|
||||
|
||||
self.log("Briefing url = %s " % briefing_url)
|
||||
section_lists = soup.findAll('ul','view_links_list')
|
||||
|
||||
sections = []
|
||||
for section in section_lists :
|
||||
sections.append(section)
|
||||
return sections
|
||||
|
||||
|
||||
def extract_dated_sections(self):
|
||||
soup = self.index_to_soup( self.baseURL )
|
||||
soup_top_stories = soup.find(True, attrs={'id':'tap3_cntr'})
|
||||
if soup_top_stories:
|
||||
self.section_dates.append("Top Stories")
|
||||
self.log("SELECTION TOP STORIES %s" % "Top Stories")
|
||||
|
||||
soup = soup.find(True, attrs={'id':'toc_links_container'})
|
||||
|
||||
todays_section = soup.find(True, attrs={'class':'todaydateline'})
|
||||
self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
|
||||
self.log("SELECTION DATE %s" % self.tag_to_string(todays_section,use_alt=False))
|
||||
|
||||
older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
|
||||
for older_section in older_section_dates :
|
||||
self.section_dates.append(self.tag_to_string(older_section,use_alt=False))
|
||||
self.log("SELECTION DATE %s" % self.tag_to_string(older_section,use_alt=False))
|
||||
|
||||
if soup_top_stories:
|
||||
headline_stories = soup_top_stories
|
||||
self.log("HAVE top_stories")
|
||||
else:
|
||||
headline_stories = None
|
||||
self.log("NO top_stories")
|
||||
section_lists = soup.findAll('ul')
|
||||
# Prepend the headlines to the first section
|
||||
if headline_stories:
|
||||
section_lists.insert(0,headline_stories)
|
||||
|
||||
sections = []
|
||||
for section in section_lists :
|
||||
sections.append(section)
|
||||
return sections
|
||||
|
||||
|
||||
def extract_section_articles(self, sections_html) :
|
||||
# Find the containers with section content
|
||||
sections = sections_html
|
||||
|
||||
articles = {}
|
||||
key = None
|
||||
def parse_index(self) :
|
||||
ans = []
|
||||
|
||||
for (i,section) in enumerate(sections) :
|
||||
|
||||
# Get the section name
|
||||
if section.has_key('id') :
|
||||
self.log("PROCESSING SECTION id = %s" % section['id'])
|
||||
key = self.section_dates[i]
|
||||
if key.startswith("Pod"):
|
||||
continue
|
||||
if key.startswith("Blog"):
|
||||
continue
|
||||
articles[key] = []
|
||||
ans.append(key)
|
||||
elif self.slate_complete:
|
||||
key = self.section_dates[i]
|
||||
if key.startswith("Pod"):
|
||||
continue
|
||||
if key.startswith("Blog"):
|
||||
continue
|
||||
self.log("PROCESSING SECTION name = %s" % key)
|
||||
articles[key] = []
|
||||
ans.append(key)
|
||||
else :
|
||||
self.log("SECTION %d HAS NO id" % i);
|
||||
continue
|
||||
|
||||
# Get the section article_list
|
||||
article_list = section.findAll('li')
|
||||
|
||||
# Extract the article attributes
|
||||
for article in article_list :
|
||||
bylines = self.tag_to_strings(article)
|
||||
url = article.a['href']
|
||||
title = bylines[0]
|
||||
full_title = self.tag_to_string(article,use_alt=False)
|
||||
#self.log("ARTICLE TITLE%s" % title)
|
||||
#self.log("ARTICLE FULL_TITLE%s" % full_title)
|
||||
#self.log("URL %s" % url)
|
||||
author = None
|
||||
description = None
|
||||
pubdate = None
|
||||
|
||||
if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") > 0 :
|
||||
description = "A summary of what's in the major U.S. newspapers."
|
||||
|
||||
if len(bylines) == 3 :
|
||||
author = bylines[2].strip()
|
||||
author = re.sub('[\r][\n][\t][\t\t]','', author)
|
||||
author = re.sub(',','', author)
|
||||
if bylines[1] is not None :
|
||||
description = bylines[1]
|
||||
full_byline = self.tag_to_string(article)
|
||||
if full_byline.find('major U.S. newspapers') > 0 :
|
||||
description = "A summary of what's in the major U.S. newspapers."
|
||||
|
||||
if len(bylines) > 3 and author is not None:
|
||||
author += " | "
|
||||
for (i,substring) in enumerate(bylines[3:]) :
|
||||
#print "substring: %s" % substring.encode('cp1252')
|
||||
author += substring.strip()
|
||||
if i < len(bylines[3:]) :
|
||||
author += " | "
|
||||
|
||||
# Skip articles whose descriptions contain excluded keywords
|
||||
if description is not None and len(self.excludedDescriptionKeywords):
|
||||
excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
|
||||
found_excluded = excluded.search(description)
|
||||
if found_excluded :
|
||||
self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
|
||||
continue
|
||||
|
||||
# Skip articles whose title contain excluded keywords
|
||||
if full_title is not None and len(self.excludedTitleKeywords):
|
||||
excluded = re.compile('|'.join(self.excludedTitleKeywords))
|
||||
#self.log("evaluating full_title: %s" % full_title)
|
||||
found_excluded = excluded.search(full_title)
|
||||
if found_excluded :
|
||||
self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
|
||||
continue
|
||||
|
||||
# Skip articles whose author contain excluded keywords
|
||||
if author is not None and len(self.excludedAuthorKeywords):
|
||||
excluded = re.compile('|'.join(self.excludedAuthorKeywords))
|
||||
found_excluded = excluded.search(author)
|
||||
if found_excluded :
|
||||
self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
|
||||
continue
|
||||
|
||||
skip_this_article = False
|
||||
# Check to make sure we're not adding a duplicate
|
||||
for article in articles[key] :
|
||||
if article['url'] == url :
|
||||
skip_this_article = True
|
||||
self.log("SKIPPING DUP %s" % url)
|
||||
break
|
||||
|
||||
if skip_this_article :
|
||||
continue
|
||||
|
||||
# Build the dictionary entry for this article
|
||||
feed = key
|
||||
if not articles.has_key(feed) :
|
||||
articles[feed] = []
|
||||
articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
|
||||
author=author, content=''))
|
||||
#self.log("KEY %s" % feed)
|
||||
#self.log("APPENDED %s" % url)
|
||||
# Promote 'newspapers' to top
|
||||
for (i,article) in enumerate(articles[feed]) :
|
||||
if article['description'] is not None :
|
||||
if article['description'].find('newspapers') > 0 :
|
||||
articles[feed].insert(0,articles[feed].pop(i))
|
||||
|
||||
|
||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||
for sectitle, url in (
|
||||
('News & Politics', '/articles/news_and_politics.html'),
|
||||
('Technology', '/articles/technology.html'),
|
||||
('Business', '/articles/business.html'),
|
||||
('Arts', '/articles/arts.html'),
|
||||
('Life', '/articles/life.html'),
|
||||
('Health & Science', '/articles/health_and_science.html'),
|
||||
('Sports', '/articles/sports.html'),
|
||||
('Double X', '/articles/double_x.html'),
|
||||
):
|
||||
url = self.INDEX + url
|
||||
self.log('Found section:', sectitle)
|
||||
articles = self.slate_section_articles(self.index_to_soup(url))
|
||||
if articles:
|
||||
ans.append((sectitle, articles))
|
||||
return ans
|
||||
|
||||
def print_version(self, url):
    """Return ``url`` with the ``'pagenum/all/'`` suffix appended.

    The suffix is concatenated verbatim, so ``url`` must already end with
    a slash for the result to be well formed.
    NOTE(review): presumably this selects the all-pages-on-one view of an
    article -- confirm against the site.
    """
    return url + 'pagenum/all/'
|
||||
|
||||
# Class methods
|
||||
def parse_index(self):
    """Build the article index for the recipe.

    Chooses the section extractor according to ``self.slate_complete``
    (named sections when true, dated sections otherwise) and returns
    whatever ``self.extract_section_articles`` produces for those sections.
    """
    if self.slate_complete:
        sections = self.extract_named_sections()
    else:
        sections = self.extract_dated_sections()
    return self.extract_section_articles(sections)
|
||||
def slate_section_articles(self, soup):
    """Collect the articles listed in the ``most_read`` div of a section page.

    :param soup: parsed section page.
    :return: list of dicts with the keys ``title``, ``description``,
             ``date`` (always ``''``) and ``url``, plus ``author`` when an
             ``<a rel="author">`` with non-empty text is found next to the
             headline. Each URL appears at most once. An empty list is
             returned when the page has no ``most_read`` div.
    """
    cont = soup.find('div', id='most_read')
    if cont is None:
        # Without this guard a missing container raised AttributeError.
        return []

    seen = set()
    ans = []
    for h4 in cont.findAll('h4'):
        link = h4.find('a', href=True)
        if link is None:
            continue
        url = link['href']
        if url.startswith('/'):
            # Site-relative link: make it absolute.
            url = self.INDEX + url
        if url in seen:
            continue
        seen.add(url)

        title = self.tag_to_string(link)
        parent = h4.parent

        # The description, when present, is the <h3> sharing the parent.
        h3 = parent.find('h3')
        desc = '' if h3 is None else self.tag_to_string(h3)

        author_tag = parent.find('a', rel='author')
        author = None if author_tag is None else self.tag_to_string(author_tag)

        art = {'title': title, 'description': desc, 'date': '', 'url': url}
        if author:
            art['author'] = author
        self.log('\tFound article:', title, ' by ', author)
        ans.append(art)
    return ans
|
||||
|
||||
def get_masthead_url(self):
|
||||
masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
|
||||
@ -299,153 +93,4 @@ class Slate(BasicNewsRecipe):
|
||||
masthead = None
|
||||
return masthead
|
||||
|
||||
def stripAnchors(self, soup):
    """Replace text-only links in the article body with their contents.

    Looks up the first ``<div>`` whose id is ``article_body`` or
    ``content``; inside each of its ``<p>`` elements every ``<a>`` that
    has no ``<img>`` child is replaced by its rendered contents (decoded
    as UTF-8, undecodable bytes replaced). Links wrapping an image are
    left alone.

    :param soup: parsed article, modified in place.
    :return: the same ``soup`` object.
    """
    body = soup.find('div', attrs={'id': ['article_body', 'content']})
    if body is None:
        return soup

    for para in body.findAll('p'):
        for a in para.findAll('a'):
            if a.img is None:
                a.replaceWith(a.renderContents().decode('utf-8', 'replace'))
    return soup
|
||||
|
||||
def preprocess_html(self, soup):
    """Clean up a downloaded article before conversion.

    Steps, in order:

    1. drop every ``<img>`` whose markup mentions ``grayPlus4.png``;
    2. reject the article (by raising) when its markup matches any of
       ``self.excludedContentKeywords``;
    3. for pages whose ``<head>`` mentions www.thebigmoney.com, copy the
       ``byline``/``dateline`` ids into ``class`` attributes, mark the
       ``content`` div as ``article_body`` and prepend a synthetic
       department kicker to ``<body>``;
    4. strip text-only anchors via ``self.stripAnchors``.

    :param soup: parsed article, modified in place.
    :return: the result of ``self.stripAnchors(soup)``.
    :raises Exception: ``'Rejected article'`` when an excluded content
                       keyword is found.
    """
    # Remove 'grayPlus4.png' images
    for img in soup.findAll('img'):
        if 'grayPlus4.png' in str(img):
            img.extract()

    # Delete article based upon content keywords.
    # The guard must test the same list that is compiled: an empty list
    # joins to '' and that pattern matches (and rejects) every article.
    if self.excludedContentKeywords:
        excluded = re.compile('|'.join(self.excludedContentKeywords))
        if excluded.search(str(soup)):
            self.log("No allowed content found, removing article")
            raise Exception('Rejected article')

    # Articles from www.thebigmoney.com use different tagging for byline,
    # dateline and body
    head = soup.find('head')
    if (head is not None and head.link is not None
            and re.search(r'www\.thebigmoney\.com', str(head))):
        byline = soup.find('div', attrs={'id': 'byline'})
        if byline is not None:
            byline['class'] = byline['id']

        dateline = soup.find('div', attrs={'id': 'dateline'})
        if dateline is not None:
            dateline['class'] = dateline['id']

        body = soup.find('div', attrs={'id': 'content'})
        if body is not None:
            body['class'] = 'article_body'

        # Synthesize a department kicker
        h3Tag = Tag(soup, 'h3')
        emTag = Tag(soup, 'em')
        emTag.insert(0, NavigableString("the big money: Today's business press"))
        h3Tag.insert(0, emTag)
        soup.body.insert(0, h3Tag)

    # Strip anchors from HTML
    return self.stripAnchors(soup)
|
||||
|
||||
def postprocess_html(self, soup, first_fetch):
    """Normalise kicker, byline, dateline, captions and photos of an article.

    :param soup: parsed article, modified in place.
    :param first_fetch: not used by this implementation.
    :return: ``soup``, or ``None`` when the page has no
             ``department_kicker`` div.
             NOTE(review): what the caller does with ``None`` is not
             visible here -- confirm it is the intended way to drop a page.
    """
    # Fix up dept_kicker as <h3><em>
    dept_kicker = soup.find('div', attrs={'class': 'department_kicker'})
    if dept_kicker is None:
        self.log("No kicker--return null")
        return None

    kicker_strings = self.tag_to_strings(dept_kicker)
    # The first two strings are skipped and every '.' is removed.
    kicker = ''.join(kicker_strings[2:]).replace('.', '')
    h3Tag = Tag(soup, "h3")
    emTag = Tag(soup, "em")
    emTag.insert(0, NavigableString(kicker))
    h3Tag.insert(0, emTag)
    dept_kicker.replaceWith(h3Tag)

    # Fix up the concatenated byline and dateline: each is rebuilt as a
    # plain <div class="..."> holding only the text of the original tag.
    for css_class in ('byline', 'dateline'):
        old_tag = soup.find(True, attrs={'class': css_class})
        if old_tag is not None:
            new_tag = Tag(soup, 'div')
            new_tag['class'] = css_class
            new_tag.insert(0, self.tag_to_string(old_tag))
            old_tag.replaceWith(new_tag)

    # Change captions to italic, add <hr>
    for caption in soup.findAll(True, {'class': 'caption'}):
        emTag = Tag(soup, "em")
        emTag.insert(0, '<br />' + self.tag_to_string(caption))
        hrTag = Tag(soup, 'hr')
        emTag.insert(1, hrTag)
        caption.replaceWith(emTag)

    # Fix photos: unwrap <span class="imagewrapper"><a><img> into a div
    # that contains only the image.
    for photo in soup.findAll('span', attrs={'class': 'imagewrapper'}):
        if photo.a is not None and photo.a.img is not None:
            divTag = Tag(soup, 'div')
            divTag['class'] = 'imagewrapper'
            divTag.insert(0, photo.a.img)
            photo.replaceWith(divTag)

    return soup
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
    """Fill in missing author and description entries of the book's TOC.

    For every article in ``oeb.toc`` whose ``author`` or ``description``
    is ``None``, the value is extracted from the article's HTML in
    ``oeb.manifest.hrefs``. TOCs of depth 2 (articles directly under the
    root) and depth 3 (sections containing articles) are handled; any
    other depth is left untouched.

    :param oeb: book object providing ``toc`` and ``manifest.hrefs``.
    :param opts: not used by this implementation.
    :param log: not used by this implementation.
    """

    def extract_byline(href):
        # Text of the first element with class 'byline', or None.
        soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
        byline = soup.find(True, attrs={'class': 'byline'})
        if byline is None:
            return None
        return self.tag_to_string(byline, use_alt=False)

    def extract_description(href):
        # Truncated text of the first paragraph that is neither a
        # byline/dateline line nor contains an HTML comment, or None.
        soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
        for p in soup.findAll('p'):
            text = self.tag_to_string(p, use_alt=False)
            if text.startswith('By ') or text.startswith('Posted '):
                continue
            if p.find(text=lambda t: isinstance(t, Comment)) is not None:
                continue
            return text[:self.summary_length] + '...'
        return None

    def fill_missing(article):
        # Only fields that are None are replaced.
        if article.author is None:
            article.author = extract_byline(article.href)
        if article.description is None:
            article.description = extract_description(article.href)

    # Method entry point here
    # Single section toc looks different than multi-section tocs
    depth = oeb.toc.depth()
    if depth == 2:
        for article in oeb.toc:
            fill_missing(article)
    elif depth == 3:
        for section in oeb.toc:
            for article in section:
                fill_missing(article)
|
||||
|
||||
|
@ -22,10 +22,10 @@ class Smh_au(BasicNewsRecipe):
|
||||
remove_empty_feeds = True
|
||||
masthead_url = 'http://images.smh.com.au/2010/02/02/1087188/smh-620.jpg'
|
||||
publication_type = 'newspaper'
|
||||
extra_css = """
|
||||
h1{font-family: Georgia,"Times New Roman",Times,serif }
|
||||
body{font-family: Arial,Helvetica,sans-serif}
|
||||
.cT-imageLandscape,.cT-imagePortrait{font-size: x-small}
|
||||
extra_css = """
|
||||
h1{font-family: Georgia,"Times New Roman",Times,serif }
|
||||
body{font-family: Arial,Helvetica,sans-serif}
|
||||
.cT-imageLandscape,.cT-imagePortrait{font-size: x-small}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
@ -35,16 +35,16 @@ class Smh_au(BasicNewsRecipe):
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'id':['googleAds','moreGoogleAds','comments']})
|
||||
,dict(name='div', attrs={'class':'cT-imageMultimedia'})
|
||||
,dict(name=['object','embed','iframe'])
|
||||
]
|
||||
remove_tags_after = [dict(name='div',attrs={'class':'articleBody'})]
|
||||
keep_only_tags = [dict(name='div',attrs={'id':'content'})]
|
||||
remove_tags = [
|
||||
dict(attrs={'class':'hidden'}),
|
||||
dict(name=['link','meta','base','embed','object','iframe'])
|
||||
remove_tags = [
|
||||
dict(name='div',
|
||||
attrs={'id':['googleAds','moreGoogleAds','comments',
|
||||
'video-player-content']}),
|
||||
dict(name='div', attrs={'class':'cT-imageMultimedia'}),
|
||||
dict(name=['object','embed','iframe']),
|
||||
dict(attrs={'class':'hidden'}),
|
||||
dict(name=['link','meta','base','embed','object','iframe'])
|
||||
]
|
||||
remove_attributes = ['width','height','lang']
|
||||
|
||||
@ -84,4 +84,4 @@ class Smh_au(BasicNewsRecipe):
|
||||
if not item.has_key('alt'):
|
||||
item['alt'] = 'image'
|
||||
return soup
|
||||
|
||||
|
||||
|
@ -10,7 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Sueddeutsche(BasicNewsRecipe):
|
||||
|
||||
title = u'Süddeutsche'
|
||||
title = u'sueddeutsche.de'
|
||||
description = 'News from Germany'
|
||||
__author__ = 'Oliver Niesner and Armin Geller'
|
||||
use_embedded_content = False
|
||||
@ -62,7 +62,7 @@ class Sueddeutsche(BasicNewsRecipe):
|
||||
(u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'),
|
||||
(u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'),
|
||||
(u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'),
|
||||
(u'München&Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'),
|
||||
(u'München & Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'),
|
||||
(u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'),
|
||||
(u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'),
|
||||
(u'Digital', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EDigital%24?output=rss'),
|
||||
@ -75,7 +75,7 @@ class Sueddeutsche(BasicNewsRecipe):
|
||||
(u'Job', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EJob%24?output=rss'), # sometimes only
|
||||
(u'Service', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EService%24?output=rss'), # sometimes only
|
||||
(u'Verlag', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EVerlag%24?output=rss'), # sometimes only
|
||||
]
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
main, sep, id = url.rpartition('/')
|
||||
|
@ -9,4 +9,6 @@ class Tablety_pl(BasicNewsRecipe):
|
||||
language = 'pl'
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
keep_only_tags=[dict(name='header', attrs={'class':'entry-header'}), dict(name='div', attrs={'class':'entry-content clearfix'})]
|
||||
remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'}), dict(name='span', attrs={'class':'dsq-postid'})]
|
||||
feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]
|
||||
|
@ -3,7 +3,7 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class TelepolisNews(BasicNewsRecipe):
|
||||
title = u'Telepolis (News+Artikel)'
|
||||
title = u'Telepolis'
|
||||
__author__ = 'syntaxis'
|
||||
publisher = 'Heise Zeitschriften Verlag GmbH & Co KG'
|
||||
description = 'News from Telepolis'
|
||||
@ -15,11 +15,8 @@ class TelepolisNews(BasicNewsRecipe):
|
||||
encoding = "utf-8"
|
||||
language = 'de'
|
||||
|
||||
|
||||
remove_empty_feeds = True
|
||||
|
||||
|
||||
|
||||
keep_only_tags = [dict(name = 'div',attrs={'class':'head'}),dict(name = 'div',attrs={'class':'leftbox'}),dict(name='td',attrs={'class':'strict'})]
|
||||
remove_tags = [ dict(name='td',attrs={'class':'blogbottom'}),
|
||||
dict(name='div',attrs={'class':'forum'}), dict(name='div',attrs={'class':'social'}),dict(name='div',attrs={'class':'blog-letter p-news'}),
|
||||
@ -28,7 +25,6 @@ class TelepolisNews(BasicNewsRecipe):
|
||||
|
||||
remove_tags_after = [dict(name='span', attrs={'class':['breadcrumb']})]
|
||||
|
||||
|
||||
feeds = [(u'News', u'http://www.heise.de/tp/news-atom.xml')]
|
||||
|
||||
html2lrf_options = [
|
||||
@ -39,8 +35,7 @@ class TelepolisNews(BasicNewsRecipe):
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
    """Insert a Content-Type ``<meta>`` declaring ``self.encoding``.

    The tag is inserted as the first child of ``soup.head``. Documents
    without a ``<head>`` are returned unchanged instead of raising.

    :param soup: parsed article, modified in place.
    :return: the same ``soup`` object.
    """
    mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
    if soup.head is not None:
        soup.head.insert(0, mtag)
    return soup
|
@ -9,11 +9,16 @@ class TimesOfIndia(BasicNewsRecipe):
|
||||
max_articles_per_feed = 25
|
||||
|
||||
no_stylesheets = True
|
||||
keep_only_tags = [{'class':['maintable12', 'prttabl']}]
|
||||
remove_attributes = ['style']
|
||||
keep_only_tags = [
|
||||
{'class':re.compile(r'maintable12|prttabl')},
|
||||
{'id':['mod-article-header',
|
||||
'mod-a-body-after-first-para', 'mod-a-body-first-para']},
|
||||
]
|
||||
remove_tags = [
|
||||
dict(style=lambda x: x and 'float' in x),
|
||||
{'class':['prvnxtbg', 'footbdrin', 'bcclftr']},
|
||||
]
|
||||
{'class':re.compile('tabsintbgshow|prvnxtbg')},
|
||||
{'id':['fbrecommend', 'relmaindiv']}
|
||||
]
|
||||
|
||||
feeds = [
|
||||
('Top Stories',
|
||||
@ -41,6 +46,8 @@ class TimesOfIndia(BasicNewsRecipe):
|
||||
]
|
||||
|
||||
def get_article_url(self, article):
|
||||
# Times of India sometimes serves an ad page instead of the article,
|
||||
# this code, detects and circumvents that
|
||||
url = BasicNewsRecipe.get_article_url(self, article)
|
||||
if '/0Ltimesofindia' in url:
|
||||
url = url.partition('/0L')[-1]
|
||||
@ -61,6 +68,3 @@ class TimesOfIndia(BasicNewsRecipe):
|
||||
|
||||
return url
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
    """Return ``soup`` unchanged; this recipe does no HTML preprocessing."""
    return soup
|
||||
|
17
recipes/wow.recipe
Normal file
@ -0,0 +1,17 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class WoW(BasicNewsRecipe):
    # Feed-based recipe for WoW Insider (wow.joystiq.com).

    # Identification
    __author__ = 'Krittika Goyal'
    title = u'WoW Insider'
    language = 'en'

    # Article selection
    oldest_article = 1  # days
    max_articles_per_feed = 25
    use_embedded_content = False

    # Output clean-up
    no_stylesheets = True
    auto_cleanup = True

    # (feed title, feed URL) pairs
    feeds = [('WoW', 'http://wow.joystiq.com/rss.xml')]
|
@ -1,6 +1,10 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
'''
|
||||
Changelog:
|
||||
2011-09-24
|
||||
Changed cover (drMerry)
|
||||
'''
|
||||
'''
|
||||
Fetch xkcd.
|
||||
'''
|
||||
@ -9,9 +13,10 @@ import time, re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class XkcdCom(BasicNewsRecipe):
|
||||
cover_url = 'http://imgs.xkcd.com/s/9be30a7.png'
|
||||
title = 'xkcd'
|
||||
description = 'A webcomic of romance and math humor.'
|
||||
__author__ = 'Martin Pitt'
|
||||
__author__ = 'Martin Pitt updated by DrMerry.'
|
||||
language = 'en'
|
||||
|
||||
use_embedded_content = False
|
||||
|
@ -285,6 +285,15 @@ function booklist(hide_sort) {
|
||||
first_page();
|
||||
}
|
||||
|
||||
// Render the result of a search: build the book list when the first
// result page has content, otherwise show a "nothing found" message.
function search_result() {
    var first_page_html = $("#booklist #page0").html();
    if (first_page_html) {
        booklist();
    } else {
        $("#booklist").html("No books found matching this query");
    }
}
|
||||
|
||||
function show_details(a_dom) {
|
||||
var book = $(a_dom).closest('div.summary');
|
||||
var bd = $('#book_details_dialog');
|
||||
|
BIN
resources/images/devices/boox.jpg
Normal file
After Width: | Height: | Size: 3.0 KiB |
13
session.vim
@ -2,7 +2,7 @@
|
||||
let g:pyflakes_builtins = ["_", "dynamic_property", "__", "P", "I", "lopen", "icu_lower", "icu_upper", "icu_title", "ngettext"]
|
||||
|
||||
python << EOFPY
|
||||
import os
|
||||
import os, sys
|
||||
|
||||
import vipy
|
||||
|
||||
@ -11,15 +11,20 @@ project_dir = os.path.dirname(source_file)
|
||||
src_dir = os.path.abspath(os.path.join(project_dir, 'src'))
|
||||
base_dir = os.path.join(src_dir, 'calibre')
|
||||
|
||||
sys.path.insert(0, src_dir)
|
||||
sys.resources_location = os.path.join(project_dir, 'resources')
|
||||
sys.extensions_location = os.path.join(base_dir, 'plugins')
|
||||
sys.executables_location = os.environ.get('CALIBRE_EXECUTABLES_PATH', '/usr/bin')
|
||||
|
||||
vipy.session.initialize(project_name='calibre', src_dir=src_dir,
|
||||
project_dir=project_dir, base_dir=base_dir)
|
||||
project_dir=project_dir, base_dir=project_dir)
|
||||
|
||||
def recipe_title_callback(raw):
    """Turn the matched ``title = ...`` expression of a recipe into a name.

    :param raw: UTF-8 encoded bytes of the Python expression to the right
                of ``title =``.
    :return: the evaluated title with spaces replaced by underscores.
    """
    # NOTE(review): eval() executes whatever expression the recipe file
    # contains. Acceptable only while the recipes directory is trusted;
    # ast.literal_eval would be safer if titles are always plain literals
    # -- confirm before switching.
    return eval(raw.decode('utf-8')).replace(' ', '_')
|
||||
|
||||
vipy.session.add_content_browser('.r', ',r', 'Recipe',
|
||||
vipy.session.add_content_browser('<leader>r', 'Recipe',
|
||||
vipy.session.glob_based_iterator(os.path.join(project_dir, 'recipes', '*.recipe')),
|
||||
vipy.session.regexp_based_matcher(r'title\s*=\s*(?P<title>.+)', 'title', recipe_title_callback))
|
||||
EOFPY
|
||||
|
||||
nmap \log :enew<CR>:read ! bzr log -l 500 ../.. <CR>:e ../../Changelog.yaml<CR>:e constants.py<CR>
|
||||
nmap \log :enew<CR>:read ! bzr log -l 500 <CR>:e Changelog.yaml<CR>:e src/calibre/constants.py<CR>
|
||||
|
@ -177,6 +177,7 @@ fc_error = None if os.path.exists(os.path.join(fc_inc, 'fontconfig.h')) else \
|
||||
|
||||
|
||||
poppler_error = None
|
||||
poppler_cflags = ['-DPNG_SKIP_SETJMP_CHECK'] if islinux else []
|
||||
if not poppler_inc_dirs or not os.path.exists(
|
||||
os.path.join(poppler_inc_dirs[0], 'OutputDev.h')):
|
||||
poppler_error = \
|
||||
@ -186,6 +187,10 @@ if not poppler_inc_dirs or not os.path.exists(
|
||||
' the poppler XPDF headers. If your distro does not '
|
||||
' include them you will have to re-compile poppler '
|
||||
' by hand with --enable-xpdf-headers')
|
||||
else:
|
||||
lh = os.path.join(poppler_inc_dirs[0], 'Link.h')
|
||||
if 'class AnnotLink' not in open(lh, 'rb').read():
|
||||
poppler_cflags.append('-DPOPPLER_OLD_LINK_TYPE')
|
||||
|
||||
magick_error = None
|
||||
if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
|
||||
@ -220,7 +225,10 @@ except:
|
||||
try:
|
||||
HOST=get_ip_address('wlan0')
|
||||
except:
|
||||
HOST='192.168.1.2'
|
||||
try:
|
||||
HOST=get_ip_address('ppp0')
|
||||
except:
|
||||
HOST='192.168.1.2'
|
||||
|
||||
PROJECT=os.path.basename(os.path.abspath('.'))
|
||||
|
||||
|
@ -11,15 +11,15 @@ from distutils import sysconfig
|
||||
|
||||
from PyQt4.pyqtconfig import QtGuiModuleMakefile
|
||||
|
||||
from setup import Command, islinux, isfreebsd, isbsd, isosx, SRC, iswindows
|
||||
from setup.build_environment import fc_inc, fc_lib, chmlib_inc_dirs, \
|
||||
fc_error, poppler_libs, poppler_lib_dirs, poppler_inc_dirs, podofo_inc, \
|
||||
podofo_lib, podofo_error, poppler_error, pyqt, OSX_SDK, NMAKE, \
|
||||
QMAKE, msvc, MT, win_inc, win_lib, png_inc_dirs, win_ddk, \
|
||||
magick_inc_dirs, magick_lib_dirs, png_lib_dirs, png_libs, \
|
||||
magick_error, magick_libs, ft_lib_dirs, ft_libs, jpg_libs, \
|
||||
jpg_lib_dirs, chmlib_lib_dirs, sqlite_inc_dirs, icu_inc_dirs, \
|
||||
icu_lib_dirs
|
||||
from setup import Command, islinux, isbsd, isosx, SRC, iswindows
|
||||
from setup.build_environment import (fc_inc, fc_lib, chmlib_inc_dirs,
|
||||
fc_error, poppler_libs, poppler_lib_dirs, poppler_inc_dirs, podofo_inc,
|
||||
podofo_lib, podofo_error, poppler_error, pyqt, OSX_SDK, NMAKE,
|
||||
QMAKE, msvc, MT, win_inc, win_lib, png_inc_dirs, win_ddk,
|
||||
magick_inc_dirs, magick_lib_dirs, png_lib_dirs, png_libs,
|
||||
magick_error, magick_libs, ft_lib_dirs, ft_libs, jpg_libs,
|
||||
jpg_lib_dirs, chmlib_lib_dirs, sqlite_inc_dirs, icu_inc_dirs,
|
||||
icu_lib_dirs, poppler_cflags)
|
||||
MT
|
||||
isunix = islinux or isosx or isbsd
|
||||
|
||||
@ -114,7 +114,7 @@ extensions = [
|
||||
lib_dirs=poppler_lib_dirs+magick_lib_dirs+png_lib_dirs+ft_lib_dirs+jpg_lib_dirs,
|
||||
inc_dirs=poppler_inc_dirs+magick_inc_dirs+png_inc_dirs,
|
||||
error=reflow_error,
|
||||
cflags=['-DPNG_SKIP_SETJMP_CHECK'] if islinux else []
|
||||
cflags=poppler_cflags
|
||||
),
|
||||
|
||||
Extension('lzx',
|
||||
@ -336,7 +336,7 @@ class Build(Command):
|
||||
oinc = ['/Fo'+obj] if iswindows else ['-o', obj]
|
||||
cmd = [compiler] + cflags + ext.cflags + einc + sinc + oinc
|
||||
self.info(' '.join(cmd))
|
||||
subprocess.check_call(cmd)
|
||||
self.check_call(cmd)
|
||||
|
||||
dest = self.dest(ext)
|
||||
elib = self.lib_dirs_to_ldflags(ext.lib_dirs)
|
||||
@ -350,18 +350,32 @@ class Build(Command):
|
||||
else:
|
||||
cmd += objects + ext.extra_objs + ['-o', dest] + ldflags + ext.ldflags + elib + xlib
|
||||
self.info('\n\n', ' '.join(cmd), '\n\n')
|
||||
subprocess.check_call(cmd)
|
||||
self.check_call(cmd)
|
||||
if iswindows:
|
||||
#manifest = dest+'.manifest'
|
||||
#cmd = [MT, '-manifest', manifest, '-outputresource:%s;2'%dest]
|
||||
#self.info(*cmd)
|
||||
#subprocess.check_call(cmd)
|
||||
#self.check_call(cmd)
|
||||
#os.remove(manifest)
|
||||
for x in ('.exp', '.lib'):
|
||||
x = os.path.splitext(dest)[0]+x
|
||||
if os.path.exists(x):
|
||||
os.remove(x)
|
||||
|
||||
def check_call(self, *args, **kwargs):
    """Run ``subprocess.check_call`` and print the command line on failure.

    If something is missing (qmake, for example) the error raised by a
    call such as ``self.check_call(qmc + [ext.name+'.pro'])`` does not say
    which command failed, so you would have to look at the source to see
    the actual command. This wrapper prints it before re-raising.

    All arguments are passed through to ``subprocess.check_call``.

    :raises: whatever ``subprocess.check_call`` raises; the exception is
             re-raised unchanged after the command line has been printed.
    """
    try:
        subprocess.check_call(*args, **kwargs)
    except Exception:
        # The command may be given positionally or as the ``args`` keyword.
        cmd = args[0] if args else kwargs.get('args', '')
        if isinstance(cmd, (list, tuple)):
            # Quote arguments containing spaces so the line can be pasted.
            cmdline = ' '.join(
                '"%s"' % arg if ' ' in arg else arg for arg in cmd)
        else:
            # A shell string: joining it would split it into characters.
            cmdline = cmd
        print("Error while executing: %s\n" % (cmdline,))
        raise
|
||||
|
||||
def build_qt_objects(self, ext):
|
||||
obj_pat = 'release\\*.obj' if iswindows else '*.o'
|
||||
objects = glob.glob(obj_pat)
|
||||
@ -380,8 +394,8 @@ class Build(Command):
|
||||
qmc = [QMAKE, '-o', 'Makefile']
|
||||
if iswindows:
|
||||
qmc += ['-spec', 'win32-msvc2008']
|
||||
subprocess.check_call(qmc + [ext.name+'.pro'])
|
||||
subprocess.check_call([make, '-f', 'Makefile'])
|
||||
self.check_call(qmc + [ext.name+'.pro'])
|
||||
self.check_call([make, '-f', 'Makefile'])
|
||||
objects = glob.glob(obj_pat)
|
||||
return list(map(self.a, objects))
|
||||
|
||||
@ -407,7 +421,7 @@ class Build(Command):
|
||||
cmd = [pyqt.sip_bin+exe, '-w', '-c', src_dir, '-b', sbf, '-I'+\
|
||||
pyqt.pyqt_sip_dir] + shlex.split(pyqt.pyqt_sip_flags) + [sipf]
|
||||
self.info(' '.join(cmd))
|
||||
subprocess.check_call(cmd)
|
||||
self.check_call(cmd)
|
||||
module = self.j(src_dir, self.b(dest))
|
||||
if self.newer(dest, [sbf]+qt_objects):
|
||||
mf = self.j(src_dir, 'Makefile')
|
||||
@ -417,7 +431,7 @@ class Build(Command):
|
||||
makefile.extra_include_dirs = ext.inc_dirs
|
||||
makefile.generate()
|
||||
|
||||
subprocess.check_call([make, '-f', mf], cwd=src_dir)
|
||||
self.check_call([make, '-f', mf], cwd=src_dir)
|
||||
shutil.copy2(module, dest)
|
||||
|
||||
def clean(self):
|
||||
@ -457,7 +471,7 @@ class BuildPDF2XML(Command):
|
||||
cmd += ['-I'+x for x in poppler_inc_dirs+magick_inc_dirs]
|
||||
cmd += ['/Fo'+obj, src]
|
||||
self.info(*cmd)
|
||||
subprocess.check_call(cmd)
|
||||
self.check_call(cmd)
|
||||
objects.append(obj)
|
||||
|
||||
if self.newer(dest, objects):
|
||||
@ -470,7 +484,7 @@ class BuildPDF2XML(Command):
|
||||
png_libs+magick_libs+poppler_libs+ft_libs+jpg_libs+pdfreflow_libs]
|
||||
cmd += ['/OUT:'+dest] + objects
|
||||
self.info(*cmd)
|
||||
subprocess.check_call(cmd)
|
||||
self.check_call(cmd)
|
||||
|
||||
self.info('Binary installed as', dest)
|
||||
|
||||
|
@ -20,17 +20,23 @@ for x in [
|
||||
EXCLUDES.extend(['--exclude', x])
|
||||
SAFE_EXCLUDES = ['"%s"'%x if '*' in x else x for x in EXCLUDES]
|
||||
|
||||
def get_rsync_pw():
    """Return the rsync password stored in the buildbot credentials file.

    The file content is split at the first ``':'``; everything after it,
    stripped of surrounding whitespace, is returned. If the file contains
    no colon the result is an empty string.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open('/home/kovid/work/kde/conf/buildbot') as f:
        return f.read().partition(':')[-1].strip()
|
||||
|
||||
class Rsync(Command):
|
||||
|
||||
description = 'Sync source tree from development machine'
|
||||
|
||||
SYNC_CMD = ' '.join(BASE_RSYNC+SAFE_EXCLUDES+
|
||||
['rsync://{host}/work/{project}', '..'])
|
||||
['rsync://buildbot@{host}/work/{project}', '..'])
|
||||
|
||||
def run(self, opts):
|
||||
cmd = self.SYNC_CMD.format(host=HOST, project=PROJECT)
|
||||
env = dict(os.environ)
|
||||
env['RSYNC_PASSWORD'] = get_rsync_pw()
|
||||
self.info(cmd)
|
||||
subprocess.check_call(cmd, shell=True)
|
||||
subprocess.check_call(cmd, shell=True, env=env)
|
||||
|
||||
|
||||
class Push(Command):
|
||||
@ -81,7 +87,8 @@ class VMInstaller(Command):
|
||||
|
||||
|
||||
def get_build_script(self):
|
||||
ans = '\n'.join(self.BUILD_PREFIX)+'\n\n'
|
||||
rs = ['export RSYNC_PASSWORD=%s'%get_rsync_pw()]
|
||||
ans = '\n'.join(self.BUILD_PREFIX + rs)+'\n\n'
|
||||
ans += ' && \\\n'.join(self.BUILD_RSYNC) + ' && \\\n'
|
||||
ans += ' && \\\n'.join(self.BUILD_CLEAN) + ' && \\\n'
|
||||
ans += ' && \\\n'.join(self.BUILD_BUILD) + ' && \\\n'
|
||||
|