[Sync] Sync with trunk

Li Fanxi 2012-02-28 00:12:41 +08:00
commit f40d4b61b3
1079 changed files with 481540 additions and 270445 deletions


@@ -2,6 +2,7 @@
.check-cache.pickle
src/calibre/plugins
resources/images.qrc
src/calibre/ebooks/oeb/display/test/*.js
src/calibre/manual/.build/
src/calibre/manual/cli/
src/calibre/manual/template_ref.rst
@@ -15,6 +16,7 @@ resources/ebook-convert-complete.pickle
resources/builtin_recipes.xml
resources/builtin_recipes.zip
resources/template-functions.json
resources/display/*.js
setup/installer/windows/calibre/build.log
src/calibre/translations/.errors
src/cssutils/.svn/

File diff suppressed because it is too large

File diff suppressed because it is too large

imgsrc/calibreSymbols.spd (new file)

@@ -0,0 +1,152 @@
SplineFontDB: 3.0
FontName: calibreSymbols
FullName: calibre Symbols
FamilyName: calibre Symbols
Weight: Medium
Copyright: Created by Kovid Goyal with FontForge 2.0 (http://fontforge.sf.net)
UComments: "2012-2-27: Created."
Version: 001.000
ItalicAngle: 0
UnderlinePosition: -100
UnderlineWidth: 50
Ascent: 800
Descent: 200
LayerCount: 2
Layer: 0 0 "Back" 1
Layer: 1 0 "Fore" 0
NeedsXUIDChange: 1
XUID: [1021 913 325894820 11538708]
FSType: 0
OS2Version: 0
OS2_WeightWidthSlopeOnly: 0
OS2_UseTypoMetrics: 1
CreationTime: 1330331997
ModificationTime: 1330337167
OS2TypoAscent: 0
OS2TypoAOffset: 1
OS2TypoDescent: 0
OS2TypoDOffset: 1
OS2TypoLinegap: 90
OS2WinAscent: 0
OS2WinAOffset: 1
OS2WinDescent: 0
OS2WinDOffset: 1
HheadAscent: 0
HheadAOffset: 1
HheadDescent: 0
HheadDOffset: 1
MarkAttachClasses: 1
DEI: 91125
Encoding: UnicodeFull
UnicodeInterp: none
NameList: Adobe Glyph List
DisplaySize: -24
AntiAlias: 1
FitToEm: 1
WidthSeparation: 150
WinInfo: 0 75 22
BeginPrivate: 0
EndPrivate
BeginChars: 1114112 3
StartChar: uni2605
Encoding: 9733 9733 0
Width: 933
VWidth: 0
Flags: W
LayerCount: 2
Fore
SplineSet
544.1 344.853 m 1
723.713 360.062 l 2
774.129 364.181 799.969 366.241 801.229 366.241 c 0
816.984 366.241 824.862 359.429 824.862 345.803 c 0
824.862 340.416 823.287 336.218 820.136 333.207 c 0
816.984 330.197 792.878 314.274 747.817 285.438 c 2
596.566 188 l 1
693.461 -56.3096 l 2
694.722 -58.8447 695.353 -62.6465 695.353 -67.7168 c 0
695.353 -72.4697 693.619 -76.5898 690.152 -80.0742 c 0
686.687 -83.5605 682.905 -85.3027 678.81 -85.3027 c 0
675.028 -85.3027 671.089 -83.9561 666.991 -81.2637 c 0
662.896 -78.5693 640.681 -59.7949 600.348 -24.9385 c 2
466.11 91.9873 l 1
333.765 -23.0381 l 2
292.172 -59.1621 269.405 -78.5693 265.467 -81.2637 c 0
261.527 -83.9561 257.667 -85.3027 253.887 -85.3027 c 0
249.475 -85.3027 245.457 -83.4814 241.833 -79.8369 c 0
238.209 -76.1934 236.397 -72.1523 236.397 -67.7168 c 0
236.397 -64.8652 245.379 -40.7832 263.34 4.53027 c 2
335.184 188 l 1
181.096 287.34 l 2
137.61 315.225 114.372 330.593 111.379 333.445 c 0
108.385 336.297 106.888 340.416 106.888 345.803 c 0
106.888 359.745 114.924 366.717 130.994 366.717 c 0
132.255 366.717 154.312 364.815 197.167 361.013 c 2
387.648 344.853 l 1
430.661 528.798 l 2
441.69 576.646 448.544 602.945 451.222 607.699 c 0
453.9 612.452 458.863 614.828 466.11 614.828 c 0
473.674 614.828 478.716 612.215 481.236 606.986 c 0
483.757 601.758 491.005 573.317 502.979 521.667 c 2
544.1 344.853 l 1
EndSplineSet
Validated: 524289
EndChar
StartChar: zero
Encoding: 48 48 1
Width: 1303
VWidth: 2048
Flags: W
HStem: -43.3789 76.7998<582.097 721.09> 623.341 76.7998<582.097 721.091>
VStem: 403.82 97.4395<148.044 508.66> 802.221 96.959<148.044 508.659>
LayerCount: 2
Fore
SplineSet
651.5 623.341 m 0
601.58 623.341 564.061 598.78 538.939 549.66 c 0
513.82 500.541 501.26 426.7 501.26 328.141 c 0
501.26 229.9 513.82 156.221 538.939 107.101 c 0
564.061 57.9805 601.58 33.4209 651.5 33.4209 c 0
701.74 33.4209 739.42 57.9805 764.54 107.101 c 0
789.66 156.221 802.221 229.9 802.221 328.141 c 0
802.221 426.7 789.66 500.541 764.54 549.66 c 0
739.42 598.78 701.74 623.341 651.5 623.341 c 0
651.5 700.141 m 0
731.82 700.141 793.18 668.38 835.58 604.859 c 0
877.979 541.341 899.18 449.101 899.18 328.141 c 0
899.18 207.5 877.979 115.421 835.58 51.9004 c 0
793.18 -11.6201 731.819 -43.3789 651.5 -43.3789 c 0
571.18 -43.3789 509.82 -11.6201 467.42 51.9004 c 0
425.021 115.421 403.82 207.5 403.82 328.141 c 0
403.82 449.101 425.021 541.341 467.42 604.859 c 0
509.82 668.38 571.18 700.141 651.5 700.141 c 0
EndSplineSet
Validated: 524289
EndChar
StartChar: period
Encoding: 46 46 2
Width: 516
VWidth: 2048
Flags: W
HStem: 53.4004 166.199<203.263 309.297>
VStem: 174.6 163.801<82.9501 190.955>
LayerCount: 2
Fore
SplineSet
338.4 142.8 m 0
338.4 119.2 330.5 98.4004 314.7 80.4004 c 0
298.9 62.4004 277 53.4004 249 53.4004 c 0
225.4 53.4004 207.1 61.2002 194.1 76.7998 c 0
181.1 92.4004 174.6 111 174.6 132.6 c 0
174.6 155.8 182.6 176.1 198.6 193.5 c 0
214.6 210.9 236.8 219.6 265.2 219.6 c 0
288.8 219.6 306.9 212.2 319.5 197.4 c 0
332.1 182.6 338.4 164.4 338.4 142.8 c 0
EndSplineSet
Validated: 524289
EndChar
EndChars
EndSplineFont

recipes/20minutes.recipe (new file)

@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2011 Aurélien Chabot <contact@aurelienchabot.fr>'
'''
20minutes.fr
'''

from calibre.web.feeds.recipes import BasicNewsRecipe

class Minutes(BasicNewsRecipe):
    title = '20 minutes'
    __author__ = 'calibre'
    description = 'Actualités'
    encoding = 'cp1252'
    publisher = '20minutes.fr'
    category = 'Actualités, France, Monde'
    language = 'fr'
    use_embedded_content = False
    timefmt = ' [%d %b %Y]'
    max_articles_per_feed = 15
    no_stylesheets = True
    remove_empty_feeds = True
    filterDuplicates = True

    extra_css = '''
        h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
        .mna-details {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
        .mna-image {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
        .mna-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
    '''

    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class':['mn-section-heading']}),
        dict(name='a', attrs={'href':['#commentaires']}),
        dict(name='div', attrs={'class':['mn-right']}),
        dict(name='div', attrs={'class':['mna-box']}),
        dict(name='div', attrs={'class':['mna-comment-call']}),
        dict(name='div', attrs={'class':['mna-tools']}),
        dict(name='div', attrs={'class':['mn-trilist']})
    ]

    keep_only_tags = [dict(id='mn-article')]
    remove_tags_after = dict(name='div', attrs={'class':['mna-body','mna-signature']})

    feeds = [
        ('France', 'http://www.20minutes.fr/rss/actu-france.xml'),
        ('International', 'http://www.20minutes.fr/rss/monde.xml'),
        ('Tech/Web', 'http://www.20minutes.fr/rss/hightech.xml'),
        ('Sciences', 'http://www.20minutes.fr/rss/sciences.xml'),
        ('Economie', 'http://www.20minutes.fr/rss/economie.xml'),
        ('Politique', 'http://www.20minutes.fr/rss/politique.xml'),
        (u'Médias', 'http://www.20minutes.fr/rss/media.xml'),
        ('Cinema', 'http://www.20minutes.fr/rss/cinema.xml'),
        ('People', 'http://www.20minutes.fr/rss/people.xml'),
        ('Culture', 'http://www.20minutes.fr/rss/culture.xml'),
        ('Sport', 'http://www.20minutes.fr/rss/sport.xml'),
        ('Paris', 'http://www.20minutes.fr/rss/paris.xml'),
        ('Lyon', 'http://www.20minutes.fr/rss/lyon.xml'),
        ('Toulouse', 'http://www.20minutes.fr/rss/toulouse.xml')
    ]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
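
The preprocess_html hook above is the usual calibre pattern for post-parse cleanup: the framework hands each article to the recipe as a parsed soup, and whatever the hook returns is what gets converted. A minimal standalone sketch of what this particular hook does, assuming the BeautifulSoup 3 parser calibre bundled at the time (the sample markup is made up):

from BeautifulSoup import BeautifulSoup

html = '<div style="color:red"><p style="font-size:9px">Actu</p></div>'
soup = BeautifulSoup(html)
for item in soup.findAll(style=True):
    del item['style']  # strip inline styles, exactly as the recipe's hook does
print soup  # -> <div><p>Actu</p></div>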


@@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Dean Cording'
__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
'''
abc.net.au/news
'''
@@ -8,7 +8,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
class ABCNews(BasicNewsRecipe):
    title = 'ABC News'
    __author__ = 'Dean Cording'
    __author__ = 'Pat Stapleton, Dean Cording'
    description = 'News from Australia'
    masthead_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
    cover_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
@@ -23,7 +23,9 @@ class ABCNews(BasicNewsRecipe):
    category = 'News, Australia, World'
    language = 'en_AU'
    publication_type = 'newsportal'
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    # preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    # Remove annoying map links (the inline-caption class is also used for some image captions, hence the regex matching maps.google):
    preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
    conversion_options = {
        'comments' : description
        ,'tags' : category
@@ -32,23 +34,23 @@ class ABCNews(BasicNewsRecipe):
        ,'linearize_tables': False
    }
    keep_only_tags = dict(id='article')
    keep_only_tags = [dict(attrs={'class':['article section']})]
    remove_tags = [dict(attrs={'class':['related', 'tags']}),
                   dict(id='statepromo')
                   ]
    remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
                   'inline-content story left', 'inline-content map left contracted', 'published',
                   'story-map', 'statepromo', 'topics', ]})]
    remove_attributes = ['width','height']
    feeds = [
        ('Top Stories', 'http://www.abc.net.au/news/syndicate/topstoriesrss.xml'),
        ('Canberra', 'http://www.abc.net.au/news/indexes/idx-act/rss.xml'),
        ('Sydney', 'http://www.abc.net.au/news/indexes/sydney/rss.xml'),
        ('Melbourne', 'http://www.abc.net.au/news/indexes/melbourne/rss.xml'),
        ('Brisbane', 'http://www.abc.net.au/news/indexes/brisbane/rss.xml'),
        ('Perth', 'http://www.abc.net.au/news/indexes/perth/rss.xml'),
        ('Australia', 'http://www.abc.net.au/news/indexes/idx-australia/rss.xml'),
        ('World', 'http://www.abc.net.au/news/indexes/world/rss.xml'),
        ('Business', 'http://www.abc.net.au/news/indexes/business/rss.xml'),
        ('Science and Technology', 'http://www.abc.net.au/news/tag/science-and-technology/rss.xml'),
        ('Top Stories', 'http://www.abc.net.au/news/feed/45910/rss.xml'),
        ('Canberra', 'http://www.abc.net.au/news/feed/6910/rss.xml'),
        ('Sydney', 'http://www.abc.net.au/news/feed/10232/rss.xml'),
        ('Melbourne', 'http://www.abc.net.au/news/feed/21708/rss.xml'),
        ('Brisbane', 'http://www.abc.net.au/news/feed/12858/rss.xml'),
        ('Perth', 'feed://www.abc.net.au/news/feed/24886/rss.xml'),
        ('Australia', 'http://www.abc.net.au/news/feed/46182/rss.xml'),
        ('World', 'http://www.abc.net.au/news/feed/52278/rss.xml'),
        ('Business', 'http://www.abc.net.au/news/feed/51892/rss.xml'),
        ('Science and Technology', 'http://www.abc.net.au/news/feed/2298/rss.xml'),
    ]
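
The swapped-in preprocess_regexps above runs on each article's raw HTML before parsing: every (regex, callback) pair is applied in order and the match is replaced by the callback's return value, here deleting the Google Maps caption links. A hedged sketch on a made-up fragment:

import re

pat = re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL)
html = 'Story text <a class="inline-caption" href="http://maps.google.com/?q=-33.8,151.2">Map</a> more text'
print pat.sub('', html)  # -> 'Story text  more text'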


@@ -1,19 +1,38 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re

class Adventure_zone(BasicNewsRecipe):
    title = u'Adventure Zone'
    __author__ = 'fenuks'
    description = 'Adventure zone - adventure games from A to Z'
    category = 'games'
    language = 'pl'
    oldest_article = 15
    max_articles_per_feed = 100
    no_stylesheets = True
    oldest_article = 20
    max_articles_per_feed = 100
    use_embedded_content = False
    preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: '')]
    remove_tags_before = dict(name='td', attrs={'class':'main-bg'})
    remove_tags_after = dict(name='td', attrs={'class':'main-body middle-border'})
    remove_tags = [dict(name='img', attrs={'alt':'Drukuj'})]
    remove_tags_after = dict(id='comments')
    extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }'
    feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]

    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        soup = self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php')
        tag = soup.find(name='channel')
        titles = []
        for r in tag.findAll(name='image'):
            r.extract()
        art = tag.findAll(name='item')
        for i in art:
            titles.append(i.title.string)
        for feed in feeds:
            for article in feed.articles[:]:
                article.title = titles[feed.articles.index(article)]
        return feeds

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
        cover = soup.find(id='box_OstatninumerAZ')
@@ -22,17 +41,10 @@ class Adventure_zone(BasicNewsRecipe):
    def skip_ad_pages(self, soup):
        skip_tag = soup.body.findAll(name='a')
        if skip_tag is not None:
            for r in skip_tag:
                if 'articles.php?' in r['href']:
                    if r.strong is not None:
                        word = r.strong.string
                        if ('zapowied' or 'recenzj') in word:
                            return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item_id'+r['href'][r['href'].find('_id')+3:], raw=True)
                        else:
                            None
    def print_version(self, url):
        return url.replace('news.php?readmore', 'print.php?type=N&item_id')
        skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})
        skip_tag = skip_tag.findAll(name='a')
        for r in skip_tag:
            if r.strong:
                word = r.strong.string
                if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
                    return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)


@@ -0,0 +1,50 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
'''
abc.net.au/news
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class TheDailyNewsEG(BasicNewsRecipe):
    title = u'al-masry al-youm'
    __author__ = 'Omm Mishmishah'
    description = 'Independent News from Egypt'
    masthead_url = 'http://www.almasryalyoum.com/sites/default/files/img/english_logo.png'
    cover_url = 'http://www.almasryalyoum.com/sites/default/files/img/english_logo.png'
    auto_cleanup = True
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = False
    #delay = 1
    use_embedded_content = False
    encoding = 'utf8'
    publisher = 'Independent News Egypt'
    category = 'News, Egypt, World'
    language = 'en_EG'
    publication_type = 'newsportal'
    # preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    # Remove annoying map links (the inline-caption class is also used for some image captions, hence the regex matching maps.google):
    preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher' : publisher
        ,'linearize_tables': False
    }
    keep_only_tags = [dict(attrs={'class':['article section']})]
    remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
                   'inline-content story left', 'inline-content map left contracted', 'published',
                   'story-map', 'statepromo', 'topics', ]})]
    remove_attributes = ['width','height']
    feeds = [(u'English News', u'http://www.almasryalyoum.com/en/rss_feed_term/113/rss.xml'),
             (u'News Features', u'http://www.almasryalyoum.com/en/rss_feed_term/115/rss.xml'),
             (u'Culture', u'http://www.almasryalyoum.com/en/rss_feed_term/133/rss.xml'),
             (u'Cinema', u'http://www.almasryalyoum.com/en/rss_feed_term/134/rss.xml')
             ]


@@ -0,0 +1,18 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'

from calibre.web.feeds.news import BasicNewsRecipe

class AlbertMohlersBlog(BasicNewsRecipe):
    title = u'Albert Mohler\'s Blog'
    __author__ = 'Peter Grungi'
    language = 'en'
    oldest_article = 90
    max_articles_per_feed = 10
    auto_cleanup = True
    cover_url = 'http://www.albertmohler.com/wp-content/themes/albert-mohler-v5/img/logo-am-lg.gif'
    publisher = 'Albert Mohler'
    language = 'en'
    author = 'Albert Mohler'
    feeds = [(u'Albert Mohler\'s Blog', u'http://feeds.feedburner.com/AlbertMohlersBlog?format=xml')]


@@ -10,11 +10,11 @@ class Alternet(BasicNewsRecipe):
    category = 'News, Magazine'
    description = 'News magazine and online community'
    feeds = [
              (u'Front Page', u'http://feeds.feedblitz.com/alternet'),
              (u'Breaking News', u'http://feeds.feedblitz.com/alternet_breaking_news'),
              (u'Top Ten Campaigns', u'http://feeds.feedblitz.com/alternet_top_10_campaigns'),
              (u'Special Coverage Areas', u'http://feeds.feedblitz.com/alternet_coverage')
              ]
        (u'Front Page', u'http://feeds.feedblitz.com/alternet'),
        (u'Breaking News', u'http://feeds.feedblitz.com/alternet_breaking_news'),
        (u'Top Ten Campaigns', u'http://feeds.feedblitz.com/alternet_top_10_campaigns'),
        (u'Special Coverage Areas', u'http://feeds.feedblitz.com/alternet_coverage')
        ]
    remove_attributes = ['width', 'align','cellspacing']
    remove_javascript = True
    use_embedded_content = False
@@ -36,3 +36,5 @@ class Alternet(BasicNewsRecipe):
            self.temp_files[-1].write(html)
            self.temp_files[-1].close()
            return self.temp_files[-1].name

    conversion_options = {'linearize_tables': True}


@@ -11,7 +11,6 @@ class AssociatedPress(BasicNewsRecipe):
    language = 'en'
    no_stylesheets = True
    max_articles_per_feed = 15
    html2lrf_options = ['--force-page-break-before-tag="chapter"']
    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in


@@ -7,6 +7,7 @@ class Archeowiesci(BasicNewsRecipe):
    language = 'pl'
    cover_url = 'http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg'
    oldest_article = 7
    needs_subscription = 'optional'
    max_articles_per_feed = 100
    auto_cleanup = True
    remove_tags = [dict(name='span', attrs={'class':['post-ratings', 'post-ratings-loading']})]
@@ -16,6 +17,16 @@ class Archeowiesci(BasicNewsRecipe):
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            for article in feed.articles[:]:
                if 'subskrypcja' in article.title:
                if self.username is None and 'subskrypcja' in article.title:
                    feed.articles.remove(article)
        return feeds

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://archeowiesci.pl/wp-login.php')
            br.select_form(name='loginform')
            br['log'] = self.username
            br['pwd'] = self.password
            br.submit()
        return br


@@ -0,0 +1,51 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.asianreviewofbooks.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class AsianReviewOfBooks(BasicNewsRecipe):
    title = 'The Asian Review of Books'
    __author__ = 'Darko Miletic'
    description = 'In addition to reviewing books about or of relevance to Asia, the Asian Review of Books also features long-format essays by leading Asian writers and thinkers, providing an unparalleled forum for discussion of key contemporary issues by Asians for Asia and a vehicle of intellectual depth and breadth where leading thinkers can write on the books, arts and ideas of the day. Widely quoted and referenced, with an archive of more than one thousand book reviews, it is the only web resource dedicated to Asian books. And now, with the addition of the new premium content, the Asian Review of Books is a must-read publication.'
    publisher = 'The Asian Review of Books'
    category = 'literature, books, reviews, Asia'
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'
    language = 'en_HK'
    publication_type = 'magazine'
    masthead_url = 'http://www.asianreviewofbooks.com/new/images/mob_arb.png'
    extra_css = """
        body{font-family: serif}
        .big {font-size: xx-large}
        .bold {font-weight: bold}
        .italic {font-style: italic}
        .small {font-size: small}
        img {display: block}
    """

    conversion_options = {
        'comment'    : description
        , 'tags'      : category
        , 'publisher' : publisher
        , 'language'  : language
    }

    remove_tags = [dict(name=['object','script','iframe','embed'])]
    remove_attributes = ['style', 'onclick']
    feeds = [(u'Articles', u'http://www.asianreviewofbooks.com/new/rss.php')]

    def print_version(self, url):
        root, sep, artid = url.rpartition('?ID=')
        return root + 'getarticle.php?articleID=' + artid + '&stats=web'

    def preprocess_raw_html(self, raw, url):
        return '<html><head><title>title</title></head><body>' + raw + '</body></html>'
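
The print_version above relies on str.rpartition, which splits on the last occurrence of the separator and returns a (head, separator, tail) triple. A worked example, assuming article URLs end in '?ID=<number>' after a trailing slash (the URL itself is hypothetical):

url = 'http://www.asianreviewofbooks.com/new/?ID=1234'
root, sep, artid = url.rpartition('?ID=')
# root -> 'http://www.asianreviewofbooks.com/new/', artid -> '1234'
print root + 'getarticle.php?articleID=' + artid + '&stats=web'
# -> http://www.asianreviewofbooks.com/new/getarticle.php?articleID=1234&stats=web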


@@ -1,5 +1,4 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AstroNEWS(BasicNewsRecipe):
    title = u'AstroNEWS'
    __author__ = 'fenuks'
@@ -8,11 +7,16 @@ class AstroNEWS(BasicNewsRecipe):
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    auto_cleanup = True
    #extra_css = 'table {text-align: left;}'
    no_stylesheets = True
    cover_url = 'http://news.astronet.pl/img/logo_news.jpg'
    # no_stylesheets = True
    remove_tags = [dict(name='hr')]
    feeds = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')]

    def print_version(self, url):
        return url.replace('astronet.pl/', 'astronet.pl/print.cgi?')

    def preprocess_html(self, soup):
        for item in soup.findAll(align=True):
            del item['align']
        return soup


@@ -1,15 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re

class Astronomia_pl(BasicNewsRecipe):
    title = u'Astronomia.pl'
    __author__ = 'fenuks'
    description = 'Astronomia - Polish astronomy site'
    masthead_url = 'http://www.astronomia.pl/grafika/logo.gif'
    cover_url = 'http://www.astronomia.pl/grafika/logo.gif'
    category = 'astronomy, science'
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    #no_stylesheets = True
    extra_css = '#h2 {font-size: 18px;}'
    no_stylesheets = True
    preprocess_regexps = [(re.compile(ur'<b>Przeczytaj także:.*?</BODY>', re.DOTALL), lambda match: '</BODY>')]
    remove_tags_before = dict(name='div', attrs={'id':'a1'})
    keep_only_tags = [dict(name='div', attrs={'id':['a1', 'h2']})]
    feeds = [(u'Wiadomości z astronomii i astronautyki', u'http://www.astronomia.pl/rss/')]


@@ -10,27 +10,15 @@ class autogids(BasicNewsRecipe):
    publisher = 'AutomatiseringGids'
    category = 'Nieuws, IT, Nederlandstalig'
    simultaneous_downloads = 5
    #delay = 1
    timefmt = ' [%A, %d %B, %Y]'
    #timefmt = ''
    timefmt = ' [%a, %d %B, %Y]'
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    publication_type = 'newspaper'
    encoding = 'utf-8'
    cover_url = 'http://www.automatiseringgids.nl/siteimg/header_logo.gif'
    keep_only_tags = [dict(id=['content'])]
    extra_css = '.artikelheader {font-size:0.8em; color: #666;} .artikelintro {font-weight:bold} div.imgArticle {float: right; margin: 0 0em 1em 1em; display: block; position: relative; } \
                 h2 { margin: 0 0 0.5em; min-height: 30px; font-size: 1.5em; letter-spacing: -0.2px; margin: 0 0 0.5em; color: black; font-weight: bold; line-height: 1.2em; padding: 4px 3px 0; }'
    cover_url = 'http://www.automatiseringgids.nl/binaries/content/gallery/ag/marketing/ag-avatar-100x50.jpg'
    keep_only_tags = [dict(name='div', attrs={'class':['content']})]
    remove_tags = [dict(name='div', attrs={'id':['loginbox','reactiecollapsible','reactiebox']}),
                   dict(name='div', attrs={'class':['column_a','column_c','bannerfullsize','reactieheader','reactiecollapsible','formulier','artikel_headeroptions']}),
                   dict(name='ul', attrs={'class':['highlightlist']}),
                   dict(name='input', attrs={'type':['button']}),
                   dict(name='div', attrs={'style':['display:block; width:428px; height:30px; float:left;']}),
                   ]
    preprocess_regexps = [
        (re.compile(r'(<h3>Reacties</h3>|<h2>Zie ook:</h2>|<div style=".*</div>|<a[^>]*>|</a>)', re.DOTALL|re.IGNORECASE),
         lambda match: ''),


@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
b365.realitatea.net
'''

from calibre.web.feeds.news import BasicNewsRecipe

class b365Realitatea(BasicNewsRecipe):
    title = u'b365 Realitatea'
    __author__ = u'Silviu Cotoar\u0103'
    publisher = u'b365 Realitatea'
    description = u'b365 Realitatea'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare,Romania,Bucuresti'
    encoding = 'utf-8'
    cover_url = 'http://b365.realitatea.net/wp-content/themes/b/images/b365-logo.png'
    conversion_options = {
        'comments'   : description
        ,'tags'      : category
        ,'language'  : language
        ,'publisher' : publisher
    }
    keep_only_tags = [
        dict(name='div', attrs={'class':'newsArticle'})
    ]
    remove_tags = [
        dict(name='div', attrs={'class':'date'})
        , dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})
        , dict(name='div', attrs={'class':'related_posts'})
        , dict(name='div', attrs={'id':'RelevantiWidget'})
    ]
    remove_tags_after = [
        dict(name='div', attrs={'id':'RelevantiWidget'})
    ]
    feeds = [
        (u'\u0218tiri', u'http://b365.realitatea.net/rss-full/')
    ]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)


@@ -1,61 +1,648 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
##
## Title: BBC News, Sport, and Blog Calibre Recipe
## Contact: mattst - jmstanfield@gmail.com
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: mattst - jmstanfield@gmail.com
##
## Written: November 2011
## Last Edited: 2011-11-19
##
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'mattst - jmstanfield@gmail.com'
'''
news.bbc.co.uk
BBC News, Sport, and Blog Calibre Recipe
'''
# Import the regular expressions module.
import re
# Import the BasicNewsRecipe class which this class extends.
from calibre.web.feeds.recipes import BasicNewsRecipe
class BBC(BasicNewsRecipe):
    title = 'BBC News'
    __author__ = 'Darko Miletic, Starson17'
    description = 'News from UK. '
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    #delay = 1
    use_embedded_content = False
    encoding = 'utf8'
    publisher = 'BBC'
    category = 'news, UK, world'
    language = 'en_GB'
    publication_type = 'newsportal'
    extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher' : publisher
        ,'linearize_tables': True
class BBCNewsSportBlog(BasicNewsRecipe):

    #
    # **** IMPORTANT USERS READ ME ****
    #
    # First select the feeds you want then scroll down below the feeds list
    # and select the values you want for the other user preferences, like
    # oldest_article and such like.
    #
    #
    # Select the BBC rss feeds which you want in your ebook.
    # Selected feeds have NO '#' at their start, de-selected feeds begin with a '#'.
    #
    # Eg.  ("News Home", "http://feeds.bbci.co.uk/... - include feed.
    # Eg. #("News Home", "http://feeds.bbci.co.uk/... - do not include feed.
    #
    # There are 68 feeds below which constitute the bulk of the available rss
    # feeds on the BBC web site. These include 5 blogs by editors and
    # correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
    # Wales, Scotland Business), and 7 Welsh language feeds.
    #
    # Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click)
    # so if "oldest_article = 1.5" (only articles published in the last 36 hours)
    # you may get some 'empty feeds' which will not then be included in the ebook.
    #
    # The 15 feeds currently selected below are simply my default ones.
    #
    # Note: With all 68 feeds selected, oldest_article set to 2,
    # max_articles_per_feed set to 100, and simultaneous_downloads set to 10,
    # the ebook creation took 29 minutes on my speedy 100 mbps net connection,
    # fairly high-end desktop PC running Linux (Ubuntu Lucid-Lynx).
    # More realistically with 15 feeds selected, oldest_article set to 1.5,
    # max_articles_per_feed set to 100, and simultaneous_downloads set to 20,
    # it took 6 minutes. If that's too slow, increase 'simultaneous_downloads'.
    #
    # Select / de-select the feeds you want in your ebook.
    #
    feeds = [
        ("News Home", "http://feeds.bbci.co.uk/news/rss.xml"),
        ("UK", "http://feeds.bbci.co.uk/news/uk/rss.xml"),
        ("World", "http://feeds.bbci.co.uk/news/world/rss.xml"),
        #("England", "http://feeds.bbci.co.uk/news/england/rss.xml"),
        #("Scotland", "http://feeds.bbci.co.uk/news/scotland/rss.xml"),
        #("Wales", "http://feeds.bbci.co.uk/news/wales/rss.xml"),
        #("N. Ireland", "http://feeds.bbci.co.uk/news/northern_ireland/rss.xml"),
        #("Africa", "http://feeds.bbci.co.uk/news/world/africa/rss.xml"),
        #("Asia", "http://feeds.bbci.co.uk/news/world/asia/rss.xml"),
        #("Europe", "http://feeds.bbci.co.uk/news/world/europe/rss.xml"),
        #("Latin America", "http://feeds.bbci.co.uk/news/world/latin_america/rss.xml"),
        #("Middle East", "http://feeds.bbci.co.uk/news/world/middle_east/rss.xml"),
        ("US & Canada", "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"),
        ("Politics", "http://feeds.bbci.co.uk/news/politics/rss.xml"),
        ("Science/Environment", "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml"),
        ("Technology", "http://feeds.bbci.co.uk/news/technology/rss.xml"),
        ("Magazine", "http://feeds.bbci.co.uk/news/magazine/rss.xml"),
        ("Entertainment/Arts", "http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"),
        #("Health", "http://feeds.bbci.co.uk/news/health/rss.xml"),
        #("Education/Family", "http://feeds.bbci.co.uk/news/education/rss.xml"),
        ("Business", "http://feeds.bbci.co.uk/news/business/rss.xml"),
        ("Special Reports", "http://feeds.bbci.co.uk/news/special_reports/rss.xml"),
        ("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"),
        #("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"),
        #("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
        ("Blog: Nick Robinson (Political Editor)", "http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"),
        #("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
        #("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
        #("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
        ("Blog: Rory Cellan-Jones (Technology correspondent)", "http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"),
        ("Sport Front Page", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"),
        #("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
        #("Cricket", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml"),
        #("Rugby Union", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml"),
        #("Rugby League", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml"),
        #("Tennis", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml"),
        #("Golf", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml"),
        #("Motorsport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml"),
        #("Boxing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml"),
        #("Athletics", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml"),
        #("Snooker", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml"),
        #("Horse Racing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml"),
        #("Cycling", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml"),
        #("Disability Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml"),
        #("Other Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml"),
        #("Olympics 2012", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/olympics_2012/rss.xml"),
        #("N. Ireland Politics", "http://feeds.bbci.co.uk/news/northern_ireland/northern_ireland_politics/rss.xml"),
        #("Scotland Politics", "http://feeds.bbci.co.uk/news/scotland/scotland_politics/rss.xml"),
        #("Scotland Business", "http://feeds.bbci.co.uk/news/scotland/scotland_business/rss.xml"),
        #("E. Scotland, Edinburgh & Fife", "http://feeds.bbci.co.uk/news/scotland/edinburgh_east_and_fife/rss.xml"),
        #("W. Scotland & Glasgow", "http://feeds.bbci.co.uk/news/scotland/glasgow_and_west/rss.xml"),
        #("Highlands & Islands", "http://feeds.bbci.co.uk/news/scotland/highlands_and_islands/rss.xml"),
        #("NE. Scotland, Orkney & Shetland", "http://feeds.bbci.co.uk/news/scotland/north_east_orkney_and_shetland/rss.xml"),
        #("South Scotland", "http://feeds.bbci.co.uk/news/scotland/south_scotland/rss.xml"),
        #("Central Scotland & Tayside", "http://feeds.bbci.co.uk/news/scotland/tayside_and_central/rss.xml"),
        #("Wales Politics", "http://feeds.bbci.co.uk/news/wales/wales_politics/rss.xml"),
        #("NW. Wales", "http://feeds.bbci.co.uk/news/wales/north_west_wales/rss.xml"),
        #("NE. Wales", "http://feeds.bbci.co.uk/news/wales/north_east_wales/rss.xml"),
        #("Mid. Wales", "http://feeds.bbci.co.uk/news/wales/mid_wales/rss.xml"),
        #("SW. Wales", "http://feeds.bbci.co.uk/news/wales/south_west_wales/rss.xml"),
        #("SE. Wales", "http://feeds.bbci.co.uk/news/wales/south_east_wales/rss.xml"),
        #("Newyddion - News in Welsh", "http://feeds.bbci.co.uk/newyddion/rss.xml"),
        #("Gwleidyddiaeth", "http://feeds.bbci.co.uk/newyddion/gwleidyddiaeth/rss.xml"),
        #("Gogledd-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/gogledd-ddwyrain/rss.xml"),
        #("Gogledd-Orllewin", "http://feeds.bbci.co.uk/newyddion/gogledd-orllewin/rss.xml"),
        #("Canolbarth", "http://feeds.bbci.co.uk/newyddion/canolbarth/rss.xml"),
        #("De-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/de-ddwyrain/rss.xml"),
        #("De-Orllewin", "http://feeds.bbci.co.uk/newyddion/de-orllewin/rss.xml"),
    ]
    # **** SELECT YOUR USER PREFERENCES ****

    # Title to use for the ebook.
    #
    title = 'BBC News'

    # A brief description for the ebook.
    #
    description = u'BBC web site ebook created using rss feeds.'

    # The max number of articles which may be downloaded from each feed.
    # I've never seen more than about 70 articles in a single feed in the
    # BBC feeds.
    #
    max_articles_per_feed = 100

    # The max age of articles which may be downloaded from each feed. This is
    # specified in days - note fractions of days are allowed, Eg. 2.5 (2 and a
    # half days). My default of 1.5 days is the last 36 hours, the point at
    # which I've decided 'news' becomes 'old news', but be warned this is not
    # so good for the blogs, technology, magazine, etc., and sports feeds.
    # You may wish to extend this to 2-5, but watch out: ebook creation time will
    # increase as well. Setting this to 30 will get everything (AFAICT) as long
    # as max_articles_per_feed remains set high (except for 'Click' which is
    # v. low volume and its currently oldest article is 4th Feb 2011).
    #
    oldest_article = 1.5

    # Number of simultaneous downloads. 20 is consistently working fine on the
    # BBC News feeds with no problems. Speeds things up from the default of 5.
    # If you have a lot of feeds and/or have increased oldest_article above 2
    # then you may wish to try increasing simultaneous_downloads to 25-30,
    # Or, of course, if you are in a hurry. [I've not tried beyond 20.]
    #
    simultaneous_downloads = 20

    # Timeout for fetching files from the server in seconds. The default of
    # 120 seconds seems somewhat excessive.
    #
    timeout = 30

    # The format string for the date shown on the ebook's first page.
    # List of all values: http://docs.python.org/library/time.html
    # Default in news.py has a leading space so that's mirrored here.
    # As with 'feeds' select/de-select by adding/removing the initial '#',
    # only one timefmt should be selected, here's a few to choose from.
    #
    timefmt = ' [%a, %d %b %Y]'            # [Fri, 14 Nov 2011] (Calibre default)
    #timefmt = ' [%a, %d %b %Y %H:%M]'     # [Fri, 14 Nov 2011 18:30]
    #timefmt = ' [%a, %d %b %Y %I:%M %p]'  # [Fri, 14 Nov 2011 06:30 PM]
    #timefmt = ' [%d %b %Y]'               # [14 Nov 2011]
    #timefmt = ' [%d %b %Y %H:%M]'         # [14 Nov 2011 18.30]
    #timefmt = ' [%Y-%m-%d]'               # [2011-11-14]
    #timefmt = ' [%Y-%m-%d-%H-%M]'         # [2011-11-14-18-30]

    #
    # **** IMPORTANT ****
    #
    # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
    #
    # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
    #
    # I MEAN IT, YES I DO, ABSOLUTELY, AT YOUR OWN RISK. :)
    #
    # **** IMPORTANT ****
    #

    # Author of this recipe.
    __author__ = 'mattst'

    # Specify English as the language of the RSS feeds (ISO-639 code).
    language = 'en_GB'

    # Set tags.
    tags = 'news, sport, blog'

    # Set publisher and publication type.
    publisher = 'BBC'
    publication_type = 'newspaper'

    # Disable stylesheets from site.
    no_stylesheets = True

    # Specifies an override encoding for sites that have an incorrect charset
    # specified. Default of 'None' says to auto-detect. Some other BBC recipes
    # use 'utf8', which works fine (so use that if necessary) but auto-detecting
    # with None is working fine, so stick with that for robustness.
    encoding = None

    # Sets whether a feed has full articles embedded in it. The BBC feeds do not.
    use_embedded_content = False

    # Removes empty feeds - why keep them!?
    remove_empty_feeds = True

    # Create a custom title which fits nicely in the Kindle title list.
    # Requires "import time" above class declaration, and replacing
    # title with custom_title in conversion_options (right column only).
    # Example of string below: "BBC News - 14 Nov 2011"
    #
    # custom_title = "BBC News - " + time.strftime('%d %b %Y')

    '''
    # Conversion options for advanced users, but don't forget to comment out the
    # current conversion_options below. Avoid setting 'linearize_tables' as that
    # plays havoc with the 'old style' table based pages.
    #
    conversion_options = { 'title'       : title,
                           'comments'    : description,
                           'tags'        : tags,
                           'language'    : language,
                           'publisher'   : publisher,
                           'authors'     : publisher,
                           'smarten_punctuation' : True
                         }
    '''
    keep_only_tags = [
        dict(name='div', attrs={'class':['layout-block-a layout-block']})
        ,dict(attrs={'class':['story-body','storybody']})
    ]
    conversion_options = { 'smarten_punctuation' : True }

    remove_tags = [
        dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper',
            'story-feature wide ', 'story-feature narrow']}),
        dict(id=['hypertab', 'comment-form']),
    ]

    # Specify extra CSS - overrides ALL other CSS (IE. Added last).
    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
                 .introduction, .first { font-weight: bold; } \
                 .cross-head { font-weight: bold; font-size: 125%; } \
                 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
                 .cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \
                 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
                 .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
                 text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \
                 .story-date, .published { font-size: 80%; } \
                 table { width: 100%; } \
                 td img { display: block; margin: 5px auto; } \
                 ul { padding-top: 10px; } \
                 ol { padding-top: 10px; } \
                 li { padding-top: 5px; padding-bottom: 5px; } \
                 h1 { text-align: center; font-size: 175%; font-weight: bold; } \
                 h2 { text-align: center; font-size: 150%; font-weight: bold; } \
                 h3 { text-align: center; font-size: 125%; font-weight: bold; } \
                 h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }'

    remove_attributes = ['width','height']
    # Remove various tag attributes to improve the look of the ebook pages.
    remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
                          'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]

    feeds = [
        ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
        ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
        ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
        ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
        ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
        ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
        ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
        ('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'),
        ('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'),
        ('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'),
        ('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'),
        ('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'),
        ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
    ]

    # Remove the (admittedly rarely used) line breaks, "<br />", which sometimes
    # cause a section of the ebook to start in an unsightly fashion or, more
    # frequently, a "<br />" will muck up the formatting of a correspondent's byline.
    # "<br />" and "<br clear/>" are far more frequently used on the table formatted
    # style of pages, and really spoil the look of the ebook pages.
    preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
                          (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: '')]
    # Create regular expressions for tag keeping and removal to make the matches more
    # robust against minor changes and errors in the HTML, Eg. double spaces, leading
    # and trailing spaces, missing hyphens, and such like.
    # Python regular expression ('re' class) page: http://docs.python.org/library/re.html

    # ***************************************
    # Regular expressions for keep_only_tags:
    # ***************************************

    # The BBC News HTML pages use variants of 'storybody' to denote the section of a HTML
    # page which contains the main text of the article. Match storybody variants: 'storybody',
    # 'story-body', 'story body','storybody ', etc.
    storybody_reg_exp = '^.*story[_ -]*body.*$'

    # The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content' to hold the title
    # and published date. This is one level above the usual news pages which have the title
    # and date within 'story-body'. This is annoying since 'blq_content' must also be kept,
    # resulting in a lot of extra things to be removed by remove_tags.
    blq_content_reg_exp = '^.*blq[_ -]*content.*$'

    # The BBC has an alternative page design structure, which I suspect is an out-of-date
    # design but which is still used in some articles, Eg. 'Click' (technology), 'FastTrack'
    # (travel), and in some sport pages. These alternative pages are table based (which is
    # why I think they are an out-of-date design) and account for -I'm guesstimating- less
    # than 1% of all articles. They use a table class 'storycontent' to hold the article
    # and like blq_content (above) have required lots of extra removal by remove_tags.
    story_content_reg_exp = '^.*story[_ -]*content.*$'

    # Keep the sections of the HTML which match the list below. The HTML page created by
    # Calibre will fill <body> with those sections which are matched. Note that the
    # blq_content_reg_exp must be listed before storybody_reg_exp in keep_only_tags due to
    # it being the parent of storybody_reg_exp, that is to say the div class/id 'story-body'
    # will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at
    # all). If they are the other way around in keep_only_tags then blq_content_reg_exp
    # will end up being discarded.
    keep_only_tags = [ dict(name='table', attrs={'class':re.compile(story_content_reg_exp, re.IGNORECASE)}),
                       dict(name='div', attrs={'class':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
                       dict(name='div', attrs={'id':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
                       dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)}),
                       dict(name='div', attrs={'id':re.compile(storybody_reg_exp, re.IGNORECASE)}) ]

    # ************************************
    # Regular expressions for remove_tags:
    # ************************************

    # Regular expression to remove share-help and variant tags. The share-help class
    # is used by the site for a variety of 'sharing' type links, Eg. Facebook, delicious,
    # twitter, email. Removed to avoid page clutter.
    share_help_reg_exp = '^.*share[_ -]*help.*$'

    # Regular expression to remove embedded-hyper and variant tags. This class is used to
    # display links to other BBC News articles on the same/similar subject.
    embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$'

    # Regular expression to remove hypertabs and variant tags. This class is used to
    # display a tab bar at the top of an article which allows the user to switch to
    # an article (viewed on the same page) providing further info., 'in depth' analysis,
    # an editorial, a correspondent's blog entry, and such like. The ability to handle
    # a tab bar of this nature is currently beyond the scope of this recipe and
    # possibly of Calibre itself (not sure about that - TO DO - check!).
    hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$'

    # Regular expression to remove story-feature and variant tags. Eg. 'story-feature',
    # 'story-feature related narrow', 'story-feature wide', 'story-feature narrow'.
    # This class is used to add additional info. boxes, or small lists, outside of
    # the main story. TO DO: Work out a way to incorporate these neatly.
    story_feature_reg_exp = '^.*story[_ -]*feature.*$'

    # Regular expression to remove video and variant tags, Eg. 'videoInStoryB',
    # 'videoInStoryC'. This class is used to embed video.
    video_reg_exp = '^.*video.*$'

    # Regular expression to remove audio and variant tags, Eg. 'audioInStoryD'.
    # This class is used to embed audio.
    audio_reg_exp = '^.*audio.*$'

    # Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'.
    # This class is used to embed a photo slideshow. See also 'slideshow' below.
    picture_gallery_reg_exp = '^.*picture.*$'

    # Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'.
    # This class is used to embed a slideshow (not necessarily photo) but both
    # 'slideshow' and 'pictureGallery' are used for slideshows.
    slideshow_reg_exp = '^.*slide[_ -]*show.*$'

    # Regular expression to remove social-links and variant tags. This class is used to
    # display links to a BBC bloggers main page, used in various columnists' blogs
    # (Eg. Nick Robinson, Robert Peston).
    social_links_reg_exp = '^.*social[_ -]*links.*$'

    # Regular expression to remove quote and (multi) variant tags, Eg. 'quote',
    # 'endquote', 'quote-credit', 'quote-credit-title', etc. These are usually
    # removed by 'story-feature' removal (as they are usually within them), but
    # not always. The quotation removed is always (AFAICT) in the article text
    # as well but a 2nd copy is placed in a quote tag to draw attention to it.
    # The quote class tags may or may not appear in div's.
    quote_reg_exp = '^.*quote.*$'

    # Regular expression to remove hidden and variant tags, Eg. 'hidden'.
    # The purpose of these is unclear, they seem to be an internal link to a
    # section within the article, but the text of the link (Eg. 'Continue reading
    # the main story') never seems to be displayed anyway. Removed to avoid clutter.
    # The hidden class tags may or may not appear in div's.
    hidden_reg_exp = '^.*hidden.*$'

    # Regular expression to remove comment and variant tags, Eg. 'comment-introduction'.
    # Used on the site to display text about registered users entering comments.
    comment_reg_exp = '^.*comment.*$'

    # Regular expression to remove form and variant tags, Eg. 'comment-form'.
    # Used on the site to allow registered BBC users to fill in forms, typically
    # for entering comments about an article.
    form_reg_exp = '^.*form.*$'

    # Extra things to remove due to the addition of 'blq_content' in keep_only_tags.

    #<div class="story-actions"> Used on sports pages for 'email' and 'print'.
    story_actions_reg_exp = '^.*story[_ -]*actions.*$'

    #<div class="bookmark-list"> Used on sports pages instead of 'share-help' (for
    # social networking links).
    bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$'

    #<div id="secondary-content" class="content-group">
    # NOTE: Don't remove class="content-group" that is needed.
    # Used on sports pages to link to 'similar stories'.
    secondary_content_reg_exp = '^.*secondary[_ -]*content.*$'

    #<div id="featured-content" class="content-group">
    # NOTE: Don't remove class="content-group" that is needed.
    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
    featured_content_reg_exp = '^.*featured[_ -]*content.*$'

    #<div id="navigation">
    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
    # Used sometimes instead of "featured-content" above.
    navigation_reg_exp = '^.*navigation.*$'

    #<a class="skip" href="#blq-container-inner">Skip to top</a>
    # Used on sports pages to link to the top of the page.
    skip_reg_exp = '^.*skip.*$'

    # Extra things to remove due to the addition of 'storycontent' in keep_only_tags,
    # which are the alternative table design based pages. The purpose of some of these
    # is not entirely clear from the pages (which are a total mess!).

    # Remove mapping based tags, Eg. <map id="world_map">
    # The dynamic maps don't seem to work during ebook creation. TO DO: Investigate.
    map_reg_exp = '^.*map.*$'

    # Remove social bookmarking variation, called 'socialBookMarks'.
    social_bookmarks_reg_exp = '^.*social[_ -]*bookmarks.*$'

    # Remove page navigation tools, like 'search', 'email', 'print', called 'blq-mast'.
    blq_mast_reg_exp = '^.*blq[_ -]*mast.*$'

    # Remove 'sharesb', I think this is a generic 'sharing' class. It seems to appear
    # alongside 'socialBookMarks' whenever that appears. I am removing it as well
    # under the assumption that it can appear alone as well.
    sharesb_reg_exp = '^.*sharesb.*$'

    # Remove class 'o'. The worst named user created css class of all time. The creator
    # should immediately be fired. I've seen it used to hold nothing at all but with
    # 20 or so empty lines in it. Also to hold a single link to another article.
    # Whatever it was designed to do it is not wanted by this recipe. Exact match only.
    o_reg_exp = '^o$'

    # Remove 'promotopbg' and 'promobottombg', link lists. Have decided to
    # use two reg expressions to make removing this (and variants) robust.
    promo_top_reg_exp = '^.*promotopbg.*$'
    promo_bottom_reg_exp = '^.*promobottombg.*$'

    # Remove 'nlp', provides heading for link lists. Requires an exact match due to
    # risk of matching those letters in something needed, unless I see a variation
    # of 'nlp' used at a later date.
    nlp_reg_exp = '^nlp$'

    # Remove 'mva', provides embedded floating content of various types. Variant 'mvb'
    # has also now been seen. Requires an exact match of 'mva' or 'mvb' due to risk of
    # matching those letters in something needed.
    mva_or_mvb_reg_exp = '^mv[ab]$'

    # Remove 'mvtb', seems to be page navigation tools, like 'blq-mast'.
    mvtb_reg_exp = '^mvtb$'

    # Remove 'blq-toplink', class to provide a link to the top of the page.
    blq_toplink_reg_exp = '^.*blq[_ -]*top[_ -]*link.*$'

    # Remove 'products and services' links, Eg. desktop tools, alerts, and so on.
    # Eg. Class="servicev4 ukfs_services" - what a mess of a name. Have decided to
    # use two reg expressions to make removing this (and variants) robust.
    prods_services_01_reg_exp = '^.*servicev4.*$'
    prods_services_02_reg_exp = '^.*ukfs[_ -]*services.*$'

    # Remove -what I think is- some kind of navigation tools helper class, though I am
    # not sure, it's called: 'blq-rst blq-new-nav'. What I do know is it pops up
    # frequently and it is not wanted. Have decided to use two reg expressions to make
    # removing this (and variants) robust.
    blq_misc_01_reg_exp = '^.*blq[_ -]*rst.*$'
    blq_misc_02_reg_exp = '^.*blq[_ -]*new[_ -]*nav.*$'

    # Remove 'puffbox' - this may only appear inside 'storyextra', so it may not
    # need removing - I have no clue what it does other than it contains links.
    # Whatever it is - it is not part of the article and is not wanted.
    puffbox_reg_exp = '^.*puffbox.*$'

    # Remove 'sibtbg' and 'sibtbgf' - some kind of table formatting classes.
    sibtbg_reg_exp = '^.*sibtbg.*$'

    # Remove 'storyextra' - links to relevant articles and external sites.
    storyextra_reg_exp = '^.*story[_ -]*extra.*$'
    remove_tags = [ dict(name='div', attrs={'class':re.compile(story_feature_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(share_help_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(embedded_hyper_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(hypertabs_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(video_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(audio_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(picture_gallery_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(slideshow_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(story_actions_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(bookmark_list_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(secondary_content_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(featured_content_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(navigation_reg_exp, re.IGNORECASE)}),
                    dict(name='form', attrs={'id':re.compile(form_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(social_links_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(skip_reg_exp, re.IGNORECASE)}),
                    dict(name='map', attrs={'id':re.compile(map_reg_exp, re.IGNORECASE)}),
                    dict(name='map', attrs={'name':re.compile(map_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(social_bookmarks_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(blq_mast_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(sharesb_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(o_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(promo_top_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(promo_bottom_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(nlp_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(mva_or_mvb_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(mvtb_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(blq_toplink_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(prods_services_01_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(prods_services_02_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(blq_misc_01_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(blq_misc_02_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(puffbox_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(sibtbg_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(storyextra_reg_exp, re.IGNORECASE)})
                  ]
    # Uses url to create and return the 'printer friendly' version of the url.
    # In other words the 'print this page' address of the page.
    #
    # There are 3 types of urls used in the BBC site's rss feeds. There is just
    # 1 type for the standard news while there are 2 used for sports feed urls.
    # Note: Sports urls are linked from regular news feeds (Eg. 'News Home') when
    # there is a major story of interest to 'everyone'. So even if no BBC sports
    # feeds are added to 'feeds' the logic of this method is still needed to avoid
    # blank / missing / empty articles which have an index title and then no body.
    def print_version(self, url):

        # Handle sports page urls type 01:
        if (url.find("go/rss/-/sport1/") != -1):
            temp_url = url.replace("go/rss/-/", "")

        # Handle sports page urls type 02:
        elif (url.find("go/rss/int/news/-/sport1/") != -1):
            temp_url = url.replace("go/rss/int/news/-/", "")

        # Handle regular news page urls:
        else:
            temp_url = url.replace("go/rss/int/news/-/", "")

        # Always add "?print=true" to the end of the url.
        print_url = temp_url + "?print=true"

        return print_url
# Remove articles in feeds based on a string in the article title or url.
#
# Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"
# thread, in post with title: "Remove articles from feed", see url:
# http://www.mobileread.com/forums/showpost.php?p=1165462&postcount=6
# Many thanks and all credit to Starson17.
#
# Starson17's code has obviously been altered to suite my requirements.
def parse_feeds(self):
# Call parent's method.
feeds = BasicNewsRecipe.parse_feeds(self)
# Loop through all feeds.
for feed in feeds:
# Loop through all articles in feed.
for article in feed.articles[:]:
# Match key words and remove article if there's a match.
# Most BBC rss feed video only 'articles' use upper case 'VIDEO'
# as a title prefix. Just match upper case 'VIDEO', so that
# articles like 'Video game banned' won't be matched and removed.
if 'VIDEO' in article.title:
feed.articles.remove(article)
# Most BBC rss feed audio only 'articles' use upper case 'AUDIO'
# as a title prefix. Just match upper case 'AUDIO', so that
# articles like 'Hi-Def audio...' won't be matched and removed.
elif 'AUDIO' in article.title:
feed.articles.remove(article)
# Most BBC rss feed photo slideshow 'articles' use 'In Pictures',
# 'In pictures', and 'in pictures', somewhere in their title.
# Match any case of that phrase.
elif 'IN PICTURES' in article.title.upper():
feed.articles.remove(article)
# As above, but user contributed pictures. Match any case.
elif 'YOUR PICTURES' in article.title.upper():
feed.articles.remove(article)
# 'Sportsday Live' are articles which contain a constantly and
# dynamically updated 'running commentary' during a live sporting
# event. Match any case.
elif 'SPORTSDAY LIVE' in article.title.upper():
feed.articles.remove(article)
# Sometimes 'Sportsday Live' (above) becomes 'Live - Sport Name'.
# These are being matched below using 'Live - ' because removing all
# articles with 'live' in their titles would remove some articles
# that are in fact not live sports pages. Match any case.
elif 'LIVE - ' in article.title.upper():
feed.articles.remove(article)
# 'Quiz of the week' is a Flash player weekly news quiz. Match only
# the 'Quiz of the' part in anticipation of monthly and yearly
# variants. Match any case.
elif 'QUIZ OF THE' in article.title.upper():
feed.articles.remove(article)
# Remove articles with 'scorecards' in the url. These are BBC sports
# pages which just display a cricket scorecard. The pages have a mass
# of table and css entries to display the scorecards nicely. Probably
# could make them work with this recipe, but might take a whole day
# of work to sort out all the css - basically a formatting nightmare.
elif 'scorecards' in article.url:
feed.articles.remove(article)
return feeds
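# The elif chain above could equally be driven by a small keyword table.
# A minimal, behaviour-equivalent sketch (assuming the same title
# conventions; 'VIDEO'/'AUDIO' must stay case-sensitive so that titles
# like 'Video game banned' survive):
#
#     cased = ('VIDEO', 'AUDIO')
#     uncased = ('IN PICTURES', 'YOUR PICTURES', 'SPORTSDAY LIVE',
#                'LIVE - ', 'QUIZ OF THE')
#     for article in feed.articles[:]:
#         if (any(k in article.title for k in cased)
#                 or any(k in article.title.upper() for k in uncased)
#                 or 'scorecards' in article.url):
#             feed.articles.remove(article)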
# End of class and file.


@@ -4,16 +4,17 @@ class Benchmark_pl(BasicNewsRecipe):
title = u'Benchmark.pl'
__author__ = 'fenuks'
description = u'benchmark.pl - IT site'
masthead_url = 'http://www.benchmark.pl/i/logo-footer.png'
cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif'
category = 'IT'
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets=True
preprocess_regexps = [(re.compile(ur'\bWięcej o .*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')]
keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})]
remove_tags_after=dict(name='div', attrs={'class':'body'})
remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']})]
remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
INDEX= 'http://www.benchmark.pl'
feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
(u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')]


@@ -0,0 +1,16 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1327747616(BasicNewsRecipe):
title = u'Beppe Grillo'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Beppe Grillo', u'http://feeds.feedburner.com/beppegrillo/atom')]
description = 'Blog of the famous comedian and politician Beppe Grillo - v1.00 (28 January 2012)'
__author__ = 'faber1971'
language = 'it'


@@ -1,61 +1,44 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
'''Calibre recipe to convert the RSS feeds of the Berliner Zeitung to an ebook.'''
class SportsIllustratedRecipe(BasicNewsRecipe) :
__author__ = 'ape'
__copyright__ = 'ape'
__author__ = 'a.peter'
__copyright__ = 'a.peter'
__license__ = 'GPL v3'
language = 'de'
description = 'Berliner Zeitung'
version = 2
description = 'Berliner Zeitung RSS'
version = 4
title = u'Berliner Zeitung'
timefmt = ' [%d.%m.%Y]'
#oldest_article = 7.0
no_stylesheets = True
remove_javascript = True
use_embedded_content = False
publication_type = 'newspaper'
keep_only_tags = [dict(name='div', attrs={'class':'teaser t_split t_artikel'})]
remove_tags_before = dict(name='div', attrs={'class':'newstype'})
remove_tags_after = [dict(id='article_text')]
INDEX = 'http://www.berlinonline.de/berliner-zeitung/'
def parse_index(self):
base = 'http://www.berlinonline.de'
answer = []
articles = {}
more = 1
soup = self.index_to_soup(self.INDEX)
# Get list of links to ressorts from index page
ressort_list = soup.findAll('ul', attrs={'class': re.compile('ressortlist')})
for ressort in ressort_list[0].findAll('a'):
feed_title = ressort.string
print 'Analyzing', feed_title
if not articles.has_key(feed_title):
articles[feed_title] = []
answer.append(feed_title)
# Load ressort page.
feed = self.index_to_soup('http://www.berlinonline.de' + ressort['href'])
# find mainbar div which contains the list of all articles
for article_container in feed.findAll('div', attrs={'class': re.compile('mainbar')}):
# iterate over all articles
for article_teaser in article_container.findAll('div', attrs={'class': re.compile('teaser')}):
# extract title of article
if article_teaser.h3 != None:
article = {'title' : article_teaser.h3.a.string, 'date' : u'', 'url' : base + article_teaser.h3.a['href'], 'description' : u''}
articles[feed_title].append(article)
else:
# Skip teasers for missing photos
if article_teaser.div.p.contents[0].find('Foto:') > -1:
continue
article = {'title': 'Weitere Artikel ' + str(more), 'date': u'', 'url': base + article_teaser.div.p.a['href'], 'description': u''}
articles[feed_title].append(article)
more += 1
answer = [[key, articles[key]] for key in answer if articles.has_key(key)]
return answer
feeds = [(u'Startseite', u'http://www.berliner-zeitung.de/home/10808950,10808950,view,asFeed.xml'),
(u'Politik', u'http://www.berliner-zeitung.de/home/10808018,10808018,view,asFeed.xml'),
(u'Wirtschaft', u'http://www.berliner-zeitung.de/home/10808230,10808230,view,asFeed.xml'),
(u'Berlin', u'http://www.berliner-zeitung.de/home/10809148,10809148,view,asFeed.xml'),
(u'Brandenburg', u'http://www.berliner-zeitung.de/home/10809312,10809312,view,asFeed.xml'),
(u'Wissenschaft', u'http://www.berliner-zeitung.de/home/10808894,10808894,view,asFeed.xml'),
(u'Digital', u'http://www.berliner-zeitung.de/home/10808718,10808718,view,asFeed.xml'),
(u'Kultur', u'http://www.berliner-zeitung.de/home/10809150,10809150,view,asFeed.xml'),
(u'Panorama', u'http://www.berliner-zeitung.de/home/10808334,10808334,view,asFeed.xml'),
(u'Sport', u'http://www.berliner-zeitung.de/home/10808794,10808794,view,asFeed.xml'),
(u'Hertha', u'http://www.berliner-zeitung.de/home/10808800,10808800,view,asFeed.xml'),
(u'Union', u'http://www.berliner-zeitung.de/home/10808802,10808802,view,asFeed.xml'),
(u'Verkehr', u'http://www.berliner-zeitung.de/home/10809298,10809298,view,asFeed.xml'),
(u'Polizei', u'http://www.berliner-zeitung.de/home/10809296,10809296,view,asFeed.xml'),
(u'Meinung', u'http://www.berliner-zeitung.de/home/10808020,10808020,view,asFeed.xml')]
def get_masthead_url(self):
return 'http://www.berlinonline.de/.img/berliner-zeitung/blz_logo.gif'
return 'http://www.berliner-zeitung.de/image/view/10810244,7040611,data,logo.png'
def print_version(self, url):
return url.replace('.html', ',view,printVersion.html')


@@ -1,4 +1,3 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
@@ -18,11 +17,17 @@ class Berlingske_dk(BasicNewsRecipe):
no_stylesheets = True
remove_empty_feeds = True
use_embedded_content = False
remove_javascript = True
publication_type = 'newspaper'
encoding = 'utf8'
language = 'da'
masthead_url = 'http://www.berlingske.dk/sites/all/themes/bm/img/layout/masthead_bg.gif'
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } h1,.manchet,.byline{font-family: Cambria,Georgia,Times,"Times New Roman",serif } '
auto_cleanup = True
extra_css = '''
.manchet {color:#888888;}
.dateline {font-size: x-small; color:#444444;}
.manchet,.dateline { font-family: Cambria,Georgia,Times,"Times New Roman",serif }
.body {font-family: Arial,Helvetica,sans-serif }
'''
conversion_options = {
'comment' : description
@@ -32,18 +37,14 @@ class Berlingske_dk(BasicNewsRecipe):
}
feeds = [
(u'Breaking news' , u'http://www.berlingske.dk/breaking/rss' )
,(u'Seneste nyt' , u'http://www.berlingske.dk/seneste/rss' )
,(u'Topnyheder' , u'http://www.berlingske.dk/top/rss' )
,(u'Danmark' , u'http://www.berlingske.dk/danmark/seneste/rss' )
,(u'Verden' , u'http://www.berlingske.dk/verden/seneste/rss' )
,(u'Klima' , u'http://www.berlingske.dk/klima/seneste/rss' )
,(u'Debat' , u'http://www.berlingske.dk/debat/seneste/rss' )
,(u'Koebenhavn' , u'http://www.berlingske.dk/koebenhavn/seneste/rss')
,(u'Politik' , u'http://www.berlingske.dk/politik/seneste/rss' )
,(u'Kultur' , u'http://www.berlingske.dk/kultur/seneste/rss' )
(u'Breaking news' , u'http://www.b.dk/breaking/rss' )
,(u'Seneste nyt' , u'http://www.b.dk/seneste/rss' )
,(u'Topnyheder' , u'http://www.b.dk/top/rss' )
,(u'Danmark' , u'http://www.b.dk/danmark/seneste/rss' )
,(u'Verden' , u'http://www.b.dk/verden/seneste/rss' )
,(u'Klima' , u'http://www.b.dk/klima/seneste/rss' )
,(u'Debat' , u'http://www.b.dk/debat/seneste/rss' )
,(u'Koebenhavn' , u'http://www.b.dk/koebenhavn/seneste/rss')
,(u'Politik' , u'http://www.b.dk/politik/seneste/rss' )
,(u'Kultur' , u'http://www.b.dk/kultur/seneste/rss' )
]
keep_only_tags = [dict(attrs={'class':['first','pt-article']})]
remove_tags = [dict(name=['object','link','base','iframe','embed'])]

recipes/biamag.recipe Normal file

@@ -0,0 +1,38 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
bianet.com.tr
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Radikal_tr(BasicNewsRecipe):
title = 'BiaMag'
__author__ = 'Osman Kaysan'
description = 'Independent News from Turkey'
publisher = 'BiaMag'
category = 'news, politics, Turkey'
oldest_article = 15
max_articles_per_feed = 120
masthead_url = 'http://bianet.org/images/biamag_logo.gif'
language = 'tr'
no_stylesheets = True
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
,'remove_paragraph_spacing': True,
}
remove_tags_before = dict(name='div', attrs={'class':'manset'})
remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
remove_tags_after = dict(name='div', attrs={'id':'habermenu'})
feeds = [(u'BiaMag', u'http://www.bianet.org/biamag.rss')]
def preprocess_html(self, soup):
return self.adeify_images(soup)

recipes/biamag_en.recipe Normal file

@@ -0,0 +1,38 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
bianet.com.tr
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Radikal_tr(BasicNewsRecipe):
title = 'Bianet-English'
__author__ = 'Osman Kaysan'
description = 'Independent News Network from Turkey(English)'
publisher = 'Bianet'
category = 'news, politics, Turkey'
oldest_article = 7
max_articles_per_feed = 150
masthead_url = 'http://bianet.org/images/english_logo.gif'
language = 'en_TR'
no_stylesheets = True
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
,'remove_paragraph_spacing': True,
}
remove_tags_before = dict(name='div', attrs={'class':'manset'})
remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
remove_tags_after = dict(name='div', attrs={'id':'habermenu'})
feeds = [(u'Bianet-English', u'http://www.bianet.org/english.rss')]
def preprocess_html(self, soup):
return self.adeify_images(soup)

recipes/bianet.recipe Normal file

@@ -0,0 +1,38 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
bianet.com.tr
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Radikal_tr(BasicNewsRecipe):
title = 'Bianet'
__author__ = 'Osman Kaysan'
description = 'Independent News from Turkey'
publisher = 'Bianet'
category = 'news, politics, Turkey'
oldest_article = 7
max_articles_per_feed = 120
masthead_url = 'http://bianet.org/images/bianet_logo.gif'
language = 'tr'
no_stylesheets = True
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
,'remove_paragraph_spacing': True,
}
remove_tags_before = dict(name='div', attrs={'class':'manset'})
remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
remove_tags_after = dict(name='div', attrs={'id':'habermenu'})
feeds = [(u'Bianet', u'http://bianet.org/bianet.rss')]
def preprocess_html(self, soup):
return self.adeify_images(soup)

recipes/biolog_pl.recipe Normal file

@@ -0,0 +1,20 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
class Biolog_pl(BasicNewsRecipe):
title = u'Biolog.pl'
oldest_article = 7
max_articles_per_feed = 100
remove_empty_feeds=True
__author__ = 'fenuks'
description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.'
category = 'biology'
language = 'pl'
masthead_url= 'http://www.biolog.pl/naukowy,portal,biolog.png'
cover_url='http://www.biolog.pl/naukowy,portal,biolog.png'
no_stylesheets = True
#keeps_only_tags=[dict(id='main')]
remove_tags_before=dict(id='main')
remove_tags_after=dict(name='a', attrs={'name':'komentarze'})
remove_tags=[dict(name='img', attrs={'alt':'Komentarze'}), dict(name='span', attrs={'class':'menu_odsylacze'})]
feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')]


@@ -0,0 +1,50 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
class Birgun (BasicNewsRecipe):
title = u'Birgün Gazetesi'
__author__ = u'Osman Kaysan'
oldest_article = 7
max_articles_per_feed =150
use_embedded_content = False
description = 'Birgun gazetesi haberleri, kose yazarlari'
publisher = 'Birgün'
category = 'news,haberler,turkce,gazete,birgun'
language = 'tr'
no_stylesheets = True
publication_type = 'newspaper'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
,'remove_paragraph_spacing': True,
}
cover_img_url = 'http://www.birgun.net/i/birgun.png'
masthead_url = 'http://www.birgun.net/i/birgun.png'
remove_attributes = ['width','height']
remove_tags_before = dict(name='h2', attrs={'class':'storyHeadline'})
#remove_tags_after = dict(name='div', attrs={'class':'toollinks'})
remove_tags_after = dict(name='tr', attrs={'valign':'top'})
remove_tags = [ dict(name='div', attrs={'id':'byLine'}), dict(name='div', attrs={'class':'toollinks'})
, dict(name='div', attrs={'class':'main-lead'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})
, dict(name='a', attrs={'class':'addthis_button'})]
remove_empty_feeds= True
feeds = [
( u'Güncel', u'http://www.birgun.net/actuels.xml')
,( u'Köşe Yazarları', u'http://www.birgun.net/writer.xml')
,( u'Politika', u'http://www.birgun.net/politics.xml')
,( u'Ekonomi', u'http://www.birgun.net/economic.xml')
,( u'Çalışma Yaşamı', u'http://www.birgun.net/workers.xml')
,( u'Dünya', u'http://www.birgun.net/worlds.xml')
,( u'Yaşam', u'http://www.birgun.net/lifes.xml')
]


@@ -0,0 +1,44 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Birmingham post'
description = 'News for Birmingham UK'
timefmt = ''
__author__ = 'Dave Asbury'
cover_url = 'http://1.bp.blogspot.com/_GwWyq5eGw9M/S9BHPHxW55I/AAAAAAAAB6Q/iGCWl0egGzg/s320/Birmingham+post+Lite+front.JPG'
oldest_article = 1
max_articles_per_feed = 20
remove_empty_feeds = True
remove_javascript = True
auto_cleanup = True
language = 'en_GB'
masthead_url = 'http://www.pressgazette.co.uk/Pictures/web/t/c/g/birmingham_post.jpg'
keep_only_tags = [
#dict(name='h1',attrs={'id' : 'article-headline'}),
#dict(attrs={'class':['article-meta-author','article-meta-date','article main','art-o art-align-center otm-1 ']}),
#dict(name='p')
#dict(attrs={'id' : 'three-col'})
]
remove_tags = [
# dict(name='div',attrs={'class' : 'span-33 last header-links'})
]
feeds = [
#(u'News',u'http://www.birminghampost.net/news/rss.xml'),
(u'Local News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'),
(u'UK News', u'http://www.birminghampost.net/news/uk-news/rss.xml'),
(u'Sports',u'http://www.birminghampost.net/midlands-birmingham-sport/rss.xml'),
(u'Bloggs & Comments',u'http://www.birminghampost.net/comment/rss.xml')
]
extra_css = '''
body {font: medium sans-serif;}
h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
span{ font-size:9.5px; font-weight:bold;font-style:italic}
p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''


@@ -1,6 +1,6 @@
__license__ = 'GPL v3'
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
blic.rs
'''
@@ -73,7 +73,10 @@ class Blic(BasicNewsRecipe):
def print_version(self, url):
return url + '/print'
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup
def get_cover_url(self):
soup = self.index_to_soup('http://www.blic.rs/')
alink = soup.find('a', attrs={'id':'blic_naslovna_print'})
if alink:
return 'http://www.blic.rs' + alink['href']
return None

recipes/blues.recipe Normal file

@@ -0,0 +1,26 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Oskar Kunicki <rakso at interia.pl>'
'''
Changelog:
2011-11-27
News from BluesRSS.info
'''
from calibre.web.feeds.news import BasicNewsRecipe
class BluesRSS(BasicNewsRecipe):
title = 'Blues News'
__author__ = 'Oskar Kunicki'
description ='Blues news from around the world'
publisher = 'BluesRSS.info'
category = 'news, blues, USA,UK'
oldest_article = 5
max_articles_per_feed = 100
language = 'en'
cover_url = 'http://bluesrss.info/cover.jpg'
masthead_url = 'http://bluesrss.info/cover.jpg'
no_stylesheets = True
remove_tags = [dict(name='div', attrs={'class':'wp-pagenavi'})]
feeds = [(u'News', u'http://bluesrss.info/feed/')]


@@ -1,95 +0,0 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
borba.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Borba(BasicNewsRecipe):
title = 'Borba Online'
__author__ = 'Darko Miletic'
description = 'Dnevne novine Borba Online'
publisher = 'IP Novine Borba'
category = 'news, politics, Serbia'
language = 'sr'
lang = _('sr-Latn-RS')
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
cover_url = 'http://www.borba.rs/images/stories/novine/naslovna_v.jpg'
INDEX = u'http://www.borba.rs/'
extra_css = ' @font-face {font-family: "serif1"; src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .contentheading{font-size: x-large; font-weight: bold} .createdate{font-size: small; font-weight: bold} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'class':'main'})]
remove_tags_after = dict(name='div',attrs={'id':'written_comments_title'})
remove_tags = [
dict(name=['object','link','iframe','base','img'])
,dict(name='div',attrs={'id':'written_comments_title'})
]
feeds = [
(u'Najnovije vesti', u'http://www.borba.rs/content/blogsection/28/105/')
,(u'Prvi plan' , u'http://www.borba.rs/content/blogsection/4/92/' )
,(u'Dogadjaji' , u'http://www.borba.rs/content/blogsection/21/83/' )
,(u'Ekonomija' , u'http://www.borba.rs/content/blogsection/5/35/' )
,(u'Komentari' , u'http://www.borba.rs/content/blogsection/23/94/' )
,(u'Svet' , u'http://www.borba.rs/content/blogsection/7/36/' )
,(u'Sport' , u'http://www.borba.rs/content/blogsection/6/37/' )
,(u'Fama' , u'http://www.borba.rs/content/blogsection/25/89/' )
,(u'B2 Dodatak' , u'http://www.borba.rs/content/blogsection/30/116/')
]
def preprocess_html(self, soup):
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
return soup
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('a', attrs={'class':'contentpagetitle'}):
url = item['href']
title = self.tag_to_string(item)
articles.append({
'title' :title
,'date' :''
,'url' :url
,'description':''
})
totalfeeds.append((feedtitle, articles))
return totalfeeds


@@ -110,8 +110,10 @@ class BrandEins(BasicNewsRecipe):
selected_issue = issue_map[selected_issue_key]
url = selected_issue.get('href', False)
# Get the title for the magazine - build it out of the title of the cover - take the issue and year;
self.title = "brand eins " + selected_issue_key[4:] + "/" + selected_issue_key[0:4]
# self.title = "brand eins " + selected_issue_key[4:] + "/" + selected_issue_key[0:4]
# Get the alternative title for the magazine - build it out of the title of the cover - without the issue and year;
url = 'http://brandeins.de/'+url
self.timefmt = ' ' + selected_issue_key[4:] + '/' + selected_issue_key[:4]
# url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
titles_and_articles = self.brand_eins_parse_issue(url)
@@ -163,4 +165,3 @@ class BrandEins(BasicNewsRecipe):
current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
titles_and_articles.append([chapter_title, current_articles])
return titles_and_articles


@@ -10,49 +10,39 @@ http://www.buffalonews.com/RSS/
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1298680852(BasicNewsRecipe):
class BuffaloNews(BasicNewsRecipe):
title = u'Buffalo News'
oldest_article = 2
language = 'en'
__author__ = 'ChappyOnIce'
__author__ = 'ChappyOnIce, Krittika Goyal'
max_articles_per_feed = 20
encoding = 'utf-8'
masthead_url = 'http://www.buffalonews.com/buffalonews/skins/buffalonews/images/masthead/the_buffalo_news_logo.png'
remove_javascript = True
extra_css = 'body {text-align: justify;}\n \
p {text-indent: 20px;}'
auto_cleanup = True
remove_empty_feeds = True
keep_only_tags = [
dict(name='div', attrs={'class':['main-content-left']})
]
remove_tags = [
dict(name='div', attrs={'id':['commentCount']}),
dict(name='div', attrs={'class':['story-list-links']})
]
remove_tags_after = dict(name='div', attrs={'class':['body storyContent']})
feeds = [(u'City of Buffalo', u'http://www.buffalonews.com/city/communities/buffalo/?widget=rssfeed&view=feed&contentId=77944'),
(u'Southern Erie County', u'http://www.buffalonews.com/city/communities/southern-erie/?widget=rssfeed&view=feed&contentId=77944'),
(u'Eastern Erie County', u'http://www.buffalonews.com/city/communities/eastern-erie/?widget=rssfeed&view=feed&contentId=77944'),
(u'Southern Tier', u'http://www.buffalonews.com/city/communities/southern-tier/?widget=rssfeed&view=feed&contentId=77944'),
(u'Niagara County', u'http://www.buffalonews.com/city/communities/niagara-county/?widget=rssfeed&view=feed&contentId=77944'),
(u'Business', u'http://www.buffalonews.com/business/?widget=rssfeed&view=feed&contentId=77944'),
(u'MoneySmart', u'http://www.buffalonews.com/business/moneysmart/?widget=rssfeed&view=feed&contentId=77944'),
(u'Bills & NFL', u'http://www.buffalonews.com/sports/bills-nfl/?widget=rssfeed&view=feed&contentId=77944'),
(u'Sabres & NHL', u'http://www.buffalonews.com/sports/sabres-nhl/?widget=rssfeed&view=feed&contentId=77944'),
(u'Bob DiCesare', u'http://www.buffalonews.com/sports/columns/bob-dicesare/?widget=rssfeed&view=feed&contentId=77944'),
(u'Bucky Gleason', u'http://www.buffalonews.com/sports/columns/bucky-gleason/?widget=rssfeed&view=feed&contentId=77944'),
(u'Mark Gaughan', u'http://www.buffalonews.com/sports/bills-nfl/inside-the-nfl/?widget=rssfeed&view=feed&contentId=77944'),
(u'Mike Harrington', u'http://www.buffalonews.com/sports/columns/mike-harrington/?widget=rssfeed&view=feed&contentId=77944'),
(u'Jerry Sullivan', u'http://www.buffalonews.com/sports/columns/jerry-sullivan/?widget=rssfeed&view=feed&contentId=77944'),
(u'Other Sports Columns', u'http://www.buffalonews.com/sports/columns/other-sports-columns/?widget=rssfeed&view=feed&contentId=77944'),
(u'Life', u'http://www.buffalonews.com/life/?widget=rssfeed&view=feed&contentId=77944'),
(u'Bruce Andriatch', u'http://www.buffalonews.com/city/columns/bruce-andriatch/?widget=rssfeed&view=feed&contentId=77944'),
(u'Donn Esmonde', u'http://www.buffalonews.com/city/columns/donn-esmonde/?widget=rssfeed&view=feed&contentId=77944'),
(u'Rod Watson', u'http://www.buffalonews.com/city/columns/rod-watson/?widget=rssfeed&view=feed&contentId=77944'),
(u'Entertainment', u'http://www.buffalonews.com/entertainment/?widget=rssfeed&view=feed&contentId=77944'),
(u'Off Main Street', u'http://www.buffalonews.com/city/columns/off-main-street/?widget=rssfeed&view=feed&contentId=77944'),
(u'Editorials', u'http://www.buffalonews.com/editorial-page/buffalo-news-editorials/?widget=rssfeed&view=feed&contentId=77944')
feeds = [
(u'City of Buffalo', u'http://www.buffalonews.com/city/communities/buffalo/?widget=rssfeed&view=feed&contentId=77944'),
(u'Southern Erie County', u'http://www.buffalonews.com/city/communities/southern-erie/?widget=rssfeed&view=feed&contentId=77944'),
(u'Eastern Erie County', u'http://www.buffalonews.com/city/communities/eastern-erie/?widget=rssfeed&view=feed&contentId=77944'),
(u'Southern Tier', u'http://www.buffalonews.com/city/communities/southern-tier/?widget=rssfeed&view=feed&contentId=77944'),
(u'Niagara County', u'http://www.buffalonews.com/city/communities/niagara-county/?widget=rssfeed&view=feed&contentId=77944'),
(u'Business', u'http://www.buffalonews.com/business/?widget=rssfeed&view=feed&contentId=77944'),
(u'MoneySmart', u'http://www.buffalonews.com/business/moneysmart/?widget=rssfeed&view=feed&contentId=77944'),
(u'Bills & NFL', u'http://www.buffalonews.com/sports/bills-nfl/?widget=rssfeed&view=feed&contentId=77944'),
(u'Sabres & NHL', u'http://www.buffalonews.com/sports/sabres-nhl/?widget=rssfeed&view=feed&contentId=77944'),
(u'Bob DiCesare', u'http://www.buffalonews.com/sports/columns/bob-dicesare/?widget=rssfeed&view=feed&contentId=77944'),
(u'Bucky Gleason', u'http://www.buffalonews.com/sports/columns/bucky-gleason/?widget=rssfeed&view=feed&contentId=77944'),
(u'Mark Gaughan', u'http://www.buffalonews.com/sports/bills-nfl/inside-the-nfl/?widget=rssfeed&view=feed&contentId=77944'),
(u'Mike Harrington', u'http://www.buffalonews.com/sports/columns/mike-harrington/?widget=rssfeed&view=feed&contentId=77944'),
(u'Jerry Sullivan', u'http://www.buffalonews.com/sports/columns/jerry-sullivan/?widget=rssfeed&view=feed&contentId=77944'),
(u'Other Sports Columns', u'http://www.buffalonews.com/sports/columns/other-sports-columns/?widget=rssfeed&view=feed&contentId=77944'),
(u'Life', u'http://www.buffalonews.com/life/?widget=rssfeed&view=feed&contentId=77944'),
(u'Bruce Andriatch', u'http://www.buffalonews.com/city/columns/bruce-andriatch/?widget=rssfeed&view=feed&contentId=77944'),
(u'Donn Esmonde', u'http://www.buffalonews.com/city/columns/donn-esmonde/?widget=rssfeed&view=feed&contentId=77944'),
(u'Rod Watson', u'http://www.buffalonews.com/city/columns/rod-watson/?widget=rssfeed&view=feed&contentId=77944'),
(u'Entertainment', u'http://www.buffalonews.com/entertainment/?widget=rssfeed&view=feed&contentId=77944'),
(u'Off Main Street', u'http://www.buffalonews.com/city/columns/off-main-street/?widget=rssfeed&view=feed&contentId=77944'),
(u'Editorials', u'http://www.buffalonews.com/editorial-page/buffalo-news-editorials/?widget=rssfeed&view=feed&contentId=77944')
]


@@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@@ -6,45 +7,76 @@ __license__ = 'GPL v3'
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Calgary Herald
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
title = u'Calgary Herald'
url_prefix = 'http://www.calgaryherald.com'
description = u'News from Calgary, AB'
fp_tag = 'CAN_CH'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
__author__ = 'Nick Redding'
encoding = 'latin1'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
@@ -64,14 +96,80 @@ class CanWestPaper(BasicNewsRecipe):
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes -- they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
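# Illustrative example (hypothetical markup): '<div id="">...</div>'
# becomes '<div>...</div>'; divs with a non-empty id keep theirs.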
def get_cover_url(self):
from datetime import timedelta, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover
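# Illustration (hypothetical tag/date): with fp_tag 'CAN_CH' on the 5th of
# the month the first url tried is
# http://webmedia.newseum.org/newseum-multimedia/dfp/jpg5/lg/CAN_CH.jpg
# and on failure the loop walks back one day at a time, up to 6 days.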
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","‘",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","’",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","–",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","—",fixed)
fixed = re.sub("&#x2019;","’",fixed)
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&","&#38;", massaged)
return self.fixChars(massaged)
else:
return description
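# Illustration (hypothetical description): 'War & Peace' comes out as
# 'War &#38; Peace', which the Kindle NCX renderer accepts.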
def populate_article_metadata(self, article, soup, first):
if first:
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
def preprocess_html(self, soup):
return self.strip_anchors(soup)
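# Illustration (hypothetical markup): '<a href="/x">read more</a>' is
# replaced by its rendered contents, the bare text 'read more'; anchors
# wrapping images are left intact so pictures survive.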
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
@@ -98,9 +196,7 @@ class CanWestPaper(BasicNewsRecipe):
atag = h1tag.find('a',href=True)
if not atag:
continue
url = atag['href']
if not url.startswith('http:'):
url = self.url_prefix+'/news/todays-paper/'+atag['href']
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)


@@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe
class CalibreBlog(BasicNewsRecipe):
title = u'Calibre Blog'
language = 'en'
__author__ = 'Krittika Goyal'
oldest_article = 1000 #days
max_articles_per_feed = 5
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
feeds = [
('Article',
'http://blog.calibre-ebook.com/feeds/posts/default'),
]

recipes/capital_gr.recipe Normal file

@@ -0,0 +1,35 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
class Capital(BasicNewsRecipe):
title = 'Capital.gr'
__author__ ='Stelios'
description = 'Financial News from Greece'
#max_articles_per_feed = 100
oldest_article = 3
publisher = 'Capital.gr'
category = 'news, GR'
language = 'el'
encoding = 'windows-1253'
cover_url = 'http://files.capital.gr/images/caplogo.gif'
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
keep_only_tags = [
dict(name='h1'),
dict(name='p'),
dict(name='span', attrs={'id' : ["textbody"]})
]
#3 posts seemed to have utf8 encoding
feeds = [
(u'\u039F\u039B\u0395\u03A3 \u039F\u0399 \u0395\u0399\u0394\u0397\u03A3\u0395\u0399\u03A3', 'http://www.capital.gr/news/newsrss.asp?s=-1'),
(u'\u0395\u03A0\u0399\u03A7\u0395\u0399\u03A1\u0397\u03A3\u0395\u0399\u03A3', 'http://www.capital.gr/news/newsrss.asp?s=-2'),
(u'\u0391\u0393\u039F\u03A1\u0395\u03A3', 'http://www.capital.gr/news/newsrss.asp?s=-3'),
(u'\u039F\u0399\u039A\u039F\u039D\u039F\u039C\u0399\u0391', 'http://www.capital.gr/news/newsrss.asp?s=-4'),
(u'\u03A7\u03A1\u0397\u039C. \u0391\u039D\u0391\u039A\u039F\u0399\u039D\u03A9\u03A3\u0395\u0399\u03A3', 'http://www.capital.gr/news/newsrss.asp?s=-6'),
(u'\u039C\u03CC\u03BD\u03B9\u03BC\u03B5\u03C2 \u03C3\u03C4\u03AE\u03BB\u03B5\u03C2: \u039C\u0395 \u0391\u03A0\u039F\u03A8\u0397', 'http://www.capital.gr/articles/articlesrss.asp?catid=4'),
(u'\u039C\u03CC\u03BD\u03B9\u03BC\u03B5\u03C2 \u03C3\u03C4\u03AE\u03BB\u03B5\u03C2: \u03A3\u0399\u03A9\u03A0\u0397\u03A4\u0397\u03A1\u0399\u039F', 'http://www.capital.gr/articles/articlesrss.asp?catid=6'),
(u'\u039C\u03CC\u03BD\u03B9\u03BC\u03B5\u03C2 \u03C3\u03C4\u03AE\u03BB\u03B5\u03C2: \u03A0\u0399\u03A3\u03A9 \u0391\u03A0\u039F \u03A4\u0399\u03A3 \u0393\u03A1\u0391\u039C\u039C\u0395\u03A3', 'http://www.capital.gr/articles/articlesrss.asp?catid=8'),
#(u'\u039C\u03CC\u03BD\u03B9\u03BC\u03B5\u03C2 \u03C3\u03C4\u03AE\u03BB\u03B5\u03C2: \u03A4\u0395\u03A7\u039D\u039F\u039B\u039F\u0393\u0399\u0391', 'http://www.capital.gr/news/newsrss.asp?s=-8') not working for now
]

recipes/catavencii.recipe Normal file

@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
catavencii.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Catavencii(BasicNewsRecipe):
title = u'Ca\u0163avencii'
__author__ = u'Silviu Cotoar\u0103'
publisher = u'Ca\u0163avencii'
description = u'Ca\u0163avencii'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
category = 'Ziare,Romania'
encoding = 'utf-8'
cover_url = 'http://www.simonatache.ro/wp-content/uploads/2011/06/catavencii-logo.png'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
keep_only_tags = [
dict(name='div', attrs={'id':'content'})
]
remove_tags = [
dict(name='div', attrs={'id':'breadcrumbs'})
, dict(name='span', attrs={'class':'info'})
, dict(name='div', attrs={'id':'social-media-article'})
]
remove_tags_after = [
dict(name='div', attrs={'id':'social-media-article'})
]
feeds = [
(u'\u0218tiri', u'http://www.catavencii.ro/rss')
]
def preprocess_html(self, soup):
return self.adeify_images(soup)


@@ -4,16 +4,16 @@
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
catavencu.ro
academiacatavencu.info
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Catavencu(BasicNewsRecipe):
class AcademiaCatavencu(BasicNewsRecipe):
title = u'Academia Ca\u0163avencu'
__author__ = u'Silviu Cotoar\u0103'
description = 'Tagma cum laude'
publisher = 'Catavencu'
publisher = u'Ca\u0163avencu'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
@@ -21,32 +21,31 @@ class Catavencu(BasicNewsRecipe):
use_embedded_content = False
category = 'Ziare'
encoding = 'utf-8'
cover_url = 'http://upload.wikimedia.org/wikipedia/en/1/1e/Academia_Catavencu.jpg'
cover_url = 'http://www.academiacatavencu.info/images/logo.png'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'publisher' : publisher
}
keep_only_tags = [
dict(name='ul', attrs={'class':'articles'})
dict(name='h1', attrs={'class':'art_title'}),
dict(name='div', attrs={'class':'art_text'})
]
remove_tags = [
dict(name='div', attrs={'class':['tools']})
, dict(name='div', attrs={'class':['share']})
, dict(name='div', attrs={'class':['category']})
, dict(name='div', attrs={'id':['comments']})
dict(name='div', attrs={'class':['desp_m']})
, dict(name='div', attrs={'id':['tags']})
]
remove_tags_after = [
dict(name='div', attrs={'id':'comments'})
dict(name='div', attrs={'class':['desp_m']})
]
feeds = [
(u'Feeds', u'http://catavencu.ro/feed/rss')
(u'Feeds', u'http://www.academiacatavencu.info/rss.xml')
]
def preprocess_html(self, soup):


@@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1328971305(BasicNewsRecipe):
title = u'Catholic Daily Readings'
language = 'en'
__author__ = 'adoucette'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Daily Readings - USCCB', u'http://www.usccb.org/bible/readings/rss/'), (u'Daily Reflection - One Bread One Body', u'http://www.presentationministries.com/general/rss.asp'), (u'Mass Readings - Universalis', u'http://www.universalis.com/atommass3.xml'), (u'Saint Of The Day - CNA', u'http://feeds.feedburner.com/catholicnewsagency/saintoftheday')]


@@ -1,16 +1,20 @@
from calibre.web.feeds.news import BasicNewsRecipe
class CD_Action(BasicNewsRecipe):
title = u'CD-Action'
__author__ = 'fenuks'
description = 'cdaction.pl - polish magazine about games site'
description = 'cdaction.pl - polish games magazine site'
category = 'games'
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
cover_url =u'http://s.cdaction.pl/obrazki/logo-CD-Action_172k9.JPG'
keep_only_tags= dict(id='news_content')
remove_tags_after= dict(name='div', attrs={'class':'tresc'})
feeds = [(u'Newsy', u'http://www.cdaction.pl/rss_newsy.xml')]
def get_cover_url(self):
soup = self.index_to_soup('http://www.cdaction.pl/magazyn/')
self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href']
return getattr(self, 'cover_url', self.cover_url)


@@ -5,6 +5,7 @@ class CGM(BasicNewsRecipe):
oldest_article = 7
__author__ = 'fenuks'
description = u'Codzienna Gazeta Muzyczna'
masthead_url='http://www.cgm.pl/img/header/logo.gif'
cover_url = 'http://www.krafcy.com/foto/tinymce/Image/cgm%281%29.jpg'
category = 'music'
language = 'pl'
@@ -23,21 +24,19 @@ class CGM(BasicNewsRecipe):
def preprocess_html(self, soup):
gallery=soup.find('div', attrs={'class':'galleryFlash'})
if gallery:
img=gallery.div
gallery.img.extract()
if img:
img=img['style']
img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')]
gallery.contents[1].name='img'
gallery.contents[1]['src']=img
for item in soup.findAll(style=True):
del item['style']
ad=soup.findAll('a')
for r in ad:
if 'http://www.hustla.pl' in r['href']:
if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']:
r.extract()
gallery=soup.find('div', attrs={'class':'galleryFlash'})
if gallery:
img=gallery.find('embed')
if img:
img=img['src'][35:]
img='http://www.cgm.pl/_vault/_gallery/_photo/'+img
param=gallery.findAll(name='param')
for i in param:
i.extract()
gallery.contents[1].name='img'
gallery.contents[1]['src']=img
return soup


@@ -77,8 +77,18 @@ class ChicagoTribune(BasicNewsRecipe):
def get_article_url(self, article):
print article.get('feedburner_origlink', article.get('guid', article.get('link')))
return article.get('feedburner_origlink', article.get('guid', article.get('link')))
url = article.get('feedburner_origlink', article.get('guid', article.get('link')))
if url.endswith('?track=rss'):
url = url.partition('?')[0]
return url
def skip_ad_pages(self, soup):
text = soup.find(text='click here to continue to article')
if text:
a = text.parent
url = a.get('href')
if url:
return self.index_to_soup(url, raw=True)
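# Illustration (hypothetical flow): when the first fetch lands on an
# interstitial ad page, the 'click here to continue to article' link is
# followed and the real article html is returned in place of the ad page.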
def postprocess_html(self, soup, first_fetch):
# Remove the navigation bar. It was kept until now to be able to follow


@@ -33,6 +33,32 @@ class ChristianScienceMonitor(BasicNewsRecipe):
remove_javascript = True
no_stylesheets = True
requires_version = (0, 8, 39)
def preprocess_raw_html(self, raw, url):
try:
from html5lib import parse
root = parse(raw, namespaceHTMLElements=False,
treebuilder='lxml').getroot()
from lxml import etree
for tag in root.xpath(
'//script|//style|//noscript|//meta|//link|//object'):
tag.getparent().remove(tag)
for elem in list(root.iterdescendants(tag=etree.Comment)):
elem.getparent().remove(elem)
ans = etree.tostring(root, encoding=unicode)
ans = re.sub('.*<html', '<html', ans, flags=re.DOTALL)
return ans
except:
import traceback
traceback.print_exc()
raise
def index_to_soup(self, url):
raw = BasicNewsRecipe.index_to_soup(self, url,
raw=True).decode('utf-8')
raw = self.preprocess_raw_html(raw, url)
return BasicNewsRecipe.index_to_soup(self, raw)
def append_page(self, soup, appendtag, position):
nav = soup.find('div',attrs={'class':'navigation'})
@@ -78,14 +104,6 @@ class ChristianScienceMonitor(BasicNewsRecipe):
print_soup = soup
return print_soup
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
(r'<!--.*?-->', lambda match : ''),
(r'<body.*?<div id="story"', lambda match : '<body><div id="story"'),
(r'<div class="pubdate">.*?</div>', lambda m: ''),
(r'Full HTML version of this story which may include photos, graphics, and related links.*</body>',
lambda match : '</body>'),
]]
extra_css = '''
h1{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: large}
.sub{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: small;}


@@ -0,0 +1,48 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Ciekawostki_Historyczne(BasicNewsRecipe):
title = u'Ciekawostki Historyczne'
oldest_article = 7
__author__ = 'fenuks'
description = u'Serwis popularnonaukowy - odkrycia, kontrowersje, historia, ciekawostki, badania, ciekawostki z przeszłości.'
category = 'history'
language = 'pl'
masthead_url= 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
cover_url='http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?</fb:like>', re.DOTALL), lambda match: ''), (re.compile(ur'<h2>Zobacz też:</h2>.*?</ol>', re.DOTALL), lambda match: '')]
no_stylesheets=True
remove_empty_feeds=True
keep_only_tags=[dict(name='div', attrs={'class':'post'})]
remove_tags=[dict(id='singlepostinfo')]
feeds = [(u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'), (u'\u015aredniowiecze', u'http://ciekawostkihistoryczne.pl/tag/sredniowiecze/feed/'), (u'Nowo\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/nowozytnosc/feed/'), (u'XIX wiek', u'http://ciekawostkihistoryczne.pl/tag/xix-wiek/feed/'), (u'1914-1939', u'http://ciekawostkihistoryczne.pl/tag/1914-1939/feed/'), (u'1939-1945', u'http://ciekawostkihistoryczne.pl/tag/1939-1945/feed/'), (u'Powojnie (od 1945)', u'http://ciekawostkihistoryczne.pl/tag/powojnie/feed/'), (u'Recenzje', u'http://ciekawostkihistoryczne.pl/category/recenzje/feed/')]
def append_page(self, soup, appendtag):
tag=soup.find(name='h7')
if tag:
if tag.br:
pass
elif tag.nextSibling.name=='p':
tag=tag.nextSibling
nexturl = tag.findAll('a')
for nextpage in nexturl:
tag.extract()
nextpage= nextpage['href']
soup2 = self.index_to_soup(nextpage)
pagetext = soup2.find(name='div', attrs={'class':'post'})
for r in pagetext.findAll('div', attrs={'id':'singlepostinfo'}):
r.extract()
for r in pagetext.findAll('div', attrs={'class':'wp-caption alignright'}):
r.extract()
for r in pagetext.findAll('h1'):
r.extract()
pagetext.find('h6').nextSibling.extract()
pagetext.find('h7').nextSibling.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup


@@ -23,7 +23,9 @@ class TheCND(BasicNewsRecipe):
remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
no_stylesheets = True
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
(re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
]
def print_version(self, url):
if url.find('news/article.php') >= 0:
@@ -46,16 +48,18 @@ class TheCND(BasicNewsRecipe):
title = self.tag_to_string(a)
self.log('\tFound article: ', title, 'at', url)
date = a.nextSibling
if re.search('cm', date):
continue
if (date is not None) and len(date)>2:
if not articles.has_key(date):
articles[date] = []
articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
self.log('\t\tAppend to : ', date)
self.log('log articles', articles)
#self.log('log articles', articles)
mostCurrent = sorted(articles).pop()
self.title = 'CND ' + mostCurrent
self.title = 'CND ' + mostCurrent
feeds.append((self.title, articles[mostCurrent]))
return feeds

recipes/cnd_weekly.recipe Normal file

@@ -0,0 +1,72 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2010, Derek Liang <Derek.liang.ca @@@at@@@ gmail.com>'
'''
cnd.org
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class TheCND(BasicNewsRecipe):
title = 'CND Weekly'
__author__ = 'Derek Liang'
description = ''
INDEX = 'http://cnd.org'
language = 'zh'
conversion_options = {'linearize_tables':True}
remove_tags_before = dict(name='div', id='articleHead')
remove_tags_after = dict(id='copyright')
remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
no_stylesheets = True
preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
(re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
]
def print_version(self, url):
if url.find('news/article.php') >= 0:
return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url)
else:
return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url)
def parse_index(self):
soup = self.index_to_soup(self.INDEX)
feeds = []
articles = {}
for a in soup.findAll('a', attrs={'target':'_cnd'}):
url = a['href']
if url.find('article.php') < 0 :
continue
if url.startswith('/'):
url = 'http://cnd.org'+url
title = self.tag_to_string(a)
date = a.nextSibling
if not re.search('cm', date):
continue
self.log('\tFound article: ', title, 'at', url, '@', date)
if (date is not None) and len(date)>2:
if not articles.has_key(date):
articles[date] = []
articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
self.log('\t\tAppend to : ', date)
sorted_articles = sorted(articles)
while sorted_articles:
mostCurrent = sorted_articles.pop()
self.title = 'CND ' + mostCurrent
feeds.append((self.title, articles[mostCurrent]))
return feeds
def populate_article_metadata(self, article, soup, first):
header = soup.find('h3')
self.log('header: ' + self.tag_to_string(header))
pass


@@ -5,8 +5,8 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
Changelog:
2011-09-24
Changed cover (drMerry)
'''
'''
2011-10-13
Updated Cover (drMerry)
news.cnet.com
'''
@@ -24,7 +24,7 @@ class CnetNews(BasicNewsRecipe):
encoding = 'cp1252'
use_embedded_content = False
language = 'en'
cover_url = 'http://reviews.cnet.com/i/ff/wp/logo_cnet.gif'
conversion_options = {
'comment' : description
, 'tags' : category


@@ -22,6 +22,14 @@ class CNN(BasicNewsRecipe):
#match_regexps = [r'http://sportsillustrated.cnn.com/.*/[1-9].html']
max_articles_per_feed = 25
extra_css = '''
h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
.cnn_story_author, .cnn_stryathrtmp {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.cnn_strycaptiontxt, .cnnArticleGalleryPhotoContainer {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.cnn_strycbftrtxt, .cnnEditorialNote {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.cnn_strycntntlft {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
'''
preprocess_regexps = [
(re.compile(r'<!--\[if.*if\]-->', re.DOTALL), lambda m: ''),
(re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
@@ -32,7 +40,12 @@ class CNN(BasicNewsRecipe):
remove_tags = [
{'class':['cnn_strybtntools', 'cnn_strylftcntnt',
'cnn_strybtntools', 'cnn_strybtntoolsbttm', 'cnn_strybtmcntnt',
'cnn_strycntntrgt', 'hed_side', 'foot']},
'cnn_strycntntrgt', 'hed_side', 'foot', 'cnn_strylftcntnt cnn_strylftcexpbx']},
{'class':['cnn_html_media_title_new', 'cnn_html_media_title_new cnn_html_media_title_none',
'cnnArticleGalleryCaptionControlText', 'articleGalleryNavContainer']},
{'id':['articleGalleryNav00JumpPrev', 'articleGalleryNav00Prev',
'articleGalleryNav00Next', 'articleGalleryNav00JumpNext']},
{'style':['display:none']},
dict(id=['ie_column']),
]
@@ -58,3 +71,12 @@ class CNN(BasicNewsRecipe):
ans = BasicNewsRecipe.get_article_url(self, article)
return ans.partition('?')[0]
def get_masthead_url(self):
masthead = 'http://i.cdn.turner.com/cnn/.element/img/3.0/global/header/intl/hdr-globe-central.gif'
br = BasicNewsRecipe.get_browser()
try:
br.open(masthead)
except:
self.log("\nCover unavailable")
masthead = None
return masthead


@@ -14,67 +14,43 @@ class ColumbusDispatchRecipe(BasicNewsRecipe):
use_embedded_content = False
remove_empty_feeds = True
oldest_article = 1.2
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
# Seems to work best, but YMMV
simultaneous_downloads = 2
auto_cleanup = True
#auto_cleanup_keep = '//div[@id="story-photos"]'
# Feeds from http://www.dispatch.com/live/content/rss/index.html
feeds = []
feeds.append((u'News: Local and state news', u'http://www.dispatch.com/live/static/crt/2_rss_localnews.xml'))
feeds.append((u'News: National news', u'http://www.dispatch.com/live/static/crt/2_rss_nationalnews.xml'))
feeds.append((u'News: Editorials', u'http://www.dispatch.com/live/static/crt/2_rss_editorials.xml'))
feeds.append((u'News: Columnists', u'http://www.dispatch.com/live/static/crt/2_rss_columnists.xml'))
feeds.append((u'News: Health news', u'http://www.dispatch.com/live/static/crt/2_rss_health.xml'))
feeds.append((u'News: Science news', u'http://www.dispatch.com/live/static/crt/2_rss_science.xml'))
feeds.append((u'Sports: OSU football', u'http://www.dispatch.com/live/static/crt/2_rss_osufootball.xml'))
feeds.append((u'Sports: OSU men\'s basketball', u'http://www.dispatch.com/live/static/crt/2_rss_osumensbball.xml'))
feeds.append((u'Sports: OSU women\'s basketball', u'http://www.dispatch.com/live/static/crt/2_rss_osuwomensbball.xml'))
feeds.append((u'Sports: OSU sports', u'http://www.dispatch.com/live/static/crt/2_rss_osusports.xml'))
feeds.append((u'Sports: Blue Jackets', u'http://www.dispatch.com/live/static/crt/2_rss_bluejackets.xml'))
feeds.append((u'Sports: Crew', u'http://www.dispatch.com/live/static/crt/2_rss_crew.xml'))
feeds.append((u'Sports: Clippers', u'http://www.dispatch.com/live/static/crt/2_rss_clippers.xml'))
feeds.append((u'Sports: Indians', u'http://www.dispatch.com/live/static/crt/2_rss_indians.xml'))
feeds.append((u'Sports: Reds', u'http://www.dispatch.com/live/static/crt/2_rss_reds.xml'))
feeds.append((u'Sports: Golf', u'http://www.dispatch.com/live/static/crt/2_rss_golf.xml'))
feeds.append((u'Sports: Outdoors', u'http://www.dispatch.com/live/static/crt/2_rss_outdoors.xml'))
feeds.append((u'Sports: Cavs/NBA', u'http://www.dispatch.com/live/static/crt/2_rss_cavaliers.xml'))
feeds.append((u'Sports: High Schools', u'http://www.dispatch.com/live/static/crt/2_rss_highschools.xml'))
feeds.append((u'Sports: Browns', u'http://www.dispatch.com/live/static/crt/2_rss_browns.xml'))
feeds.append((u'Sports: Bengals', u'http://www.dispatch.com/live/static/crt/2_rss_bengals.xml'))
feeds.append((u'Sports: Auto Racing', u'http://www.dispatch.com/live/static/crt/2_rss_autoracing.xml'))
feeds.append((u'Business News', u'http://www.dispatch.com/live/static/crt/2_rss_business.xml'))
feeds.append((u'Features: Weekender', u'http://www.dispatch.com/live/static/crt/2_rss_weekender.xml'))
feeds.append((u'Features: Life and Arts', u'http://www.dispatch.com/live/static/crt/2_rss_lifearts.xml'))
feeds.append((u'Features: Food', u'http://www.dispatch.com/live/static/crt/2_rss_food.xml'))
feeds.append((u'Features: NOW! for kids', u'http://www.dispatch.com/live/static/crt/2_rss_now.xml'))
feeds.append((u'Features: Travel', u'http://www.dispatch.com/live/static/crt/2_rss_travel.xml'))
feeds.append((u'Features: Home and Garden', u'http://www.dispatch.com/live/static/crt/2_rss_homegarden.xml'))
feeds.append((u'Features: Faith and Values', u'http://www.dispatch.com/live/static/crt/2_rss_faithvalues.xml'))
#feeds.append((u'', u''))
feeds = [
('Local',
'http://www.dispatch.com/content/syndication/news_local-state.xml'),
('National',
'http://www.dispatch.com/content/syndication/news_national.xml'),
('Business',
'http://www.dispatch.com/content/syndication/news_business.xml'),
('Editorials',
'http://www.dispatch.com/content/syndication/opinion_editorials.xml'),
('Columnists',
'http://www.dispatch.com/content/syndication/opinion_columns.xml'),
('Life and Arts',
'http://www.dispatch.com/content/syndication/lae_life-and-arts.xml'),
('OSU Sports',
'http://www.dispatch.com/content/syndication/sports_osu.xml'),
('Auto Racing',
'http://www.dispatch.com/content/syndication/sports_auto-racing.xml'),
('Outdoors',
'http://www.dispatch.com/content/syndication/sports_outdoors.xml'),
('Bengals',
'http://www.dispatch.com/content/syndication/sports_bengals.xml'),
('Indians',
'http://www.dispatch.com/content/syndication/sports_indians.xml'),
('Clippers',
'http://www.dispatch.com/content/syndication/sports_clippers.xml'),
('Crew',
'http://www.dispatch.com/content/syndication/sports_crew.xml'),
('Reds',
'http://www.dispatch.com/content/syndication/sports_reds.xml'),
('Blue Jackets',
'http://www.dispatch.com/content/syndication/sports_bluejackets.xml'),
]
keep_only_tags = []
keep_only_tags.append(dict(name = 'div', attrs = {'class': 'colhed'}))
keep_only_tags.append(dict(name = 'div', attrs = {'class': 'hed'}))
keep_only_tags.append(dict(name = 'div', attrs = {'class': 'subhed'}))
keep_only_tags.append(dict(name = 'div', attrs = {'class': 'date'}))
keep_only_tags.append(dict(name = 'div', attrs = {'class': 'byline'}))
keep_only_tags.append(dict(name = 'div', attrs = {'class': 'srcline'}))
keep_only_tags.append(dict(name = 'div', attrs = {'class': 'body'}))
remove_tags = []
remove_tags.append(dict(name = 'div', attrs = {'id': 'middle-story-ad-container'}))
extra_css = '''
body {font-family:verdana,arial,helvetica,geneva,sans-serif ;}
a {text-decoration: none; color: blue;}
div.colhed {font-weight: bold;}
div.hed {font-size: xx-large; font-weight: bold; margin-bottom: 0.2em;}
div.subhed {font-size: large;}
div.date {font-size: x-small; font-style: italic; color: #666666; margin-top: 0.4em; margin-bottom: 0.4em;}
div.byline, div.srcline {font-size: small; color: #696969;}
'''

View File

@ -1,38 +1,89 @@
#!/usr/bin/env python
##
## Title: Common Dreams
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Feb 2012: Cleaned up the output to have only the main article
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
'''
commondreams.org
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class CommonDreams(BasicNewsRecipe):
# Identify the recipe
title = u'Common Dreams'
description = u'Progressive news and views'
description = u'Breaking News & Views for the Progressive Community.'
cover_url = 'https://s3.amazonaws.com/s3.commondreams.org/images/common-dreams.png'
__author__ = u'XanthanGum'
language = 'en'
# Format the text
extra_css = '''
body{font-family:verdana,arial,helvetica,geneva,sans-serif ;}
h1{font-size: xx-large;}
h2{font-size: large;}
'''
# Pick no article older than seven days and limit the number of articles per feed to 100
oldest_article = 7
max_articles_per_feed = 100
# Remove everything before the article
no_stylesheets = True
remove_javascript = True
# Flatten all tables to make the output compatible with the Nook
conversion_options = {'linearize_tables' : True}
remove_tags_before = dict(name = 'div', attrs = {'id':'node-header'})
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
# Remove everything after the article
# Specify extra CSS - overrides ALL other CSS (i.e. it is added last).
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
.introduction, .first { font-weight: bold; } \
.cross-head { font-weight: bold; font-size: 125%; } \
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
font-size: 80%; font-style: italic; margin: 1px auto; } \
.story-date, .published { font-size: 80%; } \
table { width: 100%; } \
td img { display: block; margin: 5px auto; } \
ul { padding-top: 10px; } \
ol { padding-top: 10px; } \
li { padding-top: 5px; padding-bottom: 5px; } \
h1 { font-size: 175%; font-weight: bold; } \
h2 { font-size: 150%; font-weight: bold; } \
h3 { font-size: 125%; font-weight: bold; } \
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
# Remove line breaks, left/right floats and picture width/height styles.
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'float:\s*(left|right)'), lambda m: ''),
(re.compile(r'width:.*?px'), lambda m: ''),
(re.compile(r'height:.*?px'), lambda m: ''),
(re.compile(r'<a.*?>'), lambda m: ''),
(re.compile(r'</a>'), lambda m: ''),
]
remove_tags_after = dict(name = 'div', attrs = {'class':'copyright-info'})
# Main article is inside this tag
keep_only_tags = [
dict(name='div', attrs={'id':lambda x: x and 'node-' in x}),
]
remove_tags = [
dict(name='div', attrs={'class':'node-links clear-block'}), # remove Share options
]
# Identify the news feeds
feeds = [(u'Headlines', u'http://www.commondreams.org/feed/headlines_rss'),
(u'Further News Articles', u'http://www.commondreams.org/feed/further_rss'),
(u'Views', u'http://www.commondreams.org/feed/views_rss'),
(u'Progressive Newswire', u'http://www.commondreams.org/feed/newswire_rss')]
feeds = [(u'Headlines', u'https://www.commondreams.org/feed/headlines_rss'),
(u'Further News Articles', u'https://www.commondreams.org/feed/further_rss'),
(u'Views', u'https://www.commondreams.org/feed/views_rss'),
(u'Progressive Newswire', u'https://www.commondreams.org/feed/newswire_rss')]
def print_version(self, url):
url = url + '?print'
return url
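The keep_only_tags entry above matches ids with a callable rather than an exact string. A self-contained check of that BeautifulSoup behaviour, using the copy bundled with calibre and a made-up snippet:

from calibre.ebooks.BeautifulSoup import BeautifulSoup

html = '<div id="node-123"><p>article</p></div><div id="footer">x</div>'
soup = BeautifulSoup(html)
# A callable attrs value matches a tag when it returns a true value
hits = soup.findAll('div', attrs={'id': lambda x: x and 'node-' in x})
print [tag['id'] for tag in hits]  # -> [u'node-123']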

View File

@ -0,0 +1,23 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
class Computerworld_pl(BasicNewsRecipe):
title = u'Computerworld.pl'
__author__ = 'fenuks'
description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
category = 'IT'
language = 'pl'
masthead_url= 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif'
no_stylesheets=True
oldest_article = 7
max_articles_per_feed = 100
keep_only_tags=[dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})]
remove_tags_after=dict(name='div', attrs={'class':'rMobi'})
remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]
def get_cover_url(self):
soup = self.index_to_soup('http://www.computerworld.pl/')
cover=soup.find(name='img', attrs={'class':'prawo'})
self.cover_url=cover['src']
return self.cover_url

View File

@ -0,0 +1,71 @@
#!/usr/bin/env python
##
## Title: Consortium News
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Feb 2012: Initial release
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
'''
consortiumnews.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class ConsortiumNews(BasicNewsRecipe):
title = u'Consortium News'
publisher = 'Copyright © 2012 Consortiumnews. All Rights Reserved.'
language = 'en'
__author__ = 'kiavash'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
remove_javascript = True
conversion_options = {'linearize_tables' : True} # Flatten all tables to make the output compatible with the Nook
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
# Specify extra CSS - overrides ALL other CSS (i.e. it is added last).
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
.introduction, .first { font-weight: bold; } \
.cross-head { font-weight: bold; font-size: 125%; } \
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
font-size: 80%; font-style: italic; margin: 1px auto; } \
.story-date, .published { font-size: 80%; } \
table { width: 100%; } \
td img { display: block; margin: 5px auto; } \
ul { padding-top: 10px; } \
ol { padding-top: 10px; } \
li { padding-top: 5px; padding-bottom: 5px; } \
h1 { font-size: 175%; font-weight: bold; } \
h2 { font-size: 150%; font-weight: bold; } \
h3 { font-size: 125%; font-weight: bold; } \
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
# Remove line breaks, left/right floats and picture width/height styles.
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'float:\s*(left|right)'), lambda m: ''),
(re.compile(r'width:.*?px'), lambda m: ''),
(re.compile(r'height:.*?px'), lambda m: ''),
(re.compile(r'<a.*?>'), lambda m: ''),
(re.compile(r'</a>'), lambda m: ''),
]
# Main article is inside this tag
keep_only_tags = [dict(name='div', attrs={'id':lambda x: x and 'post-' in x})]
remove_tags = [
dict(name='div', attrs={'class':'sociable'}), # remove 'Share this Article'
dict(name='p', attrs={'class':'tags'}), # remove 'Tags: ... '
]
feeds = [(u'Consortium News', u'http://feeds.feedburner.com/Consortiumnewscom')]
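The last two preprocess_regexps entries strip anchor tags while keeping their link text. Run over a made-up string, the effect is:

import re

sample = 'Read <a href="http://example.com/x">the full story</a> here.'
for pat in (re.compile(r'<a.*?>'), re.compile(r'</a>')):
    sample = pat.sub('', sample)
print sample  # -> Read the full story here.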

View File

@ -0,0 +1,52 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
#from calibre import __appname__
from calibre.utils.magick import Image
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Cosmopolitan UK'
description = 'Fashion, beauty and gossip for women from COSMOPOLITAN UK'
__author__ = 'Dave Asbury'
#last update 21/12/11
# greyscale code by Starson17
cover_url = 'http://www.cosmopolitan.magazine.co.uk/files/4613/2085/8988/Cosmo_Cover3.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 20
remove_empty_feeds = True
remove_javascript = True
preprocess_regexps = [
(re.compile(r'<!-- Begin tmpl module_competition_offer -->.*?<!-- End tmpl module_competition_offer-->', re.IGNORECASE | re.DOTALL), lambda match: '')]
language = 'en_GB'
masthead_url = 'http://www.cosmopolitan.co.uk/cm/cosmopolitanuk/site_images/header/cosmouk_logo_home.gif'
keep_only_tags = [
dict(attrs={'class' : ['dateAuthor', 'publishDate']}),
dict(name='div',attrs ={'id' : ['main_content']})
]
remove_tags = [
dict(name='div',attrs={'class' : ['blogInfo','viral_toolbar','comment_number','prevEntry nav']}),
dict(name='div',attrs={'class' : 'blog_module_about_the_authors'}),
dict(attrs={'id': ['breadcrumbs','comment','related_links_list','right_rail','content_sec_fb_more','content_sec_mostpopularstories','content-sec_fb_frame_viewfb_bot']}),
dict(attrs={'class' : ['read_liked_that_header','fb_back_next_area']}),
dict(name='li',attrs={'class' : 'thumb'})
]
feeds = [
(u'Love & Sex', u'http://www.cosmopolitan.co.uk/love-sex/rss/'),
(u'Men', u'http://cosmopolitan.co.uk/men/rss/'),
(u'Fashion', u'http://cosmopolitan.co.uk/fashion/rss/'),
(u'Hair & Beauty', u'http://cosmopolitan.co.uk/beauty-hair/rss/'),
(u'LifeStyle', u'http://cosmopolitan.co.uk/lifestyle/rss/'),
(u'Cosmo On Campus', u'http://cosmopolitan.co.uk/campus/rss/'),
(u'Celebrity Gossip', u'http://cosmopolitan.co.uk/celebrity-gossip/rss/')]
def postprocess_html(self, soup, first):
#process all the images
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
if img < 0:
raise RuntimeError('Out of memory')
img.type = "GrayscaleType"
img.save(iurl)
return soup
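postprocess_html above pushes every downloaded image through calibre's ImageMagick wrapper to greyscale it. The same idea as a standalone sketch, using PIL purely for illustration (an assumption -- the recipe itself uses calibre.utils.magick):

from PIL import Image

def to_greyscale(path):
    # Convert an image file to greyscale in place, as the loop above
    # does for every downloaded <img>.
    Image.open(path).convert('L').save(path)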

View File

@ -0,0 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'Countryfile.com'
cover_url = 'http://www.buysubscriptions.com/static_content/the-immediate/en/images/covers/CFIL_maxi.jpg'
__author__ = 'Dave Asbury'
description = 'The official website of Countryfile Magazine'
# last updated 29/1/12
language = 'en_GB'
oldest_article = 30
max_articles_per_feed = 25
remove_empty_feeds = True
no_stylesheets = True
auto_cleanup = True
#articles_are_obfuscated = True
remove_tags = [
# dict(attrs={'class' : ['player']}),
]
feeds = [
(u'Homepage', u'http://www.countryfile.com/rss/home'),
(u'Country News', u'http://www.countryfile.com/rss/news'),
(u'Countryside', u'http://www.countryfile.com/rss/countryside'),
]

View File

@ -1,10 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
title = u'The Daily Mirror'
description = 'News as provided by The Daily Mirror - UK'
__author__ = 'Dave Asbury'
# last updated 11/2/12
language = 'en_GB'
cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
@ -13,40 +14,64 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
oldest_article = 1
max_articles_per_feed = 100
max_articles_per_feed = 5
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
keep_only_tags = [
dict(name='h1'),
dict(attrs={'class':['article-attr']}),
dict(name='div', attrs={'class' : [ 'article-body', 'crosshead']})
auto_cleanup = True
#conversion_options = { 'linearize_tables' : True }
]
#keep_only_tags = [
# dict(name='h1'),
# dict(name='div',attrs={'id' : 'body-content'}),
#dict(name='div',atts={'class' : 'article-body'}),
#dict(attrs={'class' : ['article-attr','byline append-1','published']}),
#dict(name='p'),
# ]
#remove_tags_after = [dict (name='div',attrs={'class' : 'related'})]
remove_tags = [
dict(name='div', attrs={'class' : ['caption', 'article-resize']}),
dict( attrs={'class':'append-html'})
]
dict(name='title'),
dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),
# dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}),
#dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}),
#dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}),
]
# preprocess_regexps = [
#(re.compile(r'<dl class="q-search">.*?</dl>', re.IGNORECASE | re.DOTALL), lambda match: '')]
preprocess_regexps = [
(re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: ''),
(re.compile(r'Advertisement >>', re.IGNORECASE | re.DOTALL), lambda match: '')]
#preprocess_regexps = [
#(re.compile(r'Sponsored Links', re.IGNORECASE | re.DOTALL), lambda match: '')]
feeds = [
(u'News', u'http://www.mirror.co.uk/news/rss.xml')
,(u'Tech News', u'http://www.mirror.co.uk/news/technology/rss.xml')
,(u'Weird World','http://www.mirror.co.uk/news/weird-world/rss.xml')
,(u'Film Gossip','http://www.mirror.co.uk/celebs/film/rss.xml')
,(u'Music News','http://www.mirror.co.uk/celebs/music/rss.xml')
,(u'Celebs and Tv Gossip','http://www.mirror.co.uk/celebs/tv/rss.xml')
,(u'Sport','http://www.mirror.co.uk/sport/rss.xml')
,(u'Life Style','http://www.mirror.co.uk/life-style/rss.xml')
,(u'Advice','http://www.mirror.co.uk/advice/rss.xml')
,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
(u'UK News', u'http://feed43.com/0287771688643868.xml')
,(u'Tech News', u'http://feed43.com/2455520588350501.xml')
,(u'Weird World','http://feed43.com/0863800333634654.xml')
,(u'Sport','http://feed43.com/7713243036546130.xml')
,(u'Sport : Boxing ','http://feed43.com/0414732220804255.xml')
,(u'Sport : Rugby Union','http://feed43.com/4710138762362383.xml')
,(u'Sport : Other','http://feed43.com/4501416886323415.xml')
,(u'TV and Film','http://feed43.com/5238302853765104.xml')
,(u'Celebs','http://feed43.com/8770061048844683.xml')
,(u'Life Style : Family','http://feed43.com/4356170742410338.xml')
,(u'Travel','http://feed43.com/1436576006476607.xml')
# Example of a commented-out feed that is not needed: ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
]
extra_css = '''
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
h1{ font-size:18px;}
img { display:block}
'''

View File

@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe
class DailyWritingTips(BasicNewsRecipe):
title = u'Daily Writing Tips'
language = 'en_GB'
__author__ = 'NotTaken'
oldest_article = 7 #days
max_articles_per_feed = 40
use_embedded_content = True
no_stylesheets = True
auto_cleanup = False
encoding = 'utf-8'
feeds = [
('Latest tips',
'http://feeds2.feedburner.com/DailyWritingTips'),
]

View File

@ -8,11 +8,7 @@ class DallasNews(BasicNewsRecipe):
no_stylesheets = True
use_embedded_content = False
remove_tags_before = dict(name='h1')
keep_only_tags = {'class':lambda x: x and 'article' in x}
remove_tags = [
{'class':['DMNSocialTools', 'article ', 'article first ', 'article premium']},
]
auto_cleanup = True
feeds = [
('Local News',

15
recipes/datasport.recipe Normal file
View File

@ -0,0 +1,15 @@
__license__ = 'GPL v3'
__author__ = 'faber1971'
description = 'Italian soccer news website - v1.00 (17, December 2011)'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1324114272(BasicNewsRecipe):
title = u'Datasport'
language = 'it'
__author__ = 'faber1971'
oldest_article = 1
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Datasport', u'http://www.datasport.it/calcio/rss.xml')]

View File

@ -0,0 +1,62 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.defensenews.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class DefenseNews(BasicNewsRecipe):
title = 'Defense News'
__author__ = 'Darko Miletic'
description = 'Find late-breaking defense news from the leading defense news weekly'
publisher = 'Gannett Government Media Corporation'
category = 'defense news, defence news, defense, defence, defence budget, defence policy'
oldest_article = 31
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en'
remove_empty_feeds = True
publication_type = 'newspaper'
masthead_url = 'http://www.defensenews.com/images/logo_defensenews2.jpg'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif }
img{margin-bottom: 0.4em; display:block}
.info{font-size: small; color: gray}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [
dict(name=['meta','link'])
,dict(attrs={'class':['toolbar','related','left','right']})
]
remove_tags_before = dict(attrs={'class':'storyWrp'})
remove_tags_after = dict(attrs={'class':'middle'})
remove_attributes=['lang']
feeds = [
(u'Europe' , u'http://www.defensenews.com/rss/eur/' )
,(u'Americas', u'http://www.defensenews.com/rss/ame/' )
,(u'Asia & Pacific rim', u'http://www.defensenews.com/rss/asi/' )
,(u'Middle east & Africa', u'http://www.defensenews.com/rss/mid/')
,(u'Air', u'http://www.defensenews.com/rss/air/' )
,(u'Land', u'http://www.defensenews.com/rss/lan/' )
,(u'Naval', u'http://www.defensenews.com/rss/sea/' )
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
return soup
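preprocess_html above drops inline styles and guarantees alt text on images. Applied to a made-up snippet:

from calibre.ebooks.BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<p style="color:red"><img src="x.jpg" /></p>')
for item in soup.findAll(style=True):
    del item['style']
for item in soup.findAll('img'):
    if not item.has_key('alt'):
        item['alt'] = 'image'
print soup  # -> roughly <p><img src="x.jpg" alt="image" /></p>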

View File

@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1324913694(BasicNewsRecipe):
title = u'Derin Dusunce'
language = 'tr'
__author__ = 'asalet_r'
oldest_article = 7
max_articles_per_feed = 20
auto_cleanup = True
feeds = [(u'Derin D\xfc\u015f\xfcnce', u'http://www.derindusunce.org/feed/')]

View File

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
'''
descopera.org
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Descopera(BasicNewsRecipe):
title = u'Descoperă.org'
__author__ = 'Marius Ignătescu'
description = 'Descoperă. Placerea de a cunoaște'
publisher = 'descopera.org'
category = 'science, technology, culture, history, earth'
language = 'ro'
oldest_article = 14
max_articles_per_feed = 100
encoding = 'utf8'
no_stylesheets = True
extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
keep_only_tags = [dict(name='div', attrs={'class':['post']})]
remove_tags = [dict(name='div', attrs={'class':['topnav', 'box_a', 'shr-bookmarks shr-bookmarks-expand shr-bookmarks-center shr-bookmarks-bg-knowledge']})]
remove_attributes = ['width','height']
cover_url = 'http://www.descopera.org/wp-content/themes/dorg/styles/default/img/b_top.png?width=400'
feeds = [(u'Articles', u'http://www.descopera.org/feed/')]
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@ -0,0 +1,21 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
from calibre.web.feeds.news import BasicNewsRecipe
class DesiringGodEnglish(BasicNewsRecipe):
title = u'Desiring God'
__author__ = 'Peter Grungi'
language = 'en'
cover_url = 'http://cdn0.desiringgod.org/images/layout/breadcrumbs_dg_mark.png'
masthead_url = 'http://cdn0.desiringgod.org/images/layout/breadcrumbs_dg_mark.png'
language = 'en'
oldest_article = 7
max_articles_per_feed = 50
auto_cleanup = True
publisher = 'Desiring God Ministries'
author = 'Desiring God Ministries'
feeds = [(u'Desiring God Blog', u'http://feeds.feedburner.com/DGBlog?format=xml')]

View File

@ -16,7 +16,7 @@ class DeutscheWelle_es(BasicNewsRecipe):
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True
language = 'de_ES'
language = 'de'
publication_type = 'newsportal'
remove_empty_feeds = True
masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'

View File

@ -46,7 +46,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
dict(name = 'div', attrs = {'class' : 'poradniki_context'}),
dict(name = 'div', attrs = {'class' : 'uniBox'}),
dict(name = 'object', attrs = {}),
dict(name = 'h3', attrs = {})
dict(name = 'h3', attrs = {}),
dict(attrs={'class':'twitter-share-button'})
]
preprocess_regexps = [
@ -58,3 +59,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
(r'\s*</', lambda match: '</'),
]
]
def skip_ad_pages(self, soup):
if 'Advertisement' in soup.title:
nexturl=soup.find('a')['href']
return self.index_to_soup(nexturl, raw=True)

View File

@ -2,6 +2,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
http://www.dilbert.com
DrMerry added cover Image 2011-11-12
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
@ -9,7 +10,7 @@ import re
class DilbertBig(BasicNewsRecipe):
title = 'Dilbert'
__author__ = 'Darko Miletic and Starson17'
__author__ = 'Darko Miletic and Starson17 contribution of DrMerry'
description = 'Dilbert'
reverse_article_order = True
oldest_article = 15
@ -20,6 +21,7 @@ class DilbertBig(BasicNewsRecipe):
publisher = 'UNITED FEATURE SYNDICATE, INC.'
category = 'comic'
language = 'en'
cover_url = 'http://dilbert.com/mobile/mobile/dilbert.app.icon.png'
conversion_options = {
'comments' : description

View File

@ -7,6 +7,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
__licence__ ='GPL v3'
category = 'IT'
language = 'pl'
masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png'
cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
description = u'Aktualności i blogi z dobreprogramy.pl'
encoding = 'utf-8'
@ -16,7 +17,8 @@ class Dobreprogramy_pl(BasicNewsRecipe):
oldest_article = 8
max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ]
remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
keep_only_tags = [dict(name='div', attrs={'class':['mainBar', 'newsContent', 'postTitle title', 'postInfo', 'contentText', 'content']})]
keep_only_tags=[dict(attrs={'class':['news', 'entry single']})]
remove_tags = [dict(name='div', attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']})]
#remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]

View File

@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1324736687(BasicNewsRecipe):
title = u'D\xfcnya Bizim'
language = 'tr'
__author__ = 'asalet_r'
oldest_article = 7
max_articles_per_feed = 10
auto_cleanup = True
feeds = [(u'Aktif \u0130mamlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=31'), (u'Ayr\u0131nt\u0131 Defteri', u'http://dunyabizim.com/servisler/rss.php?kategoriID=58'), (u'Baba Kitaplar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=4'), (u'Bu da Oldu', u'http://dunyabizim.com/servisler/rss.php?kategoriID=32'), (u'\xc7-al\u0131nt\u0131 Yaz\u0131lar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=33'), (u'Dar\xfclmedya', u'http://dunyabizim.com/servisler/rss.php?kategoriID=49'), (u'Gidenler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=59'), (u'G\xfczel Mekanlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=43'), (u'\u0130yi Haberler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=18'), (u'\u0130yi M\xfczikler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=2'), (u'Kalite Dergiler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=3'), (u'Konu\u015fa Konu\u015fa', u'http://dunyabizim.com/servisler/rss.php?kategoriID=24'), (u'M\xfcstesta G\xfczeller', u'http://dunyabizim.com/servisler/rss.php?kategoriID=65'), (u'O \u015eimdi Nerede?', u'http://dunyabizim.com/servisler/rss.php?kategoriID=52'), (u'Olsa Ke\u015fke', u'http://dunyabizim.com/servisler/rss.php?kategoriID=34'), (u'Orada Ne Oldu?', u'http://dunyabizim.com/servisler/rss.php?kategoriID=38'), (u'\xd6nemli Adamlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=1'), (u'Polemik', u'http://dunyabizim.com/servisler/rss.php?kategoriID=39'), (u'Sinema', u'http://dunyabizim.com/servisler/rss.php?kategoriID=23'), (u'Yalan Haber', u'http://dunyabizim.com/servisler/rss.php?kategoriID=40'), (u'Yeni \u015eeyler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=57'), (u'Zekeriya Sofras\u0131', u'http://dunyabizim.com/servisler/rss.php?kategoriID=60')]

View File

@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1321194347(BasicNewsRecipe):
title = u'D\xfcnya B\xfclteni'
language = 'tr'
__author__ = 'asalet_r'
oldest_article = 7
max_articles_per_feed = 50
auto_cleanup = True
feeds = [(u'Tarih Dosyas\u0131', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=157'), (u'R\xf6portaj', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=153'), (u'Makale-Yorum', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=174'), (u'K\xfclt\xfcr-Sanat', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=66'), (u'Hayat\u0131n \u0130\xe7inden', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=200'), (u'Haber Analiz', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=123'), (u'Gezi-\u0130zlenim', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=90'), (u'Aile Sa\u011fl\u0131k E\u011fitim', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=75')]

View File

@ -0,0 +1,67 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Dziennik_pl(BasicNewsRecipe):
title = u'Dziennik.pl'
__author__ = 'fenuks'
description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.'
category = 'newspaper'
language = 'pl'
masthead_url= 'http://5.s.dziennik.pl/images/logos.png'
cover_url= 'http://5.s.dziennik.pl/images/logos.png'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
remove_javascript=True
remove_empty_feeds=True
extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('<p><strong><a href=".*?">&gt;&gt;&gt; CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')]
keep_only_tags=[dict(id='article')]
remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})]
feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
(u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
(u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
(u'Kobieta', u'http://rss.dziennik.pl/Dziennik-Kobieta'),
(u'Auto', u'http://rss.dziennik.pl/Dziennik-Auto'),
(u'Rozrywka', u'http://rss.dziennik.pl/Dziennik-Rozrywka'),
(u'Film', u'http://rss.dziennik.pl/Dziennik-Film'),
(u'Muzyka' , u'http://rss.dziennik.pl/Dziennik-Muzyka'),
(u'Kultura', u'http://rss.dziennik.pl/Dziennik-Kultura'),
(u'Nauka', u'http://rss.dziennik.pl/Dziennik-Nauka'),
(u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
(u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]
def skip_ad_pages(self, soup):
tag=soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'})
if tag:
new_soup=self.index_to_soup(tag['href'], raw=True)
return new_soup
def append_page(self, soup, appendtag):
tag=soup.find('a', attrs={'class':'page_next'})
if tag:
appendtag.find('div', attrs={'class':'article_paginator'}).extract()
while tag:
soup2= self.index_to_soup(tag['href'])
tag=soup2.find('a', attrs={'class':'page_next'})
if not tag:
for r in appendtag.findAll('div', attrs={'class':'art_src'}):
r.extract()
pagetext = soup2.find(name='div', attrs={'class':'article_body'})
for dictionary in self.remove_tags:
v=pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs'])
for delete in v:
delete.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
if appendtag.find('div', attrs={'class':'article_paginator'}):
appendtag.find('div', attrs={'class':'article_paginator'}).extract()
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup
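append_page above is a pagination stitcher: it follows the 'page_next' links, pulls each page's article_body and appends it to the first page, pruning the paginator widgets as it goes. Reduced to a skeleton (hypothetical helper, same class names as the recipe):

def stitch(recipe, soup, appendtag):
    # Follow 'next page' links and append every page body to the first page
    tag = soup.find('a', attrs={'class': 'page_next'})
    while tag:
        page = recipe.index_to_soup(tag['href'])
        tag = page.find('a', attrs={'class': 'page_next'})
        body = page.find('div', attrs={'class': 'article_body'})
        appendtag.insert(len(appendtag.contents), body)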

View File

@ -0,0 +1,46 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
'''
Fetch echo-online.de
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class Echo_Online(BasicNewsRecipe):
title = u'Echo Online' # 2011-12-28 AGe
description = '-Echo Online-'
publisher = 'Echo Online GmbH'
category = 'News, Germany'
__author__ = 'Armin Geller' # 2011-12-28 AGe
language = 'de'
lang = 'de-DE'
encoding = 'iso-8859-1'
timefmt = ' [%a, %d %b %Y]'
oldest_article = 7
max_articles_per_feed = 50 # 2011-12-28 AGe
no_stylesheets = True
auto_cleanup = True
remove_javascript = True
feeds = [
(u'Topnews', u'http://www.echo-online.de/storage/rss/rss/topnews.xml'),
(u'Darmstadt', u'http://www.echo-online.de/rss/darmstadt.xml'),
(u'Darmstadt-Dieburg', u'http://www.echo-online.de/rss/darmstadtdieburg.xml'),
(u'Kreis Gro\xdf-Gerau', u'http://www.echo-online.de/rss/kreisgrossgerau.xml'),
(u'R\xfcsselsheim', u'http://www.echo-online.de/rss/ruesselsheim.xml'),
(u'Kreis Bergstra\xdfe', u'http://www.echo-online.de/rss/bergstrasse.xml'),
(u'Odenwaldkreis', u'http://www.echo-online.de/rss/odenwald.xml'),
(u'SV 98', u'http://www.echo-online.de/rss/sv98.xml'),
(u'Kino', u'http://www.echo-online.de/rss/kino.xml'),
(u'Ausstellungen', u'http://www.echo-online.de/rss/ausstellungen.xml'),
(u'Ausflug & Reise', u'http://www.echo-online.de/rss/ausflugreise.xml'),
]
def print_version(self, url):
return self.browser.open_novisit(url).geturl() + '?_FRAME=33&_FORMAT=PRINT'
remove_tags = [dict(name='div', attrs={'class':["header", "name"]}),]
auto_cleanup_keep = '//div[@class="bild_gross w270"]'
cover_url = 'http://adcounter.darmstaedter-echo.de/webdav/files/config/gui/images/Zeitungsfaecher.gif'

View File

@ -22,8 +22,6 @@ class Economist(BasicNewsRecipe):
' perspective. Best downloaded on Friday mornings (GMT)')
extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
oldest_article = 7.0
cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg'
#cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
dict(attrs={'class':['dblClkTrk', 'ec-article-info',
@ -56,6 +54,19 @@ class Economist(BasicNewsRecipe):
return br
'''
def get_cover_url(self):
soup = self.index_to_soup('http://www.economist.com/printedition/covers')
div = soup.find('div', attrs={'class':lambda x: x and
'print-cover-links' in x})
a = div.find('a', href=True)
url = a.get('href')
if url.startswith('/'):
url = 'http://www.economist.com' + url
soup = self.index_to_soup(url)
div = soup.find('div', attrs={'class':'cover-content'})
img = div.find('img', src=True)
return img.get('src')
def parse_index(self):
return self.economist_parse_index()

View File

@ -22,8 +22,6 @@ class Economist(BasicNewsRecipe):
' perspective. Best downloaded on Friday mornings (GMT)')
extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
oldest_article = 7.0
cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg'
#cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
dict(attrs={'class':['dblClkTrk', 'ec-article-info',
@ -40,6 +38,18 @@ class Economist(BasicNewsRecipe):
# downloaded with connection reset by peer (104) errors.
delay = 1
def get_cover_url(self):
soup = self.index_to_soup('http://www.economist.com/printedition/covers')
div = soup.find('div', attrs={'class':lambda x: x and
'print-cover-links' in x})
a = div.find('a', href=True)
url = a.get('href')
if url.startswith('/'):
url = 'http://www.economist.com' + url
soup = self.index_to_soup(url)
div = soup.find('div', attrs={'class':'cover-content'})
img = div.find('img', src=True)
return img.get('src')
def parse_index(self):
try:

View File

@ -0,0 +1,50 @@
__license__ = 'GPL v3'
__copyright__ = '2012 Levien van Zon <levien@zonnetjes.net>'
'''
Fetch Edge.org conversations
'''
from calibre.web.feeds.news import BasicNewsRecipe
class EdgeConversationRSS(BasicNewsRecipe):
title = u'Edge.org Conversations'
__author__ = 'levien'
language = 'en'
description = '''Edge.org offers "open-minded, free ranging, intellectually
playful ... an unadorned pleasure in curiosity, a collective expression of
wonder at the living and inanimate world ... an ongoing and thrilling
colloquium.'''
oldest_article = 60
max_articles_per_feed = 100
no_stylesheets = True
keep_only_tags = [dict(name='div', attrs={'class':'HomeLeftPannel IMGCTRL'}) ]
remove_tags = [
dict(name='div',attrs={'class':'Logo'})
]
feeds = [(u'Edge RSS', u'http://edge.org/feeds/')]
def print_version(self, url):
return url.replace('conversation/', 'conversation.php?cid=')
def parse_feeds(self):
# Call parent's method.
feeds = BasicNewsRecipe.parse_feeds(self)
# Loop through all feeds.
for feed in feeds:
# Loop through all articles in feed.
for article in feed.articles[:]:
# Remove anything that is not a conversation, and remove PDF files as well...
if 'CONVERSATION' not in article.title:
feed.articles.remove(article)
elif 'pdf' in article.url:
feed.articles.remove(article)
return feeds
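parse_feeds above keeps only conversation articles and drops PDFs. The pruning idiom in isolation, with the title and URL tests collapsed onto one made-up list for brevity:

articles = [u'A CONVERSATION With Daniel Kahneman', u'Annual Question', u'CONVERSATION notes.pdf']
kept = [a for a in articles if 'CONVERSATION' in a and 'pdf' not in a]
print kept  # -> [u'A CONVERSATION With Daniel Kahneman']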

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@ -6,45 +7,72 @@ __license__ = 'GPL v3'
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Edmonton Journal
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
title = u'Edmonton Journal'
url_prefix = 'http://www.edmontonjournal.com'
description = u'News from Edmonton, AB'
fp_tag = 'CAN_EJ'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
@ -68,14 +96,80 @@ class CanWestPaper(BasicNewsRecipe):
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
#delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def get_cover_url(self):
from datetime import timedelta, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","‘",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","’",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","–",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","—",fixed)
fixed = re.sub("&#x2019;","’",fixed)
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&amp;' with '&'
massaged = re.sub("&amp;","&", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):
if first:
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
def preprocess_html(self, soup):
return self.strip_anchors(soup)
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

View File

@ -0,0 +1,58 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
class Ekathimerini(BasicNewsRecipe):
title = 'ekathimerini'
__author__ = 'Thomas Scholl'
description = 'News from Greece, English edition'
masthead_url = 'http://wwk.kathimerini.gr/webadmin/EnglishNew/gifs/logo.gif'
max_articles_per_feed = 100
oldest_article = 100
publisher = 'Kathimerini'
category = 'news, GR'
language = 'en_GR'
encoding = 'windows-1253'
conversion_options = { 'linearize_tables': True}
no_stylesheets = True
delay = 1
keep_only_tags = [dict(name='td', attrs={'class':'news'})]
rss_url = 'http://ws.kathimerini.gr/xml_files/latestnews.xml'
def find_articles(self, idx, category):
for article in idx.findAll('item'):
cat = u''
cat_elem = article.find('subcat')
if cat_elem:
cat = self.tag_to_string(cat_elem)
if cat == category:
desc_html = self.tag_to_string(article.find('description'))
description = self.tag_to_string(BeautifulSoup(desc_html))
a = {
'title': self.tag_to_string(article.find('title')),
'url': self.tag_to_string(article.find('link')),
'description': description,
'date' : self.tag_to_string(article.find('pubdate')),
}
yield a
def parse_index(self):
idx_contents = self.browser.open(self.rss_url).read()
idx = BeautifulStoneSoup(idx_contents, convertEntities=BeautifulStoneSoup.XML_ENTITIES)
cats = list(set([self.tag_to_string(subcat) for subcat in idx.findAll('subcat')]))
cats.sort()
feeds = [(u'News',list(self.find_articles(idx, u'')))]
for cat in cats:
feeds.append((cat.capitalize(), list(self.find_articles(idx, cat))))
return feeds
def print_version(self, url):
return url.replace('http://www.ekathimerini.com/4dcgi/', 'http://www.ekathimerini.com/4Dcgi/4dcgi/')
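parse_index above fans a single RSS file out into per-category feeds keyed on each item's subcat element. The grouping step alone, over made-up data:

items = [(u'Greek politics', u''), (u'Bank merger', u'business'), (u'Bond yields', u'business')]
cats = sorted(set(cat for _, cat in items if cat))
feeds = [(u'News', [t for t, cat in items if cat == u''])]
for c in cats:
    feeds.append((c.capitalize(), [t for t, cat in items if cat == c]))
print feeds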

View File

@ -33,7 +33,7 @@ class ElPais(BasicNewsRecipe):
remove_javascript = True
no_stylesheets = True
keep_only_tags = [ dict(name='div', attrs={'class':['cabecera_noticia_reportaje estirar','cabecera_noticia_opinion estirar','cabecera_noticia estirar','contenido_noticia','caja_despiece']})]
keep_only_tags = [ dict(name='div', attrs={'class':['cabecera_noticia_reportaje estirar','cabecera_noticia_opinion estirar','cabecera_noticia estirar','contenido_noticia','cuerpo_noticia','caja_despiece']})]
extra_css = ' p{text-align: justify; font-size: 100%} body{ text-align: left; font-family: serif; font-size: 100% } h1{ font-family: sans-serif; font-size:200%; font-weight: bolder; text-align: justify; } h2{ font-family: sans-serif; font-size:150%; font-weight: 500; text-align: justify } h3{ font-family: sans-serif; font-size:125%; font-weight: 500; text-align: justify } img{margin-bottom: 0.4em} '

View File

@ -5,12 +5,11 @@ __license__ = 'GPL v3'
__copyright__ = '04 December 2010, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
__version__ = 'v0.07'
__date__ = '06, February 2011'
__version__ = 'v0.08'
__date__ = '13, November 2011'
'''
elperiodicodearagon.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
@ -20,13 +19,13 @@ class elperiodicodearagon(BasicNewsRecipe):
description = u'Noticias desde Aragon'
publisher = u'elperiodicodearagon.com'
category = u'news, politics, Spain, Aragon'
oldest_article = 2
oldest_article = 1
delay = 0
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
language = 'es'
encoding = 'utf8'
encoding = 'iso-8859-1'
remove_empty_feeds = True
remove_javascript = True
@ -39,61 +38,30 @@ class elperiodicodearagon(BasicNewsRecipe):
}
feeds = [
(u'Arag\xf3n', u'http://elperiodicodearagon.com/RSS/2.xml'),
(u'Internacional', u'http://elperiodicodearagon.com/RSS/4.xml'),
(u'Espa\xf1a', u'http://elperiodicodearagon.com/RSS/3.xml'),
(u'Econom\xeda', u'http://elperiodicodearagon.com/RSS/5.xml'),
(u'Deportes', u'http://elperiodicodearagon.com/RSS/7.xml'),
(u'Real Zaragoza', u'http://elperiodicodearagon.com/RSS/10.xml'),
(u'Opini\xf3n', u'http://elperiodicodearagon.com/RSS/103.xml'),
(u'Escenarios', u'http://elperiodicodearagon.com/RSS/105.xml'),
(u'Sociedad', u'http://elperiodicodearagon.com/RSS/104.xml'),
(u'Gente', u'http://elperiodicodearagon.com/RSS/330.xml')
(u'Portada', u'http://zetaestaticos.com/aragon/rss/portada_es.xml'),
(u'Arag\xf3n', u'http://zetaestaticos.com/aragon/rss/2_es.xml'),
(u'Internacional', u'http://zetaestaticos.com/aragon/rss/4_es.xml'),
(u'Espa\xf1a', u'http://zetaestaticos.com/aragon/rss/3_es.xml'),
(u'Econom\xeda', u'http://zetaestaticos.com/aragon/rss/5_es.xml'),
(u'Deportes', u'http://zetaestaticos.com/aragon/rss/7_es.xml'),
(u'Real Zaragoza', u'http://zetaestaticos.com/aragon/rss/10_es.xml'),
(u'CAI Zaragoza', u'http://zetaestaticos.com/aragon/rss/91_es.xml'),
(u'Monta\xf1ismo', u'http://zetaestaticos.com/aragon/rss/354_es.xml'),
(u'Opini\xf3n', u'http://zetaestaticos.com/aragon/rss/103_es.xml'),
(u'Tema del d\xeda', u'http://zetaestaticos.com/aragon/rss/102_es.xml'),
(u'Escenarios', u'http://zetaestaticos.com/aragon/rss/105_es.xml'),
(u'Sociedad', u'http://zetaestaticos.com/aragon/rss/104_es.xml'),
(u'Gente', u'http://zetaestaticos.com/aragon/rss/330_es.xml'),
(u'Espacio 3', u'http://zetaestaticos.com/aragon/rss/328_es.xml'),
(u'Fiestas del Pilar', u'http://zetaestaticos.com/aragon/rss/107_es.xml')
]
extra_css = '''
h3 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
h2 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:18px;}
h4 {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:20px;}
.columnaDeRecursosRelacionados {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
img{margin-bottom: 0.4em}
'''
remove_attributes = ['height','width']
keep_only_tags = [dict(name='div', attrs={'id':'contenidos'})]
keep_only_tags = [dict(name='div', attrs={'id':'Noticia'})]
# Strip out all the clutter
remove_tags = [dict(name='ul', attrs={'class':'herramientasDeNoticia'}),
dict(name='span', attrs={'class':'MasInformacion '}),
dict(name='span', attrs={'class':'MasInformacion'}),
dict(name='div', attrs={'class':'Middle'}),
dict(name='div', attrs={'class':'MenuCabeceraRZaragoza'}),
dict(name='div', attrs={'id':'MenuCabeceraRZaragoza'}),
dict(name='div', attrs={'class':'MenuEquipo'}),
dict(name='div', attrs={'class':'TemasRelacionados'}),
dict(name='div', attrs={'class':'GaleriaEnNoticia'}),
dict(name='div', attrs={'class':'Recorte'}),
dict(name='div', attrs={'id':'NoticiasenRecursos'}),
dict(name='div', attrs={'id':'NoticiaEnPapel'}),
dict(name='p', attrs={'class':'RecorteEnNoticias'}),
dict(name='div', attrs={'id':'Comparte'}),
dict(name='div', attrs={'id':'CajaComparte'}),
dict(name='a', attrs={'class':'EscribirComentario'}),
dict(name='a', attrs={'class':'AvisoComentario'}),
dict(name='div', attrs={'class':'CajaAvisoComentario'}),
dict(name='div', attrs={'class':'navegaNoticias'}),
dict(name='div', attrs={'class':'Mensaje'}),
dict(name='div', attrs={'id':'PaginadorDiCom'}),
dict(name='div', attrs={'id':'CajaAccesoCuentaUsuario'}),
dict(name='div', attrs={'id':'CintilloComentario'}),
dict(name='div', attrs={'id':'EscribeComentario'}),
dict(name='div', attrs={'id':'FormularioComentario'}),
dict(name='div', attrs={'id':'FormularioNormas'})]
# Fetch the print front page (the format=1 image has a higher resolution)
def get_cover_url(self):
@ -104,23 +72,7 @@ class elperiodicodearagon(BasicNewsRecipe):
return image['src'].replace('format=2', 'format=1')
return None
# Remove the blank space between the article and the comments (lines 1 and 2)
# The index did not point correctly to the start of the article (line 3)
# We use the mobile version
preprocess_regexps = [
(re.compile(r'<p>&nbsp;</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'<p id="">', re.DOTALL|re.IGNORECASE), lambda match: '<p>')
]
# Replace the embedded YouTube video with an image
def preprocess_html(self, soup):
for video_yt in soup.findAll('iframe',{'title':'YouTube video player'}):
if video_yt:
video_yt.name = 'img'
fuente = video_yt['src']
fuente2 = fuente.replace('http://www.youtube.com/embed/','http://img.youtube.com/vi/')
video_yt['src'] = fuente2 + '/0.jpg'
return soup
def print_version(self, url):
return url.replace('http://www.elperiodicodearagon.com/', 'http://www.elperiodicodearagon.com/m/')
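The preprocess_html above swaps each embedded YouTube player for its poster image. The URL rewrite on one made-up embed id:

src = 'http://www.youtube.com/embed/abc123'
thumb = src.replace('http://www.youtube.com/embed/', 'http://img.youtube.com/vi/') + '/0.jpg'
print thumb  # -> http://img.youtube.com/vi/abc123/0.jpg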

View File

@ -0,0 +1,58 @@
################################################################################
#Description: http://es.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2012.01.20. - V1.2
################################################################################
from calibre.web.feeds.recipes import BasicNewsRecipe
class elet_es_irodalom(BasicNewsRecipe):
title = u'\u00c9let \u00e9s Irodalom'
__author__ = 'Bigpapa'
oldest_article = 7
max_articles_per_feed = 30 # Maximum number of articles per feed stored in the generated e-book.
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'iso-8859-2'
category = 'Cikkek'
language = 'hu'
publication_type = 'newsportal'
extra_css = '.doc_title { font: bold 30px } .doc_author {font: bold 14px} '
needs_subscription = 'optional'
masthead_url = 'http://www.es.hu/images/logo.jpg'
timefmt = ' [%Y %b %d, %a]'
# Do not write your login details into the code here; you supply them when you schedule the download!
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.es.hu/')
br.select_form(name='userfrmlogin')
br['cusername'] = self.username
br['cpassword'] = self.password
br.submit()
return br
keep_only_tags = [
dict(name='div', attrs={'class':['doc_author', 'doc_title', 'doc']})
]
remove_tags = [
dict(name='a', attrs={'target':['_TOP']}),
dict(name='div', attrs={'style':['float: right; margin-left: 5px; margin-bottom: 5px;', 'float: right; margin-left: 5px; margin-bottom: 5px;']}),
]
feeds = [
(u'Publicisztika', 'http://www.feed43.com/4684235031168504.xml'),
(u'Interj\xfa', 'http://www.feed43.com/4032465460040618.xml'),
(u'Visszhang', 'http://www.feed43.com/3727375706873086.xml'),
(u'P\xe1ratlan oldal', 'http://www.feed43.com/2525784782475057.xml'),
(u'Feuilleton', 'http://www.feed43.com/7216025082703073.xml'),
(u'Pr\xf3za', 'http://www.feed43.com/8760248802326384.xml'),
(u'Vers', 'http://www.feed43.com/1737324675134275.xml'),
(u'K\xf6nyvkritika', 'http://www.feed43.com/1281156550717082.xml'),
(u'M\u0171b\xedr\xe1lat', 'http://www.feed43.com/1851854623681044.xml')
]

View File

@ -4,7 +4,8 @@ __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
elmundo.es
'''
import re
import time
from calibre.web.feeds.news import BasicNewsRecipe
class ElMundo(BasicNewsRecipe):
@ -18,12 +19,15 @@ class ElMundo(BasicNewsRecipe):
no_stylesheets = True
use_embedded_content = False
encoding = 'iso8859_15'
remove_javascript = True
remove_empty_feeds = True
language = 'es'
masthead_url = 'http://estaticos03.elmundo.es/elmundo/iconos/v4.x/v4.01/bg_h1.png'
publication_type = 'newspaper'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif}
.metadata_noticia{font-size: small}
.pestana_GDP{font-size: small; font-weight:bold}
h1,h2,h3,h4,h5,h6,.subtitulo {color: #3F5974}
.hora{color: red}
.update{color: gray}
@ -41,22 +45,43 @@ class ElMundo(BasicNewsRecipe):
remove_tags_after = dict(name='div' , attrs={'id':['desarrollo_noticia','tamano']})
remove_attributes = ['lang','border']
remove_tags = [
dict(name='div', attrs={'class':['herramientas','publicidad_google']})
,dict(name='div', attrs={'id':'modulo_multimedia' })
dict(name='div', attrs={'class':['herramientas','publicidad_google','comenta','col col-2b','apoyos','no-te-pierdas']})
,dict(name='div', attrs={'class':['publicidad publicidad_cuerpo_noticia','comentarios_nav','mensaje_privado','interact']})
,dict(name='div', attrs={'class':['num_comentarios estirar']})
,dict(name='span', attrs={'class':['links_comentar']})
,dict(name='div', attrs={'id':['comentar']})
,dict(name='ul', attrs={'class':'herramientas' })
,dict(name=['object','link','embed','iframe','base','meta'])
]
feeds = [
(u'Portada' , u'http://estaticos.elmundo.es/elmundo/rss/portada.xml' )
(u'Portada' , u'http://estaticos.elmundo.es/elmundo/rss/portada.xml' )
,(u'Deportes' , u'http://estaticos.elmundo.es/elmundodeporte/rss/portada.xml')
,(u'Economia' , u'http://estaticos.elmundo.es/elmundo/rss/economia.xml' )
,(u'Espana' , u'http://estaticos.elmundo.es/elmundo/rss/espana.xml' )
,(u'Econom\xeda' , u'http://estaticos.elmundo.es/elmundo/rss/economia.xml' )
,(u'Espa\xf1a' , u'http://estaticos.elmundo.es/elmundo/rss/espana.xml' )
,(u'Internacional' , u'http://estaticos.elmundo.es/elmundo/rss/internacional.xml' )
,(u'Cultura' , u'http://estaticos.elmundo.es/elmundo/rss/cultura.xml' )
,(u'Ciencia/Ecologia', u'http://estaticos.elmundo.es/elmundo/rss/ciencia.xml' )
,(u'Comunicacion' , u'http://estaticos.elmundo.es/elmundo/rss/comunicacion.xml' )
,(u'Television' , u'http://estaticos.elmundo.es/elmundo/rss/television.xml' )
,(u'Ciencia/Ecolog\xeda', u'http://estaticos.elmundo.es/elmundo/rss/ciencia.xml' )
,(u'Comunicaci\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/comunicacion.xml' )
,(u'Televisi\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/television.xml' )
,(u'Salud' , u'http://estaticos.elmundo.es/elmundosalud/rss/portada.xml' )
,(u'Solidaridad' , u'http://estaticos.elmundo.es/elmundo/rss/solidaridad.xml' )
,(u'Su vivienda' , u'http://estaticos.elmundo.es/elmundo/rss/suvivienda.xml' )
,(u'Motor' , u'http://estaticos.elmundo.es/elmundomotor/rss/portada.xml' )
,(u'Madrid' , u'http://estaticos.elmundo.es/elmundo/rss/madrid.xml' )
,(u'Barcelona' , u'http://estaticos.elmundo.es/elmundo/rss/barcelona.xml' )
,(u'Pa\xeds Vasco' , u'http://estaticos.elmundo.es/elmundo/rss/paisvasco.xml' )
,(u'Baleares' , u'http://estaticos.elmundo.es/elmundo/rss/baleares.xml' )
,(u'Castilla y Le\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/castillayleon.xml' )
,(u'Valladolid' , u'http://estaticos.elmundo.es/elmundo/rss/valladolid.xml' )
,(u'Valencia' , u'http://estaticos.elmundo.es/elmundo/rss/valencia.xml' )
,(u'Alicante' , u'http://estaticos.elmundo.es/elmundo/rss/alicante.xml' )
,(u'Castell\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/castellon.xml' )
,(u'Andaluc\xeda' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia.xml' )
,(u'Sevilla' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia_sevilla.xml' )
,(u'M\xe1laga' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia_malaga.xml' )
]
def preprocess_html(self, soup):
@@ -67,3 +92,34 @@ class ElMundo(BasicNewsRecipe):
def get_article_url(self, article):
return article.get('guid', None)
preprocess_regexps = [
# Show the preview image for embedded videos
(re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
(re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
(re.compile(r'var video=', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),
# Suppress the comment numbering: 1, 2, 3 ...
(re.compile(r'<ol>\n<li style="z-index:', re.DOTALL|re.IGNORECASE), lambda match: '<ul><li style="z-index:'),
(re.compile(r'</ol>\n<div class="num_comentarios estirar">', re.DOTALL|re.IGNORECASE), lambda match: '</ul><div class="num_comentarios estirar">'),
]
# Fetch the cover image
def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
#http://img.kiosko.net/2011/11/19/es/elmundo.750.jpg
cover='http://img.kiosko.net/'+ year + '/' + month + '/' + day +'/es/elmundo.750.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
self.log("\nPortada no disponible")
cover ='http://estaticos03.elmundo.es/elmundo/iconos/v4.x/v4.01/bg_h1.png'
return cover
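Since the recipe already imports time, the dated kiosko.net lookup above can be written more compactly with time.strftime, which zero-pads %m and %d just like the "%.2d" formatting; a minimal sketch of the same logic (same URLs as the recipe, not a tested drop-in):

    def get_cover_url(self):
        cover = time.strftime('http://img.kiosko.net/%Y/%m/%d/es/elmundo.750.jpg')
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except:
            self.log("\nPortada no disponible")
            cover = 'http://estaticos03.elmundo.es/elmundo/iconos/v4.x/v4.01/bg_h1.png'
        return cover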

View File

@@ -56,6 +56,7 @@ class ElUniversal(BasicNewsRecipe):
]
def print_version(self, url):
rp,sep,rest = url.rpartition('/')
return rp + sep + 'imp_' + rest
return url + '-imp'
def get_article_url(self, article):
return article.get('guid', None)
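The effect of the change on a hypothetical article URL (invented for illustration):

    url = 'http://www.eluniversal.com/2012/02/27/some-article'
    # old: rp + sep + 'imp_' + rest -> 'http://www.eluniversal.com/2012/02/27/imp_some-article'
    # new: url + '-imp'             -> 'http://www.eluniversal.com/2012/02/27/some-article-imp'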

16
recipes/emuzica_pl.recipe Normal file
View File

@@ -0,0 +1,16 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
class eMuzyka(BasicNewsRecipe):
title = u'eMuzyka'
__author__ = 'fenuks'
description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
category = 'music'
language = 'pl'
cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
remove_tags=[dict(name='span', attrs={'id':'date'})]
feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]

View File

@@ -20,7 +20,7 @@ class ESPN(BasicNewsRecipe):
use_embedded_content = False
remove_javascript = True
needs_subscription = True
needs_subscription = 'optional'
encoding= 'ISO-8859-1'
remove_tags_before = dict(name='font', attrs={'class':'date'})
@@ -75,32 +75,30 @@ class ESPN(BasicNewsRecipe):
return soup
def get_browser(self):
br = BasicNewsRecipe.get_browser()
br.set_handle_refresh(False)
url = ('https://r.espn.go.com/members/v3_1/login')
raw = br.open(url).read()
raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw)
with TemporaryFile(suffix='.htm') as fname:
with open(fname, 'wb') as f:
f.write(raw)
br.open_local_file(fname)
if self.username and self.password:
br.set_handle_refresh(False)
url = ('https://r.espn.go.com/members/v3_1/login')
raw = br.open(url).read()
raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw)
with TemporaryFile(suffix='.htm') as fname:
with open(fname, 'wb') as f:
f.write(raw)
br.open_local_file(fname)
br.form = br.forms().next()
br.form.find_control(name='username', type='text').value = self.username
br.form['password'] = self.password
br.submit().read()
br.open('http://espn.go.com').read()
br.set_handle_refresh(True)
br.form = br.forms().next()
br.form.find_control(name='username', type='text').value = self.username
br.form['password'] = self.password
br.submit().read()
br.open('http://espn.go.com').read()
br.set_handle_refresh(True)
return br
def get_article_url(self, article):
return article.get('guid', None)
def print_version(self, url):
if 'eticket' in url:
return url.partition('&')[0].replace('story?', 'print?')
match = re.search(r'story\?(id=\d+)', url)
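Switching needs_subscription to 'optional' tells calibre that credentials may be left blank, so the recipe itself must now guard the login steps; reduced to a skeleton, the pattern in the hunk above is roughly:

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username and self.password:
            # run the form-based login only when credentials were supplied
            ...
        return br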

View File

@@ -1,35 +1,43 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Gerardo Diez'
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
__docformat__ = 'restructuredtext en'
__license__ = 'GPL v3'
__copyright__ = '5, January 2011 Gerardo Diez<gerardo.diez.garcia@gmail.com> & desUBIKado'
__author__ = 'desUBIKado, based on an earlier version by Gerardo Diez'
__version__ = 'v1.01'
__date__ = '13, November 2011'
'''
expansion.es
http://www.expansion.com/
'''
import time
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class Publico(BasicNewsRecipe):
title =u'Expansion.com'
__author__ ='Gerardo Diez'
publisher =u'Unidad Editorial Información Económica, S.L.'
category ='finances, catalunya'
oldest_article =1
class expansion_spanish(BasicNewsRecipe):
__author__ ='Gerardo Diez & desUBIKado'
description ='Financial news from Spain'
title =u'Expansion'
publisher =u'Unidad Editorial Internet, S.L.'
category ='news, finances, Spain'
oldest_article = 2
simultaneous_downloads = 10
max_articles_per_feed =100
simultaneous_downloads =10
cover_url =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png'
timefmt ='[%A, %d %B, %Y]'
encoding ='latin'
timefmt = '[%a, %d %b, %Y]'
encoding ='iso-8859-15'
language ='es'
remove_javascript =True
no_stylesheets =True
use_embedded_content = False
remove_javascript = True
no_stylesheets = True
remove_empty_feeds = True
keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']})
remove_tags =[
dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}),
dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}),
dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto','tit_relacionadas','interact','paginacion estirar','sumario derecha']}),
dict(name='ul', attrs={'class':['bolos_desarrollo_noticia','not_logged']}),
dict(name='span', attrs={'class':['comentarios']}),
dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
dict(name='div', attrs={'id':['comentarios_lectores_listado']})
dict(name='div', attrs={'id':['comentarios_lectores_listado','comentar']})
]
feeds =[
(u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),
@@ -38,42 +46,112 @@ class Publico(BasicNewsRecipe):
(u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
(u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
(u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),
(u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
(u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
(u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
(u'Cr\xe9ditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
(u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
(u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
(u'Fondos de Inversi\xf3n', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
(u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),
(u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
(u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
(u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
(u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
(u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
(u'Energ\xeda', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
(u'Inmobiliario y Construcci\xf3n', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
(u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
(u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
(u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
(u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'),
(u'Automoci\xf3n e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
(u'Distribuci\xf3n', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
(u'Deporte y Negocio', u'http://estaticos.expansion.com/rss/empresasdeporte.xml'),
(u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
(u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
(u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),
(u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
(u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
(u'Portada: Econom\xeda y Pol\xedtica', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
(u'Pol\xedtica', u'http://estaticos.expansion.com/rss/economia.xml'),
(u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),
(u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
(u'Portada: Opini\xf3n', u'http://estaticos.expansion.com/rss/opinion.xml'),
(u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
(u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),
(u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
(u'Portada: Jur\xeddico', u'http://estaticos.expansion.com/rss/juridico.xml'),
(u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
(u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
(u'Opini\xf3n', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
(u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),
(u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
(u'Catalu&ntilde;a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
(u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
(u'Catalu\xf1a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
(u'Funci\xf3n p\xfablica', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
]
# Fetch the cover image
def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
#http://img5.kiosko.net/2011/11/14/es/expansion.750.jpg
cover='http://img5.kiosko.net/'+ year + '/' + month + '/' + day +'/es/expansion.750.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
self.log("\nPortada no disponible")
cover ='http://www.aproahp.org/enlaces/images/diario_expansion.gif'
return cover
# To keep the ad interstitial from appearing when an article is fetched, and to always
# get the real web page, send the current Unix ("epoch") time in the "t" variable,
# making the site believe the ad has just been viewed
def print_version(self, url):
st = time.time()
segundos = str(int(st))
parametros = '.html?t=' + segundos
return url.replace('.html', parametros)
_processed_links = []
def get_article_url(self, article):
# Recover the original article URL from the "feedsportal" one
link = article.get('link', None)
if link is None:
return article
if link.split('/')[-1]=="story01.htm":
link=link.split('/')[-2]
a=['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A']
b=['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.com','www.','0']
for i in range(0,len(a)):
link=link.replace(a[i],b[i])
link="http://"+link
# Drop articles duplicated across other feeds
if not (link in self._processed_links):
self._processed_links.append(link)
else:
link = None
return link
# A little CSS to improve the presentation of the articles
extra_css = '''
.entradilla {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-style:italic; font-size:16px;}
.fecha_publicacion,.autor {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
'''
# Show the preview image for embedded videos
preprocess_regexps = [
(re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
(re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
(re.compile(r'var id_reproductor', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),
]
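A worked example of the feedsportal decoding in get_article_url above, using an invented encoded fragment (real story01.htm links carry extra path segments, which the split('/') steps discard first):

    encoded = '0L0Sexpansion0N0Cmercados0C10Bhtml'
    a = ['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A']
    b = ['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.com','www.','0']
    for i in range(0, len(a)):
        encoded = encoded.replace(a[i], b[i])
    print('http://' + encoded)  # http://www.expansion.com/mercados/1.html

The replacements run in list order, with '0A' -> '0' applied last.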

View File

@@ -19,45 +19,20 @@ class FazNet(BasicNewsRecipe):
no_stylesheets = True
encoding = 'utf-8'
remove_javascript = True
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'Article'})]
remove_tags = [
dict(name=['object','link','embed','base'])
,dict(name='div',
attrs={'class':['LinkBoxModulSmall','ModulVerlagsInfo',
'ArtikelServices', 'ModulLesermeinungenFooter',
'ModulArtikelServices', 'BoxTool Aufklappen_Grau',
'SocialMediaUnten', ]}),
dict(id=['KurzLinkMenu', 'ArtikelServicesMenu']),
]
keep_only_tags = [{'class':'FAZArtikelEinleitung'},
{'id':'ArtikelTabContent_0'}]
feeds = [
('FAZ.NET Aktuell', 'http://www.faz.net/s/RubF3CE08B362D244869BE7984590CB6AC1/Tpl~Epartner~SRss_.xml'),
('Politik', 'http://www.faz.net/s/RubA24ECD630CAE40E483841DB7D16F4211/Tpl~Epartner~SRss_.xml'),
('Wirtschaft', 'http://www.faz.net/s/RubC9401175958F4DE28E143E68888825F6/Tpl~Epartner~SRss_.xml'),
('Feuilleton', 'http://www.faz.net/s/RubCC21B04EE95145B3AC877C874FB1B611/Tpl~Epartner~SRss_.xml'),
('Sport', 'http://www.faz.net/s/Rub9F27A221597D4C39A82856B0FE79F051/Tpl~Epartner~SRss_.xml'),
('Gesellschaft', 'http://www.faz.net/s/Rub02DBAA63F9EB43CEB421272A670A685C/Tpl~Epartner~SRss_.xml'),
('Finanzen', 'http://www.faz.net/s/Rub4B891837ECD14082816D9E088A2D7CB4/Tpl~Epartner~SRss_.xml'),
('Wissen', 'http://www.faz.net/s/Rub7F4BEE0E0C39429A8565089709B70C44/Tpl~Epartner~SRss_.xml'),
('Reise', 'http://www.faz.net/s/RubE2FB5CA667054BDEA70FB3BC45F8D91C/Tpl~Epartner~SRss_.xml'),
('Technik & Motor', 'http://www.faz.net/s/Rub01E4D53776494844A85FDF23F5707AD8/Tpl~Epartner~SRss_.xml'),
('Beruf & Chance', 'http://www.faz.net/s/RubB1E10A8367E8446897468EDAA6EA0504/Tpl~Epartner~SRss_.xml')
('FAZ.NET Aktuell', 'http://www.faz.net/aktuell/?rssview=1'),
('Politik', 'http://www.faz.net/aktuell/politik/?rssview=1'),
('Wirtschaft', 'http://www.faz.net/aktuell/wirtschaft/?rssview=1'),
('Feuilleton', 'http://www.faz.net/aktuell/feuilleton/?rssview=1'),
('Sport', 'http://www.faz.net/aktuell/sport/?rssview=1'),
('Gesellschaft', 'http://www.faz.net/aktuell/gesellschaft/?rssview=1'),
('Finanzen', 'http://www.faz.net/aktuell/finanzen/?rssview=1'),
('Technik & Motor', 'http://www.faz.net/aktuell/technik-motor/?rssview=1'),
('Wissen', 'http://www.faz.net/aktuell/wissen/?rssview=1'),
('Reise', 'http://www.faz.net/aktuell/reise/?rssview=1'),
('Beruf & Chance', 'http://www.faz.net/aktuell/beruf-chance/?rssview=1')
]
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
soup.head.insert(0,mtag)
del soup.body['onload']
for item in soup.findAll(style=True):
del item['style']
return soup

34
recipes/fhm_uk.recipe Normal file
View File

@@ -0,0 +1,34 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'FHM UK'
description = 'Good News for Men'
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
__author__ = 'Dave Asbury'
# last updated 27/1/12
language = 'en_GB'
oldest_article = 28
max_articles_per_feed = 12
remove_empty_feeds = True
no_stylesheets = True
#auto_cleanup = True
#articles_are_obfuscated = True
keep_only_tags = [
dict(name='h1'),
dict(name='img',attrs={'id' : 'ctl00_Body_imgMainImage'}),
dict(name='div',attrs={'id' : ['articleLeft']}),
dict(name='div',attrs={'class' : ['imagesCenterArticle','containerCenterArticle','articleBody']}),
]
#remove_tags = [
#dict(attrs={'class' : ['player']}),
#]
feeds = [
(u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
(u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
(u'Gaming',u'http://feed43.com/0755006465351035.xml'),
]

View File

@@ -10,7 +10,8 @@ class Filmweb_pl(BasicNewsRecipe):
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
extra_css = '.hdrBig {font-size:22px;}'
remove_empty_feeds=True
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})]
keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})]
feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'),

18
recipes/fisco_oggi.recipe Normal file
View File

@@ -0,0 +1,18 @@
__license__ = 'GPL v3'
__author__ = 'faber1971'
description = 'Website of the Italian Government Income Agency (about revenue, taxation, taxes) - v1.00 (17, December 2011)'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1324112023(BasicNewsRecipe):
title = u'Fisco Oggi'
language = 'it'
__author__ = 'faber1971'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
remove_javascript = True
no_stylesheets = True
feeds = [(u'Attualit\xe0', u'http://www.fiscooggi.it/taxonomy/term/1/feed'), (u'Normativa', u'http://www.fiscooggi.it/taxonomy/term/5/feed'), (u'Giurisprudenza', u'http://www.fiscooggi.it/taxonomy/term/8/feed'), (u'Dati e statistiche', u'http://www.fiscooggi.it/taxonomy/term/12/feed'), (u'Analisi e commenti', u'http://www.fiscooggi.it/taxonomy/term/13/feed'), (u'Bilancio e contabilit\xe0', u'http://www.fiscooggi.it/taxonomy/term/576/feed'), (u'Dalle regioni', u'http://www.fiscooggi.it/taxonomy/term/16/feed'), (u'Dal mondo', u'http://www.fiscooggi.it/taxonomy/term/17/feed')]

View File

@@ -1,57 +1,68 @@
# -*- coding: utf-8 -*-
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Focus_pl(BasicNewsRecipe):
title = u'Focus.pl'
oldest_article = 15
max_articles_per_feed = 100
__author__ = 'fenuks'
language = 'pl'
description ='polish scientific monthly magazine'
class FocusRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = u'intromatyk <intromatyk@gmail.com>'
language = 'pl'
version = 1
title = u'Focus'
publisher = u'Gruner + Jahr Polska'
category = u'News'
description = u'Newspaper'
category='magazine'
cover_url=''
remove_empty_feeds= True
no_stylesheets=True
remove_tags_before=dict(name='div', attrs={'class':'h2 h2f'})
remove_tags_after=dict(name='div', attrs={'class':'clear'})
feeds = [(u'Wszystkie kategorie', u'http://focus.pl.feedsportal.com/c/32992/f/532692/index.rss'),
(u'Nauka', u'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
(u'Historia', u'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
(u'Cywilizacja', u'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
(u'Sport', u'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
(u'Technika', u'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
(u'Przyroda', u'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
(u'Technologie', u'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
(u'Warto wiedzieć', u'http://focus.pl.feedsportal.com/c/32992/f/532700/index.rss'),
oldest_article = 7
max_articles_per_feed = 100000
recursions = 0
no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
# Seems to work best, but YMMV
simultaneous_downloads = 5
r = re.compile('.*(?P<url>http:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')
keep_only_tags =[]
keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'cll'}))
remove_tags =[]
remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulm noprint'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'txb'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'h2'}))
remove_tags.append(dict(name = 'ul', attrs = {'class' : 'txu'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulc'}))
extra_css = '''
body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
h1{text-align: left;}
h2{font-size: medium; font-weight: bold;}
p.lead {font-weight: bold; text-align: left;}
.authordate {font-size: small; color: #696969;}
.fot{font-size: x-small; color: #666666;}
'''
]
feeds = [
('Nauka', 'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
('Historia', 'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
('Cywilizacja', 'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
('Sport', 'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
('Technika', 'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
('Przyroda', 'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
('Technologie', 'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
]
def skip_ad_pages(self, soup):
tag=soup.find(name='a')
if tag:
new_soup=self.index_to_soup(tag['href']+ 'do-druku/1/', raw=True)
return new_soup
def append_page(self, appendtag):
tag=appendtag.find(name='div', attrs={'class':'arrows'})
if tag:
nexturl='http://www.focus.pl/'+tag.a['href']
for rem in appendtag.findAll(name='div', attrs={'class':'klik-nav'}):
rem.extract()
while nexturl:
soup2=self.index_to_soup(nexturl)
nexturl=None
pagetext=soup2.find(name='div', attrs={'class':'txt'})
tag=pagetext.find(name='div', attrs={'class':'arrows'})
for r in tag.findAll(name='a'):
if u'Następne' in r.string:
nexturl='http://www.focus.pl/'+r['href']
for rem in pagetext.findAll(name='div', attrs={'class':'klik-nav'}):
rem.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
if ('advertisement' in soup.find('title').string.lower()):
href = soup.find('a').get('href')
return self.index_to_soup(href, raw=True)
else:
return None
def get_cover_url(self):
soup=self.index_to_soup('http://www.focus.pl/magazyn/')
@@ -60,7 +71,14 @@ class Focus_pl(BasicNewsRecipe):
self.cover_url='http://www.focus.pl/' + tag.a['href']
return getattr(self, 'cover_url', self.cover_url)
def preprocess_html(self, soup):
self.append_page(soup.body)
return soup
def print_version(self, url):
if url.count ('focus.pl.feedsportal.com'):
u = url.find('focus0Bpl')
u = 'http://www.focus.pl/' + url[u + 11:]
u = u.replace('0C', '/')
u = u.replace('A', '')
u = u.replace ('0E','-')
u = u.replace('/nc/1//story01.htm', '/do-druku/1')
else:
u = url.replace('/nc/1','/do-druku/1')
return u
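Tracing print_version above on an invented feedsportal link:

    url = ('http://focus.pl.feedsportal.com/c/32992/f/532693/s/abc/l/'
           '0Lfocus0Bpl0Cnauka0Cnewsy0E12345/nc/1//story01.htm')
    # url.find('focus0Bpl') + 11 skips past 'focus0Bpl0C', leaving 'nauka0Cnewsy0E12345/nc/1//story01.htm'
    # then '0C' -> '/', '0E' -> '-', and '/nc/1//story01.htm' -> '/do-druku/1'
    # result: 'http://www.focus.pl/nauka/newsy-12345/do-druku/1'

Note that u.replace('A', '') strips every capital 'A'; that is harmless only because focus.pl paths are lowercase.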

View File

@@ -8,31 +8,35 @@ class FSP(BasicNewsRecipe):
__author__ = 'fluzao'
description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \
u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]'
INDEX = 'http://www1.folha.uol.com.br/fsp/indices/'
#found this to be the easiest place to find the index page (13-Nov-2011).
# searching for the "Indice Geral" link
HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
language = 'pt'
no_stylesheets = True
max_articles_per_feed = 40
remove_javascript = True
needs_subscription = True
remove_tags_before = dict(name='b')
remove_tags_before = dict(name='p')
remove_tags = [dict(name='td', attrs={'align':'center'})]
remove_attributes = ['height','width']
masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
# fixes the problem with the section names
section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada', \
'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o', \
'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade', \
'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio'}
'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio', \
'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios', \
'veiculos' : u've\xedculos', 'corrida' : 'folha corrida'}
# this solves the problem with truncated content in Kindle
conversion_options = {'linearize_tables' : True}
# this bit removes the footer where there are links for Proximo Texto, Texto Anterior,
# Indice e Comunicar Erros
preprocess_regexps = [(re.compile(r'<BR><BR>Texto Anterior:.*<!--/NOTICIA-->',
re.DOTALL|re.IGNORECASE), lambda match: r''),
(re.compile(r'<BR><BR>Pr&oacute;ximo Texto:.*<!--/NOTICIA-->',
preprocess_regexps = [(re.compile(r'<!--/NOTICIA-->.*Comunicar Erros</a>',
re.DOTALL|re.IGNORECASE), lambda match: r'')]
def get_browser(self):
@@ -49,7 +53,25 @@ class FSP(BasicNewsRecipe):
def parse_index(self):
soup = self.index_to_soup(self.INDEX)
#Searching for the index page on the HOMEPAGE
hpsoup = self.index_to_soup(self.HOMEPAGE)
indexref = hpsoup.find('a', href=re.compile('^indices.*'))
self.log("--> tag containing today's index: ", indexref)
INDEX = indexref['href']
INDEX = 'http://www1.folha.uol.com.br/fsp/'+INDEX
self.log('--> INDEX after extracting href and adding prefix: ', INDEX)
# ... and taking the opportunity to get the cover image link
coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href']
if coverurl:
self.log("--> tag containing today's cover: ", coverurl)
coverurl = coverurl.replace('htm', 'jpg')
coverurl = 'http://www1.folha.uol.com.br/fsp/images/'+coverurl
self.log('--> coverurl after extracting href and adding prefix: ', coverurl)
self.cover_url = coverurl
#soup = self.index_to_soup(self.INDEX)
soup = self.index_to_soup(INDEX)
feeds = []
articles = []
section_title = "Preambulo"
@@ -68,8 +90,12 @@ class FSP(BasicNewsRecipe):
self.log('--> new section title: ', section_title)
if strpost.startswith('<a href'):
url = post['href']
#this bit is kept if they ever go back to the old format (pre Nov-2011)
if url.startswith('/fsp'):
url = 'http://www1.folha.uol.com.br'+url
#
if url.startswith('http://www1.folha.uol.com.br/fsp'):
#url = 'http://www1.folha.uol.com.br'+url
title = self.tag_to_string(post)
self.log()
self.log('--> post: ', post)
@@ -82,15 +108,11 @@ class FSP(BasicNewsRecipe):
# keeping the front page url
minha_capa = feeds[0][1][1]['url']
# removing the 'Preambulo' section
# removing the first section (now called 'top')
del feeds[0]
# creating the url for the cover image
coverurl = feeds[0][1][0]['url']
coverurl = coverurl.replace('/opiniao/fz', '/images/cp')
coverurl = coverurl.replace('01.htm', '.jpg')
self.cover_url = coverurl
# inserting the cover page as the first article (nicer for kindle users)
feeds.insert(0,(u'primeira p\xe1gina', [{'title':u'Primeira p\xe1gina' , 'url':minha_capa}]))
return feeds
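With the homepage-driven index above, a cover link such as cp27022012.htm (hypothetical; any href matching '^cp.*') becomes:

    coverurl = 'cp27022012.htm'.replace('htm', 'jpg')
    coverurl = 'http://www1.folha.uol.com.br/fsp/images/' + coverurl
    # -> 'http://www1.folha.uol.com.br/fsp/images/cp27022012.jpg'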

View File

@@ -3,10 +3,17 @@ import re
from calibre.ptempfile import PersistentTemporaryFile
class ForeignAffairsRecipe(BasicNewsRecipe):
''' there are three modifications:
1) fetch issue cover
2) toggle ignore premium articles
3) extract proper section names, i.e. "Comments", "Essay"
by Chen Wei weichen302@gmx.com, 2012-02-05'''
__license__ = 'GPL v3'
__author__ = 'kwetal'
language = 'en'
version = 1
version = 1.01
title = u'Foreign Affairs (Subscription or (free) Registration)'
publisher = u'Council on Foreign Relations'
@@ -17,6 +24,9 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
remove_javascript = True
INDEX = 'http://www.foreignaffairs.com'
FRONTPAGE = 'http://www.foreignaffairs.com/magazine'
INCLUDE_PREMIUM = False
remove_tags = []
remove_tags.append(dict(name = 'base'))
@@ -37,6 +47,12 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
temp_files = []
articles_are_obfuscated = True
def get_cover_url(self):
soup = self.index_to_soup(self.FRONTPAGE)
div = soup.find('div', attrs={'class':'inthemag-issuebuy-cover'})
img_url = div.find('img')['src']
return self.INDEX + img_url
def get_obfuscated_article(self, url):
br = self.get_browser()
br.open(url)
@@ -50,57 +66,46 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
return self.temp_files[-1].name
def parse_index(self):
soup = self.index_to_soup('http://www.foreignaffairs.com/magazine')
articles = []
answer = []
content = soup.find('div', attrs = {'class': 'center-wrapper'})
if content:
for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
tag = div.find('div', attrs = {'class': 'views-field-title'})
if tag:
a = tag.find('a')
if a:
title = self.tag_to_string(a)
url = self.INDEX + a['href']
author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
tag = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
# If they ever fix their markup, this will break :-(
summary = self.tag_to_string(tag.findNextSibling('p'))
description = author + '<br/>' + summary
articles.append({'title': title, 'date': None, 'url': url, 'description': description})
else:
continue
else:
continue
answer.append(('Magazine', articles))
ul = content.find('ul')
if ul:
soup = self.index_to_soup(self.FRONTPAGE)
sec_start = soup.findAll('div', attrs={'class':'panel-separator'})
for sec in sec_start:
content = sec.nextSibling
if content:
section = self.tag_to_string(content.find('h2'))
articles = []
for li in ul.findAll('li'):
tag = li.find('div', attrs = {'class': 'views-field-title'})
if tag:
a = tag.find('a')
if a:
title = self.tag_to_string(a)
url = self.INDEX + a['href']
description = ''
tag = li.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
if tag:
description = self.tag_to_string(tag)
articles.append({'title': title, 'date': None, 'url': url, 'description': description})
else:
continue
tags = []
for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-(?:odd|even).*')}):
tags.append(div)
for li in content.findAll('li'):
tags.append(li)
for div in tags:
title = url = description = author = None
if self.INCLUDE_PREMIUM:
found_premium = False
else:
continue
answer.append(('Letters to the Editor', articles))
found_premium = div.findAll('span', attrs={'class':
'premium-icon'})
if not found_premium:
tag = div.find('div', attrs={'class': 'views-field-title'})
if tag:
a = tag.find('a')
if a:
title = self.tag_to_string(a)
url = self.INDEX + a['href']
author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
tag_summary = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
description = self.tag_to_string(tag_summary)
articles.append({'title':title, 'date':None, 'url':url,
'description':description, 'author':author})
if articles:
answer.append((section, articles))
return answer
def preprocess_html(self, soup):

50
recipes/formulaas.recipe Normal file
View File

@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
formula-as.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe
class FormulaAS(BasicNewsRecipe):
title = u'Formula AS'
__author__ = u'Silviu Cotoar\u0103'
publisher = u'Formula AS'
description = u'Formula AS'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
category = 'Ziare,Romania'
encoding = 'utf-8'
cover_url = 'http://www.formula-as.ro/_client/img/header_logo.png'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
keep_only_tags = [
dict(name='div', attrs={'class':'item padded'})
]
remove_tags = [
dict(name='ul', attrs={'class':'subtitle lower'})
]
remove_tags_after = [
dict(name='ul', attrs={'class':'subtitle lower'}),
dict(name='div', attrs={'class':'item-brief-options'})
]
feeds = [
(u'\u0218tiri', u'http://www.formula-as.ro/rss/articole.xml')
]
def preprocess_html(self, soup):
return self.adeify_images(soup)

10
recipes/frandroid.recipe Normal file
View File

@@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1318572550(BasicNewsRecipe):
title = u'FrAndroid'
oldest_article = 2
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'FrAndroid', u'http://feeds.feedburner.com/Frandroid')]

View File

@@ -1,35 +1,61 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Christian Schmitt'
'''
fr-online.de
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe(BasicNewsRecipe):
title = u'Frankfurter Rundschau'
__author__ = 'schuster'
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
language = 'de'
remove_javascript = True
cover_url = 'http://www.fr-online.de/image/view/-/1474018/data/823538/-/logo.png'
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h4{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
img {min-width:300px; max-width:600px; min-height:300px; max-height:800px}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''
feeds = [(u'Startseite', u'http://www.fr-online.de/home/-/1472778/1472778/-/view/asFeed/-/index.xml'),
(u'Politik', u'http://www.fr-online.de/politik/-/1472596/1472596/-/view/asFeed/-/index.xml'),
(u'Meinungen', u'http://www.fr-online.de/politik/meinung/-/1472602/1472602/-/view/asFeed/-/index.xml'),
(u'Wirtschaft', u'http://www.fr-online.de/wirtschaft/-/1472780/1472780/-/view/asFeed/-/index.xml'),
(u'Sport', u'http://www.fr-online.de/sport/-/1472784/1472784/-/view/asFeed/-/index.xml'),
(u'Kultur', u'http://www.fr-online.de/kultur/-/1472786/1472786/-/view/asFeed/-/index.xml'),
(u'Panorama', u'http://www.fr-online.de/panorama/-/1472782/1472782/-/view/asFeed/-/index.xml'),
(u'Digital', u'http://www.fr-online.de/digital/-/1472406/1472406/-/view/asFeed/-/index.xml'),
(u'Wissenschaft', u'http://www.fr-online.de/wissenschaft/-/1472788/1472788/-/view/asFeed/-/index.xml')
]
class FROnlineRecipe(BasicNewsRecipe):
title = 'Frankfurter Rundschau'
__author__ = 'maccs'
description = 'Nachrichten aus D und aller Welt'
encoding = 'utf-8'
masthead_url = 'http://www.fr-online.de/image/view/-/1474018/data/823552/-/logo.png'
publisher = 'Druck- und Verlagshaus Frankfurt am Main GmbH'
category = 'news, germany, world'
language = 'de'
publication_type = 'newspaper'
use_embedded_content = False
remove_javascript = True
no_stylesheets = True
oldest_article = 1 # Increase this number if you're interested in older articles
max_articles_per_feed = 50 # Seems a reasonable number to me
extra_css = '''
body { font-family: "arial", "verdana", "geneva", sans-serif; font-size: 12px; margin: 0px; background-color: #ffffff;}
.imgSubline{background-color: #f4f4f4; font-size: 0.8em;}
.p--heading-1 {font-weight: bold;}
.calibre_navbar {font-size: 0.8em; font-family: "arial", "verdana", "geneva", sans-serif;}
'''
keep_only_tags = [{'class':'ArticleHeadlineH1'}, {'class':'article_text'}]
cover_url = 'http://www.fr-online.de/image/view/-/1474018/data/823552/-/logo.png'
cover_margins = (100, 150, '#ffffff')
def print_version(self, url):
return url.replace('index.html', 'view/printVersion/-/index.html')
feeds = []
feeds.append(('Startseite', u'http://www.fr-online.de/home/-/1472778/1472778/-/view/asFeed/-/index.xml'))
feeds.append(('Politik', u'http://www.fr-online.de/politik/-/1472596/1472596/-/view/asFeed/-/index.xml'))
feeds.append(('Meinung', u'http://www.fr-online.de/politik/meinung/-/1472602/1472602/-/view/asFeed/-/index.xml'))
feeds.append(('Wirtschaft', u'http://www.fr-online.de/wirtschaft/-/1472780/1472780/-/view/asFeed/-/index.xml'))
feeds.append(('Sport', u'http://www.fr-online.de/sport/-/1472784/1472784/-/view/asFeed/-/index.xml'))
feeds.append(('Eintracht Frankfurt', u'http://www.fr-online.de/sport/eintracht-frankfurt/-/1473446/1473446/-/view/asFeed/-/index.xml'))
feeds.append(('Kultur und Medien', u'http://www.fr-online.de/kultur/-/1472786/1472786/-/view/asFeed/-/index.xml'))
feeds.append(('Panorama', u'http://www.fr-online.de/panorama/-/1472782/1472782/-/view/asFeed/-/index.xml'))
feeds.append(('Frankfurt', u'http://www.fr-online.de/frankfurt/-/1472798/1472798/-/view/asFeed/-/index.xml'))
feeds.append(('Rhein-Main', u'http://www.fr-online.de/rhein-main/-/1472796/1472796/-/view/asFeed/-/index.xml'))
feeds.append(('Hanau', u'http://www.fr-online.de/rhein-main/hanau/-/1472866/1472866/-/view/asFeed/-/index.xml'))
feeds.append(('Darmstadt', u'http://www.fr-online.de/rhein-main/darmstadt/-/1472858/1472858/-/view/asFeed/-/index.xml'))
feeds.append(('Wiesbaden', u'http://www.fr-online.de/rhein-main/wiesbaden/-/1472860/1472860/-/view/asFeed/-/index.xml'))
feeds.append(('Offenbach', u'http://www.fr-online.de/rhein-main/offenbach/-/1472856/1472856/-/view/asFeed/-/index.xml'))
feeds.append(('Bad Homburg', u'http://www.fr-online.de/rhein-main/bad-homburg/-/1472864/1472864/-/view/asFeed/-/index.xml'))
feeds.append(('Digital', u'http://www.fr-online.de/digital/-/1472406/1472406/-/view/asFeed/-/index.xml'))
feeds.append(('Wissenschaft', u'http://www.fr-online.de/wissenschaft/-/1472788/1472788/-/view/asFeed/-/index.xml'))
def print_version(self, url):
return url.replace('index.html', 'view/printVersion/-/index.html')

View File

@@ -18,7 +18,7 @@ class FrazPC(BasicNewsRecipe):
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True
cover_url='http://www.frazpc.pl/images/logo.png'
feeds = [
(u'Aktualno\u015bci', u'http://www.frazpc.pl/feed/aktualnosci'),
(u'Artyku\u0142y', u'http://www.frazpc.pl/feed/artykuly')
@@ -33,6 +33,7 @@ class FrazPC(BasicNewsRecipe):
dict(name='div', attrs={'class':'comments_box'})
]
remove_tags_after=dict(name='div', attrs={'class':'content'})
preprocess_regexps = [(re.compile(r'\| <a href="#comments">Komentarze \([0-9]*\)</a>'), lambda match: '')]
remove_attributes = [ 'width', 'height' ]

View File

@@ -16,7 +16,7 @@ class FTDe(BasicNewsRecipe):
use_embedded_content = False
timefmt = ' [%d %b %Y]'
language = 'de'
max_articles_per_feed = 40
max_articles_per_feed = 30
no_stylesheets = True
remove_tags = [dict(id='navi_top'),
@@ -84,19 +84,19 @@ class FTDe(BasicNewsRecipe):
dict(name='div', attrs={'class':'artikelsplitfaq'})]
#remove_tags_after = [dict(name='a', attrs={'class':'more'})]
feeds = [ ('Finanzen', 'http://www.ftd.de/rss2/finanzen/maerkte'),
('Meinungshungrige', 'http://www.ftd.de/rss2/meinungshungrige'),
('Unternehmen', 'http://www.ftd.de/rss2/unternehmen'),
('Politik', 'http://www.ftd.de/rss2/politik'),
('Karriere_Management', 'http://www.ftd.de/rss2/karriere-management'),
('IT_Medien', 'http://www.ftd.de/rss2/it-medien'),
('Wissen', 'http://www.ftd.de/rss2/wissen'),
('Sport', 'http://www.ftd.de/rss2/sport'),
('Auto', 'http://www.ftd.de/rss2/auto'),
('Lifestyle', 'http://www.ftd.de/rss2/lifestyle')
]
feeds = [
('Unternehmen', 'http://www.ftd.de/rss2/unternehmen'),
('Finanzen', 'http://www.ftd.de/rss2/finanzen/maerkte'),
('Meinungen', 'http://www.ftd.de/rss2/meinungshungrige'),
('Politik', 'http://www.ftd.de/rss2/politik'),
('Management & Karriere', 'http://www.ftd.de/rss2/karriere-management'),
('IT & Medien', 'http://www.ftd.de/rss2/it-medien'),
('Wissen', 'http://www.ftd.de/rss2/wissen'),
('Sport', 'http://www.ftd.de/rss2/sport'),
('Auto', 'http://www.ftd.de/rss2/auto'),
('Lifestyle', 'http://www.ftd.de/rss2/lifestyle')
]
def print_version(self, url):
return url.replace('.html', '.html?mode=print')
return url.replace('.html', '.html?mode=print')

View File

@@ -0,0 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Gameplay_pl(BasicNewsRecipe):
title = u'Gameplay.pl'
oldest_article = 7
__author__ = 'fenuks'
description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.'
category = 'games, movies, books, music'
language = 'pl'
masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png'
cover_url= 'http://gameplay.pl/img/gpy_top_logo.png'
max_articles_per_feed = 100
no_stylesheets= True
keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})]
remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})]
feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]
def image_url_processor(self, baseurl, url):
if 'http' not in url:
return 'http://gameplay.pl'+ url[2:]
else:
return url

View File

@@ -0,0 +1,35 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
import re
import string
from calibre.web.feeds.news import BasicNewsRecipe
class GazetaPlSzczecin(BasicNewsRecipe):
title = u'Gazeta.pl Szczecin'
description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
__author__ = u'Michał Szkutnik'
__license__ = u'GPL v3'
language = 'pl'
publisher = 'Agora S.A.'
category = 'news, szczecin'
oldest_article = 2
max_articles_per_feed = 100
auto_cleanup = True
remove_tags = [ { "name" : "a", "attrs" : { "href" : "http://szczecin.gazeta.pl/szczecin/www.gazeta.pl" }}]
cover_url = "http://bi.gazeta.pl/i/hp/hp2009/logo.gif"
feeds = [(u'Wszystkie', u'http://rss.feedsportal.com/c/32739/f/530434/index.rss')]
def get_article_url(self, article):
s = re.search("""/0L(szczecin.*)/story01.htm""", article.link)
s = s.group(1)
replacements = { "0B" : ".", "0C" : "/", "0H" : ",", "0I" : "_"}
for (a, b) in replacements.iteritems():
s = string.replace(s, a, b)
s = string.replace(s, "0A", "0")
return "http://"+s
def print_version(self, url):
s = re.search("""/(\d*),(\d*),(\d*),.*\.html""", url)
no1 = s.group(2)
no2 = s.group(3)
return """http://szczecin.gazeta.pl/szczecin/2029020,%s,%s.html""" % (no1, no2)

View File

@@ -4,10 +4,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Gazeta_Wyborcza(BasicNewsRecipe):
title = u'Gazeta Wyborcza'
__author__ = 'fenuks'
cover_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
language = 'pl'
description ='news from gazeta.pl'
category='newspaper'
publication_type = 'newspaper'
masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
INDEX='http://wyborcza.pl'
remove_empty_feeds= True
oldest_article = 3
@@ -81,3 +82,10 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
return url
else:
return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020')
def get_cover_url(self):
soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html')
cover=soup.find(id='GWmini2')
soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href'])
self.cover_url='http://wyborcza.pl' + soup.img['src']
return getattr(self, 'cover_url', self.cover_url)

View File

@@ -1,35 +1,82 @@
#!/usr/bin/python
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image, create_canvas
class AdvancedUserRecipe1307556816(BasicNewsRecipe):
title = u'Geek and Poke'
__author__ = u'DrMerry'
description = u'Geek and Poke Cartoons'
publisher = u'Oliver Widder'
author = u'Oliver Widder, DrMerry (calibre-code), calibre'
oldest_article = 31
max_articles_per_feed = 100
language = u'en'
simultaneous_downloads = 5
simultaneous_downloads = 1
#delay = 1
timefmt = ' [%A, %d %B, %Y]'
timefmt = ' [%a, %d %B, %Y]'
summary_length = -1
no_stylesheets = True
category = 'News.IT, Cartoon, Humor, Geek'
use_embedded_content = False
cover_url = 'http://geekandpoke.typepad.com/aboutcoders.jpeg'
remove_javascript = True
remove_empty_feeds = True
publication_type = 'blog'
masthead_url = None
conversion_options = {
'comments' : ''
,'tags' : category
,'language' : language
,'publisher' : publisher
,'author' : author
}
preprocess_regexps = [ (re.compile(r'(<p>&nbsp;</p>|<iframe.*</iframe>|<a[^>]*>Tweet</a>|<a[^>]*>|</a>)', re.DOTALL|re.IGNORECASE),lambda match: ''),
(re.compile(r'(&nbsp;| )', re.DOTALL|re.IGNORECASE),lambda match: ' '),
(re.compile(r'<br( /)?>(<br( /)?>)+', re.DOTALL|re.IGNORECASE),lambda match: '<br>')
]
remove_tags_before = dict(name='p', attrs={'class':'content-nav'})
remove_tags_after = dict(name='div', attrs={'class':'entry-content'})
remove_tags = [dict(name='div', attrs={'class':'entry-footer'}),
dict(name='div', attrs={'id':'alpha'}),
dict(name='div', attrs={'id':'gamma'}),
dict(name='iframe'),
dict(name='p', attrs={'class':'content-nav'})]
extra_css = 'body, h3, p, h2, h1, div, span{margin:0px} h2.date-header {font-size: 0.7em; color:#eee;} h3.entry-header{font-size: 1.0em} div.entry-body{font-size: 0.9em}'
filter_regexps = [(r'feedburner\.com'),
(r'pixel.quantserve\.com'),
(r'googlesyndication\.com'),
(r'yimg\.com'),
(r'scorecardresearch\.com')]
preprocess_regexps = [(re.compile(r'(<p>(&nbsp;|\s)*</p>|<a[^>]*>Tweet</a>|<a[^>]*>|</a>|<!--.*?-->|<h2[^>]*>[^<]*</h2>[^<]*)', re.DOTALL|re.IGNORECASE),lambda match: ''),
(re.compile(r'(&nbsp;|\s\s)+\s*', re.DOTALL|re.IGNORECASE),lambda match: ' '),
(re.compile(r'(<h3[^>]*>)<a[^>]*>((?:(?!</a).)*)</a></h3>', re.DOTALL|re.IGNORECASE),lambda match: match.group(1) + match.group(2) + '</h3>'),
(re.compile(r'(<img[^>]*alt="([^"]*)"[^>]*>)', re.DOTALL|re.IGNORECASE),lambda match: '<div id="merryImage"><cite>' + match.group(2) + '</cite><br>' + match.group(1) + '</div>'),
(re.compile(r'<br( /)?>(<br( /)?>)+', re.DOTALL|re.IGNORECASE),lambda match: '<br>'),
]
remove_tags_before = dict(name='h2', attrs={'class':'date-header'})
remove_tags_after = dict(name='div', attrs={'class':'entry-body'})
extra_css = 'body, h3, p, div, span{margin:0px; padding:0px} h3.entry-header{font-size: 0.8em} div.entry-body{font-size: 0.7em}'
def postprocess_html(self, soup, first):
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
#width, height = img.size
#print '***img is: ', iurl, '\n****width is: ', width, 'height is: ', height
img.trim(0)
#width, height = img.size
#print '***TRIMMED img width is: ', width, 'height is: ', height
left=0
top=0
border_color='#ffffff'
width, height = img.size
#print '***retrieved img width is: ', width, 'height is: ', height
height_correction = 1.17
canvas = create_canvas(width, height*height_correction,border_color)
canvas.compose(img, left, top)
#img = canvas
#img.save(iurl)
canvas.save(iurl)
#width, height = canvas.size
#print '***NEW img width is: ', width, 'height is: ', height
return soup
feeds = [(u'Geek and Poke', u'http://feeds.feedburner.com/GeekAndPoke?format=xml')]
feeds = ['http://feeds.feedburner.com/GeekAndPoke?format=xml']

View File

@@ -0,0 +1,90 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class GiveMeSomethingToRead(BasicNewsRecipe):
title = u'Give Me Something To Read'
description = 'Curation / aggregation of articles on diverse topics'
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = False
timefmt = ' [%a, %d %b, %Y]'
oldest_article = 365
auto_cleanup = True
INDEX = 'http://givemesomethingtoread.com'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, system name, max number of articles to load)
('The Arts','arts',25),
('Science','science',30),
('Technology','technology',30),
('Politics','politics',20),
('Media','media',30),
('Crime','crime',15),
('Other articles','',10)
]
def parse_index(self):
self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
feeds = []
seen_urls = set([])
regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)
for category in self.CATEGORIES:
(cat_name, tag, max_articles) = category
tagurl = '' if tag=='' else '/tagged/'+tag
self.log('Reading category:', cat_name)
articles = []
pageno = 1
while len(articles) < max_articles and pageno < 100:
page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
pageno += 1
self.log('\tReading page:', page)
try:
soup = self.index_to_soup(page)
except:
break
headers = soup.findAll('h2')
if len(headers) == 0:
break
for header in headers:
atag = header.find('a')
url = atag['href']
# skip promotional entries and duplicates
if url.startswith('http://givemesomethingtoread') or url.startswith('/') or url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(header)
self.log('\tFound article:', title)
#self.log('\t', url)
desc = header.parent.find('blockquote')
desc = self.tag_to_string(desc) if desc else ''
m = regex.match( url)
if m:
desc = "[%s] %s" % (m.group(2), desc)
#self.log('\t', desc)
date = ''
p = header.parent.previousSibling
# navigate up to find h3, which contains the date
while p:
if hasattr(p,'name') and p.name == 'h3':
date = self.tag_to_string(p)
break
p = p.previousSibling
articles.append({'title':title,'url':url,'description':desc,'date':date})
if len(articles) >= max_articles:
break
if articles:
feeds.append((cat_name, articles))
return feeds
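For a CATEGORIES entry such as ('Science', 'science', 30), the pagination loop above walks:

    # page 1: http://givemesomethingtoread.com/tagged/science
    # page 2: http://givemesomethingtoread.com/tagged/science/page/2
    # ... stopping once 30 articles are collected, a page fails to load, or pageno reaches 100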

Some files were not shown because too many files have changed in this diff