Merge from upstream
@ -30,3 +30,4 @@ nbproject/
|
||||
.project
|
||||
.pydevproject
|
||||
.settings/
|
||||
*.DS_Store
|
||||
|
231
Changelog.yaml
@ -19,6 +19,237 @@
|
||||
# new recipes:
|
||||
# - title:
|
||||
|
||||
- version: 0.8.1
|
||||
date: 2011-05-13
|
||||
|
||||
new features:
|
||||
- title: "Add Amazon DE, Beam EBooks, Beam DE, Weightless Books, Wizards Tower Books to the list of ebook stores searched by Get Books"
|
||||
|
||||
- title: "TXT output: All new Textile output with much greater preservation of formatting from the input document"
|
||||
|
||||
- title: "Migrate metadata plugin for Douban Books to the 0.8 API"
|
||||
|
||||
- title: "Driver for Dell Streak on windows"
|
||||
|
||||
- title: "Add menu items to Get Books action to search by title and author of current book"
|
||||
|
||||
- title: "Add title_sort as available field to CSV/XML catalogs"
|
||||
|
||||
- title: "Add a context menu to the manage authors dialog"
|
||||
|
||||
- title: "Add a button to paste isbn into the identifiers field in the edit metadata dialog automatically"
|
||||
|
||||
bug fixes:
|
||||
- title: "Amazon metadata download plugin: Fix links being stripped from comments. Also fix ratings/isbn not being parsed from kindle edition pages."
|
||||
tickets: [782012]
|
||||
|
||||
- title: "Fix one source of segfaults on shutdown in the linux binary builds."
|
||||
|
||||
- title: "Allow the use of condensed/expanded fonts as interface fonts"
|
||||
|
||||
- title: "EPUB Input: Ignore missing cover file when converting, instead of erroring out."
|
||||
tickets: [781848]
|
||||
|
||||
- title: "Fix custom identifier being erased by metadata download"
|
||||
tickets: [781759]
|
||||
|
||||
- title: "Fix regression that broke various things when using Japanese language calibre on windows"
|
||||
tickets: [780804]
|
||||
|
||||
- title: "RTF Input: Handle null color codes correctly"
|
||||
tickets: [780728]
|
||||
|
||||
- title: "ODT Input: Handle inline special styles defined on <text:span> tags."
|
||||
tickets: [780250]
|
||||
|
||||
- title: "Fix error when pressing the next/previous buttons with an empty search in the Plugins preferences"
|
||||
tickets: [781135]
|
||||
|
||||
- title: "Ignore 'Unknown' author when downloading metadata."
|
||||
tickets: [779348]
|
||||
|
||||
- title: "Fix timezone bug when setting dates in the edit metadata dialog"
|
||||
tickets: [779497]
|
||||
|
||||
- title: "Fix ebook-convert not recognizing output paths starting with .."
|
||||
tickets: [779322]
|
||||
|
||||
improved recipes:
|
||||
- "Strategy+Business"
|
||||
- Readers Digest
|
||||
- Ming Pao
|
||||
- Telepolis
|
||||
- Fronda
|
||||
- Rzeczpospolita
|
||||
|
||||
new recipes:
|
||||
- title: "Various Taiwanese news sources"
|
||||
author: Eddie Lau
|
||||
|
||||
- title: Replica Vedetelor, Ziua Veche
|
||||
author: Silviu Cotoara
|
||||
|
||||
- title: Welt der Physik
|
||||
author: schuster
|
||||
|
||||
- title: Korea Herald
|
||||
author: Seongkyoun Yoo
|
||||
|
||||
|
||||
- version: 0.8.0
|
||||
date: 2011-05-06
|
||||
|
||||
new features:
|
||||
- title: "Go to http://calibre-ebook.com/new-in/eight to see what's new in 0.8.0"
|
||||
type: major
|
||||
|
||||
- version: 0.7.59
|
||||
date: 2011-04-30
|
||||
|
||||
bug fixes:
|
||||
- title: "Fixes a bug in 0.7.58 that caused too small fonts when converting to MOBI for the Kindle. Apologies."
|
||||
|
||||
- title: "Apple driver: Handle invalid EPUBs that do not contain an OPF file"
|
||||
|
||||
new recipes:
|
||||
- title: The Big Picture and Auto industry news
|
||||
author: welovelucy
|
||||
|
||||
- title: Gazeta Prawna
|
||||
author: Vroo
|
||||
|
||||
- title: Various Czech news sources
|
||||
author: Tomas Latal
|
||||
|
||||
- title: Diario de Ibiza
|
||||
author: Joan Tur
|
||||
|
||||
- version: 0.7.58
|
||||
date: 2011-04-29
|
||||
|
||||
new features:
|
||||
- title: "Support for converting and reading metadata from Plucker format PDB files"
|
||||
type: major
|
||||
|
||||
- title: "The metadata that is displayed in the book details panel on the right is now completely configurable via Preferences->Look & Feel"
|
||||
|
||||
- title: "Add a column that shows the date when the metadata of a book record was last modified in calibre. To see the column, right click on the column headers in calibre and select Show column->Modified. Note that the dates may be incorrect for books added with older versions of calibre."
|
||||
|
||||
- title: "Add command line option to shutdown running calibre"
|
||||
|
||||
- title: "CHM Input: Store extracted files in the input/ sub dir for easy debugging when --debug-pipeline is specified"
|
||||
|
||||
- title: "Add a popup menu to the 'Create saved search button' to allow easy deleting of saved searches"
|
||||
|
||||
bug fixes:
|
||||
- title: "Fix regression that broke converting to LIT in 0.7.57"
|
||||
tickets: [769334]
|
||||
|
||||
- title: "Conversion pipeline: Remove encoding declarations from input HTML documents to guarantee that there is only a single encoding declaration in the output HTML."
|
||||
tickets: [773337]
|
||||
|
||||
- title: "Correctly parenthesize searches that are used to make search restrictions"
|
||||
|
||||
- title: "Fix ratings in save to disk templates not being divided by 2"
|
||||
|
||||
- title: "TXT to EPUB: Underlined words (following quotes?) fail to become italics"
|
||||
tickets: [772267]
|
||||
|
||||
- title: "Fix template function source code unavailable when not running calibre from source"
|
||||
|
||||
- title: "Fix adding html books from the top of a deep folder hierarchy very slow"
|
||||
|
||||
- title: "Only set language in MOBI metadata if it is not null"
|
||||
|
||||
- title: "Fix 'count-of' searches (e.g., tags:#>3)."
|
||||
tickets: [771175]
|
||||
|
||||
- title: "Fix regression that broke connection to iTunes in some cases"
|
||||
tickets: [771164]
|
||||
|
||||
- title: "Fix buggy regex that made converting PDFs with the string ****************** very slow"
|
||||
tickets: [770534]
|
||||
|
||||
- title: "Fix Ctrl+L shortcut to lookup word not working in ebook viewer"
|
||||
tickets: [769492]
|
||||
|
||||
- title: "Fix regression that broke searching on boolean columns"
|
||||
|
||||
improved recipes:
|
||||
- HBR Blogs
|
||||
- The Marker
|
||||
- Financial Times
|
||||
- Clarin
|
||||
- Honolulu Star Advertiser
|
||||
|
||||
new recipes:
|
||||
- title: Novi Standard
|
||||
author: Darko Miletic
|
||||
|
||||
- title: Autobild.ro and Social Diva
|
||||
author: Silviu Cotoara
|
||||
|
||||
- title: Novinky
|
||||
author: Tomas Latal
|
||||
|
||||
- title: "De Volkskrant (subscriber version)"
|
||||
author: Selcal
|
||||
|
||||
|
||||
- version: 0.7.57
|
||||
date: 2011-04-22
|
||||
|
||||
new features:
|
||||
- title: "Launch worker processes on demand instead of keeping a pool of them in memory. Reduces memory footprint."
|
||||
|
||||
- title: "Use the visual formatting of the Table of Contents to try to automatically create a multi-level TOC when converting/viewing MOBI files."
|
||||
tickets: [763681]
|
||||
|
||||
- title: "Add a new function booksize() to the template language to get the value of the size column in calibre."
|
||||
|
||||
- title: "Add support for using metadata plugboards with the content server (only with the epub format)"
|
||||
|
||||
- title: "Change default algorithm for automatically computing author sort to be more intelligent and handle the case when the author name has a comma in it"
|
||||
|
||||
- title: "Show cover size in the tooltips of the book details panel and book details popup window"
|
||||
|
||||
bug fixes:
|
||||
- title: "Dragging and dropping a cover onto the book details panel did not change the cover size"
|
||||
tickets: [768332]
|
||||
|
||||
- title: "Fix non-escaped '|' when searching for commas in authors using REGEXP_MATCH"
|
||||
|
||||
- title: "Fix ratings in templates being multiplied by 2"
|
||||
|
||||
- title: "Fix adding a comma to custom series values when using completion."
|
||||
tickets: [763788]
|
||||
|
||||
- title: "CHM Input: Another workaround for a Microsoft mess."
|
||||
tickets: [763336]
|
||||
|
||||
- title: "Fix job count in the spinner not always being updated when a job completes"
|
||||
|
||||
- title: "Changing case only of a title does not update title sort"
|
||||
tickets: [768904]
|
||||
|
||||
improved recipes:
|
||||
- ecuisine.ro, egirl.ro and tabu.ro
|
||||
- Daily Telegraph
|
||||
- Handelsblatt
|
||||
- Il Sole 24 Ore
|
||||
- Newsweek
|
||||
- Arcamax
|
||||
|
||||
new recipes:
|
||||
- title: BabyOnline.ro
|
||||
author: Silviu Cotoara
|
||||
|
||||
- title: "The Journal.ie"
|
||||
author: Phil Burns
|
||||
|
||||
- title: "Der Spiegel"
|
||||
author: Nikolas Mangold
|
||||
|
||||
- version: 0.7.56
|
||||
date: 2011-04-17
|
||||
|
||||
|
@ -93,7 +93,7 @@ class Arcamax(BasicNewsRecipe):
|
||||
for page in pages:
|
||||
page_soup = self.index_to_soup(url)
|
||||
if page_soup:
|
||||
title = page_soup.find(name='div', attrs={'class':'comics-header'}).h1.contents[0]
|
||||
title = self.tag_to_string(page_soup.find(name='div', attrs={'class':'comics-header'}).h1.contents[0])
|
||||
page_url = url
|
||||
# orig prev_page_url = 'http://www.arcamax.com' + page_soup.find('a', attrs={'class':'prev'}, text='Previous').parent['href']
|
||||
prev_page_url = 'http://www.arcamax.com' + page_soup.find('span', text='Previous').parent.parent['href']
|
||||
@ -127,4 +127,3 @@ class Arcamax(BasicNewsRecipe):
|
||||
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
|
||||
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
|
||||
'''
|
||||
|
||||
|
16
recipes/auto_blog.recipe
Normal file
@ -0,0 +1,16 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AutoBlog(BasicNewsRecipe):
|
||||
title = u'Auto Blog'
|
||||
__author__ = 'Welovelucy'
|
||||
language = 'en'
|
||||
description = 'Auto industry news'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
|
||||
feeds = [(u'AutoBlog', u'http://www.autoblog.com/rss.xml')]
|
||||
|
||||
def print_version(self, url):
|
||||
return url + 'print/'
|
||||
|
||||
|
55
recipes/autobild.recipe
Normal file
@ -0,0 +1,55 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2011, Silviu Cotoar\u0103'
|
||||
'''
|
||||
auto-bild.ro
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AutoBild(BasicNewsRecipe):
|
||||
title = u'Auto Bild'
|
||||
__author__ = u'Silviu Cotoar\u0103'
|
||||
description = 'Auto'
|
||||
publisher = 'Auto Bild'
|
||||
oldest_article = 50
|
||||
language = 'ro'
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
category = 'Ziare,Reviste,Auto'
|
||||
encoding = 'utf-8'
|
||||
cover_url = 'http://www.auto-bild.ro/images/autobild.gif'
|
||||
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
,'tags' : category
|
||||
,'language' : language
|
||||
,'publisher' : publisher
|
||||
}
|
||||
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':'box_2 articol clearfix'})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':['detail']})
|
||||
, dict(name='a', attrs={'id':['zoom_link']})
|
||||
, dict(name='div', attrs={'class':['icons clearfix']})
|
||||
, dict(name='div', attrs={'class':['pub_articol clearfix']})
|
||||
|
||||
]
|
||||
|
||||
remove_tags_after = [
|
||||
dict(name='div', attrs={'class':['pub_articol clearfix']})
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Feeds', u'http://www.auto-bild.ro/rss/toate')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
return self.adeify_images(soup)
|
59
recipes/babyonline.recipe
Normal file
@ -0,0 +1,59 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2011, Silviu Cotoar\u0103'
|
||||
'''
|
||||
babyonline.ro
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BabyOnline(BasicNewsRecipe):
|
||||
title = u'Baby Online'
|
||||
__author__ = u'Silviu Cotoar\u0103'
|
||||
description = u'De la p\u0103rinte la p\u0103rinte'
|
||||
publisher = u'Baby Online'
|
||||
oldest_article = 50
|
||||
language = 'ro'
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
category = 'Ziare,Reviste,Copii,Mame'
|
||||
encoding = 'utf-8'
|
||||
cover_url = 'http://www.babyonline.ro/images/default/logo.gif'
|
||||
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
,'tags' : category
|
||||
,'language' : language
|
||||
,'publisher' : publisher
|
||||
}
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'id':'article_container'})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'id':'bar_nav'}),
|
||||
dict(name='div', attrs={'id':'service_send'}),
|
||||
dict(name='div', attrs={'id':'other_videos'}),
|
||||
dict(name='div', attrs={'class':'dot_line_yellow'}),
|
||||
dict(name='a', attrs={'class':'print'}),
|
||||
dict(name='a', attrs={'class':'email'}),
|
||||
dict(name='a', attrs={'class':'YM'}),
|
||||
dict(name='a', attrs={'class':'comment'}),
|
||||
dict(name='div', attrs={'class':'tombstone_cross'}),
|
||||
dict(name='span', attrs={'class':'liketext'})
|
||||
]
|
||||
|
||||
remove_tags_after = [
|
||||
dict(name='div', attrs={'id':'service_send'})
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Feeds', u'http://www.babyonline.ro/rss_homepage.xml')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
return self.adeify_images(soup)
|
12
recipes/big_picture.recipe
Normal file
@ -0,0 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BigPicture(BasicNewsRecipe):
|
||||
title = u'The Big Picture'
|
||||
__author__ = 'Welovelucy'
|
||||
description = ('Macro perspective on capital markets, economy, technology'
|
||||
' and digital media')
|
||||
language = 'en'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
|
||||
feeds = [(u'Big Picture', u'http://feeds.feedburner.com/TheBigPicture')]
|
46
recipes/bild_de.recipe
Normal file
@ -0,0 +1,46 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
|
||||
title = u'Bild.de'
|
||||
__author__ = 'schuster'
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 50
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
language = 'de'
|
||||
remove_javascript = True
|
||||
|
||||
# get cover from myspace
|
||||
cover_url = 'http://a3.l3-images.myspacecdn.com/images02/56/0232f842170b4d349779f8379c27e073/l.jpg'
|
||||
|
||||
# set what to fetch on the site
|
||||
remove_tags_before = dict(name = 'h2', attrs={'id':'cover'})
|
||||
remove_tags_after = dict(name ='div', attrs={'class':'back'})
|
||||
|
||||
# thanx to kiklop74 for code (see sticky thread -> Recipes - Re-usable code)
|
||||
# this one removes a lot of direct links
|
||||
def preprocess_html(self, soup):
|
||||
for alink in soup.findAll('a'):
|
||||
if alink.string is not None:
|
||||
tstr = alink.string
|
||||
alink.replaceWith(tstr)
|
||||
return soup
|
||||
|
||||
# remove the ads
|
||||
filter_regexps = [r'.\.smartadserver\.com']
|
||||
def skip_ad_pages(self, soup):
|
||||
return None
|
||||
|
||||
# get the real URL behind .feedsportal.com and fetch the articles
|
||||
def get_article_url(self, article):
|
||||
return article.get('id', article.get('guid', None))
|
||||
|
||||
#list of the rss source from www.bild.de
|
||||
feeds = [(u'Überblick', u'http://rss.bild.de/bild.xml'),
|
||||
(u'News', u'http://rss.bild.de/bild-news.xml'),
|
||||
(u'Politik', u'http://rss.bild.de/bild-politik.xml'),
|
||||
(u'Unterhaltung', u'http://rss.bild.de/bild-unterhaltung.xml'),
|
||||
(u'Sport', u'http://rss.bild.de/bild-sport.xml'),
|
||||
(u'Lifestyle', u'http://rss.bild.de/bild-lifestyle.xml'),
|
||||
(u'Ratgeber', u'http://rss.bild.de/bild-ratgeber.xml')
|
||||
]
|
@ -3,7 +3,8 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
|
||||
__version__ = '0.98' # 2011-04-10
|
||||
__version__ = '0.98'
|
||||
|
||||
''' http://brandeins.de - Wirtschaftsmagazin '''
|
||||
import re
|
||||
import string
|
||||
@ -13,8 +14,8 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class BrandEins(BasicNewsRecipe):
|
||||
|
||||
title = u'brand eins'
|
||||
__author__ = 'Constantin Hofstetter; Steffen Siebert'
|
||||
description = u'Wirtschaftsmagazin: Gets the last full issue on default. Set a integer value for the username-field to get older issues: 1 -> the newest (but not complete) issue, 2 -> the last complete issue (default), 3 -> the issue before 2 etc.'
|
||||
__author__ = 'Constantin Hofstetter'
|
||||
description = u'Wirtschaftsmagazin'
|
||||
publisher ='brandeins.de'
|
||||
category = 'politics, business, wirtschaft, Germany'
|
||||
use_embedded_content = False
|
||||
@ -105,10 +106,11 @@ class BrandEins(BasicNewsRecipe):
|
||||
keys = issue_map.keys()
|
||||
keys.sort()
|
||||
keys.reverse()
|
||||
selected_issue = issue_map[keys[issue-1]]
|
||||
selected_issue_key = keys[issue - 1]
|
||||
selected_issue = issue_map[selected_issue_key]
|
||||
url = selected_issue.get('href', False)
|
||||
# Get the title for the magazin - build it out of the title of the cover - take the issue and year;
|
||||
self.title = "brand eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d)", selected_issue.find('img').get('title', False)).group('date')
|
||||
self.title = "brand eins " + selected_issue_key[4:] + "/" + selected_issue_key[0:4]
|
||||
url = 'http://brandeins.de/'+url
|
||||
|
||||
# url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
|
||||
@ -161,3 +163,4 @@ class BrandEins(BasicNewsRecipe):
|
||||
current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
|
||||
titles_and_articles.append([chapter_title, current_articles])
|
||||
return titles_and_articles
|
||||
|
||||
|
42
recipes/china_times.recipe
Normal file
@ -0,0 +1,42 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
__license__ = 'GPL v3'
|
||||
# dug from http://www.mobileread.com/forums/showthread.php?p=1012294
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1277443634(BasicNewsRecipe):
|
||||
title = u'中時電子報'
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
|
||||
feeds = [(u'焦點', u'http://rss.chinatimes.com/rss/focus-u.rss'),
|
||||
(u'政治', u'http://rss.chinatimes.com/rss/Politic-u.rss'),
|
||||
(u'社會', u'http://rss.chinatimes.com/rss/social-u.rss'),
|
||||
(u'國際', u'http://rss.chinatimes.com/rss/international-u.rss'),
|
||||
(u'兩岸', u'http://rss.chinatimes.com/rss/mainland-u.rss'),
|
||||
(u'地方', u'http://rss.chinatimes.com/rss/local-u.rss'),
|
||||
(u'言論', u'http://rss.chinatimes.com/rss/comment-u.rss'),
|
||||
(u'科技', u'http://rss.chinatimes.com/rss/technology-u.rss'),
|
||||
(u'運動', u'http://rss.chinatimes.com/rss/sport-u.rss'),
|
||||
(u'藝文', u'http://rss.chinatimes.com/rss/philology-u.rss'),
|
||||
#(u'旺報', u'http://rss.chinatimes.com/rss/want-u.rss'),
|
||||
#(u'財經', u'http://rss.chinatimes.com/rss/finance-u.rss'), # broken links
|
||||
#(u'股市', u'http://rss.chinatimes.com/rss/stock-u.rss') # broken links
|
||||
]
|
||||
|
||||
__author__ = 'einstuerzende, updated by Eddie Lau'
|
||||
__version__ = '1.0'
|
||||
language = 'zh'
|
||||
publisher = 'China Times Group'
|
||||
description = 'China Times (Taiwan)'
|
||||
category = 'News, Chinese, Taiwan'
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
encoding = 'big5'
|
||||
conversion_options = {'linearize_tables':True}
|
||||
masthead_url = 'http://www.fcuaa.org/gif/chinatimeslogo.gif'
|
||||
cover_url = 'http://www.fcuaa.org/gif/chinatimeslogo.gif'
|
||||
keep_only_tags = [dict(name='div', attrs={'class':['articlebox','articlebox clearfix']})]
|
||||
remove_tags = [dict(name='div', attrs={'class':['focus-news']})]
|
||||
|
@ -1,6 +1,6 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
clarin.com
|
||||
'''
|
||||
@ -18,11 +18,18 @@ class Clarin(BasicNewsRecipe):
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
encoding = 'utf8'
|
||||
delay = 1
|
||||
language = 'es_AR'
|
||||
publication_type = 'newspaper'
|
||||
INDEX = 'http://www.clarin.com'
|
||||
masthead_url = 'http://www.clarin.com/static/CLAClarin/images/logo-clarin-print.jpg'
|
||||
extra_css = ' body{font-family: Arial,Helvetica,sans-serif} h2{font-family: Georgia,serif; font-size: xx-large} .hora{font-weight:bold} .hd p{font-size: small} .nombre-autor{color: #0F325A} '
|
||||
extra_css = """
|
||||
body{font-family: Arial,Helvetica,sans-serif}
|
||||
h2{font-family: Georgia,serif; font-size: xx-large}
|
||||
.hora{font-weight:bold}
|
||||
.hd p{font-size: small}
|
||||
.nombre-autor{color: #0F325A}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
@ -31,7 +38,9 @@ class Clarin(BasicNewsRecipe):
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
keep_only_tags = [dict(attrs={'class':['hd','mt']})]
|
||||
keep_only_tags = [dict(attrs={'class':['hd','mt']})]
|
||||
remove_tags = [dict(name=['meta','base','link'])]
|
||||
remove_attributes = ['lang','_mce_bogus']
|
||||
|
||||
feeds = [
|
||||
(u'Pagina principal', u'http://www.clarin.com/rss/' )
|
||||
@ -47,6 +56,10 @@ class Clarin(BasicNewsRecipe):
|
||||
,(u'Ciudades' , u'http://www.clarin.com/rss/ciudades/' )
|
||||
]
|
||||
|
||||
|
||||
def get_article_url(self, article):
|
||||
return article.get('guid', None)
|
||||
|
||||
def print_version(self, url):
|
||||
return url + '?print=1'
|
||||
|
||||
|
34
recipes/cosmopolitan_de.recipe
Normal file
@ -0,0 +1,34 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1305567197(BasicNewsRecipe):
|
||||
title = u'Cosmopolitan.de'
|
||||
__author__ = 'schuster'
|
||||
oldest_article = 7
|
||||
language = 'de'
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
remove_javascript = True
|
||||
cover_url = 'http://www.cosmopolitan.com/cm/shared/site_images/print_this/cosmopolitan_logo.gif'
|
||||
remove_tags_before = dict(name = 'h1', attrs={'class':'artikel'})
|
||||
remove_tags_after = dict(name ='div', attrs={'class':'morePages'})
|
||||
extra_css = '''
|
||||
h2{font-family:Arial,Helvetica,sans-serif; font-size: x-small;}
|
||||
h1{ font-family:Arial,Helvetica,sans-serif; font-size:x-large; font-weight:bold;}
|
||||
'''
|
||||
remove_tags = [ dict(id='strong'),
|
||||
dict(title='strong'),
|
||||
dict(name='span'),
|
||||
dict(name='li', attrs={'class':'large'}),
|
||||
dict(name='ul', attrs={'class':'articleImagesPortrait clearfix'}),
|
||||
dict(name='p', attrs={'class':'external'}),
|
||||
dict(name='a', attrs={'target':'_blank'}),]
|
||||
feeds = [ (u'Komplett', u'http://www.cosmopolitan.de/rss/allgemein.xml'),
|
||||
(u'Mode', u'http://www.cosmopolitan.de/rss/mode.xml'),
|
||||
(u'Beauty', u'http://www.cosmopolitan.de/rss/beauty.xml'),
|
||||
(u'Liebe&Sex', u'http://www.cosmopolitan.de/rss/liebe.xml'),
|
||||
(u'Psychologie', u'http://www.cosmopolitan.de/rss/psychologie.xml'),
|
||||
(u'Job&Karriere', u'http://www.cosmopolitan.de/rss/job.xml'),
|
||||
(u'Lifestyle', u'http://www.cosmopolitan.de/rss/lifestyle.xml'),
|
||||
(u'Shopping', u'http://www.cosmopolitan.de/rss/shopping.xml'),
|
||||
(u'Bildergalerien', u'http://www.cosmopolitan.de/rss/bildgalerien.xml')]
|
@ -61,6 +61,12 @@ class DailyTelegraph(BasicNewsRecipe):
|
||||
(u'Entertainment News', u'http://feeds.news.com.au/public/rss/2.0/dtele_entertainment_news_201.xml'),
|
||||
(u'Lifestyle News', u'http://feeds.news.com.au/public/rss/2.0/dtele_lifestyle_227.xml'),
|
||||
(u'Music', u'http://feeds.news.com.au/public/rss/2.0/dtele_music_441.xml'),
|
||||
(u'Sport',
|
||||
u'http://feeds.news.com.au/public/rss/2.0/dtele_sport_203.xml'),
|
||||
(u'Soccer',
|
||||
u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_soccer_344.xml'),
|
||||
(u'Rugby Union',
|
||||
u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_rugby_union_342.xml'),
|
||||
(u'Property Confidential', u'http://feeds.news.com.au/public/rss/2.0/dtele_property_confidential_463.xml'),
|
||||
(u'Property - Your Space', u'http://feeds.news.com.au/public/rss/2.0/dtele_property_yourspace_462.xml'),
|
||||
(u'Confidential News', u'http://feeds.news.com.au/public/rss/2.0/dtele_entertainment_confidential_252.xml'),
|
||||
|
83
recipes/der_spiegel.recipe
Normal file
@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Nikolas Mangold <nmangold at gmail.com>'
|
||||
'''
|
||||
spiegel.de
|
||||
'''
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre import strftime
|
||||
from calibre import re
|
||||
|
||||
class DerSpiegel(BasicNewsRecipe):
|
||||
title = 'Der Spiegel'
|
||||
__author__ = 'Nikolas Mangold'
|
||||
description = 'Der Spiegel, Printed Edition. Access to paid content.'
|
||||
publisher = 'SPIEGEL-VERLAG RUDOLF AUGSTEIN GMBH & CO. KG'
|
||||
category = 'news, politics, Germany'
|
||||
no_stylesheets = True
|
||||
encoding = 'cp1252'
|
||||
needs_subscription = True
|
||||
remove_empty_feeds = True
|
||||
delay = 1
|
||||
PREFIX = 'http://m.spiegel.de'
|
||||
INDEX = PREFIX + '/spiegel/print/epaper/index-heftaktuell.html'
|
||||
use_embedded_content = False
|
||||
masthead_url = 'http://upload.wikimedia.org/wikipedia/en/thumb/1/17/Der_Spiegel_logo.svg/200px-Der_Spiegel_logo.svg.png'
|
||||
language = 'de'
|
||||
publication_type = 'magazine'
|
||||
extra_css = ' body{font-family: Arial,Helvetica,sans-serif} '
|
||||
timefmt = '[%W/%Y]'
|
||||
empty_articles = ['Titelbild']
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<p>◆</p>', re.DOTALL|re.IGNORECASE), lambda match: '<hr>'),
|
||||
]
|
||||
|
||||
def get_browser(self):
|
||||
def has_login_name(form):
|
||||
try:
|
||||
form.find_control(name="f.loginName")
|
||||
except:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
if self.username is not None and self.password is not None:
|
||||
br.open(self.PREFIX + '/meinspiegel/login.html')
|
||||
br.select_form(predicate=has_login_name)
|
||||
br['f.loginName' ] = self.username
|
||||
br['f.password'] = self.password
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
remove_tags_before = dict(attrs={'class':'spArticleContent'})
|
||||
remove_tags_after = dict(attrs={'class':'spArticleCredit'})
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup(self.INDEX)
|
||||
|
||||
cover = soup.find('img', width=248)
|
||||
if cover is not None:
|
||||
self.cover_url = cover['src']
|
||||
|
||||
index = soup.find('dl')
|
||||
|
||||
feeds = []
|
||||
for section in index.findAll('dt'):
|
||||
section_title = self.tag_to_string(section).strip()
|
||||
self.log('Found section ', section_title)
|
||||
|
||||
articles = []
|
||||
for article in section.findNextSiblings(['dd','dt']):
|
||||
if article.name == 'dt':
|
||||
break
|
||||
link = article.find('a')
|
||||
title = self.tag_to_string(link).strip()
|
||||
if title in self.empty_articles:
|
||||
continue
|
||||
self.log('Found article ', title)
|
||||
url = self.PREFIX + link['href']
|
||||
articles.append({'title' : title, 'date' : strftime(self.timefmt), 'url' : url})
|
||||
feeds.append((section_title,articles))
|
||||
return feeds;
|
55
recipes/diario_ibiza.recipe
Normal file
@ -0,0 +1,55 @@
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Joan Tur, based on El Pais version by Jordi Balcells & elargentino.com version by Darko Miletic'
|
||||
description = 'Principal periodico de las islas Pitiusas, Ibiza y Formentera (Espanya) - v1.06 (29/04/2011)'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
diariodeibiza.es
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class DiarioDeIbiza(BasicNewsRecipe):
|
||||
__author__ = 'Joan Tur, cullet'
|
||||
description = 'Principal periodico de las islas Pitiusas, Ibiza y Formentera (Espanya) - v1.06 (29/04/2011)'
|
||||
|
||||
cover_url = 'http://estaticos01.diariodeibiza.es//elementosWeb/mediaweb/images/logo.jpg'
|
||||
title = u'Diario de Ibiza digital'
|
||||
publisher = u'Editorial Prensa Iberica'
|
||||
category = 'News, politics, culture, economy, general interest'
|
||||
language = 'es'
|
||||
|
||||
encoding = 'iso-8859-1'
|
||||
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 20
|
||||
|
||||
use_embedded_content = False
|
||||
recursion = 5
|
||||
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':['noticia_titular','epigrafe','subtitulo','actualizada','noticia_fecha','noticia_texto']}),
|
||||
dict(name='font', attrs={'class':['actualizada']})
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Portada de Ibiza', u'http://www.diariodeibiza.es/elementosInt/rss/1'),
|
||||
(u'Pitiuses i Balears', u'http://www.diariodeibiza.es/elementosInt/rss/2'),
|
||||
(u'Opini\xf3n', u'http://www.diariodeibiza.es/elementosInt/rss/3'),
|
||||
(u'Nacional', u'http://www.diariodeibiza.es/elementosInt/rss/4'),
|
||||
(u'Internacional', u'http://www.diariodeibiza.es/elementosInt/rss/5'),
|
||||
(u'Econom\xeda', u'http://www.diariodeibiza.es/elementosInt/rss/6'),
|
||||
(u'Deportes', u'http://www.diariodeibiza.es/elementosInt/rss/7'),
|
||||
(u'Sociedad', u'http://www.diariodeibiza.es/elementosInt/rss/8'),
|
||||
(u'Ciencia', u'http://www.diariodeibiza.es/elementosInt/rss/11'),
|
||||
(u'Tecnolog\xeda', u'http://www.diariodeibiza.es/elementosInt/rss/12'),
|
||||
(u'Gente', u'http://www.diariodeibiza.es/elementosInt/rss/13'),
|
||||
(u'Sucesos', u'http://www.diariodeibiza.es/elementosInt/rss/15'),
|
||||
(u'Cultura', u'http://www.diariodeibiza.es/elementosInt/rss/16Piti')
|
||||
]
|
||||
|
37
recipes/digizone.recipe
Normal file
@ -0,0 +1,37 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Tomas Latal<latal.tomas at gmail.com>'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class DigiZoneCZ(BasicNewsRecipe):
|
||||
title = 'DigiZone'
|
||||
__author__ = 'Tomas Latal'
|
||||
__version__ = '1.0'
|
||||
__date__ = '30 April 2011'
|
||||
description = u'Aktuality a \u010dl\xe1nky z DigiZone.cz'
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 10
|
||||
encoding = 'iso-8859-2'
|
||||
publisher = 'Internet Info s.r.o.'
|
||||
category = 'digitalni vysilani, televize, CZ'
|
||||
language = 'cs'
|
||||
publication_type = 'newsportal'
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
extra_css = 'p.perex{font-size: 1.2em; margin: 0 0 10px 0;line-height: 1.4;padding: 0 0 10px 0;font-weight: bold;} \
|
||||
p.perex img {display:none;} \
|
||||
.urs p {margin: 0 0 0.8em 0;}'
|
||||
|
||||
feeds = [
|
||||
(u'Aktuality', u'http://rss.digizone.cz/aktuality'),
|
||||
(u'\u010cl\xe1nky', u'http://rss.digizone.cz/clanky')
|
||||
]
|
||||
|
||||
remove_tags_before = dict(id=['p-article','p-actuality'])
|
||||
|
||||
remove_tags_after = dict(id=['p-article','p-actuality'])
|
||||
|
||||
remove_tags = [
|
||||
dict(attrs={'class':['path','mth','lbtr','serial','enquiry','links','dp-n','side','op-ab','op-view','op-sub','op-list',]}),
|
||||
dict(id=['opinions','discussionList','similarItems','sidebar','footer','opl','promo-box'])
|
||||
]
|
53
recipes/divahair.recipe
Normal file
@ -0,0 +1,53 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2011, Silviu Cotoar\u0103'
|
||||
'''
|
||||
divahair.ro
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class DivaHair(BasicNewsRecipe):
    """Calibre news recipe for divahair.ro (Romanian hair-styling site).

    Reads the single site feed and keeps only the ``spatiuart`` article
    container of each page.
    """

    # --- recipe identity -------------------------------------------------
    title = u'Diva Hair'
    language = 'ro'
    __author__ = u'Silviu Cotoar\u0103'
    description = u'Coafuri, frizuri, tunsori ..'
    publisher = u'Diva Hair'
    category = u'Ziare,Stiri,Coafuri,Femei'

    # --- download behaviour ----------------------------------------------
    oldest_article = 5
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    remove_javascript = True
    cover_url = 'http://www.divahair.ro/imgs/logo.jpg'

    # Metadata handed to the conversion step; reuses the attributes above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # --- page clean-up ---------------------------------------------------
    # The article container appears either as a <td> or as a <div>.
    keep_only_tags = [
        {'name': 'td', 'attrs': {'class': 'spatiuart'}},
        {'name': 'div', 'attrs': {'class': 'spatiuart'}},
    ]

    remove_tags = [
        {'name': 'div', 'attrs': {'class': 'categorie'}},
        {'name': 'div', 'attrs': {'class': 'gri gri2 detaliiart'}},
        {'name': 'div', 'attrs': {'class': 'articol_box_bottom'}},
    ]

    remove_tags_after = [
        {'name': 'div', 'attrs': {'class': 'articol_box_bottom'}},
    ]

    feeds = [(u'\u0218tiri', u'http://www.divahair.ro/feed')]

    def preprocess_html(self, soup):
        """Run the downloaded page through ``adeify_images`` and return the result."""
        processed = self.adeify_images(soup)
        return processed
|
@ -37,7 +37,7 @@ class DN_se(BasicNewsRecipe):
|
||||
,(u'Kultur' , u'http://www.dn.se/kultur-rss' )
|
||||
]
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'article'})]
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'article-content'})]
|
||||
remove_tags_before = dict(name='h1')
|
||||
remove_tags_after = dict(name='div',attrs={'id':'byline'})
|
||||
remove_tags = [
|
||||
|
@ -1,19 +1,21 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1302341394(BasicNewsRecipe):
|
||||
title = u'DvhN'
|
||||
oldest_article = 1
|
||||
__author__ = 'Reijndert'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 200
|
||||
|
||||
__author__ = 'Reijndert'
|
||||
no_stylesheets = True
|
||||
cover_url = 'http://www.dvhn.nl/template/Dagblad_v2.0/gfx/logo_DvhN.gif'
|
||||
cover_url = 'http://members.home.nl/apm.de.haas/calibre/DvhN.jpg'
|
||||
language = 'nl'
|
||||
country = 'NL'
|
||||
version = 1
|
||||
publisher = u'Dagblad van het Noorden'
|
||||
category = u'Nieuws'
|
||||
description = u'Nieuws uit Noord Nederland'
|
||||
timefmt = ' %Y-%m-%d (%a)'
|
||||
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'fullPicture'})
|
||||
@ -21,11 +23,26 @@ class AdvancedUserRecipe1302341394(BasicNewsRecipe):
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['object','link','iframe','base'])
|
||||
,dict(name='span',attrs={'class':'copyright'})
|
||||
dict(name='span',attrs={'class':'location'})
|
||||
]
|
||||
|
||||
feeds = [(u'Drenthe', u'http://www.dvhn.nl/nieuws/drenthe/index.jsp?service=rss'), (u'Groningen', u'http://www.dvhn.nl/nieuws/groningen/index.jsp?service=rss'), (u'Nederland', u'http://www.dvhn.nl/nieuws/nederland/index.jsp?service=rss'), (u'Wereld', u'http://www.dvhn.nl/nieuws/wereld/index.jsp?service=rss'), (u'Economie', u'http://www.dvhn.nl/nieuws/economie/index.jsp?service=rss'), (u'Sport', u'http://www.dvhn.nl/nieuws/sport/index.jsp?service=rss'), (u'Cultuur', u'http://www.dvhn.nl/nieuws/kunst/index.jsp?service=rss'), (u'24 Uur', u'http://www.dvhn.nl/nieuws/24uurdvhn/index.jsp?service=rss&selectiontype=last24hours')]
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<a.*?>'), lambda h1: '')
|
||||
,(re.compile(r'</a>'), lambda h2: '')
|
||||
,(re.compile(r'Word vriend van Dagblad van het Noorden op Facebook'), lambda h3: '')
|
||||
,(re.compile(r'Volg Dagblad van het Noorden op Twitter'), lambda h3: '')
|
||||
]
|
||||
|
||||
|
||||
feeds = [(u'Drenthe', u'http://www.dvhn.nl/nieuws/drenthe/index.jsp?service=rss')
|
||||
, (u'Groningen', u'http://www.dvhn.nl/nieuws/groningen/index.jsp?service=rss')
|
||||
, (u'Nederland', u'http://www.dvhn.nl/nieuws/nederland/index.jsp?service=rss')
|
||||
, (u'Wereld', u'http://www.dvhn.nl/nieuws/wereld/index.jsp?service=rss')
|
||||
, (u'Economie', u'http://www.dvhn.nl/nieuws/economie/index.jsp?service=rss')
|
||||
, (u'Sport', u'http://www.dvhn.nl/nieuws/sport/index.jsp?service=rss')
|
||||
, (u'Cultuur', u'http://www.dvhn.nl/nieuws/kunst/index.jsp?service=rss')
|
||||
, (u'24 Uur', u'http://www.dvhn.nl/nieuws/24uurdvhn/index.jsp?service=rss&selectiontype=last24hours')
|
||||
]
|
||||
|
||||
extra_css = '''
|
||||
body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
|
||||
|
@ -20,7 +20,7 @@ class Economist(BasicNewsRecipe):
|
||||
INDEX = 'http://www.economist.com/printedition'
|
||||
description = ('Global news and current affairs from a European'
|
||||
' perspective. Best downloaded on Friday mornings (GMT)')
|
||||
|
||||
extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
|
||||
oldest_article = 7.0
|
||||
cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
|
||||
remove_tags = [
|
||||
|
@ -14,14 +14,14 @@ class EcuisineRo(BasicNewsRecipe):
|
||||
__author__ = u'Silviu Cotoar\u0103'
|
||||
description = u'Reinventeaz\u0103 pl\u0103cerea de a g\u0103ti'
|
||||
publisher = 'eCuisine'
|
||||
oldest_article = 5
|
||||
oldest_article = 50
|
||||
language = 'ro'
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
category = 'Ziare,Retete,Bucatarie'
|
||||
encoding = 'utf-8'
|
||||
cover_url = ''
|
||||
cover_url = 'http://www.ecuisine.ro/sites/all/themes/ecuisine/images/logo.gif'
|
||||
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
@ -31,8 +31,8 @@ class EcuisineRo(BasicNewsRecipe):
|
||||
}
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':'page-title'})
|
||||
, dict(name='div', attrs={'class':'content clearfix'})
|
||||
dict(name='h1', attrs={'id':'page-title'})
|
||||
, dict(name='div', attrs={'class':'field-item even'})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
|
@ -31,8 +31,8 @@ class EgirlRo(BasicNewsRecipe):
|
||||
}
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'id':'title_art'})
|
||||
, dict(name='div', attrs={'class':'content_style'})
|
||||
dict(name='div', attrs={'id':'content_art'})
|
||||
, dict(name='div', attrs={'class':'content_articol'})
|
||||
]
|
||||
|
||||
feeds = [
|
||||
|
74
recipes/express_de.recipe
Normal file
@ -0,0 +1,74 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    """Calibre news recipe for Express.de (German regional news and sport).

    Downloads the RSS feeds listed in ``feeds``, strips the site chrome
    enumerated in ``remove_tags`` and turns text-only links into plain text.
    """

    title = u'Express.de'
    __author__ = 'schuster'
    oldest_article = 2
    max_articles_per_feed = 50
    no_stylesheets = True
    use_embedded_content = False
    language = 'de'
    extra_css = '''
        h2{font-family:Arial,Helvetica,sans-serif; font-size: x-small;}
        h1{ font-family:Arial,Helvetica,sans-serif; font-size:x-large; font-weight:bold;}

        '''
    remove_javascript = True

    # Was misspelled "remove_tags_befor", an attribute name nothing reads,
    # so the content preceding the "Datum" div was never trimmed.
    remove_tags_before = [dict(name='div', attrs={'class':'Datum'})]
    remove_tags_after = [dict(name='div', attrs={'class':'MoreNews'})]

    remove_tags = [
        # Navigation and header elements, matched by id.
        dict(id=['kalaydo', 'Header', 'Searchline', 'MainNav', 'Logo',
                 'MainLinkSpacer', 'MainLinks']),
        dict(title='Diese Seite Bookmarken'),
        dict(name='span'),
        # Classified-ad buttons, spacers and search widgets, matched by the
        # exact value of the div's class attribute.
        dict(name='div', attrs={'class':[
            'spacer_leftneu',
            'button kalaydologo',
            'button stellenneu',
            'button autoneu',
            'button immobilienneu',
            'button kleinanzeigen',
            'button tiereneu',
            'button ferienwohnungen',
            'button inserierenneu',
            'spacer_rightneu',
            'spacer_rightcorner',
            'HeaderMetaNav',
            'HeaderSearchOption',
            'HeaderSearch',
            'sbutton',
            'active',
        ]}),
    ]

    feeds = [
        (u'Top-Themen', u'http://www.express.de/home/-/2126/2126/-/view/asFeed/-/index.xml'),
        (u'Regional - Köln', u'http://www.express.de/regional/koeln/-/2856/2856/-/view/asFeed/-/index.xml'),
        (u'Regional - Bonn', u'http://www.express.de/regional/bonn/-/2860/2860/-/view/asFeed/-/index.xml'),
        (u'Regional - Düsseldorf', u'http://www.express.de/regional/duesseldorf/-/2858/2858/-/view/asFeed/-/index.xml'),
        (u'Regional - Region', u'http://www.express.de/regional/-/2178/2178/-/view/asFeed/-/index.xml'),
        (u'Sport-News', u'http://www.express.de/sport/-/2176/2176/-/view/asFeed/-/index.xml'),
        (u'Fussball-News', u'http://www.express.de/sport/fussball/-/3186/3186/-/view/asFeed/-/index.xml'),
        (u'1.FC Köln News', u'http://www.express.de/sport/fussball/fc-koeln/-/3192/3192/-/view/asFeed/-/index.xml'),
        (u'Alemannia Aachen News', u'http://www.express.de/sport/fussball/alemannia/-/3290/3290/-/view/asFeed/-/index.xml'),
        (u'Borussia M~Gladbach', u'http://www.express.de/sport/fussball/gladbach/-/3286/3286/-/view/asFeed/-/index.xml'),
        (u'Fortuna D~Dorf', u'http://www.express.de/sport/fussball/fortuna/-/3292/3292/-/view/asFeed/-/index.xml'),
        (u'Basketball News', u'http://www.express.de/sport/basketball/-/3190/3190/-/view/asFeed/-/index.xml'),
        (u'Big Brother', u'http://www.express.de/news/promi-show/big-brother/-/2402/2402/-/view/asFeed/-/index.xml'),
    ]

    def preprocess_html(self, soup):
        """Replace every <a> that has a single string child with that string.

        Links whose ``.string`` is None (for example links wrapping other
        tags) are left in place. Returns the same soup object.
        """
        for alink in soup.findAll('a'):
            if alink.string is not None:
                alink.replaceWith(alink.string)
        return soup
|
@ -12,7 +12,6 @@ class AdvancedUserRecipe1301860159(BasicNewsRecipe):
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
language = 'en_EN'
|
||||
remove_javascript = True
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'modSectionTd2'})]
|
||||
remove_tags = [dict(name='a'),dict(name='hr')]
|
||||
|
@ -1,51 +1,38 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008-2009, Kovid Goyal <kovid at kovidgoyal.net>, Darko Miletic <darko at gmail.com>'
|
||||
'''
|
||||
Profile to download FAZ.net
|
||||
'''
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class FazNet(BasicNewsRecipe):
|
||||
title = 'FAZ NET'
|
||||
__author__ = 'Kovid Goyal, Darko Miletic'
|
||||
title = u'Faz.net'
|
||||
__author__ = 'schuster'
|
||||
remove_tags = [dict(attrs={'class':['right', 'ArrowLinkRight', 'ModulVerlagsInfo', 'left', 'Head']}),
|
||||
dict(id=['BreadCrumbs', 'tstag', 'FazFooterPrint']),
|
||||
dict(name=['script', 'noscript', 'style'])]
|
||||
oldest_article = 2
|
||||
description = 'Frankfurter Allgemeine Zeitung'
|
||||
publisher = 'FAZ Electronic Media GmbH'
|
||||
category = 'news, politics, Germany'
|
||||
use_embedded_content = False
|
||||
language = 'de'
|
||||
|
||||
max_articles_per_feed = 30
|
||||
no_stylesheets = True
|
||||
encoding = 'utf-8'
|
||||
remove_javascript = True
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment', description
|
||||
, '--category', category
|
||||
, '--publisher', publisher
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'Article'})]
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['object','link','embed','base'])
|
||||
,dict(name='div', attrs={'class':['LinkBoxModulSmall','ModulVerlagsInfo']})
|
||||
]
|
||||
|
||||
|
||||
feeds = [ ('FAZ.NET', 'http://www.faz.net/s/Rub/Tpl~Epartner~SRss_.xml') ]
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
language = 'de'
|
||||
remove_javascript = True
|
||||
cover_url = 'http://www.faz.net/f30/Images/Logos/logo.gif'
|
||||
|
||||
def print_version(self, url):
|
||||
article, sep, rest = url.partition('?')
|
||||
return article.replace('.html', '~Afor~Eprint.html')
|
||||
return url.replace('.html', '~Afor~Eprint.html')
|
||||
|
||||
|
||||
|
||||
feeds = [(u'Politik', u'http://www.faz.net/s/RubA24ECD630CAE40E483841DB7D16F4211/Tpl~Epartner~SRss_.xml'),
|
||||
(u'Wirtschaft', u'http://www.faz.net/s/RubC9401175958F4DE28E143E68888825F6/Tpl~Epartner~SRss_.xml'),
|
||||
(u'Feuilleton', u'http://www.faz.net/s/RubCC21B04EE95145B3AC877C874FB1B611/Tpl~Epartner~SRss_.xml'),
|
||||
(u'Sport', u'http://www.faz.net/s/Rub9F27A221597D4C39A82856B0FE79F051/Tpl~Epartner~SRss_.xml'),
|
||||
(u'Gesellschaft', u'http://www.faz.net/s/Rub02DBAA63F9EB43CEB421272A670A685C/Tpl~Epartner~SRss_.xml'),
|
||||
(u'Finanzen', u'http://www.faz.net/s/Rub4B891837ECD14082816D9E088A2D7CB4/Tpl~Epartner~SRss_.xml'),
|
||||
(u'Wissen', u'http://www.faz.net/s/Rub7F4BEE0E0C39429A8565089709B70C44/Tpl~Epartner~SRss_.xml'),
|
||||
(u'Reise', u'http://www.faz.net/s/RubE2FB5CA667054BDEA70FB3BC45F8D91C/Tpl~Epartner~SRss_.xml'),
|
||||
(u'Technik & Motor', u'http://www.faz.net/s/Rub01E4D53776494844A85FDF23F5707AD8/Tpl~Epartner~SRss_.xml'),
|
||||
(u'Beruf & Chance', u'http://www.faz.net/s/RubB1E10A8367E8446897468EDAA6EA0504/Tpl~Epartner~SRss_.xml'),
|
||||
(u'Kunstmarkt', u'http://www.faz.net/s/RubBC09F7BF72A2405A96718ECBFB68FBFE/Tpl~Epartner~SRss_.xml'),
|
||||
(u'Immobilien ', u'http://www.faz.net/s/RubFED172A9E10F46B3A5F01B02098C0C8D/Tpl~Epartner~SRss_.xml'),
|
||||
(u'Rhein-Main Zeitung', u'http://www.faz.net/s/RubABE881A6669742C2A5EBCB5D50D7EBEE/Tpl~Epartner~SRss_.xml'),
|
||||
(u'Atomdebatte ', u'http://www.faz.net/s/Rub469C43057F8C437CACC2DE9ED41B7950/Tpl~Epartner~SRss_.xml')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
|
||||
soup.head.insert(0,mtag)
|
||||
del soup.body['onload']
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return soup
|
||||
|
@ -53,6 +53,7 @@ class FinancialTimes(BasicNewsRecipe):
|
||||
feeds = [
|
||||
(u'UK' , u'http://www.ft.com/rss/home/uk' )
|
||||
,(u'US' , u'http://www.ft.com/rss/home/us' )
|
||||
,(u'Europe' , u'http://www.ft.com/rss/home/europe' )
|
||||
,(u'Asia' , u'http://www.ft.com/rss/home/asia' )
|
||||
,(u'Middle East', u'http://www.ft.com/rss/home/middleeast')
|
||||
]
|
||||
|
64
recipes/financialsense.recipe
Normal file
@ -0,0 +1,64 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.financialsense.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class FinancialSense(BasicNewsRecipe):
    """Calibre news recipe for www.financialsense.com.

    Reads the single "Articles" feed, keeps only the title, metadata,
    content and author-card blocks of each page, and flattens links.
    """

    title = 'Financial Sense'
    __author__ = 'Darko Miletic'
    description = 'Uncommon News & Views for the Wise Investor'
    publisher = 'Financial Sense'
    category = 'news, finances, politics, USA'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    remove_empty_feeds = True
    publication_type = 'newsportal'
    masthead_url = 'http://www.financialsense.com/sites/default/files/logo.jpg'
    extra_css = """
        body{font-family: Arial,"Helvetica Neue",Helvetica,sans-serif }
        img{margin-bottom: 0.4em; display:block}
        h2{color: gray}
        .name{margin-right: 5em}
        """

    # The key was 'comment'; the other recipes in this change set pass the
    # description under 'comments', so the same key is used here.
    conversion_options = {
        'comments': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    remove_tags = [dict(name=['meta','link','base','object','embed','iframe'])]
    remove_tags_after = dict(attrs={'class':'vcard'})
    keep_only_tags = [dict(attrs={'class':['title','post-meta','content','item-title','vcard']})]
    remove_attributes = ['lang','type']

    feeds = [(u'Articles', u'http://feeds.feedburner.com/fso')]

    def preprocess_html(self, soup):
        """Strip inline styles, flatten links and give every image an alt text.

        * every ``style`` attribute is deleted;
        * an <a> with a single string child is replaced by that string;
        * an <a> containing an <img> is renamed to <div> and loses its attributes;
        * any other <a> is replaced by ``self.tag_to_string`` of itself;
        * an <img> without ``alt`` gets ``alt="image"``.

        Returns the same soup object.
        """
        for item in soup.findAll(style=True):
            del item['style']

        for link in soup.findAll('a'):
            image = link.find('img')
            if link.string is not None:
                # Local renamed from "str" so the builtin is not shadowed.
                link.replaceWith(link.string)
            elif image:
                link.name = 'div'
                link.attrs = []
            else:
                link.replaceWith(self.tag_to_string(link))

        for img in soup.findAll('img'):
            if not img.has_key('alt'):
                img['alt'] = 'image'
        return soup
|
@ -1,5 +1,5 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
foxnews.com
|
||||
'''
|
||||
@ -23,6 +23,7 @@ class FoxNews(BasicNewsRecipe):
|
||||
extra_css = """
|
||||
body{font-family: Arial,sans-serif }
|
||||
.caption{font-size: x-small}
|
||||
.author,.dateline{font-size: small}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
@ -34,12 +35,12 @@ class FoxNews(BasicNewsRecipe):
|
||||
|
||||
remove_attributes = ['xmlns','lang']
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['object','embed','link','script','iframe','meta','base'])
|
||||
,dict(attrs={'class':['user-control','url-description','ad-context']})
|
||||
]
|
||||
remove_tags=[
|
||||
dict(attrs={'class':['user-control','logo','ad-300x250','url-description']})
|
||||
,dict(name=['meta','base','link','iframe','object','embed'])
|
||||
]
|
||||
|
||||
remove_tags_before=dict(name='h1')
|
||||
keep_only_tags=[dict(attrs={'id':'article-print'})]
|
||||
remove_tags_after =dict(attrs={'class':'url-description'})
|
||||
|
||||
feeds = [
|
||||
@ -55,3 +56,24 @@ class FoxNews(BasicNewsRecipe):
|
||||
|
||||
def print_version(self, url):
|
||||
return url + 'print'
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for item in soup.findAll('a'):
|
||||
limg = item.find('img')
|
||||
if item.string is not None:
|
||||
str = item.string
|
||||
item.replaceWith(str)
|
||||
else:
|
||||
if limg:
|
||||
item.name = 'div'
|
||||
item.attrs = []
|
||||
else:
|
||||
str = self.tag_to_string(item)
|
||||
item.replaceWith(str)
|
||||
for item in soup.findAll('img'):
|
||||
if not item.has_key('alt'):
|
||||
item['alt'] = 'image'
|
||||
return soup
|
||||
|
@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2010, Tomasz Dlugosz <tomek3d@gmail.com>'
|
||||
__copyright__ = u'2010-2011, Tomasz Dlugosz <tomek3d@gmail.com>'
|
||||
'''
|
||||
frazpc.pl
|
||||
'''
|
||||
@ -19,17 +19,20 @@ class FrazPC(BasicNewsRecipe):
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
|
||||
feeds = [(u'Aktualno\u015bci', u'http://www.frazpc.pl/feed'), (u'Recenzje', u'http://www.frazpc.pl/kat/recenzje-2/feed') ]
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'FRAZ_CONTENT'})]
|
||||
|
||||
remove_tags = [dict(name='p', attrs={'class':'gray tagsP fs11'})]
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[(r'<div id="post-[0-9]*"', lambda match: '<div id="FRAZ_CONTENT"'),
|
||||
(r'href="/f/news/', lambda match: 'href="http://www.frazpc.pl/f/news/'),
|
||||
(r' <a href="http://www.frazpc.pl/[^>]*?">(Skomentuj|Komentarz(e)?\([0-9]*\))</a> \|', lambda match: '')]
|
||||
feeds = [
|
||||
(u'Aktualno\u015bci', u'http://www.frazpc.pl/feed/aktualnosci'),
|
||||
(u'Artyku\u0142y', u'http://www.frazpc.pl/feed/artykuly')
|
||||
]
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'article'})]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':'title-wrapper'}),
|
||||
dict(name='p', attrs={'class':'tags'}),
|
||||
dict(name='p', attrs={'class':'article-links'}),
|
||||
dict(name='div', attrs={'class':'comments_box'})
|
||||
]
|
||||
|
||||
preprocess_regexps = [(re.compile(r'\| <a href="#comments">Komentarze \([0-9]*\)</a>'), lambda match: '')]
|
||||
|
||||
remove_attributes = [ 'width', 'height' ]
|
||||
|
@ -21,14 +21,19 @@ class Fronda(BasicNewsRecipe):
|
||||
|
||||
feeds = [(u'Infformacje', u'http://fronda.pl/news/feed')]
|
||||
|
||||
keep_only_tags = [dict(name='h1', attrs={'class':'big'}),
|
||||
dict(name='ul', attrs={'class':'about clear'}),
|
||||
dict(name='div', attrs={'class':'content'})]
|
||||
keep_only_tags = [dict(name='h2', attrs={'class':'news_title'}),
|
||||
dict(name='div', attrs={'class':'naglowek_tresc'}),
|
||||
dict(name='div', attrs={'id':'czytaj'}) ]
|
||||
|
||||
remove_tags = [dict(name='a', attrs={'class':'print'})]
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[ (r'<a href="#" class="print">Drukuj</a>', lambda match: ''),
|
||||
(r'<p><a href="http://fronda.pl/sklepy">.*</a></p>', lambda match: ''),
|
||||
[ (r'<p><a href="http://fronda.pl/sklepy">.*</a></p>', lambda match: ''),
|
||||
(r'<p><a href="http://fronda.pl/pasaz">.*</a></p>', lambda match: ''),
|
||||
(r'<h3><strong>W.* lektury.*</a></p></div>', lambda match: '</div>'),
|
||||
(r'<h3>Zobacz t.*?</div>', lambda match: '</div>') ]
|
||||
(r'<h3>Zobacz t.*?</div>', lambda match: '</div>'),
|
||||
(r'<p[^>]*> </p>', lambda match: ''),
|
||||
(r'<p><span style=".*?"><br /></span></p> ', lambda match: ''),
|
||||
(r'<a style=\'float:right;margin-top:3px;\' href="http://www.facebook.com/share.php?.*?</a>', lambda match: '')]
|
||||
]
|
||||
|
53
recipes/gazeta-prawna-calibre-v1.recipe
Normal file
@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2011, Vroo <vroobelek@iq.pl>'
|
||||
__author__ = u'Vroo'
|
||||
'''
|
||||
gazetaprawna.pl
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class gazetaprawna(BasicNewsRecipe):
    """Calibre news recipe for gazetaprawna.pl (Polish business daily).

    Articles are fetched through a rewritten URL built by ``print_version``.
    """

    version = 1
    title = u'Gazeta Prawna'
    __author__ = u'Vroo'
    publisher = u'Infor Biznes'
    description = 'Polski dziennik gospodarczy'
    language = 'pl'
    encoding = 'utf-8'

    oldest_article = 7
    max_articles_per_feed = 20
    no_stylesheets = True
    remove_javascript = True

    remove_tags_after = [
        {'name': 'div', 'attrs': {'class': ['data-art']}},
    ]
    remove_tags = [
        {'name': 'div', 'attrs': {'class': ['dodatki_artykulu', 'data-art']}},
    ]

    feeds = [
        (u'Wiadomo\u015bci - najwa\u017cniejsze', u'http://www.gazetaprawna.pl/wiadomosci/najwazniejsze/rss.xml'),
        (u'Biznes i prawo gospodarcze', u'http://biznes.gazetaprawna.pl/rss.xml'),
        (u'Prawo i wymiar sprawiedliwo\u015bci', u'http://prawo.gazetaprawna.pl/rss.xml'),
        (u'Praca i ubezpieczenia', u'http://praca.gazetaprawna.pl/rss.xml'),
        (u'Podatki i rachunkowo\u015b\u0107', u'http://podatki.gazetaprawna.pl/rss.xml'),
    ]

    def print_version(self, url):
        """Return ``url`` after applying a fixed series of substring replacements.

        Section names are replaced by 'drukowanie' and the listed subdomains
        by 'www'. The pairs are applied strictly in the order listed; note
        that 'wiadomosci/artykuly' is handled before the shorter 'artykuly'.
        """
        rewrites = (
            ('wiadomosci/artykuly', 'drukowanie'),
            ('artykuly', 'drukowanie'),
            ('porady', 'drukowanie'),
            ('wywiady', 'drukowanie'),
            ('orzeczenia', 'drukowanie'),
            ('galeria', 'drukowanie'),
            ('komentarze', 'drukowanie'),
            ('biznes.gazetaprawna', 'www.gazetaprawna'),
            ('podatki.gazetaprawna', 'www.gazetaprawna'),
            ('prawo.gazetaprawna', 'www.gazetaprawna'),
            ('praca.gazetaprawna', 'www.gazetaprawna'),
        )
        for old, new in rewrites:
            url = url.replace(old, new)
        return url
|
38
recipes/glamour.recipe
Normal file
@ -0,0 +1,38 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1305547242(BasicNewsRecipe):
    """Calibre news recipe for Glamour (US).

    Fetches the feeds listed in ``feeds``, requests each article with
    ``?printable=true`` appended, and turns text-only links into plain text.
    """

    title = u'Glamour (US)'
    oldest_article = 21
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'en'
    remove_javascript = True
    __author__ = 'Anonymous'

    # The original dict literal repeated the 'class' key, so only
    # 'printoptions' survived; a list matches either class value.
    remove_tags = [dict(name='div', attrs={'class':['articles_footer', 'printoptions']})]

    feeds = [
        (u'All Fashion', u'http://feeds.glamour.com/glamour/all_fashion'),
        (u'All Beauty', u'http://feeds.glamour.com/glamour/all_beauty'),
        (u'All Sex, Love & Life', u'http://feeds.glamour.com/glamour/sex_love_life'),
        (u'All Health & Fitness', u'http://feeds.glamour.com/glamour/health_fitness'),
        (u'Shopping', u'http://feeds.glamour.com/glamour/shopping'),
        (u'Slaves to Fashion blog', u'http://feeds.glamour.com/glamour/slavestofashion'),
        (u'The Girls in the Beauty Department', u'http://feeds.glamour.com/glamour/thegirlsinthebeautydepartment'),
        (u'Smitten blog', u'http://feeds.glamour.com/glamour/smitten'),
        # 'Save the Date' was listed twice; one entry is enough.
        (u'Save the Date', u'http://feeds.feedburner.com/glamour/save-the-date'),
        (u'Single-ish blog', u'http://feeds.glamour.com/glamour/glamoursingle-ish'),
        (u'Vitamin G blog', u'http://feeds.glamour.com/glamour/vitamin-g'),
        (u'Margarita Shapes Up blog', u'http://feeds.glamour.com/glamour/margaritashapesup'),
        (u'Little Miss Fortune blog', u'http://feeds.glamour.com/glamour/little-miss-fortune'),
    ]

    def print_version(self, url):
        """Return ``url`` with '?printable=true' appended."""
        return url + '?printable=true'

    def preprocess_html(self, soup):
        """Replace every <a> that has a single string child with that string.

        Links whose ``.string`` is None are left untouched. Returns the same
        soup object.
        """
        for alink in soup.findAll('a'):
            if alink.string is not None:
                alink.replaceWith(alink.string)
        return soup
|
@ -1,83 +1,70 @@
|
||||
#!/usr/bin/env python
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class golem_ger(BasicNewsRecipe):
|
||||
title = u'Golem.de'
|
||||
language = 'de'
|
||||
__author__ = 'Kovid Goyal'
|
||||
__author__ = 'schuster'
|
||||
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
language = 'de'
|
||||
lang = 'de-DE'
|
||||
no_stylesheets = True
|
||||
encoding = 'iso-8859-1'
|
||||
recursions = 1
|
||||
match_regexps = [r'http://www.golem.de/.*.html']
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='h1', attrs={'class':'artikelhead'}),
|
||||
dict(name='p', attrs={'class':'teaser'}),
|
||||
dict(name='div', attrs={'class':'artikeltext'}),
|
||||
dict(name='h2', attrs={'id':'artikelhead'}),
|
||||
]
|
||||
|
||||
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'id':['similarContent','topContentWrapper','storycarousel','aboveFootPromo','comments','toolbar','breadcrumbs','commentlink','sidebar','rightColumn']}),
|
||||
dict(name='div', attrs={'class':['gg_embeddedSubText','gg_embeddedIndex gg_solid','gg_toOldGallery','golemGallery']}),
|
||||
dict(name='img', attrs={'class':['gg_embedded','gg_embeddedIconRight gg_embeddedIconFS gg_cursorpointer']}),
|
||||
dict(name='td', attrs={'class':['xsmall']}),
|
||||
]
|
||||
|
||||
|
||||
# remove_tags_after = [
|
||||
# dict(name='div', attrs={'id':['contentad2']})
|
||||
# ]
|
||||
|
||||
|
||||
feeds = [
|
||||
(u'Golem.de', u'http://rss.golem.de/rss.php?feed=ATOM1.0'),
|
||||
(u'Audio/Video', u'http://rss.golem.de/rss.php?tp=av&feed=RSS2.0'),
|
||||
(u'Foto', u'http://rss.golem.de/rss.php?tp=foto&feed=RSS2.0'),
|
||||
(u'Games', u'http://rss.golem.de/rss.php?tp=games&feed=RSS2.0'),
|
||||
(u'Internet', u'http://rss.golem.de/rss.php?tp=inet&feed=RSS1.0'),
|
||||
(u'Mobil', u'http://rss.golem.de/rss.php?tp=mc&feed=ATOM1.0'),
|
||||
(u'Internet', u'http://rss.golem.de/rss.php?tp=inet&feed=RSS1.0'),
|
||||
(u'Politik/Recht', u'http://rss.golem.de/rss.php?tp=pol&feed=ATOM1.0'),
|
||||
(u'Desktop-Applikationen', u'http://rss.golem.de/rss.php?tp=apps&feed=RSS2.0'),
|
||||
(u'Software-Entwicklung', u'http://rss.golem.de/rss.php?tp=dev&feed=RSS2.0'),
|
||||
(u'Wirtschaft', u'http://rss.golem.de/rss.php?tp=wirtschaft&feed=RSS2.0'),
|
||||
(u'Hardware', u'http://rss.golem.de/rss.php?r=hw&feed=RSS2.0'),
|
||||
(u'Software', u'http://rss.golem.de/rss.php?r=sw&feed=RSS2.0'),
|
||||
(u'Networld', u'http://rss.golem.de/rss.php?r=nw&feed=RSS2.0'),
|
||||
(u'Entertainment', u'http://rss.golem.de/rss.php?r=et&feed=RSS2.0'),
|
||||
(u'TK', u'http://rss.golem.de/rss.php?r=tk&feed=RSS2.0'),
|
||||
(u'E-Commerce', u'http://rss.golem.de/rss.php?r=ec&feed=RSS2.0'),
|
||||
(u'Unternehmen/Maerkte', u'http://rss.golem.de/rss.php?r=wi&feed=RSS2.0')
|
||||
]
|
||||
|
||||
|
||||
|
||||
|
||||
feeds = [
|
||||
(u'Golem.de', u'http://rss.golem.de/rss.php?feed=ATOM1.0'),
|
||||
(u'Mobil', u'http://rss.golem.de/rss.php?tp=mc&feed=feed=RSS2.0'),
|
||||
(u'OSS', u'http://rss.golem.de/rss.php?tp=oss&feed=RSS2.0'),
|
||||
(u'Politik/Recht', u'http://rss.golem.de/rss.php?tp=pol&feed=RSS2.0'),
|
||||
(u'Desktop-Applikationen', u'http://rss.golem.de/rss.php?tp=apps&feed=RSS2.0'),
|
||||
(u'Software-Entwicklung', u'http://rss.golem.de/rss.php?tp=dev&feed=RSS2.0'),
|
||||
]
|
||||
|
||||
|
||||
max_articles_per_feed = 10
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
language = 'de'
|
||||
cover_url = 'http://www.e-energy.de/images/logo_golem.jpg'
|
||||
masthead_url = 'http://www.golem.de/staticrl/images/logo.png'
|
||||
extra_css = '''
|
||||
h1 {color:#0066CC;font-family:Arial,Helvetica,sans-serif; font-size:30px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:20px;margin-bottom:2 em;}
|
||||
h2 {color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:22px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:16px; }
|
||||
h3 {color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:x-small; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal; line-height:5px;}
|
||||
h4 {color:#333333; font-family:Arial,Helvetica,sans-serif;font-size:13px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:13px; }
|
||||
h5 {color:#333333; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:11px; text-transform:uppercase;}
|
||||
.teaser {font-style:italic;font-size:12pt;margin-bottom:15pt;}
|
||||
.xsmall{font-style:italic;font-size:x-small;}
|
||||
.td{font-style:italic;font-size:x-small;}
|
||||
img {align:left;}
|
||||
h2{font-family:Arial,Helvetica,sans-serif; font-size: x-small;}
|
||||
h1{ font-family:Arial,Helvetica,sans-serif; font-size:x-large; font-weight:bold;}
|
||||
|
||||
'''
|
||||
remove_javascript = True
|
||||
remove_tags_befor = [dict(name='header', attrs={'class':'cluster-header'})]
|
||||
remove_tags_after = [dict(name='p', attrs={'class':'meta'})]
|
||||
remove_tags = [dict(rel='nofollow'),
|
||||
dict(name='header', attrs={'id':'header'}),
|
||||
dict(name='div', attrs={'class':'dh1'}),
|
||||
dict(name='label', attrs={'class':'implied'}),
|
||||
dict(name='section', attrs={'id':'comments'}),
|
||||
dict(name='li', attrs={'class':'gg_prebackcounterItem'}),
|
||||
dict(name='li', attrs={'class':'gg_prebackcounterItem gg_embeddedIndexCounter'}),
|
||||
dict(name='img', attrs={'class':'gg_embeddedIconRight gg_embeddedIconFS gg_cursorpointer'}),
|
||||
dict(name='div', attrs={'target':'_blank'})
|
||||
]
|
||||
|
||||
def get_browser(self, *args, **kwargs):
    """Return a calibre browser whose user agent is forced to 'mozilla'.

    Any positional and keyword arguments are passed through to
    ``calibre.browser``; a caller-supplied ``user_agent`` is overwritten.
    """
    from calibre import browser as make_browser
    kwargs.update(user_agent='mozilla')
    return make_browser(*args, **kwargs)
|
||||
|
||||
def get_article_url(self, article):
    """Return the article's 'id', else its 'guid', else None."""
    fallback = article.get('guid', None)
    return article.get('id', fallback)
|
||||
|
||||
def preprocess_html(self, soup):
    """Replace every <a> that has a ``.string`` with that string.

    Anchors whose ``.string`` is None are left untouched. The same soup
    object is returned after being modified in place.
    """
    for anchor in soup.findAll('a'):
        label = anchor.string
        if label is None:
            continue
        anchor.replaceWith(label)
    return soup
|
||||
|
||||
# (section title, RSS URL) pairs. The first group uses the ``tp=`` query
# parameter, the second group uses ``r=``.
feeds = [
    (u'Audio/Video', u'http://rss.golem.de/rss.php?tp=av&feed=RSS2.0'),
    (u'Foto', u'http://rss.golem.de/rss.php?tp=foto&feed=RSS2.0'),
    (u'Games', u'http://rss.golem.de/rss.php?tp=games&feed=RSS2.0'),
    (u'Handy', u'http://rss.golem.de/rss.php?tp=handy&feed=RSS2.0'),
    (u'Internet', u'http://rss.golem.de/rss.php?tp=inet&feed=RSS2.0'),
    (u'Mobile', u'http://rss.golem.de/rss.php?tp=mc&feed=RSS2.0'),
    (u'OSS', u'http://rss.golem.de/rss.php?tp=oss&feed=RSS2.0'),
    (u'Politik/Recht', u'http://rss.golem.de/rss.php?tp=pol&feed=RSS2.0'),
    (u'Security', u'http://rss.golem.de/rss.php?tp=sec&feed=RSS2.0'),
    (u'Desktop-Applikationen', u'http://rss.golem.de/rss.php?tp=apps&feed=RSS2.0'),
    (u'Software-Entwicklung', u'http://rss.golem.de/rss.php?tp=dev&feed=RSS2.0'),
    (u'Wirtschaft', u'http://rss.golem.de/rss.php?tp=wirtschaft&feed=RSS2.0'),

    (u'Hardware', u'http://rss.golem.de/rss.php?r=hw&feed=RSS2.0'),
    (u'Software', u'http://rss.golem.de/rss.php?r=sw&feed=RSS2.0'),
    (u'Networld', u'http://rss.golem.de/rss.php?r=nw&feed=RSS2.0'),
    (u'Entertainment', u'http://rss.golem.de/rss.php?r=et&feed=RSS2.0'),
    (u'TK', u'http://rss.golem.de/rss.php?r=tk&feed=RSS2.0'),
    (u'Wirtschaft', u'http://rss.golem.de/rss.php?r=wi&feed=RSS2.0'),
    (u'E-Commerce', u'http://rss.golem.de/rss.php?r=ec&feed=RSS2.0'),
]
|
||||
|
||||
|
31
recipes/good_house_keeping.recipe
Normal file
@ -0,0 +1,31 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1305547242(BasicNewsRecipe):
    """Recipe for the Good House Keeping RSS feeds (print-page variant)."""

    title = u'Good House Keeping'
    __author__ = 'Anonymous'
    language = 'en'

    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    feeds = [
        (u'Recipes & Entertaining', u'http://www.goodhousekeeping.com/food/food-rss/?src=rss'),
        (u'Home & House', u'http://www.goodhousekeeping.com/home/home-rss/?src=rss'),
        (u'Diet & Health', u'http://www.goodhousekeeping.com/health/health-rss/?src=rss'),
        (u'Beauty & Style', u'http://www.goodhousekeeping.com/beauty/beauty-rss/?src=rss'),
        (u'Family & Pets', u'http://www.goodhousekeeping.com/family/family-rss/?src=rss'),
        (u'Saving Money', u'http://www.goodhousekeeping.com/money/money-rss/?src=rss'),
    ]

    def print_version(self,url):
        """Build the print URL from an article URL.

        The first three '/'-separated pieces (scheme, empty, host) are kept,
        '/print-this/' is inserted, and everything from the fifth piece
        onwards is appended. The fourth piece is dropped.
        """
        pieces = url.split('/')
        site_root = '/'.join(pieces[0:3])
        remainder = '/'.join(pieces[4:])
        return site_root + '/print-this/' + remainder

    def preprocess_html(self, soup):
        """Replace each <a> that has a ``.string`` with that string."""
        for anchor in soup.findAll('a'):
            label = anchor.string
            if label is None:
                continue
            anchor.replaceWith(label)
        return soup
|
32
recipes/good_to_know.recipe
Normal file
@ -0,0 +1,32 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1305547242(BasicNewsRecipe):
    """Recipe for the Good to Know (uk) RSS feeds, using the print pages."""

    title = u'Good to Know (uk)'
    oldest_article = 14
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    remove_javascript = True
    __author__ = 'Anonymous'
    language = 'en_GB'

    # Both classes are listed in one value. The previous dict literal
    # repeated the 'class' key, so Python kept only the last entry
    # ('printoptions') and 'articles_footer' was never removed. A list of
    # class names is the form other recipes use to match any of several.
    remove_tags = [dict(name='div', attrs={'class':['articles_footer', 'printoptions']})]

    def print_version(self, url):
        """Return the print URL: the article URL with '/print/1' appended."""
        return url + '/print/1'

    def preprocess_html(self, soup):
        """Replace each <a> that has a ``.string`` with that string.

        Anchors whose ``.string`` is None are left in place.
        """
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup

    feeds = [
        (u'Family Conception Advice', u'http://www.goodtoknow.co.uk/feeds/family.rss'),
        (u'Family Health Advice', u'http://www.goodtoknow.co.uk/feeds/health.rss'),
        (u'Diet Advice', u'http://www.goodtoknow.co.uk/feeds/diet.rss'),
        (u'Food Advice', u'http://www.goodtoknow.co.uk/feeds/food.rss'),
        (u'Sex Advice', u'http://www.goodtoknow.co.uk/feeds/sex.rss'),
        (u'Easy Exercise', u'http://www.goodtoknow.co.uk/feeds/easyexercise.rss'),
        (u'Recipes', u'http://www.goodtoknow.co.uk/feeds/recipes.rss'),
        (u'Food Quick-tips', u'http://www.goodtoknow.co.uk/feeds/foodquicktips.rss'),
    ]
|
@ -1,4 +1,3 @@
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Handelsblatt(BasicNewsRecipe):
|
||||
@ -7,14 +6,11 @@ class Handelsblatt(BasicNewsRecipe):
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
cover_url = 'http://www.handelsblatt.com/images/logo/logo_handelsblatt.com.png'
|
||||
# cover_url = 'http://www.handelsblatt.com/images/logo/logo_handelsblatt.com.png'
|
||||
language = 'de'
|
||||
# keep_only_tags = []
|
||||
keep_only_tags = (dict(name = 'div', attrs = {'class': ['hcf-detail-abstract hcf-teaser ajaxify','hcf-detail','hcf-author-wrapper']}))
|
||||
# keep_only_tags.append(dict(name = 'div', attrs = {'id': 'fullText'}))
|
||||
remove_tags = [dict(name='img', attrs = {'src': 'http://www.handelsblatt.com/images/icon/loading.gif'})
|
||||
,dict(name='ul' , attrs={'class':['hcf-detail-tools']})
|
||||
]
|
||||
|
||||
remove_tags_before = dict(attrs={'class':'hcf-overline'})
|
||||
remove_tags_after = dict(attrs={'class':'hcf-footer'})
|
||||
|
||||
feeds = [
|
||||
(u'Handelsblatt Exklusiv',u'http://www.handelsblatt.com/rss/exklusiv'),
|
||||
@ -28,17 +24,16 @@ class Handelsblatt(BasicNewsRecipe):
|
||||
(u'Handelsblatt Magazin',u'http://www.handelsblatt.com/rss/magazin/'),
|
||||
(u'Handelsblatt Weblogs',u'http://www.handelsblatt.com/rss/blogs')
|
||||
]
|
||||
|
||||
extra_css = '''
|
||||
.hcf-headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:x-large;}
|
||||
.hcf-overline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:x-large;}
|
||||
.hcf-exclusive {font-family:Arial,Helvetica,sans-serif; font-style:italic;font-weight:bold; margin-right:5pt;}
|
||||
p{font-family:Arial,Helvetica,sans-serif;}
|
||||
.hcf-location-mark{font-weight:bold; margin-right:5pt;}
|
||||
.MsoNormal{font-family:Helvetica,Arial,sans-serif;}
|
||||
.hcf-author-wrapper{font-style:italic;}
|
||||
.hcf-article-date{font-size:x-small;}
|
||||
.hcf-caption {font-style:italic;font-size:small;}
|
||||
img {align:left;}
|
||||
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
|
||||
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
|
||||
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
|
||||
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
|
||||
'''
|
||||
|
||||
|
||||
def print_version(self, url):
|
||||
url = url.split('/')
|
||||
url[-1] = 'v_detail_tab_print,'+url[-1]
|
||||
url = '/'.join(url)
|
||||
return url
|
||||
|
@ -1,9 +1,6 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
|
||||
# Needed for BLOGs
|
||||
from calibre.web.feeds import Feed
|
||||
|
||||
class HBR(BasicNewsRecipe):
|
||||
|
||||
title = 'Harvard Business Review Blogs'
|
||||
@ -32,6 +29,7 @@ class HBR(BasicNewsRecipe):
|
||||
feeds = [('Blog','http://feeds.harvardbusiness.org/harvardbusiness')]
|
||||
oldest_article = 30
|
||||
max_articles_per_feed = 100
|
||||
use_embedded_content = False
|
||||
else:
|
||||
timefmt = ' [%B %Y]'
|
||||
|
||||
@ -59,9 +57,9 @@ class HBR(BasicNewsRecipe):
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser(self)
|
||||
br.open(self.LOGIN_URL)
|
||||
br.select_form(name='signInForm')
|
||||
br['signInForm:username'] = self.username
|
||||
br['signInForm:password'] = self.password
|
||||
br.select_form(name='signin-form')
|
||||
br['signin-form:username'] = self.username
|
||||
br['signin-form:password'] = self.password
|
||||
raw = br.submit().read()
|
||||
if 'My Account' not in raw:
|
||||
raise Exception('Failed to login, are you sure your username and password are correct?')
|
||||
@ -161,27 +159,13 @@ class HBR(BasicNewsRecipe):
|
||||
return startDate, endDate
|
||||
|
||||
#-------------------------------------------------------------------------------------------------
|
||||
def hbr_parse_blogs(self, feeds):
|
||||
# Do the "official" parse_feeds first
|
||||
rssFeeds = Feed()
|
||||
|
||||
# Use the PARSE_FEEDS method to get a Feeds object of the articles
|
||||
rssFeeds = BasicNewsRecipe.parse_feeds(self)
|
||||
|
||||
# Create a new feed of the right configuration and append to existing afeeds
|
||||
self.feed_to_index_append(rssFeeds[:], feeds)
|
||||
|
||||
#-------------------------------------------------------------------------------------------------
|
||||
def parse_index(self):
|
||||
if self.INCLUDE_ARTICLES == True:
|
||||
soup = self.hbr_get_toc()
|
||||
feeds = self.hbr_parse_toc(soup)
|
||||
else:
|
||||
feeds = []
|
||||
|
||||
# blog stuff
|
||||
if self.INCLUDE_BLOGS == True:
|
||||
self.hbr_parse_blogs(feeds)
|
||||
return BasicNewsRecipe.parse_index(self)
|
||||
|
||||
return feeds
|
||||
#-------------------------------------------------------------------------------------------------
|
||||
|
BIN
recipes/icons/autobild.png
Normal file
After Width: | Height: | Size: 614 B |
BIN
recipes/icons/babyonline.png
Normal file
After Width: | Height: | Size: 256 B |
BIN
recipes/icons/divahair.png
Normal file
After Width: | Height: | Size: 675 B |
BIN
recipes/icons/financialsense.png
Normal file
After Width: | Height: | Size: 702 B |
BIN
recipes/icons/iprofesional.png
Normal file
After Width: | Height: | Size: 1.1 KiB |
BIN
recipes/icons/mayra.png
Normal file
After Width: | Height: | Size: 620 B |
BIN
recipes/icons/moldovaazi.png
Normal file
After Width: | Height: | Size: 243 B |
BIN
recipes/icons/newsmoldova.png
Normal file
After Width: | Height: | Size: 837 B |
BIN
recipes/icons/novistandard.png
Normal file
After Width: | Height: | Size: 1.1 KiB |
BIN
recipes/icons/osnews_pl.png
Normal file
After Width: | Height: | Size: 1006 B |
BIN
recipes/icons/replicavedetelor.png
Normal file
After Width: | Height: | Size: 709 B |
BIN
recipes/icons/rmf24_opinie.png
Normal file
After Width: | Height: | Size: 722 B |
BIN
recipes/icons/rzeczpospolita.png
Normal file
After Width: | Height: | Size: 1.2 KiB |
BIN
recipes/icons/socialdiva.png
Normal file
After Width: | Height: | Size: 1.0 KiB |
BIN
recipes/icons/swiatkindle.png
Normal file
After Width: | Height: | Size: 425 B |
BIN
recipes/icons/ziuaveche.png
Normal file
After Width: | Height: | Size: 554 B |
@ -1,71 +1,65 @@
|
||||
#!/usr/bin/env python
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Lorenzo Vigentini & Edwin van Maastrigt'
|
||||
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com> and Edwin van Maastrigt <evanmaastrigt at gmail.com>'
|
||||
__description__ = 'Financial news daily paper - v1.02 (30, January 2010)'
|
||||
__author__ = 'Marco Saraceno'
|
||||
__copyright__ = '2010, Marco Saraceno <marcosaraceno at gmail.com>'
|
||||
description = 'Italian daily newspaper - v 1.1 (Mar14,2011)'
|
||||
|
||||
'''
|
||||
http://www.ilsole24ore.com/
|
||||
http://www.ilsole24ore.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class IlSole24Ore(BasicNewsRecipe):
|
||||
__author__ = 'Marco Saraceno'
|
||||
description = 'Italian financial daily newspaper'
|
||||
|
||||
class ilsole24Ore(BasicNewsRecipe):
|
||||
author = 'Lorenzo Vigentini & Edwin van Maastrigt'
|
||||
description = 'Financial news daily paper'
|
||||
|
||||
cover_url = 'http://www.ilsole24ore.com/img2007/print_header.gif'
|
||||
|
||||
title = u'il Sole 24 Ore New'
|
||||
publisher = 'italiaNews'
|
||||
category = 'News, finance, economy, politics'
|
||||
cover_url = 'http://www.shopping24.ilsole24ore.com/ProductRelated/rds/img/logo_sole.gif'
|
||||
title = u'Il Sole 24 Ore'
|
||||
publisher = 'Gruppo editoriale GRUPPO 24ORE'
|
||||
category = 'News, politics, culture, economy, financial, Italian'
|
||||
|
||||
language = 'it'
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 50
|
||||
max_articles_per_feed = 100
|
||||
use_embedded_content = False
|
||||
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
|
||||
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':['header','titolo']}),
|
||||
dict(name='table', attrs={'class':['footer1024','footerdown']}),
|
||||
]
|
||||
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
||||
def get_article_url(self, article):
|
||||
return article.get('id', article.get('guid', None))
|
||||
link = article.get('link', None)
|
||||
if link is None:
|
||||
return article
|
||||
if link.split('/')[-1]=="story01.htm":
|
||||
link=link.split('/')[-2]
|
||||
a=['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A']
|
||||
b=['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.com','www.','0']
|
||||
for i in range(0,len(a)):
|
||||
link=link.replace(a[i],b[i])
|
||||
link="http://"+link
|
||||
return link
|
||||
|
||||
feeds = [
|
||||
(u'Notizie Italia', u'http://www.ilsole24ore.com/rss/notizie/italia.xml'),
|
||||
(u'Notizie Europa', u'http://www.ilsole24ore.com/rss/notizie/europa.xml'),
|
||||
(u'Notizie USA', u'http://www.ilsole24ore.com/rss/notizie/usa.xml'),
|
||||
(u'Notizie Americhe', u'http://www.ilsole24ore.com/rss/notizie/americhe.xml'),
|
||||
(u'Notizie Medio Oriente e Africa', u'http://www.ilsole24ore.com/rss/notizie/medio-oriente-e-africa.xml'),
|
||||
(u'Notizie Asia e Oceania', u'http://www.ilsole24ore.com/rss/notizie/asia-e-oceania.xml'),
|
||||
(u'Commenti', u'http://www.ilsole24ore.com/rss/commenti-e-idee.xml'),
|
||||
(u'Norme e tributi', u'http://www.ilsole24ore.com/rss/norme-e-tributi.xml'),
|
||||
(u'Finanza', u'http://www.ilsole24ore.com/rss/finanza-e-mercati.xml'),
|
||||
(u'Economia', u'http://www.ilsole24ore.com/rss/economia.xml'),
|
||||
(u'Tecnologia', u'http://www.ilsole24ore.com/rss/tecnologie.xml'),
|
||||
(u'Cultura', u'http://www.ilsole24ore.com/rss/cultura.xml'),
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
link, sep, params = url.rpartition('?')
|
||||
if link is None:
|
||||
return link.replace('_1.php', '_php')
|
||||
return link.replace('.shtml', '_PRN.shtml')
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':'txt'})
|
||||
]
|
||||
# remove_tags = [dict(name='br')]
|
||||
|
||||
feeds = [
|
||||
(u'Prima pagina', u'http://www.ilsole24ore.com/rss/primapagina.xml'),
|
||||
(u'Norme e tributi', u'http://www.ilsole24ore.com/rss/norme-tributi.xml'),
|
||||
(u'Finanza e mercati', u'http://www.ilsole24ore.com/rss/finanza-mercati.xml'),
|
||||
(u'Economia e lavoro', u'http://www.ilsole24ore.com/rss/economia-lavoro.xml'),
|
||||
(u'Italia', u'http://www.ilsole24ore.com/rss/italia.xml'),
|
||||
(u'Mondo', u'http://www.ilsole24ore.com/rss/mondo.xml'),
|
||||
(u'Tecnologia e business', u'http://www.ilsole24ore.com/rss/tecnologia-business.xml'),
|
||||
(u'Cultura e tempo libero', u'http://www.ilsole24ore.com/rss/tempolibero-cultura.xml'),
|
||||
(u'Sport', u'http://www.ilsole24ore.com/rss/sport.xml'),
|
||||
(u'Professionisti 24', u'http://www.ilsole24ore.com/rss/prof_home.xml'),
|
||||
(u'Ambiente e Sicurezza',u'http://www.ilsole24ore.com/rss/prof_as.xml')
|
||||
]
|
||||
|
||||
extra_css = '''
|
||||
html, body, table, tr, td, h1, h2, h3, h4, h5, h6, p, a, span, br, img {margin:0;padding:0;border:0;font-size:12px;font-family:"Georgia","Times New Roman";}
|
||||
.linkHighlight {color:#0292c6;}
|
||||
.txt {border-bottom:1px solid #7c7c7c;padding-bottom:20px};text-align:justify;font-family:"serif"}
|
||||
.txt p {line-height:18px;}
|
||||
.txt span {line-height:22px;}
|
||||
.title h3 {color:#7b7b7b;}
|
||||
.title h4 {color:#08526e;font-size:26px;font-family:"Times New Roman";font-weight:normal;}
|
||||
'''
|
||||
return url.replace('.shtml', '_PRN.shtml')
|
||||
|
||||
|
79
recipes/iprofesional.recipe
Normal file
@ -0,0 +1,79 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.iprofesional.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class iProfesional(BasicNewsRecipe):
    """Recipe for www.iprofesional.com, fed from its FeedBurner RSS feeds."""

    title = 'iProfesional.com'
    __author__ = 'Darko Miletic'
    description = 'Las ultimas noticias sobre profesionales'
    publisher = 'Emprendimientos Corporativos S.A.'
    category = 'news, IT, impuestos, negocios, politics, Argentina'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'es_AR'
    remove_empty_feeds = True
    # Was 'nesportal', a typo for the 'newsportal' value the other recipes use.
    publication_type = 'newsportal'
    masthead_url = 'http://www.iprofesional.com/img/logo-iprofesional.png'
    extra_css = """
        body{font-family: Arial,Helvetica,sans-serif }
        img{margin-bottom: 0.4em; display:block}
        .titulo-interior{font-family: Georgia,"Times New Roman",Times,serif}
        .autor-nota{font-size: small; font-weight: bold; font-style: italic; color: gray}
    """

    # Metadata handed to the conversion step, reusing the values above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    keep_only_tags = [dict(attrs={'class':['fecha','interior-nota']})]
    remove_tags = [
        dict(name=['meta','link','base','embed','object','iframe']),
        dict(attrs={'class':['menu-imprimir','guardarNota','IN-widget','fin','permalink']}),
    ]
    remove_attributes = ['lang','xmlns:og','xmlns:fb']

    feeds = [
        (u'Ultimas noticias', u'http://feeds.feedburner.com/iprofesional-principales-noticias'),
        (u'Finanzas', u'http://feeds.feedburner.com/iprofesional-finanzas'),
        (u'Impuestos', u'http://feeds.feedburner.com/iprofesional-impuestos'),
        (u'Negocios', u'http://feeds.feedburner.com/iprofesional-economia'),
        (u'Comercio Exterior', u'http://feeds.feedburner.com/iprofesional-comercio-exterior'),
        (u'Tecnologia', u'http://feeds.feedburner.com/iprofesional-tecnologia'),
        (u'Management', u'http://feeds.feedburner.com/iprofesional-managment'),
        (u'Marketing', u'http://feeds.feedburner.com/iprofesional-marketing'),
        (u'Legales', u'http://feeds.feedburner.com/iprofesional-legales'),
        (u'Autos', u'http://feeds.feedburner.com/iprofesional-autos'),
        (u'Vinos', u'http://feeds.feedburner.com/iprofesional-vinos-bodegas'),
    ]

    def preprocess_html(self, soup):
        """Normalise inline styles, links and images; return the same soup.

        * every ``style`` attribute is deleted;
        * an <a> with a ``.string`` is replaced by that string;
        * an <a> without one becomes an attribute-less <div> when it
          contains an <img>, otherwise it is replaced by its text;
        * every <img> lacking ``alt`` gets ``alt="image"``.
        """
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                # Local renamed from ``str`` so the builtin is not shadowed.
                text = item.string
                item.replaceWith(text)
            elif limg:
                # Keep the image: turn the link into a bare container.
                item.name = 'div'
                item.attrs = []
            else:
                text = self.tag_to_string(item)
                item.replaceWith(text)
        for item in soup.findAll('img'):
            if not item.has_key('alt'):
                item['alt'] = 'image'
        return soup
|
@ -16,7 +16,7 @@ class Jezebel(BasicNewsRecipe):
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
encoding = 'utf-8'
|
||||
use_embedded_content = False
|
||||
use_embedded_content = True
|
||||
language = 'en'
|
||||
masthead_url = 'http://cache.gawkerassets.com/assets/jezebel.com/img/logo.png'
|
||||
extra_css = '''
|
||||
@ -32,13 +32,12 @@ class Jezebel(BasicNewsRecipe):
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
remove_attributes = ['width','height']
|
||||
keep_only_tags = [dict(attrs={'class':'content permalink'})]
|
||||
remove_tags_before = dict(name='h1')
|
||||
remove_tags = [dict(attrs={'class':'contactinfo'})]
|
||||
remove_tags_after = dict(attrs={'class':'contactinfo'})
|
||||
feeds = [(u'Articles', u'http://feeds.gawker.com/jezebel/vip?format=xml')]
|
||||
|
||||
remove_tags = [
|
||||
{'class': 'feedflare'},
|
||||
]
|
||||
|
||||
feeds = [(u'Articles', u'http://feeds.gawker.com/jezebel/full')]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
return self.adeify_images(soup)
|
||||
|
36
recipes/korea_herald.recipe
Normal file
@ -0,0 +1,36 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Seongkyoun Yoo <Seongkyoun.yoo at gmail.com>'
|
||||
'''
|
||||
Profile to download KoreaHerald
|
||||
'''
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class KoreaHerald(BasicNewsRecipe):
    """Recipe for the Korea Herald RSS feeds."""

    title = u'KoreaHerald'
    __author__ = 'Seongkyoun Yoo'
    description = u'Korea Herald News articles'
    language = 'en'

    oldest_article = 10
    max_articles_per_feed = 10
    recursions = 3
    no_stylesheets = True

    # Keep only the elements with these ids.
    keep_only_tags = [dict(id=['contentLeft', '_article'])]

    # Drop iframes plus the listed div and ul classes.
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class':['left','htit2', 'navigation','banner_txt','banner_img']}),
        dict(name='ul', attrs={'class':['link_icon', 'flow_icon','detailTextAD110113']}),
    ]

    feeds = [
        ('All News','http://www.koreaherald.com/rss/020000000000.xml'),
        ('National','http://www.koreaherald.com/rss/020100000000.xml'),
        ('Business','http://www.koreaherald.com/rss/020200000000.xml'),
        ('Life&Style','http://www.koreaherald.com/rss/020300000000.xml'),
        ('Entertainment','http://www.koreaherald.com/rss/020400000000.xml'),
        ('Sports','http://www.koreaherald.com/rss/020500000000.xml'),
        ('Opinion','http://www.koreaherald.com/rss/020600000000.xml'),
        ('English Cafe','http://www.koreaherald.com/rss/021000000000.xml'),
    ]
|
@ -16,7 +16,7 @@ class Kotaku(BasicNewsRecipe):
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
encoding = 'utf-8'
|
||||
use_embedded_content = False
|
||||
use_embedded_content = True
|
||||
language = 'en'
|
||||
masthead_url = 'http://cache.gawkerassets.com/assets/kotaku.com/img/logo.png'
|
||||
extra_css = '''
|
||||
@ -31,13 +31,12 @@ class Kotaku(BasicNewsRecipe):
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
remove_attributes = ['width','height']
|
||||
keep_only_tags = [dict(attrs={'class':'content permalink'})]
|
||||
remove_tags_before = dict(name='h1')
|
||||
remove_tags = [dict(attrs={'class':'contactinfo'})]
|
||||
remove_tags_after = dict(attrs={'class':'contactinfo'})
|
||||
feeds = [(u'Articles', u'http://feeds.gawker.com/kotaku/vip?format=xml')]
|
||||
|
||||
remove_tags = [
|
||||
{'class': 'feedflare'},
|
||||
]
|
||||
|
||||
feeds = [(u'Articles', u'http://feeds.gawker.com/kotaku/full')]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
return self.adeify_images(soup)
|
||||
|
@ -48,7 +48,7 @@ class LeMonde(BasicNewsRecipe):
|
||||
if alink.string is not None:
|
||||
tstr = alink.string
|
||||
alink.replaceWith(tstr)
|
||||
return soup
|
||||
return self.adeify_images(soup)
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'([0-9])%'), lambda m: m.group(1) + ' %'),
|
||||
|
44
recipes/liberty_times.recipe
Normal file
@ -0,0 +1,44 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
__license__ = 'GPL v3'
|
||||
# dug from http://www.mobileread.com/forums/showthread.php?p=1012294
|
||||
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1277443634(BasicNewsRecipe):
    """Recipe for the Liberty Times (Taiwan) RSS feeds."""

    # --- identification ---------------------------------------------------
    title = u'自由電子報'
    __author__ = 'einstuerzende, updated by Eddie Lau'
    __version__ = '1.1'
    publisher = 'Liberty Times Group'
    description = 'Liberty Times (Taiwan)'
    category = 'News, Chinese, Taiwan'
    language = 'zh'

    # --- fetching ---------------------------------------------------------
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    encoding = 'big5'

    # --- clean-up and presentation ----------------------------------------
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables':True}
    keep_only_tags = [dict(name='td', attrs={'id':['newsContent']})]
    extra_css = '''span[class='insubject1'][id='newtitle'] {font-size:200%; font-weight:bold;}'''
    masthead_url = 'http://www.libertytimes.com.tw/2008/images/img_auto/005/logo_new.gif'
    cover_url = 'http://www.libertytimes.com.tw/2008/images/img_auto/005/logo_new.gif'

    # --- sources ----------------------------------------------------------
    feeds = [
        (u'焦點新聞', u'http://www.libertytimes.com.tw/rss/fo.xml'),
        (u'政治新聞', u'http://www.libertytimes.com.tw/rss/p.xml'),
        (u'生活新聞', u'http://www.libertytimes.com.tw/rss/life.xml'),
        (u'國際新聞', u'http://www.libertytimes.com.tw/rss/int.xml'),
        (u'自由廣場', u'http://www.libertytimes.com.tw/rss/o.xml'),
        (u'社會新聞', u'http://www.libertytimes.com.tw/rss/so.xml'),
        (u'體育新聞', u'http://www.libertytimes.com.tw/rss/sp.xml'),
        (u'財經焦點', u'http://www.libertytimes.com.tw/rss/e.xml'),
        (u'證券理財', u'http://www.libertytimes.com.tw/rss/stock.xml'),
        (u'影視焦點', u'http://www.libertytimes.com.tw/rss/show.xml'),
        (u'北部新聞', u'http://www.libertytimes.com.tw/rss/north.xml'),
        (u'中部新聞', u'http://www.libertytimes.com.tw/rss/center.xml'),
        (u'南部新聞', u'http://www.libertytimes.com.tw/rss/south.xml'),
        (u'大台北新聞', u'http://www.libertytimes.com.tw/rss/taipei.xml'),
        (u'藝術文化', u'http://www.libertytimes.com.tw/rss/art.xml'),
    ]
|
||||
|
37
recipes/lupa.recipe
Normal file
@ -0,0 +1,37 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Tomas Latal<latal.tomas at gmail.com>'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class LupaCZ(BasicNewsRecipe):
    """Recipe for the Lupa.cz news and article feeds."""

    title = 'Lupa'
    __author__ = 'Tomas Latal'
    __version__ = '1.0'
    __date__ = '30 April 2011'
    description = u'Zpr\xe1vi\u010dky a \u010dl\xe1nky z Lupa.cz'
    oldest_article = 2
    max_articles_per_feed = 10
    encoding = 'utf8'
    publisher = 'Internet Info s.r.o.'
    category = 'IT,news,CZ'
    language = 'cs'
    publication_type = 'newsportal'
    no_stylesheets = True
    remove_javascript = True

    # Adjacent literals are joined by the compiler into a single CSS string.
    extra_css = (
        'p.perex{font-size: 1.2em;margin: 0 0 10px 0;line-height: 1.4;padding: 0 0 10px 0;font-weight: bold;} '
        'p.perex img {display:none;} '
        '.urs p {margin: 0 0 0.8em 0;}'
    )

    feeds = [
        (u'Zpr\xe1vi\u010dky', u'http://rss.lupa.cz/zpravicky'),
        (u'\u010cl\xe1nky', u'http://rss.lupa.cz/clanky'),
    ]

    # Both trimming rules are anchored on the element with id "main".
    remove_tags_before = dict(id='main')
    remove_tags_after = [dict(id='main')]

    remove_tags = [
        dict(attrs={'class':[
            'author clear',
            'tags-rubrics',
            'box border style1 links clear',
            'enquiry clear',
            'serial',
            'box border style1 TitleList',
            'breadcrumb clear',
            'article-discussion box border style1 monitoringComponentArticle',
            'link-more border prev-next clear',
        ]}),
        dict(id=['discussionList','similarItems','sidebar','footer','opl','promo-box']),
    ]
|
22
recipes/max_planck.recipe
Normal file
@ -0,0 +1,22 @@
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    """Recipe for the Max-Planck-Inst. research feed, using print pages."""

    title = u'Max-Planck-Inst.'
    __author__ = 'schuster'
    language = 'de'

    oldest_article = 30
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    feeds = [(u'Forschung', u'http://www.mpg.de/de/forschung.rss')]

    remove_tags = [
        dict(attrs={'class':[
            'clearfix', 'lens', 'col2_box_list',
            'col2_box_teaser group_ext no_print', 'dotted_line',
            'col2_box_teaser', 'box_image small', 'bold',
            'col2_box_teaser no_print', 'print_kontakt',
        ]}),
        dict(id=['ie_clearing', 'col2', 'col2_content']),
        dict(name=['script', 'noscript', 'style']),
    ]

    def print_version(self, url):
        """Return 'http://www.mpg.de/print/' plus the URL's fourth '/'-separated piece."""
        return 'http://www.mpg.de/print/' + url.split("/")[3]
|
||||
|
51
recipes/mayra.recipe
Normal file
@ -0,0 +1,51 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2011, Silviu Cotoar\u0103'
|
||||
'''
|
||||
mayra.ro
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Mayra(BasicNewsRecipe):
    """Recipe for mayra.ro, fed from the site's RSS feed."""

    title = u'Mayra'
    __author__ = u'Silviu Cotoar\u0103'
    description = u'Traieste urban, cool, sexy'
    publisher = 'Mayra'
    category = 'Ziare,Stiri,Reviste'
    language = 'ro'
    encoding = 'utf-8'
    cover_url = 'http://img.konkurs.ro/img/concursuri-cu-premii/147/14672_front.jpg'

    oldest_article = 5
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    # Metadata handed to the conversion step, reusing the values above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    feeds = [(u'\u0218tiri', u'http://www.mayra.ro/rss')]

    keep_only_tags = [dict(name='div', attrs={'id':'article_details'})]

    remove_tags = [
        dict(name='div', attrs={'id':'LikePluginPagelet'}),
        dict(name='p', attrs={'id':'tags'}),
        dict(name='span', attrs={'id':'tweet-button'}),
    ]

    remove_tags_after = [dict(name='div', attrs={'id':'LikePluginPagelet'})]

    def preprocess_html(self, soup):
        """Return the soup after passing it through ``adeify_images``."""
        return self.adeify_images(soup)
|
10
recipes/mens_health.recipe
Normal file
@ -0,0 +1,10 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1305636254(BasicNewsRecipe):
    """Recipe for the Mens Health (US) health-headlines feed."""

    title = u'Mens Health (US)'
    __author__ = 'Anonymous'
    language = 'en'

    oldest_article = 14
    max_articles_per_feed = 100

    feeds = [
        (u'News', u'http://blogs.menshealth.com/health-headlines/feed'),
    ]
|
37
recipes/mesec.recipe
Normal file
@ -0,0 +1,37 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Tomas Latal<latal.tomas at gmail.com>'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class MesecCZ(BasicNewsRecipe):
    """Recipe for the Mesec.cz news and article feeds."""

    title = u'M\u011b\u0161ec'
    __author__ = 'Tomas Latal'
    __version__ = '1.0'
    __date__ = '30 April 2011'
    description = u'Zpr\xe1vi\u010dky a \u010dl\xe1nky z Mesec.cz'
    oldest_article = 1
    max_articles_per_feed = 10
    encoding = 'utf8'
    publisher = 'Internet Info s.r.o.'
    category = 'finance,CZ'
    language = 'cs'
    publication_type = 'newsportal'
    no_stylesheets = True
    remove_javascript = True

    # Adjacent literals are joined by the compiler into a single CSS string.
    extra_css = (
        'p.perex{font-size: 1.2em;margin: 0 0 10px 0;line-height: 1.4;padding: 0 0 10px 0;font-weight: bold;} '
        'p.perex img {display:none;} '
        '.urs p {margin: 0 0 0.8em 0;}'
    )

    feeds = [
        (u'Aktuality', u'http://www.mesec.cz/rss/aktuality/'),
        (u'\u010cl\xe1nky', u'http://www.mesec.cz/rss/clanky/'),
    ]

    # Both trimming rules are anchored on the element with id "main".
    remove_tags_before = dict(id='main')
    remove_tags_after = [dict(id='main')]

    remove_tags = [
        dict(attrs={'class':[
            'author clear',
            'tags-rubrics',
            'box border style1 links clear',
            'enquiry clear',
            'serial',
            'box border style1 TitleList',
            'breadcrumb clear',
            'article-discussion box border style1 monitoringComponentArticle',
            'link-more border prev-next clear',
        ]}),
        dict(id=['discussionList','similarItems','sidebar','footer','opl','promo-box']),
    ]
|
@ -1,15 +1,18 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010-2011, Eddie Lau'
|
||||
|
||||
# Users of Kindle 3 (with limited system-level CJK support)
|
||||
# Users of Kindle 3 with limited system-level CJK support
|
||||
# please replace the following "True" with "False".
|
||||
__MakePeriodical__ = True
|
||||
# Turn it to True if your device supports display of CJK titles
|
||||
# Turn below to true if your device supports display of CJK titles
|
||||
__UseChineseTitle__ = False
|
||||
|
||||
# Turn below to True if you wish to use life.mingpao.com as the main article source
|
||||
__UseLife__ = True
|
||||
|
||||
'''
|
||||
Change Log:
|
||||
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
|
||||
2011/03/06: add new articles for finance section, also a new section "Columns"
|
||||
2011/02/28: rearrange the sections
|
||||
[Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles
|
||||
@ -32,41 +35,43 @@ import os, datetime, re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from contextlib import nested
|
||||
|
||||
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
|
||||
class MPHKRecipe(BasicNewsRecipe):
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
__author__ = 'Eddie Lau'
|
||||
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
||||
publisher = 'MingPao'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
language = 'zh'
|
||||
encoding = 'Big5-HKSCS'
|
||||
recursions = 0
|
||||
conversion_options = {'linearize_tables':True}
|
||||
timefmt = ''
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
|
||||
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
||||
keep_only_tags = [dict(name='h1'),
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
__author__ = 'Eddie Lau'
|
||||
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
||||
publisher = 'MingPao'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
language = 'zh'
|
||||
encoding = 'Big5-HKSCS'
|
||||
recursions = 0
|
||||
conversion_options = {'linearize_tables':True}
|
||||
timefmt = ''
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
|
||||
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
||||
keep_only_tags = [dict(name='h1'),
|
||||
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
|
||||
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
|
||||
dict(attrs={'id':['newscontent']}), # entertainment and column page content
|
||||
dict(attrs={'id':['newscontent01','newscontent02']}),
|
||||
dict(attrs={'class':['photo']})
|
||||
dict(attrs={'class':['photo']}),
|
||||
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
|
||||
]
|
||||
remove_tags = [dict(name='style'),
|
||||
dict(attrs={'id':['newscontent135']}), # for the finance page
|
||||
dict(name='table')] # for content fetched from life.mingpao.com
|
||||
remove_attributes = ['width']
|
||||
preprocess_regexps = [
|
||||
remove_tags = [dict(name='style'),
|
||||
dict(attrs={'id':['newscontent135']}), # for the finance page from mpfinance.com
|
||||
dict(name='table')] # for content fetched from life.mingpao.com
|
||||
remove_attributes = ['width']
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '<h1>'),
|
||||
(re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
|
||||
@ -80,10 +85,10 @@ class MPHKRecipe(BasicNewsRecipe):
|
||||
lambda match: "</b>")
|
||||
]
|
||||
|
||||
def image_url_processor(cls, baseurl, url):
|
||||
# trick: break the url at the first occurance of digit, add an additional
|
||||
# '_' at the front
|
||||
# not working, may need to move this to preprocess_html() method
|
||||
def image_url_processor(cls, baseurl, url):
|
||||
# trick: break the url at the first occurance of digit, add an additional
|
||||
# '_' at the front
|
||||
# not working, may need to move this to preprocess_html() method
|
||||
# minIdx = 10000
|
||||
# i0 = url.find('0')
|
||||
# if i0 >= 0 and i0 < minIdx:
|
||||
@ -115,314 +120,357 @@ class MPHKRecipe(BasicNewsRecipe):
|
||||
# i9 = url.find('9')
|
||||
# if i9 >= 0 and i9 < minIdx:
|
||||
# minIdx = i9
|
||||
return url
|
||||
return url
|
||||
|
||||
def get_dtlocal(self):
|
||||
dt_utc = datetime.datetime.utcnow()
|
||||
# convert UTC to local hk time - at around HKT 6.00am, all news are available
|
||||
dt_local = dt_utc - datetime.timedelta(-2.0/24)
|
||||
return dt_local
|
||||
def get_dtlocal(self):
|
||||
dt_utc = datetime.datetime.utcnow()
|
||||
# convert UTC to local hk time - at around HKT 6.00am, all news are available
|
||||
dt_local = dt_utc - datetime.timedelta(-2.0/24)
|
||||
return dt_local
|
||||
|
||||
def get_fetchdate(self):
|
||||
return self.get_dtlocal().strftime("%Y%m%d")
|
||||
def get_fetchdate(self):
|
||||
return self.get_dtlocal().strftime("%Y%m%d")
|
||||
|
||||
def get_fetchformatteddate(self):
|
||||
return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||
def get_fetchformatteddate(self):
|
||||
return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||
|
||||
def get_fetchday(self):
|
||||
# convert UTC to local hk time - at around HKT 6.00am, all news are available
|
||||
return self.get_dtlocal().strftime("%d")
|
||||
def get_fetchday(self):
|
||||
# dt_utc = datetime.datetime.utcnow()
|
||||
# convert UTC to local hk time - at around HKT 6.00am, all news are available
|
||||
# dt_local = dt_utc - datetime.timedelta(-2.0/24)
|
||||
return self.get_dtlocal().strftime("%d")
|
||||
|
||||
def get_cover_url(self):
|
||||
cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
try:
|
||||
br.open(cover)
|
||||
except:
|
||||
cover = None
|
||||
return cover
|
||||
def get_cover_url(self):
|
||||
cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
try:
|
||||
br.open(cover)
|
||||
except:
|
||||
cover = None
|
||||
return cover
|
||||
|
||||
def parse_index(self):
|
||||
feeds = []
|
||||
dateStr = self.get_fetchdate()
|
||||
def parse_index(self):
|
||||
feeds = []
|
||||
dateStr = self.get_fetchdate()
|
||||
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
||||
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
if __UseLife__:
|
||||
for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
|
||||
(u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
|
||||
(u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
|
||||
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr', 'nal'),
|
||||
(u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalfa', 'nal'),
|
||||
(u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalca', 'nal'),
|
||||
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
|
||||
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
|
||||
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
|
||||
articles = self.parse_section2(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# special- editorial
|
||||
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
||||
if ed_articles:
|
||||
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
else:
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
||||
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
||||
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
# special- editorial
|
||||
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
|
||||
if ed_articles:
|
||||
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
|
||||
|
||||
# special - finance
|
||||
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
||||
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
||||
if fin_articles:
|
||||
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||
for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
||||
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
# special - finance
|
||||
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
||||
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
|
||||
if fin_articles:
|
||||
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||
|
||||
# special - entertainment
|
||||
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||
if ent_articles:
|
||||
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||
for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
# special - entertainment
|
||||
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||
if ent_articles:
|
||||
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
# special- columns
|
||||
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
|
||||
if col_articles:
|
||||
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
|
||||
# special- columns
|
||||
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
|
||||
if col_articles:
|
||||
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
|
||||
|
||||
return feeds
|
||||
return feeds
|
||||
|
||||
def parse_section(self, url):
|
||||
dateStr = self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
divs.reverse()
|
||||
for i in divs:
|
||||
a = i.find('a', href = True)
|
||||
title = self.tag_to_string(a)
|
||||
url = a.get('href', False)
|
||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||
if url not in included_urls and url.rfind('Redirect') == -1:
|
||||
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
# parse from news.mingpao.com
|
||||
def parse_section(self, url):
|
||||
dateStr = self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
divs.reverse()
|
||||
for i in divs:
|
||||
a = i.find('a', href = True)
|
||||
title = self.tag_to_string(a)
|
||||
url = a.get('href', False)
|
||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||
if url not in included_urls and url.rfind('Redirect') == -1:
|
||||
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def parse_ed_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1):
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
# parse from life.mingpao.com
|
||||
def parse_section2(self, url, keystr):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def parse_fin_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href= True)
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
#url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
#if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
|
||||
if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1):
|
||||
title = self.tag_to_string(i)
|
||||
current_articles.append({'title': title, 'url': url, 'description':''})
|
||||
included_urls.append(url)
|
||||
return current_articles
|
||||
def parse_ed_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1):
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def parse_ent_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1):
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
def parse_fin_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href= True)
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
#url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
#if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
|
||||
if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1):
|
||||
title = self.tag_to_string(i)
|
||||
current_articles.append({'title': title, 'url': url, 'description':''})
|
||||
included_urls.append(url)
|
||||
return current_articles
|
||||
|
||||
def parse_col_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
def parse_ent_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1):
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for item in soup.findAll(style=True):
|
||||
del item['width']
|
||||
for item in soup.findAll(stype=True):
|
||||
del item['absmiddle']
|
||||
return soup
|
||||
def parse_col_section(self, url):
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
a.reverse()
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
for i in a:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def create_opf(self, feeds, dir=None):
|
||||
if dir is None:
|
||||
dir = self.output_dir
|
||||
if __UseChineseTitle__ == True:
|
||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||
else:
|
||||
title = self.short_title()
|
||||
# if not generating a periodical, force date to apply in title
|
||||
if __MakePeriodical__ == False:
|
||||
title = title + ' ' + self.get_fetchformatteddate()
|
||||
if True:
|
||||
mi = MetaInformation(title, [self.publisher])
|
||||
mi.publisher = self.publisher
|
||||
mi.author_sort = self.publisher
|
||||
if __MakePeriodical__ == True:
|
||||
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
else:
|
||||
mi.publication_type = self.publication_type+':'+self.short_title()
|
||||
#mi.timestamp = nowf()
|
||||
mi.timestamp = self.get_dtlocal()
|
||||
mi.comments = self.description
|
||||
if not isinstance(mi.comments, unicode):
|
||||
mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||
#mi.pubdate = nowf()
|
||||
mi.pubdate = self.get_dtlocal()
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
opf = OPFCreator(dir, mi)
|
||||
# Add mastheadImage entry to <guide> section
|
||||
mp = getattr(self, 'masthead_path', None)
|
||||
if mp is not None and os.access(mp, os.R_OK):
|
||||
from calibre.ebooks.metadata.opf2 import Guide
|
||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||
ref.type = 'masthead'
|
||||
ref.title = 'Masthead Image'
|
||||
opf.guide.append(ref)
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for item in soup.findAll(style=True):
|
||||
del item['width']
|
||||
for item in soup.findAll(stype=True):
|
||||
del item['absmiddle']
|
||||
return soup
|
||||
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||
def create_opf(self, feeds, dir=None):
|
||||
if dir is None:
|
||||
dir = self.output_dir
|
||||
if __UseChineseTitle__ == True:
|
||||
title = u'\u660e\u5831 (\u9999\u6e2f)'
|
||||
else:
|
||||
title = self.short_title()
|
||||
# if not generating a periodical, force date to apply in title
|
||||
if __MakePeriodical__ == False:
|
||||
title = title + ' ' + self.get_fetchformatteddate()
|
||||
if True:
|
||||
mi = MetaInformation(title, [self.publisher])
|
||||
mi.publisher = self.publisher
|
||||
mi.author_sort = self.publisher
|
||||
if __MakePeriodical__ == True:
|
||||
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
|
||||
else:
|
||||
mi.publication_type = self.publication_type+':'+self.short_title()
|
||||
#mi.timestamp = nowf()
|
||||
mi.timestamp = self.get_dtlocal()
|
||||
mi.comments = self.description
|
||||
if not isinstance(mi.comments, unicode):
|
||||
mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||
#mi.pubdate = nowf()
|
||||
mi.pubdate = self.get_dtlocal()
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
opf = OPFCreator(dir, mi)
|
||||
# Add mastheadImage entry to <guide> section
|
||||
mp = getattr(self, 'masthead_path', None)
|
||||
if mp is not None and os.access(mp, os.R_OK):
|
||||
from calibre.ebooks.metadata.opf2 import Guide
|
||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||
ref.type = 'masthead'
|
||||
ref.title = 'Masthead Image'
|
||||
opf.guide.append(ref)
|
||||
|
||||
# Get cover
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is None:
|
||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||
if self.default_cover(pf):
|
||||
cpath = pf.name
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
manifest.append(cpath)
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||
|
||||
# Get masthead
|
||||
mpath = getattr(self, 'masthead_path', None)
|
||||
if mpath is not None and os.access(mpath, os.R_OK):
|
||||
manifest.append(mpath)
|
||||
# Get cover
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is None:
|
||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||
if self.default_cover(pf):
|
||||
cpath = pf.name
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
manifest.append(cpath)
|
||||
|
||||
opf.create_manifest_from_files_in(manifest)
|
||||
for mani in opf.manifest:
|
||||
if mani.path.endswith('.ncx'):
|
||||
mani.id = 'ncx'
|
||||
if mani.path.endswith('mastheadImage.jpg'):
|
||||
mani.id = 'masthead-image'
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
# Get masthead
|
||||
mpath = getattr(self, 'masthead_path', None)
|
||||
if mpath is not None and os.access(mpath, os.R_OK):
|
||||
manifest.append(mpath)
|
||||
|
||||
def feed_index(num, parent):
|
||||
f = feeds[num]
|
||||
for j, a in enumerate(f):
|
||||
if getattr(a, 'downloaded', False):
|
||||
adir = 'feed_%d/article_%d/'%(num, j)
|
||||
auth = a.author
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = a.text_summary
|
||||
if not desc:
|
||||
desc = None
|
||||
else:
|
||||
desc = self.description_limiter(desc)
|
||||
entries.append('%sindex.html'%adir)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
|
||||
opf.create_manifest_from_files_in(manifest)
|
||||
for mani in opf.manifest:
|
||||
if mani.path.endswith('.ncx'):
|
||||
mani.id = 'ncx'
|
||||
if mani.path.endswith('mastheadImage.jpg'):
|
||||
mani.id = 'masthead-image'
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
def feed_index(num, parent):
|
||||
f = feeds[num]
|
||||
for j, a in enumerate(f):
|
||||
if getattr(a, 'downloaded', False):
|
||||
adir = 'feed_%d/article_%d/'%(num, j)
|
||||
auth = a.author
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = a.text_summary
|
||||
if not desc:
|
||||
desc = None
|
||||
else:
|
||||
desc = self.description_limiter(desc)
|
||||
entries.append('%sindex.html'%adir)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
|
||||
play_order=po, author=auth, description=desc)
|
||||
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
||||
for sp in a.sub_pages:
|
||||
prefix = os.path.commonprefix([opf_path, sp])
|
||||
relp = sp[len(prefix):]
|
||||
entries.append(relp.replace(os.sep, '/'))
|
||||
last = sp
|
||||
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
||||
for sp in a.sub_pages:
|
||||
prefix = os.path.commonprefix([opf_path, sp])
|
||||
relp = sp[len(prefix):]
|
||||
entries.append(relp.replace(os.sep, '/'))
|
||||
last = sp
|
||||
|
||||
if os.path.exists(last):
|
||||
with open(last, 'rb') as fi:
|
||||
src = fi.read().decode('utf-8')
|
||||
soup = BeautifulSoup(src)
|
||||
body = soup.find('body')
|
||||
if body is not None:
|
||||
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
|
||||
templ = self.navbar.generate(True, num, j, len(f),
|
||||
if os.path.exists(last):
|
||||
with open(last, 'rb') as fi:
|
||||
src = fi.read().decode('utf-8')
|
||||
soup = BeautifulSoup(src)
|
||||
body = soup.find('body')
|
||||
if body is not None:
|
||||
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
|
||||
templ = self.navbar.generate(True, num, j, len(f),
|
||||
not self.has_single_feed,
|
||||
a.orig_url, self.publisher, prefix=prefix,
|
||||
center=self.center_navbar)
|
||||
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
|
||||
body.insert(len(body.contents), elem)
|
||||
with open(last, 'wb') as fi:
|
||||
fi.write(unicode(soup).encode('utf-8'))
|
||||
if len(feeds) == 0:
|
||||
raise Exception('All feeds are empty, aborting.')
|
||||
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
|
||||
body.insert(len(body.contents), elem)
|
||||
with open(last, 'wb') as fi:
|
||||
fi.write(unicode(soup).encode('utf-8'))
|
||||
if len(feeds) == 0:
|
||||
raise Exception('All feeds are empty, aborting.')
|
||||
|
||||
if len(feeds) > 1:
|
||||
for i, f in enumerate(feeds):
|
||||
entries.append('feed_%d/index.html'%i)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
auth = getattr(f, 'author', None)
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = getattr(f, 'description', None)
|
||||
if not desc:
|
||||
desc = None
|
||||
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
|
||||
if len(feeds) > 1:
|
||||
for i, f in enumerate(feeds):
|
||||
entries.append('feed_%d/index.html'%i)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
auth = getattr(f, 'author', None)
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = getattr(f, 'description', None)
|
||||
if not desc:
|
||||
desc = None
|
||||
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
|
||||
f.title, play_order=po, description=desc, author=auth))
|
||||
|
||||
else:
|
||||
entries.append('feed_%d/index.html'%0)
|
||||
feed_index(0, toc)
|
||||
else:
|
||||
entries.append('feed_%d/index.html'%0)
|
||||
feed_index(0, toc)
|
||||
|
||||
for i, p in enumerate(entries):
|
||||
entries[i] = os.path.join(dir, p.replace('/', os.sep))
|
||||
opf.create_spine(entries)
|
||||
opf.set_toc(toc)
|
||||
for i, p in enumerate(entries):
|
||||
entries[i] = os.path.join(dir, p.replace('/', os.sep))
|
||||
opf.create_spine(entries)
|
||||
opf.set_toc(toc)
|
||||
|
||||
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
|
||||
opf.render(opf_file, ncx_file)
|
||||
|
||||
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
|
||||
opf.render(opf_file, ncx_file)
|
||||
|
50
recipes/moldovaazi.recipe
Normal file
@ -0,0 +1,50 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2011, Silviu Cotoar\u0103'
|
||||
'''
|
||||
azi.md
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class MoldovaAzi(BasicNewsRecipe):
    """Recipe for Moldova Azi (azi.md), built from the site's Romanian RSS feed."""

    # --- identity / metadata -------------------------------------------
    title = u'Moldova Azi'
    __author__ = u'Silviu Cotoar\u0103'
    description = u'Moldova pe internet'
    publisher = 'Moldova Azi'
    category = 'Ziare,Stiri,Moldova'
    language = 'ro'
    cover_url = 'http://www.azi.md/images/logo.gif'

    # --- download settings ---------------------------------------------
    encoding = 'utf-8'
    oldest_article = 5
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    feeds = [(u'\u0218tiri', u'http://www.azi.md/ro/feeds/0/rss201')]

    # Metadata passed on to the conversion step; mirrors the fields above,
    # so it must stay below their definitions.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # --- page clean-up selectors ---------------------------------------
    keep_only_tags = [dict(name='div', attrs={'id': 'in'})]
    remove_tags = [dict(name='div', attrs={'class': 'in-more-stories'})]
    remove_tags_after = [
        dict(name='div', attrs={'id': 'comment_wrapper'}),
        dict(name='div', attrs={'class': 'box-title4'}),
    ]

    def preprocess_html(self, soup):
        """Run the parsed article through ``adeify_images`` and return the result."""
        return self.adeify_images(soup)
|
50
recipes/newsmoldova.recipe
Normal file
@ -0,0 +1,50 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2011, Silviu Cotoar\u0103'
|
||||
'''
|
||||
newsmoldova.md
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class NewsMoldova(BasicNewsRecipe):
    """Recipe for the Moldova news agency site newsmoldova.md (RSS archive feed)."""

    # --- identity / metadata -------------------------------------------
    title = u'Agen\u0163ia de \u015ftiri Moldova'
    __author__ = u'Silviu Cotoar\u0103'
    description = u'Agen\u0163ia de \u015ftiri Moldova'
    publisher = 'Moldova'
    category = 'Ziare,Stiri,Moldova'
    language = 'ro'
    cover_url = 'http://www.newsmoldova.md/i/logo_top_md.gif'

    # --- download settings ---------------------------------------------
    encoding = 'utf-8'
    oldest_article = 5
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    feeds = [(u'\u0218tiri', u'http://newsmoldova.md/export/rss2/archive/index.xml')]

    # Metadata passed on to the conversion step; mirrors the fields above,
    # so it must stay below their definitions.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # --- page clean-up selectors ---------------------------------------
    keep_only_tags = [
        dict(name='div', attrs={'class': 'main-article-index article'}),
    ]
    remove_tags = [
        dict(name='div', attrs={'id': 'actions'}),
        dict(name='li', attrs={'class': 'invisible'}),
    ]
    remove_tags_after = [dict(name='div', attrs={'id': 'actions'})]

    def preprocess_html(self, soup):
        """Run the parsed article through ``adeify_images`` and return the result."""
        return self.adeify_images(soup)
|
@ -1,4 +1,3 @@
|
||||
import string
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Newsweek(BasicNewsRecipe):
|
||||
|
29
recipes/ngz.recipe
Normal file
@ -0,0 +1,29 @@
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    """Recipe for NGZ-online (ngz-online.de) regional RSS feeds."""

    title = u'NGZ-online'
    __author__ = 'schuster'
    language = 'de'
    cover_url = 'http://www.rhein-kreis-neuss-macht-sport.de/sport/includes/bilder/ngz_logo.jpg'

    # --- download settings ---------------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    feeds = [
        (u'Grevenbroich', u'http://www.ngz-online.de/app/feed/rss/grevenbroich'),
        (u'Kreis Neuss', u'http://www.ngz-online.de/app/feed/rss/rheinkreisneuss'),
        (u'Dormagen', u'http://www.ngz-online.de/app/feed/rss/dormagen'),
        (u'J\xfcchen', u'http://www.ngz-online.de/app/feed/rss/juechen'),
        (u'Rommerskirchen', u'http://www.ngz-online.de/app/feed/rss/rommerskirchen'),
    ]

    # --- page clean-up selectors ---------------------------------------
    remove_tags_before = dict(id='bu')
    remove_tags_after = dict(id='noblock')
    remove_tags = [
        dict(attrs={'class': [
            'articleTools', 'post-tools', 'side_tool',
            'nextArticleLink clearfix', 'liketext',
        ]}),
        dict(id=[
            'footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
            'side_search', 'blog_sidebar', 'side_tool', 'side_index',
            'Verlinken', 'vorheriger', 'LESERKOMMENTARE', 'bei facebook',
            'bei twitter', 'Schreiben Sie jetzt Ihre Meinung:', 'Thema',
            'Ihr Beitrag', 'Ihr Name',
            'Ich möchte über weitere Lesermeinungen zu diesem Artikel per E-Mail informiert werden.',
            'banneroben', 'bannerrechts', 'inserieren', 'stellen', 'auto',
            'immobilien', 'kleinanzeige', 'tiere', 'ferienwohnung',
            'NGZ Card', 'Mediengruppe RP', 'Werben', 'Newsletter', 'Wetter',
            'RSS', 'Abo', 'Anzeigen', 'Redaktion', 'Schulprojekte', 'Gast',
            'Mein NGZ', 'Nachrichten', 'Sport', 'Wirtschaft', 'Stadt-Infos',
            'Bilderserien', 'Bookmarken', 'del.icio.us', 'Mister Wong',
            'YiGG', 'Webnews', 'Shortnews', 'Twitter', 'Newsider',
            'Facebook', 'StudiVZ/MeinVZ', 'Versenden', 'Drucken',
        ]),
        dict(name=['script', 'noscript', 'style']),
    ]

    def print_version(self, url):
        """Return ``url`` with the site's popup page-layout query string appended."""
        layout_query = '?ot=de.circit.rpo.PopupPageLayout.ot'
        return url + layout_query
|
||||
|
43
recipes/novinky.recipe
Normal file
@ -0,0 +1,43 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Tomas Latal<latal.tomas at gmail.com>'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class NovinkyCZ(BasicNewsRecipe):
    """Recipe for the Czech news server Novinky.cz (per-section RSS feeds)."""

    # --- identity / metadata -------------------------------------------
    title = 'Novinky'
    __author__ = 'Tomas Latal'
    __version__ = '1.1'
    __date__ = '30 April 2011'
    description = 'News from server Novinky.cz'
    publisher = 'Novinky'
    category = 'news, CZ'
    language = 'cs'
    publication_type = 'newsportal'
    cover_url = 'http://img193.imageshack.us/img193/3039/novinkycover.jpg'

    # --- download settings ---------------------------------------------
    encoding = 'utf8'
    oldest_article = 1
    max_articles_per_feed = 10
    no_stylesheets = True
    remove_javascript = True
    extra_css = 'p.acmDescription{font-style:italic;} p.acmAuthor{font-size:0.8em; color:#707070}'

    feeds = [
        (u'Dom\xe1c\xed', u'http://www.novinky.cz/rss/domaci/'),
        (u'Zahrani\u010d\xed', u'http://www.novinky.cz/rss/zahranicni/'),
        (u'Krimi', u'http://www.novinky.cz/rss/krimi/'),
        (u'Ekonomika', u'http://www.novinky.cz/rss/ekonomika/'),
        (u'Finance', u'http://www.novinky.cz/rss/finance/'),
        (u'Kultura', u'http://www.novinky.cz/rss/kultura/'),
        (u'Koktejl', u'http://www.novinky.cz/rss/koktejl/'),
        (u'Internet a PC', u'http://www.novinky.cz/rss/internet-a-pc/'),
        (u'Auto-moto', u'http://www.novinky.cz/rss/auto/'),
    ]

    # --- page clean-up selectors ---------------------------------------
    remove_tags_before = dict(id='articleContent')
    remove_tags_after = [dict(id='movedArticleAuthors')]
    remove_tags = [
        dict(name='div', attrs={'id': ['articleColumnInfo', 'pictureInnerBox']}),
        dict(name='p', attrs={'id': ['articleDate']}),
    ]
|
100
recipes/novistandard.recipe
Normal file
@ -0,0 +1,100 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.standard.rs
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class NoviStandard(BasicNewsRecipe):
    """Recipe for Novi Standard (www.standard.rs).

    Login is optional (``needs_subscription = 'optional'``): when both a
    username and a password are configured, ``get_browser`` submits the
    site's ``login`` form before any feed is fetched.
    """

    # --- identity / metadata -------------------------------------------
    title = 'Novi Standard'
    __author__ = 'Darko Miletic'
    description = 'NoviStandard - energija je neunistiva!'
    publisher = 'Novi Standard'
    category = 'news, politics, Serbia'
    language = 'sr'
    # Was assigned twice with the same value; one assignment is enough.
    publication_type = 'magazine'
    masthead_url = 'http://www.standard.rs/templates/ja_opal/images/red/logo.png'
    INDEX = 'http://www.standard.rs/'

    # --- download settings ---------------------------------------------
    encoding = 'utf-8'
    delay = 1
    oldest_article = 15
    needs_subscription = 'optional'
    remove_empty_feeds = True
    use_embedded_content = False
    no_stylesheets = True
    extra_css = """
        @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Arial,"Segoe UI","Trebuchet MS",Helvetica,sans1,sans-serif}
        .dropcap{font-family: Georgia,Times,serif1,serif; display:inline}
        .dropcap:first-letter{display: inline; font-size: xx-large; font-weight: bold}
        .contentheading{color: gray; font-size: x-large}
        .article-meta, .createdby{color: red}
        img{margin-top:0.5em; margin-bottom: 0.7em; display: block}
    """

    # Key is 'comments' (not 'comment') to agree with the other recipes in
    # this change set. NOTE(review): confirm against calibre's conversion
    # option names.
    conversion_options = {
        'comments': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Replace U+0110 with U+00D0 in the downloaded markup.
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    def get_browser(self):
        """Return a browser that has opened INDEX and, if credentials are set, logged in."""
        # The base-class method must receive the instance explicitly; calling
        # it with no argument fails because it is an instance method.
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.INDEX)
        if self.username is not None and self.password is not None:
            br.select_form(name='login')
            br['username'] = self.username
            br['passwd'] = self.password
            br.submit()
        return br

    # --- page clean-up selectors ---------------------------------------
    keep_only_tags = [
        dict(attrs={'class': ['contentheading', 'article-meta', 'article-content']}),
    ]
    remove_tags_after = dict(attrs={'class': 'extravote-container'})
    remove_tags = [
        dict(name=['object', 'link', 'iframe', 'meta', 'base']),
        dict(attrs={'class': 'extravote-container'}),
    ]
    remove_attributes = ['border', 'background', 'height', 'width', 'align', 'valign', 'lang']

    feeds = [
        (u'Naslovna', u'http://www.standard.rs/index.php?format=feed&type=rss'),
        (u'Politika', u'http://www.standard.rs/vesti/36-politika.html?format=feed&type=rss'),
        (u'Cvijanovic preporucuje', u'http://www.standard.rs/-cvijanovi-vam-preporuuje.html?format=feed&type=rss'),
        (u'Kolumne', u'http://www.standard.rs/vesti/49-kolumne.html?format=feed&type=rss'),
        (u'Kultura', u'http://www.standard.rs/vesti/40-kultura.html?format=feed&type=rss'),
        (u'Lifestyle', u'http://www.standard.rs/vesti/39-lifestyle.html?format=feed&type=rss'),
        (u'Svet', u'http://www.standard.rs/vesti/41-svet.html?format=feed&type=rss'),
        (u'Ekonomija', u'http://www.standard.rs/vesti/37-ekonomija.html?format=feed&type=rss'),
        (u'Sport', u'http://www.standard.rs/vesti/38-sport.html?format=feed&type=rss'),
    ]

    def preprocess_html(self, soup):
        """Normalise article markup in place and return ``soup``.

        Steps, in order: drop inline ``style`` attributes, remove ``div``
        elements with no contents, flatten links (text links become plain
        text, image links become attribute-less ``div`` wrappers), and give
        every ``img`` without an ``alt`` attribute the value ``'image'``.
        """
        for styled in soup.findAll(style=True):
            del styled['style']

        for div in soup.findAll('div'):
            if len(div.contents) == 0:
                div.extract()

        for anchor in soup.findAll('a'):
            inner_img = anchor.find('img')
            if anchor.string is not None:
                # Link with a single text child: keep only the text.
                anchor.replaceWith(anchor.string)
            elif inner_img:
                # Link wrapping an image: keep the image, drop the link.
                anchor.name = 'div'
                anchor.attrs = []
            else:
                anchor.replaceWith(self.tag_to_string(anchor))

        for image in soup.findAll('img'):
            # has_key() tests tag attributes here and is kept as-is.
            if not image.has_key('alt'):
                image['alt'] = 'image'
        return soup
|
37
recipes/podnikatel.recipe
Normal file
@ -0,0 +1,37 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Tomas Latal<latal.tomas at gmail.com>'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class PodnikatelCZ(BasicNewsRecipe):
    """Recipe for Podnikatel.cz (news items and articles RSS feeds)."""

    # --- identity / metadata -------------------------------------------
    title = 'Podnikatel'
    __author__ = 'Tomas Latal'
    __version__ = '1.0'
    __date__ = '30 April 2011'
    description = u'Aktuality a \u010dl\xe1nky z Podnikatel.cz'
    publisher = 'Internet Info s.r.o.'
    category = 'podnikani, bussiness, CZ'
    language = 'cs'
    publication_type = 'newsportal'

    # --- download settings ---------------------------------------------
    encoding = 'utf8'
    oldest_article = 1
    max_articles_per_feed = 10
    no_stylesheets = True
    remove_javascript = True
    # One string literal continued with backslashes; the whitespace that the
    # continuation lines contribute sits between CSS rules.
    extra_css = 'p.perex{font-size: 1.2em; margin: 0 0 10px 0;line-height: 1.4;padding: 0 0 10px 0;font-weight: bold;} \
        p.perex img {display:none;} \
        .urs p {margin: 0 0 0.8em 0;}'

    feeds = [
        (u'Aktuality', u'http://rss.podnikatel.cz/aktuality'),
        (u'\u010cl\xe1nky', u'http://rss.podnikatel.cz/clanky'),
    ]

    # --- page clean-up selectors ---------------------------------------
    remove_tags_before = dict(id='art-content')
    remove_tags_after = [dict(id='art-content')]
    remove_tags = [
        dict(attrs={'class': [
            'socialshare', 'box-blue', 'author clear', 'labels-terms',
            'box diskuze', 'ad', 'page-nav right', 'infobox', 'box zpravy',
            's-clanky',
        ]}),
        dict(id=['path', 'article-tools', 'discussionList', 'similarItems', 'promo-box']),
    ]
|
22
recipes/pro_physik.recipe
Normal file
@ -0,0 +1,22 @@
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    """Recipe for Pro Physik (pro-physik.de), fetching the print view of each article."""

    title = u'Pro Physik'
    __author__ = 'schuster'
    language = 'de'
    cover_url = 'http://www.pro-physik.de/Phy/images/site/prophysik_logo1.jpg'

    # --- download settings ---------------------------------------------
    oldest_article = 4
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    feeds = [
        (u'Hightech', u'http://www.pro-physik.de/Phy/hightechfeed.xml'),
        (u'Forschung', u'http://www.pro-physik.de/Phy/forschungfeed.xml'),
        (u'Magazin', u'http://www.pro-physik.de/Phy/magazinfeed.xml'),
    ]

    def print_version(self, url):
        """Return ``url`` with every ``leadArticle.do`` replaced by ``print.do``."""
        printable = url.replace('leadArticle.do', 'print.do')
        return printable
|
||||
|
@ -3,7 +3,6 @@ __license__ = 'GPL v3'
|
||||
'''
|
||||
'''
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.web.feeds import Feed
|
||||
|
||||
|
||||
class ReadersDigest(BasicNewsRecipe):
|
||||
@ -38,151 +37,20 @@ class ReadersDigest(BasicNewsRecipe):
|
||||
'''
|
||||
|
||||
|
||||
remove_tags = [
|
||||
dict(name='h4', attrs={'class':'close'}),
|
||||
dict(name='div', attrs={'class':'fromLine'}),
|
||||
dict(name='img', attrs={'class':'colorTag'}),
|
||||
dict(name='div', attrs={'id':'sponsorArticleHeader'}),
|
||||
dict(name='div', attrs={'class':'horizontalAd'}),
|
||||
dict(name='div', attrs={'id':'imageCounterLeft'}),
|
||||
dict(name='div', attrs={'id':'commentsPrint'})
|
||||
]
|
||||
|
||||
|
||||
feeds = [
|
||||
('New in RD', 'http://feeds.rd.com/ReadersDigest'),
|
||||
('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'),
|
||||
('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'),
|
||||
('Blogs','http://feeds.rd.com/ReadersDigestBlogs')
|
||||
('Food', 'http://www.rd.com/food/feed'),
|
||||
('Health', 'http://www.rd.com/health/feed'),
|
||||
('Home', 'http://www.rd.com/home/feed'),
|
||||
('Family', 'http://www.rd.com/family/feed'),
|
||||
('Money', 'http://www.rd.com/money/feed'),
|
||||
('Travel', 'http://www.rd.com/travel/feed'),
|
||||
]
|
||||
|
||||
cover_url = 'http://www.rd.com/images/logo-main-rd.gif'
|
||||
|
||||
|
||||
|
||||
#-------------------------------------------------------------------------------------------------
|
||||
|
||||
def print_version(self, url):
|
||||
|
||||
# Get the identity number of the current article and append it to the root print URL
|
||||
|
||||
if url.find('/article') > 0:
|
||||
ident = url[url.find('/article')+8:url.find('.html?')-4]
|
||||
url = 'http://www.rd.com/content/printContent.do?contentId=' + ident
|
||||
|
||||
elif url.find('/post') > 0:
|
||||
|
||||
# in this case, have to get the page itself to derive the Print page.
|
||||
soup = self.index_to_soup(url)
|
||||
newsoup = soup.find('ul',attrs={'class':'printBlock'})
|
||||
url = 'http://www.rd.com' + newsoup('a')[0]['href']
|
||||
url = url[0:url.find('&Keep')]
|
||||
|
||||
return url
|
||||
|
||||
#-------------------------------------------------------------------------------------------------
|
||||
|
||||
def parse_index(self):
|
||||
|
||||
pages = [
|
||||
('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}),
|
||||
# useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}),
|
||||
('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'})
|
||||
|
||||
keep_only_tags = dict(id='main-content')
|
||||
remove_tags = [
|
||||
{'class':['post-categories']},
|
||||
]
|
||||
|
||||
feeds = []
|
||||
|
||||
for page in pages:
|
||||
section, url, divider, attrList = page
|
||||
newArticles = self.page_parse(url, divider, attrList)
|
||||
feeds.append((section,newArticles))
|
||||
|
||||
# after the pages of the site have been processed, parse several RSS feeds for additional sections
|
||||
newfeeds = Feed()
|
||||
newfeeds = self.parse_rss()
|
||||
|
||||
|
||||
# The utility code in parse_rss returns a Feed object. Convert each feed/article combination into a form suitable
|
||||
# for this module (parse_index).
|
||||
|
||||
for feed in newfeeds:
|
||||
newArticles = []
|
||||
for article in feed.articles:
|
||||
newArt = {
|
||||
'title' : article.title,
|
||||
'url' : article.url,
|
||||
'date' : article.date,
|
||||
'description' : article.text_summary
|
||||
}
|
||||
newArticles.append(newArt)
|
||||
|
||||
|
||||
# New and Blogs should be the first two feeds.
|
||||
if feed.title == 'New in RD':
|
||||
feeds.insert(0,(feed.title,newArticles))
|
||||
elif feed.title == 'Blogs':
|
||||
feeds.insert(1,(feed.title,newArticles))
|
||||
else:
|
||||
feeds.append((feed.title,newArticles))
|
||||
|
||||
|
||||
return feeds
|
||||
|
||||
#-------------------------------------------------------------------------------------------------
|
||||
|
||||
def page_parse(self, mainurl, divider, attrList):
|
||||
|
||||
articles = []
|
||||
mainsoup = self.index_to_soup(mainurl)
|
||||
for item in mainsoup.findAll(attrs=attrList):
|
||||
newArticle = {
|
||||
'title' : item('img')[0]['alt'],
|
||||
'url' : 'http://www.rd.com'+item('a')[0]['href'],
|
||||
'date' : '',
|
||||
'description' : ''
|
||||
}
|
||||
articles.append(newArticle)
|
||||
|
||||
|
||||
|
||||
return articles
|
||||
|
||||
|
||||
|
||||
#-------------------------------------------------------------------------------------------------
|
||||
|
||||
def parse_rss (self):
|
||||
|
||||
# Do the "official" parse_feeds first
|
||||
feeds = BasicNewsRecipe.parse_feeds(self)
|
||||
|
||||
|
||||
# Loop thru the articles in all feeds to find articles with "recipe" in it
|
||||
recipeArticles = []
|
||||
for curfeed in feeds:
|
||||
delList = []
|
||||
for a,curarticle in enumerate(curfeed.articles):
|
||||
if curarticle.title.upper().find('RECIPE') >= 0:
|
||||
recipeArticles.append(curarticle)
|
||||
delList.append(curarticle)
|
||||
if len(delList)>0:
|
||||
for d in delList:
|
||||
index = curfeed.articles.index(d)
|
||||
curfeed.articles[index:index+1] = []
|
||||
|
||||
# If there are any recipes found, create a new Feed object and append.
|
||||
if len(recipeArticles) > 0:
|
||||
pfeed = Feed()
|
||||
pfeed.title = 'Recipes'
|
||||
pfeed.descrition = 'Recipe Feed (Virtual)'
|
||||
pfeed.image_url = None
|
||||
pfeed.oldest_article = 30
|
||||
pfeed.id_counter = len(recipeArticles)
|
||||
# Create a new Feed, add the recipe articles, and then append
|
||||
# to "official" list of feeds
|
||||
pfeed.articles = recipeArticles[:]
|
||||
feeds.append(pfeed)
|
||||
|
||||
return feeds
|
||||
|
||||
|
53
recipes/replicavedetelor.recipe
Normal file
@ -0,0 +1,53 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2011, '
|
||||
'''
|
||||
replicavedetelor.ro
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class ReplicaVedetelor(BasicNewsRecipe):
    """Recipe for Replica Vedetelor (replicavedetelor.ro), built from the site feed."""

    # --- identity / metadata -------------------------------------------
    title = u'Replica Vedetelor'
    __author__ = u'Silviu Cotoara'
    description = u'Ofer\u0103 vedetelor dreptul la replic\u0103'
    publisher = 'Replica Vedetelor'
    category = 'Ziare,Reviste,Vedete'
    language = 'ro'
    cover_url = 'http://www.webart-software.eu/_pics/lucrari_referinta/medium/84/1-Replica-Vedetelor.jpg'

    # --- download settings ---------------------------------------------
    encoding = 'utf-8'
    oldest_article = 5
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    feeds = [(u'Feeds', u'http://www.replicavedetelor.ro/feed')]

    # Metadata passed on to the conversion step; mirrors the fields above,
    # so it must stay below their definitions.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # --- page clean-up selectors ---------------------------------------
    keep_only_tags = [dict(name='div', attrs={'id': 'zona-continut'})]
    remove_tags = [
        dict(name='ul', attrs={'id': ['lista-imagini']}),
        dict(name='form', attrs={'id': ['f-trimite-unui-prieten']}),
    ]
    remove_tags_after = [
        dict(name='form', attrs={'id': ['f-trimite-unui-prieten']}),
    ]

    def preprocess_html(self, soup):
        """Run the parsed article through ``adeify_images`` and return the result."""
        return self.adeify_images(soup)
|
||||
|
@ -2,7 +2,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class RzeczpospolitaRecipe(BasicNewsRecipe):
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'kwetal'
|
||||
__author__ = u'kwetal and Tomasz Dlugosz'
|
||||
language = 'pl'
|
||||
version = 1
|
||||
|
||||
@ -38,6 +38,8 @@ class RzeczpospolitaRecipe(BasicNewsRecipe):
|
||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'clr'}))
|
||||
remove_tags.append(dict(name = 'div', attrs = {'id' : 'share_bottom'}))
|
||||
remove_tags.append(dict(name = 'div', attrs = {'id' : 'copyright_law'}))
|
||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'more'}))
|
||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'editorPicks'}))
|
||||
|
||||
extra_css = '''
|
||||
body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
|
||||
@ -48,6 +50,13 @@ class RzeczpospolitaRecipe(BasicNewsRecipe):
|
||||
.fot{font-size: x-small; color: #666666;}
|
||||
'''
|
||||
|
||||
def skip_ad_pages(self, soup):
    """Bypass an interstitial advertisement page.

    If the page's ``<title>`` text contains "advertisement" (case-insensitive),
    fetch the target of the first ``<a>`` on the page via
    ``self.index_to_soup(href, raw=True)`` and return that result.
    Otherwise return None.

    None is also returned, instead of raising, when the page has no
    ``<title>``, the title has no single text child, or no link with an
    ``href`` can be found.
    """
    title_tag = soup.find('title')
    title_text = getattr(title_tag, 'string', None)
    if not title_text or 'advertisement' not in title_text.lower():
        return None

    link = soup.find('a')
    # Compare with None explicitly rather than relying on the tag's truthiness.
    href = link.get('href') if link is not None else None
    if not href:
        return None
    return self.index_to_soup(href, raw=True)
|
||||
|
||||
def print_version(self, url):
|
||||
start, sep, rest = url.rpartition('/')
|
||||
forget, sep, index = rest.rpartition(',')
|
||||
|
@ -8,23 +8,36 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net> edited by Huan T'
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Slashdot(BasicNewsRecipe):
|
||||
title = u'Slashdot.org'
|
||||
description = '''Tech news. WARNING: This recipe downloads a lot
|
||||
of content and may result in your IP being banned from slashdot.org'''
|
||||
oldest_article = 7
|
||||
simultaneous_downloads = 1
|
||||
delay = 3
|
||||
max_articles_per_feed = 100
|
||||
language = 'en'
|
||||
title = u'Slashdot.org'
|
||||
description = '''Tech news. WARNING: This recipe downloads a lot
|
||||
of content and may result in your IP being banned from slashdot.org'''
|
||||
oldest_article = 7
|
||||
simultaneous_downloads = 1
|
||||
delay = 3
|
||||
max_articles_per_feed = 100
|
||||
language = 'en'
|
||||
|
||||
__author__ = 'floweros edited by Huan T'
|
||||
no_stylesheets = True
|
||||
# keep_only_tags = [
|
||||
# dict(name='div',attrs={'class':'article'}),
|
||||
# dict(name='div',attrs={'class':'commentTop'}),
|
||||
# ]
|
||||
__author__ = 'floweros edited by Huan T'
|
||||
no_stylesheets = True
|
||||
# Elements to keep from each Slashdot article page.
# 'postBody' and 'details' are two separate class names: without the comma
# the adjacent string literals were concatenated into the single class
# 'postBodydetails'.
keep_only_tags = [
    dict(name='div', attrs={'id': 'article'}),
    dict(name='div', attrs={'class': ['postBody', 'details']}),
    dict(name='footer', attrs={'class': ['clearfix meta article-foot']}),
    dict(name='article', attrs={'class': ['fhitem fhitem-story article usermode thumbs grid_24']}),
    dict(name='dl', attrs={'class': 'relatedPosts'}),
    dict(name='h2', attrs={'class': 'story'}),
    dict(name='span', attrs={'class': 'comments'}),
]
|
||||
|
||||
feeds = [
|
||||
|
||||
remove_tags = [
|
||||
dict(name='aside',attrs={'id':'slashboxes'}),
|
||||
dict(name='div',attrs={'class':'paginate'}),
|
||||
dict(name='section',attrs={'id':'comments'}),
|
||||
dict(name='span',attrs={'class':'topic'}),
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Slashdot',
|
||||
u'http://rss.slashdot.org/Slashdot/slashdot'),
|
||||
(u'/. IT',
|
||||
@ -37,5 +50,3 @@ class Slashdot(BasicNewsRecipe):
|
||||
u'http://rss.slashdot.org/Slashdot/slashdotYourRightsOnline')
|
||||
]
|
||||
|
||||
def get_article_url(self, article):
|
||||
return article.get('feedburner_origlink', None)
|
||||
|
54
recipes/socialdiva.recipe
Normal file
@ -0,0 +1,54 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2011'
|
||||
'''
|
||||
socialdiva.ro
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class SocialDiva(BasicNewsRecipe):
    """Recipe for Social Diva (socialdiva.ro), built from the site's RSS page."""

    # --- identity / metadata -------------------------------------------
    title = u'Social Diva'
    __author__ = u'Silviu Cotoara'
    description = u'When in doubt, wear red'
    publisher = 'Social Diva'
    category = 'Ziare,Reviste,Femei'
    language = 'ro'
    cover_url = 'http://www.socialdiva.ro/images/logo.png'

    # --- download settings ---------------------------------------------
    encoding = 'utf-8'
    oldest_article = 5
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    feeds = [(u'Feeds', u'http://www.socialdiva.ro/rss.html')]

    # Metadata passed on to the conversion step; mirrors the fields above,
    # so it must stay below their definitions.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # --- page clean-up selectors ---------------------------------------
    keep_only_tags = [
        dict(name='div', attrs={'class': 'col-alpha mt5 content_articol'}),
        dict(name='div', attrs={'class': 'mt5'}),
    ]
    remove_tags = [
        dict(name='a', attrs={'class': ['comments float-left scroll mt5']}),
        dict(name='a', attrs={'class': ['comments float-left scroll']}),
        dict(name='div', attrs={'class': ['rating-container relative float-left']}),
        dict(name='div', attrs={'class': ['float-right social_articol']}),
    ]
    remove_tags_after = [
        dict(name='a', attrs={'class': ['comments float-left scroll mt5']}),
    ]

    def preprocess_html(self, soup):
        """Run the parsed article through ``adeify_images`` and return the result."""
        return self.adeify_images(soup)
|
28
recipes/spektrum.recipe
Normal file
@ -0,0 +1,28 @@
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    """Recipe for Spektrum der Wissenschaft and its sister sites, using print-view URLs."""

    title = u'Spektrum (der Wissenschaft)'
    __author__ = 'schuster'
    language = 'de'
    cover_url = 'http://upload.wikimedia.org/wikipedia/de/3/3b/Spektrum_der_Wissenschaft_Logo.svg'

    # --- download settings ---------------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    filter_regexps = [r'ads\.doubleclick\.net']

    feeds = [
        (u'Spektrum der Wissenschaft', u'http://www.spektrum.de/artikel/982623'),
        (u'SpektrumDirekt', u'http://www.spektrumdirekt.de/artikel/996406'),
        (u'Sterne und Weltraum', u'http://www.astronomie-heute.de/artikel/865248'),
        (u'Gehirn & Geist', u'http://www.gehirn-und-geist.de/artikel/982626'),
        (u'epoc', u'http://www.epoc.de/artikel/982625'),
    ]

    # --- page clean-up selectors ---------------------------------------
    remove_tags = [
        dict(attrs={'class': [
            'hauptnaviPkt gainlayout', 'hauptnaviButton', 'suchButton',
            'suchbegriffKasten', 'loginButton', 'subnavigation',
            'artikelInfoLeiste gainlayout', 'artikelTools', 'nurLetzteSeite',
            'link', 'boxUnterArtikel', 'leserbriefeBlock', 'boxTitel',
            'boxInhalt', 'sehrklein', 'boxabstand', 'werbeboxinhalt',
            'rbabstand', 'bildlinks', 'rechtebox', 'denkmalbox',
            'denkmalfrage',
        ]}),
        dict(id=[
            'pflip', 'verlagsleiste', 'bereich', 'bannerVertikal',
            'headerLogoLink', 'kopf', 'topNavi', 'headerSchnellsuche',
            'headerSchnellsucheWarten', 'navigation', 'navigationL',
            'navigationR', 'inhalt', 'rechtespalte', 'sdwboxenshop',
            'shopboxen', 'fuss',
        ]),
        dict(name=['naservice']),
    ]

    def print_version(self, url):
        """Rewrite an ``artikel/<id>`` URL to the ``sixcms`` detail URL with the print flag."""
        return url.replace('artikel/', 'sixcms/detail.php?id=') + '&_druckversion=1'
|
@ -1,5 +1,5 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
staradvertiser.com
|
||||
'''
|
||||
@ -9,7 +9,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class Starbulletin(BasicNewsRecipe):
|
||||
title = 'Honolulu Star Advertiser'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = "Latest national and local Hawaii sports news"
|
||||
description = 'Latest national and local Hawaii sports news'
|
||||
publisher = 'Honolulu Star-Advertiser'
|
||||
category = 'news, Honolulu, Hawaii'
|
||||
oldest_article = 2
|
||||
@ -19,7 +19,13 @@ class Starbulletin(BasicNewsRecipe):
|
||||
use_embedded_content = False
|
||||
encoding = 'utf8'
|
||||
publication_type = 'newspaper'
|
||||
extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif} h1,.brown,.postCredit{color: #663300} .storyDeck{font-size: 1.2em; font-weight: bold} '
|
||||
masthead_url = 'http://media.staradvertiser.com/designimages/star-advertiser-logo-small.gif'
|
||||
extra_css = """
|
||||
body{font-family: Verdana,Arial,Helvetica,sans-serif}
|
||||
h1,.brown,.postCredit{color: #663300}
|
||||
.storyDeck{font-size: 1.2em; font-weight: bold}
|
||||
img{display: block}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
@ -28,14 +34,16 @@ class Starbulletin(BasicNewsRecipe):
|
||||
, 'language' : language
|
||||
, 'linearize_tables' : True
|
||||
}
|
||||
|
||||
remove_tags_before = dict(attrs={'id':'storyTitle'})
|
||||
remove_tags_after = dict(name='div',attrs={'class':'storytext'})
|
||||
keep_only_tags = [
|
||||
dict(attrs={'id':'storyTitle'})
|
||||
,dict(attrs={'class':['storyDeck','postCredit']})
|
||||
,dict(name='span',attrs={'class':'brown'})
|
||||
,dict(name='div',attrs={'class':'storytext'})
|
||||
]
|
||||
remove_tags = [
|
||||
dict(name=['object','link','script','span'])
|
||||
,dict(attrs={'class':'insideStoryImage'})
|
||||
dict(name=['object','link','script','span','meta','base','iframe'])
|
||||
,dict(attrs={'class':['insideStoryImage','insideStoryAd']})
|
||||
,dict(attrs={'name':'fb_share'})
|
||||
,dict(name='div',attrs={'class':'storytext'})
|
||||
]
|
||||
|
||||
feeds = [
|
||||
@ -47,3 +55,24 @@ class Starbulletin(BasicNewsRecipe):
|
||||
,(u'Business' , u'http://www.staradvertiser.com/business/index.rss' )
|
||||
,(u'Travel' , u'http://www.staradvertiser.com/travel/index.rss' )
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
    """Normalise article markup in place and return ``soup``.

    Steps, in order: drop inline ``style`` attributes, flatten links (text
    links become plain text, image links become attribute-less ``div``
    wrappers), and give every ``img`` without an ``alt`` attribute the
    value ``'image'``.
    """
    for styled in soup.findAll(style=True):
        del styled['style']

    for anchor in soup.findAll('a'):
        inner_img = anchor.find('img')
        link_text = anchor.string
        if link_text is not None:
            # Link with a single text child: keep only the text.
            anchor.replaceWith(link_text)
        elif inner_img:
            # Link wrapping an image: keep the image, drop the link.
            anchor.name = 'div'
            anchor.attrs = []
        else:
            anchor.replaceWith(self.tag_to_string(anchor))

    for image in soup.findAll('img'):
        if not image.has_key('alt'):
            image['alt'] = 'image'
    return soup
|
||||
|
@ -33,7 +33,7 @@ class StrategyBusinessRecipe(BasicNewsRecipe):
|
||||
elif c.name.endswith('_password'):
|
||||
br[c.name] = self.password
|
||||
raw = br.submit().read()
|
||||
if '>Logout' not in raw:
|
||||
if 'You have been logged in' not in raw:
|
||||
raise ValueError('Failed to login, check your username and password')
|
||||
return br
|
||||
|
||||
|
@ -37,10 +37,12 @@ class TabuRo(BasicNewsRecipe):
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':'asemanatoare'})
|
||||
dict(name='div', attrs={'class':'asemanatoare'}),
|
||||
dict(name='div', attrs={'class':'social'})
|
||||
]
|
||||
|
||||
remove_tags_after = [
|
||||
dict(name='div', attrs={'class':'social'}),
|
||||
dict(name='div', attrs={'id':'comments'}),
|
||||
dict(name='div', attrs={'class':'asemanatoare'})
|
||||
]
|
||||
|
24
recipes/technology_review_de.recipe
Normal file
@ -0,0 +1,24 @@
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    """Recipe for the German edition of Technology Review (heise.de/tr).

    Articles come from a single Atom feed and are fetched through the
    ``?view=print`` variant of each article URL.
    """

    # --- metadata -------------------------------------------------------
    title = u'Technology Review'
    __author__ = 'schuster'
    language = 'de'

    # --- download settings ----------------------------------------------
    oldest_article = 4
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    feeds = [(u'Technik News', u'http://www.heise.de/tr/news-atom.xml')]

    # --- page clean-up ----------------------------------------------------
    remove_tags_before = {'id': 'keywords'}
    remove_tags_after = {'id': 'kommentar'}
    remove_tags = [
        # top navigation elements, matched by class
        {'attrs': {'class': [
            'navi_oben_pvg', 'navi_oben_tarifr', 'navi_oben_itm',
            'navi_oben_eve', 'navi_oben_whi', 'navi_oben_abo',
            'navi_oben_shop', 'navi_top_logo', 'navi_top_abschnitt',
            'first',
        ]}},
        # page furniture, matched by id
        {'id': [
            'footer', 'toolsRight', 'articleInline', 'navigation',
            'archive', 'side_search', 'blog_sidebar', 'side_tool',
            'side_index',
        ]},
        # non-content tags
        {'name': ['script', 'noscript', 'style']},
    ]

    def print_version(self, url):
        """Return ``url`` with ``?view=print`` appended."""
        return url + '?view=print'
|
||||
|
@ -1,33 +1,33 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
|
||||
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class TelepolisNews(BasicNewsRecipe):
|
||||
title = u'Telepolis (News+Artikel)'
|
||||
__author__ = 'Gerhard Aigner'
|
||||
__author__ = 'syntaxis'
|
||||
publisher = 'Heise Zeitschriften Verlag GmbH & Co KG'
|
||||
description = 'News from telepolis'
|
||||
description = 'News from Telepolis'
|
||||
category = 'news'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
recursion = 0
|
||||
no_stylesheets = True
|
||||
encoding = "utf-8"
|
||||
language = 'de_AT'
|
||||
language = 'de'
|
||||
|
||||
|
||||
use_embedded_content =False
|
||||
remove_empty_feeds = True
|
||||
|
||||
preprocess_regexps = [(re.compile(r'<a[^>]*>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(r'</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),]
|
||||
|
||||
keep_only_tags = [dict(name = 'td',attrs={'class':'bloghead'}),dict(name = 'td',attrs={'class':'blogfliess'})]
|
||||
remove_tags = [dict(name='img'), dict(name='td',attrs={'class':'blogbottom'}), dict(name='td',attrs={'class':'forum'})]
|
||||
|
||||
keep_only_tags = [dict(name = 'div',attrs={'class':'head'}),dict(name = 'div',attrs={'class':'leftbox'}),dict(name='td',attrs={'class':'strict'})]
|
||||
remove_tags = [ dict(name='td',attrs={'class':'blogbottom'}),
|
||||
dict(name='div',attrs={'class':'forum'}), dict(name='div',attrs={'class':'social'}),dict(name='div',attrs={'class':'blog-letter p-news'}),
|
||||
dict(name='div',attrs={'class':'blog-sub'}),dict(name='div',attrs={'class':'version-div'}),dict(name='div',attrs={'id':'breadcrumb'})
|
||||
,dict(attrs={'class':'tp-url'}),dict(attrs={'class':'blog-name entry_'}) ]
|
||||
|
||||
remove_tags_after = [dict(name='span', attrs={'class':['breadcrumb']})]
|
||||
|
||||
|
||||
feeds = [(u'News', u'http://www.heise.de/tp/news-atom.xml')]
|
||||
|
||||
@ -39,15 +39,8 @@ class TelepolisNews(BasicNewsRecipe):
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
|
||||
def get_article_url(self, article):
|
||||
'''if the linked article is of kind artikel don't take it'''
|
||||
if (article.link.count('artikel') > 1) :
|
||||
return None
|
||||
return article.link
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
|
||||
soup.head.insert(0,mtag)
|
||||
return soup
|
||||
|
||||
|
||||
|
26
recipes/the_journal.recipe
Normal file
@ -0,0 +1,26 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011 Phil Burns'
|
||||
'''
|
||||
TheJournal.ie
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class TheJournal(BasicNewsRecipe):
    """Recipe for TheJournal.ie, built from the site's single RSS feed."""

    # Was spelled ``__author_``: with only one trailing underscore Python
    # name-mangles the attribute to ``_TheJournal__author_``, so the
    # ``__author__`` metadata was never actually set.
    __author__ = 'Phil Burns'
    title = u'TheJournal.ie'
    oldest_article = 1
    max_articles_per_feed = 100
    encoding = 'utf8'
    language = 'en_IE'
    timefmt = ' (%A, %B %d, %Y)'

    no_stylesheets = True
    # Drop the page footer and all script / noscript elements.
    remove_tags = [dict(name='div', attrs={'class':'footer'}),
                   dict(name=['script', 'noscript'])]

    extra_css = 'p, div { margin: 0pt; border: 0pt; text-indent: 0.5em }'

    feeds = [
        (u'Latest News', u'http://www.thejournal.ie/feed/')]
|
@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1283848012(BasicNewsRecipe):
|
||||
description = 'TheMarker Financial News in Hebrew'
|
||||
__author__ = 'TonyTheBookworm, Marbs'
|
||||
__author__ = 'Marbs'
|
||||
cover_url = 'http://static.ispot.co.il/wp-content/upload/2009/09/themarker.jpg'
|
||||
title = u'TheMarker'
|
||||
language = 'he'
|
||||
@ -11,42 +11,38 @@ class AdvancedUserRecipe1283848012(BasicNewsRecipe):
|
||||
remove_javascript = True
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
oldest_article = 1
|
||||
remove_tags = [dict(name='tr', attrs={'bgcolor':['#738A94']}) ]
|
||||
max_articles_per_feed = 10
|
||||
keep_only_tags =dict(name='div', attrs={'id':'content'})
|
||||
remove_attributes = ['width','float','margin-left']
|
||||
no_stylesheets = True
|
||||
remove_tags = [dict(name='div', attrs={'class':['social-nav article-social-nav','prsnlArticleEnvelope','cb']}) ,
|
||||
dict(name='a', attrs={'href':['/misc/mobile']}) ,
|
||||
dict(name='span', attrs={'class':['post-summ']}) ]
|
||||
max_articles_per_feed = 100
|
||||
extra_css='body{direction: rtl;} .article_description{direction: rtl; } a.article{direction: rtl; } .calibre_feed_description{direction: rtl; }'
|
||||
feeds = [(u'Head Lines', u'http://www.themarker.com/tmc/content/xml/rss/hpfeed.xml'),
|
||||
(u'TA Market', u'http://www.themarker.com/tmc/content/xml/rss/sections/marketfeed.xml'),
|
||||
(u'Real Estate', u'http://www.themarker.com/tmc/content/xml/rss/sections/realEstaterfeed.xml'),
|
||||
(u'Wall Street & Global', u'http://www.themarker.com/tmc/content/xml/rss/sections/wallsfeed.xml'),
|
||||
(u'Law', u'http://www.themarker.com/tmc/content/xml/rss/sections/lawfeed.xml'),
|
||||
(u'Media', u'http://www.themarker.com/tmc/content/xml/rss/sections/mediafeed.xml'),
|
||||
(u'Consumer', u'http://www.themarker.com/tmc/content/xml/rss/sections/consumerfeed.xml'),
|
||||
(u'Career', u'http://www.themarker.com/tmc/content/xml/rss/sections/careerfeed.xml'),
|
||||
(u'Car', u'http://www.themarker.com/tmc/content/xml/rss/sections/carfeed.xml'),
|
||||
(u'High Tech', u'http://www.themarker.com/tmc/content/xml/rss/sections/hightechfeed.xml'),
|
||||
(u'Investor Guide', u'http://www.themarker.com/tmc/content/xml/rss/sections/investorGuidefeed.xml')]
|
||||
feeds = [(u'Head Lines', u'http://www.themarker.com/cmlink/1.144'),
|
||||
(u'TA Market', u'http://www.themarker.com/cmlink/1.243'),
|
||||
(u'Real Estate', u'http://www.themarker.com/cmlink/1.605656'),
|
||||
(u'Global', u'http://www.themarker.com/cmlink/1.605658'),
|
||||
(u'Wall Street', u'http://www.themarker.com/cmlink/1.613713'),
|
||||
(u'SmartPhone', u'http://www.themarker.com/cmlink/1.605661'),
|
||||
(u'Law', u'http://www.themarker.com/cmlink/1.605664'),
|
||||
(u'Media', u'http://www.themarker.com/cmlink/1.605660'),
|
||||
(u'Consumer', u'http://www.themarker.com/cmlink/1.605662'),
|
||||
(u'Career', u'http://www.themarker.com/cmlink/1.605665'),
|
||||
(u'Car', u'http://www.themarker.com/cmlink/1.605663'),
|
||||
(u'High Tech', u'http://www.themarker.com/cmlink/1.605659'),
|
||||
(u'Small Business', u'http://www.themarker.com/cmlink/1.605666')]
|
||||
|
||||
def print_version(self, url):
|
||||
split1 = url.split("=")
|
||||
weblinks = url
|
||||
#split1 = url.split("/")
|
||||
#print_url='http://www.themarker.com/misc/article-print-page/'+split1[-1]
|
||||
txt=url
|
||||
|
||||
if weblinks is not None:
|
||||
for link in weblinks:
|
||||
#---------------------------------------------------------
|
||||
#here we need some help with some regexpressions
|
||||
#we are trying to find it.themarker.com in a url
|
||||
#-----------------------------------------------------------
|
||||
re1='.*?' # Non-greedy match on filler
|
||||
re2='(it\\.themarker\\.com)' # Fully Qualified Domain Name 1
|
||||
rg = re.compile(re1+re2,re.IGNORECASE|re.DOTALL)
|
||||
m = rg.search(url)
|
||||
re1='.*?' # Non-greedy match on filler
|
||||
re2='(tv)' # Word 1
|
||||
|
||||
|
||||
if m:
|
||||
split2 = url.split("article/")
|
||||
print_url = 'http://it.themarker.com/tmit/PrintArticle/' + split2[1]
|
||||
|
||||
else:
|
||||
print_url = 'http://www.themarker.com/ibo/misc/printFriendly.jhtml?ElementId=%2Fibo%2Frepositories%2Fstories%2Fm1_2000%2F' + split1[1]+'.xml'
|
||||
|
||||
return print_url
|
||||
rg = re.compile(re1+re2,re.IGNORECASE|re.DOTALL)
|
||||
m = rg.search(txt)
|
||||
if m:
|
||||
#print 'bad link'
|
||||
return 1
|
||||
|
@ -10,6 +10,8 @@ import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Time(BasicNewsRecipe):
|
||||
recipe_disabled = ('This recipe has been disabled as TIME no longer'
|
||||
' publish complete articles on the web.')
|
||||
title = u'Time'
|
||||
__author__ = 'Kovid Goyal and Sujata Raman'
|
||||
description = 'Weekly magazine'
|
||||
|
86
recipes/united_daily.recipe
Normal file
@ -0,0 +1,86 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
__license__ = 'GPL v3'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class UnitedDaily(BasicNewsRecipe):
    """Recipe for United Daily News (udn.com), one section per RSS feed."""

    # --- metadata -------------------------------------------------------
    title = u'聯合新聞網'
    __author__ = 'Eddie Lau'
    __version__ = '1.1'
    publisher = 'United Daily News Group'
    description = 'United Daily (Taiwan)'
    category = 'News, Chinese, Taiwan'
    language = 'zh-TW'
    masthead_url = 'http://udn.com/NEWS/2004/images/logo_udn.gif'
    cover_url = 'http://udn.com/NEWS/2004/images/logo_udn.gif'

    # --- download settings ----------------------------------------------
    oldest_article = 1
    max_articles_per_feed = 100
    encoding = 'big5'
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables':True}

    # --- feeds: (section title, feed URL) -----------------------------------
    feeds = [
        (u'焦點', u'http://udn.com/udnrss/focus.xml'),
        (u'政治', u'http://udn.com/udnrss/politics.xml'),
        (u'社會', u'http://udn.com/udnrss/social.xml'),
        (u'生活', u'http://udn.com/udnrss/life.xml'),
        (u'綜合', u'http://udn.com/udnrss/education.xml'),
        (u'意見評論', u'http://udn.com/udnrss/opinion.xml'),
        (u'校園博覽會', u'http://mag.udn.com/udnrss/campus_rss.xml'),
        (u'大台北', u'http://udn.com/udnrss/local_taipei.xml'),
        (u'桃竹苗', u'http://udn.com/udnrss/local_tyhcml.xml'),
        (u'中彰投', u'http://udn.com/udnrss/local_tcchnt.xml'),
        (u'雲嘉南', u'http://udn.com/udnrss/local_ylcytn.xml'),
        (u'高屏離島', u'http://udn.com/udnrss/local_ksptisland.xml'),
        (u'基宜花東', u'http://udn.com/udnrss/local_klilhltt.xml'),
        (u'台灣百寶鄉', u'http://udn.com/udnrss/local_oddlyenough.xml'),
        (u'台灣人物', u'http://mag.udn.com/udnrss/people_rss.xml'),
        (u'兩岸要聞', u'http://udn.com/udnrss/mainland.xml'),
        (u'國際焦點', u'http://udn.com/udnrss/international.xml'),
        (u'台商經貿', u'http://udn.com/udnrss/financechina.xml'),
        (u'國際財經', u'http://udn.com/udnrss/financeworld.xml'),
        (u'全球觀察', u'http://mag.udn.com/udnrss/world_rss.xml'),
        (u'財經焦點', u'http://udn.com/udnrss/financesfocus.xml'),
        (u'股市要聞', u'http://udn.com/udnrss/stock.xml'),
        (u'股市快訊', u'http://udn.com/udnrss/stklatest.xml'),
        (u'稅務法務', u'http://udn.com/udnrss/tax.xml'),
        (u'房市情報', u'http://udn.com/udnrss/houses.xml'),
        (u'個人理財', u'http://mag.udn.com/udnrss/wealth_rss.xml'),
        (u'研究報告', u'http://mag.udn.com/udnrss/report_rss.xml'),
        (u'基金', u'http://mag.udn.com/udnrss/fund_rss.xml'),
        (u'理財會客室', u'http://mag.udn.com/udnrss/m_forum_rss.xml'),
        (u'棒球', u'http://udn.com/udnrss/baseball.xml'),
        (u'籃球', u'http://udn.com/udnrss/basketball.xml'),
        (u'體壇動態', u'http://udn.com/udnrss/sportsfocus.xml'),
        (u'熱門星聞', u'http://udn.com/udnrss/starsfocus.xml'),
        (u'廣電港陸', u'http://udn.com/udnrss/tv.xml'),
        (u'海外星球', u'http://udn.com/udnrss/starswestern.xml'),
        (u'日韓星情', u'http://udn.com/udnrss/starsjk.xml'),
        (u'電影世界', u'http://udn.com/udnrss/movie.xml'),
        (u'流行音樂', u'http://udn.com/udnrss/music.xml'),
        (u'觀點專題', u'http://udn.com/udnrss/starssubject.xml'),
        (u'消費流行', u'http://mag.udn.com/udnrss/happylife_rss.xml'),
        (u'食樂指南', u'http://udn.com/udnrss/food.xml'),
        (u'數位資訊', u'http://mag.udn.com/udnrss/digital_rss.xml'),
        (u'折扣好康', u'http://udn.com/udnrss/shopping.xml'),
        (u'發燒車訊', u'http://mag.udn.com/udnrss/car_rss.xml'),
        (u'醫藥新聞', u'http://udn.com/udnrss/health.xml'),
        (u'家婦繽紛', u'http://udn.com/udnrss/benfen.xml'),
        (u'談星論命', u'http://udn.com/udnrss/astrology.xml'),
        (u'文化副刊', u'http://udn.com/udnrss/reading.xml'),
        (u'旅遊休閒', u'http://travel.udn.com/udnrss/travel_rss.xml'),
        (u'健康醫藥', u'http://mag.udn.com/udnrss/life_rss.xml'),
    ]

    # --- page clean-up ----------------------------------------------------
    # Title, author and story body are each matched twice: once as a <td>
    # carrying the class and once as a <div> carrying the id.
    keep_only_tags = [
        {'name': 'td', 'attrs': {'class': ['story_title']}},
        {'name': 'div', 'attrs': {'id': ['story_title']}},
        {'name': 'td', 'attrs': {'class': ['story_author']}},
        {'name': 'div', 'attrs': {'id': ['story_author']}},
        {'name': 'td', 'attrs': {'class': ['story']}},
        {'name': 'div', 'attrs': {'id': ['story']}},
    ]
    remove_tags = [{'name': 'div', 'attrs': {'id': ['mvouter']}}]

    # Enlarged, bold story titles.
    extra_css = '''div[id='story_title'] {font-size:200%; font-weight:bold;} td[class='story_title'] {font-size:200%; font-weight:bold;} td[class='story_title'] td[class='story_title']>div {font-size:200%; font-weight:bold;}'''
|
||||
|
@ -7,13 +7,11 @@ usatoday.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag
|
||||
import re
|
||||
|
||||
class USAToday(BasicNewsRecipe):
|
||||
|
||||
title = 'USA Today'
|
||||
__author__ = 'GRiker'
|
||||
__author__ = 'Kovid Goyal'
|
||||
oldest_article = 1
|
||||
timefmt = ''
|
||||
max_articles_per_feed = 20
|
||||
@ -31,7 +29,6 @@ class USAToday(BasicNewsRecipe):
|
||||
margin-bottom: 0em; \
|
||||
font-size: smaller;}\n \
|
||||
.articleBody {text-align: left;}\n '
|
||||
conversion_options = { 'linearize_tables' : True }
|
||||
#simultaneous_downloads = 1
|
||||
feeds = [
|
||||
('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
|
||||
@ -47,63 +44,26 @@ class USAToday(BasicNewsRecipe):
|
||||
('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
|
||||
('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories'),
|
||||
]
|
||||
keep_only_tags = [dict(attrs={'class':[
|
||||
'byLine',
|
||||
'inside-copy',
|
||||
'inside-head',
|
||||
'inside-head2',
|
||||
'item',
|
||||
'item-block',
|
||||
'photo-container',
|
||||
]}),
|
||||
dict(id=[
|
||||
'applyMainStoryPhoto',
|
||||
'permalink',
|
||||
])]
|
||||
keep_only_tags = [dict(attrs={'class':'story'})]
|
||||
remove_tags = [
|
||||
dict(attrs={'class':[
|
||||
'share',
|
||||
'reprints',
|
||||
'inline-h3',
|
||||
'info-extras',
|
||||
'ppy-outer',
|
||||
'ppy-caption',
|
||||
'comments',
|
||||
'jump',
|
||||
'pagetools',
|
||||
'post-attributes',
|
||||
'tags',
|
||||
'bottom-tools',
|
||||
'sponsoredlinks',
|
||||
]}),
|
||||
dict(id=['pluck']),
|
||||
]
|
||||
|
||||
remove_tags = [dict(attrs={'class':[
|
||||
'comments',
|
||||
'jump',
|
||||
'pagetools',
|
||||
'post-attributes',
|
||||
'tags',
|
||||
]}),
|
||||
dict(id=[])]
|
||||
|
||||
#feeds = [('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles')]
|
||||
|
||||
def dump_hex(self, src, length=16):
|
||||
''' Diagnostic '''
|
||||
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
|
||||
N=0; result=''
|
||||
while src:
|
||||
s,src = src[:length],src[length:]
|
||||
hexa = ' '.join(["%02X"%ord(x) for x in s])
|
||||
s = s.translate(FILTER)
|
||||
result += "%04X %-*s %s\n" % (N, length*3, hexa, s)
|
||||
N+=length
|
||||
print result
|
||||
|
||||
def fixChars(self,string):
|
||||
# Replace lsquo (\x91)
|
||||
fixed = re.sub("\x91","‘",string)
|
||||
|
||||
# Replace rsquo (\x92)
|
||||
fixed = re.sub("\x92","’",fixed)
|
||||
|
||||
# Replace ldquo (\x93)
|
||||
fixed = re.sub("\x93","“",fixed)
|
||||
|
||||
# Replace rdquo (\x94)
|
||||
fixed = re.sub("\x94","”",fixed)
|
||||
|
||||
# Replace ndash (\x96)
|
||||
fixed = re.sub("\x96","–",fixed)
|
||||
|
||||
# Replace mdash (\x97)
|
||||
fixed = re.sub("\x97","—",fixed)
|
||||
|
||||
return fixed
|
||||
|
||||
def get_masthead_url(self):
|
||||
masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif'
|
||||
@ -115,321 +75,4 @@ class USAToday(BasicNewsRecipe):
|
||||
masthead = None
|
||||
return masthead
|
||||
|
||||
def massageNCXText(self, description):
|
||||
# Kindle TOC descriptions won't render certain characters
|
||||
if description:
|
||||
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||
# Replace '&' with '&'
|
||||
massaged = re.sub("&","&", massaged)
|
||||
return self.fixChars(massaged)
|
||||
else:
|
||||
return description
|
||||
|
||||
def parse_feeds(self, *args, **kwargs):
|
||||
parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
|
||||
# Count articles for progress dialog
|
||||
article_count = 0
|
||||
for feed in parsed_feeds:
|
||||
article_count += len(feed)
|
||||
self.log( "Queued %d articles" % article_count)
|
||||
return parsed_feeds
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
soup = self.strip_anchors(soup)
|
||||
return soup
|
||||
|
||||
def postprocess_html(self, soup, first_fetch):
|
||||
|
||||
# Remove navLinks <div class="inside-copy" style="padding-bottom:3px">
|
||||
navLinks = soup.find(True,{'style':'padding-bottom:3px'})
|
||||
if navLinks:
|
||||
navLinks.extract()
|
||||
|
||||
# Remove <div class="inside-copy" style="margin-bottom:10px">
|
||||
gibberish = soup.find(True,{'style':'margin-bottom:10px'})
|
||||
if gibberish:
|
||||
gibberish.extract()
|
||||
|
||||
# Change <inside-head> to <h2>
|
||||
headline = soup.find(True, {'class':['inside-head','inside-head2']})
|
||||
if not headline:
|
||||
headline = soup.find('h3')
|
||||
if headline:
|
||||
tag = Tag(soup, "h2")
|
||||
tag['class'] = "headline"
|
||||
tag.insert(0, headline.contents[0])
|
||||
headline.replaceWith(tag)
|
||||
else:
|
||||
print "unable to find headline:\n%s\n" % soup
|
||||
|
||||
# Change byLine to byline, change commas to middot
|
||||
# Kindle renders commas in byline as '&'
|
||||
byline = soup.find(True, {'class':'byLine'})
|
||||
if byline:
|
||||
byline['class'] = 'byline'
|
||||
# Replace comma with middot
|
||||
byline.contents[0].replaceWith(re.sub(","," ·", byline.renderContents()))
|
||||
|
||||
jumpout_punc_list = [':','?']
|
||||
# Remove the inline jumpouts in <div class="inside-copy">
|
||||
paras = soup.findAll(True, {'class':'inside-copy'})
|
||||
for para in paras:
|
||||
if re.match("<b>[\w\W]+ ",para.renderContents()):
|
||||
p = para.find('b')
|
||||
for punc in jumpout_punc_list:
|
||||
punc_offset = p.contents[0].find(punc)
|
||||
if punc_offset == -1:
|
||||
continue
|
||||
if punc_offset > 1:
|
||||
if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
|
||||
#print "extracting \n%s\n" % para.prettify()
|
||||
para.extract()
|
||||
|
||||
# Reset class for remaining
|
||||
paras = soup.findAll(True, {'class':'inside-copy'})
|
||||
for para in paras:
|
||||
para['class'] = 'articleBody'
|
||||
|
||||
# Remove inline jumpouts in <p>
|
||||
paras = soup.findAll(['p'])
|
||||
for p in paras:
|
||||
if hasattr(p,'contents') and len(p.contents):
|
||||
for punc in jumpout_punc_list:
|
||||
punc_offset = p.contents[0].find(punc)
|
||||
if punc_offset == -1:
|
||||
continue
|
||||
if punc_offset > 2 and hasattr(p,'a') and len(p.contents):
|
||||
#print "evaluating %s\n" % p.contents[0][:punc_offset+1]
|
||||
if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
|
||||
#print "extracting \n%s\n" % p.prettify()
|
||||
p.extract()
|
||||
|
||||
# Capture the first img, insert after headline
|
||||
imgs = soup.findAll('img')
|
||||
print "postprocess_html(): %d images" % len(imgs)
|
||||
if imgs:
|
||||
divTag = Tag(soup, 'div')
|
||||
divTag['class'] = 'image'
|
||||
body = soup.find('body')
|
||||
img = imgs[0]
|
||||
#print "img: \n%s\n" % img.prettify()
|
||||
|
||||
# Table for photo and credit
|
||||
tableTag = Tag(soup,'table')
|
||||
|
||||
# Photo
|
||||
trimgTag = Tag(soup, 'tr')
|
||||
tdimgTag = Tag(soup, 'td')
|
||||
tdimgTag.insert(0,img)
|
||||
trimgTag.insert(0,tdimgTag)
|
||||
tableTag.insert(0,trimgTag)
|
||||
|
||||
# Credit
|
||||
trcreditTag = Tag(soup, 'tr')
|
||||
|
||||
tdcreditTag = Tag(soup, 'td')
|
||||
tdcreditTag['class'] = 'credit'
|
||||
credit = soup.find('td',{'class':'photoCredit'})
|
||||
if credit:
|
||||
tdcreditTag.insert(0,NavigableString(credit.renderContents()))
|
||||
else:
|
||||
credit = img['credit']
|
||||
if credit:
|
||||
tdcreditTag.insert(0,NavigableString(credit))
|
||||
else:
|
||||
tdcreditTag.insert(0,NavigableString(''))
|
||||
|
||||
trcreditTag.insert(0,tdcreditTag)
|
||||
tableTag.insert(1,trcreditTag)
|
||||
dtc = 0
|
||||
divTag.insert(dtc,tableTag)
|
||||
dtc += 1
|
||||
|
||||
if False:
|
||||
# Add the caption in the table
|
||||
tableCaptionTag = Tag(soup,'caption')
|
||||
tableCaptionTag.insert(0,soup.find('td',{'class':'photoCredit'}).renderContents())
|
||||
tableTag.insert(1,tableCaptionTag)
|
||||
divTag.insert(dtc,tableTag)
|
||||
dtc += 1
|
||||
body.insert(1,divTag)
|
||||
else:
|
||||
# Add the caption below the table
|
||||
#print "Looking for caption in this soup:\n%s" % img.prettify()
|
||||
captionTag = Tag(soup,'p')
|
||||
captionTag['class'] = 'caption'
|
||||
if hasattr(img,'alt') and img['alt']:
|
||||
captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['alt']))
|
||||
divTag.insert(dtc, captionTag)
|
||||
dtc += 1
|
||||
else:
|
||||
try:
|
||||
captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['cutline']))
|
||||
divTag.insert(dtc, captionTag)
|
||||
dtc += 1
|
||||
except:
|
||||
pass
|
||||
|
||||
hrTag = Tag(soup, 'hr')
|
||||
divTag.insert(dtc, hrTag)
|
||||
dtc += 1
|
||||
|
||||
# Delete <div id="applyMainStoryPhoto"
|
||||
photoJunk = soup.find('div',{'id':'applyMainStoryPhoto'})
|
||||
if photoJunk:
|
||||
photoJunk.extract()
|
||||
|
||||
# Insert img after headline
|
||||
tag = body.find(True)
|
||||
insertLoc = 0
|
||||
headline_found = False
|
||||
while True:
|
||||
# Scan the top-level tags
|
||||
insertLoc += 1
|
||||
if hasattr(tag,'class') and tag['class'] == 'headline':
|
||||
headline_found = True
|
||||
body.insert(insertLoc,divTag)
|
||||
break
|
||||
tag = tag.nextSibling
|
||||
if not tag:
|
||||
break
|
||||
|
||||
if not headline_found:
|
||||
# Monolithic <div> - restructure
|
||||
tag = body.find(True)
|
||||
while True:
|
||||
insertLoc += 1
|
||||
try:
|
||||
if hasattr(tag,'class') and tag['class'] == 'headline':
|
||||
headline_found = True
|
||||
tag.insert(insertLoc,divTag)
|
||||
break
|
||||
except:
|
||||
pass
|
||||
tag = tag.next
|
||||
if not tag:
|
||||
break
|
||||
|
||||
# Yank out headline, img and caption
|
||||
headline = body.find('h2','headline')
|
||||
img = body.find('div','image')
|
||||
caption = body.find('p''class')
|
||||
|
||||
# body(0) is calibre_navbar
|
||||
# body(1) is <div class="item">
|
||||
|
||||
btc = 1
|
||||
headline.extract()
|
||||
body.insert(1, headline)
|
||||
btc += 1
|
||||
if img:
|
||||
img.extract()
|
||||
body.insert(btc, img)
|
||||
btc += 1
|
||||
if caption:
|
||||
caption.extract()
|
||||
body.insert(btc, caption)
|
||||
btc += 1
|
||||
|
||||
if len(imgs) > 1:
|
||||
if True:
|
||||
[img.extract() for img in imgs[1:]]
|
||||
else:
|
||||
# Format the remaining images
|
||||
# This doesn't work yet
|
||||
for img in imgs[1:]:
|
||||
print "img:\n%s\n" % img.prettify()
|
||||
divTag = Tag(soup, 'div')
|
||||
divTag['class'] = 'image'
|
||||
|
||||
# Table for photo and credit
|
||||
tableTag = Tag(soup,'table')
|
||||
|
||||
# Photo
|
||||
trimgTag = Tag(soup, 'tr')
|
||||
tdimgTag = Tag(soup, 'td')
|
||||
tdimgTag.insert(0,img)
|
||||
trimgTag.insert(0,tdimgTag)
|
||||
tableTag.insert(0,trimgTag)
|
||||
|
||||
# Credit
|
||||
trcreditTag = Tag(soup, 'tr')
|
||||
|
||||
tdcreditTag = Tag(soup, 'td')
|
||||
tdcreditTag['class'] = 'credit'
|
||||
try:
|
||||
tdcreditTag.insert(0,NavigableString(img['credit']))
|
||||
except:
|
||||
tdcreditTag.insert(0,NavigableString(''))
|
||||
trcreditTag.insert(0,tdcreditTag)
|
||||
tableTag.insert(1,trcreditTag)
|
||||
divTag.insert(0,tableTag)
|
||||
soup.img.replaceWith(divTag)
|
||||
|
||||
return soup
|
||||
|
||||
def postprocess_book(self, oeb, opts, log) :
|
||||
|
||||
def extract_byline(href) :
|
||||
# <meta name="byline" content=
|
||||
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
|
||||
byline = soup.find('div',attrs={'class':'byline'})
|
||||
if byline:
|
||||
byline['class'] = 'byline'
|
||||
# Replace comma with middot
|
||||
byline.contents[0].replaceWith(re.sub(u",", u" ·",
|
||||
byline.renderContents(encoding=None)))
|
||||
return byline.renderContents(encoding=None)
|
||||
else :
|
||||
paras = soup.findAll(text=True)
|
||||
for para in paras:
|
||||
if para.startswith("Copyright"):
|
||||
return para[len('Copyright xxxx '):para.find('.')]
|
||||
return None
|
||||
|
||||
def extract_description(href) :
|
||||
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
|
||||
description = soup.find('meta',attrs={'name':'description'})
|
||||
if description :
|
||||
return self.massageNCXText(description['content'])
|
||||
else:
|
||||
# Take first paragraph of article
|
||||
articleBody = soup.find('div',attrs={'id':['articleBody','item']})
|
||||
if articleBody:
|
||||
paras = articleBody.findAll('p')
|
||||
for p in paras:
|
||||
if p.renderContents() > '' :
|
||||
return self.massageNCXText(self.tag_to_string(p,use_alt=False))
|
||||
else:
|
||||
print "Didn't find <div id='articleBody'> in this soup:\n%s" % soup.prettify()
|
||||
return None
|
||||
|
||||
# Method entry point here
|
||||
# Single section toc looks different than multi-section tocs
|
||||
if oeb.toc.depth() == 2 :
|
||||
for article in oeb.toc :
|
||||
if article.author is None :
|
||||
article.author = extract_byline(article.href)
|
||||
if article.description is None :
|
||||
article.description = extract_description(article.href)
|
||||
elif oeb.toc.depth() == 3 :
|
||||
for section in oeb.toc :
|
||||
for article in section :
|
||||
article.author = extract_byline(article.href)
|
||||
'''
|
||||
if article.author is None :
|
||||
article.author = self.massageNCXText(extract_byline(article.href))
|
||||
else:
|
||||
article.author = self.massageNCXText(article.author)
|
||||
'''
|
||||
if article.description is None :
|
||||
article.description = extract_description(article.href)
|
||||
|
||||
def strip_anchors(self,soup):
|
||||
paras = soup.findAll(True)
|
||||
for para in paras:
|
||||
aTags = para.findAll('a')
|
||||
for a in aTags:
|
||||
if a.img is None:
|
||||
a.replaceWith(a.renderContents().decode('cp1252','replace'))
|
||||
return soup
|
||||
|
39
recipes/vitalia.recipe
Normal file
@ -0,0 +1,39 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Tomas Latal<latal.tomas at gmail.com>'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class VitaliaCZ(BasicNewsRecipe):
    """Recipe for Vitalia.cz, built from its news and articles RSS feeds."""

    # --- metadata -------------------------------------------------------
    title = 'Vitalia'
    __author__ = 'Tomas Latal'
    __version__ = '1.0'
    __date__ = '30 April 2011'
    description = u'Aktuality a \u010dl\xe1nky z Vitalia.cz'
    publisher = 'Internet Info s.r.o.'
    category = 'zdravi, vztahy, wellness, CZ'
    language = 'cs'
    publication_type = 'newsportal'

    # --- download settings ----------------------------------------------
    oldest_article = 1
    max_articles_per_feed = 10
    encoding = 'utf8'
    no_stylesheets = True
    remove_javascript = True

    feeds = [
        (u'Aktuality', 'http://www.vitalia.cz/rss/aktuality/'),
        (u'\u010cl\xe1nky', u'http://www.vitalia.cz/rss/clanky/'),
    ]

    # --- styling ------------------------------------------------------------
    # One CSS rule per literal; adjacent literals are joined by the compiler
    # into a single string, each rule separated from the next by one space.
    extra_css = (
        'p.perex{font-size: 1.2em; margin: 0 0 10px 0; line-height: 1.4; padding: 0 0 10px 0; font-weight: bold;} '
        'p.perex img {display:none;} '
        'span.author {font-size:0.8em; font-style:italic} '
        '.urs div.rs-tip-major {padding:0.5em; background: #e0e0e0 none repeat scroll 0 0;border: 1px solid #909090;} '
        '.urs p {margin: 0 0 0.8em 0;}'
    )

    # --- page clean-up ----------------------------------------------------
    remove_tags_before = {'id': 'main'}
    remove_tags_after = [{'id': 'main'}]
    remove_tags = [
        {'attrs': {'class': [
            'author clear',
            'tags-rubrics',
            'box border style1 links clear',
            'enquiry clear',
            'serial',
            'box border style1 TitleList',
            'breadcrumb clear',
            'article-discussion box border style1 monitoringComponentArticle',
            'link-more border prev-next clear',
        ]}},
        {'id': [
            'discussionList', 'similarItems', 'sidebar', 'footer', 'opl',
            'promo-box',
        ]},
    ]
|
115
recipes/volksrant_sub.recipe
Normal file
@ -0,0 +1,115 @@
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Volkskrant_full(BasicNewsRecipe):
|
||||
# This recipe will download the Volkskrant newspaper,
|
||||
# from the subscribers site. It requires a password.
|
||||
# Known issues are: articles that are spread out over
|
||||
# multiple pages will appear multiple times. Pages
|
||||
# that contain only adverts will appear, but empty.
|
||||
# The supplement 'Volkskrant Magazine' on saturday
|
||||
# is currently not downloaded.
|
||||
# You can set a manual date, to download an archived
|
||||
# newspaper. Volkskrant stores over a month at the
|
||||
# moment of writing. To do so I suggest you unmark
|
||||
# the date on the line below, and insert it in the title. Then
|
||||
# follow the instructions marked further below.
|
||||
|
||||
title = 'De Volkskrant (subscription)' # [za, 13 nov 2010]'
|
||||
__author__ = u'Selcal'
|
||||
description = u"Volkskrant"
|
||||
oldest_article = 30
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
language = 'nl'
|
||||
use_embedded_content = False
|
||||
simultaneous_downloads = 1
|
||||
delay = 1
|
||||
needs_subscription = True
|
||||
# Set RETRIEVEDATE to 'yyyymmdd' to load an older
|
||||
# edition. Otherwise keep '%Y%m%d'
|
||||
# When setting a manual date, unmark and add the date
|
||||
# to the title above, and unmark the timefmt line to stop
|
||||
# Calibre from adding today's date in addition.
|
||||
|
||||
# timefmt = ''
|
||||
RETRIEVEDATE = strftime('%Y%m%d')
|
||||
INDEX_MAIN = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/#text'
|
||||
INDEX_ARTICLE = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/'
|
||||
LOGIN = 'http://www.volkskrant.nl/vk/user/loggedIn.do'
|
||||
remove_tags = [dict(name='address')]
|
||||
cover_url = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/page.jpg'
|
||||
|
||||
def get_browser(self):
    """Return a browser, logged in to the subscriber site when possible.

    The browser is obtained from ``BasicNewsRecipe.get_browser``. If both
    ``self.username`` and ``self.password`` are set, the page at
    ``self.LOGIN`` is opened, its first form is filled in with the
    credentials and submitted. Otherwise the browser is returned as-is.
    """
    # The base-class method is called through the class, so the instance
    # must be passed explicitly; the original call omitted ``self`` and
    # raised TypeError.
    br = BasicNewsRecipe.get_browser(self)

    if self.username is not None and self.password is not None:
        br.open(self.LOGIN)
        br.select_form(nr = 0)  # the login form is the first form on the page
        br['username'] = self.username
        br['password'] = self.password
        br.submit()
    return br
|
||||
|
||||
def parse_index(self):
    """Build the ``[(section title, [article dict, ...]), ...]`` index.

    The main index page is loaded first; every ``<option>`` inside the
    ``<td id="select_page_top">`` element names one newspaper page. For each
    page its index is loaded, and each ``<area>`` element on it is resolved
    to an article: the title comes from the matching ``<div>``'s ``title``
    attribute (text before the first ':'), the URL from the ``src`` of the
    first ``<frame>`` of the article frameset.

    Every page is appended to the result, even when no article was found
    for it (or when its index could not be loaded).

    Raises:
        Exception: when the main index cannot be loaded or does not contain
            the page selector.
    """
    def strip_title(_title):
        # Keep only the text in front of the first ':'; a title without a
        # colon is returned unchanged instead of running off its end.
        return _title.partition(':')[0]

    def load_soup(url, announce_retry=False):
        # Up to five attempts; None signals that all of them failed so the
        # caller never continues with a stale or unbound soup.
        for _attempt in range(5):
            try:
                return self.index_to_soup(url)
            except Exception:
                if announce_retry:
                    print('(Retrying URL load)')
        return None

    krant = []

    main_page = load_soup(self.INDEX_MAIN)
    if main_page is None:
        raise Exception('Could not load main index: ' + self.INDEX_MAIN)
    mainsoup = main_page.find('td', attrs={'id': 'select_page_top'})
    if mainsoup is None:
        raise Exception('Page selector not found in main index: ' + self.INDEX_MAIN)

    edition_base = 'http://www.volkskrant.nl/vk-online/VK/' + self.RETRIEVEDATE + '___/'
    date = strftime(' %B %Y')

    for option in mainsoup.findAll('option'):
        articles = []
        _INDEX_ARTICLE = edition_base + option['value'] + '/'
        _INDEX = _INDEX_ARTICLE + '#text'

        page = load_soup(_INDEX)
        # A page that could not be loaded is listed without articles.
        areas = page.findAll('area') if page is not None else []

        for item in areas:
            art_nr = item['class']
            attrname = art_nr[0:12] + '_section' + option['value'][0:5] + '_' + art_nr[26:]
            index_title = page.find('div', attrs={'class': attrname})
            if index_title is None:
                continue

            title = strip_title(index_title['title'])
            if title == '':
                # Untitled entries are never listed, so skip the download.
                continue

            _ARTICLE = _INDEX_ARTICLE + attrname + '.html#text'
            souparticle = load_soup(_ARTICLE, announce_retry=True)
            if souparticle is None:
                continue
            frames = souparticle.findAll('frame')
            if not frames:
                continue

            # The first frame's src ends in a 12-character suffix that is
            # swapped for '_text.html' to address the article text.
            headerurl = frames[0]['src']
            url = _INDEX_ARTICLE + headerurl[:-12] + '_text.html'

            articles.append({
                'title': title,
                'date': date,
                'url': url,
                'description': '',
            })
        krant.append((option.string, articles))
    return krant
|
||||
|
20
recipes/welt_der_physik.recipe
Normal file
@ -0,0 +1,20 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    # Recipe for 'Welt der Physik', built from the site's RSS feed.

    title = u'Welt der Physik'
    __author__ = 'schuster'
    language = 'de'

    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    # This attribute was spelled 'remove_tags_befor', which the base class
    # does not read, so the filter was never applied. A single dict is used
    # because that is the documented form for this attribute.
    remove_tags_before = dict(name='div', attrs={'class':'inhalt_bild_text_printonly'})
    remove_tags_after = [dict(name='span', attrs={'class':'clearinhalt_bild'})]
    # NOTE(review): 'naservice' is listed both as an id and as a tag name, and
    # the id list contains an empty string; both look unintended -- confirm.
    remove_tags = [dict(attrs={'class':['invisible', 'searchfld', 'searchbtn', 'topnavi', 'topsearch']}),
                   dict(id=['naservice', 'phservicemenu', '',]),
                   dict(name=['naservice'])]

    feeds = [(u'Nachrichten und Neuigkeiten', u'http://www.weltderphysik.de/rss/alles.xml')]
|
53
recipes/ziuaveche.recipe
Normal file
@ -0,0 +1,53 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2011, Silviu Cotoar\u0103'
|
||||
'''
|
||||
ziuaveche.ro
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class ZiuaVeche(BasicNewsRecipe):
    # Recipe for ziuaveche.ro, built from the site's RSS feed.

    # --- identification -------------------------------------------------
    title = u'Ziua Veche'
    __author__ = u'Silviu Cotoar\u0103'
    publisher = 'Ziua Veche'
    description = 'Cotidian online'
    category = 'Ziare,Cotidiane,Stiri'
    language = 'ro'
    cover_url = 'http://www.ziuaveche.ro/wp-content/themes/tema/images/zv-logo-alb-old.png'

    # --- download settings ----------------------------------------------
    encoding = 'utf-8'
    oldest_article = 5
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # Metadata handed to the conversion step, taken from the fields above.
    conversion_options = dict(
        comments=description,
        tags=category,
        language=language,
        publisher=publisher,
    )

    # --- page clean-up --------------------------------------------------
    # Only the post container is kept; the like-plugin block is removed and
    # also marks the point after which everything is dropped.
    keep_only_tags = [{'name': 'div', 'attrs': {'id': 'singlePost'}}]
    remove_tags = [{'name': 'div', 'attrs': {'id': 'LikePluginPagelet'}}]
    remove_tags_after = [{'name': 'div', 'attrs': {'id': 'LikePluginPagelet'}}]

    feeds = [(u'Feeds', u'http://www.ziuaveche.ro/feed/rss')]

    def preprocess_html(self, soup):
        # Delegate image fix-ups to the base-class helper and return its result.
        fixed = self.adeify_images(soup)
        return fixed
|
@ -41,14 +41,19 @@ authors_completer_append_separator = False
|
||||
#: Author sort name algorithm
|
||||
# The algorithm used to copy author to author_sort
|
||||
# Possible values are:
|
||||
# invert: use "fn ln" -> "ln, fn" (the default algorithm)
|
||||
# invert: use "fn ln" -> "ln, fn"
|
||||
# copy : copy author to author_sort without modification
|
||||
# comma : use 'copy' if there is a ',' in the name, otherwise use 'invert'
|
||||
# nocomma : "fn ln" -> "ln fn" (without the comma)
|
||||
# When this tweak is changed, the author_sort values stored with each author
|
||||
# must be recomputed by right-clicking on an author in the left-hand tags pane,
|
||||
# selecting 'manage authors', and pressing 'Recalculate all author sort values'.
|
||||
author_sort_copy_method = 'invert'
|
||||
# The author name suffixes are words that are ignored when they occur at the
|
||||
# end of an author name. The case of the suffix is ignored and trailing
|
||||
# periods are automatically handled.
|
||||
author_sort_copy_method = 'comma'
|
||||
author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd',
|
||||
'MD', 'M.D', 'I', 'II', 'III', 'IV')
|
||||
|
||||
#: Use author sort in Tag Browser
|
||||
# Set which author field to display in the tags pane (the list of authors,
|
||||
@ -118,6 +123,7 @@ sort_columns_at_startup = None
|
||||
# timestamp default if not set: dd MMM yyyy
|
||||
gui_pubdate_display_format = 'MMM yyyy'
|
||||
gui_timestamp_display_format = 'dd MMM yyyy'
|
||||
gui_last_modified_display_format = 'dd MMM yyyy'
|
||||
|
||||
#: Control sorting of titles and series in the library display
|
||||
# Control title and series sorting in the library view. If set to
|
||||
@ -266,26 +272,6 @@ max_content_server_tags_shown=5
|
||||
content_server_will_display = ['*']
|
||||
content_server_wont_display = []
|
||||
|
||||
#: Set custom metadata fields that the book details panel will or will not display.
|
||||
# book_details_will_display is a list of custom fields to be displayed.
|
||||
# book_details_wont_display is a list of custom fields not to be displayed.
|
||||
# wont_display has priority over will_display.
|
||||
# The special value '*' means all custom fields. The value [] means no entries.
|
||||
# Defaults:
|
||||
# book_details_will_display = ['*']
|
||||
# book_details_wont_display = []
|
||||
# Examples:
|
||||
# To display only the custom fields #mytags and #genre:
|
||||
# book_details_will_display = ['#mytags', '#genre']
|
||||
# book_details_wont_display = []
|
||||
# To display all fields except #mycomments:
|
||||
# book_details_will_display = ['*']
|
||||
# book_details_wont_display['#mycomments']
|
||||
# As above, this tweak affects only display of custom fields. The standard
|
||||
# fields are not affected
|
||||
book_details_will_display = ['*']
|
||||
book_details_wont_display = []
|
||||
|
||||
#: Set the maximum number of sort 'levels'
|
||||
# Set the maximum number of sort 'levels' that calibre will use to resort the
|
||||
# library after certain operations such as searches or device insertion. Each
|
||||
|
BIN
resources/images/drm-locked.png
Normal file
After Width: | Height: | Size: 1.6 KiB |
BIN
resources/images/drm-unlocked.png
Normal file
After Width: | Height: | Size: 1.6 KiB |
Before Width: | Height: | Size: 6.3 KiB |
BIN
resources/images/identifiers.png
Normal file
After Width: | Height: | Size: 705 B |
@ -7,17 +7,30 @@ CREATE TABLE books ( id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
title TEXT NOT NULL DEFAULT 'Unknown' COLLATE NOCASE,
|
||||
sort TEXT COLLATE NOCASE,
|
||||
timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
uri TEXT,
|
||||
series_index INTEGER NOT NULL DEFAULT 1,
|
||||
pubdate TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
series_index REAL NOT NULL DEFAULT 1.0,
|
||||
author_sort TEXT COLLATE NOCASE,
|
||||
isbn TEXT DEFAULT "" COLLATE NOCASE,
|
||||
path TEXT NOT NULL DEFAULT ""
|
||||
);
|
||||
lccn TEXT DEFAULT "" COLLATE NOCASE,
|
||||
path TEXT NOT NULL DEFAULT "",
|
||||
flags INTEGER NOT NULL DEFAULT 1
|
||||
, uuid TEXT, has_cover BOOL DEFAULT 0, last_modified TIMESTAMP NOT NULL DEFAULT "2000-01-01 00:00:00+00:00");
|
||||
-- Link table between books and authors; each (book, author) pair is unique.
CREATE TABLE books_authors_link (
    id     INTEGER PRIMARY KEY,
    book   INTEGER NOT NULL,
    author INTEGER NOT NULL,
    UNIQUE (book, author)
);
|
||||
-- Link table between books and languages; each (book, lang_code) pair is
-- unique. item_order defaults to 0.
CREATE TABLE books_languages_link (
    id         INTEGER PRIMARY KEY,
    book       INTEGER NOT NULL,
    lang_code  INTEGER NOT NULL,
    item_order INTEGER NOT NULL DEFAULT 0,
    UNIQUE (book, lang_code)
);
|
||||
-- Per-book named values; a book has at most one row per name.
-- NOTE(review): `NON NULL` is not the `NOT NULL` constraint keyword, so these
-- columns are presumably not protected against NULL -- confirm against the
-- SQLite CREATE TABLE grammar before relying on it.
CREATE TABLE books_plugin_data (
    id   INTEGER PRIMARY KEY,
    book INTEGER NON NULL,
    name TEXT NON NULL,
    val  TEXT NON NULL,
    UNIQUE (book, name)
);
|
||||
CREATE TABLE books_publishers_link ( id INTEGER PRIMARY KEY,
|
||||
book INTEGER NOT NULL,
|
||||
publisher INTEGER NOT NULL,
|
||||
@ -49,11 +62,51 @@ CREATE TABLE conversion_options ( id INTEGER PRIMARY KEY,
|
||||
data BLOB NOT NULL,
|
||||
UNIQUE(format,book)
|
||||
);
|
||||
-- Definitions of custom columns; label is unique.
CREATE TABLE custom_columns (
    id              INTEGER PRIMARY KEY AUTOINCREMENT,
    label           TEXT NOT NULL,
    name            TEXT NOT NULL,
    datatype        TEXT NOT NULL,
    mark_for_delete BOOL DEFAULT 0 NOT NULL,
    editable        BOOL DEFAULT 1 NOT NULL,
    display         TEXT DEFAULT "{}" NOT NULL,
    is_multiple     BOOL DEFAULT 0 NOT NULL,
    normalized      BOOL NOT NULL,
    UNIQUE (label)
);
|
||||
-- One row per (book, format) pair, with the format's uncompressed size and name.
-- NOTE(review): `NON NULL` is not the `NOT NULL` constraint keyword, so these
-- columns are presumably not protected against NULL -- confirm against the
-- SQLite CREATE TABLE grammar before relying on it.
CREATE TABLE data (
    id                INTEGER PRIMARY KEY,
    book              INTEGER NON NULL,
    format            TEXT NON NULL COLLATE NOCASE,
    uncompressed_size INTEGER NON NULL,
    name              TEXT NON NULL,
    UNIQUE (book, format)
);
|
||||
-- Feed definitions: a unique title and its script text.
CREATE TABLE feeds (
    id     INTEGER PRIMARY KEY,
    title  TEXT NOT NULL,
    script TEXT NOT NULL,
    UNIQUE (title)
);
|
||||
-- Typed identifiers per book; a book has at most one value per type, and
-- type defaults to "isbn".
-- NOTE(review): `NON NULL` is not the `NOT NULL` constraint keyword, so these
-- columns are presumably not protected against NULL -- confirm against the
-- SQLite CREATE TABLE grammar before relying on it.
CREATE TABLE identifiers (
    id   INTEGER PRIMARY KEY,
    book INTEGER NON NULL,
    type TEXT NON NULL DEFAULT "isbn" COLLATE NOCASE,
    val  TEXT NON NULL COLLATE NOCASE,
    UNIQUE (book, type)
);
|
||||
-- Language codes, unique and compared case-insensitively.
-- NOTE(review): `NON NULL` is not the `NOT NULL` constraint keyword, so
-- lang_code is presumably not protected against NULL -- confirm against the
-- SQLite CREATE TABLE grammar before relying on it.
CREATE TABLE languages (
    id        INTEGER PRIMARY KEY,
    lang_code TEXT NON NULL COLLATE NOCASE,
    UNIQUE (lang_code)
);
|
||||
-- Holds unique uuid values for the library.
CREATE TABLE library_id (
    id   INTEGER PRIMARY KEY,
    uuid TEXT NOT NULL,
    UNIQUE (uuid)
);
|
||||
-- Set of book ids; each book can be listed at most once.
CREATE TABLE metadata_dirtied (
    id   INTEGER PRIMARY KEY,
    book INTEGER NOT NULL,
    UNIQUE (book)
);
|
||||
-- Key/value store; keys are unique.
-- NOTE(review): `NON NULL` is not the `NOT NULL` constraint keyword, so these
-- columns are presumably not protected against NULL -- confirm against the
-- SQLite CREATE TABLE grammar before relying on it.
CREATE TABLE preferences (
    id  INTEGER PRIMARY KEY,
    key TEXT NON NULL,
    val TEXT NON NULL,
    UNIQUE (key)
);
|
||||
CREATE TABLE publishers ( id INTEGER PRIMARY KEY,
|
||||
name TEXT NOT NULL COLLATE NOCASE,
|
||||
sort TEXT COLLATE NOCASE,
|
||||
@ -72,34 +125,143 @@ CREATE TABLE tags ( id INTEGER PRIMARY KEY,
|
||||
name TEXT NOT NULL COLLATE NOCASE,
|
||||
UNIQUE (name)
|
||||
);
|
||||
CREATE TABLE data ( id INTEGER PRIMARY KEY,
|
||||
book INTEGER NON NULL,
|
||||
format TEXT NON NULL COLLATE NOCASE,
|
||||
uncompressed_size INTEGER NON NULL,
|
||||
name TEXT NON NULL,
|
||||
UNIQUE(book, format)
|
||||
);
|
||||
|
||||
CREATE VIEW meta AS
|
||||
SELECT id, title,
|
||||
(SELECT concat(name) FROM authors WHERE authors.id IN (SELECT author from books_authors_link WHERE book=books.id)) authors,
|
||||
(SELECT name FROM publishers WHERE publishers.id IN (SELECT publisher from books_publishers_link WHERE book=books.id)) publisher,
|
||||
(SELECT rating FROM ratings WHERE ratings.id IN (SELECT rating from books_ratings_link WHERE book=books.id)) rating,
|
||||
timestamp,
|
||||
(SELECT MAX(uncompressed_size) FROM data WHERE book=books.id) size,
|
||||
(SELECT concat(name) FROM tags WHERE tags.id IN (SELECT tag from books_tags_link WHERE book=books.id)) tags,
|
||||
(SELECT text FROM comments WHERE book=books.id) comments,
|
||||
(SELECT name FROM series WHERE series.id IN (SELECT series FROM books_series_link WHERE book=books.id)) series,
|
||||
series_index,
|
||||
sort,
|
||||
author_sort,
|
||||
(SELECT concat(format) FROM data WHERE data.book=books.id) formats,
|
||||
isbn
|
||||
FROM books;
|
||||
SELECT id, title,
|
||||
(SELECT sortconcat(bal.id, name) FROM books_authors_link AS bal JOIN authors ON(author = authors.id) WHERE book = books.id) authors,
|
||||
(SELECT name FROM publishers WHERE publishers.id IN (SELECT publisher from books_publishers_link WHERE book=books.id)) publisher,
|
||||
(SELECT rating FROM ratings WHERE ratings.id IN (SELECT rating from books_ratings_link WHERE book=books.id)) rating,
|
||||
timestamp,
|
||||
(SELECT MAX(uncompressed_size) FROM data WHERE book=books.id) size,
|
||||
(SELECT concat(name) FROM tags WHERE tags.id IN (SELECT tag from books_tags_link WHERE book=books.id)) tags,
|
||||
(SELECT text FROM comments WHERE book=books.id) comments,
|
||||
(SELECT name FROM series WHERE series.id IN (SELECT series FROM books_series_link WHERE book=books.id)) series,
|
||||
series_index,
|
||||
sort,
|
||||
author_sort,
|
||||
(SELECT concat(format) FROM data WHERE data.book=books.id) formats,
|
||||
isbn,
|
||||
path,
|
||||
lccn,
|
||||
pubdate,
|
||||
flags,
|
||||
uuid
|
||||
FROM books;
|
||||
-- One row per author: number of linked books and the average of the
-- non-zero ratings of those books.
CREATE VIEW tag_browser_authors AS
SELECT
    id,
    name,
    (SELECT COUNT(id)
     FROM books_authors_link
     WHERE author = authors.id) AS count,
    (SELECT AVG(ratings.rating)
     FROM books_authors_link AS tl
     INNER JOIN books_ratings_link AS bl ON bl.book = tl.book
     INNER JOIN ratings ON ratings.id = bl.rating
     WHERE tl.author = authors.id
       AND ratings.rating <> 0) AS avg_rating,
    sort AS sort
FROM authors;
|
||||
-- Same shape as tag_browser_authors, but only books accepted by
-- books_list_filter() are counted and averaged.
-- books_list_filter() is not defined in this script; presumably it is
-- registered by the application -- verify.
CREATE VIEW tag_browser_filtered_authors AS
SELECT
    id,
    name,
    (SELECT COUNT(books_authors_link.id)
     FROM books_authors_link
     WHERE author = authors.id
       AND books_list_filter(book)) AS count,
    (SELECT AVG(ratings.rating)
     FROM books_authors_link AS tl
     INNER JOIN books_ratings_link AS bl ON bl.book = tl.book
     INNER JOIN ratings ON ratings.id = bl.rating
     WHERE tl.author = authors.id
       AND ratings.rating <> 0
       AND books_list_filter(bl.book)) AS avg_rating,
    sort AS sort
FROM authors;
|
||||
-- One row per publisher, restricted to books accepted by books_list_filter():
-- book count and average of the non-zero ratings.
-- books_list_filter() is not defined in this script; presumably it is
-- registered by the application -- verify.
CREATE VIEW tag_browser_filtered_publishers AS
SELECT
    id,
    name,
    (SELECT COUNT(books_publishers_link.id)
     FROM books_publishers_link
     WHERE publisher = publishers.id
       AND books_list_filter(book)) AS count,
    (SELECT AVG(ratings.rating)
     FROM books_publishers_link AS tl
     INNER JOIN books_ratings_link AS bl ON bl.book = tl.book
     INNER JOIN ratings ON ratings.id = bl.rating
     WHERE tl.publisher = publishers.id
       AND ratings.rating <> 0
       AND books_list_filter(bl.book)) AS avg_rating,
    name AS sort
FROM publishers;
|
||||
-- One row per rating, restricted to books accepted by books_list_filter().
-- books_list_filter() is not defined in this script; presumably it is
-- registered by the application -- verify.
-- NOTE(review): inside the avg_rating subquery the name `ratings` resolves to
-- the subquery's own FROM entry, so the average does not appear to be tied to
-- the outer row -- confirm intent.
CREATE VIEW tag_browser_filtered_ratings AS
SELECT
    id,
    rating,
    (SELECT COUNT(books_ratings_link.id)
     FROM books_ratings_link
     WHERE rating = ratings.id
       AND books_list_filter(book)) AS count,
    (SELECT AVG(ratings.rating)
     FROM books_ratings_link AS tl
     INNER JOIN books_ratings_link AS bl ON bl.book = tl.book
     INNER JOIN ratings ON ratings.id = bl.rating
     WHERE tl.rating = ratings.id
       AND ratings.rating <> 0
       AND books_list_filter(bl.book)) AS avg_rating,
    rating AS sort
FROM ratings;
|
||||
-- One row per series, restricted to books accepted by books_list_filter():
-- book count, average of the non-zero ratings, and title_sort(name) as the
-- sort key.
-- books_list_filter() and title_sort() are not defined in this script;
-- presumably they are registered by the application -- verify.
CREATE VIEW tag_browser_filtered_series AS
SELECT
    id,
    name,
    (SELECT COUNT(books_series_link.id)
     FROM books_series_link
     WHERE series = series.id
       AND books_list_filter(book)) AS count,
    (SELECT AVG(ratings.rating)
     FROM books_series_link AS tl
     INNER JOIN books_ratings_link AS bl ON bl.book = tl.book
     INNER JOIN ratings ON ratings.id = bl.rating
     WHERE tl.series = series.id
       AND ratings.rating <> 0
       AND books_list_filter(bl.book)) AS avg_rating,
    (title_sort(name)) AS sort
FROM series;
|
||||
-- One row per tag, restricted to books accepted by books_list_filter():
-- book count and average of the non-zero ratings.
-- books_list_filter() is not defined in this script; presumably it is
-- registered by the application -- verify.
CREATE VIEW tag_browser_filtered_tags AS
SELECT
    id,
    name,
    (SELECT COUNT(books_tags_link.id)
     FROM books_tags_link
     WHERE tag = tags.id
       AND books_list_filter(book)) AS count,
    (SELECT AVG(ratings.rating)
     FROM books_tags_link AS tl
     INNER JOIN books_ratings_link AS bl ON bl.book = tl.book
     INNER JOIN ratings ON ratings.id = bl.rating
     WHERE tl.tag = tags.id
       AND ratings.rating <> 0
       AND books_list_filter(bl.book)) AS avg_rating,
    name AS sort
FROM tags;
|
||||
-- One row per publisher: number of linked books and the average of the
-- non-zero ratings of those books.
CREATE VIEW tag_browser_publishers AS
SELECT
    id,
    name,
    (SELECT COUNT(id)
     FROM books_publishers_link
     WHERE publisher = publishers.id) AS count,
    (SELECT AVG(ratings.rating)
     FROM books_publishers_link AS tl
     INNER JOIN books_ratings_link AS bl ON bl.book = tl.book
     INNER JOIN ratings ON ratings.id = bl.rating
     WHERE tl.publisher = publishers.id
       AND ratings.rating <> 0) AS avg_rating,
    name AS sort
FROM publishers;
|
||||
-- One row per rating: number of books linked to it, plus an avg_rating column.
-- NOTE(review): inside the avg_rating subquery the name `ratings` resolves to
-- the subquery's own FROM entry, so the average does not appear to be tied to
-- the outer row -- confirm intent.
CREATE VIEW tag_browser_ratings AS
SELECT
    id,
    rating,
    (SELECT COUNT(id)
     FROM books_ratings_link
     WHERE rating = ratings.id) AS count,
    (SELECT AVG(ratings.rating)
     FROM books_ratings_link AS tl
     INNER JOIN books_ratings_link AS bl ON bl.book = tl.book
     INNER JOIN ratings ON ratings.id = bl.rating
     WHERE tl.rating = ratings.id
       AND ratings.rating <> 0) AS avg_rating,
    rating AS sort
FROM ratings;
|
||||
-- One row per series: number of linked books, the average of the non-zero
-- ratings of those books, and title_sort(name) as the sort key.
-- title_sort() is not defined in this script; presumably it is registered by
-- the application -- verify.
CREATE VIEW tag_browser_series AS
SELECT
    id,
    name,
    (SELECT COUNT(id)
     FROM books_series_link
     WHERE series = series.id) AS count,
    (SELECT AVG(ratings.rating)
     FROM books_series_link AS tl
     INNER JOIN books_ratings_link AS bl ON bl.book = tl.book
     INNER JOIN ratings ON ratings.id = bl.rating
     WHERE tl.series = series.id
       AND ratings.rating <> 0) AS avg_rating,
    (title_sort(name)) AS sort
FROM series;
|
||||
-- One row per tag: number of linked books and the average of the non-zero
-- ratings of those books.
CREATE VIEW tag_browser_tags AS
SELECT
    id,
    name,
    (SELECT COUNT(id)
     FROM books_tags_link
     WHERE tag = tags.id) AS count,
    (SELECT AVG(ratings.rating)
     FROM books_tags_link AS tl
     INNER JOIN books_ratings_link AS bl ON bl.book = tl.book
     INNER JOIN ratings ON ratings.id = bl.rating
     WHERE tl.tag = tags.id
       AND ratings.rating <> 0) AS avg_rating,
    name AS sort
FROM tags;
|
||||
CREATE INDEX authors_idx ON books (author_sort COLLATE NOCASE);
|
||||
CREATE INDEX books_authors_link_aidx ON books_authors_link (author);
|
||||
CREATE INDEX books_authors_link_bidx ON books_authors_link (book);
|
||||
CREATE INDEX books_idx ON books (sort COLLATE NOCASE);
|
||||
CREATE INDEX books_languages_link_aidx ON books_languages_link (lang_code);
|
||||
CREATE INDEX books_languages_link_bidx ON books_languages_link (book);
|
||||
CREATE INDEX books_publishers_link_aidx ON books_publishers_link (publisher);
|
||||
CREATE INDEX books_publishers_link_bidx ON books_publishers_link (book);
|
||||
CREATE INDEX books_ratings_link_aidx ON books_ratings_link (rating);
|
||||
@ -111,32 +273,38 @@ CREATE INDEX books_tags_link_bidx ON books_tags_link (book);
|
||||
CREATE INDEX comments_idx ON comments (book);
|
||||
CREATE INDEX conversion_options_idx_a ON conversion_options (format COLLATE NOCASE);
|
||||
CREATE INDEX conversion_options_idx_b ON conversion_options (book);
|
||||
CREATE INDEX custom_columns_idx ON custom_columns (label);
|
||||
CREATE INDEX data_idx ON data (book);
|
||||
CREATE INDEX formats_idx ON data (format);
|
||||
CREATE INDEX languages_idx ON languages (lang_code COLLATE NOCASE);
|
||||
CREATE INDEX publishers_idx ON publishers (name COLLATE NOCASE);
|
||||
CREATE INDEX series_idx ON series (sort COLLATE NOCASE);
|
||||
CREATE INDEX series_idx ON series (name COLLATE NOCASE);
|
||||
CREATE INDEX tags_idx ON tags (name COLLATE NOCASE);
|
||||
CREATE TRIGGER books_delete_trg
|
||||
AFTER DELETE ON books
|
||||
BEGIN
|
||||
DELETE FROM books_authors_link WHERE book=OLD.id;
|
||||
DELETE FROM books_publishers_link WHERE book=OLD.id;
|
||||
DELETE FROM books_ratings_link WHERE book=OLD.id;
|
||||
DELETE FROM books_series_link WHERE book=OLD.id;
|
||||
DELETE FROM books_tags_link WHERE book=OLD.id;
|
||||
DELETE FROM data WHERE book=OLD.id;
|
||||
DELETE FROM comments WHERE book=OLD.id;
|
||||
DELETE FROM conversion_options WHERE book=OLD.id;
|
||||
AFTER DELETE ON books
|
||||
BEGIN
|
||||
DELETE FROM books_authors_link WHERE book=OLD.id;
|
||||
DELETE FROM books_publishers_link WHERE book=OLD.id;
|
||||
DELETE FROM books_ratings_link WHERE book=OLD.id;
|
||||
DELETE FROM books_series_link WHERE book=OLD.id;
|
||||
DELETE FROM books_tags_link WHERE book=OLD.id;
|
||||
DELETE FROM books_languages_link WHERE book=OLD.id;
|
||||
DELETE FROM data WHERE book=OLD.id;
|
||||
DELETE FROM comments WHERE book=OLD.id;
|
||||
DELETE FROM conversion_options WHERE book=OLD.id;
|
||||
DELETE FROM books_plugin_data WHERE book=OLD.id;
|
||||
DELETE FROM identifiers WHERE book=OLD.id;
|
||||
END;
|
||||
CREATE TRIGGER books_insert_trg
|
||||
AFTER INSERT ON books
|
||||
CREATE TRIGGER books_insert_trg AFTER INSERT ON books
|
||||
BEGIN
|
||||
UPDATE books SET sort=title_sort(NEW.title) WHERE id=NEW.id;
|
||||
UPDATE books SET sort=title_sort(NEW.title),uuid=uuid4() WHERE id=NEW.id;
|
||||
END;
|
||||
CREATE TRIGGER books_update_trg
|
||||
AFTER UPDATE ON books
|
||||
BEGIN
|
||||
UPDATE books SET sort=title_sort(NEW.title) WHERE id=NEW.id;
|
||||
END;
|
||||
AFTER UPDATE ON books
|
||||
BEGIN
|
||||
UPDATE books SET sort=title_sort(NEW.title)
|
||||
WHERE id=NEW.id AND OLD.title <> NEW.title;
|
||||
END;
|
||||
CREATE TRIGGER fkc_comments_insert
|
||||
BEFORE INSERT ON comments
|
||||
BEGIN
|
||||
@ -169,23 +337,41 @@ CREATE TRIGGER fkc_data_update
|
||||
THEN RAISE(ABORT, 'Foreign key violation: book not in books')
|
||||
END;
|
||||
END;
|
||||
CREATE TRIGGER fkc_delete_books_authors_link
|
||||
CREATE TRIGGER fkc_delete_on_authors
|
||||
BEFORE DELETE ON authors
|
||||
BEGIN
|
||||
SELECT CASE
|
||||
WHEN (SELECT COUNT(id) FROM books_authors_link WHERE book=OLD.book) > 0
|
||||
THEN RAISE(ABORT, 'Foreign key violation: author is still referenced')
|
||||
WHEN (SELECT COUNT(id) FROM books_authors_link WHERE author=OLD.id) > 0
|
||||
THEN RAISE(ABORT, 'Foreign key violation: authors is still referenced')
|
||||
END;
|
||||
END;
|
||||
CREATE TRIGGER fkc_delete_books_publishers_link
|
||||
-- Refuse to delete a language that books_languages_link still points at.
CREATE TRIGGER fkc_delete_on_languages
    BEFORE DELETE ON languages
BEGIN
    SELECT CASE
        WHEN EXISTS (SELECT 1
                     FROM books_languages_link
                     WHERE lang_code = OLD.id)
        THEN RAISE(ABORT, 'Foreign key violation: language is still referenced')
    END;
END;
|
||||
-- Despite the "delete" in its name, this trigger fires BEFORE INSERT on
-- books_languages_link: the new row must reference an existing book and an
-- existing language.
CREATE TRIGGER fkc_delete_on_languages_link
    BEFORE INSERT ON books_languages_link
BEGIN
    SELECT CASE
        WHEN NOT EXISTS (SELECT 1 FROM books WHERE id = NEW.book)
        THEN RAISE(ABORT, 'Foreign key violation: book not in books')
        WHEN NOT EXISTS (SELECT 1 FROM languages WHERE id = NEW.lang_code)
        THEN RAISE(ABORT, 'Foreign key violation: lang_code not in languages')
    END;
END;
|
||||
CREATE TRIGGER fkc_delete_on_publishers
|
||||
BEFORE DELETE ON publishers
|
||||
BEGIN
|
||||
SELECT CASE
|
||||
WHEN (SELECT COUNT(id) FROM books_publishers_link WHERE book=OLD.book) > 0
|
||||
THEN RAISE(ABORT, 'Foreign key violation: publisher is still referenced')
|
||||
WHEN (SELECT COUNT(id) FROM books_publishers_link WHERE publisher=OLD.id) > 0
|
||||
THEN RAISE(ABORT, 'Foreign key violation: publishers is still referenced')
|
||||
END;
|
||||
END;
|
||||
CREATE TRIGGER fkc_delete_books_series_link
|
||||
CREATE TRIGGER fkc_delete_on_series
|
||||
BEFORE DELETE ON series
|
||||
BEGIN
|
||||
SELECT CASE
|
||||
@ -193,12 +379,12 @@ CREATE TRIGGER fkc_delete_books_series_link
|
||||
THEN RAISE(ABORT, 'Foreign key violation: series is still referenced')
|
||||
END;
|
||||
END;
|
||||
CREATE TRIGGER fkc_delete_books_tags_link
|
||||
CREATE TRIGGER fkc_delete_on_tags
|
||||
BEFORE DELETE ON tags
|
||||
BEGIN
|
||||
SELECT CASE
|
||||
WHEN (SELECT COUNT(id) FROM books_tags_link WHERE tag=OLD.id) > 0
|
||||
THEN RAISE(ABORT, 'Foreign key violation: tag is still referenced')
|
||||
THEN RAISE(ABORT, 'Foreign key violation: tags is still referenced')
|
||||
END;
|
||||
END;
|
||||
CREATE TRIGGER fkc_insert_books_authors_link
|
||||
@ -267,6 +453,22 @@ CREATE TRIGGER fkc_update_books_authors_link_b
|
||||
THEN RAISE(ABORT, 'Foreign key violation: author not in authors')
|
||||
END;
|
||||
END;
|
||||
-- When the book column of a language link is updated, the new value must
-- reference an existing book.
CREATE TRIGGER fkc_update_books_languages_link_a
    BEFORE UPDATE OF book ON books_languages_link
BEGIN
    SELECT CASE
        WHEN NOT EXISTS (SELECT 1 FROM books WHERE id = NEW.book)
        THEN RAISE(ABORT, 'Foreign key violation: book not in books')
    END;
END;
|
||||
-- When the lang_code column of a language link is updated, the new value
-- must reference an existing language.
CREATE TRIGGER fkc_update_books_languages_link_b
    BEFORE UPDATE OF lang_code ON books_languages_link
BEGIN
    SELECT CASE
        WHEN NOT EXISTS (SELECT 1 FROM languages WHERE id = NEW.lang_code)
        THEN RAISE(ABORT, 'Foreign key violation: lang_code not in languages')
    END;
END;
|
||||
CREATE TRIGGER fkc_update_books_publishers_link_a
|
||||
BEFORE UPDATE OF book ON books_publishers_link
|
||||
BEGIN
|
||||
@ -341,3 +543,4 @@ CREATE TRIGGER series_update_trg
|
||||
BEGIN
|
||||
UPDATE series SET sort=NEW.name WHERE id=NEW.id;
|
||||
END;
|
||||
pragma user_version=20;
|
||||
|