Merge from trunk

Charles Haley 2012-04-29 08:28:13 +02:00
commit 7a07df0413
346 changed files with 129332 additions and 101296 deletions

View File

@ -19,6 +19,315 @@
# new recipes:
# - title:
- version: 0.8.49
date: 2012-04-27
new features:
- title: "Experimental support for generating Amazon's new KF8 format MOBI files"
description: "calibre can now generate Amazon's new KF8 format MOBI files.
To turn on this feature, go to Preferences->Tweaks and click Plugin Tweaks. In the box add:
test_mobi_output_type = 'both'
calibre will now produce MOBI files that have both the old MOBI format and the new KF8 format in them.
To learn more about KF8, see: http://www.amazon.com/gp/feature.html?docId=1000729511
Note that calibre support for KF8 is still experimental and there will likely be bugs. (The tweak line is repeated after this list for easy copying.)"
- title: "Upgrade to using cssutils 0.9.9 for CSS parsing. Improved speed and robustness."
- title: "Show cover size in a tooltip in the conversion dialog"
tickets: [986958]
- title: "Driver for Nook Simple Touch with Glow Light"
tickets: [989264]
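A note on the KF8 feature above: the Plugin Tweaks box accepts Python assignments, so the tweak is the single line quoted in that entry, repeated here for easy copying ('both' asks the MOBI writer to emit a file containing both the old MOBI format and KF8):

# In Preferences->Tweaks->Plugin Tweaks:
test_mobi_output_type = 'both'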
bug fixes:
- title: "Heuristics: When italicizing words do not operate on words not in between HTML tags."
tickets: [986298]
- title: "Fix (I hope) the bulk metadata download process crashing for some people on OS X when clicking the Yes button to apply the updates."
tickets: [986658]
- title: "Fix tooltip not being updated in the book details panel when pasting in a new cover"
tickets: [986958]
- title: "Cover Browser: Wrap the title on space only, not in between words."
tickets: [986516]
- title: "Edit metadata dialog: If a permission denied error occurs when clicking the next or prev buttons, stay on the current book."
tickets: [986903]
- title: "Fix heuristics not removing unnecessary hyphens from the end of lines."
tickets: [822744]
improved recipes:
- Metro Nieuws NL
- Der Tagesspiegel
new recipes:
- title: Berria
author: Alayn Gortazar
- title: Sol Haber
author: Onur Gungor
- title: Telam
author: Darko Miletic
- title: Richmond Times-Dispatch
author: jde
- version: 0.8.48
date: 2012-04-20
new features:
- title: "Conversion: The search and replace feature has been completely revamped."
description: "You can now use any number of search and replace
expression, not just three. You can also store and load frequently used
sets of search and replace expressions. Also, the wizard generates its
preview in a separate process to protect against crashes/memory leaks."
tickets: [983476,983484,983478]
- title: "Support for the new '.azw3' files that Amazon recently started generating. calibre will now detect them as ebooks. It can also view/convert them, if they are DRM free."
- title: "Drivers for Samsung Galaxy ACE GT-S5830L and HTC One X"
tickets: [981185]
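To illustrate the revamped search and replace above: each rule pairs a regular expression with a replacement, applied to the intermediate HTML during conversion. A hypothetical rule (not one that ships with calibre) that deletes empty paragraphs, shown as the equivalent Python:

import re
html = '<p>Keep me</p><p>   </p>'
print re.sub(r'<p>\s*</p>', '', html)  # -> <p>Keep me</p>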
bug fixes:
- title: "Get Books: Support the new website design of Barnes & Noble"
- title: "T1 driver: Fix books sent to SD card sometimes resulting problems when deleted."
tickets: [943586]
- title: "Do not allow author names to be set to blank via the Manage authors function. Blank authors are now automatically set to 'Unknown'"
- title: "MOBI Output: Handle background color specified on <td> and <tr> in addition to <table> tags."
tickets: [980813]
- title: "MOBI Output: Fix underline style applied to parent element not getting inherited by <a> children."
tickets: [985711]
improved recipes:
- xkcd
- Metro Nieuws
- Calgary Herald
- Orlando Sentinel
- countryfile
- Heise
new recipes:
- title: Various new Polish news sources
author: fenuks
- title: Various Italian news sources
author: faber1971
- title: Jakarta Globe
author: rty
- title: Acim Bilim Dergisi
author: thomass
- version: 0.8.47
date: 2012-04-13
new features:
- title: "Conversion pipeline: Add support for all the named entities in the HTML 5 spec."
tickets: [976056]
- title: "Support for viewing and converting the Haodoo PDB ebook format"
tickets: [976478]
- title: "Device driver for Laser EB720"
bug fixes:
- title: "Fix regression in automatic adding in 0.8.46 that broke automatic adding if adding of duplicates is enabled and auto convert is also enabled"
tickets: [976336]
- title: 'Fix "Tags" field in advanced search does not obey regex setting'
tickets: [980221]
- title: "EPUB Input: Automatically extract cover image from simple HTML title page that consists of only a single <img> tag, instead of rendering the page"
- title: "Prevent errors when both author and author_sort are used in a template for reading metadata from filenames for files on a device"
- title: "Amazon metadata download: Handle books whose titles start with a bracket."
tickets: [976365]
- title: "Get Books: Fix downloading of purchased books from Baen"
tickets: [975929]
improved recipes:
- Forbes
- Caros Amigos
- Trouw
- Sun UK
- Metro
- Daily Mirror
new recipes:
- title: "Melbourne Herald Sun"
author: Ray Hartley
- title: "Editoriali and Zerocalcare"
author: faber1971
- version: 0.8.46
date: 2012-04-06
new features:
- title: "Auto adding: When automatically adding files from a folder, automatically convert the files to the current output format after adding. This can be turned off via Preferences->Adding Books->Automatic Adding."
tickets: [969053]
- title: "E-book viewer: When reading a MOBI file that is actually a KF8 book, show the format as being KF8"
- title: "Content server: Workaround for android stock browser not support HTTP AUTH."
- title: "Edit metadata dialog: Change the remove unused series button to a clear series button (as the remove unused series function is now automatic)"
- title: "Driver for PocketBook 622."
tickets: [969875]
bug fixes:
- title: "Run metadata downloads in a separate process to workaround memory leaks in third party plugins. Also removes the need to break up bulk metadata downloads into 100 book batches."
- title: "Make tag browser filtering work when capital letters are entered."
- title: "EPUB metadata: Ignore urn:isbn: prefix from ISBN declaration when reading metadata"
- title: "Get books: Fix feedbooks store not showing all available formats"
- title: "KF8 Input: When the KF8 book has no metadata ToC, try to extract the ToC from the HTML instead."
tickets: [969238]
- title: "Fix regression that broke access to Preferences via the Preferences item in the calibre menu on OS X"
tickets: [969418]
- title: "Fix bug that ignored metadata specified on the command line when using calibredb add"
improved recipes:
- OReilly Premium
- Real Clear
- Soldier's Magazine
- Rue89
new recipes:
- title: The Southern Star
author: watou
- title: Buenos Aires Herald
author: Darko Miletic
- version: 0.8.45
date: 2012-03-30
new features:
- title: "E-book viewer: Allow the up and down keys to scroll past section boundaries"
- title: "calibredb: Allow specification of basic metadata on the command line when adding books."
tickets: [951063]
- title: "Driver for Samsung Galaxy Plus GT-I9001"
- title: "KF8 Input: Support KF8 format Amazon book samples."
tickets: [963418]
- title: "When a new plugin is added to calibre for the first time, have its icon (if any) show up even when a device is connected (this can be changed by the user at the time of plugin installation)"
- title: "Add keyboard shortcuts for Bold, Italic and Underline to the comments editor in the edit metadata dialog"
tickets: [963559]
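A sketch of the new calibredb feature mentioned above (ticket 951063); the long option names used here are assumptions for illustration, not confirmed by the entry:

calibredb add --title "An Example Book" --authors "Jane Doe" example.epub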
bug fixes:
- title: "E-book viewer: Fix last read position (and bookmarks in general) being inaccurate for some books."
description: "The technique for marking locations in books used by the viewer has changed. The new technique should be much more accurate than the last one, especially when the font size at which the book is being viewed is changed. Note that this change means that bookmarks created with this release of calibre will not be read by previous calibre versions. On a technical note, the viewer now uses the CFI specification from the EPUB 3 standard for bookmarks."
type: major
- title: "Workarounds for a few regressions in the user interface in 0.8.44 caused by the update to Qt 4.8.0"
- title: "Books list: Preserve the horizontal scroll position when sorting by a column"
- title: "Fix saving to disk and then adding the book back not restoring tags-like custom columns"
- title: "Linux installer: Fix completion for ebook-convert not working."
tickets: [967834]
- title: "MOBI Output: Recognize type=text in addition to type=start guide elements"
- title: "Get Books: Updates to Nexto, Ebookpoint and Woblink stores"
- title: "Fix unable to clear username/password in Fetch news dialog"
- title: "PDF Output: Fix margin specifications not being applied"
- title: "Linux installer: Manually preserve the defaults.list mimetype association file to workaround buggy xdg-desktop-menu implementations in some distros."
tickets: [926559]
- title: "E-book viewer: Fix regression that caused the ebook viewer to stop functioning if it is launched from the main calibre program and then the main calibre program is closed."
tickets: [963960]
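On the CFI-based bookmarks above: an EPUB CFI records a position as a path of element steps plus a character offset, which is why it stays accurate when a font-size change reflows the book. An illustrative CFI in the shape defined by the EPUB 3 specification (not one emitted by calibre):

epubcfi(/6/4[chap01ref]!/4[body01]/10[para05]/3:10)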
improved recipes:
- Our Daily Bread
new recipes:
- title: NRC Handelsblad (free)
author: veezh
- version: 0.8.44
date: 2012-03-23
new features:
- title: "E-book viewer: A whole new full screen mode."
description: "The new mode has no toolbars to distract from the text and the ability to set the width of the column of text via Preferences in the ebook viewer. Click the Fullscreen button on the toolbar in the viewer to enter fullscreen mode (or press the F11 or Ctrl+Shit+F keys)"
type: major
tickets: [959830]
- title: "Copy to Library: If books were auto merged by the copy to library process, popup a message telling the user about it, as otherwise some people forget they have turned on auto merge and accuse calibre of losing their books."
- title: "Unix driver for Ectaco JetBook color"
tickets: [958442]
- title: "Add a link to the 'Adding Books Preferences' in the drop down menu of the Add Books button for easier access and more prominence"
tickets: [958145]
- title: "Smarten punctuation: Add a few more cases for detecting opening and closing quotes"
bug fixes:
- title: "Get Books: Updates to various store plugins to deal with website changes: Amazon Europe, Waterstones, Foyles, B&N, Kobo, Woblink and Empik"
- title: "Catalog generation: Do not error out when generating csv/xml catalogs if the catalog title contains filename invalid characters."
tickets: [960154]
- title: "RTF Output: Ignore corrupted images in the input document, instead of erroring out."
tickets: [959600]
- title: "E-book viewer: Try to preserve page position when the window is resized"
- title: "Fix bug that caused wrong series to be shown when clicking on the first letter of a series group in the Tag Browser"
- title: "Fix calibre not supporting different http and https proxies."
tickets: [960173]
- title: "MOBI Input: Fix regression caused by KF8 support that broke reading of ancient non-Amazon PRC files"
- title: "Fix EPUB to EPUB conversion of an EPUB with obfuscated fonts resulting in the fonts not being readable in Adobe Digital Editions"
tickets: [957527]
- title: "RTF Output: Fix bug that broke conversion to RTF when the input document contains <img> tags with no src attribute."
- title: "Fix regression in 0.8.43 that broke use of general mode templates that ended in a semi-colon."
tickets: [957295]
improved recipes:
- b92
- Various Polish news sources
- Le Monde
- FHM UK
new recipes:
- title: Ivana Milakovic and Klub knjige
author: Darko Miletic
- version: 0.8.43
date: 2012-03-16

View File

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1334868409(BasicNewsRecipe):
title = u'AÇIK BİLİM DERGİSİ'
description = ' Aylık çevrimiçi bilim dergisi'
__author__ = u'thomass'
oldest_article = 30
max_articles_per_feed = 300
auto_cleanup = True
encoding = 'UTF-8'
publisher = 'açık bilim'
category = 'haber, bilim,TR,dergi'
language = 'tr'
publication_type = 'magazine'
conversion_options = {
'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
}
cover_img_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'
masthead_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'
feeds = [(u'Tüm Yayınlar', u'http://www.acikbilim.com/feed')]

View File

@ -9,6 +9,7 @@ class Adventure_zone(BasicNewsRecipe):
no_stylesheets = True
oldest_article = 20
max_articles_per_feed = 100
index='http://www.adventure-zone.info/fusion/'
use_embedded_content=False
preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: '')]
remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
@ -45,6 +46,19 @@ class Adventure_zone(BasicNewsRecipe):
skip_tag = skip_tag.findAll(name='a')
for r in skip_tag:
if r.strong:
word=r.strong.string
if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
word=r.strong.string.lower()
if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
def preprocess_html(self, soup):
footer=soup.find(attrs={'class':'news-footer middle-border'})
if footer and len(footer('a'))>=2:
footer('a')[1].extract()
for item in soup.findAll(style=True):
del item['style']
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup
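A worked example of the print.php URL construction above, with a made-up article link; the find('article_id')+7 offset deliberately keeps the tail of 'article_id', turning it into the 'item_id' query parameter:

href = 'articles.php?article_id=4242'  # hypothetical href
print 'http://www.adventure-zone.info/fusion/print.php?type=A&item' + href[href.find('article_id')+7:]
# -> http://www.adventure-zone.info/fusion/print.php?type=A&item_id=4242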

View File

@ -6,6 +6,7 @@ class Android_com_pl(BasicNewsRecipe):
description = 'Android.com.pl - biggest polish Android site'
category = 'Android, mobile'
language = 'pl'
use_embedded_content=True
cover_url =u'http://upload.wikimedia.org/wikipedia/commons/thumb/d/d7/Android_robot.svg/220px-Android_robot.svg.png'
oldest_article = 8
max_articles_per_feed = 100

recipes/ara_info.recipe Normal file (19 lines added)
View File

@ -0,0 +1,19 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = 'Ruben Pollan <meskio@sindominio.net>'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1335656316(BasicNewsRecipe):
title = u'AraInfo.org'
__author__ = 'Ruben Pollan'
description = 'Regional newspaper from Aragon'
language = 'es'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
cover_url = u'http://arainfo.org/wordpress/wp-content/uploads/2011/10/logo-web_alta.jpg'
feeds = [(u'Movimientos', u'http://arainfo.org/category/movimientos/feed/'), (u'Econom\xeda', u'http://arainfo.org/category/economia/feed/'), (u'Ecolog\xeda', u'http://arainfo.org/category/ecologia/feed/'), (u'Culturas', u'http://arainfo.org/category/culturas/feed/'), (u'Altavoz', u'http://arainfo.org/category/altavoz/feed/')]

View File

@ -1,6 +1,6 @@
__license__ = 'GPL v3'
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
b92.net
'''
@ -20,13 +20,13 @@ class B92(BasicNewsRecipe):
encoding = 'cp1250'
language = 'sr'
publication_type = 'newsportal'
masthead_url = 'http://www.b92.net/images/fp/logo.gif'
masthead_url = 'http://b92s.net/v4/img/new-logo.png'
extra_css = """
@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
body{font-family: Arial,Helvetica,sans1,sans-serif}
.articledescription{font-family: serif1, serif}
.article-info2,.article-info1{text-transform: uppercase; font-size: small}
img{display: block}
.sms{font-weight: bold}
"""
conversion_options = {
@ -37,11 +37,17 @@ class B92(BasicNewsRecipe):
, 'linearize_tables' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
preprocess_regexps = [
(re.compile(u'\u0110'), lambda match: u'\u00D0'),
(re.compile(r'<html.*?<body>', re.DOTALL|re.IGNORECASE), lambda match: '<html><head><title>something</title></head><body>')
]
keep_only_tags = [dict(attrs={'class':['article-info1','article-text']})]
remove_attributes = ['width','height','align','hspace','vspace','border']
remove_tags = [dict(name=['embed','link','base','meta'])]
remove_attributes = ['width','height','align','hspace','vspace','border','lang','xmlns:fb']
remove_tags = [
dict(name=['embed','link','base','meta','iframe'])
,dict(attrs={'id':'social'})
]
feeds = [
(u'Vesti' , u'http://www.b92.net/info/rss/vesti.xml' )

recipes/ba_herald.recipe Normal file (80 lines added)
View File

@ -0,0 +1,80 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.buenosairesherald.com
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class BuenosAiresHerald(BasicNewsRecipe):
title = 'Buenos Aires Herald'
__author__ = 'Darko Miletic'
description = 'A world of information in a few words'
publisher = 'Editorial Nefir S.A.'
category = 'news, politics, Argentina'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en_AR'
remove_empty_feeds = True
publication_type = 'newspaper'
masthead_url = 'http://www.buenosairesherald.com/img/logo.jpg'
INDEX = 'http://www.buenosairesherald.com'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif }
img{margin-bottom: 0.4em; display:block}
h1{font-family: Georgia,serif}
#fecha{text-align: right; font-size: small}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [dict(name=['meta','link','iframe'])]
keep_only_tags = [dict(attrs={'class':'nota_texto p'})]
feeds = [
(u'Argentina' , u'http://www.buenosairesherald.com/argentina' )
,(u'World' , u'http://www.buenosairesherald.com/world' )
,(u'Latin America' , u'http://www.buenosairesherald.com/latin-america' )
,(u'Entertainment' , u'http://www.buenosairesherald.com/entertainment' )
,(u'Sports' , u'http://www.buenosairesherald.com/sports' )
]
def print_version(self, url):
artidraw = url.rpartition('/article/')[2]
artid = artidraw.partition('/')[0]
return 'http://www.buenosairesherald.com/articles/print.aspx?ix=' + artid
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('div', attrs={'class':'nota_texto_seccion'}):
description = self.tag_to_string(item.h2)
atag = item.h2.find('a')
if atag and atag.has_key('href'):
url = self.INDEX + atag['href']
title = description
date = strftime(self.timefmt)
articles.append({
'title' :title
,'date' :date
,'url' :url
,'description':description
})
totalfeeds.append((feedtitle, articles))
return totalfeeds
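A worked example of print_version above, using a made-up article URL:

url = 'http://www.buenosairesherald.com/article/98765/some-headline'
artid = url.rpartition('/article/')[2].partition('/')[0]  # -> '98765'
print 'http://www.buenosairesherald.com/articles/print.aspx?ix=' + artid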

View File

@ -68,4 +68,7 @@ class Benchmark_pl(BasicNewsRecipe):
self.image_article(soup, soup.body)
else:
self.append_page(soup, soup.body)
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.INDEX + a['href']
return soup

recipes/berria.recipe Normal file (44 lines added)
View File

@ -0,0 +1,44 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Alayn Gortazar <zutoin at gmail dot com>'
'''
www.berria.info
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Berria(BasicNewsRecipe):
title = 'Berria'
__author__ = 'Alayn Gortazar'
description = 'Euskal Herriko euskarazko egunkaria'
publisher = 'Berria'
category = 'news, politics, sports, Basque Country'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
language = 'eu'
remove_empty_feeds = True
masthead_url = 'http://upload.wikimedia.org/wikipedia/commons/thumb/6/6a/Berria_Logo.svg/400px-Berria_Logo.svg.png'
keep_only_tags = [
dict(id='goiburua'),
dict(name='div', attrs={'class':['ber_ikus']}),
dict(name='section', attrs={'class':'ber_ikus'})
]
remove_tags = [
dict(name='a', attrs={'class':'iruzkinak'}),
dict(name='div', attrs={'class':'laguntzaileak'})
]
extra_css = '#goiburua{font-weight: bold} .zintiloa{font-size: small} .sarrera{color:#666} .titularra{font-size: x-large} .sarrera{font-weight: bold} .argazoin{color:#666; font-size: small}'
feeds = [
(u'Edizioa jarraia', u'http://berria.info/rss/ediziojarraia.xml'),
(u'Iritzia', u'http://berria.info/rss/iritzia.xml'),
(u'Euskal Herria', u'http://berria.info/rss/euskalherria.xml'),
(u'Ekonomia', u'http://berria.info/rss/ekonomia.xml'),
(u'Mundua', u'http://berria.info/rss/mundua.xml'),
(u'Kirola', u'http://berria.info/rss/kirola.xml'),
(u'Plaza', u'http://berria.info/rss/plaza.xml')
]

View File

@ -1,220 +1,35 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
'''
www.canada.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class CanWestPaper(BasicNewsRecipe):
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
title = u'Calgary Herald'
url_prefix = 'http://www.calgaryherald.com'
description = u'News from Calgary, AB'
fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def get_cover_url(self):
from datetime import timedelta, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","‘",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","’",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","–",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","—",fixed)
fixed = re.sub("&#x2019;","’",fixed)
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&amp;' with '&'
massaged = re.sub("&amp;","&", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):
if first:
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
def preprocess_html(self, soup):
return self.strip_anchors(soup)
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans
from calibre.web.feeds.news import BasicNewsRecipe
class CalgaryHerald(BasicNewsRecipe):
title = u'Calgary Herald'
oldest_article = 3
max_articles_per_feed = 100
feeds = [
(u'News', u'http://rss.canada.com/get/?F233'),
(u'Calgary', u'http://www.calgaryherald.com/scripts/sp6query.aspx?catalog=cahr&tags=keyword|calgary&output=rss?link=http%3a%2f%2fwww.calgaryherald'),
(u'Alberta', u'http://www.calgaryherald.com/scripts/Sp6Query.aspx?catalog=CAHR&tags=Keyword|Alberta&output=rss?link=http%3A%2F%2Fwww.calgaryherald.com%2Fnews%2Falberta%2Findex.html'),
(u'Politics', u'http://rss.canada.com/get/?F7551'),
(u'National', u'http://rss.canada.com/get/?F7552'),
(u'World', u'http://rss.canada.com/get/?F7553'),
]
__author__ = 'rty'
publisher = 'Calgary Herald'
description = 'Calgary, Alberta, Canada'
category = 'News, Calgary, Alberta, Canada'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
language = 'en_CA'
encoding = 'utf-8'
conversion_options = {'linearize_tables':True}
##masthead_url = 'http://www.calgaryherald.com/index.html'
keep_only_tags = [
dict(name='div', attrs={'id':'storyheader'}),
dict(name='div', attrs={'id':'storycontent'})
]
remove_tags_after = {'class':"story_tool_hr"}

View File

@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1331729727(BasicNewsRecipe):
title = u'Camera di Commercio di Bari'
oldest_article = 7
__author__ = 'faber1971'
description = 'News from the Chamber of Commerce of Bari'
language = 'it'
max_articles_per_feed = 100
auto_cleanup = True
masthead_url = 'http://www.ba.camcom.it/grafica/layout-bordo/logo_camcom_bari.png'
feeds = [(u'Camera di Commercio di Bari', u'http://feed43.com/4715147488845101.xml')]
__license__ = 'GPL v3'
__copyright__ = '2012, faber1971'
__version__ = 'v1.00'
__date__ = '17, April 2012'

View File

@ -1,7 +1,5 @@
__copyright__ = '2011, Pablo Aldama <pabloaldama at gmail.com>'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1311839910(BasicNewsRecipe):
title = u'Caros Amigos'
oldest_article = 20
@ -9,9 +7,8 @@ class AdvancedUserRecipe1311839910(BasicNewsRecipe):
language = 'pt_BR'
__author__ = 'Pablo Aldama'
feeds = [(u'Caros Amigos', u'http://carosamigos.terra.com.br/index/index.php?format=feed&type=rss')]
feeds = [(u'Caros Amigos', u'http://carosamigos.terra.com.br/index2/index.php?format=feed&type=rss')]
keep_only_tags = [dict(name='div', attrs={'class':['blog']})
,dict(name='div', attrs={'class':['blogcontent']})
]
remove_tags = [dict(name='div', attrs={'class':'addtoany'})]

View File

@ -6,6 +6,7 @@ class CD_Action(BasicNewsRecipe):
description = 'cdaction.pl - polish games magazine site'
category = 'games'
language = 'pl'
index='http://www.cdaction.pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
@ -17,4 +18,10 @@ class CD_Action(BasicNewsRecipe):
def get_cover_url(self):
soup = self.index_to_soup('http://www.cdaction.pl/magazyn/')
self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href']
return getattr(self, 'cover_url', self.cover_url)
return getattr(self, 'cover_url', self.cover_url)
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

View File

@ -1,4 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class CGM(BasicNewsRecipe):
title = u'CGM'
@ -17,9 +18,9 @@ class CGM(BasicNewsRecipe):
remove_tags_before=dict(id='mainContent')
remove_tags_after=dict(name='div', attrs={'class':'fbContainer'})
remove_tags=[dict(name='div', attrs={'class':'fbContainer'}),
dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}),
dict(id=['movieShare', 'container'])]
feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'), (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'),
dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}),
dict(id=['movieShare', 'container'])]
feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'), (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'),
(u'Recenzje', u'http://www.cgm.pl/rss,1,news.xml')]
@ -33,10 +34,12 @@ class CGM(BasicNewsRecipe):
img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')]
gallery.contents[1].name='img'
gallery.contents[1]['src']=img
pos = len(gallery.contents)
gallery.insert(pos, BeautifulSoup('<br />'))
for item in soup.findAll(style=True):
del item['style']
ad=soup.findAll('a')
for r in ad:
if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']:
if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']:
r.extract()
return soup
return soup

View File

@ -1,11 +1,12 @@
from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'Countryfile.com'
cover_url = 'http://www.buysubscriptions.com/static_content/the-immediate/en/images/covers/CFIL_maxi.jpg'
#cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
__author__ = 'Dave Asbury'
description = 'The official website of Countryfile Magazine'
# last updated 29/1/12
# last updated 15/4/12
language = 'en_GB'
oldest_article = 30
max_articles_per_feed = 25
@ -13,7 +14,23 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
no_stylesheets = True
auto_cleanup = True
#articles_are_obfuscated = True
def get_cover_url(self):
soup = self.index_to_soup('http://www.countryfile.com/')
cov = soup.find(attrs={'class' : 'imagecache imagecache-160px_wide imagecache-linked imagecache-160px_wide_linked'})
#print '******** ',cov,' ***'
cov2 = str(cov)
cov2=cov2[124:-90]
#print '******** ',cov2,' ***'
# try to get cover - if can't get known cover
br = browser()
br.set_handle_redirect(False)
try:
br.open_novisit(cov2)
cover_url = cov2
except:
cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
return cover_url
remove_tags = [
# dict(attrs={'class' : ['player']}),

View File

@ -1,20 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
import mechanize
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
title = u'The Daily Mirror'
description = 'News as provided by The Daily Mirror - UK'
__author__ = 'Dave Asbury'
# last updated 11/2/12
# last updated 7/4/12
language = 'en_GB'
cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
#cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
oldest_article = 1
max_articles_per_feed = 5
max_articles_per_feed = 10
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
@ -75,3 +76,28 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
img { display:block}
'''
def get_cover_url(self):
soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
# look for the block containing the mirror button and url
cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
cov2 = str(cov)
cov2='http://www.politicshome.com'+cov2[9:-142]
#cov2 now contains url of the page containing pic
soup = self.index_to_soup(cov2)
cov = soup.find(attrs={'id' : 'large'})
cov2 = str(cov)
cov2=cov2[27:-18]
#cov2 now is pic url, now go back to original function
br = mechanize.Browser()
br.set_handle_redirect(False)
try:
br.open_novisit(cov2)
cover_url = cov2
except:
cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
#cover_url = cov2
#cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
return cover_url
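The get_cover_url above slices the image URL out of str(cov) at fixed character offsets, then probes the URL before trusting it. The probe-and-fallback step in isolation (hypothetical candidate URL; mechanize is what this recipe already imports):

br = mechanize.Browser()
br.set_handle_redirect(False)
candidate = 'http://example.com/frontpage.jpg'  # hypothetical URL to test
try:
    br.open_novisit(candidate)
    cover_url = candidate  # reachable, so use it
except:
    cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'  # static fallback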

View File

@ -10,8 +10,8 @@ from calibre.web.feeds.news import BasicNewsRecipe
class DerStandardRecipe(BasicNewsRecipe):
title = u'derStandard'
__author__ = 'Gerhard Aigner and Sujata Raman'
description = u'Nachrichten aus ??sterreich'
__author__ = 'Gerhard Aigner and Sujata Raman and Marcel Jira'
description = u'Nachrichten aus Österreich'
publisher ='derStandard.at'
category = 'news, politics, nachrichten, Austria'
use_embedded_content = False
@ -30,17 +30,26 @@ class DerStandardRecipe(BasicNewsRecipe):
h4{color:#404450;font-size:x-small;}
h6{color:#404450; font-size:x-small;}
'''
feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
(u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
(u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
(u'Web', u'http://derstandard.at/?page=rss&ressort=webstandard'),
(u'Sport', u'http://derstandard.at/?page=rss&ressort=sport'),
(u'Panorama', u'http://derstandard.at/?page=rss&ressort=panorama'),
(u'Etat', u'http://derstandard.at/?page=rss&ressort=etat'),
(u'Kultur', u'http://derstandard.at/?page=rss&ressort=kultur'),
(u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
(u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
(u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')
feeds = [
(u'Newsroom', u'http://derStandard.at/?page=rss&ressort=Seite1'),
(u'Inland', u'http://derstandard.at/?page=rss&ressort=InnenPolitik'),
(u'International', u'http://derstandard.at/?page=rss&ressort=InternationalPolitik'),
(u'Wirtschaft', u'http://derStandard.at/?page=rss&ressort=Wirtschaft'),
(u'Web', u'http://derStandard.at/?page=rss&ressort=Web'),
(u'Sport', u'http://derStandard.at/?page=rss&ressort=Sport'),
(u'Panorama', u'http://derStandard.at/?page=rss&ressort=Panorama'),
(u'Etat', u'http://derStandard.at/?page=rss&ressort=Etat'),
(u'Kultur', u'http://derStandard.at/?page=rss&ressort=Kultur'),
(u'Wissenschaft', u'http://derStandard.at/?page=rss&ressort=Wissenschaft'),
(u'Gesundheit', u'http://derStandard.at/?page=rss&ressort=Gesundheit'),
(u'Bildung', u'http://derStandard.at/?page=rss&ressort=Bildung'),
(u'Meinung', u'http://derStandard.at/?page=rss&ressort=Meinung'),
(u'Lifestyle', u'http://derStandard.at/?page=rss&ressort=Lifestyle'),
(u'Reisen', u'http://derStandard.at/?page=rss&ressort=Reisen'),
(u'Karriere', u'http://derStandard.at/?page=rss&ressort=Karriere'),
(u'Immobilien', u'http://derstandard.at/?page=rss&ressort=Immobilien'),
(u'dieStandard', u'http://dieStandard.at/?page=rss&ressort=diestandard'),
(u'daStandard', u'http://daStandard.at/?page=rss&ressort=dastandard')
]
keep_only_tags = [

recipes/diagonal.recipe Normal file (37 lines added)
View File

@ -0,0 +1,37 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = 'Ruben Pollan <meskio@sindominio.net>'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1335657507(BasicNewsRecipe):
title = u'diagonal'
__author__ = 'Ruben Pollan'
description = 'Periodico quincenal de actualidad critica'
language = 'es'
oldest_article = 15
max_articles_per_feed = 100
auto_cleanup = True
cover_url = u'http://diagonalperiodico.net/IMG/siteon0.jpg'
feeds = [(u'Panorama', u'http://diagonalperiodico.net/-Panorama-.html?page=backend'),
(u'Global', u'http://diagonalperiodico.net/-Global,104-.html?page=backend'),
(u'Fotonoticia - Galería', u'http://diagonalperiodico.net/-Fotonoticia-Galeria-.html?page=backend'),
(u'Libertades y Derechos', u'http://diagonalperiodico.net/-Libertades-y-Derechos,77-.html?page=backend'),
(u'Saberes', u'http://diagonalperiodico.net/-Saberes,78-.html?page=backend'),
(u'En movimiento', u'http://diagonalperiodico.net/-En-movimiento-.html?page=backend'),
(u'Culturas', u'http://diagonalperiodico.net/-Culturas,89-.html?page=backend'),
(u'Cuerpo', u'http://diagonalperiodico.net/-Cuerpo,99-.html?page=backend'),
(u'La plaza', u'http://diagonalperiodico.net/-La-plaza-.html?page=backend'),
(u'Enfoques', u'http://diagonalperiodico.net/-Enfoques,106-.html?page=backend'),
(u'Humor - Galería', u'http://diagonalperiodico.net/-Humor-Galeria-.html?page=backend'),
(u'Entrevistas digitales', u'http://diagonalperiodico.net/-Entrevistas-Digitales-.html?page=backend'),
(u'Cartas a diagonal', u'http://diagonalperiodico.net/-Cartas-a-Diagonal-.html?page=backend'),
(u'Blogs', u'http://diagonalperiodico.net/-Blogs-.html?page=backend')]
def get_article_url(self, article):
link = article.get('link')
return 'http://diagonalperiodico.net/' + link

View File

@ -11,6 +11,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
description = u'Aktualności i blogi z dobreprogramy.pl'
encoding = 'utf-8'
index='http://www.dobreprogramy.pl/'
no_stylesheets = True
language = 'pl'
extra_css = '.title {font-size:22px;}'
@ -22,3 +23,10 @@ class Dobreprogramy_pl(BasicNewsRecipe):
#remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

View File

@ -7,6 +7,7 @@ class Dzieje(BasicNewsRecipe):
cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
category = 'history'
language = 'pl'
index='http://dzieje.pl'
oldest_article = 8
max_articles_per_feed = 100
remove_javascript=True
@ -15,3 +16,10 @@ class Dzieje(BasicNewsRecipe):
remove_tags_after= dict(id='dogory')
remove_tags=[dict(id='dogory')]
feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

recipes/editoriali.recipe Normal file (16 lines added)
View File

@ -0,0 +1,16 @@
__version__ = 'v1.0'
__date__ = '7, April 2012'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1332847053(BasicNewsRecipe):
title = u'Editoriali'
__author__ = 'faber1971'
description = 'Leading articles on Italy by the best Italian editorials'
oldest_article = 1
max_articles_per_feed = 100
auto_cleanup = True
conversion_options = {'linearize_tables': True}
masthead_url = 'http://folkbulletin.folkest.com/wp-content/uploads/editoriale1.jpg'
feeds = [(u'Micromega', u'http://temi.repubblica.it/micromega-online/feed/'), (u'Corriere della Sera', u'http://xml.corriereobjects.it/rss/editoriali.xml'), (u'La Stampa', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=25'), (u"Italia dall'estero", u'http://italiadallestero.info/feed')]

View File

@ -21,3 +21,8 @@ class eioba(BasicNewsRecipe):
(u'Rozrywka', u'http://www.eioba.pl/feed/categories/10.xml'),
(u'Rożne', u'http://www.eioba.pl/feed/categories/9.xml')
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -1,4 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Elektroda(BasicNewsRecipe):
title = u'Elektroda'
@ -13,3 +14,18 @@ class Elektroda(BasicNewsRecipe):
remove_tags_after=dict(name='td', attrs={'class':'spaceRow'})
remove_tags=[dict(name='a', attrs={'href':'#top'})]
feeds = [(u'Elektroda', u'http://www.elektroda.pl/rtvforum/rss.php')]
def preprocess_html(self, soup):
tag=soup.find('span', attrs={'class':'postbody'})
if tag:
pos = len(tag.contents)
tag.insert(pos, BeautifulSoup('<br />'))
return soup
def parse_feeds (self):
feeds = BasicNewsRecipe.parse_feeds(self)
for feed in feeds:
for article in feed.articles[:]:
article.title=article.title[article.title.find("::")+3:]
return feeds
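A worked example of the "::" trimming in parse_feeds above; the +3 skips the two colons and the following space (made-up feed title):

title = 'elektroda.pl :: Zasilacz ATX'  # hypothetical forum title
print title[title.find("::")+3:]  # -> 'Zasilacz ATX'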

View File

@ -7,6 +7,7 @@ class eMuzyka(BasicNewsRecipe):
description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
category = 'music'
language = 'pl'
index='http://www.emuzyka.pl'
cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
no_stylesheets = True
oldest_article = 7
@ -14,3 +15,9 @@ class eMuzyka(BasicNewsRecipe):
keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
remove_tags=[dict(name='span', attrs={'id':'date'})]
feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

View File

@ -7,7 +7,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
# cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
__author__ = 'Dave Asbury'
# last updated 17/3/12
# last updated 14/4/12
language = 'en_GB'
oldest_article = 28
max_articles_per_feed = 12
@ -28,7 +28,8 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
#]
feeds = [
(u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
(u'From the Homepage',u'http://feed43.com/0032328550253453.xml'),
#http://feed43.com/8053226782885416.xml'),
(u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
(u'Upgrade',u'http://feed43.com/0877305847443234.xml'),
#(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),

View File

@ -7,13 +7,14 @@ class Filmweb_pl(BasicNewsRecipe):
cover_url = 'http://userlogos.org/files/logos/crudus/filmweb.png'
category = 'movies'
language = 'pl'
index='http://www.filmweb.pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
remove_empty_feeds=True
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})]
keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})]
keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'),
(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
(u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'),
@ -39,3 +40,9 @@ class Filmweb_pl(BasicNewsRecipe):
self.log.warn(skip_tag)
return self.index_to_soup(skip_tag['href'], raw=True)
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

View File

@ -1,39 +1,49 @@
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Forbes(BasicNewsRecipe):
title = u'Forbes'
description = 'Business and Financial News'
__author__ = 'Darko Miletic'
__author__ = 'Kovid Goyal'
oldest_article = 30
max_articles_per_feed = 100
max_articles_per_feed = 20
language = 'en'
encoding = 'utf-8'
recursions = 1
no_stylesheets = True
html2lrf_options = ['--base-font-size', '10']
cover_url = u'http://www.forbes.com/media/current_covers/forbes_120_160.gif'
feeds = [(u'Latest', u'http://www.forbes.com/news/index.xml'),
(u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'),
(u'Most Emailed', u'http://www.forbes.com/feeds/mostemailed.xml'),
(u'Faces', u'http://www.forbes.com/facesscan/index.xml'),
(u'Technology', u'http://www.forbes.com/technology/index.xml'),
(u'Personal Tech', u'http://www.forbes.com/personaltech/index.xml'),
(u'Wireless', u'http://www.forbes.com/wireless/index.xml'),
(u'Business', u'http://www.forbes.com/business/index.xml'),
(u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'),
(u'Sports', u'http://www.forbes.com/forbeslife/sports/index.xml'),
(u'Vehicles', u'http://www.forbes.com/forbeslife/vehicles/index.xml'),
(u'Leadership', u'http://www.forbes.com/leadership/index.xml'),
(u'Careers', u'http://www.forbes.com/leadership/careers/index.xml'),
(u'Compensation', u'http://www.forbes.com/leadership/compensation/index.xml'),
(u'Managing', u'http://www.forbes.com/leadership/managing/index.xml')]
def print_version(self, url):
raw = self.browser.open(url).read()
soup = BeautifulSoup(raw.decode('latin1', 'replace'))
print_link = soup.find('a', {'onclick':"s_linkTrackVars='prop18';s_linkType='o';s_linkName='Print';if(typeof(globalPageName)!='undefined')s_prop18=globalPageName;s_lnk=s_co(this);s_gs(s_account);"})
if print_link is None:
return ''
return 'http://www.forbes.com' + print_link['href']
feeds = [(u'Latest', u'http://www.forbes.com/news/index.xml'),
(u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'),
(u'Technology', u'http://www.forbes.com/technology/index.xml'),
(u'Business', u'http://www.forbes.com/business/index.xml'),
(u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'),
(u'Leadership', u'http://www.forbes.com/leadership/index.xml'),]
keep_only_tags = \
{'class':lambda x: x and (set(x.split()) & {'body', 'pagination',
'articleHead', 'article_head'})}
remove_tags_before = {'name':'h1'}
remove_tags = [
{'class':['comment_bug', 'engagement_block',
'video_promo_block', 'article_actions']},
{'id':'comments'}
]
def is_link_wanted(self, url, tag):
ans = re.match(r'http://.*/[2-9]/', url) is not None
if ans:
self.log('Following multipage link: %s'%url)
return ans
def postprocess_html(self, soup, first_fetch):
for pag in soup.findAll(True, 'pagination'):
pag.extract()
if not first_fetch:
h1 = soup.find('h1')
if h1 is not None:
h1.extract()
return soup
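An illustration of the is_link_wanted test above: the regex accepts any URL containing a /2/ through /9/ path segment, i.e. pages two to nine of a multipage article (made-up URLs):

import re
pat = r'http://.*/[2-9]/'
print bool(re.match(pat, 'http://www.forbes.com/sites/x/story/2/'))  # True: followed
print bool(re.match(pat, 'http://www.forbes.com/sites/x/story/'))    # False: ignored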

View File

@ -0,0 +1,16 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Fotoblogia_pl(BasicNewsRecipe):
title = u'Fotoblogia.pl'
__author__ = 'fenuks'
category = 'photography'
language = 'pl'
masthead_url = 'http://img.interia.pl/komputery/nimg/u/0/fotoblogia21.jpg'
cover_url= 'http://fotoblogia.pl/images/2009/03/fotoblogia2.jpg'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
keep_only_tags=[dict(name='div', attrs={'class':'post-view post-standard'})]
remove_tags=[dict(attrs={'class':['external fotoblogia', 'categories', 'tags']})]
feeds = [(u'Wszystko', u'http://fotoblogia.pl/feed/rss2')]

View File

@ -6,16 +6,24 @@ class Gameplay_pl(BasicNewsRecipe):
description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.'
category = 'games, movies, books, music'
language = 'pl'
index='http://gameplay.pl'
masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png'
cover_url= 'http://gameplay.pl/img/gpy_top_logo.png'
max_articles_per_feed = 100
remove_javascript= True
no_stylesheets= True
keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})]
remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})]
remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im', 'news_list', 'news_list_autor', 'stop_bot', 'tagi']}), dict(attrs={'usemap':'#map'})]
feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]
def image_url_processor(self, baseurl, url):
if 'http' not in url:
return 'http://gameplay.pl'+ url[2:]
else:
return url
return url
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and '../' in a['href']:
a['href']=self.index + a['href'][2:]
return soup

View File

@ -9,6 +9,7 @@ class Gildia(BasicNewsRecipe):
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
remove_empty_feeds=True
no_stylesheets=True
remove_tags=[dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
keep_only_tags=dict(name='div', attrs={'class':'widetext'})
@ -24,3 +25,16 @@ class Gildia(BasicNewsRecipe):
self.log.warn('odnosnik')
self.log.warn(link['href'])
return self.index_to_soup(link['href'], raw=True)
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
if '/gry/' in a['href']:
a['href']='http://www.gry.gildia.pl' + a['href']
elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower():
a['href']='http://www.literatura.gildia.pl' + a['href']
else:
a['href']='http://www.gildia.pl' + a['href']
return soup

View File

@ -7,14 +7,15 @@ class Gram_pl(BasicNewsRecipe):
category = 'games'
language = 'pl'
oldest_article = 8
index='http://www.gram.pl'
max_articles_per_feed = 100
no_stylesheets= True
extra_css = 'h2 {font-style: italic; font-size:20px;}'
extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])]
keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})]
feeds = [(u'gram.pl - informacje', u'http://www.gram.pl/feed_news.asp'),
(u'gram.pl - publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')]
feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
(u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')]
def parse_feeds (self):
feeds = BasicNewsRecipe.parse_feeds(self)
@ -23,3 +24,36 @@ class Gram_pl(BasicNewsRecipe):
if 'REKLAMA SKLEP' in article.title.upper() or u'ARTYKUŁ:' in article.title.upper():
feed.articles.remove(article)
return feeds
def append_page(self, soup, appendtag):
nexturl = appendtag.find('a', attrs={'class':'cpn'})
while nexturl:
soup2 = self.index_to_soup('http://www.gram.pl'+ nexturl['href'])
r=appendtag.find(id='pgbox')
if r:
r.extract()
pagetext = soup2.find(attrs={'class':'main'})
r=pagetext.find('h1')
if r:
r.extract()
r=pagetext.find('h2')
if r:
r.extract()
for r in pagetext.findAll('script'):
r.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
nexturl = appendtag.find('a', attrs={'class':'cpn'})
r=appendtag.find(id='pgbox')
if r:
r.extract()
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
tag=soup.findAll(name='div', attrs={'class':'picbox'})
for t in tag:
t['style']='float: left;'
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

View File

@ -59,6 +59,7 @@ class heiseDe(BasicNewsRecipe):
dict(name='span', attrs={'class':'rsaquo'}),
dict(name='div', attrs={'class':'news_logo'}),
dict(name='div', attrs={'class':'bcadv ISI_IGNORE'}),
dict(name='div', attrs={'class':'navi_top_container'}),
dict(name='p', attrs={'class':'news_option'}),
dict(name='p', attrs={'class':'news_navi'}),
dict(name='div', attrs={'class':'news_foren'})]
@ -69,3 +70,5 @@ class heiseDe(BasicNewsRecipe):

View File

@ -13,7 +13,7 @@ class HighCountryNews(BasicNewsRecipe):
__author__ = 'Armin Geller' # 2012-01-31
publisher = 'High Country News'
timefmt = ' [%a, %d %b %Y]'
language = 'en-Us'
language = 'en'
encoding = 'UTF-8'
publication_type = 'newspaper'
oldest_article = 7

View File

@ -0,0 +1,20 @@
from calibre.web.feeds.news import BasicNewsRecipe
class historia_news(BasicNewsRecipe):
title = u'historia-news'
__author__ = 'fenuks'
description = u'Historia-news to portal dla ludzi kochających historię. Najnowsze newsy z historii bliższej i dalszej, archeologii, paleontologii oraz ciekawostki i podcasty z historii kultury, sportu, motoryzacji i inne.'
masthead_url = 'http://historia-news.pl/templates/hajak4/images/header.jpg'
cover_url= 'http://www.historia-news.pl/templates/hajak4/images/header.jpg'
category = 'history'
language = 'pl'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
remove_empty_feeds = True
remove_tags=[dict(name='form'), dict(name='img', attrs={'alt':'Print'}), dict(attrs={'class':['commbutt', 'cpr']}), dict(id=['plusone', 'facebook'])]
feeds = [(u'Wiadomo\u015bci', u'http://historia-news.pl/wiadomoci.feed?type=rss'), (u'Artyku\u0142y', u'http://historia-news.pl/artykuy.feed?type=rss')]
def print_version(self, url):
return url + '?tmpl=component&print=1&layout=default&page='

Binary files changed (recipe icons; image previews not shown):

One existing icon modified (413 B before, 1.5 KiB after).
recipes/icons/ba_herald.png added (978 B).
Three further icons added (379 B, 833 B and 1006 B).
recipes/icons/telam.png added (1.9 KiB).

View File

@ -8,6 +8,7 @@ class in4(BasicNewsRecipe):
description = u'Serwis Informacyjny - Aktualnosci, recenzje'
category = 'IT'
language = 'pl'
index='http://www.in4.pl/'
#cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg'
no_stylesheets = True
remove_empty_feeds = True
@ -39,6 +40,7 @@ class in4(BasicNewsRecipe):
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

View File

@ -8,6 +8,7 @@ class INFRA(BasicNewsRecipe):
description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
category = 'UFO'
index='http://infra.org.pl'
language = 'pl'
max_articles_per_feed = 100
no_stylesheets = True
@ -15,3 +16,11 @@ class INFRA(BasicNewsRecipe):
remove_tags_after=dict(attrs={'class':'pagenav'})
remove_tags=[dict(attrs={'class':'pagenav'})]
feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup

View File

@ -0,0 +1,43 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
ivanamilakovic.blogspot.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class IvanaMilakovic(BasicNewsRecipe):
title = u'Ivana Milaković'
__author__ = 'Darko Miletic'
description = u'Hronika mačijeg škrabala - priče, inspiracija, knjige, pisanje, prevodi...'
oldest_article = 80
max_articles_per_feed = 100
language = 'sr'
encoding = 'utf-8'
no_stylesheets = True
use_embedded_content = True
publication_type = 'blog'
extra_css = """
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
body{font-family: Arial,Tahoma,Helvetica,FreeSans,sans1,sans-serif}
img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px }
"""
conversion_options = {
'comment' : description
, 'tags' : 'knjige, blog, srbija, sf'
, 'publisher': 'Ivana Milakovic'
, 'language' : language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [(u'Posts', u'http://ivanamilakovic.blogspot.com/feeds/posts/default')]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return self.adeify_images(soup)
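
The preprocess_regexps entry above swaps the capital letter U+0110 (D with stroke), which many reader fonts lack, for the visually similar U+00D0 (Eth). A standalone demonstration (the sample word is illustrative; lowercase U+0111 is deliberately left alone):

import re
pattern, repl = re.compile(u'\u0110'), lambda match: u'\u00D0'
print(pattern.sub(repl, u'\u0110or\u0111e'))  # u'\u0110or\u0111e' -> u'\u00D0or\u0111e'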

View File

@ -0,0 +1,34 @@
from calibre.web.feeds.news import BasicNewsRecipe
class JakartaGlobe(BasicNewsRecipe):
title = u'Jakarta Globe'
oldest_article = 3
max_articles_per_feed = 100
feeds = [
(u'News', u'http://www.thejakartaglobe.com/pages/getrss/getrss-news.php'),
(u'Business', u'http://www.thejakartaglobe.com/pages/getrss/getrss-business.php'),
(u'Technology', u'http://www.thejakartaglobe.com/pages/getrss/getrss-tech.php'),
(u'My Jakarta', u'http://www.thejakartaglobe.com/pages/getrss/getrss-myjakarta.php'),
(u'International', u'http://www.thejakartaglobe.com/pages/getrss/getrss-international.php'),
(u'Life and Times', u'http://www.thejakartaglobe.com/pages/getrss/getrss-lifeandtimes.php'),
]
__author__ = 'rty'
publisher = 'JakartaGlobe.com'
description = 'JakartaGlobe, Indonesia, Newspaper'
category = 'News, Indonesia'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
language = 'en_ID'
encoding = 'utf-8'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.thejakartaglobe.com/pages/2010/images/jak-globe-logo.jpg'
keep_only_tags = [
dict(name='div', attrs={'class':'story'}),
dict(name='span', attrs={'class':'headline'}),
dict(name='p', attrs={'id':'bodytext'})
]

recipes/klubknjige.recipe Normal file
View File

@ -0,0 +1,42 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
klub-knjige.blogspot.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class KlubKnjige(BasicNewsRecipe):
title = 'Klub knjige'
__author__ = 'Darko Miletic'
description = 'literarni blog'
oldest_article = 30
max_articles_per_feed = 100
language = 'sr'
encoding = 'utf-8'
no_stylesheets = True
use_embedded_content = True
publication_type = 'blog'
extra_css = """
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
body{font-family: Arial,Tahoma,Helvetica,FreeSans,sans1,sans-serif}
img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px }
"""
conversion_options = {
'comment' : description
, 'tags' : 'knjige, blog, srbija, sf'
, 'publisher': 'Klub Knjige'
, 'language' : language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [(u'Posts', u'http://klub-knjige.blogspot.com/feeds/posts/default')]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return self.adeify_images(soup)

View File

@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Konflikty(BasicNewsRecipe):
title = u'Konflikty Zbrojne'
@ -10,6 +11,23 @@ class Konflikty(BasicNewsRecipe):
category='military, history'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
no_stylesheets = True
keep_only_tags=[dict(attrs={'class':['title1', 'image']}), dict(id='body')]
feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'), (u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'), (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'), (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml')]
feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'),
(u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'),
(u'Historia', u'http://www.konflikty.pl/rss_historia_10.xml'),
(u'Militaria', u'http://www.konflikty.pl/rss_militaria_10.xml'),
(u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'),
(u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml'),
(u'Teksty źródłowe', u'http://www.konflikty.pl/rss_tekstyzrodlowe_10.xml')]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for image in soup.findAll(name='a', attrs={'class':'image'}):
if image.img and image.img.has_key('alt'):
image.name='div'
pos = len(image.contents)
image.insert(pos, BeautifulSoup('<p style="font-style:italic;">'+image.img['alt']+'</p>'))
return soup
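
The loop above turns each <a class="image"> wrapper into a plain <div> and appends the image's alt text as an italic caption. The same transformation in isolation, assuming calibre's bundled BeautifulSoup 3 (sample markup is illustrative):

from calibre.ebooks.BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<a class="image" href="#"><img src="t34.jpg" alt="Tank T-34"/></a>')
for image in soup.findAll(name='a', attrs={'class':'image'}):
    if image.img and image.img.has_key('alt'):
        image.name = 'div'  # demote the link to a plain container
        caption = BeautifulSoup('<p style="font-style:italic;">' + image.img['alt'] + '</p>')
        image.insert(len(image.contents), caption)
print(soup)  # the alt text now follows the image as an italic paragraph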

View File

@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__copyright__ = '2011'
__copyright__ = '2012'
'''
lemonde.fr
'''
@ -25,7 +25,7 @@ class LeMonde(BasicNewsRecipe):
.ariane{font-size:xx-small;}
.source{font-size:xx-small;}
#.href{font-size:xx-small;}
.LM_caption{color:#666666; font-size:x-small;}
#.figcaption style{color:#666666; font-size:x-small;}
#.main-article-info{font-family:Arial,Helvetica,sans-serif;}
#full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
#match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
@ -48,7 +48,7 @@ class LeMonde(BasicNewsRecipe):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return self.adeify_images(soup)
return soup
preprocess_regexps = [
(re.compile(r'([0-9])%'), lambda m: m.group(1) + '&nbsp;%'),
@ -61,6 +61,11 @@ class LeMonde(BasicNewsRecipe):
(re.compile(r'&rdquo;'), lambda match: '&nbsp;&raquo;'),
(re.compile(r'>\''), lambda match: '>&lsquo;'),
(re.compile(r' \''), lambda match: ' &lsquo;'),
(re.compile(r' &quot;'), lambda match: ' &laquo;&nbsp;'),
(re.compile(r'>&quot;'), lambda match: '>&laquo;&nbsp;'),
(re.compile(r'&quot;<'), lambda match: '&nbsp;&raquo;<'),
(re.compile(r'&quot; '), lambda match: '&nbsp;&raquo; '),
(re.compile(r'&quot;,'), lambda match: '&nbsp;&raquo;,'),
(re.compile(r'\''), lambda match: '&rsquo;'),
(re.compile(r'"<em>'), lambda match: '<em>&laquo;&nbsp;'),
(re.compile(r'"<em>"</em><em>'), lambda match: '<em>&laquo;&nbsp;'),
@ -86,9 +91,10 @@ class LeMonde(BasicNewsRecipe):
(re.compile(r'\s»'), lambda match: '&nbsp;»'),
(re.compile(r'«\s'), lambda match: '«&nbsp;'),
(re.compile(r' %'), lambda match: '&nbsp;%'),
(re.compile(r'\.jpg&nbsp;&raquo; border='), lambda match: '.jpg'),
(re.compile(r'\.png&nbsp;&raquo; border='), lambda match: '.png'),
(re.compile(r'\.jpg&nbsp;&raquo; width='), lambda match: '.jpg'),
(re.compile(r'\.png&nbsp;&raquo; width='), lambda match: '.png'),
(re.compile(r' &ndash; '), lambda match: '&nbsp;&ndash; '),
(re.compile(r'figcaption style="display:none"'), lambda match: 'figcaption'),
(re.compile(r' '), lambda match: '&nbsp;&ndash; '),
(re.compile(r' - '), lambda match: '&nbsp;&ndash; '),
(re.compile(r' -,'), lambda match: '&nbsp;&ndash;,'),
@ -97,10 +103,15 @@ class LeMonde(BasicNewsRecipe):
keep_only_tags = [
dict(name='div', attrs={'class':['contenu']})
dict(name='div', attrs={'class':['global']})
]
remove_tags = [dict(name='div', attrs={'class':['LM_atome']})]
remove_tags_after = [dict(id='appel_temoignage')]
remove_tags = [
dict(name='div', attrs={'class':['bloc_base meme_sujet']}),
dict(name='p', attrs={'class':['lire']})
]
remove_tags_after = [dict(id='fb-like')]
def get_article_url(self, article):
url = article.get('guid', None)
@ -136,4 +147,3 @@ class LeMonde(BasicNewsRecipe):
cover_url = link_item.img['src']
return cover_url
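
The regex pipeline above enforces French typography on entity-encoded text: straight double quotes become guillemets padded with non-breaking spaces, and a non-breaking space is forced before percent signs. A reduced demonstration of three of the rules (the input sentence is illustrative):

import re
rules = [
    (re.compile(r' &quot;'), lambda match: ' &laquo;&nbsp;'),
    (re.compile(r'&quot;,'), lambda match: '&nbsp;&raquo;,'),
    (re.compile(r'([0-9])%'), lambda match: match.group(1) + '&nbsp;%'),
]
text = 'Elle dit &quot;oui&quot;, soit 45%'
for pat, repl in rules:
    text = pat.sub(repl, text)
print(text)  # -> Elle dit &laquo;&nbsp;oui&nbsp;&raquo;, soit 45&nbsp;%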

View File

@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1334649829(BasicNewsRecipe):
title = u'Liberatorio Politico'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
masthead_url = 'http://liberatorio.altervista.org/wp-content/uploads/2012/01/Testata-LIBERATORIO-Altervista1.jpg'
feeds = [(u'Liberatorio Politico', u'http://liberatorio.altervista.org/feed/')]
__author__ = 'faber1971'
description = 'Inquiry journalism - a blog on Molfetta, Land of Bari, Apulia and Italy - v1.00 (07, April 2012)'
language = 'it'

recipes/limes.recipe Normal file
View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2012, faber1971'
__version__ = 'v1.00'
__date__ = '16, April 2012'
__description__ = 'Geopolitical Italian magazine'
from calibre.web.feeds.news import BasicNewsRecipe
class Limes(BasicNewsRecipe):
description = 'Italian weekly magazine'
__author__ = 'faber1971'
cover_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif'
title = 'Limes'
category = 'Geopolitical news'
language = 'it'
# encoding = 'cp1252'
timefmt = '[%a, %d %b, %Y]'
oldest_article = 16
max_articles_per_feed = 100
use_embedded_content = False
recursion = 10
remove_javascript = True
no_stylesheets = True
masthead_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif'
feeds = [
(u'Limes', u'http://temi.repubblica.it/limes/feed/')
]
keep_only_tags = [
dict(name='div', attrs={'class':['testo','copertina','occhiello','firma','didascalia','content-second-right','detail-articles','titolo-local','generic-articles']}),
dict(name='div', attrs={'class':['generic-articles','summary','detail-articles']}),
dict(name='div', attrs={'id':['content-second-right','content2']})
]
remove_tags = [
dict(name='div',attrs={'class':['servizi','aggiungi','label-web','bottom-mobile','box-abbonamenti','box-cerca','big','little','stampaweb']}),
dict(name='div',attrs={'id':['topheader','header','navigation-new','navigation','content-second-left','menutext']}),
dict(name='ul',attrs={'id':'user-utility'}),
dict(name=['script','noscript','iframe'])
]

View File

@ -1,20 +1,21 @@
__license__ = 'GPL v3'
__author__ = 'faber1971'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1327062445(BasicNewsRecipe):
title = u'Marketing Magazine'
description = 'Collection of Italian marketing websites'
language = 'it'
__author__ = 'faber1971'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
remove_javascript = True
no_stylesheets = True
conversion_options = {'linearize_tables': True}
remove_tags = [
dict(name='ul', attrs={'id':'ads0'})
]
masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg'
__author__ = 'faber1971'
description = 'Collection of Italian marketing websites - v1.03 (20, February 2012)'
language = 'it'
feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]
feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'Marketing Journal', u'http://feeds.feedburner.com/marketingjournal/jPwA'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'[4]marketing.biz', u'http://feeds.feedburner.com/4marketing'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Bloguerrilla', u'http://feeds.feedburner.com/Bloguerrilla'), (u'Nonconvenzionale', u'http://feeds.feedburner.com/nonconvenzionale'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]

View File

@ -0,0 +1,85 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Matthew Briggs'
__docformat__ = 'restructuredtext en'
'''
http://www.heraldsun.com.au/
'''
from calibre.web.feeds.news import BasicNewsRecipe
class DailyTelegraph(BasicNewsRecipe):
title = u'Melbourne Herald Sun'
__author__ = u'Ray Hartley'
description = (u'Victorian and National News'
'. You will need to have a subscription to '
'http://www.heraldsun.com.au to get full articles.')
language = 'en_AU'
oldest_article = 2
needs_subscription = 'optional'
max_articles_per_feed = 30
remove_javascript = True
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
remove_empty_feeds = True
publication_type = 'newspaper'
masthead_url = 'http://resources2.news.com.au/cs/heraldsun/images/header-and-footer/logo.gif'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif }
img{margin-bottom: 0.4em; display:block}
.caption{display: inline; font-size: x-small}
"""
conversion_options = {
'comment' : description
, 'language' : language
}
keep_only_tags = [dict(attrs={'id':'story'})]
remove_tags_before=dict(attrs={'class':'story-header'})
remove_tags_after=dict(attrs={'class':'story-footer'})
remove_tags = [
dict(name=['meta','link','base','iframe','embed','object','media-metadata','media-reference','media-producer'])
,dict(attrs={'class':['story-header-tools','story-sidebar','story-footer','story-summary-list']})
]
remove_attributes=['lang']
feeds = [(u'Breaking News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_breakingnews_206.xml' )
,(u'Business' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_business_207.xml' )
,(u'Entertainment' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_entertainment_208.xml' )
,(u'Health Science' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_health_212.xml' )
,(u'Music' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_music_449.xml' )
,(u'National News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_national_209.xml' )
,(u'Sport News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_sport_213.xml' )
,(u'AFL News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_205.xml' )
,(u'State News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_vic_214.xml' )
,(u'Technology' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_tech_215.xml' )
,(u'World News' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_world_216.xml' )
,(u'Opinion', u'http://feeds.news.com.au/public/rss/2.0/heraldsun_opinion_210.xml' )
,(u'Andrew Bolt' , u'http://blogs.news.com.au/heraldsun/andrewbolt/index.php/xml/rss_2.0/heraldsun/hs_andrewbolt/')
,(u'Afl - St Kilda' , u'http://feeds.news.com.au/public/rss/2.0/heraldsun_afl_stkilda_565.xml')
,(u'Terry McCrann' ,u'http://feeds.news.com.au/public/rss/2.0/heraldsun_tmccrann_224.xml' )
,(u'The Other side' ,u'http://feeds.news.com.au/public/rss/2.0/heraldsun_otherside_211.xml')]
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
if self.username and self.password:
br.open('http://www.heraldsun.com.au')
br.select_form(nr=0)
br['username'] = self.username
br['password'] = self.password
raw = br.submit().read()
if '>log out' not in raw.lower():
raise ValueError('Failed to log in to www.heraldsun.com.au,'
' are your username and password correct?')
return br
def get_article_url(self, article):
return article.id

View File

@ -3,25 +3,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image
from BeautifulSoup import BeautifulSoup
try:
from calibre_plugins.drMerry.debug import debuglogger as mlog
print 'drMerry debuglogger found, debug options can be used'
from calibre_plugins.drMerry.stats import statslogger as mstat
print 'drMerry stats tracker found, stat can be tracked'
mlog.setLoglevel(1) #-1 == no log; 0 for normal output
mstat.calculateStats(False) #track stats (to track stats loglevel must be > 0
KEEPSTATS = mstat.keepmystats()
SHOWDEBUG0 = mlog.showdebuglevel(0)
SHOWDEBUG1 = mlog.showdebuglevel(1)
SHOWDEBUG2 = mlog.showdebuglevel(2)
except:
#print 'drMerry debuglogger not found, skipping debug options'
SHOWDEBUG0 = False
SHOWDEBUG1 = False
SHOWDEBUG2 = False
KEEPSTATS = False
#print ('level0: %s\nlevel1: %s\nlevel2: %s' % (SHOWDEBUG0,SHOWDEBUG1,SHOWDEBUG2))
''' Version 1.2, updated cover image to match the changed website.
added info date on title
@ -43,80 +24,75 @@ except:
extended timeout from 2 to 10
changed oldest article from 10 to 1.2
changed max articles from 15 to 25
Version 1.9.1 18-04-2012
removed some debug settings
updated code to match new metro-layout
Version 1.9.2 24-04-2012
updated code to match new metro-layout
Version 1.9.3 25-04-2012
Changed a lot of custom code into calibre code, as the default calibre code has become much faster since the first version of this recipe
Added new feeds
Updated css
Changed order of regexes to speed up processing
'''
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro Nieuws NL'
oldest_article = 1.2
max_articles_per_feed = 25
__author__ = u'DrMerry'
description = u'Metro Nederland'
language = u'nl'
simultaneous_downloads = 3
__author__ = u'DrMerry'
description = u'Metro Nederland'
language = u'nl'
simultaneous_downloads = 5
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
timeout = 10
center_navbar = True
timefmt = ' [%A, %d %b %Y]'
center_navbar = True
timefmt = ' [%A, %d %b %Y]'
no_stylesheets = True
remove_javascript = True
remove_empty_feeds = True
cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
publication_type = 'newspaper'
encoding = 'utf-8'
remove_attributes = ['style', 'font', 'width', 'height']
encoding = 'utf-8'
remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope']#, 'href']
use_embedded_content = False
conversion_options = {
'authors' : 'Metro Nederland & calibre & DrMerry',
'author_sort' : 'Metro Nederland & calibre & DrMerry',
'publisher' : 'DrMerry/Metro Nederland'
}
extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\
#date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear: both;margin-bottom: 10px;font-size:0.5em; color: #616262;}\
.article-box-fact.module-title {clear:both;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
.article-body p{padding-bottom:10px;}div.column-1-3{margin-left: 19px;padding-right: 9px;}\
div.column-1-2 {display: inline;padding-right: 7px;}\
p.article-image-caption {font-size: 12px;font-weight: 300;color: #616262;margin-top: 5px;} \
p.article-image-caption .credits {font-style: italic;font-size: 10px;}\
div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\
div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\
img {border:0px; padding:2px;} hr.merryhr {width:30%; border-width:0px; color:green; margin-left:5px; background-color: green} div.column-3 {background-color:#eee; width:50%; margin:2px; float:right; padding:2px;} div.column-3 module-title {border: 1px solid #aaa} div.article-box-fact div.subtitle {font-weight:bold; color:green;}'
extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact.module-title{margin:8px 0}.article-box-fact.module-title,h2{font-size:1.1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2{border:0;padding:0}.column1,h1,h2{margin:0}'
preprocess_regexps = [
(re.compile(r'<img[^>]+top-line[^>]+>', re.DOTALL|re.IGNORECASE),
lambda match: '<hr class="merryhr" />'),
(re.compile(r'(<img[^>]+metronieuws\.nl/[^>]+/templates/[^>]+jpe?g[^>]+>|metronieuws\.nl/internal\-roxen\-unit\.gif)', re.DOTALL|re.IGNORECASE),
lambda match: ''),
(re.compile(r'(&nbsp;|\s|<img[^>]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)', re.DOTALL|re.IGNORECASE),lambda match: ' '),
#(re.compile(r'(&nbsp;|\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '),
#(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'')
#(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
]
remove_tags_before= dict(id='date')
remove_tags_after = [dict(name='div', attrs={'class':['column-1-3','gallery-text']})]#id='share-and-byline')]
remove_tags = [
dict(name=['iframe','script','noscript','style']),
dict(name='div', attrs={'class':[re.compile('column-[14]-5'),'col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)')]}),
dict(id=['column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'sidebar',re.compile('^article-\d'),'comments','gallery-1']),
dict(name='a', attrs={'name':'comments'}),
#dict(name='div', attrs={'data-href'}),
dict(name='img', attrs={'class':'top-line'}),
dict(attrs={'style':re.compile('^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$'),'title':'volledig scherm'})]
'''removed by before/after:
id:
column-1-5-top,'hidden_div','footer',
class:
'header',re.compile('^footer-[a-zA-Z0-9]+$),'header-links',
'''
def preprocess_html(self, soup):
if SHOWDEBUG0 == True:
mlog.setdefaults()
mlog.addTextAndTag(['Show debug = on with level'], [str(mlog.debuglevel)])
if KEEPSTATS == True:
mlog.addDebug('Stats will be calculated')
else:
mlog.addTextAndTag(['Stats won\'t be calculated\nTo be enabled, stats must be true, currently','and debug level must be 1 or higher, currently'],[mstat.dokeepmystats, mlog.debuglevel])
mlog.showDebug()
myProcess = MerryProcess()
myProcess.moveTitleAndAuthor(soup)
myProcess.removeUnwantedTags(soup)
return soup
def postprocess_html(self, soup, first):
myProcess = MerryProcess()
myProcess.optimizeLayout(soup)
if SHOWDEBUG0 == True:
if KEEPSTATS == True:
statinfo = 'generated stats:'
statinfo += str(mstat.stats(mstat.statslist))
print statinfo
statinfo = 'generated stats (for removed tags):'
statinfo += str(mstat.stats(mstat.removedtagslist))
print statinfo
#show all Debug info we forgot to report
#Using print to be sure that this text will not be added at the end of the log.
print '\n!!!!!unreported messages:\n(should be empty)\n'
mlog.showDebug()
return soup
feeds = [
@ -128,295 +104,109 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
(u'Buitenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-4'),
(u'Columns', u'http://www.metronieuws.nl/rss.xml?c=1277377288-17'),
(u'Entertainment', u'http://www.metronieuws.nl/rss.xml?c=1277377288-2'),
(u'Dot', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
(u'Strips',u'http://www.metronieuws.nl/rss.xml?c=1325037714-0'),
(u'Tech', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
(u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
(u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
(u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
(u'Carri&egrave;re', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
(u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
(u'Wetenschap',u'http://www.metronieuws.nl/rss.xml?c=1303088437-0'),
(u'Planeet',u'http://www.metronieuws.nl/rss.xml?c=1277377288-14'),
(u'Gezondheid',u'http://www.metronieuws.nl/rss.xml?c=1277377288-15'),
(u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
]
class MerryPreProcess():
def replacePictures(self, soup):
#to be implemented
return soup
def optimizePicture(self,soup):
if SHOWDEBUG0 == True:
mlog.addDebug('start image optimize')
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
img.trim(0)
img.save(iurl)
if SHOWDEBUG0 == True:
mlog.addDebug('Images optimized')
mlog.showDebug()
try:
iurl = tag['src']
img = Image()
img.open(iurl)
img.trim(0)
img.save(iurl)
except:
print '\n!!image optimize failed!!\n'
continue
return soup
class MerryExtract():
def safeRemovePart(self, killingSoup, soupIsArray):
if killingSoup and not killingSoup == None:
if SHOWDEBUG2 == True:
mlog.addTextAndTag(['items to remove'],[killingSoup])
try:
if soupIsArray == True:
for killer in killingSoup:
killer.extract()
else:
killingSoup.extract()
if SHOWDEBUG1 == True:
mlog.addDebug('tag extracted')
mlog.showDebug()
if KEEPSTATS == True:
try:
mstat.addstat(mstat.removedtagslist,str(killingSoup.name))
except:
mstat.addstat(mstat.removedtagslist,'unknown')
except:
if SHOWDEBUG1 == True:
mlog.addDebug('tag extraction failed')
mlog.showDebug()
if KEEPSTATS == True:
mstat.addstat(mstat.removedtagslist,'exception')
return False
else:
return False
return killingSoup
class MerryReplace():
myKiller = MerryExtract()
def replaceATag(self, soup):
anchors = []
anchors = soup.findAll('a')
if anchors and not (anchors == None or anchors == []):
try:
for link in anchors:
# print str(link)
if link and not link == None:
# print ('type: %s'%(str(type(link))))
# print ('link: %s' % (link))
myParent = link.parent
# print str('parent: %s'%(myParent))
try:
myIndex = link.parent.index(link)
hasIndex = True
except:
myIndex = 0
hasIndex = False
# print str('index %s'%(myIndex))
if not link.string == None:
# print 'link=notnone'
if hasIndex == True:
myParent.insert(myIndex, link.string)
else:
myParent.append(link.string)
else:
# print 'link=none'
myParent.insert(myIndex, link.contents)
self.myKiller.safeRemovePart(link, False)
else:
notshown = 'tag received is empty' # print
except:
notshown = 'tag received is empty' # print
notshown
return soup
class MerryProcess(BeautifulSoup):
myKiller = MerryExtract()
myReplacer = MerryReplace()
myPrepare = MerryPreProcess()
def optimizeLayout(self,soup):
self.myPrepare.optimizePicture(soup)
if SHOWDEBUG0 == True:
mlog.addDebug('End of Optimize Layout')
mlog.showDebug()
return soup
def insertFacts(self, soup):
allfacts = soup.findAll('div', {'class':re.compile('^article-box-fact.*$')})
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['allfacts'],[allfacts])
mlog.showDebug()
thefactpart = re.compile('^article-box-fact.*$')
allfacts = soup.findAll('div', {'class':thefactpart})
if allfacts and not allfacts == None:
allfactsparent = soup.find('div', {'class':re.compile('^article-box-fact.*$')}).parent
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['allfactsparent'],[allfactsparent])
mlog.showDebug()
allfactsparent = soup.find('div', {'class':thefactpart}).parent
for part in allfactsparent:
if not part in allfacts:
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['FOUND A non-fact'],[part])
mlog.showDebug()
self.myKiller.safeRemovePart(part, True)
if SHOWDEBUG1 == True:
mlog.addTextAndTag(['New All Facts'],[allfacts])
mlog.showDebug()
articlefacts = soup.find('div', {'class':'article-box-fact column'})
errorOccured=False
if (articlefacts and not articlefacts==None):
try:
contenttag = soup.find('div', {'class':'article-body'})
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['curcontag'],[contenttag])
mlog.showDebug()
foundrighttag = False
if contenttag and not contenttag == None:
foundrighttag = True
if SHOWDEBUG0 == True:
if errorOccured == False:
mlog.addTextAndTag(['type','curcontag (in while)'],[type(contenttag),contenttag])
else:
mlog.addDebug('Could not find right parent tag. Error Occured')
mlog.showDebug()
if foundrighttag == True:
contenttag.insert(0, allfactsparent)
if SHOWDEBUG2 == True:
mlog.addTextAndTag(['added parent'],[soup.prettify()])
mlog.showDebug()
except:
errorOccured=True
mlog.addTrace()
else:
errorOccured=True
if SHOWDEBUG0 == True and errorOccured == True:
mlog.addTextAndTag(['no articlefacts'],[articlefacts])
mlog.showDebug()
pass
return soup
def moveTitleAndAuthor(self, soup):
moveitem = soup.h1
pubdate = soup.find(id="date")
if moveitem and not moveitem == None and pubdate and not pubdate == None:
try:
pubdate.parent.insert(0, moveitem)
except:
print '\n!!error in moving title!!\n'
pass
moveitem = None
moveitem = soup.find('div', {'class':'byline'})
if moveitem and not moveitem == None:
try:
moveitem.parent.parent.insert(-1, moveitem)
except:
print '\n!!error in moving byline!!\n'
pass
return soup
def previousNextSibRemover(self, soup, previous=True, soupIsArray=False):
findsibsof = soup
firstpart = previous
if findsibsof and not findsibsof == None:
if soupIsArray == True:
for foundsib in findsibsof:
self.previousNextSibRemover(foundsib, firstpart, soupIsArray=False)
else:
if firstpart == True and soupIsArray == False:
sibs = findsibsof.previousSiblingGenerator()
else:
sibs = findsibsof.nextSiblingGenerator()
for sib in sibs:
self.myKiller.safeRemovePart(sib, True)
else:
if SHOWDEBUG1 == True:
mlog.addDebug('Not any sib found')
return
def removeUnwantedTags(self,soup):
if SHOWDEBUG1 == True:
mlog.addTextAndTag(['Len of Soup before RemoveTagsByName'],[len(str(soup))])
mlog.showDebug()
self.removeTagsByName(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before firstandlastpart: %s' % len(str(soup)))
mlog.showDebug()
self.insertFacts(soup)
self.removeFirstAndLastPart(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before unwantedpart: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedParts(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.removeEmptyTags(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup after EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.myReplacer.replaceATag(soup)
return soup
def removeUnwantedParts(self, soup):
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before UnwantedID: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByID(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Class: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByClass(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Style: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByStyle(soup)
return soup
def removeUnwantedTagsByStyle(self,soup):
self.removeArrayOfTags(soup.findAll(attrs={'style' : re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")}))
if SHOWDEBUG0 == True:
mlog.addDebug('end remove by style')
self.removeArrayOfTags(soup.findAll(attrs={'class': 'share-tools-bottom'})) # at end to keep author
return soup
def removeArrayOfTags(self,souparray):
return self.myKiller.safeRemovePart(souparray, True)
def removeUnwantedTagsByClass(self,soup):
if SHOWDEBUG0 == True:
mlog.addDebug('start remove by class')
self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15)$')}))
return soup
def removeUnwantedTagsByID(self,soup):
defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer']
for removeid in defaultids:
if SHOWDEBUG1 == True:
mlog.addDebug('RemoveTagByID, tag: %s, Len of Soup: %s' % (str(removeid), len(str(soup))))
mlog.showDebug()
self.removeArrayOfTags(soup.findAll(id=removeid))
return soup
# def safeRemoveTag(self, subtree):
# return self.myKiller.safeRemovePart(subtree, True)
def removeTagsByName(self, soup):
self.myKiller.safeRemovePart(soup.script, True)
self.myKiller.safeRemovePart(soup.iframe, True)
self.myKiller.safeRemovePart(soup.style, True)
self.myKiller.safeRemovePart(soup.noscript, True)
return soup
def removeEmptyTags(self,soup,run=0):
if SHOWDEBUG0 == True:
mlog.addDebug('starting removeEmptyTags')
if SHOWDEBUG1 == True:
run += 1
mlog.addDebug(run)
if SHOWDEBUG2 == True:
mlog.addDebug(str(soup.prettify()))
mlog.showDebug()
emptymatches = re.compile('^(&nbsp;|\s|\n|\r|\t)*$')
emptymatches = re.compile('^[&nbsp;\s\n\r\t ]*$')
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or emptymatches.match(tag.string)) and not tag.isSelfClosing)
if emptytags and not (emptytags == None or emptytags == []):
if SHOWDEBUG1 == True:
mlog.addDebug('tags found')
mlog.addDebug(str(emptytags))
self.removeArrayOfTags(emptytags)
#recursive in case removing empty tag creates new empty tag
self.removeEmptyTags(soup, run=run)
else:
if SHOWDEBUG1 == True:
mlog.addDebug('no empty tags found')
mlog.showDebug()
if SHOWDEBUG0 == True:
if SHOWDEBUG2 == True:
mlog.addDebug('new soup:')
mlog.addDebug(str(soup.prettify()))
mlog.addDebug('RemoveEmptyTags Completed')
mlog.showDebug()
return soup
def removeFirstAndLastPart(self,soup):
def findparenttag(lookuptag):
if lookuptag and not lookuptag == None:
return lookuptag.findParents()
findtag = soup.find(id="date")
self.previousNextSibRemover(findtag, previous=True, soupIsArray=False)
self.previousNextSibRemover(findparenttag(findtag), previous=True, soupIsArray=True)
for endtag in [soup.find(id="share-and-byline"), soup.find("div", { "class" : "gallery-text" })]:
self.previousNextSibRemover(endtag, previous=False, soupIsArray=False)
self.previousNextSibRemover(findparenttag(endtag), previous=False, soupIsArray=True)
return soup
return soup
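
The recursive empty-tag sweep used by MerryProcess.removeEmptyTags above can be reduced to the following standalone sketch (assumes calibre's bundled BeautifulSoup 3; the debug and statistics plumbing is omitted):

import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup

EMPTY = re.compile(r'^\s*$')

def remove_empty_tags(soup):
    empties = soup.findAll(lambda tag: tag.find(True) is None
                           and (tag.string is None or EMPTY.match(tag.string))
                           and not tag.isSelfClosing)
    if empties:
        for tag in empties:
            tag.extract()
        # removing a tag can leave its parent empty, so sweep again
        remove_empty_tags(soup)

soup = BeautifulSoup('<div><p></p><span>  </span><b>keep</b></div>')
remove_empty_tags(soup)
print(soup)  # -> <div><b>keep</b></div>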

View File

@ -1,52 +1,30 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro UK'
description = 'News as provide by The Metro -UK'
#timefmt = ''
__author__ = 'Dave Asbury'
#last update 3/12/11
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
no_stylesheets = True
#no_stylesheets = True
oldest_article = 1
max_articles_per_feed = 20
max_articles_per_feed = 10
remove_empty_feeds = True
remove_javascript = True
auto_cleanup = True
#preprocess_regexps = [(re.compile(r'Tweet'), lambda a : '')]
preprocess_regexps = [
(re.compile(r'<span class="img-cap legend">', re.IGNORECASE | re.DOTALL), lambda match: '<p></p><span class="img-cap legend"> ')]
preprocess_regexps = [
(re.compile(r'tweet', re.IGNORECASE | re.DOTALL), lambda match: '')]
language = 'en_GB'
masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
keep_only_tags = [
dict(name='h1'),dict(name='h2', attrs={'class':'h2'}),
dict(attrs={'class':['img-cnt figure']}),
dict(attrs={'class':['art-img']}),
dict(name='div', attrs={'class':'art-lft'}),
dict(name='p')
]
remove_tags = [
dict(name = 'div',attrs={'id' : ['comments-news','formSubmission']}),
dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r','username','clrd' ]}),
dict(attrs={'class':['username', 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime','addYourComment','displayName']})
,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'})
]
feeds = [
(u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')]
extra_css = '''
body {font: sans-serif medium;}
h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
span{ font-size:9.5px; font-weight:bold;font-style:italic}
p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''

View File

@ -7,12 +7,12 @@ class naczytniki(BasicNewsRecipe):
cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png'
language = 'pl'
description ='everything about e-readers'
category='readers'
category='e-readers'
no_stylesheets=True
use_embedded_content=False
oldest_article = 7
max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'<p><br><b>Zobacz także:</b></p>.*?</body>', re.DOTALL), lambda match: '</body>') ]
remove_tags_after= dict(name='div', attrs={'class':'sociable'})
keep_only_tags=[dict(name='div', attrs={'class':'post'})]
remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})]
feeds = [(u'Wpisy', u'http://naczytniki.pl/?feed=rss2')]
feeds = [(u'Wpisy', u'http://naczytniki.pl/?feed=rss2')]
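
The preprocess_regexps rule above truncates each post at its "Zobacz także:" ("See also:") block, discarding everything from that marker to the end of the document. The same rule in isolation (sample markup is illustrative):

import re
pat = re.compile(u'<p><br><b>Zobacz tak\u017ce:</b></p>.*?</body>', re.DOTALL)
html = u'<body><p>tekst</p><p><br><b>Zobacz tak\u017ce:</b></p><ul><li>...</li></ul></body>'
print(pat.sub('</body>', html))  # -> <body><p>tekst</p></body>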

View File

@ -9,8 +9,9 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
class recipeMagic(BasicNewsRecipe):
title = 'National Geographic PL'
__author__ = 'Marcin Urban 2011'
__modified_by__ = 'fenuks'
description = 'legenda wśród magazynów z historią sięgającą 120 lat'
cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
#cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
@ -42,11 +43,43 @@ class recipeMagic(BasicNewsRecipe):
]
remove_attributes = ['width','height']
feeds=[]
feeds = [
('National Geographic PL', 'http://www.national-geographic.pl/rss/'),
]
def find_articles(self, url):
articles = []
soup=self.index_to_soup(url)
tag=soup.find(attrs={'class':'arl'})
art=tag.ul.findAll('li')
for i in art:
title=i.a['title']
url=i.a['href']
#date=soup.find(id='footer').ul.li.string[41:-1]
desc=i.div.p.string
articles.append({'title' : title,
'url' : url,
'date' : '',
'description' : desc
})
return articles
def parse_index(self):
feeds = []
feeds.append((u"Aktualności", self.find_articles('http://www.national-geographic.pl/aktualnosci/')))
feeds.append((u"Artykuły", self.find_articles('http://www.national-geographic.pl/artykuly/')))
return feeds
def print_version(self, url):
return url.replace('artykuly0Cpokaz', 'drukuj-artykul')
if 'artykuly' in url:
return url.replace('artykuly/pokaz', 'drukuj-artykul')
elif 'aktualnosci' in url:
return url.replace('aktualnosci/pokaz', 'drukuj-artykul')
else:
return url
def get_cover_url(self):
soup = self.index_to_soup('http://www.national-geographic.pl/biezace-wydania/')
tag=soup.find(attrs={'class':'txt jus'})
self.cover_url=tag.img['src']
return getattr(self, 'cover_url', self.cover_url)
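
This recipe now bypasses RSS: find_articles scrapes each section page, and parse_index returns a list of (feed title, article list) tuples, which calibre accepts in place of parsed feeds. The revised print_version branches on the section, and can be exercised on its own (sample URL is hypothetical):

def print_version(url):
    if 'artykuly' in url:
        return url.replace('artykuly/pokaz', 'drukuj-artykul')
    elif 'aktualnosci' in url:
        return url.replace('aktualnosci/pokaz', 'drukuj-artykul')
    return url

print(print_version('http://www.national-geographic.pl/artykuly/pokaz/przyklad/'))
# -> http://www.national-geographic.pl/drukuj-artykul/przyklad/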

View File

@ -0,0 +1,16 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1335362999(BasicNewsRecipe):
title = u'Non leggerlo'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = False
keep_only_tags = [
dict(name='div', attrs={'class':'post hentry'})
]
feeds = [(u'Non leggerlo', u'http://nonleggerlo.blogspot.com/feeds/posts/default')]
description = 'An Italian satirical blog'
language = 'it'
__author__ = 'faber1971'
__version__ = 'v1.0'
__date__ = '24, April 2012'

View File

@ -81,5 +81,7 @@ class Nowa_Fantastyka(BasicNewsRecipe):
title=soup.find(attrs={'class':'tytul'})
if title:
title['style']='font-size: 20px; font-weight: bold;'
self.log.warn(soup)
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.INDEX + a['href']
return soup

View File

@ -0,0 +1,76 @@
__license__ = 'GPL v3'
__copyright__ = '2012'
'''
nrc.nl
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class NRC(BasicNewsRecipe):
title = 'NRC Handelsblad'
__author__ = 'veezh'
description = 'Nieuws (no subscription needed)'
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'utf-8'
publisher = 'nrc.nl'
category = 'news, Netherlands, world'
language = 'nl'
timefmt = ''
#publication_type = 'newsportal'
extra_css = '''
h1{font-size:130%;}
#h2{font-size:100%;font-weight:normal;}
#.href{font-size:xx-small;}
.bijschrift{color:#666666; font-size:x-small;}
#.main-article-info{font-family:Arial,Helvetica,sans-serif;}
#full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
#match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
'''
#preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
}
remove_empty_feeds = True
filterDuplicates = True
def preprocess_html(self, soup):
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup
keep_only_tags = [dict(name='div', attrs={'class':'article'})]
remove_tags_after = [dict(id='broodtekst')]
# keep_only_tags = [
# dict(name='div', attrs={'class':['label']})
# ]
# remove_tags_after = [dict(name='dl', attrs={'class':['tags']})]
# def get_article_url(self, article):
# link = article.get('link')
# if 'blog' not in link and ('chat' not in link):
# return link
feeds = [
# ('Nieuws', 'http://www.nrc.nl/rss.php'),
('Binnenland', 'http://www.nrc.nl/nieuws/categorie/binnenland/rss.php'),
('Buitenland', 'http://www.nrc.nl/nieuws/categorie/buitenland/rss.php'),
('Economie', 'http://www.nrc.nl/nieuws/categorie/economie/rss.php'),
('Wetenschap', 'http://www.nrc.nl/nieuws/categorie/wetenschap/rss.php'),
('Cultuur', 'http://www.nrc.nl/nieuws/categorie/cultuur/rss.php'),
('Boeken', 'http://www.nrc.nl/boeken/rss.php'),
('Tech', 'http://www.nrc.nl/tech/rss.php/'),
('Klimaat', 'http://www.nrc.nl/klimaat/rss.php/'),
]
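
The preprocess_html above replaces every anchor that has simple text content with that text, stripping the links from the article body. The same idea in isolation, assuming calibre's bundled BeautifulSoup 3:

from calibre.ebooks.BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<p>Lees <a href="/artikel">verder</a> hier</p>')
for alink in soup.findAll('a'):
    if alink.string is not None:
        alink.replaceWith(alink.string)
print(soup)  # -> <p>Lees verder hier</p>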

View File

@ -1,45 +1,69 @@
# Talking Points is not grabbing everything.
# The look is right, but only the last one added?
import re
import time
import traceback
# above for debugging via stack
from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
# strip ads and graphics
# Current Column lacks a title.
# Talking Points Memo - shorten title - Remove year and Bill's name
import os
from calibre.web.feeds import feeds_from_index
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
# To Do: strip ads and graphics, Current Column lacks a title.
# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12
# ./ebook-convert --username xxx --password xxx
# this is derived from BasicNewsRecipe, so it can only override what that class defines.
# Some of what we need is otherwise in the article, so we have more copying to do than otherwise.
class OReillyPremium(BasicNewsRecipe):
title = u'OReilly Premium'
__author__ = 'TMcN'
language = 'en'
description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y')
title = 'Bill O\'Reilly Premium'
auto_cleanup = True
conversion_options = {'linearize_tables': True}
encoding = 'utf8'
needs_subscription = True
language = 'en'
no_stylesheets = True
oldest_article = 20
needs_subscription = True
oldest_article = 31
remove_javascript = True
remove_tags = [dict(name='img', attrs={})]
# Don't go down
recursions = 0
max_articles_per_feed = 2000
max_articles_per_feed = 20
debugMessages = True
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
# ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
# ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
# ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
# ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
]
feeds = [
(u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
(u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
(u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
(u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
(u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
]
# http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day.
# Note: Talking Points is broken in the above model; the site changed to more Ajax-y.
# Now using RSS
def get_browser(self):
print("In get_browser")
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
@ -66,6 +90,7 @@ class OReillyPremium(BasicNewsRecipe):
def stripBadChars(self, inString) :
return inString.replace("\'", "")
def parseGeneric(self, baseURL):
# Does a generic parsing of the articles. There are six categories (0-5)
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
@ -73,6 +98,7 @@ class OReillyPremium(BasicNewsRecipe):
fullReturn = []
for i in range(len(self.catList)) :
articleList = []
print("In "+self.catList[i][0]+", index: "+ str(i))
soup = self.index_to_soup(self.catList[i][1])
# Set defaults
description = 'None'
@ -81,14 +107,12 @@ class OReillyPremium(BasicNewsRecipe):
# 3-5 create one.
# So no for-div for 3-5
if i < 3 :
if i == 0 :
print("Starting TV Archives")
for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
print("Next DIV:")
print(div)
if i == 1:
a = div.find('a', href=True)
else :
a = div
print(a)
a = div
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
@ -96,82 +120,63 @@ class OReillyPremium(BasicNewsRecipe):
continue
# url = baseURL+re.sub(r'\?.*', '', a['href'])
url = baseURL+a['href']
if i < 2 :
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = self.tag_to_string(a, use_alt=True).strip()
elif i == 2 :
# Daily Briefs
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = div.contents[0]
if self.debugMessages :
print(title+" @ "+url)
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = self.tag_to_string(a, use_alt=True).strip()
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
elif i == 3 : # Stratfor
a = soup.find('a', self.catList[i][3])
if a is None :
continue
url = baseURL+a['href']
title = self.tag_to_string(a, use_alt=True).strip()
# Get Stratfor contents so we can get the real title.
stratSoup = self.index_to_soup(url)
title = stratSoup.html.head.title.string
stratIndex = title.find('Stratfor.com:', 0)
if (stratIndex > -1) :
title = title[stratIndex+14:-1]
# Look for first blogBody <td class="blogBody"
# Changed 12 Jan 2012 - new page format
#stratBlogTable = stratSoup.find('td', {'class':['blogBody']}).findParent('table')
#stratBody = stratSoup.find('td', {'class':['blogBody']})
elif i == 4 : # Talking Points
topDate = soup.find("td", "blogBody")
if not topDate :
print("Failed to find date in Talking Points")
# This page has the contents in double-wrapped tables!
myTable = topDate.findParents('table')[0]
if myTable is not None:
upOneTable = myTable.findParents('table')[0]
if upOneTable is not None:
upTwo = upOneTable.findParents('table')[0]
if upTwo is None:
continue
# Now navigate rows of upTwo
if self.debugMessages :
print("Entering rows")
for rows in upTwo.findChildren("tr", recursive=False):
# Inside top level table, each row is an article
rowTable = rows.find("table")
articleTable = rowTable.find("table")
# This looks wrong.
articleTable = rows.find("tr")
# The middle table is just for formatting the article buffer... but this means we can skip the inner table.
blogDate = articleTable.find("a","blogDate").contents[0]
# Skip to second blogBody for this.
blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
url = baseURL+re.sub(r'\?.*', '', blogURL)
title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
if self.debugMessages :
print("Talking Points Memo title "+title+" at url: "+url)
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
else : # Current Column
titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
if titleSpan is None :
print("No Current Column Title Span")
print(soup)
continue
title = titleSpan.contents[0]
url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
if i == 3 or i == 5 :
if i == 1 :
if self.debugMessages :
print(self.catList[i][0]+" Title:"+title+" at url: "+url)
summary = div.find(True, attrs={'class':'summary'})
if summary:
print("At Summary")
print(summary)
if summary is not None:
description = self.tag_to_string(summary, use_alt=False)
print("At append")
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
self.catList[i][3] = articleList
fullReturn.append((self.catList[i][0], articleList))
print("Returning")
# print fullReturn
return fullReturn
# build_index() starts with:
# try:
# feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
# max_articles_per_feed=self.max_articles_per_feed,
# log=self.log)
# self.report_progress(0, _('Got feeds from index page'))
# except NotImplementedError:
# feeds = self.parse_feeds()
# which in turn is from __init__.py
#def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
# log=default_log):
#'''
#@param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
#@return: A list of L{Feed} objects.
#@rtype: list
#'''
#feeds = []
#for title, articles in index:
# pfeed = Feed(log=log)
# pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
# max_articles_per_feed=max_articles_per_feed)
# feeds.append(pfeed)
# return feeds
# use_embedded_content defaults to None, at which point if the content is > 2K, it is used as the article.
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@ -182,12 +187,19 @@ class OReillyPremium(BasicNewsRecipe):
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
# this is used instead of BasicNewsRecipe.parse_feeds().
# it is called by download
def parse_index(self):
# Parse the page into Python Soup
print("Entering recipe print_index from:")
traceback.print_stack()
print("web")
baseURL = "https://www.billoreilly.com"
return self.parseGeneric(baseURL)
masterList = self.parseGeneric(baseURL)
#print(masterList)
return masterList
def preprocess_html(self, soup):
print("In preprocess_html")
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
@ -195,3 +207,128 @@ class OReillyPremium(BasicNewsRecipe):
raw = self.browser.open('https://www.billoreilly.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
def build_index(self):
print("In OReilly build_index()\n\n")
feedsRSS = []
self.report_progress(0, ('Fetching feeds...'))
#try:
feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
log=self.log)
self.report_progress(0, ('Got feeds from index page'))
#except NotImplementedError:
# feeds = self.parse_feeds()
# Now add regular feeds.
feedsRSS = self.parse_feeds()
print ("feedsRSS is type "+feedsRSS.__class__.__name__)
for articles in feedsRSS:
print("articles is type "+articles.__class__.__name__)
print("Title:" + articles.title)
feeds.append(articles)
if not feeds:
raise ValueError('No articles found, aborting')
#feeds = FeedCollection(feeds)
self.report_progress(0, ('Trying to download cover...'))
self.download_cover()
self.report_progress(0, ('Generating masthead...'))
self.masthead_path = None
try:
murl = self.get_masthead_url()
except:
self.log.exception('Failed to get masthead url')
murl = None
if murl is not None:
# Try downloading the user-supplied masthead_url
# Failure sets self.masthead_path to None
self.download_masthead(murl)
if self.masthead_path is None:
self.log.info("Synthesizing mastheadImage")
self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
try:
self.default_masthead_image(self.masthead_path)
except:
self.log.exception('Failed to generate default masthead image')
self.masthead_path = None
if self.test:
feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1
index = os.path.join(self.output_dir, 'index.html')
html = self.feeds2index(feeds)
with open(index, 'wb') as fi:
fi.write(html)
self.jobs = []
if self.reverse_article_order:
for feed in feeds:
if hasattr(feed, 'reverse'):
feed.reverse()
self.feed_objects = feeds
for f, feed in enumerate(feeds):
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir)
for a, article in enumerate(feed):
if a >= self.max_articles_per_feed:
break
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
try:
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
except:
self.log.exception('Failed to find print version for: '+article.url)
url = None
if not url:
continue
func, arg = (self.fetch_embedded_article, article) \
if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \
else \
((self.fetch_obfuscated_article if self.articles_are_obfuscated \
else self.fetch_article), url)
req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
{}, (f, a), self.article_downloaded,
self.error_in_article_download)
req.feed = feed
req.article = article
req.feed_dir = feed_dir
self.jobs.append(req)
self.jobs_done = 0
tp = ThreadPool(self.simultaneous_downloads)
for req in self.jobs:
tp.putRequest(req, block=True, timeout=0)
self.report_progress(0, ('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
while True:
try:
tp.poll()
time.sleep(0.1)
except NoResultsPending:
break
for f, feed in enumerate(feeds):
print("Writing feeds for "+feed.title)
html = self.feed2index(f,feeds)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
fi.write(html)
self.create_opf(feeds)
self.report_progress(1, ('Feeds downloaded to %s')%index)
return index


@@ -1,3 +1,4 @@
import urllib, re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1279258912(BasicNewsRecipe):
@@ -27,12 +28,30 @@ class AdvancedUserRecipe1279258912(BasicNewsRecipe):
encoding = 'utf-8'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.orlandosentinel.com/media/graphic/2009-07/46844851.gif'
keep_only_tags = [
dict(name='div', attrs={'class':'story'})
]
remove_tags = [
dict(name='div', attrs={'class':['articlerail','tools','comment-group','clearfix']}),
]
remove_tags_after = [
dict(name='p', attrs={'class':'copyright'}),
]
auto_cleanup = True
def get_article_url(self, article):
ans = None
try:
s = article.summary
ans = urllib.unquote(
re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
except:
pass
if ans is None:
link = article.get('feedburner_origlink', None)
if link and link.split('/')[-1]=="story01.htm":
link=link.split('/')[-2]
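# Feedburner obfuscates the real URL in 'story01.htm' links; the
# two-character codes below ('0B' -> '.', '0C' -> '/', ...) decode
# it back into a plain URL.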
encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
'0D': '?', '0E': '-', '0N': '.com', '0L': 'http:',
'0S':'//'}
for k, v in encoding.iteritems():
link = link.replace(k, v)
ans = link
elif link:
ans = link
if ans is not None:
return ans.replace('?track=rss', '')


@@ -14,6 +14,7 @@ class OurDailyBread(BasicNewsRecipe):
language = 'en'
max_articles_per_feed = 100
no_stylesheets = True
auto_cleanup = True
use_embedded_content = False
category = 'ODB, Daily Devotional, Bible, Christian Devotional, Devotional, RBC Ministries, Our Daily Bread, Devotionals, Daily Devotionals, Christian Devotionals, Faith, Bible Study, Bible Studies, Scripture, RBC, religion'
encoding = 'utf-8'
@@ -25,12 +26,12 @@ class OurDailyBread(BasicNewsRecipe):
,'linearize_tables' : True
}
keep_only_tags = [dict(attrs={'class':'module-content'})]
remove_tags = [
dict(attrs={'id':'article-zoom'})
,dict(attrs={'class':'listen-now-box'})
]
remove_tags_after = dict(attrs={'class':'readable-area'})
#keep_only_tags = [dict(attrs={'class':'module-content'})]
#remove_tags = [
#dict(attrs={'id':'article-zoom'})
#,dict(attrs={'class':'listen-now-box'})
#]
#remove_tags_after = dict(attrs={'class':'readable-area'})
extra_css = '''
.text{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}


@@ -17,21 +17,8 @@ class Overclock_pl(BasicNewsRecipe):
remove_tags=[dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})]
feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'), (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')]
def append_page(self, soup, appendtag):
tag=soup.find(id='navigation')
if tag:
nexturl=tag.findAll('option')
tag.extract()
for nextpage in nexturl[2:]:
soup2 = self.index_to_soup(nextpage['value'])
pagetext = soup2.find(id='content')
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
rem=appendtag.find(attrs={'alt':'Pierwsza'})
if rem:
rem.parent.extract()
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup
def print_version(self, url):
if 'articles/show' in url:
return url.replace('show', 'showall')
else:
return url


@@ -10,5 +10,7 @@ class palmtop_pl(BasicNewsRecipe):
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content=True
#remove_tags_before=dict(name='h2')
#remove_tags_after=dict(attrs={'class':'entry clearfix'})
feeds = [(u'Newsy', u'http://palmtop.pl/feed/atom/')]


@@ -1,31 +1,32 @@
from calibre.web.feeds.news import BasicNewsRecipe
class PC_Arena(BasicNewsRecipe):
title = u'PCArena'
oldest_article = 18300
oldest_article = 7
max_articles_per_feed = 100
__author__ = 'fenuks'
description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
category = 'IT'
language = 'pl'
masthead_url='http://pcarena.pl/public/design/frontend/images/logo.gif'
cover_url= 'http://pcarena.pl/public/design/frontend/images/logo.gif'
index='http://pcarena.pl'
masthead_url='http://pcarena.pl/pcarena/img/logo.png'
cover_url= 'http://pcarena.pl/pcarena/img/logo.png'
no_stylesheets = True
keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})]
remove_tags=[dict(attrs={'class':'pages'})]
feeds = [(u'Newsy', u'http://pcarena.pl/misc/rss/news'), (u'Artyku\u0142y', u'http://pcarena.pl/misc/rss/articles')]
remove_empty_feeds=True
#keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})]
#remove_tags=[dict(attrs={'class':'pages'})]
feeds = [(u'Aktualności', u'http://pcarena.pl/aktualnosci/feeds.rss'), (u'Testy', u'http://pcarena.pl/testy/feeds.rss'), (u'Software', u'http://pcarena.pl/oprogramowanie/feeds.rss'), (u'Poradniki', u'http://pcarena.pl/poradniki/feeds.rss'), (u'Mobile', u'http://pcarena.pl/mobile/feeds.rss')]
def print_version(self, url):
return url.replace('show', 'print')
def append_page(self, soup, appendtag):
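# Multi-page articles: follow each pagination link (the 'pagNum' block)
# and append that page's 'artBody' to the first page, so the whole
# article lands in one document.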
tag=soup.find(name='div', attrs={'class':'pagNum'})
if tag:
nexturl=tag.findAll('a')
tag.extract()
for nextpage in nexturl[1:]:
nextpage= 'http://pcarena.pl' + nextpage['href']
soup2 = self.index_to_soup(nextpage)
pagetext = soup2.find(attrs={'class':'artBody'})
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
def image_url_processor(self, baseurl, url):
if 'http' not in url:
return 'http://pcarena.pl' + url
else:
return url
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup


@@ -10,32 +10,11 @@ class PC_Centre(BasicNewsRecipe):
masthead_url= 'http://pccentre.pl/views/images/logo.gif'
cover_url= 'http://pccentre.pl/views/images/logo.gif'
no_stylesheets = True
keep_only_tags= [dict(id='content')]
remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
feeds = [(u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')]
remove_empty_feeds = True
#keep_only_tags= [dict(id='content')]
#remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
remove_tags=[dict(attrs={'class':'logo_print'})]
feeds = [(u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')]
def append_page(self, soup, appendtag):
tag=soup.find(name='div', attrs={'class':'pages'})
if tag:
nexturl=tag.findAll('a')
tag.extract()
for nextpage in nexturl[:-1]:
nextpage= 'http://pccentre.pl' + nextpage['href']
soup2 = self.index_to_soup(nextpage)
pagetext = soup2.find(id='content')
rem=pagetext.findAll(attrs={'class':['subtitle', 'content_info', 'list_of_content', 'pages', 'social2', 'pcc_acc', 'pcc_acc_na']})
for r in rem:
r.extract()
rem=pagetext.findAll(id='comments')
for r in rem:
r.extract()
rem=pagetext.findAll('h1')
for r in rem:
r.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup
def print_version(self, url):
return url.replace('show', 'print')


@@ -1,5 +1,5 @@
"""
readitlaterlist.com
Pocket Calibre Recipe v1.0
"""
__license__ = 'GPL v3'
__copyright__ = '''
@@ -12,22 +12,23 @@ from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class Readitlater(BasicNewsRecipe):
title = 'ReadItLater'
class Pocket(BasicNewsRecipe):
title = 'Pocket'
__author__ = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan'
description = '''Personalized news feeds. Go to readitlaterlist.com to setup \
up your news. This version displays pages of articles from \
description = '''Personalized news feeds. Go to getpocket.com to setup up \
your news. This version displays pages of articles from \
oldest to newest, with max & minimum counts, and marks articles \
read after downloading.'''
publisher = 'readitlaterlist.com'
publisher = 'getpocket.com'
category = 'news, custom'
oldest_article = 7
max_articles_per_feed = 50
minimum_articles = 1
minimum_articles = 10
mark_as_read_after_dl = True
no_stylesheets = True
use_embedded_content = False
needs_subscription = True
INDEX = u'http://readitlaterlist.com'
INDEX = u'http://getpocket.com'
LOGIN = INDEX + u'/l'
readList = []
@@ -100,9 +101,31 @@ class Readitlater(BasicNewsRecipe):
br = self.get_browser()
for link in markList:
url = self.INDEX + link
print 'Marking read: ', url
response = br.open(url)
print response.info()
def cleanup(self):
self.mark_as_read(self.readList)
if self.mark_as_read_after_dl:
self.mark_as_read(self.readList)
else:
pass
def default_cover(self, cover_file):
'''
Create a generic cover for recipes that don't have a cover
This override adds time to the cover
'''
try:
from calibre.ebooks import calibre_cover
title = self.title if isinstance(self.title, unicode) else \
self.title.decode('utf-8', 'replace')
date = strftime(self.timefmt)
time = strftime('[%I:%M %p]')
img_data = calibre_cover(title, date, time)
cover_file.write(img_data)
cover_file.flush()
except:
self.log.exception('Failed to generate default cover')
return False
return True


@@ -1,5 +1,7 @@
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import re
import time
from urlparse import urlparse
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString
@@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe):
# Don't go down into linked pages
recursions = 0
max_articles_per_feed = 400
debugMessages = False
debugMessages = True
# Numeric parameter is type, controls whether we look for
feedsets = [
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Science", "http://www.realclearscience.com/index.xml", 0],
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Policy", "http://www.realclearpolicy.com/index.xml", 0],
["Science", "http://www.realclearscience.com/index.xml", 0],
["Tech", "http://www.realcleartechnology.com/index.xml", 0],
# The feedburner is essentially the same as the top feed, politics.
# ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
@@ -37,7 +40,9 @@ class RealClear(BasicNewsRecipe):
]
# Hints to extractPrintURL.
# First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
printhints = [
phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
printhints = [ ["realclear", "", '' , 'printpage'],
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
["politico.com", "Print", 'a', 'share-print'],
@@ -48,11 +53,24 @@ class RealClear(BasicNewsRecipe):
# usatoday - just prints with all current crap anyhow
]
# RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
# The print link isn't obvious, and only the end is needed (the -full append). So maybe try that first?
# http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
# Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
# Use the FULL PRINTPAGE URL; it formats it better too!
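# A sketch of the RCP print-URL construction described above (hypothetical
# helper, not used by this recipe; extractPrintURL scans the page instead):
#def rcp_print_url(article_url):
#    full = article_url.replace('.html', '-full.html')  # multi-page variant
#    return 'http://www.realclearpolitics.com/printpage/?url=' + full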
#
# NYT - try single page...
# Need special code - is it one page or several? Which URL?
# from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
# to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
# which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
def extractPrintURL(self, pageURL):
tagURL = pageURL
baseParse = urlparse(pageURL)
baseURL = baseParse[0]+"://"+baseParse[1]
hintsCount =len(self.printhints)
for x in range(0,hintsCount):
if pageURL.find(self.printhints[x][0])== -1 :
@@ -62,23 +80,37 @@ class RealClear(BasicNewsRecipe):
soup = self.index_to_soup(pageURL)
if soup is None:
return pageURL
if len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
# e.g. RealClear
if self.debugMessages == True :
print("search1")
print("Search by href: "+self.printhints[x][self.phHrefSearch])
printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
if self.debugMessages == True :
print("Search 1: "+self.printhints[x][2]+" Attributes: ")
print(self.printhints[x][3])
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
elif len(self.printhints[x][3])>0 :
if self.debugMessages == True :
print("search2")
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
else :
if self.debugMessages == True:
print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
if printFind is None:
if self.debugMessages == True :
print("Not Found")
# print(soup)
print("end soup\n\n");
continue
print(printFind)
if isinstance(printFind, NavigableString)==False:
if printFind['href'] is not None:
print("Check "+printFind['href']+" for base of "+baseURL)
if printFind['href'].find("http")!=0 :
return baseURL+printFind['href']
return printFind['href']
tag = printFind.parent
print(tag)
@@ -158,6 +190,7 @@ class RealClear(BasicNewsRecipe):
def parse_index(self):
# Parse the page into Python Soup
#articleList = []
ans = []
feedsCount = len(self.feedsets)
for x in range(0,feedsCount): # should be ,4
@@ -168,3 +201,4 @@ class RealClear(BasicNewsRecipe):
print(ans)
return ans


@@ -0,0 +1,59 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1335532466(BasicNewsRecipe):
title = u'Richmond Times-Dispatch'
description = 'News from Richmond, Virginia, USA'
__author__ = 'jde'
cover_url = 'http://static2.dukecms.com/va_tn/timesdispatch_com/site-media/img/icons/logo252x97.png'
language = 'en'
encoding = 'utf8'
oldest_article = 1 #days
max_articles_per_feed = 25
needs_subscription = False
remove_javascript = True
recursions = 0
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
feeds = [
('News',
'http://www2.timesdispatch.com/list/feed/rss/news-archive'),
('Breaking News',
'http://www2.timesdispatch.com/list/feed/rss/breaking-news'),
('National News',
'http://www2.timesdispatch.com/list/feed/rss/national-news'),
('Local News',
'http://www2.timesdispatch.com/list/feed/rss/local-news'),
('Business',
'http://www2.timesdispatch.com/list/feed/rss/business'),
('Local Business',
'http://www2.timesdispatch.com/list/feed/rss/local-business'),
('Politics',
'http://www2.timesdispatch.com/list/feed/rss/politics'),
('Virginia Politics',
'http://www2.timesdispatch.com/list/feed/rss/virginia-politics'),
('Editorials',
'http://www2.timesdispatch.com/list/feed/rss/editorial-desk'),
('Columnists and Blogs',
'http://www2.timesdispatch.com/list/feed/rss/news-columnists-blogs'),
('Opinion Columnists',
'http://www2.timesdispatch.com/list/feed/rss/opinion-editorial-columnists'),
('Letters to the Editor',
'http://www2.timesdispatch.com/list/feed/rss/opinion-letters'),
('Traffic',
'http://www2.timesdispatch.com/list/feed/rss/traffic'),
('Sports',
'http://www2.timesdispatch.com/list/feed/rss/sports2'),
('Entertainment/Life',
'http://www2.timesdispatch.com/list/feed/rss/entertainment'),
('Movies',
'http://www2.timesdispatch.com/list/feed/rss/movies'),
('Music',
'http://www2.timesdispatch.com/list/feed/rss/music'),
('Dining & Food',
'http://www2.timesdispatch.com/list/feed/rss/dining'),
]


@@ -6,6 +6,7 @@ Rue89
__author__ = '2010-2012, Louis Gesbert <meta at antislash dot info>'
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Rue89(BasicNewsRecipe):
@@ -15,23 +16,24 @@ class Rue89(BasicNewsRecipe):
title = u'Rue89'
language = 'fr'
oldest_article = 7
max_articles_per_feed = 12
max_articles_per_feed = 50
use_embedded_content = False
# From http://www.rue89.com/les-flux-rss-de-rue89
feeds = [
(u'La Une', u'http://www.rue89.com/feed'),
(u'Rue69', u'http://www.rue89.com/rue69/feed'),
(u'Eco', u'http://www.rue89.com/rue89-eco/feed'),
(u'Planète', u'http://www.rue89.com/rue89-planete/feed'),
(u'Sport', u'http://www.rue89.com/rue89-sport/feed'),
(u'Culture', u'http://www.rue89.com/culture/feed'),
(u'Hi-tech', u'http://www.rue89.com/hi-tech/feed'),
(u'Media', u'http://www.rue89.com/medias/feed'),
(u'Monde', u'http://www.rue89.com/monde/feed'),
(u'Politique', u'http://www.rue89.com/politique/feed'),
(u'Societe', u'http://www.rue89.com/societe/feed'),
# Other feeds disabled, 'La Une' seems to include them all
# (u'Rue69', u'http://www.rue89.com/rue69/feed'),
# (u'Eco', u'http://www.rue89.com/rue89-eco/feed'),
# (u'Planète', u'http://www.rue89.com/rue89-planete/feed'),
# (u'Sport', u'http://www.rue89.com/rue89-sport/feed'),
# (u'Culture', u'http://www.rue89.com/culture/feed'),
# (u'Hi-tech', u'http://www.rue89.com/hi-tech/feed'),
# (u'Media', u'http://www.rue89.com/medias/feed'),
# (u'Monde', u'http://www.rue89.com/monde/feed'),
# (u'Politique', u'http://www.rue89.com/politique/feed'),
# (u'Societe', u'http://www.rue89.com/societe/feed'),
]
# Follow redirection from feedsportal.com
@@ -41,19 +43,36 @@ class Rue89(BasicNewsRecipe):
def print_version(self, url):
return url + '?imprimer=1'
no_stylesheets = True
conversion_options = { 'smarten_punctuation' : True }
keep_only_tags = [
dict(name='div', attrs={'id':'article'}),
dict(name='div', attrs={'id':'content'}),
]
remove_tags_after = [
dict(name='div', attrs={'id':'plus_loin'}),
dict(name='div', attrs={'class':'stats'}),
]
remove_tags = [
dict(name='div', attrs={'id':'article_tools'}),
dict(name='div', attrs={'id':'plus_loin'}),
dict(name='div', attrs={'class':'stats'}),
dict(name='div', attrs={'class':'tools'}),
]
extra_css = "#content { padding: 0 0; }"
# Without this, parsing of video articles returns strange results
preprocess_regexps = [
(re.compile(r'<script.*?</script>', re.IGNORECASE|re.DOTALL), ''),
]
def preprocess_html(self, soup):
# Remove whole article if it's a "zapnet" (video)
if soup.find('h1', {'class':'zapnet_title'}):
return None
# Reduce h2 titles to h3
for title in soup.findAll('h2'):
title.name = 'h3'
return soup

recipes/sol_haber.recipe Normal file

@@ -0,0 +1,141 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Onur Gungor onurgu@gmail.com'
__docformat__ = 'restructuredtext en'
'''
www.sol.org.tr
'''
import datetime
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class SolHaberRecipe(BasicNewsRecipe):
title = u'soL Haber'
oldest_article = 7
max_articles_per_feed = 100
language = 'tr'
__author__ = 'Onur Güngör'
description = "Hayata soL'dan bakın.."
publisher = 'soL Haber'
tags = 'news, haberler, siyaset, türkiye, turkey, politics'
conversion_options = {
'comment' : description
, 'tags' : tags
, 'publisher' : publisher
, 'language' : language
}
category_dict = { 'sonuncu-kavga':'Sonuncu Kavga',
'devlet-ve-siyaset':'Devlet ve Siyaset',
'ekonomi':'Ekonomi',
'enternasyonal-gundem':'Enternasyonel Gündem',
'kent-gundemleri':'Kent Gündemleri',
'kultur-sanat':'Kültür Sanat',
'dunyadan':'Dünyadan',
'serbest-kursu':'Serbest Kürsü',
'medya':'Medya',
'liseliler':'Liseliler',
'yazarlar':'Köşe Yazıları'}
end_date = datetime.date.today().isoformat()
start_date = (datetime.date.today()-datetime.timedelta(days=1)).isoformat()
section_tuples = [['Köşe Yazıları', 'http://haber.sol.org.tr/arsiv?icerik=kose_yazisi&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
['Haberler', 'http://haber.sol.org.tr/arsiv?icerik=haber&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
['soL postal', 'http://haber.sol.org.tr/arsiv?icerik=postal&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
['Bizim Amerika', 'http://haber.sol.org.tr/arsiv?icerik=bizim_amerika&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)]]
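# In the templates above, %%5B/%%5D are the URL-encoded '[' and ']' of the
# site's tarih[min][date] / tarih[max][date] date-range parameters; the
# doubled %% survives the %s substitution as a literal %.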
# Disable stylesheets from site.
no_stylesheets = True
cover_margins = (20, 20, '#ffffff')
storybody_reg_exp = r'^\s*(haber|kose)\s*$'
comments_reg_exp = r'^\s*makale-elestiri\s*$'
remove_tags = [dict(name='div', attrs={'class':re.compile(comments_reg_exp, re.IGNORECASE)})]
keep_only_tags = [dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)})]
def get_masthead_title(self):
return self.title + "(" + self.end_date + ")"
def parse_index(self):
result = []
articles_dict = dict()
author_regexp = re.compile('^http://.*?/yazarlar/(.*?)/.*$')
category_regexp = re.compile('^http://.*?/(.+?)/.*$')
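# e.g. http://haber.sol.org.tr/yazarlar/<author>/<slug> yields the author
# in group(1); the category regexp likewise captures the first path segment.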
for section_tuple in self.section_tuples:
section_title = section_tuple[0]
section_index_url = section_tuple[1]
self.log('Bölüm:', section_title, 'URL:', section_index_url)
soup = self.index_to_soup(section_index_url)
logo = soup.find('div', id='logo').find('img', src=True)
if logo is not None:
self.cover_url = logo['src']
if self.cover_url.startswith('/'):
self.cover_url = 'http://haber.sol.org.tr'+self.cover_url
view_content = soup.find('div', id='ana-icerik').find('div', attrs={'class':'view-content'})
if view_content is None:
break
rows = view_content.find('tbody').findAll('tr')
self.log('Row sayısı', len(rows))
for row in rows:
cells = row.findAll('td')
a = cells[1].find('a', href=True)
url = a['href']
title = self.tag_to_string(a)
if url.startswith('/'):
url = 'http://haber.sol.org.tr'+url
category = section_title
category_match_result = category_regexp.match(url)
if category_match_result:
category = category_match_result.group(1)
date = self.tag_to_string(cells[2])
author = 'soL haber'
author_match_result = author_regexp.match(url)
if author_match_result:
author = author_match_result.group(1)
self.log('\tFound article:', title, 'at', url, 'published at ', date, 'by', author)
article = {'title':title, 'url':url, 'description':None, 'date':date, 'author':author}
if category in articles_dict:
articles_dict[category].append(article)
else:
articles_dict[category] = [article]
for category in articles_dict.keys():
if category in self.category_dict:
result.append((self.category_dict[category], articles_dict[category]))
else:
result.append((category, articles_dict[category]))
return result


@@ -15,6 +15,8 @@ class Soldiers(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
auto_cleanup = True
auto_cleanup_keep = '//div[@id="mediaWrapper"]'
simultaneous_downloads = 1
delay = 4
max_connections = 1
@@ -31,14 +33,14 @@ class Soldiers(BasicNewsRecipe):
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})]
#keep_only_tags = [dict(name='div', attrs={'id':['storyHeader','textArea']})]
remove_tags = [
dict(name='div', attrs={'id':['addThis','comment','articleFooter']})
,dict(name=['object','link'])
]
#remove_tags = [
#dict(name='div', attrs={'id':['addThis','comment','articleFooter']})
#,dict(name=['object','link'])
#]
feeds = [(u'Frontpage', u'http://www.army.mil/rss/feeds/soldiersfrontpage.xml' )]
feeds = [(u'Frontpage', u'http://www.army.mil/rss/2/' )]
def get_cover_url(self):

recipes/southernstar.recipe Normal file

@@ -0,0 +1,136 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2012, watou'
'''
southernstar.ie
'''
import re
import tempfile
import os
import codecs
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
class TheSouthernStar(BasicNewsRecipe):
title = 'The Southern Star'
__author__ = 'watou'
description = 'West Cork\'s leading news and information provider since 1889'
NEWS_INDEX = 'http://www.southernstar.ie/news.php'
LOCAL_NOTES = 'http://www.southernstar.ie/localnotes.php'
SPORT_INDEX = 'http://www.southernstar.ie/sport.php'
CLASSIFIEDS = 'http://www.southernstar.ie/classifieds.php'
language = 'en_IE'
encoding = 'cp1252'
publication_type = 'newspaper'
masthead_url = 'http://www.southernstar.ie/images/logo.gif'
remove_tags_before = dict(name='div', attrs={'class':'article'})
remove_tags_after = dict(name='div', attrs={'class':'article'})
remove_tags = [dict(name='div', attrs={'style':'width:300px; position:relative'}),
dict(name='form'),
dict(name='div', attrs={'class':'endpanel'})]
no_stylesheets = True
tempfiles = []
pubdate = ''
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
def parse_index(self):
feeds = []
seen_titles = set([])
articles = self.fetch_ss_articles(self.NEWS_INDEX, seen_titles)
if articles:
feeds.append(('News', articles))
articles = self.fetch_ss_notes(self.LOCAL_NOTES)
if articles:
feeds.append(('Local Notes', articles))
articles = self.fetch_ss_articles(self.SPORT_INDEX, seen_titles)
if articles:
feeds.append(('Sport', articles))
articles = self.fetch_ss_notes(self.CLASSIFIEDS)
if articles:
feeds.append(('Classifieds', articles))
return feeds
def fetch_ss_articles(self, index, seen_titles):
articles = []
soup = self.index_to_soup(index)
ts = soup.find('div', {'class':'article'})
ds = self.tag_to_string(ts.find('strong'))
self.pubdate = ' ['+ds+']'
self.timefmt = ' [%s]'%ds
for post in ts.findAll('h1'):
a = post.find('a', href=True)
title = self.tag_to_string(a)
if title in seen_titles:
continue
seen_titles.add(title)
url = a['href']
if url.startswith('article'):
url = 'http://www.southernstar.ie/'+url
self.log('\tFound article:', title, 'at', url)
p = post.findNextSibling('p')
desc = None
if p is not None:
desc = str(p)
articles.append({'title':title, 'url':url, 'description':desc,
'date':self.pubdate})
return articles
def fetch_ss_notes(self, page):
articles = []
soup = self.index_to_soup(page)
ts = soup.find('div', {'class':'content'})
for post in ts.findAll('h1'):
title = self.tag_to_string(post)
self.log('\tFound note:', title)
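# Notes have no standalone pages on the site, so each one is written to a
# local temp HTML file and the article entry points at it via a file:// URL.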
f = tempfile.NamedTemporaryFile(suffix='.html',delete=False)
f.close()
f = codecs.open(f.name, 'w+b', self.encoding, 'replace')
url = "file://" + f.name
f.write(u'<html><head><meta http-equiv="Content-Type" content="text/html; charset='+
self.encoding+'"></head><body><h1>'+title+'</h1>')
f.write(str(post.findNextSibling('p')))
f.write(u'</body></html>')
self.log('\tWrote note to', f.name)
f.close()
self.tempfiles.append(f)
articles.append({'title':title, 'url':url, 'date':self.pubdate})
return articles
def postprocess_html(self, soup, first):
for table in soup.findAll('table', align='right'):
img = table.find('img')
if img is not None:
img.extract()
caption = self.tag_to_string(table).strip()
div = Tag(soup, 'div')
div['style'] = 'text-align:center'
div.insert(0, img)
div.insert(1, Tag(soup, 'br'))
if caption:
div.insert(2, NavigableString(caption))
table.replaceWith(div)
return soup
def image_url_processor(self, baseurl, url):
return url.replace(' ','%20')
def cleanup(self):
self.log('cleaning up')
for f in self.tempfiles:
os.unlink(f.name)
self.tempfiles = []


@@ -0,0 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Swiat_Obrazu(BasicNewsRecipe):
title = u'Swiat Obrazu'
__author__ = 'fenuks'
description = u'Internetowy Dziennik o Fotografii i Wideo www.SwiatObrazu.pl to źródło informacji o technice fotografii i wideo, o sprzęcie najbardziej znanych i uznanych firm: Canon, Nikon, Sony, Hasselblad i wielu innych. Znajdziecie tu programy do obróbki zdjęć, forum foto i forum wideo i galerie zdjęć. Codziennie najświeższe informacje: aktualności, testy, poradniki, wywiady, felietony. Swiatobrazu.pl stale organizuje konkursy oraz warsztaty fotograficzne i wideo.'
category = 'photography'
masthead_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
cover_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
language = 'pl'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
remove_javascript= True
use_embedded_content = False
feeds = [(u'Wszystko', u'http://www.swiatobrazu.pl/rss')]
def print_version(self, url):
return url + ',drukuj'
def image_url_processor(self, baseurl, url):
if 'http://' not in url and 'https://' not in url:
return 'http://www.swiatobrazu.pl' + url[5:]
else:
return url


@@ -8,10 +8,11 @@ class Tablety_pl(BasicNewsRecipe):
cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
category = 'IT'
language = 'pl'
use_embedded_content=True
oldest_article = 8
max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')]
remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'})
remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})]
#remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
#remove_tags_after=dict(name="footer", attrs={'class':'entry-footer clearfix'})
#remove_tags=[dict(name='footer', attrs={'class':'entry-footer clearfix'}), dict(name='div', attrs={'class':'entry-comment-counter'})]
feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]


@@ -34,7 +34,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
no_javascript = True
remove_empty_feeds = True
encoding = 'utf-8'
remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-date hcf-separate'}]
remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-colon'}, {'class':'hcf-date hcf-separate'}]
def print_version(self, url):
url = url.split('/')
@@ -51,6 +51,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
return ''.join(div.findAll(text=True, recursive=False)).strip() if div is not None else None
articles = {}
links = set()
key = None
ans = []
maincol = soup.find('div', attrs={'class':re.compile('hcf-main-col')})
@@ -59,7 +60,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
if div['class'] == 'hcf-header':
try:
key = string.capwords(feed_title(div.em.a))
key = string.capwords(feed_title(div.em))
articles[key] = []
ans.append(key)
except:
@@ -70,6 +71,12 @@ class TagesspiegelRSS(BasicNewsRecipe):
if not a:
continue
url = 'http://www.tagesspiegel.de' + a['href']
# check for duplicates
if url in links:
continue
links.add(url)
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
pubdate = strftime('%a, %d %b')


@@ -34,4 +34,12 @@ class tanuki(BasicNewsRecipe):
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
for a in soup('a'):
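# Relative links: pick the base domain from the page title, since the
# anime/manga/czytelnia sections live on different subdomains.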
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
if 'tanuki-anime' in soup.title.string.lower():
a['href']='http://anime.tanuki.pl' + a['href']
elif 'tanuki-manga' in soup.title.string.lower():
a['href']='http://manga.tanuki.pl' + a['href']
elif 'tanuki-czytelnia' in soup.title.string.lower():
a['href']='http://czytelnia.tanuki.pl' + a['href']
return soup

recipes/telam.recipe Normal file

@@ -0,0 +1,62 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.telam.com.ar
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Telam(BasicNewsRecipe):
title = 'Telam'
__author__ = 'Darko Miletic'
description = 'AGENCIA DE NOTICIAS DE LA REPUBLICA ARGENTINA'
publisher = 'Telam S.E.'
category = 'news, politics, Argentina'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'es_AR'
remove_empty_feeds = True
publication_type = 'newsportal'
masthead_url = 'http://www.telam.com.ar/front/imagenes/encabezado/logotelam.jpg'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif }
img{margin-bottom: 0.4em; display:block}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [dict(name=['meta','link'])]
remove_tags_before = dict(attrs={'class':'nota_fecha'})
remove_tags_after = dict(attrs={'class':'nota_completa'})
remove_attributes = ['lang']
feeds = [
(u'Ultimas noticias', u'http://www.telam.com.ar/xml/rss/' )
,(u'Politica' , u'http://www.telam.com.ar/xml/rss/1')
,(u'Economia' , u'http://www.telam.com.ar/xml/rss/2')
,(u'Sociedad' , u'http://www.telam.com.ar/xml/rss/3')
,(u'Policiales' , u'http://www.telam.com.ar/xml/rss/4')
,(u'Internacionales' , u'http://www.telam.com.ar/xml/rss/6')
,(u'Espectaculos' , u'http://www.telam.com.ar/xml/rss/7')
,(u'Cultura' , u'http://www.telam.com.ar/xml/rss/8')
,(u'Deportes' , u'http://www.telam.com.ar/xml/rss/9')
,(u'Telam Investiga' , u'http://www.telam.com.ar/xml/rss/5')
]
def print_version(self, url):
artid = url.rpartition('/')[2]
return 'http://www.telam.com.ar/?codProg=imprimir-nota&id=' + artid
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup


@@ -1,24 +1,23 @@
import re
import re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'The Sun UK'
cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
description = 'A Recipe for The Sun tabloid UK - uses feed43'
description = 'A Recipe for The Sun tabloid UK'
__author__ = 'Dave Asbury'
# last updated 20/2/12
# last updated 7/4/12
language = 'en_GB'
oldest_article = 1
max_articles_per_feed = 15
remove_empty_feeds = True
no_stylesheets = True
#auto_cleanup = True
#articles_are_obfuscated = True
masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'
encoding = 'cp1251'
encoding = 'UTF-8'
encoding = 'cp1252'
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
@@ -30,13 +29,14 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
preprocess_regexps = [
(re.compile(r'<div class="foot-copyright".*?</div>', re.IGNORECASE | re.DOTALL), lambda match: '')]
keep_only_tags = [
dict(name='h1'),dict(name='h2',attrs={'class' : 'medium centered'}),
dict(name='div',attrs={'class' : 'text-center'}),
dict(name='div',attrs={'id' : 'bodyText'})
# dict(name='p')
]
remove_tags=[
#dict(name='head'),
dict(attrs={'class' : ['mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}),
@@ -46,12 +46,46 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
]
feeds = [
(u'News','http://feed43.com/2517447382644748.xml'),
(u'Sport', u'http://feed43.com/4283846255668687.xml'),
(u'Bizarre', u'http://feed43.com/0233840304242011.xml'),
(u'Film',u'http://feed43.com/1307545221226200.xml'),
(u'Music',u'http://feed43.com/1701513435064132.xml'),
(u'Sun Woman',u'http://feed43.com/0022626854226453.xml'),
]
def get_cover_url(self):
soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
# look for the block containing the sun button and url
cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
#cov = soup.find(attrs={'id' : 'large'})
cov2 = str(cov)
cov2='http://www.politicshome.com'+cov2[9:-133]
#cov2 now contains url of the page containing pic
soup = self.index_to_soup(cov2)
cov = soup.find(attrs={'id' : 'large'})
cov2 = str(cov)
cov2=cov2[27:-18]
#cov2 now is pic url, now go back to original function
br = mechanize.Browser()
br.set_handle_redirect(False)
try:
br.open_novisit(cov2)
cover_url = cov2
except:
cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
#cover_url = cov2
#cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
return cover_url


@@ -11,6 +11,8 @@ class TPM_uk(BasicNewsRecipe):
__author__ = 'Darko Miletic'
description = 'Title says it all'
publisher = "The Philosophers' Magazine"
recipe_disabled = ('This recipe has been disabled as the website has'
' started providing articles only in PDF form')
category = 'philosophy, news'
oldest_article = 25
max_articles_per_feed = 200


@@ -1,71 +1,12 @@
#!/usr/bin/python
from calibre.web.feeds.news import BasicNewsRecipe
class Trouw(BasicNewsRecipe):
class BasicUserRecipe1333905513(BasicNewsRecipe):
title = u'Trouw'
__author__ = u'JvdW'
__author__ = 'asalet_r'
language = 'nl'
description = u'Trouw de Verdieping'
oldest_article = 7
oldest_article = 1
max_articles_per_feed = 25
language = u'nl'
simultaneous_downloads = 1
delay = 1
# timefmt = ' [%A, %d %B, %Y]'
timefmt = ''
no_stylesheets = True
cover_url = 'http://www.trouw.nl/template/ver2-0/images/trouw_logo.gif'
auto_cleanup = True
# keep_only_tags = [ dict(name='div', attrs={'id':'content'}) ]
remove_tags = [
dict(name='div', attrs={'id' :'leaderboard' })
,dict(name='div', attrs={'class':'banner' })
,dict(name='div', attrs={'id' :'header' })
,dict(name='div', attrs={'class':'options' })
,dict(name='div', attrs={'id' :'menu_main' })
,dict(name='div', attrs={'id' :'menu_sub' })
,dict(name='div', attrs={'id' :'column_right' })
,dict(name='div', attrs={'class':'meta_information'})
,dict(name='div', attrs={'id' :'comments_form' })
,dict(name='div', attrs={'id' :'mailfriend' })
,dict(name='div', attrs={'id' :'footer' })
,dict(name='img', attrs={'id' :'dot_clear' })
]
keep_only_tags = [dict(id=['columns'])]
feeds = [
(u'Algemen', u'http://www.trouw.nl/?service=rss'),
(u'Nederland', u'http://www.trouw.nl/nieuws/nederland/?service=rss'),
(u'Europa', u'http://www.trouw.nl/nieuws/europa/?service=rss'),
(u'Wereld', u'http://www.trouw.nl/nieuws/wereld/?service=rss'),
(u'Economie', u'http://www.trouw.nl/nieuws/economie/?service=rss'),
(u'Wetenschap', u'http://www.trouw.nl/nieuws/Wetenschap/?service=rss'),
(u'Groen', u'http://www.trouw.nl/groen/?service=rss'),
(u'Religie en Filosofie', u'http://www.trouw.nl/religie-filosofie/?service=rss'),
(u'Politiek', u'http://www.trouw.nl/nieuws/politiek/?service=rss'),
(u'Zorg', u'http://www.trouw.nl/nieuws/zorg/?service=rss'),
(u'Onderwijs', u'http://www.trouw.nl/onderwijs/nieuws/?service=rss'),
(u'Sport', u'http://www.trouw.nl/nieuws/sport/?service=rss'),
(u'Achtergrond', u'http://www.trouw.nl/achtergrond/?service=rss'),
(u'De Verdieping', u'http://www.trouw.nl/achtergrond/deverdieping/?service=rss'),
(u'Naschrift', u'http://www.trouw.nl/achtergrond/Naschrift/?service=rss'),
(u'Opinie', u'http://www.trouw.nl/opinie/?service=rss'),
(u'Podium', u'http://www.trouw.nl/opinie/podium/?service=rss'),
(u'Commentaar', u'http://www.trouw.nl/opinie/commentaar/?service=rss'),
(u'Cultuur', u'http://www.trouw.nl/cultuur/?service=rss'),
(u'Boeken', u'http://www.trouw.nl/cultuur/boeken/?service=rss'),
(u'Film', u'http://www.trouw.nl/cultuur/film/?service=rss'),
(u'Beeldende kunst', u'http://www.trouw.nl/cultuur/beeldendekunst/?service=rss'),
(u'Theater', u'http://www.trouw.nl/cultuur/theater/?service=rss'),
(u'Muziek', u'http://www.trouw.nl/cultuur/muziek/?service=rss'),
(u'Kinderen', u'http://www.trouw.nl/cultuur/kinderen/?service=rss'),
(u'Ontspanning', u'http://www.trouw.nl/ontspanning/?service=rss'),
(u'De Gids', u'http://www.trouw.nl/ontspanning/degids/?service=rss'),
(u'Moderne manieren', u'http://www.trouw.nl/ontspanning/modernemanieren/?service=rss'),
(u'Reizen', u'http://www.trouw.nl/ontspanning/reizen/?service=rss'),
(u'Koken', u'http://www.trouw.nl/ontspanning/koken/?service=rss')
]
def print_version(self, url):
return url + '?all=true'
feeds = [(u'Nederland', u'http://www.trouw.nl/nieuws/nederland/rss.xml'), (u'Buitenland', u'http://www.trouw.nl/nieuws/buitenland/rss.xml'), (u'Politiek', u'http://www.trouw.nl/nieuws/politiek/rss.xml'), (u'Economie', u'http://www.trouw.nl/nieuws/economie/rss.xml'), (u'Sport', u'http://www.trouw.nl/nieuws/sport/rss.xml'), (u'Cultuur', u'http://www.trouw.nl/nieuws/cultuur/rss.xml'), (u'Gezondheid', u'http://www.trouw.nl/nieuws/gezondheid/rss.xml'), (u'Onderwijs', u'http://www.trouw.nl/nieuws/onderwijs/rss.xml'), (u'Opinie', u'http://www.trouw.nl/opinie/rss.xml'), (u'Groen', u'http://www.trouw.nl/groen/rss.xml'), (u'Religie-Filosofie', u'http://www.trouw.nl/religie-filosofie/rss.xml'), (u'Schrijf', u'http://www.trouw.nl/schrijf/rss.xml'), (u'Moderne Manieren', u'http://www.trouw.nl/moderne-manieren/rss.xml')]


@@ -2,65 +2,50 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
''' Changelog
2012-04-27 DrMerry:
Added cover picture
removed some extra tags
'''
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Tweakers(BasicNewsRecipe):
title = u'Tweakers.net - with Reactions'
__author__ = 'Roedi06'
title = u'Tweakers.net'
__author__ = 'Kovid Goyal'
language = 'nl'
oldest_article = 7
max_articles_per_feed = 100
cover_url = 'http://img51.imageshack.us/img51/7470/tweakersnetebook.gif'
oldest_article = 4
max_articles_per_feed = 40
cover_url = 'http://tweakers.net/ext/launch/g/logo.gif'
keep_only_tags = [dict(name='div', attrs={'class':'columnwrapper news'}),
{'id':'reacties'},
]
keep_only_tags = [dict(name='div', attrs={'class':'columnwrapper news'})]
remove_tags = [dict(name='div', attrs={'id' : ['utracker']}),
{'id' : ['channelNav']},
{'id' : ['contentArea']},
{'class' : ['breadCrumb']},
{'class' : ['nextPrevious ellipsis']},
{'class' : ['advertorial']},
{'class' : ['sidebar']},
{'class' : ['filterBox']},
{'id' : ['toggleButtonTxt']},
{'id' : ['socialButtons']},
{'class' : ['button']},
{'class' : ['textadTop']},
{'class' : ['commentLink']},
{'title' : ['Reageer op deze reactie']},
{'class' : ['pageIndex']},
{'class' : ['reactieHeader collapsed']},
remove_tags = [dict(name='div', attrs={'class':'reacties'}),
{'id' : ['utracker','socialButtons','b_ac']},
{'class' : ['sidebar','advertorial']},
{'class' : re.compile('nextPrevious')},
]
no_stylesheets=True
filter_regexps = [r'ads\.doubleclick\.net',r'ad\.doubleclick\.net']
preprocess_regexps = [
(re.compile(r'<hr*?>', re.IGNORECASE | re.DOTALL), lambda match : ''),
(re.compile(r'<p>', re.IGNORECASE | re.DOTALL), lambda match : ''),
(re.compile(r'</p>', re.IGNORECASE | re.DOTALL), lambda match : ''),
(re.compile(r'<a.*?>'), lambda h1: '<b><u>'),
(re.compile(r'</a>'), lambda h2: '</u></b>'),
(re.compile(r'<span class="new">', re.IGNORECASE | re.DOTALL), lambda match : ''),
(re.compile(r'</span>', re.IGNORECASE | re.DOTALL), lambda match : ''),
(re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_0'), lambda match : ' - moderated 0<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_0'),
(re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_1'), lambda match : ' - moderated +1<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_1'),
(re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_2'), lambda match : ' - moderated +2<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_2'),
(re.compile(r'<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_3'), lambda match : ' - moderated +3<div class="moderation"><img src="http://tweakimg.net/g/if/comments/score_3'),
(re.compile(r'<div class="moderation">.*?</div>'), lambda h1: ''),
]
feeds = [(u'Tweakers.net', u'http://tweakers.net/feeds/nieuws.xml')]
extra_css = '.reactieHeader { color: #333333; font-size: 6px; border-bottom:solid 2px #333333; border-top:solid 1px #333333; } \
.reactieContent { font-family:"Times New Roman",Georgia,Serif; color: #000000; font-size: 8px; } \
.quote { font-family:"Times New Roman",Georgia,Serif; padding-left:2px; border-left:solid 3px #666666; color: #666666; }'
feeds = [(u'Tweakers.net', u'http://feeds.feedburner.com/tweakers/nieuws')]
def print_version(self, url):
return url + '?max=200'
def preprocess_html(self, soup):
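# Turn lightbox anchors (rel="imageview") into plain <img> tags so the
# full-size image is embedded in the article body.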
for a in soup.findAll('a', href=True, rel=True):
if a['rel'].startswith('imageview'):
a['src'] = a['href']
del a['href']
a.name = 'img'
for x in a.findAll(True):
x.extract()
return soup
def postprocess_html(self, soup, first):
for base in soup.findAll('base'):
base.extract()
return soup

recipes/vignette.recipe Normal file

@@ -0,0 +1,19 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1334935485(BasicNewsRecipe):
title = u'Vignette'
oldest_article = 15
max_articles_per_feed = 100
auto_cleanup = False
keep_only_tags = [
dict(name='div', attrs={'class':['HomeFirstNewsfoto', 'photo']}),
dict(name='img', attrs={'class':'altan-big'})
]
masthead_url = 'http://vauro.globalist.it/vauroglobalistit/Img/vauro-logo-beta.gif'
feeds = [(u'Altan', u'http://feed43.com/3556647724071522.xml'), (u'Ellekappa', u'http://ellekappa.tumblr.com/rss'), (u'Vauro', u'http://feeds.feedburner.com/vauro')]
description = 'Ellekappa, Altan, Vauro - Italian best satirical cartoons'
language = 'it'
__author__ = 'faber1971'
__version__ = 'v1.0'
__date__ = '24, April 2012'


@@ -8,6 +8,7 @@ class webhosting_pl(BasicNewsRecipe):
cover_url='http://webhosting.pl/images/logo.png'
masthead_url='http://webhosting.pl/images/logo.png'
oldest_article = 7
index='http://webhosting.pl'
max_articles_per_feed = 100
no_stylesheets = True
remove_empty_feeds = True
@@ -36,4 +37,10 @@ class webhosting_pl(BasicNewsRecipe):
(u'Marketing', u'http://webhosting.pl/feed/rss/n/11535')]
def print_version(self, url):
return url.replace('webhosting.pl', 'webhosting.pl/print')
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
return soup


@@ -1,5 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class AdvancedUserRecipe1312886443(BasicNewsRecipe):
title = u'WNP'
@@ -8,10 +8,11 @@ class AdvancedUserRecipe1312886443(BasicNewsRecipe):
description = u'Wirtualny Nowy Przemysł'
category = 'economy'
language = 'pl'
preprocess_regexps = [(re.compile(ur'Czytaj też:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Czytaj więcej:.*?</a>', re.DOTALL), lambda match: '')]
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
keep_only_tags = dict(name='div', attrs={'id':'contentText'})
remove_tags=[dict(attrs={'class':'printF'})]
feeds = [(u'Wiadomości gospodarcze', u'http://www.wnp.pl/rss/serwis_rss.xml'),
(u'Serwis Energetyka - Gaz', u'http://www.wnp.pl/rss/serwis_rss_1.xml'),
(u'Serwis Nafta - Chemia', u'http://www.wnp.pl/rss/serwis_rss_2.xml'),
@@ -19,3 +20,7 @@ class AdvancedUserRecipe1312886443(BasicNewsRecipe):
(u'Serwis Górnictwo', u'http://www.wnp.pl/rss/serwis_rss_4.xml'),
(u'Serwis Logistyka', u'http://www.wnp.pl/rss/serwis_rss_5.xml'),
(u'Serwis IT', u'http://www.wnp.pl/rss/serwis_rss_6.xml')]
def print_version(self, url):
return 'http://wnp.pl/drukuj/' +url[url.find(',')+1:]


@@ -21,7 +21,7 @@ class XkcdCom(BasicNewsRecipe):
use_embedded_content = False
oldest_article = 60
keep_only_tags = [dict(id='middleContent')]
keep_only_tags = [dict(id='middleContainer')]
remove_tags = [dict(name='ul'), dict(name='h3'), dict(name='br')]
no_stylesheets = True
# turn image bubblehelp into a paragraph


@@ -0,0 +1,18 @@
__version__ = 'v1.0'
__date__ = '7, April 2012'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1333705905(BasicNewsRecipe):
title = u'Zerocalcare'
__author__ = 'faber1971'
description = 'Free Italian Comics'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = False
keep_only_tags = [
dict(name='div', attrs={'class':'main entry-content group'})
]
masthead_url = 'http://zerocalcare.it/wp-content/uploads/2011/11/zerocalcare-banner.jpg'
feeds = [(u'Zerocalcare', u'http://feeds.feedburner.com/Zerocalcareit')]


@@ -377,7 +377,7 @@
<xsl:apply-templates/><br/>
</xsl:template>
<!-- image -->
<xsl:template match="fb:image">
<xsl:template match="fb:body/fb:image|fb:section/fb:image">
<div align="center">
<xsl:element name="img">
<xsl:attribute name="border">1</xsl:attribute>
@@ -395,4 +395,20 @@
</xsl:element>
</div>
</xsl:template>
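<!-- Inline images: an xlink:href starting with '#' references an embedded
     binary id, so the leading '#' is stripped to form the img src. -->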
<xsl:template match="fb:image">
<xsl:element name="img">
<xsl:choose>
<xsl:when test="starts-with(@xlink:href,'#')">
<xsl:attribute name="src"><xsl:value-of select="substring-after(@xlink:href,'#')"/></xsl:attribute>
</xsl:when>
<xsl:otherwise>
<xsl:attribute name="src"><xsl:value-of select="@xlink:href"/></xsl:attribute>
</xsl:otherwise>
</xsl:choose>
<xsl:if test="@title">
<xsl:attribute name="title"><xsl:value-of select="@title"/></xsl:attribute>
</xsl:if>
</xsl:element>
</xsl:template>
</xsl:stylesheet>


@@ -26,7 +26,7 @@ def login_to_google(username, password):
br.form['Email'] = username
br.form['Passwd'] = password
raw = br.submit().read()
if re.search(br'<title>.*?Account Settings</title>', raw) is None:
if re.search(br'(?i)<title>.*?Account Settings</title>', raw) is None:
x = re.search(br'(?is)<title>.*?</title>', raw)
if x is not None:
print ('Title of post login page: %s'%x.group())

Some files were not shown because too many files have changed in this diff.