Merge from trunk

This commit is contained in:
Charles Haley 2012-02-07 14:33:32 +01:00
commit 4c38f87b88
270 changed files with 93501 additions and 81080 deletions

View File

@ -5,7 +5,7 @@
# Also, each release can have new and improved recipes.
# - version: ?.?.?
# date: 2011-??-??
# date: 2012-??-??
#
# new features:
# - title:
@ -19,8 +19,201 @@
# new recipes:
# - title:
- version: 0.8.38
date: 2012-02-03
new features:
- title: "Implement the ability to automatically add books to calibre from a specified folder."
type: major
description: "calibre can now watch a folder on your computer and instantly add any files you put there to the calibre library as new books. You can tell calibre which folder to watch via Preferences->Adding Books->Automatic Adding."
tickets: [920249]
- title: "Conversion: When automatically inserting page breaks, do not put a page break before a <h1> or <h2> tag if it is immediately preceded by another <h1> or <h2> tag."
- title: "Driver for EZReader T730 and Point-of-View PlayTab Pro"
tickets: [923283, 922969]
bug fixes:
- title: "Fix device entry not visible in menubar even when it has been added via Preferences->Toolbars."
tickets: [923175]
- title: "Fix metadata plugboards not applied when auto sending news by email"
- title: "Fix regression in 0.8.34 that broke recipes that used skip_ad_pages() but not get_browser(). "
tickets: [923724]
- title: "Restore device support on FreeBSD, by using HAL"
tickets: [924503]
- title: "Get books: Show no more than 10 results from the Gandalf store"
- title: "Content server: Fix metadata not being updated when sending for some MOBI files."
tickets: [923130]
- title: "Heuristic processing: Fix the italicize common patterns algorithm breaking on some HTML markup."
tickets: [922317]
- title: "When trying to find an ebook inside a zip file, do not fail if the zip file itself contains other zip files."
tickets: [925670]
- title: "EPUB Input: Handle EPUBs with duplicate entries in the manifest."
tickets: [925831]
- title: "MOBI Input: Handle files that have extra </html> tags sprinkled throughout their markup."
tickets: [925833]
improved recipes:
- Metro Nieuws NL
- FHM UK
new recipes:
- title: Strange Horizons
author: Jim DeVona
- title: Telegraph India and Live Mint
author: Krittika Goyal
- title: High Country News
author: Armin Geller
- title: Countryfile
author: Dave Asbury
- title: Liberation (subscription version)
author: Remi Vanicat
- title: Various Italian news sources
author: faber1971
- version: 0.8.37
date: 2012-01-27
new features:
- title: "Allow calibre to be run simultaneously in two different user accounts on windows."
tickets: [919856]
- title: "Driver for Motorola Photon and Point of View PlayTab"
tickets: [920582, 919080]
- title: "Add a checkbox to preferences->plugins to show only user installed plugins"
- title: "Add a restart calibre button to the warning dialog that pops up after changing some preference that requires a restart"
bug fixes:
- title: "Fix regression in 0.8.36 that caused the remove format from book function to only delete the entry from the database and not delete the actual file from the disk"
tickets: [921721]
- title: "Fix regression in 0.8.36 that caused the calibredb command to not properly refresh the format information in the GUI"
tickets: [919494]
- title: "E-book viewer: Preserve the current position more accurately when changing font size/other preferences."
tickets: [912406]
- title: "Conversion pipeline: Fix items in the <guide> that refer to files with URL unsafe filenames being ignored."
tickets: [920804]
- title: "Fix calibre not running on linux systems that set LANG to an empty string"
- title: "On first run of calibre, ensure the columns are sized appropriately"
- title: "MOBI Output: Do not collapse whitespace when setting the comments metadata in newly created MOBI files"
- title: "HTML Input: Fix handling of files with ä characters in their filenames."
tickets: [919931]
- title: "Fix the sort on startup tweak ignoring more than three levels"
tickets: [919584]
- title: "Edit metadata dialog: Fix a bug that broke adding of a file to the book that calibre did not previously know about in the books directory while simultaneously changing the author or title of the book."
tickets: [922003]
improved recipes:
- People's Daily
- Plus Info
- grantland.com
- Elet es irodalom
- Sueddeutsche.de
new recipes:
- title: Mumbai Mirror
author: Krittika Goyal
- title: Real Clear
author: TMcN
- title: Gazeta Wyborcza
author: ravcio
- title: The Daily News Egypt and al masry al youm
author: Omm Mishmishah
- title: Klip.me
author: Ken Sun
- version: 0.8.36
date: 2012-01-20
new features:
- title: "Decrease startup time for large libraries with at least one composite custom column by reading format info on demand"
- title: "When automatically deleting news older than x days, from the calibre library, only delete the book if it both has the tag News and the author calibre. This prevents accidental deletion of books tagged with News by the user."
- title: "Driver for Infibeam Pi 2"
- title: "Add a Tag Editor for tags like custom columns to the edit metadata dialog"
bug fixes:
- title: "E-book viewer: Fix regression in 0.8.35 that caused viewer to raise an error on books that did not define a language"
- title: "Content server: Fix grouping for categories based on custom columns."
tickets: [919011]
- title: "Edit metadata dialog: When setting the series from a format or via metadata download, ensure that the series index is not automatically changed, when closing the dialog."
tickets: [918751]
- title: "When reading metadata from Topaz (azw1) files, handle non ascii metadata correctly."
tickets: [917419]
- title: "CHM Input: Do not choke on CHM files with non ascii internal filenames on windows."
tickets: [917696]
- title: "Fix reading metadata from CHM files with non-ascii titles"
- title: "Fix HTML 5 parser choking on comments"
- title: "If calibre is started from a directory that does not exist, automatically use the home directory as the working directory, instead of crashing"
- title: "Fix iriver story HD Wi-Fi device and external SD card swapped"
tickets: [916364]
- title: "Content server: Fix ugly URLs for specific format download in the book details and permalink panels"
- title: "When adding FB2 files do not set the date field from the metadata in the file"
improved recipes:
- OReilly Premium
- Variety
- Blic
- New Journal of Physics
- Der Tagesspiegel
new recipes:
- title: Tweakers.net
author: Roedi06
- title: Village Voice
author: Barty
- title: Edge.org Conversations
author: levien
- title: Novi list - printed edition
author: Darko Miletic
- version: 0.8.35
date: 2011-01-13
date: 2012-01-13
new features:
- title: "Metadata plugboards: Allow creation of plugboards for email delivery."

View File

@ -0,0 +1,50 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
'''
www.almasryalyoum.com/en
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class TheDailyNewsEG(BasicNewsRecipe):
title = u'al-masry al-youm'
__author__ = 'Omm Mishmishah'
description = 'Independent News from Egypt'
masthead_url = 'http://www.almasryalyoum.com/sites/default/files/img/english_logo.png'
cover_url = 'http://www.almasryalyoum.com/sites/default/files/img/english_logo.png'
auto_cleanup = True
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = False
#delay = 1
use_embedded_content = False
encoding = 'utf8'
publisher = 'Independent News Egypt'
category = 'News, Egypt, World'
language = 'en_EG'
publication_type = 'newsportal'
# preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
#Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google)
preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': False
}
keep_only_tags = [dict(attrs={'class':['article section']})]
remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
'inline-content story left', 'inline-content map left contracted', 'published',
'story-map', 'statepromo', 'topics', ]})]
remove_attributes = ['width','height']
feeds = [(u'English News', u'http://www.almasryalyoum.com/en/rss_feed_term/113/rss.xml'),
(u'News Features', u'http://www.almasryalyoum.com/en/rss_feed_term/115/rss.xml'),
(u'Culture', u'http://www.almasryalyoum.com/en/rss_feed_term/133/rss.xml'),
(u'Cinema', u'http://www.almasryalyoum.com/en/rss_feed_term/134/rss.xml')
]

View File

@ -0,0 +1,16 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1327747616(BasicNewsRecipe):
title = u'Beppe Grillo'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Beppe Grillo', u'http://feeds.feedburner.com/beppegrillo/atom')]
description = 'Blog of the famous comedian and politician Beppe Grillo - v1.00 (28, January 2012)'
__author__ = 'faber1971'
language = 'it'

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@ -6,45 +7,81 @@ __license__ = 'GPL v3'
www.canada.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Calgary Herald
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
title = u'Calgary Herald'
url_prefix = 'http://www.calgaryherald.com'
description = u'News from Calgary, AB'
fp_tag = 'CAN_CH'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
__author__ = 'Nick Redding'
encoding = 'latin1'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
@ -72,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id'])
return soup
def get_cover_url(self):
from datetime import timedelta, datetime, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","—",fixed)
fixed = re.sub("&#x2019;","",fixed)
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&'
massaged = re.sub("&","&", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):
if first:
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
def preprocess_html(self, soup):
return self.strip_anchors(soup)
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
@ -98,9 +209,7 @@ class CanWestPaper(BasicNewsRecipe):
atag = h1tag.find('a',href=True)
if not atag:
continue
url = atag['href']
if not url.startswith('http:'):
url = self.url_prefix+'/news/todays-paper/'+atag['href']
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)

View File

@ -77,8 +77,18 @@ class ChicagoTribune(BasicNewsRecipe):
def get_article_url(self, article):
print article.get('feedburner_origlink', article.get('guid', article.get('link')))
return article.get('feedburner_origlink', article.get('guid', article.get('link')))
url = article.get('feedburner_origlink', article.get('guid', article.get('link')))
if url.endswith('?track=rss'):
url = url.partition('?')[0]
return url
def skip_ad_pages(self, soup):
text = soup.find(text='click here to continue to article')
if text:
a = text.parent
url = a.get('href')
if url:
return self.index_to_soup(url, raw=True)
def postprocess_html(self, soup, first_fetch):
# Remove the navigation bar. It was kept until now to be able to follow

View File

@ -0,0 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'Countryfile.com'
cover_url = 'http://www.buysubscriptions.com/static_content/the-immediate/en/images/covers/CFIL_maxi.jpg'
__author__ = 'Dave Asbury'
description = 'The official website of Countryfile Magazine'
# last updated 29/1/12
language = 'en_GB'
oldest_article = 30
max_articles_per_feed = 25
remove_empty_feeds = True
no_stylesheets = True
auto_cleanup = True
#articles_are_obfuscated = True
remove_tags = [
# dict(attrs={'class' : ['player']}),
]
feeds = [
(u'Homepage', u'http://www.countryfile.com/rss/home'),
(u'Country News', u'http://www.countryfile.com/rss/news'),
(u'Countryside', u'http://www.countryfile.com/rss/countryside'),
]

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@ -6,45 +7,77 @@ __license__ = 'GPL v3'
www.canada.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Edmonton Journal
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
title = u'Edmonton Journal'
url_prefix = 'http://www.edmontonjournal.com'
description = u'News from Edmonton, AB'
fp_tag = 'CAN_EJ'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
@ -76,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id'])
return soup
def get_cover_url(self):
from datetime import timedelta, datetime, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","—",fixed)
fixed = re.sub("&#x2019;","",fixed)
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&'
massaged = re.sub("&","&", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):
if first:
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
def preprocess_html(self, soup):
return self.strip_anchors(soup)
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

View File

@ -1,16 +1,16 @@
################################################################################
#Description: http://es.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2010.12.01. - V1.0
#Description: http://es.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2012.01.20. - V1.2
################################################################################
from calibre.web.feeds.recipes import BasicNewsRecipe
class elet_es_irodalom(BasicNewsRecipe):
title = u'Elet es Irodalom'
title = u'\u00c9let \u00e9s Irodalom'
__author__ = 'Bigpapa'
oldest_article = 7
max_articles_per_feed = 20 # Az adott e-bookban tarolt cikkek feedenkenti maximalis szamat adja meg.
max_articles_per_feed = 30 # Az adott e-bookban tarolt cikkek feedenkenti maximalis szamat adja meg.
no_stylesheets = True
#delay = 1
use_embedded_content = False
@ -19,30 +19,40 @@ class elet_es_irodalom(BasicNewsRecipe):
language = 'hu'
publication_type = 'newsportal'
extra_css = '.doc_title { font: bold 30px } .doc_author {font: bold 14px} '
needs_subscription = 'optional'
masthead_url = 'http://www.es.hu/images/logo.jpg'
timefmt = ' [%Y %b %d, %a]'
#Nem ide a kódba kell beleírni a hozzáférés adatait, hanem azt akkor adod meg, ha le akarod tölteni!
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.es.hu/')
br.select_form(name='userfrmlogin')
br['cusername'] = self.username
br['cpassword'] = self.password
br.submit()
return br
keep_only_tags = [
dict(name='div', attrs={'class':['doc_author', 'doc_title', 'doc']})
]
]
remove_tags = [
dict(name='a', attrs={'target':['_TOP']}),
dict(name='div', attrs={'style':['float: right; margin-left: 5px; margin-bottom: 5px;', 'float: right; margin-left: 5px; margin-bottom: 5px;']}),
dict(name='a', attrs={'target':['_TOP']}),
dict(name='div', attrs={'style':['float: right; margin-left: 5px; margin-bottom: 5px;', 'float: right; margin-left: 5px; margin-bottom: 5px;']}),
]
]
feeds = [
(u'Publicisztika', 'http://www.feed43.com/4684235031168504.xml'),
(u'Interj\xfa', 'http://www.feed43.com/4032465460040618.xml'),
(u'Visszhang', 'http://www.feed43.com/3727375706873086.xml'),
(u'P\xe1ratlan oldal', 'http://www.feed43.com/2525784782475057.xml'),
(u'Feuilleton', 'http://www.feed43.com/7216025082703073.xml'),
(u'Pr\xf3za', 'http://www.feed43.com/8760248802326384.xml'),
(u'Vers', 'http://www.feed43.com/1737324675134275.xml'),
(u'K\xf6nyvkritika', 'http://www.feed43.com/1281156550717082.xml'),
(u'M\u0171b\xedr\xe1lat', 'http://www.feed43.com/1851854623681044.xml')
]
(u'Publicisztika', 'http://www.feed43.com/4684235031168504.xml'),
(u'Interj\xfa', 'http://www.feed43.com/4032465460040618.xml'),
(u'Visszhang', 'http://www.feed43.com/3727375706873086.xml'),
(u'P\xe1ratlan oldal', 'http://www.feed43.com/2525784782475057.xml'),
(u'Feuilleton', 'http://www.feed43.com/7216025082703073.xml'),
(u'Pr\xf3za', 'http://www.feed43.com/8760248802326384.xml'),
(u'Vers', 'http://www.feed43.com/1737324675134275.xml'),
(u'K\xf6nyvkritika', 'http://www.feed43.com/1281156550717082.xml'),
(u'M\u0171b\xedr\xe1lat', 'http://www.feed43.com/1851854623681044.xml')
]

View File

@ -6,7 +6,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
__author__ = 'Dave Asbury'
# last updated 27/12/11
# last updated 27/1/12
language = 'en_GB'
oldest_article = 28
max_articles_per_feed = 12
@ -22,9 +22,13 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
]
#remove_tags = [
#dict(attrs={'class' : ['player']}),
#]
feeds = [
(u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
(u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
(u'Gaming',u'http://feed43.com/0755006465351035.xml'),
]

View File

@ -3,10 +3,17 @@ import re
from calibre.ptempfile import PersistentTemporaryFile
class ForeignAffairsRecipe(BasicNewsRecipe):
''' there are three modifications:
1) fetch issue cover
2) toggle ignore premium articles
3) extract proper section names, ie. "Comments", "Essay"
by Chen Wei weichen302@gmx.com, 2012-02-05'''
__license__ = 'GPL v3'
__author__ = 'kwetal'
language = 'en'
version = 1
version = 1.01
title = u'Foreign Affairs (Subcription or (free) Registration)'
publisher = u'Council on Foreign Relations'
@ -17,6 +24,9 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
remove_javascript = True
INDEX = 'http://www.foreignaffairs.com'
FRONTPAGE = 'http://www.foreignaffairs.com/magazine'
INCLUDE_PREMIUM = False
remove_tags = []
remove_tags.append(dict(name = 'base'))
@ -37,6 +47,12 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
temp_files = []
articles_are_obfuscated = True
def get_cover_url(self):
soup = self.index_to_soup(self.FRONTPAGE)
div = soup.find('div', attrs={'class':'inthemag-issuebuy-cover'})
img_url = div.find('img')['src']
return self.INDEX + img_url
def get_obfuscated_article(self, url):
br = self.get_browser()
br.open(url)
@ -50,57 +66,47 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
return self.temp_files[-1].name
def parse_index(self):
soup = self.index_to_soup('http://www.foreignaffairs.com/magazine')
articles = []
answer = []
content = soup.find('div', attrs = {'class': 'center-wrapper'})
if content:
for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
tag = div.find('div', attrs = {'class': 'views-field-title'})
if tag:
a = tag.find('a')
if a:
title = self.tag_to_string(a)
url = self.INDEX + a['href']
author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
tag = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
# If they ever fix their markup, this will break :-(
summary = self.tag_to_string(tag.findNextSibling('p'))
description = author + '<br/>' + summary
articles.append({'title': title, 'date': None, 'url': url, 'description': description})
else:
continue
else:
continue
answer.append(('Magazine', articles))
ul = content.find('ul')
if ul:
soup = self.index_to_soup(self.FRONTPAGE)
sec_start = soup.findAll('div', attrs={'class':'panel-separator'})
for sec in sec_start:
content = sec.nextSibling
if content:
section = self.tag_to_string(content.find('h2'))
articles = []
for li in ul.findAll('li'):
tag = li.find('div', attrs = {'class': 'views-field-title'})
if tag:
a = tag.find('a')
if a:
title = self.tag_to_string(a)
url = self.INDEX + a['href']
description = ''
tag = li.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
if tag:
description = self.tag_to_string(tag)
articles.append({'title': title, 'date': None, 'url': url, 'description': description})
else:
continue
tags = []
for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
tags.append(div)
ul = content.find('ul')
for li in content.findAll('li'):
tags.append(li)
for div in tags:
title = url = description = author = None
if self.INCLUDE_PREMIUM:
found_premium = False
else:
continue
answer.append(('Letters to the Editor', articles))
found_premium = div.findAll('span', attrs={'class':
'premium-icon'})
if not found_premium:
tag = div.find('div', attrs={'class': 'views-field-title'})
if tag:
a = tag.find('a')
if a:
title = self.tag_to_string(a)
url = self.INDEX + a['href']
author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
tag_summary = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
description = self.tag_to_string(tag_summary)
articles.append({'title':title, 'date':None, 'url':url,
'description':description, 'author':author})
if articles:
answer.append((section, articles))
return answer
def preprocess_html(self, soup):

View File

@ -2,105 +2,75 @@ import re
from calibre.web.feeds.news import BasicNewsRecipe
class GrantLand(BasicNewsRecipe):
title = u"Grantland"
description = 'Writings on Sports & Pop Culture'
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = False
# auto_cleanup is too aggressive sometimes and we end up with blank articles
auto_cleanup = False
timefmt = ' [%a, %d %b %Y]'
oldest_article = 365
title = u"Grantland"
description = 'Writings on Sports & Pop Culture'
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = True
# auto_cleanup is too aggressive sometimes and we end up with blank articles
auto_cleanup = False
timefmt = ' [%a, %d %b %Y]'
oldest_article = 90
cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg'
masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg'
cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg'
masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg'
INDEX = 'http://www.grantland.com'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, url suffix, max number of articles to load)
('Today in Grantland','',20),
('In Case You Missed It','incaseyoumissedit',35),
]
INDEX = 'http://www.grantland.com'
CATEGORIES = [
# comment out second line if you don't want older articles
# (user friendly name, url suffix, max number of articles to load)
('Today in Grantland','',20),
('In Case You Missed It','incaseyoumissedit',35),
]
remove_tags = [
{'name':['head','style','script']},
{'id':['header']},
{'class':re.compile(r'\bside|\bad\b|floatright|tags')}
]
remove_tags_before = {'class':'wrapper'}
remove_tags_after = [{'id':'content'}]
remove_tags = [
{'name':['style','aside','nav','footer','script']},
{'name':'h1','text':'Grantland'},
{'id':['header','col-right']},
{'class':['connect_widget']},
{'name':'section','class':re.compile(r'\b(ad|module)\b')},
]
preprocess_regexps = [
# <header> tags with an img inside are just blog banners, don't need them
# note: there are other useful <header> tags so we don't want to just strip all of them
(re.compile(r'<header class.+?<img .+?>.+?</header>', re.DOTALL|re.IGNORECASE),lambda m: ''),
# delete everything between the *last* <hr class="small" /> and </article>
(re.compile(r'<hr class="small"(?:(?!<hr class="small").)+</article>', re.DOTALL|re.IGNORECASE),lambda m: '<hr class="small" /></article>'),
]
extra_css = """cite, time { font-size: 0.8em !important; margin-right: 1em !important; }
img + cite { display:block; text-align:right}"""
preprocess_regexps = [
# remove blog banners
(re.compile(r'<a href="/blog/(?:(?!</a>).)+</a>', re.DOTALL|re.IGNORECASE), lambda m: ''),
]
def parse_index(self):
feeds = []
seen_urls = set([])
def parse_index(self):
feeds = []
seen_urls = set([])
for category in self.CATEGORIES:
for category in self.CATEGORIES:
(cat_name, tag, max_articles) = category
self.log('Reading category:', cat_name)
articles = []
(cat_name, tag, max_articles) = category
self.log('Reading category:', cat_name)
articles = []
page = "%s/%s" % (self.INDEX, tag)
soup = self.index_to_soup(page)
headers = soup.findAll('h2' if tag=='' else 'h3')
page = "%s/%s" % (self.INDEX, tag)
soup = self.index_to_soup(page)
for header in headers:
tag = header.find('a',href=True)
if tag is None:
continue
url = tag['href']
if url in seen_urls:
continue
title = self.tag_to_string(tag)
if 'Podcast:' in title or 'In Case You Missed It' in title:
continue
desc = dt = ''
# get at the div that contains description and other info
div = header.parent.find('div')
if div is not None:
desc = self.tag_to_string(div)
dt = div.find('time')
if dt is not None:
dt = self.tag_to_string( dt)
main = soup.find('div',id='col-main')
if main is None:
main = soup
# if div contains the same url that is in h2/h3
# that means this is a series split into multiple articles
if div.find('a',href=url):
self.log('\tFound series:', title)
# grab all articles in series
for tag in div.findAll('a',href=True):
url = tag['href']
if url in seen_urls:
continue
self.log('\t', url)
seen_urls.add(url)
articles.append({'title':title+' - '+self.tag_to_string( tag),
'url':url,'description':desc,'date':dt})
else:
self.log('\tFound article:', title)
self.log('\t', url)
seen_urls.add(url)
articles.append({'title':title,'url':url,'description':desc,'date':dt})
for tag in main.findAll('a', href=re.compile(r'(story|post)/_/id/\d+')):
url = tag['href']
if url in seen_urls:
continue
title = tag.string
# blank title probably means <a href=".."><img /></a>. skip
if not title:
continue
self.log('\tFound article:', title)
self.log('\t', url)
articles.append({'title':title,'url':url})
seen_urls.add(url)
if len(articles) >= max_articles:
break
if len(articles) >= max_articles:
break
if articles:
feeds.append((cat_name, articles))
if articles:
feeds.append((cat_name, articles))
return feeds
def print_version(self, url):
return url+'?view=print'
return feeds

View File

@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
'''
Fetch High Country News
'''
from calibre.web.feeds.news import BasicNewsRecipe
class HighCountryNews(BasicNewsRecipe):
    """Recipe that builds an issue of High Country News from its FeedBurner feeds.

    Articles are fetched through their ``/print_view`` variant and the cover
    image is looked up on the site's home page.
    """

    title = u'High Country News'
    description = u'News from the American West'
    __author__ = 'Armin Geller'  # 2012-01-31
    publisher = 'High Country News'
    timefmt = ' [%a, %d %b %Y]'
    language = 'en-Us'
    encoding = 'UTF-8'
    publication_type = 'newspaper'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    auto_cleanup = True
    remove_javascript = True
    use_embedded_content = False

    masthead_url = 'http://www.hcn.org/logo.jpg'  # 2012-01-31 AGe add
    cover_source = 'http://www.hcn.org'  # 2012-01-31 AGe add

    def get_cover_url(self):  # 2012-01-31 AGe add
        """Return the ``src`` of the cover image found on the home page.

        Returns None (no cover) instead of raising when the expected
        element, its nested ``div``/``img`` or the ``src`` attribute is
        missing, so a site layout change does not abort the whole download.
        """
        cover_source_soup = self.index_to_soup(self.cover_source)
        # The class value (including its leading space) is matched exactly as written.
        preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'})
        try:
            return preview_image_div.div.img['src']
        except (AttributeError, KeyError, TypeError):
            # AttributeError: container or nested div not found (None.div / None.img)
            # TypeError: no <img> inside the div (None['src'])
            # KeyError: <img> without a src attribute
            self.log('Cover image not found on', self.cover_source)
            return None

    feeds = [
        (u'Most recent', u'http://feeds.feedburner.com/hcn/most-recent'),
        (u'Current Issue', u'http://feeds.feedburner.com/hcn/current-issue'),
        (u'Writers on the Range', u'http://feeds.feedburner.com/hcn/wotr'),
        (u'High Country Views', u'http://feeds.feedburner.com/hcn/HighCountryViews'),
    ]

    def print_version(self, url):
        """Return the printer-friendly URL for an article URL."""
        return url + '/print_view'

View File

Before

Width:  |  Height:  |  Size: 712 B

After

Width:  |  Height:  |  Size: 712 B

110
recipes/ilmanifesto.recipe Normal file
View File

@ -0,0 +1,110 @@
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
MANIFESTO_BASEURL = 'http://www.ilmanifesto.it/'
class IlManifesto(BasicNewsRecipe):
    """Recipe for the latest HTML edition of il manifesto.

    The URL of the latest edition is looked up once on the "in edicola"
    page and stored on the instance; the cover URL and the section index
    are both derived from it.
    """

    title = 'Il Manifesto'
    __author__ = 'Giacomo Lacava'
    description = 'quotidiano comunista - ultima edizione html disponibile'
    publication_type = 'newspaper'
    publisher = 'il manifesto coop. editrice a r.l.'
    language = 'it'

    oldest_article = 2
    max_articles_per_feed = 100
    delay = 1
    no_stylesheets = True
    simultaneous_downloads = 5
    timeout = 30
    auto_cleanup = True
    remove_tags = [dict(name='div', attrs={'class':'column_1 float_left'})]
    remove_tags_before = dict(name='div',attrs={'class':'column_2 float_right'})
    remove_tags_after = dict(id='myPrintArea')

    # Filled in lazily by _set_manifesto_index().
    manifesto_index = None
    manifesto_datestr = None

    def _set_manifesto_index(self):
        """Resolve and remember the latest edition's index URL and date string.

        Does nothing when the index URL has already been resolved.
        """
        if self.manifesto_index is not None:
            return
        startUrl = MANIFESTO_BASEURL + 'area-abbonati/in-edicola/'
        startSoup = self.index_to_soup(startUrl)
        # The second 'accordion_inedicola' div holds the link to the latest edition.
        lastEdition = startSoup.findAll('div',id='accordion_inedicola')[1].find('a')['href']
        del startSoup
        self.manifesto_index = MANIFESTO_BASEURL + lastEdition
        # The date string is the last path component; a trailing slash
        # leaves an empty last component, so fall back to the one before it.
        urlsplit = lastEdition.split('/')
        self.manifesto_datestr = urlsplit[-1]
        if urlsplit[-1] == '':
            self.manifesto_datestr = urlsplit[-2]

    def get_cover_url(self):
        """Return the URL of the front-page image for the latest edition."""
        self._set_manifesto_index()
        url = MANIFESTO_BASEURL + 'fileadmin/archivi/in_edicola/%sprimapagina.gif' % self.manifesto_datestr
        return url

    def parse_index(self):
        """Return ``[(section name, [article dict, ...]), ...]`` for the latest edition.

        Section links without an ``<h2>`` title and section pages without
        the ``column1`` container are skipped instead of raising.
        """
        self._set_manifesto_index()
        soup = self.index_to_soup(self.manifesto_index)
        feedLinks = soup.find('div',id='accordion_inedicola').findAll('a')
        result = []
        for feed in feedLinks:
            heading = feed.find('h2')
            if heading is None:
                continue  # not a section link
            feedName = heading.string
            feedUrl = MANIFESTO_BASEURL + feed['href']
            feedSoup = self.index_to_soup(feedUrl)
            indexRoot = feedSoup.find('div',attrs={'class':'column1'})
            if indexRoot is None:
                continue  # section page without an article list
            articles = []
            for div in indexRoot.findAll('div',attrs={'class':'strumenti1_inedicola'}):
                artLink = div.find('a')
                if artLink is None:
                    continue  # empty div
                title = artLink.string
                url = MANIFESTO_BASEURL + artLink['href']

                # .string is None when the node has nested markup; keep
                # these fields as strings in that case.
                description = ''
                descNode = div.find('div',attrs={'class':'text_12'})
                if descNode is not None:
                    description = descNode.string or ''

                author = ''
                authNode = div.find('div',attrs={'class':'firma'})
                if authNode is not None:
                    author = authNode.string or ''

                articles.append({
                    'title':title,
                    'url':url,
                    'date': strftime('%d %B %Y'),
                    'description': description,
                    'content': '',
                    'author': author
                })
            result.append((feedName,articles))
        return result

    def extract_readable_article(self, html, url):
        """Rebuild a minimal HTML page from the article's title, subtitle,
        author, summary and body nodes.

        Nodes that are not present are rendered as empty strings rather
        than as the literal text ``None``.
        """
        bs = BeautifulSoup(html)
        col1 = bs.find('div',attrs={'class':'column1'})

        content = col1.find('div',attrs={'class':'bodytext'})
        if content is None:
            content = ''
        title = bs.find(id='titolo_articolo').string or ''
        author = col1.find('span',attrs={'class':'firma'})
        if author is None:
            author = ''
        subtitle = ''
        subNode = col1.findPrevious('div',attrs={'class':'occhiello_rosso'})
        if subNode is not None:
            subtitle = subNode
        summary = ''
        sommNode = bs.find('div',attrs={'class':'sommario'})
        if sommNode is not None:
            summary = sommNode

        template = "<html><head><title>%(title)s</title></head><body><h1>%(title)s</h1><h2>%(subtitle)s</h2><h3>%(author)s</h3><div style='font-size: x-large;'>%(summary)s</div><div>%(content)s</div></body></html>"
        del bs
        return template % dict(title=title,subtitle=subtitle,author=author,summary=summary,content=content)

72
recipes/klip_me.recipe Normal file
View File

@ -0,0 +1,72 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1299694372(BasicNewsRecipe):
    """Recipe that downloads the unread and starred items of a Klip.me account."""

    title = u'Klipme'
    __author__ = 'Ken Sun'
    publisher = 'Klip.me'
    category = 'info, custom, Klip.me'
    oldest_article = 365
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    remove_tags = [
        dict(name='div', attrs={'id':'text_controls_toggle'}),
        dict(name='script'),
        dict(name='div', attrs={'id':'text_controls'}),
        dict(name='div', attrs={'id':'editing_controls'}),
        dict(name='div', attrs={'class':'bar bottom'}),
    ]
    use_embedded_content = False
    needs_subscription = True
    INDEX = u'http://www.klip.me'
    LOGIN = INDEX + u'/fav/signin?callback=/fav'

    feeds = [
        (u'Klip.me unread', u'http://www.klip.me/fav'),
        # Label corrected to match the ?s=starred listing it points at.
        (u'Klip.me starred', u'http://www.klip.me/fav?s=starred')
    ]

    def get_browser(self):
        """Return a browser, signed in through the LOGIN form when a username is set."""
        br = BasicNewsRecipe.get_browser()
        if self.username is not None:
            br.open(self.LOGIN)
            br.select_form(nr=0)
            br['Email'] = self.username
            if self.password is not None:
                br['Passwd'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Return ``[(feed title, [{'url': ...}, ...]), ...]`` scraped from each listing page."""
        totalfeeds = []
        lfeeds = self.get_feeds()
        for feedobj in lfeeds:
            feedtitle, feedurl = feedobj
            self.report_progress(0, 'Fetching feed'+' %s...'%(feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            # Each saved item is a <table class="item"> or <table class="item new">.
            for item in soup.findAll('table',attrs={'class':['item','item new']}):
                atag = item.a
                if atag and atag.has_key('href'):
                    url = atag['href']
                    articles.append({
                        'url' :url
                    })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds

    def print_version(self, url):
        """Turn the site-relative link collected by parse_index into an absolute URL."""
        return 'http://www.klip.me' + url

    def populate_article_metadata(self, article, soup, first):
        """Take the article title from the downloaded page's <title> element."""
        article.title = soup.find('title').contents[0].strip()

    def postprocess_html(self, soup, first_fetch):
        """Prepend the page title as an <h1> to every element with id="story"."""
        for link_tag in soup.findAll(attrs={"id" : "story"}):
            link_tag.insert(0,'<h1>'+soup.find('title').contents[0].strip()+'</h1>')
        return soup

15
recipes/la_voce.recipe Normal file
View File

@ -0,0 +1,15 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1324114228(BasicNewsRecipe):
    """Feed-only recipe for La Voce (lavoce.info)."""

    # Identification
    __author__ = 'faber1971'
    title = u'La Voce'
    description = 'Italian website on Economy - v1.01 (17, December 2011)'
    language = 'it'
    masthead_url = 'http://www.lavoce.info/binary/la_voce/testata/lavoce.1184661635.gif'

    # Download limits and clean-up
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    # Sources
    feeds = [
        (u'La Voce', u'http://www.lavoce.info/feed_rss.php?id_feed=1'),
    ]

View File

@ -0,0 +1,103 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2012, Rémi Vanicat <vanicat at debian.org>'
'''
liberation.fr
'''
# The cleaning is from the Liberation recipe, by Darko Miletic
from calibre.web.feeds.news import BasicNewsRecipe
class Liberation(BasicNewsRecipe):
    """Recipe for the subscriber edition of Liberation (liberation.fr/abonnes/).

    Logs in with the configured credentials and builds the section list by
    walking the direct children of the index page's ``block-content`` div.
    """

    title = u'Libération: Édition abonnés'
    __author__ = 'Rémi Vanicat'
    description = u'Actualités'
    category = 'Actualités, France, Monde'
    language = 'fr'
    needs_subscription = True

    use_embedded_content = False
    no_stylesheets = True
    remove_empty_feeds = True

    # The stray comma after "h2.rubrique" was removed: an empty selector in
    # the list makes the whole rule invalid.
    extra_css = '''
        h1, h2, h3 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
        p.subtitle {font-size:xx-small; font-family:Arial,Helvetica,sans-serif;}
        h4, h5, h2.rubrique {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
        .ref, .date, .author, .legende {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
        .mna-body, entry-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
    '''

    keep_only_tags = [
        dict(name='div', attrs={'class':'article'})
        ,dict(name='div', attrs={'class':'text-article m-bot-s1'})
        ,dict(name='div', attrs={'class':'entry'})
        ,dict(name='div', attrs={'class':'col_contenu'})
    ]

    remove_tags_after = [
        dict(name='div',attrs={'class':['object-content text text-item', 'object-content', 'entry-content', 'col01', 'bloc_article_01']})
        ,dict(name='p',attrs={'class':['chapo']})
        ,dict(id='_twitter_facebook')
    ]

    remove_tags = [
        dict(name='iframe')
        ,dict(name='a', attrs={'class':'lnk-comments'})
        ,dict(name='div', attrs={'class':'toolbox'})
        ,dict(name='ul', attrs={'class':'share-box'})
        ,dict(name='ul', attrs={'class':'tool-box'})
        ,dict(name='ul', attrs={'class':'rub'})
        ,dict(name='p',attrs={'class':['chapo']})
        ,dict(name='p',attrs={'class':['tag']})
        ,dict(name='div',attrs={'class':['blokLies']})
        ,dict(name='div',attrs={'class':['alire']})
        ,dict(id='_twitter_facebook')
    ]

    index = 'http://www.liberation.fr/abonnes/'

    def get_browser(self):
        """Return a browser, logged in when both username and password are set."""
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.liberation.fr/jogger/login/')
            br.select_form(nr=0)
            br['email'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Return ``[(section title, [article dict, ...]), ...]`` from the index page.

        A child whose class is 'headrest headrest-basic-rounded' starts a
        new section; every other child is read as an article of the
        section most recently started.
        """
        soup=self.index_to_soup(self.index)

        content = soup.find('div', { 'class':'block-content' })

        articles = []
        cat_articles = []

        for tag in content.findAll(recursive=False):
            if tag['class'] == 'headrest headrest-basic-rounded':
                # The same list object is stored in the result and then
                # filled by the article entries that follow.
                cat_articles = []
                articles.append((tag.find('h5').contents[0],cat_articles))
            else:
                title = tag.find('h3').contents[0]
                url = tag.find('a')['href']
                description = tag.find('p',{ 'class':'subtitle' }).contents[0]
                article = {
                    'title': title,
                    'url': url,
                    # Key was misspelled 'descripion', so the subtitle was
                    # never picked up as the article description.
                    'description': description,
                    'content': ''
                }
                cat_articles.append(article)
        return articles
# Local Variables:
# mode: python
# End:

View File

@ -1,41 +1,26 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.livemint.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class LiveMint(BasicNewsRecipe):
title = u'Livemint'
__author__ = 'Darko Miletic'
description = 'The Wall Street Journal'
publisher = 'The Wall Street Journal'
category = 'news, games, adventure, technology'
language = 'en'
title = u'Live Mint'
language = 'en_IN'
__author__ = 'Krittika Goyal'
#encoding = 'cp1252'
oldest_article = 1 #days
max_articles_per_feed = 25
use_embedded_content = True
oldest_article = 15
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
extra_css = ' #dvArtheadline{font-size: x-large} #dvArtAbstract{font-size: large} '
no_stylesheets = True
auto_cleanup = True
keep_only_tags = [dict(name='div', attrs={'class':'innercontent'})]
remove_tags = [dict(name=['object','link','embed','form','iframe'])]
feeds = [
('Latest News',
'http://www.livemint.com/StoryRss.aspx?LN=Latestnews'),
('Gallery',
'http://www.livemint.com/GalleryRssfeed.aspx'),
('Top Stories',
'http://www.livemint.com/StoryRss.aspx?ts=Topstories'),
('Banking',
'http://www.livemint.com/StoryRss.aspx?Id=104'),
]
feeds = [(u'Articles', u'http://www.livemint.com/SectionRssfeed.aspx?Mid=1')]
def print_version(self, url):
link = url
msoup = self.index_to_soup(link)
mlink = msoup.find(attrs={'id':'ctl00_bodyplaceholdercontent_cntlArtTool_printUrl'})
if mlink:
link = 'http://www.livemint.com/Articles/' + mlink['href'].rpartition('/Articles/')[2]
return link
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@ -0,0 +1,16 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1327062445(BasicNewsRecipe):
    """Feed-only recipe collecting several Italian marketing sites."""

    # Identification
    __author__ = 'faber1971'
    title = u'Marketing Magazine'
    description = 'Collection of Italian marketing websites - v1.00 (28, January 2012)'
    language = 'it'
    masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg'

    # Download limits and clean-up
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    remove_javascript = True

    # Sources, one (title, url) pair per line
    feeds = [
        (u'My Marketing', u'http://feed43.com/0537744466058428.xml'),
        (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'),
        (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'),
        (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'),
        (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'),
        (u'Brandforum news', u'http://www.brandforum.it/rss/news'),
        (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'),
        (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/'),
    ]

View File

@ -38,18 +38,23 @@ except:
removed keep_only tags
Version 1.8 26-11-2011
added remove tag: article-slideshow
Version 1.9 31-1-2012
removed some left debug settings
extended timeout from 2 to 10
changed oldest article from 10 to 1.2
changed max articles from 15 to 25
'''
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro Nieuws NL'
oldest_article = 10
max_articles_per_feed = 15
oldest_article = 1.2
max_articles_per_feed = 25
__author__ = u'DrMerry'
description = u'Metro Nederland'
language = u'nl'
simultaneous_downloads = 5
simultaneous_downloads = 3
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
timeout = 2
timeout = 10
center_navbar = True
timefmt = ' [%A, %d %b %Y]'
no_stylesheets = True

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@ -6,15 +7,77 @@ __license__ = 'GPL v3'
www.canada.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Montreal Gazette
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following four lines for the Montreal Gazette
title = u'Montreal Gazette'
url_prefix = 'http://www.montrealgazette.com'
description = u'News from Montreal, QC'
fp_tag = 'CAN_MG'
language = 'en_CA'
@ -46,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id'])
return soup
def get_cover_url(self):
    """Return the URL of the newest available front-page image, or None.

    The image URL is built from the day of the month and ``self.fp_tag``.
    Today's image is tried first, then each of the previous six days; the
    first URL that opens without error is returned. Returns None when
    ``fp_tag`` is empty or none of the seven URLs can be opened.
    """
    from datetime import date, timedelta

    if self.fp_tag == '':
        return None

    br = BasicNewsRecipe.get_browser()
    today = date.today()
    for daysback in range(7):
        day = (today - timedelta(days=daysback)).day
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(day)+'/lg/'+self.fp_tag+'.jpg'
        try:
            br.open(cover)
        except Exception:
            # Not published for that day (or unreachable): try the day before.
            # Catching Exception, not everything, lets Ctrl-C/SystemExit through.
            continue
        return cover

    self.log("\nCover unavailable")
    return None
def fixChars(self,string):
    """Return *string* with the \\x91-\\x97 punctuation codes and the
    ``&#x2019;`` entity replaced by the corresponding typographic characters.

    The single-quote, en-dash and entity replacements had empty
    replacement text, which deleted those characters instead of
    substituting them.
    """
    substitutions = (
        ("\x91", "‘"),        # lsquo
        ("\x92", "’"),        # rsquo
        ("\x93", "“"),        # ldquo
        ("\x94", "”"),        # rdquo
        ("\x96", "–"),        # ndash
        ("\x97", "—"),        # mdash
        ("&#x2019;", "’"),    # numeric entity for rsquo
    )
    fixed = string
    for pattern, replacement in substitutions:
        fixed = re.sub(pattern, replacement, fixed)
    return fixed
def massageNCXText(self, description):
    """Return *description* cleaned up for use in the TOC.

    HTML entities are converted by parsing the text with BeautifulStoneSoup,
    then the result is passed through ``self.fixChars``. A falsy
    *description* (None or empty) is returned unchanged.
    """
    # Kindle TOC descriptions won't render certain characters
    if not description:
        return description

    # Imported here because the file-level import shown for this module
    # only brings in BeautifulSoup and Tag, so the name would otherwise be
    # undefined when this method runs.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup

    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # NOTE(review): as written this substitution replaces '&' with itself
    # and has no effect; the intended replacement text is unclear, so it
    # is left untouched. Confirm what it was meant to be.
    massaged = re.sub("&","&", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
# Sets a TOC thumbnail for the article and fills in a blank summary from the
# page's og:description <meta> tag.
# NOTE(review): indentation is missing in this view, so the exact nesting of
# the statements below (in particular whether the summary handling sits inside
# `if first:`) cannot be confirmed here -- verify against the real file.
if first:
# Candidate thumbnail: the first <img> found under <body>.
picdiv = soup.find('body').find('img')
if picdiv is not None:
# Remove any literal 'links\link<digits>\' prefix from the image src before
# registering it as the thumbnail.
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
xtitle = article.text_summary.strip()
# Only a blank (whitespace-only) existing summary is replaced.
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
# Both summary fields receive the meta tag's content attribute.
article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
    """Replace every <a> that does not wrap an image with its own contents.

    Anchors containing an <img> are left alone. The soup is modified in
    place and also returned.
    """
    for element in soup.findAll(True):
        for anchor in element.findAll('a'):
            if anchor.img is not None:
                continue  # keep linked images
            inner = anchor.renderContents().decode('cp1252','replace')
            anchor.replaceWith(inner)
    return soup
def preprocess_html(self, soup):
    """Return *soup* after passing it through ``strip_anchors``."""
    cleaned = self.strip_anchors(soup)
    return cleaned
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

View File

@ -0,0 +1,59 @@
from calibre.web.feeds.news import BasicNewsRecipe
class MumbaiMirror(BasicNewsRecipe):
    """Feed-only recipe for the Mumbai Mirror RSS feeds."""

    # Identification
    __author__ = 'Krittika Goyal'
    title = u'Mumbai Mirror'
    description = 'People Daily Newspaper'
    language = 'en_IN'
    category = 'News, Mumbai, India'

    # Download limits
    oldest_article = 2
    max_articles_per_feed = 100

    # Clean-up and conversion
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = True
    conversion_options = {'linearize_tables':True}

    # Sources, one (title, url) pair per line
    feeds = [
        ('Cover Story', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=latest'),
        ('City Diary', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=citydiary'),
        ('Columnists', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=mmcolumnists'),
        ('Mumbai, The City', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=city'),
        ('Nation', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=nation'),
        ('Top Stories', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=topstories'),
        ('Business', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=business'),
        ('World', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=world'),
        (' Chai Time', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=chaitime'),
        ('Technology', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=technology'),
        ('Entertainment', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=entertainment'),
        ('Style', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=style'),
        ('Ask the Sexpert', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=askthesexpert'),
        ('Television', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=television'),
        ('Lifestyle', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=lifestyle'),
        ('Sports', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=sports'),
        ('Travel: Travelers Diary', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=travellersdiaries'),
        ('Travel: Domestic', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=traveldomestic'),
        ('Travel: International', 'http://www.mumbaimirror.com/rssfeeds.aspx?feed=travelinternational'),
    ]

View File

@ -1,58 +1,53 @@
#!/usr/bin/env python
##
## Title: Microwave Journal RSS recipe
## Title: Microwave Journal
## Contact: Kiavash (use Mobile Read)
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: Kiavash
##
## Written: Jan 2012
## Last Edited: Jan 2012
## Last Edited: Feb 2012
##
# Feb 2012: New Recipe compatible with the MWJournal 2.0 website
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'Kiavash'
__author__ = 'Kaivash'
'''
Microwave Journal Monthly Magazine
You need to sign up (free) and get username/password.
microwavejournal.com
'''
import re # Import the regular expressions module.
from calibre.ptempfile import TemporaryFile # we need this for saving to a temp file
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image
class MWJournal(BasicNewsRecipe):
# Title to use for the ebook.
title = u'Microwave Journal'
__author__ = 'Kiavash'
language = 'en'
#A brief description for the ebook.
description = u'Microwave Journal web site ebook created using rss feeds.'
# Set publisher and publication type.
publisher = 'Horizon House'
title = u'Microwave Journal'
description = u'Microwave Journal Monthly Magazine'
publisher = 'Horizon House'
publication_type = 'magazine'
INDEX = 'http://www.microwavejournal.com/publications/'
oldest_article = 31 # monthly published magazine. Some months are 31 days!
max_articles_per_feed = 100
remove_empty_feeds = True
auto_cleanup = True
# Disable stylesheets and javascript from site.
no_stylesheets = True
remove_javascript = True
asciiize = True # Converts all none ascii characters to their ascii equivalents
needs_subscription = True # oh yeah... we need to login btw.
# Timeout for fetching files from the server in seconds. The default of 120 seconds, seems somewhat excessive.
language = 'en'
timeout = 30
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
Convert_Grayscale = False # Convert images to gray scale or not
keep_only_tags = [dict(name='div', attrs={'class':'record'})]
no_stylesheets = True
remove_javascript = True
remove_tags = [
dict(name='font', attrs={'class':'footer'}), # remove fonts
]
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
.introduction, .first { font-weight: bold; } \
.cross-head { font-weight: bold; font-size: 125%; } \
@ -72,72 +67,75 @@ class MWJournal(BasicNewsRecipe):
h3 { font-size: 125%; font-weight: bold; } \
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
remove_tags = [
dict(name='div', attrs={'class':'boxadzonearea350'}), # Removes banner ads
dict(name='font', attrs={'class':'footer'}), # remove fonts if you do like your fonts more! Comment out to use website's fonts
dict(name='div', attrs={'class':'newsarticlead'})
]
# Remove various tag attributes to improve the look of the ebook pages.
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
# Remove the line breaks as well as href links. Books don't have links generally speaking
# Remove the line breaks, href links and float left/right and picture width/height.
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'<a.*?>'), lambda h1: ''),
(re.compile(r'</a>'), lambda h2: '')
(re.compile(r'</a>'), lambda h2: ''),
(re.compile(r'float:.*?'), lambda h3: ''),
(re.compile(r'width:.*?px'), lambda h4: ''),
(re.compile(r'height:.*?px'), lambda h5: '')
]
# Select the feeds that you are interested.
feeds = [
(u'Current Issue', u'http://www.mwjournal.com/rss/Rss.asp?type=99'),
(u'Industry News', u'http://www.mwjournal.com/rss/Rss.asp?type=1'),
(u'Resources', u'http://www.mwjournal.com/rss/Rss.asp?type=3'),
(u'Buyer\'s Guide', u'http://www.mwjournal.com/rss/Rss.asp?type=5'),
(u'Events', u'http://www.mwjournal.com/rss/Rss.asp?type=2'),
(u'All Updates', u'http://www.mwjournal.com/rss/Rss.asp?type=0'),
]
# No magazine is complete without cover. Let's get it then!
# The function is adapted from the Economist recipe
def get_cover_url(self):
cover_url = None
cover_page_location = 'http://www.mwjournal.com/Journal/' # Cover image is located on this page
soup = self.index_to_soup(cover_page_location)
cover_item = soup.find('img',attrs={'src':lambda x: x and '/IssueImg/3_MWJ_CurrIss_CoverImg' in x}) # There are three files named cover, we want the highest resolution which is the 3rd image. So we look for the pattern. Remember that the name of the cover image changes every month so we cannot search for the complete name. Instead we are searching for the pattern
if cover_item:
cover_url = 'http://www.mwjournal.com' + cover_item['src'].strip() # yeah! we found it. Let's fetch the image file and pass it as cover to calibre
return cover_url
def print_version(self, url):
if url.find('/Journal/article.asp?HH_ID=') >= 0:
return self.browser.open_novisit(url).geturl().replace('/Journal/article.asp?HH_ID=', '/Journal/Print.asp?Id=')
elif url.find('/News/article.asp?HH_ID=') >= 0:
return self.browser.open_novisit(url).geturl().replace('/News/article.asp?HH_ID=', '/Journal/Print.asp?Id=')
elif url.find('/Resources/TechLib.asp?HH_ID=') >= 0:
return self.browser.open_novisit(url).geturl().replace('/Resources/TechLib.asp?HH_ID=', '/Resources/PrintRessource.asp?Id=')
return url.replace('/articles/', '/articles/print/')
def get_browser(self):
    '''
    Return a browser, logged in to Microwave Journal when both a username
    and a password are configured.

    The Microwave Journal login page hands the submitted credentials to
    omeda.com, which answers with a second form that logs the browser in
    to mwjournal.com (hidden from the user). To get past this obstacle the
    response to the first login is saved to a temporary HTML file, which
    is then re-opened so the second form can be submitted (many thanks to
    Barty, who helped with the second-page login).

    Raises Exception when the final page does not contain 'Welcome '.
    '''
    browser = BasicNewsRecipe.get_browser()
    if self.username is None or self.password is None:
        return browser

    # First step: the main login form on omeda.com
    browser.open('http://www.omeda.com/cgi-win/mwjreg.cgi?m=login')
    browser.select_form('login')
    browser['EMAIL_ADDRESS'] = self.username
    browser['PASSWORD'] = self.password
    second_form_html = browser.submit().read()

    # Second step: replay the returned form from a temporary .htm file
    # (technique taken from the ESPN recipe by Kovid Goyal)
    with TemporaryFile(suffix='.htm') as tmp_path:
        with open(tmp_path, 'wb') as out:
            out.write(second_form_html)
        browser.open_local_file(tmp_path)
        browser.select_form(nr=0)
        landing_page = browser.submit().read()
        # The landing page greets the user by name only after a good login
        if 'Welcome ' not in landing_page:
            raise Exception('Failed to login, are you sure your username and password are correct?')
    return browser
def parse_index(self):
    '''
    Build the list of feeds for the current issue from the page at
    self.INDEX.

    Returns a list of (section title, articles) tuples; each article is a
    dict with 'title', 'url', 'description' and 'date' keys. Sections that
    end up without articles are left out.

    Side effects: sets self.timefmt from the issue heading and, when the
    issue box contains an image, self.cover_url.
    '''
    site = 'http://www.microwavejournal.com'
    index = self.index_to_soup(self.INDEX)

    issue_box = index.find('div', attrs={'class':'box1 article publications-show'})
    issue_name = self.tag_to_string(issue_box.find('h2'))
    self.log('Found Current Issue:', issue_name)
    self.timefmt = ' [%s]'%issue_name

    cover_img = issue_box.find('img', src=True)
    if cover_img is not None:
        self.cover_url = site + cover_img['src']
        self.log('Found Cover image:', self.cover_url)

    result = []
    already_listed = set()  # titles seen so far, used to drop duplicate articles
    listing = index.find('div', attrs={'class':'box2 publication'})
    for block in listing.findAll('div', attrs={'class':'records'}):
        heading = self.tag_to_string(block.find('h3'))
        self.log('Found section:', heading)
        entries = []
        for record in block.findAll('div', attrs={'class':'record'}):
            headline = self.tag_to_string(record.find('h2'))
            # Skip the puzzler page and anything already listed in an earlier section
            if 'The MWJ Puzzler' in headline or headline in already_listed:
                continue
            already_listed.add(headline)

            link = record.find('a', href=True)['href']
            if link.startswith('/'):
                link = site + link

            summary_tag = record.find('div', attrs={'class':'abstract'}).find('p')
            self.log('\tFound article:', headline, 'at', link)
            if summary_tag is None:
                summary = None
            else:
                summary = self.tag_to_string(summary_tag)
                self.log('\t\t', summary)

            entries.append({'title':headline, 'url':link, 'description':summary,
                            'date':self.timefmt})
        if entries:
            result.append((heading, entries))
    return result
def postprocess_html(self, soup, first):
    '''
    When self.Convert_Grayscale is set, re-save every <img> that has a src
    attribute as a grayscale image at the same path. The soup is returned
    in either case; `first` is not used.
    '''
    if not self.Convert_Grayscale:
        return soup

    def is_image_with_source(tag):
        return tag.name.lower()=='img' and tag.has_key('src')

    for picture in soup.findAll(is_image_with_source):
        path = picture['src']
        image = Image()
        image.open(path)
        # NOTE(review): compares the Image object itself with 0; kept as
        # in the original - confirm what failure this is meant to detect.
        if image < 0:
            raise RuntimeError('Out of memory')
        image.type = "GrayscaleType"
        image.save(path)
    return soup

View File

@ -1,16 +1,35 @@
__license__ = 'GPL v3'
__copyright__ = '2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
##
## Title: Microwave Journal RSS recipe
## Contact: AprilHare, Darko Miletic <darko.miletic at gmail.com>
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: 2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>
##
## Written: 2008
## Last Edited: Jan 2012
##
'''
01-19-2012: Added GrayScale Image conversion and Duplicant article removals
'''
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = '2008-2012, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
__version__ = 'v0.5.0'
__date__ = '2012-01-19'
__author__ = 'Darko Miletic'
'''
newscientist.com
'''
import re
import urllib
from calibre.utils.magick import Image
from calibre.web.feeds.news import BasicNewsRecipe
class NewScientist(BasicNewsRecipe):
title = 'New Scientist - Online News w. subscription'
__author__ = 'Darko Miletic'
description = 'Science news and science articles from New Scientist.'
language = 'en'
publisher = 'Reed Business Information Ltd.'
@ -39,10 +58,19 @@ class NewScientist(BasicNewsRecipe):
keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]
# Whether to omit duplicates of articles (typically arising when articles are indexed in
# more than one section). If True, only the first occurrence will be downloaded.
filterDuplicates = True
# Whether to convert images to grayscale for eInk readers.
Convert_Grayscale = False
url_list = [] # This list is used to check if an article had already been included.
def get_browser(self):
br = BasicNewsRecipe.get_browser()
br.open('http://www.newscientist.com/')
if self.username is not None and self.password is not None:
if self.username is not None and self.password is not None:
br.open('https://www.newscientist.com/user/login')
data = urllib.urlencode({ 'source':'form'
,'redirectURL':''
@ -80,6 +108,10 @@ class NewScientist(BasicNewsRecipe):
return article.get('guid', None)
def print_version(self, url):
    '''
    Return the full-text print URL for `url`.

    With self.filterDuplicates set, each URL is recorded in self.url_list
    the first time it is seen and None is returned for any repeat.
    '''
    dedupe = self.filterDuplicates
    if dedupe and url in self.url_list:
        return None
    if dedupe:
        self.url_list.append(url)
    return url + '?full=true&print=true'
def preprocess_html(self, soup):
@ -91,7 +123,7 @@ class NewScientist(BasicNewsRecipe):
item.name='p'
for item in soup.findAll(['xref','figref']):
tstr = item.string
item.replaceWith(tstr)
item.replaceWith(tstr)
for tg in soup.findAll('a'):
if tg.string == 'Home':
tg.parent.extract()
@ -101,3 +133,16 @@ class NewScientist(BasicNewsRecipe):
tg.replaceWith(tstr)
return soup
# Converts images to Gray Scale
def postprocess_html(self, soup, first):
    '''
    Return the soup, after re-saving each <img> that carries a src as a
    grayscale image (same path) when self.Convert_Grayscale is enabled.
    `first` is accepted but not used.
    '''
    if self.Convert_Grayscale:
        wanted = lambda tag: tag.name.lower()=='img' and tag.has_key('src')
        for img_tag in soup.findAll(wanted):
            location = img_tag['src']
            picture = Image()
            picture.open(location)
            # NOTE(review): the Image object itself is compared with 0
            # here, exactly as in the original - confirm the intent.
            if picture < 0:
                raise RuntimeError('Out of memory')
            picture.type = "GrayscaleType"
            picture.save(location)
    return soup

21
recipes/onda_rock.recipe Normal file
View File

@ -0,0 +1,21 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1328535130(BasicNewsRecipe):
    # Recipe for the Italian rock webzine Onda Rock, driven by one RSS feed.
    title = u'Onda Rock'
    __author__ = 'faber1971'
    description = 'Italian rock webzine'
    language = 'it'

    oldest_article = 7
    max_articles_per_feed = 100
    # Automatic clean-up is switched off; the explicit remove_tags list
    # below is used instead.
    auto_cleanup = False
    # <div> elements matched by id, by align="left" or by a centred inline
    # style (presumably page chrome around the article - verify on the site).
    remove_tags = [
        dict(name='div', attrs={'id':['boxHeader','boxlinks_med','footer','boxinterviste','box_special_med','boxdiscografia_head','path']}),
        dict(name='div', attrs={'align':'left'}),
        dict(name='div', attrs={'style':'text-align: center'}),
    ]
    no_stylesheets = True
    # (feed title, feed URL) pairs
    feeds = [(u'Onda Rock', u'http://www.ondarock.it/feed.php')]
    masthead_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/71135_45820579767_4993043_n.jpg'

View File

@ -14,6 +14,7 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
class OReillyPremium(BasicNewsRecipe):
title = u'OReilly Premium'
__author__ = 'TMcN'
language = 'en'
description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
auto_cleanup = True

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@ -6,20 +7,77 @@ __license__ = 'GPL v3'
www.canada.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Ottawa Citizen
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following four lines for the Ottawa Citizen
title = u'Ottawa Citizen'
url_prefix = 'http://www.ottawacitizen.com'
description = u'News from Ottawa, ON'
fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
@ -51,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id'])
return soup
def get_cover_url(self):
    '''
    Return the URL of the most recent front-page image for this paper on
    webmedia.newseum.org, or None.

    The image URL contains the day of the month. Today's image is tried
    first, then each of the previous six days; the first URL that opens
    without error is returned. Returns None when self.fp_tag is empty or
    when none of the seven candidates can be opened (a message is logged
    in the latter case).
    '''
    from datetime import timedelta, date
    if self.fp_tag=='':
        return None
    # Take the date once so every candidate is relative to the same day
    today = date.today()
    for daysback in range(7):
        day = (today - timedelta(days=daysback)).day
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(day)+'/lg/'+self.fp_tag+'.jpg'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Not published for that day (or unreachable): try the day before.
            # Exception rather than a bare except, so that Ctrl-C and
            # interpreter shutdown are not swallowed.
            continue
        return cover
    self.log("\nCover unavailable")
    return None
def fixChars(self,string):
    '''
    Return `string` with Windows-1252 punctuation control codes
    (\x91-\x94, \x96, \x97) and the literal entity text "&#x2019;"
    replaced by the corresponding typographic characters.
    '''
    # Replace lsquo (\x91)
    fixed = re.sub("\x91","‘",string)
    # Replace rsquo (\x92)
    fixed = re.sub("\x92","’",fixed)
    # Replace ldquo (\x93)
    fixed = re.sub("\x93","“",fixed)
    # Replace rdquo (\x94)
    fixed = re.sub("\x94","”",fixed)
    # Replace ndash (\x96)
    fixed = re.sub("\x96","–",fixed)
    # Replace mdash (\x97)
    fixed = re.sub("\x97","—",fixed)
    # Replace the unconverted rsquo entity
    fixed = re.sub("&#x2019;","’",fixed)
    return fixed
def massageNCXText(self, description):
    '''
    Prepare a description for the Kindle TOC, which won't render certain
    characters.

    Entities are converted by parsing the text with BeautifulStoneSoup,
    any remaining '&amp;' is turned into '&', and the result is passed
    through self.fixChars. An empty or None description is returned
    unchanged.
    '''
    if not description:
        return description
    # Imported locally: BeautifulStoneSoup is not among the names this
    # file's visible imports bring in from calibre.ebooks.BeautifulSoup.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # Replace '&amp;' with '&'
    massaged = re.sub("&amp;","&", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
    '''
    Fill in per-article metadata from the downloaded page.

    On the first page, the first <img> inside <body> (if any) is
    registered as the TOC thumbnail, with any 'links\\link<N>\\' prefix
    stripped from its src. If the article's text summary is blank, the
    page's og:description meta content (when present) becomes both
    summary and text_summary.
    '''
    if first:
        lead_image = soup.find('body').find('img')
        if lead_image is not None:
            thumbnail = re.sub(r'links\\link\d+\\','',lead_image['src'])
            self.add_toc_thumbnail(article, thumbnail)

    if article.text_summary.strip():
        # A summary already exists; keep it
        return
    og_description = soup.find('meta',attrs={'property':'og:description'})
    if og_description is not None:
        article.summary = article.text_summary = og_description['content']
def strip_anchors(self,soup):
    '''
    Replace every <a> that does not wrap an image with its rendered
    contents (decoded as cp1252), leaving image links alone. Returns the
    same soup object.
    '''
    for element in soup.findAll(True):
        for anchor in element.findAll('a'):
            if anchor.img is not None:
                # linked image: keep the anchor
                continue
            inner = anchor.renderContents().decode('cp1252','replace')
            anchor.replaceWith(inner)
    return soup
def preprocess_html(self, soup):
    # Hand the soup to strip_anchors and return whatever it returns.
    return self.strip_anchors(soup)
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

View File

@ -1,10 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
import os, time
class AdvancedUserRecipe1277129332(BasicNewsRecipe):
title = u'People Daily - China'
title = u'人民日报'
oldest_article = 2
max_articles_per_feed = 100
__author__ = 'rty'
__author__ = 'zzh'
pubisher = 'people.com.cn'
description = 'People Daily Newspaper'
@ -14,21 +15,65 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe):
use_embedded_content = False
no_stylesheets = True
encoding = 'GB2312'
language = 'zh'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.people.com.cn/img/2010wb/images/logo.gif'
feeds = [(u'\u56fd\u5185\u65b0\u95fb', u'http://www.people.com.cn/rss/politics.xml'),
(u'\u56fd\u9645\u65b0\u95fb', u'http://www.people.com.cn/rss/world.xml'),
(u'\u7ecf\u6d4e\u65b0\u95fb', u'http://www.people.com.cn/rss/finance.xml'),
(u'\u4f53\u80b2\u65b0\u95fb', u'http://www.people.com.cn/rss/sports.xml'),
(u'\u53f0\u6e7e\u65b0\u95fb', u'http://www.people.com.cn/rss/haixia.xml')]
feeds = [
(u'时政', u'http://www.people.com.cn/rss/politics.xml'),
(u'国际', u'http://www.people.com.cn/rss/world.xml'),
(u'经济', u'http://www.people.com.cn/rss/finance.xml'),
(u'体育', u'http://www.people.com.cn/rss/sports.xml'),
(u'教育', u'http://www.people.com.cn/rss/edu.xml'),
(u'文化', u'http://www.people.com.cn/rss/culture.xml'),
(u'社会', u'http://www.people.com.cn/rss/society.xml'),
(u'传媒', u'http://www.people.com.cn/rss/media.xml'),
(u'娱乐', u'http://www.people.com.cn/rss/ent.xml'),
# (u'汽车', u'http://www.people.com.cn/rss/auto.xml'),
(u'海峡两岸', u'http://www.people.com.cn/rss/haixia.xml'),
# (u'IT频道', u'http://www.people.com.cn/rss/it.xml'),
# (u'环保', u'http://www.people.com.cn/rss/env.xml'),
# (u'科技', u'http://www.people.com.cn/rss/scitech.xml'),
# (u'新农村', u'http://www.people.com.cn/rss/nc.xml'),
# (u'天气频道', u'http://www.people.com.cn/rss/weather.xml'),
(u'生活提示', u'http://www.people.com.cn/rss/life.xml'),
(u'卫生', u'http://www.people.com.cn/rss/medicine.xml'),
# (u'人口', u'http://www.people.com.cn/rss/npmpc.xml'),
# (u'读书', u'http://www.people.com.cn/rss/booker.xml'),
# (u'食品', u'http://www.people.com.cn/rss/shipin.xml'),
# (u'女性新闻', u'http://www.people.com.cn/rss/women.xml'),
# (u'游戏', u'http://www.people.com.cn/rss/game.xml'),
# (u'家电频道', u'http://www.people.com.cn/rss/homea.xml'),
# (u'房产', u'http://www.people.com.cn/rss/house.xml'),
# (u'健康', u'http://www.people.com.cn/rss/health.xml'),
# (u'科学发展观', u'http://www.people.com.cn/rss/kxfz.xml'),
# (u'知识产权', u'http://www.people.com.cn/rss/ip.xml'),
# (u'高层动态', u'http://www.people.com.cn/rss/64094.xml'),
# (u'党的各项工作', u'http://www.people.com.cn/rss/64107.xml'),
# (u'党建聚焦', u'http://www.people.com.cn/rss/64101.xml'),
# (u'机关党建', u'http://www.people.com.cn/rss/117094.xml'),
# (u'事业党建', u'http://www.people.com.cn/rss/117095.xml'),
# (u'国企党建', u'http://www.people.com.cn/rss/117096.xml'),
# (u'非公党建', u'http://www.people.com.cn/rss/117097.xml'),
# (u'社区党建', u'http://www.people.com.cn/rss/117098.xml'),
# (u'高校党建', u'http://www.people.com.cn/rss/117099.xml'),
# (u'农村党建', u'http://www.people.com.cn/rss/117100.xml'),
# (u'军队党建', u'http://www.people.com.cn/rss/117101.xml'),
# (u'时代先锋', u'http://www.people.com.cn/rss/78693.xml'),
# (u'网友声音', u'http://www.people.com.cn/rss/64103.xml'),
# (u'反腐倡廉', u'http://www.people.com.cn/rss/64371.xml'),
# (u'综合报道', u'http://www.people.com.cn/rss/64387.xml'),
# (u'中国人大新闻', u'http://www.people.com.cn/rss/14576.xml'),
# (u'中国政协新闻', u'http://www.people.com.cn/rss/34948.xml'),
]
keep_only_tags = [
dict(name='div', attrs={'class':'left_content'}),
dict(name='div', attrs={'class':'text_c'}),
]
remove_tags = [
dict(name='table', attrs={'class':'title'}),
dict(name='div', attrs={'class':'tools'}),
]
remove_tags_after = [
dict(name='table', attrs={'class':'bianji'}),
dict(name='div', attrs={'id':'p_content'}),
]
def append_page(self, soup, appendtag, position):
@ -36,7 +81,7 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe):
if pager:
nexturl = self.INDEX + pager.a['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'class':'left_content'})
texttag = soup2.find('div', attrs={'class':'text_c'})
#for it in texttag.findAll(style=True):
# del it['style']
newpos = len(texttag.contents)
@ -44,9 +89,15 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe):
texttag.extract()
appendtag.insert(position,texttag)
def skip_ad_pages(self, soup):
    '''
    Detect an interstitial advertisement page and fetch the real article.

    A page counts as an advertisement when its <title> text contains
    'advertisement' (case-insensitive). In that case the href of the first
    <a> on the page is opened with self.browser and its content, decoded
    as GB2312, is returned. Returns None for ordinary pages, and also for
    pages that lack a usable title or link instead of raising
    AttributeError on them.
    '''
    title_tag = soup.find('title')
    # .string is None when the title is missing or is not plain text
    title_text = title_tag.string if title_tag is not None else None
    if not title_text or 'advertisement' not in title_text.lower():
        return None
    link = soup.find('a')
    href = link.get('href') if link is not None else None
    if not href:
        # Nothing to follow; treat the page as a normal one
        return None
    return self.browser.open(href).read().decode('GB2312', 'ignore')
def preprocess_html(self, soup):
mtag = '<meta http-equiv="content-type" content="text/html;charset=GB2312" />\n<meta http-equiv="content-language" content="utf-8" />'
mtag = '<meta http-equiv="content-type" content="text/html;charset=GB2312" />\n<meta http-equiv="content-language" content="GB2312" />'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['form']
@ -55,3 +106,19 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe):
#if pager:
# pager.extract()
return soup
def get_cover_url(self):
    '''
    Return the URL of today's front-page image on paper.people.com.cn, or
    None when that URL cannot be opened (the failure is logged).

    "Today" is evaluated in China Standard Time. The date is computed as
    UTC+8 directly rather than by overwriting the TZ environment variable
    and calling time.tzset(): that call only exists on Unix and changing
    TZ alters the time zone of the whole process.
    '''
    # China Standard Time is UTC+8 with no daylight saving
    now = time.gmtime(time.time() + 8 * 3600)
    year = time.strftime('%Y', now)
    month = time.strftime('%m', now)
    day = time.strftime('%d', now)
    cover = 'http://paper.people.com.cn/rmrb/page/'+year+'-'+month+'/'+day+'/01/RMRB'+year+month+day+'B001_b.jpg'
    br = BasicNewsRecipe.get_browser()
    try:
        br.open(cover)
    except Exception:
        # Exception rather than a bare except, so Ctrl-C still propagates
        self.log("\nCover unavailable: " + cover)
        cover = None
    return cover

View File

@ -1,47 +1,50 @@
#!/usr/bin/env python
__author__ = 'Darko Spasovski'
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Spasovski <darko.spasovski at gmail.com>'
'''
www.plusinfo.mk
'''
from calibre.web.feeds.news import BasicNewsRecipe
class PlusInfo(BasicNewsRecipe):
INDEX = 'www.plusinfo.mk'
title = u'+info'
__author__ = 'Darko Spasovski'
description = 'Macedonian news portal'
publication_type = 'newsportal'
category = 'news, Macedonia'
language = 'mk'
masthead_url = 'http://www.plusinfo.mk/style/images/logo.jpg'
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
oldest_article = 1
max_articles_per_feed = 100
keep_only_tags = [dict(name='div', attrs={'class': 'vest'})]
remove_tags = [dict(name='div', attrs={'class':['komentari_holder', 'objava']})]
feeds = [(u'Македонија', u'http://www.plusinfo.mk/rss/makedonija'),
(u'Бизнис', u'http://www.plusinfo.mk/rss/biznis'),
(u'Скопје', u'http://www.plusinfo.mk/rss/skopje'),
(u'Култура', u'http://www.plusinfo.mk/rss/kultura'),
(u'Свет', u'http://www.plusinfo.mk/rss/svet'),
(u'Сцена', u'http://www.plusinfo.mk/rss/scena'),
(u'Здравје', u'http://www.plusinfo.mk/rss/zdravje'),
(u'Магазин', u'http://www.plusinfo.mk/rss/magazin'),
(u'Спорт', u'http://www.plusinfo.mk/rss/sport')]
# uncomment the following block if you want the print version (note: it lacks photos)
# def print_version(self,url):
# segments = url.split('/')
# printURL = '/'.join(segments[0:3]) + '/print/' + '/'.join(segments[5:])
# return printURL
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__author__ = 'Darko Spasovski'
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Spasovski <darko.spasovski at gmail.com>'
'''
www.plusinfo.mk
'''
from calibre.web.feeds.news import BasicNewsRecipe
class PlusInfo(BasicNewsRecipe):
    # Recipe for the Macedonian news portal +info (www.plusinfo.mk),
    # built from the site's per-section RSS feeds.

    INDEX = 'www.plusinfo.mk'
    title = u'+info'
    __author__ = 'Darko Spasovski'
    description = 'Macedonian news portal'
    publication_type = 'newsportal'
    category = 'news, Macedonia'
    language = 'mk'
    masthead_url = 'http://www.plusinfo.mk/style/images/logo.jpg'

    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    remove_empty_feeds = True
    oldest_article = 1
    max_articles_per_feed = 100

    # Only the 'vest1' div is kept; the entries below are then removed
    keep_only_tags = [dict(name='div', attrs={'class': 'vest1'})]
    remove_tags = [
        dict(name='div', attrs={'class':['komentari_holder', 'objava', 'koment']}),
        dict(name='ul', attrs={'class':['vest_meni']}),
        dict(name='a', attrs={'name': ['fb_share']}),
    ]

    # (section title, feed URL) pairs
    feeds = [
        (u'Македонија', u'http://www.plusinfo.mk/rss/makedonija'),
        (u'Бизнис', u'http://www.plusinfo.mk/rss/biznis'),
        (u'Скопје', u'http://www.plusinfo.mk/rss/skopje'),
        (u'Култура', u'http://www.plusinfo.mk/rss/kultura'),
        (u'Свет', u'http://www.plusinfo.mk/rss/svet'),
        (u'Сцена', u'http://www.plusinfo.mk/rss/scena'),
        (u'Здравје', u'http://www.plusinfo.mk/rss/zdravje'),
        (u'Магазин', u'http://www.plusinfo.mk/rss/magazin'),
        (u'Спорт', u'http://www.plusinfo.mk/rss/sport'),
    ]
# uncomment the following block if you want the print version (note: it lacks photos)
# def print_version(self,url):
# segments = url.split('/')
# printURL = '/'.join(segments[0:3]) + '/print/' + '/'.join(segments[5:])
# return printURL

View File

@ -1,30 +1,36 @@
"""
readitlaterlist.com
"""
__license__ = 'GPL v3'
__copyright__ = '''
2010, Darko Miletic <darko.miletic at gmail.com>
2011, Przemyslaw Kryger <pkryger at gmail.com>
'''
'''
readitlaterlist.com
2012, tBunnyMan <Wag That Tail At Me dot com>
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class Readitlater(BasicNewsRecipe):
title = 'Read It Later'
__author__ = 'Darko Miletic, Przemyslaw Kryger'
description = '''Personalized news feeds. Go to readitlaterlist.com to
setup up your news. Fill in your account
username, and optionally you can add password.'''
publisher = 'readitlater.com'
title = 'ReadItLater'
__author__ = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan'
description = '''Personalized news feeds. Go to readitlaterlist.com to setup \
up your news. This version displays pages of articles from \
oldest to newest, with max & minimum counts, and marks articles \
read after downloading.'''
publisher = 'readitlaterlist.com'
category = 'news, custom'
oldest_article = 7
max_articles_per_feed = 100
max_articles_per_feed = 50
minimum_articles = 1
no_stylesheets = True
use_embedded_content = False
needs_subscription = True
INDEX = u'http://readitlaterlist.com'
LOGIN = INDEX + u'/l'
readList = []
def get_browser(self):
br = BasicNewsRecipe.get_browser()
@ -33,41 +39,46 @@ class Readitlater(BasicNewsRecipe):
br.select_form(nr=0)
br['feed_id'] = self.username
if self.password is not None:
br['password'] = self.password
br['password'] = self.password
br.submit()
return br
def get_feeds(self):
self.report_progress(0, ('Fetching list of feeds...'))
self.report_progress(0, ('Fetching list of pages...'))
lfeeds = []
i = 1
feedurl = self.INDEX + u'/unread/1'
while True:
title = u'Unread articles, page ' + str(i)
lfeeds.append((title, feedurl))
self.report_progress(0, ('Got ') + str(i) + (' feeds'))
lfeeds.insert(0, (title, feedurl))
self.report_progress(0, ('Got ') + str(i) + (' pages'))
i += 1
soup = self.index_to_soup(feedurl)
ritem = soup.find('a',attrs={'id':'next', 'class':'active'})
ritem = soup.find('a', attrs={'id':'next', 'class':'active'})
if ritem is None:
break
feedurl = self.INDEX + ritem['href']
if self.test:
return lfeeds[:2]
return lfeeds
def parse_index(self):
totalfeeds = []
articlesToGrab = self.max_articles_per_feed
lfeeds = self.get_feeds()
for feedobj in lfeeds:
if articlesToGrab < 1:
break
feedtitle, feedurl = feedobj
self.report_progress(0, ('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
ritem = soup.find('ul',attrs={'id':'list'})
for item in ritem.findAll('li'):
ritem = soup.find('ul', attrs={'id':'list'})
for item in reversed(ritem.findAll('li')):
if articlesToGrab < 1:
break
else:
articlesToGrab -= 1
description = ''
atag = item.find('a',attrs={'class':'text'})
atag = item.find('a', attrs={'class':'text'})
if atag and atag.has_key('href'):
url = self.INDEX + atag['href']
title = self.tag_to_string(item.div)
@ -78,6 +89,20 @@ class Readitlater(BasicNewsRecipe):
,'url' :url
,'description':description
})
readLink = item.find('a', attrs={'class':'check'})['href']
self.readList.append(readLink)
totalfeeds.append((feedtitle, articles))
if len(self.readList) < self.minimum_articles:
raise Exception("Not enough articles in RIL! Change minimum_articles or add more.")
return totalfeeds
def mark_as_read(self, markList):
    '''
    Request self.INDEX + link for every link in markList, using a browser
    obtained from self.get_browser(). The responses are not inspected.
    '''
    browser = self.get_browser()
    for relative_link in markList:
        browser.open(self.INDEX + relative_link)
def cleanup(self):
    # Pass every link collected in self.readList to mark_as_read.
    self.mark_as_read(self.readList)

170
recipes/real_clear.recipe Normal file
View File

@ -0,0 +1,170 @@
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import time
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString
class RealClear(BasicNewsRecipe):
title = u'Real Clear'
__author__ = 'TMcN'
description = 'Real Clear Politics/Science/etc... aggregation of news\n'
cover_url = 'http://www.realclearpolitics.com/dev/mt-static/images/logo.gif'
custom_title = 'Real Clear - '+ time.strftime('%d %b %Y')
auto_cleanup = True
encoding = 'utf8'
language = 'en'
needs_subscription = False
no_stylesheets = True
oldest_article = 7
remove_javascript = True
remove_tags = [dict(name='img', attrs={})]
# Don't go down
recursions = 0
max_articles_per_feed = 400
debugMessages = False
# Numeric parameter is type, controls whether we look for
feedsets = [
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Science", "http://www.realclearscience.com/index.xml", 0],
["Tech", "http://www.realcleartechnology.com/index.xml", 0],
# The feedburner is essentially the same as the top feed, politics.
# ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
# ["Commentary", "http://feeds.feedburner.com/Realclearpolitics-Articles", 1],
["Markets Home", "http://www.realclearmarkets.com/index.xml", 0],
["Markets", "http://www.realclearmarkets.com/articles/index.xml", 0],
["World", "http://www.realclearworld.com/index.xml", 0],
["World Blog", "http://www.realclearworld.com/blog/index.xml", 2]
]
# Hints to extractPrintURL.
# First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
printhints = [
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
["politico.com", "Print", 'a', 'share-print'],
["nationalreview.com", ">Print<", 'a', ''],
["reason.com", "", 'a', 'printer']
# The following are not supported due to JavaScripting, and would require obfuscated_article to handle
# forbes,
# usatoday - just prints with all current crap anyhow
]
# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
def extractPrintURL(self, pageURL):
    '''
    Return the best-guess print URL for pageURL, or pageURL itself when
    nothing is found.

    Each entry of self.printhints is [URL snippet, link text, tag name,
    attrs]. For every hint whose snippet occurs in pageURL the page is
    searched for the described element; the href is then taken from the
    match itself, else from its parent, else from its grandparent. The
    page is downloaded at most once, and only if some hint applies.
    '''
    soup = None
    for snippet, linkText, tagName, tagAttrs in self.printhints:
        if pageURL.find(snippet) == -1:
            continue
        print("Trying "+snippet)
        # Only retrieve the soup if we have a match to check for the printed article with.
        if soup is None:
            soup = self.index_to_soup(pageURL)
            if soup is None:
                return pageURL
        if len(tagAttrs)>0 and len(linkText) == 0:
            if self.debugMessages == True :
                print("search1")
            printFind = soup.find(tagName, attrs=tagAttrs)
        elif len(tagAttrs)>0 :
            if self.debugMessages == True :
                print("search2")
            printFind = soup.find(tagName, attrs=tagAttrs, text=linkText)
        else :
            printFind = soup.find(tagName, text=linkText)
        if printFind is None:
            if self.debugMessages == True :
                print("Not Found")
            continue
        print(printFind)
        # A text search yields a NavigableString, which has no attributes;
        # only a tag match can carry the href itself. .get() is used
        # throughout because indexing a tag with a missing attribute
        # raises KeyError instead of returning None.
        if isinstance(printFind, NavigableString)==False:
            href = printFind.get('href')
            if href is not None:
                return href
        tag = printFind.parent
        print(tag)
        href = tag.get('href') if tag is not None else None
        if href is not None:
            return href
        if self.debugMessages == True :
            print("Not in parent, trying skip-up")
        grandparent = tag.parent if tag is not None else None
        href = grandparent.get('href') if grandparent is not None else None
        if href is not None:
            return href
        if self.debugMessages == True :
            print("Not in skip either, aborting")
    return pageURL
def get_browser(self):
    '''
    Return the stock BasicNewsRecipe browser, announcing the call on
    stdout first when debugMessages is on.
    '''
    verbose = self.debugMessages == True
    if verbose:
        print("In get_browser")
    return BasicNewsRecipe.get_browser()
def parseRSS(self, index) :
    '''
    Download feed number `index` of self.feedsets and return its articles
    as a list of dicts with 'title', 'url', 'date', 'description' and
    'content' keys.

    The article link is taken from the first non-empty element among
    originalLink, originallink, link and guid, and is then passed through
    self.extractPrintURL. Items without a usable link are reported on
    stdout and skipped.

    NOTE(review): the item's pubDate is only printed in debug mode; the
    stored 'date' is always the current time - confirm this is intended.
    '''
    feedName = self.feedsets[index][0]
    verbose = self.debugMessages == True
    if verbose:
        print("\n\nStarting "+feedName)
    collected = []
    feedSoup = self.index_to_soup(self.feedsets[index][1])
    for item in feedSoup.findAll("item"):
        title = item.find("title").contents[0]

        # Walk the candidate link elements until one has content
        linkEl = None
        for candidate in ("originalLink", "originallink", "link", "guid"):
            linkEl = item.find(candidate)
            if linkEl is not None and len(linkEl.contents) != 0:
                break
        if linkEl is None or title is None or len(linkEl.contents)==0 :
            print("Error in feed "+ feedName)
            print(item)
            continue
        print(title)
        print(linkEl)
        url = linkEl.contents[0].encode("utf-8")

        summaryEl = item.find("description")
        if summaryEl is not None and summaryEl.contents is not None and len(summaryEl.contents)>0:
            description = summaryEl.contents[0]
        else :
            description = "None"

        dateEl = item.find("pubDate")
        if dateEl is None :
            dateEl = item.find("pubdate")
        if dateEl is None :
            pubDate = time.strftime('%a, %d %b')
        else :
            pubDate = dateEl.contents[0]

        if verbose:
            print("Article");
            print(title)
            print(description)
            print(pubDate)
            print(url)
        url = self.extractPrintURL(url)
        print(url)

        stamp = time.strftime('%a, %d %b')
        collected.append(dict(title=title, url=url, date=stamp, description=description, content=''))
    return collected
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
# 'title' : article title,
# 'url' : URL of print version,
# 'date' : The publication date of the article as a string,
# 'description' : A summary of the article
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
# this is used instead of BasicNewsRecipe.parse_feeds().
def parse_index(self):
    '''
    Return a list of (feed title, articles) tuples, one per entry of
    self.feedsets for which parseRSS returns something other than None.
    The whole result is printed when debugMessages is on.
    '''
    ans = []
    for position, feedset in enumerate(self.feedsets):
        found = self.parseRSS(position)
        if found is None:
            continue
        ans.append((feedset[0], found))
    if self.debugMessages == True :
        print(ans)
    return ans

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@ -6,35 +7,77 @@ __license__ = 'GPL v3'
www.canada.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Regina Leader-Post
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post
title = u'Regina Leader-Post'
url_prefix = 'http://www.leaderpost.com'
description = u'News from Regina, SK'
fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
@ -66,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id'])
return soup
def get_cover_url(self):
    """Return the URL of the most recent reachable front-page image.

    The Newseum URL is keyed by day-of-month and ``self.fp_tag``. Today's
    image is tried first, then each of the previous six days. Returns
    ``None`` when ``fp_tag`` is empty or when none of the seven candidate
    URLs can be opened.
    """
    from datetime import date, timedelta

    if self.fp_tag == '':
        # No front-page tag configured for this paper: no cover.
        return None
    today = date.today()
    for daysback in range(7):
        day = (today - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Image for that day is not reachable; fall back one more day.
            # (Exception, not a bare except, so Ctrl-C still aborts.)
            continue
        return cover
    self.log("\nCover unavailable")
    return None
def fixChars(self, string):
    """Replace stray cp1252 punctuation bytes with real typographic characters.

    Maps \\x91/\\x92 to single curly quotes, \\x93/\\x94 to double curly
    quotes, \\x96 to an en dash and \\x97 to an em dash, and expands the
    literal entity ``&#x2019;`` to a right single quote. Returns the
    corrected string; text without these sequences is returned unchanged.
    """
    replacements = (
        ("\x91", "‘"),        # lsquo
        ("\x92", "’"),        # rsquo
        ("\x93", "“"),        # ldquo
        ("\x94", "”"),        # rdquo
        ("\x96", "–"),        # ndash
        ("\x97", "—"),        # mdash
        ("&#x2019;", "’"),    # rsquo written as a numeric entity
    )
    fixed = string
    for bad, good in replacements:
        fixed = fixed.replace(bad, good)
    return fixed
def massageNCXText(self, description):
    """Prepare a description for use in the Kindle table of contents.

    Kindle TOC descriptions won't render certain characters, so HTML
    entities are decoded, bare ampersands are re-escaped as the numeric
    entity ``&#38;`` and the result is passed through ``fixChars``.
    A falsy description (``None`` or empty) is returned unchanged.
    """
    if not description:
        return description
    # Imported here because the module-level import only brings in
    # BeautifulSoup and Tag; without this the call below is a NameError.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # Replace '&' with its numeric entity
    massaged = re.sub("&", "&#38;", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a fallback summary to *article*.

    For the first page of an article, the first <img> inside <body> (if
    any) is registered as the TOC thumbnail, with any ``links\\linkN\\``
    prefix removed from its ``src``. If the article's text summary is
    blank, the page's ``og:description`` meta content (when present) is
    used as both ``summary`` and ``text_summary``.
    """
    if first:
        first_img = soup.find('body').find('img')
        if first_img is not None:
            thumb_src = re.sub(r'links\\link\d+\\', '', first_img['src'])
            self.add_toc_thumbnail(article, thumb_src)
    if article.text_summary.strip():
        # A non-blank summary already exists; leave it alone.
        return
    og_desc = soup.find('meta', attrs={'property': 'og:description'})
    if og_desc is None:
        return
    article.summary = article.text_summary = og_desc['content']
def strip_anchors(self, soup):
    """Replace every image-less <a> in *soup* with its rendered contents.

    Anchors that wrap an <img> are kept. The rendered contents are decoded
    as cp1252 (undecodable bytes replaced) before being substituted for
    the anchor. The same soup object is modified in place and returned.
    """
    for element in soup.findAll(True):
        for anchor in element.findAll('a'):
            if anchor.img is not None:
                continue
            inner_text = anchor.renderContents().decode('cp1252', 'replace')
            anchor.replaceWith(inner_text)
    return soup
def preprocess_html(self, soup):
    """Return *soup* after passing it through ``strip_anchors``."""
    # NOTE(review): if the class defines another preprocess_html earlier,
    # this later definition replaces it - confirm that is intended.
    cleaned = self.strip_anchors(soup)
    return cleaned
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@ -6,30 +7,77 @@ __license__ = 'GPL v3'
www.canada.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Saskatoon Star-Phoenix
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following four lines for the Saskatoon Star-Phoenix
title = u'Saskatoon Star-Phoenix'
url_prefix = 'http://www.thestarphoenix.com'
description = u'News from Saskatoon, SK'
fp_tag = ''
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
@ -61,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id'])
return soup
def get_cover_url(self):
    """Return the URL of the most recent reachable front-page image.

    The Newseum URL is keyed by day-of-month and ``self.fp_tag``. Today's
    image is tried first, then each of the previous six days. Returns
    ``None`` when ``fp_tag`` is empty or when none of the seven candidate
    URLs can be opened.
    """
    from datetime import date, timedelta

    if self.fp_tag == '':
        # No front-page tag configured for this paper: no cover.
        return None
    today = date.today()
    for daysback in range(7):
        day = (today - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Image for that day is not reachable; fall back one more day.
            # (Exception, not a bare except, so Ctrl-C still aborts.)
            continue
        return cover
    self.log("\nCover unavailable")
    return None
def fixChars(self, string):
    """Replace stray cp1252 punctuation bytes with real typographic characters.

    Maps \\x91/\\x92 to single curly quotes, \\x93/\\x94 to double curly
    quotes, \\x96 to an en dash and \\x97 to an em dash, and expands the
    literal entity ``&#x2019;`` to a right single quote. Returns the
    corrected string; text without these sequences is returned unchanged.
    """
    replacements = (
        ("\x91", "‘"),        # lsquo
        ("\x92", "’"),        # rsquo
        ("\x93", "“"),        # ldquo
        ("\x94", "”"),        # rdquo
        ("\x96", "–"),        # ndash
        ("\x97", "—"),        # mdash
        ("&#x2019;", "’"),    # rsquo written as a numeric entity
    )
    fixed = string
    for bad, good in replacements:
        fixed = fixed.replace(bad, good)
    return fixed
def massageNCXText(self, description):
    """Prepare a description for use in the Kindle table of contents.

    Kindle TOC descriptions won't render certain characters, so HTML
    entities are decoded, bare ampersands are re-escaped as the numeric
    entity ``&#38;`` and the result is passed through ``fixChars``.
    A falsy description (``None`` or empty) is returned unchanged.
    """
    if not description:
        return description
    # Imported here because the module-level import only brings in
    # BeautifulSoup and Tag; without this the call below is a NameError.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # Replace '&' with its numeric entity
    massaged = re.sub("&", "&#38;", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a fallback summary to *article*.

    For the first page of an article, the first <img> inside <body> (if
    any) is registered as the TOC thumbnail, with any ``links\\linkN\\``
    prefix removed from its ``src``. If the article's text summary is
    blank, the page's ``og:description`` meta content (when present) is
    used as both ``summary`` and ``text_summary``.
    """
    if first:
        first_img = soup.find('body').find('img')
        if first_img is not None:
            thumb_src = re.sub(r'links\\link\d+\\', '', first_img['src'])
            self.add_toc_thumbnail(article, thumb_src)
    if article.text_summary.strip():
        # A non-blank summary already exists; leave it alone.
        return
    og_desc = soup.find('meta', attrs={'property': 'og:description'})
    if og_desc is None:
        return
    article.summary = article.text_summary = og_desc['content']
def strip_anchors(self, soup):
    """Replace every image-less <a> in *soup* with its rendered contents.

    Anchors that wrap an <img> are kept. The rendered contents are decoded
    as cp1252 (undecodable bytes replaced) before being substituted for
    the anchor. The same soup object is modified in place and returned.
    """
    for element in soup.findAll(True):
        for anchor in element.findAll('a'):
            if anchor.img is not None:
                continue
            inner_text = anchor.renderContents().decode('cp1252', 'replace')
            anchor.replaceWith(inner_text)
    return soup
def preprocess_html(self, soup):
    """Return *soup* after passing it through ``strip_anchors``."""
    # NOTE(review): if the class defines another preprocess_html earlier,
    # this later definition replaces it - confirm that is intended.
    cleaned = self.strip_anchors(soup)
    return cleaned
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

14
recipes/satira.recipe Normal file
View File

@ -0,0 +1,14 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1327351409(BasicNewsRecipe):
    """Recipe that aggregates the RSS feeds of several Italian satire blogs."""

    # Identification
    __author__ = 'faber1971'
    title = u'Satira'
    description = 'Collection of Italian satiric blogs - v1.00 (28, January 2012)'
    language = 'it'

    # Fetch limits and clean-up
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    # (feed title, feed URL) pairs
    feeds = [
        (u'spinoza', u'http://feeds.feedburner.com/Spinoza'),
        (u'umore maligno', u'http://www.umoremaligno.it/feed/rss/'),
        (u'fed-ex', u'http://exfed.tumblr.com/rss'),
        (u'metilparaben', u'http://feeds.feedburner.com/metil'),
        (u'freddy nietzsche', u'http://feeds.feedburner.com/FreddyNietzsche'),
    ]

View File

@ -0,0 +1,133 @@
#!/usr/bin/env python
import urlparse
from collections import OrderedDict
from calibre.web.feeds.news import BasicNewsRecipe
class StrangeHorizons(BasicNewsRecipe):
    """Recipe for the current issue of Strange Horizons.

    The issue contents are read from the headings on the index page.
    Reviews are not linked from the index, so they are looked up on the
    Recent Reviews page by title.
    """

    # Recipe metadata
    # Any issue archive page is an acceptable index as well.
    # However, reviews will not be included in older issues.
    # (Using the reviews archive instead of the recent reviews page would fix.)
    INDEX = 'http://www.strangehorizons.com/'
    title = 'Strange Horizons'
    description = 'A magazine of speculative fiction and related nonfiction. Best downloaded on weekends'
    masthead_url = 'http://strangehorizons.com/images/sh_head.gif'
    publication_type = 'magazine'
    language = 'en'
    __author__ = 'Jim DeVona'
    __version__ = '1.0'

    # Cruft filters
    keep_only_tags = [dict(name='div', id='content')]
    remove_tags = [dict(name='p', attrs={'class': 'forum-links'}), dict(name='p', attrs={'class': 'top-link'})]
    remove_tags_after = [dict(name='p', attrs={'class': 'author-bio'})]

    # Styles
    no_stylesheets = True
    extra_css = '''div.image-left { margin: 0.5em auto 1em auto; } div.image-right { margin: 0.5em auto 1em auto; } div.illustration { margin: 0.5em auto 1em auto; text-align: center; } p.image-caption { margin-top: 0.25em; margin-bottom: 1em; font-size: 75%; text-align: center; } h1 { font-size: 160%; } h2 { font-size: 110%; } h3 { font-size: 85%; } h4 { font-size: 80%; } p { font-size: 90%; margin: 1em 1em 1em 15px; } p.author-bio { font-size: 75%; font-style: italic; margin: 1em 1em 1em 15px; } p.author-bio i, p.author-bio cite, p.author-bio .foreign { font-style: normal; } p.author-copyright { font-size: 75%; text-align: center; margin: 3em 1em 1em 15px; } p.content-date { font-weight: bold; } p.dedication { font-style: italic; } div.stanza { margin-bottom: 1em; } div.stanza p { margin: 0px 1em 0px 15px; font-size: 90%; } p.verse-line { margin-bottom: 0px; margin-top: 0px; } p.verse-line-indent-1 { margin-bottom: 0px; margin-top: 0px; text-indent: 2em; } p.verse-line-indent-2 { margin-bottom: 0px; margin-top: 0px; text-indent: 4em; } p.verse-stanza-break { margin-bottom: 0px; margin-top: 0px; } .foreign { font-style: italic; } .thought { font-style: italic; } .thought cite { font-style: normal; } .thought em { font-style: normal; } blockquote { font-size: 90%; font-style: italic; } blockquote cite { font-style: normal; } blockquote em { font-style: normal; } blockquote .foreign { font-style: normal; } blockquote .thought { font-style: normal; } .speaker { font-weight: bold; } pre { margin-left: 15px; } div.screenplay { font-family: monospace; } blockquote.screenplay-dialogue { font-style: normal; font-size: 100%; } .screenplay p.dialogue-first { margin-top: 0; } .screenplay p.speaker { margin-bottom: 0; text-align: center; font-weight: normal; } blockquote.typed-letter { font-style: normal; font-size: 100%; font-family: monospace; } .no-italics { font-style: normal; }'''

    def parse_index(self):
        """Build the list of (section name, articles) for the current issue.

        The first <h2> on the index page names the issue; the <h3>
        headings that follow it (up to the next <h2>) each describe one
        article, prefixed by its category. A fixed 'About' section is
        appended last.
        """
        sections = OrderedDict()
        strange_soup = self.index_to_soup(self.INDEX)

        # Find the heading that marks the start of this issue.
        issue_heading = strange_soup.find('h2')
        issue_date = self.tag_to_string(issue_heading)
        self.title = self.title + " - " + issue_date

        # Examine subsequent headings for information about this issue.
        heading_tag = issue_heading.findNextSibling(['h2','h3'])
        while heading_tag is not None:

            # An h2 indicates the start of the next issue.
            if heading_tag.name == 'h2':
                break

            # The heading begins with a word indicating the article category.
            section = self.tag_to_string(heading_tag).split(':', 1)[0].title()

            # Reviews aren't linked from the index, so we need to look them up
            # separately. Currently using Recent Reviews page. The reviews
            # archive page lists all reviews, but is >500k.
            if section == 'Review':
                articles = self._find_reviews(heading_tag)
            else:
                # Extract article information from the heading and surrounding text.
                link = heading_tag.find('a')
                articles = [{
                    'title': self.tag_to_string(link),
                    'author': link.nextSibling.replace(', by ', ''),
                    'url': urlparse.urljoin(self.INDEX, link['href']),
                    'description': self.tag_to_string(heading_tag.findNextSibling('p', attrs={'class':'contents-pullquote'})),
                    'date': issue_date}]

            # Add the articles to the appropriate section; a section is only
            # created once it has at least one article.
            if articles:
                sections.setdefault(section, []).extend(articles)

            heading_tag = heading_tag.findNextSibling(['h2','h3'])

        # Manually insert standard info about the magazine.
        sections['About'] = [{
            'title': 'Strange Horizons',
            'author': 'Niall Harrison, Editor-in-Chief',
            'url': 'http://www.strangehorizons.com/AboutUs.shtml',
            'description': 'Strange Horizons is a magazine of and about speculative fiction and related nonfiction. Speculative fiction includes science fiction, fantasy, horror, slipstream, and all other flavors of fantastika. Work published in Strange Horizons has been shortlisted for or won Hugo, Nebula, Rhysling, Theodore Sturgeon, James Tiptree Jr., and World Fantasy Awards.',
            'date': ''}]

        return sections.items()

    def _find_reviews(self, heading_tag):
        """Return article dicts for the reviews listed under *heading_tag*.

        A missing pull-quote paragraph or a review line that does not have
        the form "<day>: <title>, reviewed by <author>" is logged and
        skipped instead of aborting the whole download.
        """
        # Get the list of reviews included in this issue. (Kludgey.)
        reviews_summary = heading_tag.findNextSibling('p', attrs={'class': 'contents-pullquote'})
        if reviews_summary is None:
            self.log("No review listing found after heading:", self.tag_to_string(heading_tag))
            return []

        # Get the list of recent reviews.
        review_soup = self.index_to_soup('http://www.strangehorizons.com/reviews/')
        review_titles = review_soup.findAll('p', attrs={'class': 'contents-title'})

        # Turn the <br> separators into a marker the text can be split on.
        for br in reviews_summary.findAll('br'):
            br.replaceWith('----')
        review_summary_text = self.tag_to_string(reviews_summary)
        review_lines = review_summary_text.split(' ----')

        found = []
        # Look for each of the needed reviews (there are 3, right?)...
        for review_info in review_lines[0:3]:

            # Get the review's release day (unused), title, and author.
            try:
                _day, tna = review_info.split(': ', 1)
                article_title, article_author = tna.split(', reviewed by ')
            except ValueError:
                self.log("Unrecognized review listing:", review_info)
                continue

            # ... in the list of recent reviews.
            for review_title_tag in review_titles:
                if self.tag_to_string(review_title_tag) != article_title:
                    continue

                # Extract review information from heading and surrounding text.
                found.append({
                    'title': article_title,
                    'author': article_author,
                    'url': review_title_tag.find('a')['href'],
                    'description': self.tag_to_string(review_title_tag.findNextSibling('p', attrs={'class': 'contents-pullquote'})),
                    'date': self.tag_to_string(review_title_tag.findNextSibling('p', attrs={'class': 'contents-date'}))})
                break

            else:
                # Try http://www.strangehorizons.com/reviews/archives.shtml
                self.log("Review not found in Recent Reviews:", article_title)

        return found

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>' # 2012-01-26 AGe change to actual Year
'''
Fetch sueddeutsche.de
@ -8,19 +8,30 @@ Fetch sueddeutsche.de
from calibre.web.feeds.news import BasicNewsRecipe
class Sueddeutsche(BasicNewsRecipe):
title = u'sueddeutsche.de'
description = 'News from Germany'
__author__ = 'Oliver Niesner and Armin Geller' #Update AGe 2011-12-16
use_embedded_content = False
timefmt = ' [%d %b %Y]'
oldest_article = 7
max_articles_per_feed = 50
no_stylesheets = True
language = 'de'
encoding = 'utf-8'
remove_javascript = True
auto_cleanup = True
cover_url = 'http://polpix.sueddeutsche.com/polopoly_fs/1.1237395.1324054345!/image/image.jpg_gen/derivatives/860x860/image.jpg' # 2011-12-16 AGe
title = u'Süddeutsche.de' # 2012-01-26 AGe Correct Title
description = 'News from Germany, Access to online content' # 2012-01-26 AGe
__author__ = 'Oliver Niesner and Armin Geller' #Update AGe 2012-01-26
publisher = 'Süddeutsche Zeitung' # 2012-01-26 AGe add
category = 'news, politics, Germany' # 2012-01-26 AGe add
timefmt = ' [%a, %d %b %Y]' # 2012-01-26 AGe add %a
oldest_article = 7
max_articles_per_feed = 100
language = 'de'
encoding = 'utf-8'
publication_type = 'newspaper' # 2012-01-26 add
cover_source = 'http://www.sueddeutsche.de/verlag' # 2012-01-26 AGe add from Darko Miletic paid content source
masthead_url = 'http://www.sueddeutsche.de/static_assets/build/img/sdesiteheader/logo_homepage.441d531c.png' # 2012-01-26 AGe add
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
auto_cleanup = True
def get_cover_url(self): # 2012-01-26 AGe add from Darko Miletic paid content source
    """Return the cover image URL scraped from ``self.cover_source``.

    Looks up the element with class ``preview-image`` and returns the
    ``src`` of the <img> inside its first <div>. Returns ``None`` (and
    logs a message) when any part of that structure is missing, so a
    layout change on the site does not raise from this method.
    """
    cover_source_soup = self.index_to_soup(self.cover_source)
    preview_image_div = cover_source_soup.find(attrs={'class':'preview-image'})
    try:
        return preview_image_div.div.img['src']
    except (AttributeError, KeyError, TypeError):
        # AttributeError: container or inner div missing (None);
        # TypeError: no <img>; KeyError: <img> without a src attribute.
        self.log('Cover image not found on ' + self.cover_source)
        return None
feeds = [
(u'Politik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPolitik%24?output=rss'),
(u'Wirtschaft', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWirtschaft%24?output=rss'),
@ -29,6 +40,9 @@ class Sueddeutsche(BasicNewsRecipe):
(u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'),
(u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'),
(u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'),
(u'Bildung', u'http://rss.sueddeutsche.de/rss/bildung'), #2012-01-26 AGe New
(u'Gesundheit', u'http://rss.sueddeutsche.de/rss/gesundheit'), #2012-01-26 AGe New
(u'Stil', u'http://rss.sueddeutsche.de/rss/stil'), #2012-01-26 AGe New
(u'München & Region', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMünchen&Region%24?output=rss'),
(u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'),
(u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'),
@ -42,6 +56,7 @@ class Sueddeutsche(BasicNewsRecipe):
(u'Job', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EJob%24?output=rss'), # sometimes only
(u'Service', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EService%24?output=rss'), # sometimes only
(u'Verlag', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EVerlag%24?output=rss'), # sometimes only
]
# AGe 2011-12-16 Problem of Handling redirections solved by a solution of Recipes-Re-usable code from kiklop74.
# Feed is: http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss

View File

@ -0,0 +1,15 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1327051385(BasicNewsRecipe):
    """Recipe for the Tech Economy RSS feed."""

    # Identification
    __author__ = 'faber1971'
    title = u'Tech Economy'
    description = 'Italian website on technology - v1.00 (28, January 2012)'
    language = 'it'
    masthead_url = 'http://www.techeconomy.it/wp-content/uploads/2012/01/Logo-TE9.png'

    # Fetch limits and clean-up
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    remove_tags_after = [
        dict(name='div', attrs={'class': 'cab-author-name'}),
    ]

    # (feed title, feed URL) pairs
    feeds = [
        (u'Tech Economy', u'http://www.techeconomy.it/feed/'),
    ]

View File

@ -0,0 +1,37 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Telegraph(BasicNewsRecipe):
    """Recipe for the section RSS feeds of The Telegraph India."""

    # Identification
    __author__ = 'Krittika Goyal'
    title = u'The Telegraph India'
    language = 'en_IN'

    # Fetch limits
    oldest_article = 1  # days
    max_articles_per_feed = 25

    # Content handling
    use_embedded_content = False
    no_stylesheets = True
    auto_cleanup = True

    # (section title, feed URL) pairs
    feeds = [
        ('Front Page', 'http://www.telegraphindia.com/feeds/rss.jsp?id=3'),
        ('Nation', 'http://www.telegraphindia.com/feeds/rss.jsp?id=4'),
        ('Calcutta', 'http://www.telegraphindia.com/feeds/rss.jsp?id=5'),
        ('Bengal', 'http://www.telegraphindia.com/feeds/rss.jsp?id=8'),
        ('Bihar', 'http://www.telegraphindia.com/feeds/rss.jsp?id=22'),
        ('Sports', 'http://www.telegraphindia.com/feeds/rss.jsp?id=7'),
        ('International', 'http://www.telegraphindia.com/feeds/rss.jsp?id=13'),
        ('Business', 'http://www.telegraphindia.com/feeds/rss.jsp?id=9'),
        ('Entertainment', 'http://www.telegraphindia.com/feeds/rss.jsp?id=20'),
        ('Opinion', 'http://www.telegraphindia.com/feeds/rss.jsp?id=6'),
    ]

View File

@ -0,0 +1,46 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
'''
thedailynewsegypt.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class TheDailyNewsEG(BasicNewsRecipe):
    """Recipe for the all-sections RSS feed of The Daily News Egypt."""

    # Identification
    __author__ = 'Omm Mishmishah'
    title = u'The Daily News Egypt'
    description = 'News from Egypt'
    publisher = 'The Daily News Egypt'
    category = 'News, Egypt, World'
    language = 'en_EG'
    publication_type = 'newsportal'
    masthead_url = 'http://www.thedailynewsegypt.com/images/DailyNews-03_05.gif'
    cover_url = 'http://www.thedailynewsegypt.com/images/DailyNews-03_05.gif'

    # Fetch limits
    oldest_article = 7
    max_articles_per_feed = 100
    #delay = 1

    # Content handling
    auto_cleanup = True
    no_stylesheets = False
    use_embedded_content = False
    encoding = 'utf8'

    # preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]

    # Remove annoying map links (inline-caption class is also used for some
    # image captions! hence regex to match maps.google)
    preprocess_regexps = [
        (re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL),
         lambda m: ''),
    ]

    # Built from the identification attributes above, so it must follow them.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
        'linearize_tables': False,
    }

    keep_only_tags = [dict(attrs={'class': ['article section']})]

    remove_tags = [
        dict(attrs={'class': [
            'related', 'tags', 'tools', 'attached-content ready',
            'inline-content story left', 'inline-content map left contracted', 'published',
            'story-map', 'statepromo', 'topics',
        ]}),
    ]

    remove_attributes = ['width', 'height']

    # (feed title, feed URL) pairs
    feeds = [
        (u'The Daily News Egypt', u'http://www.thedailynewsegypt.com/rss.php?sectionid=all'),
    ]

View File

@ -0,0 +1,24 @@
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1327434170(BasicNewsRecipe):
    """Recipe for the Tom's Hardware feed served through feedsportal."""

    title = u"Tom's Hardware"
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    masthead_url = 'http://userlogos.org/files/logos/spaljeni/tomshardwre.png'

    # Escape codes found in feedsportal links: a '0' followed by one of
    # these letters stands for the mapped text.
    _FEEDSPORTAL_CODES = {
        'A': '0', 'B': '.', 'C': '/', 'D': '?', 'E': '-', 'F': '=',
        'G': '&', 'I': '_', 'L': 'http://', 'N': '.com', 'S': 'www.',
    }

    def get_article_url(self, article):
        """Return the article URL, decoding feedsportal redirect links.

        When the link's last path component is ``story01.htm``, the
        component before it is taken as the escaped article URL and
        decoded. Any other link (or a missing one) is returned as-is.
        """
        import re

        link = BasicNewsRecipe.get_article_url(self, article)
        if link and link.split('/')[-1] == "story01.htm":
            encoded = link.split('/')[-2]
            codes = self._FEEDSPORTAL_CODES
            # Decode in a single pass. Chained str.replace calls re-scan
            # already decoded text, so an escaped zero followed by a code
            # letter ('0AB', meaning "0B") was wrongly turned into '.'.
            link = re.sub(r'0([ABCDEFGILNS])',
                          lambda match: codes[match.group(1)],
                          encoded)
        return link

    feeds = [(u"Tom's Hardware", u'http://rss.feedsportal.com/c/32604/f/531080/index.rss')]
    __author__ = 'faber1971'
    description = 'Italian website on technology - v1.00 (28, January 2012)'
    language = 'it'

View File

@ -0,0 +1,233 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
'''
www.canada.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe):
    """Recipe for the "today's paper" index of a www.canada.com newspaper.

    Exactly one configuration group below should be active. Each group
    sets the title, site URL prefix, description and ``fp_tag``;
    ``get_cover_url`` uses ``fp_tag`` to locate a front-page image and
    returns no cover when it is the empty string.
    """

    # un-comment the following four lines for the Victoria Times Colonist
    ## title = u'Victoria Times Colonist'
    ## url_prefix = 'http://www.timescolonist.com'
    ## description = u'News from Victoria, BC'
    ## fp_tag = 'CAN_TC'

    # un-comment the following four lines for the Vancouver Province
    title = u'Vancouver Province'
    url_prefix = 'http://www.theprovince.com'
    description = u'News from Vancouver, BC'
    fp_tag = 'CAN_VP'

    # un-comment the following four lines for the Vancouver Sun
    ## title = u'Vancouver Sun'
    ## url_prefix = 'http://www.vancouversun.com'
    ## description = u'News from Vancouver, BC'
    ## fp_tag = 'CAN_VS'

    # un-comment the following four lines for the Edmonton Journal
    ## title = u'Edmonton Journal'
    ## url_prefix = 'http://www.edmontonjournal.com'
    ## description = u'News from Edmonton, AB'
    ## fp_tag = 'CAN_EJ'

    # un-comment the following four lines for the Calgary Herald
    ## title = u'Calgary Herald'
    ## url_prefix = 'http://www.calgaryherald.com'
    ## description = u'News from Calgary, AB'
    ## fp_tag = 'CAN_CH'

    # un-comment the following four lines for the Regina Leader-Post
    ## title = u'Regina Leader-Post'
    ## url_prefix = 'http://www.leaderpost.com'
    ## description = u'News from Regina, SK'
    ## fp_tag = ''

    # un-comment the following four lines for the Saskatoon Star-Phoenix
    ## title = u'Saskatoon Star-Phoenix'
    ## url_prefix = 'http://www.thestarphoenix.com'
    ## description = u'News from Saskatoon, SK'
    ## fp_tag = ''

    # un-comment the following four lines for the Windsor Star
    ## title = u'Windsor Star'
    ## url_prefix = 'http://www.windsorstar.com'
    ## description = u'News from Windsor, ON'
    ## fp_tag = 'CAN_'

    # un-comment the following four lines for the Ottawa Citizen
    ## title = u'Ottawa Citizen'
    ## url_prefix = 'http://www.ottawacitizen.com'
    ## description = u'News from Ottawa, ON'
    ## fp_tag = 'CAN_OC'

    # un-comment the following four lines for the Montreal Gazette
    ## title = u'Montreal Gazette'
    ## url_prefix = 'http://www.montrealgazette.com'
    ## description = u'News from Montreal, QC'
    ## fp_tag = 'CAN_MG'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp { font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large; font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def get_cover_url(self):
        """Return the URL of the most recent reachable front-page image.

        The Newseum URL is keyed by day-of-month and ``self.fp_tag``.
        Today's image is tried first, then each of the previous six days.
        Returns ``None`` when ``fp_tag`` is empty or when none of the
        seven candidate URLs can be opened.
        """
        from datetime import date, timedelta

        if self.fp_tag == '':
            # No front-page tag configured for this paper: no cover.
            return None
        today = date.today()
        for daysback in range(7):
            day = (today - timedelta(days=daysback)).day
            cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                     + str(day) + '/lg/' + self.fp_tag + '.jpg')
            br = BasicNewsRecipe.get_browser()
            try:
                br.open(cover)
            except Exception:
                # Image for that day is not reachable; fall back one more
                # day. (Exception, not a bare except, so Ctrl-C aborts.)
                continue
            return cover
        self.log("\nCover unavailable")
        return None

    def fixChars(self, string):
        """Replace stray cp1252 punctuation bytes with typographic characters.

        Maps \\x91/\\x92 to single curly quotes, \\x93/\\x94 to double
        curly quotes, \\x96 to an en dash and \\x97 to an em dash, and
        expands the literal entity ``&#x2019;`` to a right single quote.
        """
        replacements = (
            ("\x91", "‘"),        # lsquo
            ("\x92", "’"),        # rsquo
            ("\x93", "“"),        # ldquo
            ("\x94", "”"),        # rdquo
            ("\x96", "–"),        # ndash
            ("\x97", "—"),        # mdash
            ("&#x2019;", "’"),    # rsquo written as a numeric entity
        )
        fixed = string
        for bad, good in replacements:
            fixed = fixed.replace(bad, good)
        return fixed

    def massageNCXText(self, description):
        """Prepare a description for use in the Kindle table of contents.

        Kindle TOC descriptions won't render certain characters, so HTML
        entities are decoded, bare ampersands are re-escaped as the
        numeric entity ``&#38;`` and the result goes through ``fixChars``.
        A falsy description is returned unchanged.
        """
        if not description:
            return description
        # Imported here because the module-level import only brings in
        # BeautifulSoup and Tag; without this the call is a NameError.
        from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
        massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        # Replace '&' with its numeric entity
        massaged = re.sub("&", "&#38;", massaged)
        return self.fixChars(massaged)

    def populate_article_metadata(self, article, soup, first):
        """Attach a TOC thumbnail and a fallback summary to *article*.

        On an article's first page the first <img> in <body> (if any)
        becomes the TOC thumbnail. If the text summary is blank, the
        page's ``og:description`` meta content is used instead.
        """
        if first:
            picdiv = soup.find('body').find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\', '', picdiv['src']))
        xtitle = article.text_summary.strip()
        if len(xtitle) == 0:
            desc = soup.find('meta', attrs={'property':'og:description'})
            if desc is not None:
                article.summary = article.text_summary = desc['content']

    def strip_anchors(self, soup):
        """Replace every image-less <a> in *soup* with its rendered contents."""
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup

    def preprocess_html(self, soup):
        """Drop empty ``id`` attributes from <div>s, then strip text anchors.

        The class used to define this method twice; the second definition
        replaced the first, so the empty-id cleanup never ran. Both steps
        are combined here.
        """
        # delete empty id attributes--they screw up the TOC for unknown reasons
        for div in soup.findAll('div', attrs={'id':''}):
            del(div['id'])
        return self.strip_anchors(soup)

    def parse_index(self):
        """Return ``[(section, [article dict, ...]), ...]`` for today's paper.

        Section-title divs set the current section; each feature-content
        div that has a linked <h1> contributes one article to it. Sections
        without articles are omitted, and section order follows the page.
        """
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="sectiontitle", class="featurecontent"
        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
            #self.log(" div class = %s" % divtag['class'])
            if divtag['class'].startswith('section_title'):
                # div contains section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3,False)
                # A section name that repeats on the page is listed once,
                # otherwise the same article list would be emitted twice.
                if key not in ans:
                    ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a',href=True)
            if not atag:
                continue
            url = self.url_prefix+'/news/todays-paper/'+atag['href']
            #self.log("Section %s" % key)
            #self.log("url %s" % url)
            title = self.tag_to_string(atag,False)
            #self.log("title %s" % title)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag,False)
                #self.log("description %s" % description)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag,False)
                #self.log("author %s" % author)
            articles.setdefault(key, []).append(
                dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))

        return [(section, articles[section]) for section in ans if section in articles]

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@ -6,50 +7,77 @@ __license__ = 'GPL v3'
www.canada.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Vancouver Sun
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
title = u'Vancouver Sun'
url_prefix = 'http://www.vancouversun.com'
description = u'News from Vancouver, BC'
fp_tag = 'CAN_VS'
# un-comment the following three lines for the Edmonton Journal
#title = u'Edmonton Journal'
#url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Edmonton, AB'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
@ -81,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id'])
return soup
def get_cover_url(self):
from datetime import timedelta, datetime, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover
def fixChars(self,string):
    """Replace stray cp1252 punctuation code points with typographic characters.

    Handles \\x91-\\x94 (single and double curly quotes), \\x96 (en dash),
    \\x97 (em dash) and the literal entity ``&#x2019;``.  Every one of them
    is *replaced*, never dropped, so that e.g. an apostrophe inside a word
    survives.  Returns the fixed string.
    """
    replacements = (
        ("\x91", "‘"),       # lsquo
        ("\x92", "’"),       # rsquo
        ("\x93", "“"),       # ldquo
        ("\x94", "”"),       # rdquo
        ("\x96", "–"),       # ndash
        ("\x97", "—"),       # mdash
        ("&#x2019;", "’"),   # rsquo written as a numeric entity
    )
    fixed = string
    for pattern, substitute in replacements:
        fixed = re.sub(pattern, substitute, fixed)
    return fixed
def massageNCXText(self, description):
    """Prepare ``description`` for use in a TOC entry.

    Kindle TOC descriptions won't render certain characters.  HTML
    entities are converted to characters, remaining ampersands are
    written as the numeric entity ``&#38;`` and the result is passed
    through ``fixChars``.  A false ``description`` (None or empty) is
    returned unchanged.
    """
    if not description:
        return description
    # BeautifulStoneSoup is not among the names imported at module level,
    # so bring it into scope here.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # Replace '&' with '&#38;' (replacing it with itself would do nothing)
    massaged = re.sub("&","&#38;", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
# Fills in per-article metadata from the article's parsed page (`soup`).
# NOTE(review): indentation is not visible in this view, so the nesting
# described below is the presumed one -- confirm against the real file.
if first:
# Use the first <img> found under <body> as the TOC thumbnail, after
# removing a "links\linkN\" prefix from its src.
# NOTE(review): soup.find('body') is dereferenced without a None check.
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
# If the article has no (non-blank) text summary, fall back to the page's
# og:description meta tag for both summary fields.
# NOTE(review): presumably this part runs regardless of `first` -- confirm.
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
    """Unlink text-only anchors in ``soup``.

    Every ``<a>`` tag that does not contain an ``<img>`` is replaced by its
    own rendered contents (decoded as cp1252, undecodable bytes replaced).
    Anchors that wrap an image are left alone.  The soup is modified in
    place and is also returned.
    """
    for element in soup.findAll(True):
        for anchor in element.findAll('a'):
            if anchor.img is not None:
                # image links are kept as they are
                continue
            inner = anchor.renderContents().decode('cp1252','replace')
            anchor.replaceWith(inner)
    return soup
def preprocess_html(self, soup):
    """Return ``soup`` after passing it through ``strip_anchors``."""
    # NOTE(review): the hunk context just before get_cover_url
    # (``del(div['id'])`` / ``return soup``) suggests an earlier
    # preprocess_html in this class, which this definition would
    # replace -- confirm.
    stripped = self.strip_anchors(soup)
    return stripped
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
@ -6,60 +7,77 @@ __license__ = 'GPL v3'
www.canada.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Victoria Times Colonist
# un-comment the following four lines for the Victoria Times Colonist
title = u'Victoria Times Colonist'
url_prefix = 'http://www.timescolonist.com'
description = u'News from Victoria, BC'
fp_tag = 'CAN_TC'
# un-comment the following three lines for the Vancouver Province
#title = u'Vancouver Province'
#url_prefix = 'http://www.theprovince.com'
#description = u'News from Vancouver, BC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following three lines for the Vancouver Sun
#title = u'Vancouver Sun'
#url_prefix = 'http://www.vancouversun.com'
#description = u'News from Vancouver, BC'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following three lines for the Edmonton Journal
#title = u'Edmonton Journal'
#url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Edmonton, AB'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
@ -91,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id'])
return soup
def get_cover_url(self):
from datetime import timedelta, datetime, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover
def fixChars(self,string):
    """Replace stray cp1252 punctuation code points with typographic characters.

    Handles \\x91-\\x94 (single and double curly quotes), \\x96 (en dash),
    \\x97 (em dash) and the literal entity ``&#x2019;``.  Every one of them
    is *replaced*, never dropped, so that e.g. an apostrophe inside a word
    survives.  Returns the fixed string.
    """
    replacements = (
        ("\x91", "‘"),       # lsquo
        ("\x92", "’"),       # rsquo
        ("\x93", "“"),       # ldquo
        ("\x94", "”"),       # rdquo
        ("\x96", "–"),       # ndash
        ("\x97", "—"),       # mdash
        ("&#x2019;", "’"),   # rsquo written as a numeric entity
    )
    fixed = string
    for pattern, substitute in replacements:
        fixed = re.sub(pattern, substitute, fixed)
    return fixed
def massageNCXText(self, description):
    """Prepare ``description`` for use in a TOC entry.

    Kindle TOC descriptions won't render certain characters.  HTML
    entities are converted to characters, remaining ampersands are
    written as the numeric entity ``&#38;`` and the result is passed
    through ``fixChars``.  A false ``description`` (None or empty) is
    returned unchanged.
    """
    if not description:
        return description
    # BeautifulStoneSoup is not among the names imported at module level,
    # so bring it into scope here.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # Replace '&' with '&#38;' (replacing it with itself would do nothing)
    massaged = re.sub("&","&#38;", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
# Fills in per-article metadata from the article's parsed page (`soup`).
# NOTE(review): indentation is not visible in this view, so the nesting
# described below is the presumed one -- confirm against the real file.
if first:
# Use the first <img> found under <body> as the TOC thumbnail, after
# removing a "links\linkN\" prefix from its src.
# NOTE(review): soup.find('body') is dereferenced without a None check.
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
# If the article has no (non-blank) text summary, fall back to the page's
# og:description meta tag for both summary fields.
# NOTE(review): presumably this part runs regardless of `first` -- confirm.
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
    """Unlink text-only anchors in ``soup``.

    Every ``<a>`` tag that does not contain an ``<img>`` is replaced by its
    own rendered contents (decoded as cp1252, undecodable bytes replaced).
    Anchors that wrap an image are left alone.  The soup is modified in
    place and is also returned.
    """
    for element in soup.findAll(True):
        for anchor in element.findAll('a'):
            if anchor.img is not None:
                # image links are kept as they are
                continue
            inner = anchor.renderContents().decode('cp1252','replace')
            anchor.replaceWith(inner)
    return soup
def preprocess_html(self, soup):
    """Return ``soup`` after passing it through ``strip_anchors``."""
    # NOTE(review): the hunk context just before get_cover_url
    # (``del(div['id'])`` / ``return soup``) suggests an earlier
    # preprocess_html in this class, which this definition would
    # replace -- confirm.
    stripped = self.strip_anchors(soup)
    return stripped
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

233
recipes/windsor_star.recipe Normal file
View File

@ -0,0 +1,233 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
'''
www.canada.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe):
# un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following four lines for the Windsor Star
title = u'Windsor Star'
url_prefix = 'http://www.windsorstar.com'
description = u'News from Windsor, ON'
fp_tag = 'CAN_'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following four lines for the Montreal Gazette
## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
    # Drop empty id attributes from <div> tags -- they screw up the TOC for
    # unknown reasons.
    # NOTE: a second preprocess_html defined further down in this class
    # rebinds the name, so this version is not the one left on the class.
    for empty_id_div in soup.findAll('div',attrs={'id':''}):
        del empty_id_div['id']
    return soup
def get_cover_url(self):
from datetime import timedelta, datetime, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover
def fixChars(self,string):
    """Replace stray cp1252 punctuation code points with typographic characters.

    Handles \\x91-\\x94 (single and double curly quotes), \\x96 (en dash),
    \\x97 (em dash) and the literal entity ``&#x2019;``.  Every one of them
    is *replaced*, never dropped, so that e.g. an apostrophe inside a word
    survives.  Returns the fixed string.
    """
    replacements = (
        ("\x91", "‘"),       # lsquo
        ("\x92", "’"),       # rsquo
        ("\x93", "“"),       # ldquo
        ("\x94", "”"),       # rdquo
        ("\x96", "–"),       # ndash
        ("\x97", "—"),       # mdash
        ("&#x2019;", "’"),   # rsquo written as a numeric entity
    )
    fixed = string
    for pattern, substitute in replacements:
        fixed = re.sub(pattern, substitute, fixed)
    return fixed
def massageNCXText(self, description):
    """Prepare ``description`` for use in a TOC entry.

    Kindle TOC descriptions won't render certain characters.  HTML
    entities are converted to characters, remaining ampersands are
    written as the numeric entity ``&#38;`` and the result is passed
    through ``fixChars``.  A false ``description`` (None or empty) is
    returned unchanged.
    """
    if not description:
        return description
    # BeautifulStoneSoup is not among the names this file imports at module
    # level (only BeautifulSoup and Tag are), so bring it into scope here.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # Replace '&' with '&#38;' (replacing it with itself would do nothing)
    massaged = re.sub("&","&#38;", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
# Fills in per-article metadata from the article's parsed page (`soup`).
# NOTE(review): indentation is not visible in this view, so the nesting
# described below is the presumed one -- confirm against the real file.
if first:
# Use the first <img> found under <body> as the TOC thumbnail, after
# removing a "links\linkN\" prefix from its src.
# NOTE(review): soup.find('body') is dereferenced without a None check.
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
# If the article has no (non-blank) text summary, fall back to the page's
# og:description meta tag for both summary fields.
# NOTE(review): presumably this part runs regardless of `first` -- confirm.
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
    """Unlink text-only anchors in ``soup``.

    Every ``<a>`` tag that does not contain an ``<img>`` is replaced by its
    own rendered contents (decoded as cp1252, undecodable bytes replaced).
    Anchors that wrap an image are left alone.  The soup is modified in
    place and is also returned.
    """
    for element in soup.findAll(True):
        for anchor in element.findAll('a'):
            if anchor.img is not None:
                # image links are kept as they are
                continue
            inner = anchor.renderContents().decode('cp1252','replace')
            anchor.replaceWith(inner)
    return soup
def preprocess_html(self, soup):
    """Clean a downloaded page: drop empty div ids, then unlink text-only anchors.

    This definition is the one that takes effect, because it rebinds the
    name of the earlier ``preprocess_html`` in this class.  The empty-id
    cleanup of that earlier version is therefore repeated here so that it
    is not silently lost.  Returns the modified soup.
    """
    # empty id attributes screw up the TOC
    for div in soup.findAll('div',attrs={'id':''}):
        del div['id']
    return self.strip_anchors(soup)
def parse_index(self):
    """Build the feed list from the paper's "today's paper" index page.

    Walks the section-title and article divs of
    ``<url_prefix>/news/todays-paper/index.html`` in document order.  A
    section-title div switches the current section; every article div is
    filed under the section that was current when it was met (``'News'``
    until the first title is seen).

    Returns a list of ``(section, articles)`` tuples in order of first
    appearance, leaving out sections that received no article.  Each
    article is a dict with the keys ``title``, ``url``, ``date``,
    ``description``, ``author`` and ``content``.
    """
    base = self.url_prefix+'/news/todays-paper/'
    soup = self.index_to_soup(base+'index.html')
    articles = {}
    key = 'News'
    sections = ['News']
    # Find each div with class="section_title02" or class="featurecontent"
    for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
        #self.log(" div class = %s" % divtag['class'])
        if divtag['class'].startswith('section_title'):
            # div contains section title
            if not divtag.h3:
                continue
            key = self.tag_to_string(divtag.h3,False)
            # A title that shows up again must not produce a second,
            # duplicate feed entry in the result.
            if key not in sections:
                sections.append(key)
            self.log("Section name %s" % key)
            continue
        # div contains article data
        h1tag = divtag.find('h1')
        if not h1tag:
            continue
        atag = h1tag.find('a',href=True)
        if not atag:
            continue
        ptag = divtag.find('p')
        autag = divtag.find('h4')
        articles.setdefault(key, []).append(dict(
            title=self.tag_to_string(atag,False),
            url=base+atag['href'],
            date='',
            description=self.tag_to_string(ptag,False) if ptag else '',
            author=self.tag_to_string(autag,False) if autag else '',
            content=''))
    return [(name, articles[name]) for name in sections if name in articles]

View File

@ -0,0 +1,144 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
class GazetaWyborczaDuzyForma(BasicNewsRecipe):
cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
title = u"Gazeta Wyborcza Duzy Format"
__author__ = 'ravcio - rlelusz[at]gmail.com'
description = u"Articles from Gazeta's website"
language = 'pl'
max_articles_per_feed = 50 #you can increase it even up to maybe 600, should still work
recursions = 0
encoding = 'iso-8859-2'
no_stylesheets = True
remove_javascript = True
use_embedded_content = False
keep_only_tags = [
dict(name='div', attrs={'id':['k1']})
]
remove_tags = [
dict(name='div', attrs={'class':['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']})
,dict(name='div', attrs={'id':['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']})
,dict(name='ul', attrs={'id':['articleToolbar']})
,dict(name='img', attrs={'class':['brand']})
,dict(name='h5', attrs={'class':['author']})
,dict(name='h6', attrs={'class':['date']})
,dict(name='p', attrs={'class':['txt_upl']})
]
remove_tags_after = [
dict(name='div', attrs={'id':['Str']}) #nawigator numerow linii
]
def load_article_links(self, url, count):
print '--- load_article_links', url, count
#page with link to articles
soup = self.index_to_soup(url)
#table with articles
list = soup.find('div', attrs={'class':'GWdalt'})
#single articles (link, title, ...)
links = list.findAll('div', attrs={'class':['GWdaltE']})
if len(links) < count:
#load links to more articles...
#remove new link
pages_nav = list.find('div', attrs={'class':'pages'})
next = pages_nav.find('a', attrs={'class':'next'})
if next:
print 'next=', next['href']
url = 'http://wyborcza.pl' + next['href']
#e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'
older_links = self.load_article_links(url, count - len(links))
links.extend(older_links)
return links
#produce list of articles to download
def parse_index(self):
# Builds the article index: a one-element list holding a single
# ('Uncategorized', [article dicts]) tuple.
# NOTE(review): indentation is not visible in this view; the nesting of the
# statements after `if desc:` in particular should be confirmed.
print '--- parse_index'
# upper bound handed to load_article_links for the number of entries to collect
max_articles = 8000
links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)
ans = []
# the None assignment is overwritten two lines below
key = None
articles = {}
key = 'Uncategorized'
articles[key] = []
for div_art in links:
# 'kL' div supplies the date text, 'kR' div the link, title and lead
div_date = div_art.find('div', attrs={'class':'kL'})
div = div_art.find('div', attrs={'class':'kR'})
a = div.find('a', href=True)
url = a['href']
title = a.string
description = ''
pubdate = div_date.string.rstrip().lstrip()
summary = div.find('span', attrs={'class':'lead'})
# a link inside the lead is removed before the lead is turned into text
desc = summary.find('a', href=True)
if desc:
desc.extract()
description = self.tag_to_string(summary, use_alt=False)
description = description.rstrip().lstrip()
# key is always 'Uncategorized' at this point, so the fallback name and the
# has_key branch below never come into play
feed = key if key is not None else 'Duzy Format'
if not articles.has_key(feed):
articles[feed] = []
if description != '': # skip picture-only articles (no description text)
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description,
content=''))
ans = [(key, articles[key])]
return ans
def append_page(self, soup, appendtag, position):
    """Recursively pull the following pages of an article into ``appendtag``.

    Looks in ``soup`` for the pager div (id ``Str``).  For each pager link
    whose text contains 'nast', the linked page is fetched, its own
    following pages are appended to its ``artykul`` div first, and that
    div is then moved into ``appendtag`` at index ``position``.  Does
    nothing when there is no pager or no such link.
    """
    pager = soup.find('div',attrs={'id':'Str'})
    if pager:
        #seek for 'a' element with nast value (if not found exit)
        for elem in pager.findAll('a'):
            # elem.string can be None (e.g. an anchor wrapping other
            # markup); such a link cannot be the textual "next" link and
            # must not raise a TypeError here.
            if elem.string is None or 'nast' not in elem.string:
                continue
            nexturl = elem['href']
            soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)
            texttag = soup2.find('div', attrs={'id':'artykul'})
            newpos = len(texttag.contents)
            self.append_page(soup2,texttag,newpos)
            texttag.extract()
            appendtag.insert(position,texttag)
def preprocess_html(self, soup):
    """Append the article's following pages to the body, then drop leftovers.

    After ``append_page`` has run, the first div with id ``Str`` and the
    first div with class ``tylko_int`` are removed from the soup, which
    is returned.
    """
    self.append_page(soup, soup.body, 3)

    # finally remove some tags
    for unwanted in ({'id':'Str'}, {'class':'tylko_int'}):
        leftover = soup.find('div',attrs=unwanted)
        if leftover:
            leftover.extract()
    return soup

View File

@ -3,7 +3,7 @@
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>..:: calibre {library} ::.. {title}</title>
<meta http-equiv="X-UA-Compatible" content="IE=100" />
<link rel="icon" type="image/x-icon" href="http://calibre-ebook.com/favicon.ico" />
@ -58,7 +58,7 @@
method="post" title="Donate to support the development of calibre">
<div>
<input type="hidden" name="cmd" value="_s-xclick"></input>
<input type="hidden" name="hosted_button_id" value="3028915"></input>
<input type="hidden" name="hosted_button_id" value="MZQCP8EESW4H4"></input>
<input type="image"
src="{prefix}/static/button-donate.png"
name="submit"></input>

Binary file not shown.

View File

@ -26,7 +26,11 @@ def login_to_google(username, password):
br.form['Email'] = username
br.form['Passwd'] = password
raw = br.submit().read()
if b'<title>Account overview - Account Settings</title>' not in raw:
if re.search(br'<title>.*?Account Settings</title>', raw) is None:
x = re.search(br'(?is)<title>.*?</title>', raw)
if x is not None:
print ('Title of post login page: %s'%x.group())
#open('/tmp/goog.html', 'wb').write(raw)
raise ValueError(('Failed to login to google with credentials: %s %s'
'\nGoogle sometimes requires verification when logging in from a '
'new IP address. Use lynx to login and supply the verification, '

View File

@ -18,14 +18,14 @@ msgstr ""
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
"devel@lists.alioth.debian.org>\n"
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
"PO-Revision-Date: 2012-01-08 20:03+0000\n"
"Last-Translator: Simeon <Unknown>\n"
"PO-Revision-Date: 2012-01-14 02:30+0000\n"
"Last-Translator: Wolfgang Rohdewald <wolfgang@rohdewald.de>\n"
"Language-Team: German <debian-l10n-german@lists.debian.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2012-01-09 04:49+0000\n"
"X-Generator: Launchpad (build 14640)\n"
"X-Launchpad-Export-Date: 2012-01-15 05:18+0000\n"
"X-Generator: Launchpad (build 14664)\n"
"Language: de\n"
#. name for aaa

File diff suppressed because it is too large Load Diff

View File

@ -12,14 +12,14 @@ msgstr ""
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
"devel@lists.alioth.debian.org>\n"
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
"PO-Revision-Date: 2011-11-03 23:08+0000\n"
"PO-Revision-Date: 2012-02-01 20:12+0000\n"
"Last-Translator: drMerry <Unknown>\n"
"Language-Team: Dutch <vertaling@vrijschrift.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2011-11-26 05:12+0000\n"
"X-Generator: Launchpad (build 14381)\n"
"X-Launchpad-Export-Date: 2012-02-02 05:57+0000\n"
"X-Generator: Launchpad (build 14738)\n"
"Language: nl\n"
#. name for aaa
@ -17956,7 +17956,7 @@ msgstr ""
#. name for nds
msgid "German; Low"
msgstr ""
msgstr "Duits; Laag"
#. name for ndt
msgid "Ndunga"
@ -30424,7 +30424,7 @@ msgstr ""
#. name for zlm
msgid "Malay (individual language)"
msgstr ""
msgstr "Maleis (aparte taal)"
#. name for zln
msgid "Zhuang; Lianshan"

View File

@ -151,7 +151,7 @@ class Translations(POT): # {{{
self.info('\tCopying ISO 639 translations')
subprocess.check_call(['msgfmt', '-o', dest, iso639])
elif locale not in ('en_GB', 'en_CA', 'en_AU', 'si', 'ur', 'sc',
'ltg', 'nds', 'te', 'yi', 'fo', 'sq', 'ast', 'ml'):
'ltg', 'nds', 'te', 'yi', 'fo', 'sq', 'ast', 'ml', 'ku'):
self.warn('No ISO 639 translations for locale:', locale)
self.write_stats()

View File

@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = u'calibre'
numeric_version = (0, 8, 35)
numeric_version = (0, 8, 38)
__version__ = u'.'.join(map(unicode, numeric_version))
__author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"
@ -161,4 +161,32 @@ def get_version():
v += '*'
return v
def get_unicode_windows_env_var(name):
    '''
    Return the value of the environment variable ``name`` as a unicode
    string, read via the kernel32 GetEnvironmentVariableW call, or None
    when the size query for it returns 0.
    '''
    import ctypes
    name = unicode(name)
    get_var = ctypes.windll.kernel32.GetEnvironmentVariableW
    # First call with no buffer: only asks how much room is needed
    needed = get_var(name, None, 0)
    if needed == 0:
        return None
    value_buf = ctypes.create_unicode_buffer(u'\0'*needed)
    get_var(name, value_buf, needed)
    return value_buf.value
def get_windows_username():
    '''
    Return the user name of the currently logged in user as a unicode string.
    Note that usernames on windows are case insensitive, the case of the value
    returned depends on what the user typed into the login box at login time.

    GetUserNameW from advapi32 is tried first; if it cannot be looked up or
    reports failure, the USERNAME environment variable is used instead.
    '''
    import ctypes
    try:
        GetUserName = getattr(ctypes.windll.advapi32, u'GetUserNameW')
    except AttributeError:
        GetUserName = None
    if GetUserName is not None:
        # 257 is presumably UNLEN + 1 (longest user name plus the
        # terminator) -- TODO confirm
        name_buf = ctypes.create_unicode_buffer(257)
        size = ctypes.c_int(257)
        if GetUserName(name_buf, ctypes.byref(size)):
            return name_buf.value
    return get_unicode_windows_env_var(u'USERNAME')

View File

@ -5,13 +5,14 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, glob, functools, re
from calibre import guess_type
from calibre.customize import FileTypePlugin, MetadataReaderPlugin, \
MetadataWriterPlugin, PreferencesPlugin, InterfaceActionBase, StoreBase
from calibre.customize import (FileTypePlugin, MetadataReaderPlugin,
MetadataWriterPlugin, PreferencesPlugin, InterfaceActionBase, StoreBase)
from calibre.constants import numeric_version
from calibre.ebooks.metadata.archive import ArchiveExtract, get_cbz_metadata
from calibre.ebooks.metadata.opf2 import metadata_to_opf
from calibre.ebooks.html.to_zip import HTML2ZIP
plugins = []
# To archive plugins {{{
class PML2PMLZ(FileTypePlugin):
@ -86,6 +87,8 @@ class TXT2TXTZ(FileTypePlugin):
return list(set(images))
def run(self, path_to_ebook):
from calibre.ebooks.metadata.opf2 import metadata_to_opf
with open(path_to_ebook, 'rb') as ebf:
txt = ebf.read()
base_dir = os.path.dirname(path_to_ebook)
@ -117,6 +120,7 @@ class TXT2TXTZ(FileTypePlugin):
# No images so just import the TXT file.
return path_to_ebook
plugins += [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract,]
# }}}
# Metadata reader plugins {{{
@ -399,6 +403,10 @@ class ZipMetadataReader(MetadataReaderPlugin):
def get_metadata(self, stream, ftype):
    # The zip metadata reader is imported at call time; ``ftype`` is not
    # passed on to it.
    from calibre.ebooks.metadata.zip import get_metadata as read_zip_metadata
    return read_zip_metadata(stream)
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]
# }}}
# Metadata writer plugins {{{
@ -499,107 +507,51 @@ class TXTZMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.metadata.extz import set_metadata
set_metadata(stream, mi)
# }}}
from calibre.ebooks.comic.input import ComicInput
from calibre.ebooks.djvu.input import DJVUInput
from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.fb2.input import FB2Input
from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.htmlz.input import HTMLZInput
from calibre.ebooks.lit.input import LITInput
from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.odt.input import ODTInput
from calibre.ebooks.pdb.input import PDBInput
from calibre.ebooks.azw4.input import AZW4Input
from calibre.ebooks.pdf.input import PDFInput
from calibre.ebooks.pml.input import PMLInput
from calibre.ebooks.rb.input import RBInput
from calibre.web.feeds.input import RecipeInput
from calibre.ebooks.rtf.input import RTFInput
from calibre.ebooks.tcr.input import TCRInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.lrf.input import LRFInput
from calibre.ebooks.chm.input import CHMInput
from calibre.ebooks.snb.input import SNBInput
from calibre.ebooks.epub.output import EPUBOutput
from calibre.ebooks.fb2.output import FB2Output
from calibre.ebooks.lit.output import LITOutput
from calibre.ebooks.lrf.output import LRFOutput
from calibre.ebooks.mobi.output import MOBIOutput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.pdb.output import PDBOutput
from calibre.ebooks.pdf.output import PDFOutput
from calibre.ebooks.pml.output import PMLOutput
from calibre.ebooks.rb.output import RBOutput
from calibre.ebooks.rtf.output import RTFOutput
from calibre.ebooks.tcr.output import TCROutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.txt.output import TXTZOutput
from calibre.ebooks.html.output import HTMLOutput
from calibre.ebooks.htmlz.output import HTMLZOutput
from calibre.ebooks.snb.output import SNBOutput
from calibre.customize.profiles import input_profiles, output_profiles
from calibre.devices.apple.driver import ITUNES
from calibre.devices.hanlin.driver import HANLINV3, HANLINV5, BOOX, SPECTRA
from calibre.devices.blackberry.driver import BLACKBERRY, PLAYBOOK
from calibre.devices.cybook.driver import CYBOOK, ORIZON
from calibre.devices.eb600.driver import (EB600, COOL_ER, SHINEBOOK,
POCKETBOOK360, GER2, ITALICA, ECLICTO, DBOOK, INVESBOOK,
BOOQ, ELONEX, POCKETBOOK301, MENTOR, POCKETBOOK602,
POCKETBOOK701, POCKETBOOK360P, PI2)
from calibre.devices.iliad.driver import ILIAD
from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800
from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI
from calibre.devices.kindle.driver import (KINDLE, KINDLE2, KINDLE_DX,
KINDLE_FIRE)
from calibre.devices.nook.driver import NOOK, NOOK_COLOR
from calibre.devices.prs505.driver import PRS505
from calibre.devices.prst1.driver import PRST1
from calibre.devices.user_defined.driver import USER_DEFINED
from calibre.devices.android.driver import ANDROID, S60, WEBOS
from calibre.devices.nokia.driver import N770, N810, E71X, E52
from calibre.devices.eslick.driver import ESLICK, EBK52
from calibre.devices.nuut2.driver import NUUT2
from calibre.devices.iriver.driver import IRIVER_STORY
from calibre.devices.binatone.driver import README
from calibre.devices.hanvon.driver import (N516, EB511, ALEX, AZBOOKA, THEBOOK,
LIBREAIR, ODYSSEY)
from calibre.devices.edge.driver import EDGE
from calibre.devices.teclast.driver import (TECLAST_K3, NEWSMY, IPAPYRUS,
SOVOS, PICO, SUNSTECH_EB700, ARCHOS7O, STASH, WEXLER)
from calibre.devices.sne.driver import SNE
from calibre.devices.misc import (PALMPRE, AVANT, SWEEX, PDNOVEL,
GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, LUMIREAD, ALURATEK_COLOR,
TREKSTOR, EEEREADER, NEXTBOOK, ADAM, MOOVYBOOK, COBY, EX124G)
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
from calibre.devices.kobo.driver import KOBO
from calibre.devices.bambook.driver import BAMBOOK
from calibre.devices.boeye.driver import BOEYE_BEX, BOEYE_BDX
from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
from calibre.ebooks.epub.fix.unmanifested import Unmanifested
from calibre.ebooks.epub.fix.epubcheck import Epubcheck
plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
Epubcheck, ]
# New metadata download plugins {{{
from calibre.ebooks.metadata.sources.google import GoogleBooks
from calibre.ebooks.metadata.sources.amazon import Amazon
from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
from calibre.ebooks.metadata.sources.isbndb import ISBNDB
from calibre.ebooks.metadata.sources.overdrive import OverDrive
from calibre.ebooks.metadata.sources.douban import Douban
from calibre.ebooks.metadata.sources.ozon import Ozon
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataWriter')]
# }}}
# Conversion plugins {{{
from calibre.ebooks.conversion.plugins.comic_input import ComicInput
from calibre.ebooks.conversion.plugins.djvu_input import DJVUInput
from calibre.ebooks.conversion.plugins.epub_input import EPUBInput
from calibre.ebooks.conversion.plugins.fb2_input import FB2Input
from calibre.ebooks.conversion.plugins.html_input import HTMLInput
from calibre.ebooks.conversion.plugins.htmlz_input import HTMLZInput
from calibre.ebooks.conversion.plugins.lit_input import LITInput
from calibre.ebooks.conversion.plugins.mobi_input import MOBIInput
from calibre.ebooks.conversion.plugins.odt_input import ODTInput
from calibre.ebooks.conversion.plugins.pdb_input import PDBInput
from calibre.ebooks.conversion.plugins.azw4_input import AZW4Input
from calibre.ebooks.conversion.plugins.pdf_input import PDFInput
from calibre.ebooks.conversion.plugins.pml_input import PMLInput
from calibre.ebooks.conversion.plugins.rb_input import RBInput
from calibre.ebooks.conversion.plugins.recipe_input import RecipeInput
from calibre.ebooks.conversion.plugins.rtf_input import RTFInput
from calibre.ebooks.conversion.plugins.tcr_input import TCRInput
from calibre.ebooks.conversion.plugins.txt_input import TXTInput
from calibre.ebooks.conversion.plugins.lrf_input import LRFInput
from calibre.ebooks.conversion.plugins.chm_input import CHMInput
from calibre.ebooks.conversion.plugins.snb_input import SNBInput
from calibre.ebooks.conversion.plugins.epub_output import EPUBOutput
from calibre.ebooks.conversion.plugins.fb2_output import FB2Output
from calibre.ebooks.conversion.plugins.lit_output import LITOutput
from calibre.ebooks.conversion.plugins.lrf_output import LRFOutput
from calibre.ebooks.conversion.plugins.mobi_output import MOBIOutput
from calibre.ebooks.conversion.plugins.oeb_output import OEBOutput
from calibre.ebooks.conversion.plugins.pdb_output import PDBOutput
from calibre.ebooks.conversion.plugins.pdf_output import PDFOutput
from calibre.ebooks.conversion.plugins.pml_output import PMLOutput
from calibre.ebooks.conversion.plugins.rb_output import RBOutput
from calibre.ebooks.conversion.plugins.rtf_output import RTFOutput
from calibre.ebooks.conversion.plugins.tcr_output import TCROutput
from calibre.ebooks.conversion.plugins.txt_output import TXTOutput, TXTZOutput
from calibre.ebooks.conversion.plugins.html_output import HTMLOutput
from calibre.ebooks.conversion.plugins.htmlz_output import HTMLZOutput
from calibre.ebooks.conversion.plugins.snb_output import SNBOutput
plugins += [
ComicInput,
DJVUInput,
@ -642,6 +594,66 @@ plugins += [
HTMLZOutput,
SNBOutput,
]
# }}}
# Catalog plugins {{{
from calibre.library.catalogs.csv_xml import CSV_XML
from calibre.library.catalogs.bibtex import BIBTEX
from calibre.library.catalogs.epub_mobi import EPUB_MOBI
plugins += [CSV_XML, BIBTEX, EPUB_MOBI]
# }}}
# EPUB Fix plugins {{{
from calibre.ebooks.epub.fix.unmanifested import Unmanifested
from calibre.ebooks.epub.fix.epubcheck import Epubcheck
plugins += [Unmanifested, Epubcheck]
# }}}
# Profiles {{{
from calibre.customize.profiles import input_profiles, output_profiles
plugins += input_profiles + output_profiles
# }}}
# Device driver plugins {{{
from calibre.devices.apple.driver import ITUNES
from calibre.devices.hanlin.driver import HANLINV3, HANLINV5, BOOX, SPECTRA
from calibre.devices.blackberry.driver import BLACKBERRY, PLAYBOOK
from calibre.devices.cybook.driver import CYBOOK, ORIZON
from calibre.devices.eb600.driver import (EB600, COOL_ER, SHINEBOOK,
POCKETBOOK360, GER2, ITALICA, ECLICTO, DBOOK, INVESBOOK,
BOOQ, ELONEX, POCKETBOOK301, MENTOR, POCKETBOOK602,
POCKETBOOK701, POCKETBOOK360P, PI2)
from calibre.devices.iliad.driver import ILIAD
from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800
from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI
from calibre.devices.kindle.driver import (KINDLE, KINDLE2, KINDLE_DX,
KINDLE_FIRE)
from calibre.devices.nook.driver import NOOK, NOOK_COLOR
from calibre.devices.prs505.driver import PRS505
from calibre.devices.prst1.driver import PRST1
from calibre.devices.user_defined.driver import USER_DEFINED
from calibre.devices.android.driver import ANDROID, S60, WEBOS
from calibre.devices.nokia.driver import N770, N810, E71X, E52
from calibre.devices.eslick.driver import ESLICK, EBK52
from calibre.devices.nuut2.driver import NUUT2
from calibre.devices.iriver.driver import IRIVER_STORY
from calibre.devices.binatone.driver import README
from calibre.devices.hanvon.driver import (N516, EB511, ALEX, AZBOOKA, THEBOOK,
LIBREAIR, ODYSSEY)
from calibre.devices.edge.driver import EDGE
from calibre.devices.teclast.driver import (TECLAST_K3, NEWSMY, IPAPYRUS,
SOVOS, PICO, SUNSTECH_EB700, ARCHOS7O, STASH, WEXLER)
from calibre.devices.sne.driver import SNE
from calibre.devices.misc import (PALMPRE, AVANT, SWEEX, PDNOVEL,
GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, LUMIREAD, ALURATEK_COLOR,
TREKSTOR, EEEREADER, NEXTBOOK, ADAM, MOOVYBOOK, COBY, EX124G)
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
from calibre.devices.kobo.driver import KOBO
from calibre.devices.bambook.driver import BAMBOOK
from calibre.devices.boeye.driver import BOEYE_BEX, BOEYE_BDX
# Order here matters. The first matched device is the one used.
plugins += [
HANLINV3,
@ -716,11 +728,20 @@ plugins += [
BOEYE_BDX,
USER_DEFINED,
]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataWriter')]
plugins += input_profiles + output_profiles
# }}}
# New metadata download plugins {{{
from calibre.ebooks.metadata.sources.google import GoogleBooks
from calibre.ebooks.metadata.sources.amazon import Amazon
from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
from calibre.ebooks.metadata.sources.isbndb import ISBNDB
from calibre.ebooks.metadata.sources.overdrive import OverDrive
from calibre.ebooks.metadata.sources.douban import Douban
from calibre.ebooks.metadata.sources.ozon import Ozon
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
# }}}
# Interface Actions {{{
@ -1508,6 +1529,7 @@ class StoreVirtualoStore(StoreBase):
headquarters = 'PL'
formats = ['EPUB', 'MOBI', 'PDF']
affiliate = True
class StoreWaterstonesUKStore(StoreBase):
name = 'Waterstones UK'
@ -1622,3 +1644,34 @@ plugins += [
]
# }}}
if __name__ == '__main__':
    # Test load speed: import all builtin plugins in a fresh interpreter,
    # report how long that took and fail if any of the listed heavyweight
    # modules ended up in sys.modules as a side effect of the import.
    import subprocess, textwrap
    try:
        # The script is kept indented for readability; textwrap.dedent()
        # strips the common leading whitespace before it is handed to
        # ``python -c``.
        subprocess.check_call(['python', '-c', textwrap.dedent(
        '''
        from __future__ import print_function
        import time, sys, init_calibre
        st = time.time()
        import calibre.customize.builtins
        t = time.time() - st
        ret = 0

        for x in ('lxml', 'calibre.ebooks.BeautifulSoup', 'uuid',
            'calibre.utils.terminfo', 'calibre.utils.magick', 'PIL', 'Image',
            'sqlite3', 'mechanize', 'httplib', 'xml'):
            if x in sys.modules:
                ret = 1
                print (x, 'has been loaded by a plugin')
        if ret:
            print ('\\nA good way to track down what is loading something is to run'
            ' python -c "import init_calibre; import calibre.customize.builtins"')
            print()
        print ('Time taken to import all plugins: %.2f'%t)
        sys.exit(ret)

        ''')])
    except subprocess.CalledProcessError:
        # The child exited non-zero; propagate failure as exit status 1
        raise SystemExit(1)

View File

@ -5,7 +5,6 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from itertools import izip
from xml.sax.saxutils import escape
from calibre.customize import Plugin as _Plugin
@ -268,6 +267,7 @@ class OutputProfile(Plugin):
@classmethod
def tags_to_string(cls, tags):
    '''
    Join ``tags`` with ``', '`` and XML-escape the result.

    :param tags: iterable of tag strings
    :return: a single string with ``&``, ``<`` and ``>`` escaped
    '''
    # Imported here rather than at module level so that merely importing
    # this module does not pull in the xml package.
    from xml.sax.saxutils import escape
    return escape(', '.join(tags))
class iPadOutput(OutputProfile):

View File

@ -447,11 +447,14 @@ def plugin_for_catalog_format(fmt):
# }}}
def device_plugins(): # {{{
def device_plugins(include_disabled=False): # {{{
for plugin in _initialized_plugins:
if isinstance(plugin, DevicePlugin):
if not is_disabled(plugin):
if include_disabled or not is_disabled(plugin):
if platform in plugin.supported_platforms:
if getattr(plugin, 'plugin_needs_delayed_initialization',
False):
plugin.do_delayed_plugin_initialization()
yield plugin
# }}}
@ -496,7 +499,7 @@ def initialize_plugin(plugin, path_to_zip_file):
def has_external_plugins():
return bool(config['plugins'])
def initialize_plugins():
def initialize_plugins(perf=False):
global _initialized_plugins
_initialized_plugins = []
conflicts = [name for name in config['plugins'] if name in
@ -504,6 +507,11 @@ def initialize_plugins():
for p in conflicts:
remove_plugin(p)
external_plugins = config['plugins']
ostdout, ostderr = sys.stdout, sys.stderr
if perf:
from collections import defaultdict
import time
times = defaultdict(lambda:0)
for zfp in list(external_plugins) + builtin_plugins:
try:
if not isinstance(zfp, type):
@ -516,12 +524,22 @@ def initialize_plugins():
plugin = load_plugin(zfp) if not isinstance(zfp, type) else zfp
except PluginNotFound:
continue
if perf:
st = time.time()
plugin = initialize_plugin(plugin, None if isinstance(zfp, type) else zfp)
if perf:
times[plugin.name] = time.time() - st
_initialized_plugins.append(plugin)
except:
print 'Failed to initialize plugin:', repr(zfp)
if DEBUG:
traceback.print_exc()
# Prevent a custom plugin from overriding stdout/stderr as this breaks
# ipython
sys.stdout, sys.stderr = ostdout, ostderr
if perf:
for x in sorted(times, key=lambda x:times[x]):
print ('%50s: %.3f'%(x, times[x]))
_initialized_plugins.sort(cmp=lambda x,y:cmp(x.priority, y.priority), reverse=True)
reread_filetype_plugins()
reread_metadata_plugins()

View File

@ -38,6 +38,8 @@ class ANDROID(USBMS):
0xca4 : [0x100, 0x0227, 0x0226, 0x222],
0xca9 : [0x100, 0x0227, 0x0226, 0x222],
0xcac : [0x100, 0x0227, 0x0226, 0x222],
0xccf : [0x100, 0x0227, 0x0226, 0x222],
0x2910 : [0x222],
},
# Eken
@ -51,6 +53,7 @@ class ANDROID(USBMS):
0x70c6 : [0x226],
0x4316 : [0x216],
0x42d6 : [0x216],
0x42d7 : [0x216],
},
# Freescale
0x15a2 : {
@ -162,7 +165,7 @@ class ANDROID(USBMS):
'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS',
'TELECHIP', 'HUAWEI', 'T-MOBILE', 'SEMC', 'LGE', 'NVIDIA',
'GENERIC-', 'ZTE', 'MID', 'QUALCOMM', 'PANDIGIT', 'HYSTON',
'VIZIO', 'GOOGLE', 'FREESCAL', 'KOBO_INC', 'LENOVO']
'VIZIO', 'GOOGLE', 'FREESCAL', 'KOBO_INC', 'LENOVO', 'ROCKCHIP']
WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
'__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
@ -175,13 +178,14 @@ class ANDROID(USBMS):
'GT-S5830_CARD', 'GT-S5570_CARD', 'MB870', 'MID7015A',
'ALPANDIGITAL', 'ANDROID_MID', 'VTAB1008', 'EMX51_BBG_ANDROI',
'UMS', '.K080', 'P990', 'LTE', 'MB853', 'GT-S5660_CARD', 'A107',
'GT-I9003_CARD', 'XT912', 'FILE-CD_GADGET']
'GT-I9003_CARD', 'XT912', 'FILE-CD_GADGET', 'RK29_SDK', 'MB855',
'XT910']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
'__UMS_COMPOSITE', 'SGH-I997_CARD', 'MB870', 'ALPANDIGITAL',
'ANDROID_MID', 'P990_SD_CARD', '.K080', 'LTE_CARD', 'MB853',
'A1-07___C0541A4F', 'XT912']
'A1-07___C0541A4F', 'XT912', 'MB855', 'XT910']
OSX_MAIN_MEM = 'Android Device Main Memory'
@ -220,6 +224,20 @@ class ANDROID(USBMS):
drives['main'] = letter_a
return drives
@classmethod
def configure_for_kindle_app(cls):
    '''
    Store driver settings aimed at the Kindle app in the config proxy
    returned by ``cls._configProxy()``: a MOBI/AZW/PDF format map, no
    sub-directories, and ``extra_customization`` set to a comma separated
    list consisting of ``kindle`` followed by ``cls.EBOOK_DIR_MAIN``.
    '''
    proxy = cls._configProxy()
    proxy['format_map'] = ['mobi', 'azw', 'azw1', 'azw4', 'pdf']
    proxy['use_subdirs'] = False
    # NOTE(review): list concatenation assumes EBOOK_DIR_MAIN is a list of
    # strings on this class -- confirm against the class definition.
    proxy['extra_customization'] = ','.join(['kindle']+cls.EBOOK_DIR_MAIN)
@classmethod
def configure_for_generic_epub_app(cls):
    '''
    Delete the ``format_map``, ``use_subdirs`` and ``extra_customization``
    entries from the config proxy returned by ``cls._configProxy()``.
    '''
    # NOTE(review): presumably deleting an entry makes the setting fall back
    # to its default -- confirm against the _configProxy implementation.
    proxy = cls._configProxy()
    del proxy['format_map']
    del proxy['use_subdirs']
    del proxy['extra_customization']
class S60(USBMS):
name = 'S60 driver'

File diff suppressed because it is too large Load Diff

View File

@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
Sanda library wrapper
'''
import ctypes, uuid, hashlib, os, sys
import ctypes, hashlib, os, sys
from threading import Event, Lock
from calibre.constants import iswindows
from calibre import load_library
@ -350,6 +350,7 @@ class Bambook:
return None
def SendFile(self, fileName, guid = None):
import uuid
if self.handle:
taskID = job.NewJob()
if guid:

File diff suppressed because one or more lines are too long

View File

@ -97,3 +97,13 @@ class FOLDER_DEVICE(USBMS):
@classmethod
def settings(self):
    '''
    Return the parsed configuration of ``FOLDER_DEVICE_FOR_CONFIG``.
    '''
    return FOLDER_DEVICE_FOR_CONFIG._config().parse()
@classmethod
def config_widget(cls):
    '''
    Return the configuration widget of ``FOLDER_DEVICE_FOR_CONFIG``.
    '''
    return FOLDER_DEVICE_FOR_CONFIG.config_widget()
@classmethod
def save_settings(cls, config_widget):
    '''
    Pass ``config_widget`` to ``FOLDER_DEVICE_FOR_CONFIG.save_settings`` and
    return its result.

    :param config_widget: the widget to read the settings from
    '''
    return FOLDER_DEVICE_FOR_CONFIG.save_settings(config_widget)

View File

@ -9,7 +9,6 @@ Generates and writes an APNX page mapping file.
'''
import struct
import uuid
from calibre.ebooks.mobi.reader import MobiReader
from calibre.ebooks.pdb.header import PdbHeaderReader
@ -51,6 +50,7 @@ class APNXBuilder(object):
apnxf.write(apnx)
def generate_apnx(self, pages):
import uuid
apnx = ''
content_vals = {

View File

@ -10,10 +10,8 @@ Device driver for Amazon's Kindle
import datetime, os, re, sys, json, hashlib
from calibre.devices.kindle.apnx import APNXBuilder
from calibre.devices.kindle.bookmark import Bookmark
from calibre.devices.usbms.driver import USBMS
from calibre.ebooks.metadata import MetaInformation
from calibre import strftime
'''
@ -152,6 +150,7 @@ class KINDLE(USBMS):
path_map, book_ext = resolve_bookmark_paths(storage, path_map)
bookmarked_books = {}
for id in path_map:
bookmark_ext = path_map[id].rpartition('.')[2]
myBookmark = Bookmark(path_map[id], id, book_ext[id], bookmark_ext)
@ -236,6 +235,8 @@ class KINDLE(USBMS):
def add_annotation_to_library(self, db, db_id, annotation):
from calibre.ebooks.BeautifulSoup import Tag
from calibre.ebooks.metadata import MetaInformation
bm = annotation
ignore_tags = set(['Catalog', 'Clippings'])
@ -363,6 +364,8 @@ class KINDLE2(KINDLE):
'''
Hijacking this function to write the apnx file.
'''
from calibre.devices.kindle.apnx import APNXBuilder
opts = self.settings()
if not opts.extra_customization[self.OPT_APNX]:
return

View File

@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en'
import os
from contextlib import closing
import sqlite3 as sqlite
class Bookmark(): # {{{
'''
@ -32,7 +31,7 @@ class Bookmark(): # {{{
def get_bookmark_data(self):
''' Return the timestamp and last_read_location '''
import sqlite3 as sqlite
user_notes = {}
self.timestamp = os.path.getmtime(self.path)
with closing(sqlite.connect(self.db_path)) as connection:

View File

@ -6,7 +6,6 @@ __copyright__ = '2010, Timothy Legge <timlegge@gmail.com> and Kovid Goyal <kovid
__docformat__ = 'restructuredtext en'
import os, time, calendar
import sqlite3 as sqlite
from contextlib import closing
from calibre.devices.usbms.books import BookList
from calibre.devices.kobo.books import Book
@ -16,7 +15,6 @@ from calibre.devices.mime import mime_type_ext
from calibre.devices.usbms.driver import USBMS, debug_print
from calibre import prints
from calibre.devices.usbms.books import CollectionsBookList
from calibre.utils.magick.draw import save_cover_data_to
from calibre.ptempfile import PersistentTemporaryFile
class KOBO(USBMS):
@ -230,6 +228,7 @@ class KOBO(USBMS):
traceback.print_exc()
return changed
import sqlite3 as sqlite
with closing(sqlite.connect(
self.normalize_path(self._main_prefix +
'.kobo/KoboReader.sqlite'))) as connection:
@ -344,6 +343,7 @@ class KOBO(USBMS):
# 2) volume_shorcover
# 2) content
import sqlite3 as sqlite
debug_print('delete_via_sql: ContentID: ', ContentID, 'ContentType: ', ContentType)
with closing(sqlite.connect(self.normalize_path(self._main_prefix +
'.kobo/KoboReader.sqlite'))) as connection:
@ -739,6 +739,8 @@ class KOBO(USBMS):
# Needs to be outside books collection as in the case of removing
# the last book from the collection the list of books is empty
# and the removal of the last book would not occur
import sqlite3 as sqlite
with closing(sqlite.connect(self.normalize_path(self._main_prefix +
'.kobo/KoboReader.sqlite'))) as connection:
@ -850,6 +852,7 @@ class KOBO(USBMS):
debug_print('FAILED to upload cover', filepath)
def _upload_cover(self, path, filename, metadata, filepath, uploadgrayscale):
from calibre.utils.magick.draw import save_cover_data_to
if metadata.cover:
cover = self.normalize_path(metadata.cover.replace('/', os.sep))
@ -859,6 +862,7 @@ class KOBO(USBMS):
ContentType = self.get_content_type_from_extension(extension) if extension != '' else self.get_content_type_from_path(filepath)
ContentID = self.contentid_from_path(filepath, ContentType)
import sqlite3 as sqlite
with closing(sqlite.connect(self.normalize_path(self._main_prefix +
'.kobo/KoboReader.sqlite'))) as connection:

View File

@ -209,8 +209,8 @@ class ALURATEK_COLOR(USBMS):
EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'books'
VENDOR_NAME = 'USB_2.0'
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'USB_FLASH_DRIVER'
VENDOR_NAME = ['USB_2.0', 'EZREADER']
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['USB_FLASH_DRIVER', '.']
class TREKSTOR(USBMS):

View File

@ -7,8 +7,6 @@ __docformat__ = 'restructuredtext en'
import os, time
from base64 import b64decode
from uuid import uuid4
from lxml import etree
from datetime import date
from calibre import prints, guess_type, isbytestring
@ -78,6 +76,7 @@ def strftime(epoch, zone=time.localtime):
return ' '.join(src)
def uuid():
    '''
    Return a new random (version 4) UUID as an upper case string with its
    first hyphen removed; the remaining hyphens are kept.
    '''
    # Imported lazily so that importing this module does not load uuid
    from uuid import uuid4
    return str(uuid4()).replace('-', '', 1).upper()
# }}}
@ -85,6 +84,8 @@ def uuid():
class XMLCache(object):
def __init__(self, paths, ext_paths, prefixes, use_author_sort):
from lxml import etree
if DEBUG:
debug_print('Building XMLCache...', paths)
self.paths = paths
@ -714,6 +715,8 @@ class XMLCache(object):
def write(self):
from lxml import etree
for i, path in self.paths.items():
self.move_playlists_to_bottom()
self.cleanup_whitespace(i)

View File

@ -12,8 +12,6 @@ Device driver for the SONY T1 devices
'''
import os, time, re
import sqlite3 as sqlite
from sqlite3 import DatabaseError
from contextlib import closing
from datetime import date
@ -146,6 +144,8 @@ class PRST1(USBMS):
return True
def books(self, oncard=None, end_session=True):
import sqlite3 as sqlite
dummy_bl = BookList(None, None, None)
if (
@ -246,6 +246,8 @@ class PRST1(USBMS):
debug_print('PRST1: finished sync_booklists')
def update_device_database(self, booklist, collections_attributes, oncard):
import sqlite3 as sqlite
debug_print('PRST1: starting update_device_database')
plugboard = None
@ -274,6 +276,8 @@ class PRST1(USBMS):
def update_device_books(self, connection, booklist, source_id, plugboard,
dbpath):
from sqlite3 import DatabaseError
opts = self.settings()
upload_covers = opts.extra_customization[self.OPT_UPLOAD_COVERS]
refresh_covers = opts.extra_customization[self.OPT_REFRESH_COVERS]
@ -489,6 +493,8 @@ class PRST1(USBMS):
debug_print('PRS-T1: finished rebuild_collections')
def upload_cover(self, path, filename, metadata, filepath):
import sqlite3 as sqlite
debug_print('PRS-T1: uploading cover')
if filepath.startswith(self._main_prefix):

View File

@ -8,7 +8,7 @@ manner.
import sys, os, re
from threading import RLock
from calibre.constants import iswindows, isosx, plugins, islinux
from calibre.constants import iswindows, isosx, plugins, islinux, isfreebsd
osx_scanner = win_scanner = linux_scanner = None
@ -155,17 +155,78 @@ class LinuxScanner(object):
ans.add(tuple(dev))
return ans
class FreeBSDScanner(object):

    '''
    Callable that scans for USB mass storage devices on FreeBSD by querying
    HAL over the D-Bus system bus.
    '''

    # HAL properties that must all be readable for a device to be reported
    REQUIRED_PROPERTIES = ('usb.vendor_id', 'usb.product_id',
            'usb.device_revision_bcd')
    # HAL properties that are replaced by '' when they cannot be read
    OPTIONAL_PROPERTIES = ('info.vendor', 'info.product', 'usb.serial')

    def __call__(self):
        '''
        Return a set of 7-tuples, one per detected device::

            (usb.vendor_id, usb.product_id, usb.device_revision_bcd,
             info.vendor, info.product, usb.serial, hal_path)

        where ``hal_path`` is the HAL path of the ``da`` disk device whose
        ``umass`` ancestor supplied the other values. If talking to HAL
        fails, the error is written to stderr and whatever was collected up
        to that point is returned.
        '''
        import dbus
        ans = set()
        try:
            bus = dbus.SystemBus()
            manager = dbus.Interface(bus.get_object('org.freedesktop.Hal',
                '/org/freedesktop/Hal/Manager'), 'org.freedesktop.Hal.Manager')
            # Start from every disk handled by the 'da' driver
            for path in manager.FindDeviceStringMatch('freebsd.driver', 'da'):
                objif = self._umass_ancestor(bus, path)
                if objif is None:
                    continue
                try:
                    dev = [objif.GetProperty(name) for name in
                            self.REQUIRED_PROPERTIES]
                except dbus.exceptions.DBusException:
                    continue
                for name in self.OPTIONAL_PROPERTIES:
                    # Best effort: any failure to read an optional property
                    # yields an empty string rather than dropping the device
                    try:
                        dev.append(objif.GetProperty(name))
                    except Exception:
                        dev.append('')
                dev.append(path)
                ans.add(tuple(dev))
        except dbus.exceptions.DBusException as e:
            sys.stderr.write('Execution failed: %s\n' % (e,))
        return ans

    def _umass_ancestor(self, bus, path):
        '''
        Walk up the ``info.parent`` chain of the HAL device at ``path`` and
        return the ``org.freedesktop.Hal.Device`` interface of the first
        ancestor whose ``freebsd.driver`` is ``umass``, or None if the chain
        ends (a D-Bus error while fetching the parent) before one is found.
        '''
        import dbus
        # Errors for the starting device itself are deliberately not caught
        # here, they propagate to the handler in __call__
        obj = bus.get_object('org.freedesktop.Hal', path)
        objif = dbus.Interface(obj, 'org.freedesktop.Hal.Device')
        while True:
            try:
                obj = bus.get_object('org.freedesktop.Hal',
                        objif.GetProperty('info.parent'))
                objif = dbus.Interface(obj, 'org.freedesktop.Hal.Device')
            except dbus.exceptions.DBusException:
                return None
            try:
                if objif.GetProperty('freebsd.driver') == 'umass':
                    return objif
            except dbus.exceptions.DBusException:
                # This ancestor has no driver property, keep climbing
                continue
linux_scanner = None
if islinux:
linux_scanner = LinuxScanner()
freebsd_scanner = None
if isfreebsd:
freebsd_scanner = FreeBSDScanner()
class DeviceScanner(object):
def __init__(self, *args):
if isosx and osx_scanner is None:
raise RuntimeError('The Python extension usbobserver must be available on OS X.')
self.scanner = win_scanner if iswindows else osx_scanner if isosx else linux_scanner
self.scanner = win_scanner if iswindows else osx_scanner if isosx else freebsd_scanner if isfreebsd else linux_scanner
self.devices = []
def scan(self):

View File

@ -591,26 +591,7 @@ class Device(DeviceConfig, DevicePlugin):
mp = self.node_mountpoint(node)
if mp is not None:
return mp, 0
if type == 'main':
label = self.MAIN_MEMORY_VOLUME_LABEL
if type == 'carda':
label = self.STORAGE_CARD_VOLUME_LABEL
if type == 'cardb':
label = self.STORAGE_CARD2_VOLUME_LABEL
if not label:
label = self.STORAGE_CARD_VOLUME_LABEL + ' 2'
if not label:
label = 'E-book Reader (%s)'%type
extra = 0
while True:
q = ' (%d)'%extra if extra else ''
if not os.path.exists('/media/'+label+q):
break
extra += 1
if extra:
label += ' (%d)'%extra
def do_mount(node, label):
def do_mount(node):
try:
from calibre.devices.udisks import mount
mount(node)
@ -621,8 +602,7 @@ class Device(DeviceConfig, DevicePlugin):
traceback.print_exc()
return 1
ret = do_mount(node, label)
ret = do_mount(node)
if ret != 0:
return None, ret
return self.node_mountpoint(node)+'/', 0
@ -697,19 +677,21 @@ class Device(DeviceConfig, DevicePlugin):
self._card_a_prefix = self._card_b_prefix
self._card_b_prefix = None
# ------------------------------------------------------
#
# open for FreeBSD
# find the device node or nodes that match the S/N we already have from the scanner
# and attempt to mount each one
# 1. get list of disk devices from sysctl
# 2. compare that list with the one from camcontrol
# 3. and see if it has a matching s/n
# 6. find any partitions/slices associated with each node
# 7. attempt to mount, using calibre-mount-helper, each one
# 8. when finished, we have a list of mount points and associated device nodes
# find the device node or nodes that match the S/N we already have from the scanner
# and attempt to mount each one
# 1. get list of devices in /dev with matching s/n etc.
# 2. get list of volumes associated with each
# 3. attempt to mount each one using Hal
# 4. when finished, we have a list of mount points and associated dbus nodes
#
def open_freebsd(self):
import dbus
# There should be some way to access the -v arg...
verbose = False
# this gives us access to the S/N, etc. of the reader that the scanner has found
# and the match routines for some of that data, like s/n, vendor ID, etc.
@ -719,128 +701,146 @@ class Device(DeviceConfig, DevicePlugin):
raise DeviceError("Device has no S/N. Can't continue")
return False
devs={}
di=0
ndevs=4 # number of possible devices per reader (main, carda, cardb, launcher)
vols=[]
#get list of disk devices
p=subprocess.Popen(["sysctl", "kern.disks"], stdout=subprocess.PIPE)
kdsks=subprocess.Popen(["sed", "s/kern.disks: //"], stdin=p.stdout, stdout=subprocess.PIPE).communicate()[0]
p.stdout.close()
#print kdsks
for dvc in kdsks.split():
# for each one that's also in the list of cam devices ...
p=subprocess.Popen(["camcontrol", "devlist"], stdout=subprocess.PIPE)
devmatch=subprocess.Popen(["grep", dvc], stdin=p.stdout, stdout=subprocess.PIPE).communicate()[0]
p.stdout.close()
if devmatch:
#print "Checking ", devmatch
# ... see if we can get a S/N from the actual device node
sn=subprocess.Popen(["camcontrol", "inquiry", dvc, "-S"], stdout=subprocess.PIPE).communicate()[0]
sn=sn[0:-1] # drop the trailing newline
#print "S/N = ", sn
if sn and d.match_serial(sn):
# we have a matching s/n, record this device node
#print "match found: ", dvc
devs[di]=dvc
di += 1
bus = dbus.SystemBus()
manager = dbus.Interface(bus.get_object('org.freedesktop.Hal',
'/org/freedesktop/Hal/Manager'), 'org.freedesktop.Hal.Manager')
paths = manager.FindDeviceStringMatch('usb.serial',d.serial)
for path in paths:
objif = dbus.Interface(bus.get_object('org.freedesktop.Hal', path), 'org.freedesktop.Hal.Device')
# Extra paranoia...
try:
if d.idVendor == objif.GetProperty('usb.vendor_id') and \
d.idProduct == objif.GetProperty('usb.product_id') and \
d.manufacturer == objif.GetProperty('usb.vendor') and \
d.product == objif.GetProperty('usb.product') and \
d.serial == objif.GetProperty('usb.serial'):
dpaths = manager.FindDeviceStringMatch('storage.originating_device', path)
for dpath in dpaths:
#devif = dbus.Interface(bus.get_object('org.freedesktop.Hal', dpath), 'org.freedesktop.Hal.Device')
try:
vpaths = manager.FindDeviceStringMatch('block.storage_device', dpath)
for vpath in vpaths:
try:
vdevif = dbus.Interface(bus.get_object('org.freedesktop.Hal', vpath), 'org.freedesktop.Hal.Device')
if not vdevif.GetProperty('block.is_volume'):
continue
if vdevif.GetProperty('volume.fsusage') != 'filesystem':
continue
volif = dbus.Interface(bus.get_object('org.freedesktop.Hal', vpath), 'org.freedesktop.Hal.Device.Volume')
pdevif = dbus.Interface(bus.get_object('org.freedesktop.Hal', vdevif.GetProperty('info.parent')), 'org.freedesktop.Hal.Device')
vol = {'node': pdevif.GetProperty('block.device'),
'dev': vdevif,
'vol': volif,
'label': vdevif.GetProperty('volume.label')}
vols.append(vol)
except dbus.exceptions.DBusException, e:
print e
continue
except dbus.exceptions.DBusException, e:
print e
continue
except dbus.exceptions.DBusException, e:
continue
# sort the list of devices
for i in range(1,ndevs+1):
for j in reversed(range(1,i)):
if devs[j-1] > devs[j]:
x=devs[j-1]
devs[j-1]=devs[j]
devs[j]=x
#print devs
def ocmp(x,y):
    '''
    cmp-style comparison of two volume dicts by their ``'node'`` entry:
    returns -1, 1 or 0.
    '''
    if x['node'] < y['node']:
        return -1
    if x['node'] > y['node']:
        return 1
    return 0
vols.sort(cmp=ocmp)
if verbose:
print "FBSD: ", vols
# now we need to see if any of these have slices/partitions
mtd=0
label="READER" # could use something more unique, like S/N or productID...
cmd = '/usr/local/bin/calibre-mount-helper'
cmd = [cmd, 'mount']
for i in range(0,ndevs):
cmd2="ls /dev/"+devs[i]+"*"
p=subprocess.Popen(cmd2, shell=True, stdout=subprocess.PIPE)
devs[i]=subprocess.Popen(["cut", "-d", "/", "-f" "3"], stdin=p.stdout, stdout=subprocess.PIPE).communicate()[0]
p.stdout.close()
# try all the nodes to see what we can mount
for dev in devs[i].split():
mp='/media/'+label+'-'+dev
mmp = mp
if mmp.endswith('/'):
mmp = mmp[:-1]
#print "trying ", dev, "on", mp
for vol in vols:
mp = ''
if vol['dev'].GetProperty('volume.is_mounted'):
mp = vol['dev'].GetProperty('volume.mount_point')
else:
try:
p = subprocess.Popen(cmd + ["/dev/"+dev, mmp])
except OSError:
raise DeviceError(_('Could not find mount helper: %s.')%cmd[0])
while p.poll() is None:
time.sleep(0.1)
vol['vol'].Mount('Calibre-'+vol['label'],
vol['dev'].GetProperty('volume.fstype'), [])
loops = 0
while not vol['dev'].GetProperty('volume.is_mounted'):
time.sleep(1)
loops += 1
if loops > 100:
print "ERROR: Timeout waiting for mount to complete"
continue
mp = vol['dev'].GetProperty('volume.mount_point')
except dbus.exceptions.DBusException, e:
print "Failed to mount ", e
continue
if p.returncode == 0:
#print " mounted", dev
if i == 0:
self._main_prefix = mp
self._main_dev = "/dev/"+dev
#print "main = ", self._main_dev, self._main_prefix
if i == 1:
self._card_a_prefix = mp
self._card_a_dev = "/dev/"+dev
#print "card a = ", self._card_a_dev, self._card_a_prefix
if i == 2:
self._card_b_prefix = mp
self._card_b_dev = "/dev/"+dev
#print "card b = ", self._card_b_dev, self._card_b_prefix
# Mount Point becomes Mount Path
mp += '/'
mtd += 1
break
if verbose:
print "FBSD: mounted", vol['label'], "on", mp
if mtd == 0:
self._main_prefix = mp
self._main_vol = vol['vol']
if verbose:
print "FBSD: main = ", self._main_prefix
if mtd == 1:
self._card_a_prefix = mp
self._card_a_vol = vol['vol']
if verbose:
print "FBSD: card a = ", self._card_a_prefix
if mtd == 2:
self._card_b_prefix = mp
self._card_b_vol = vol['vol']
if verbose:
print "FBSD: card b = ", self._card_b_prefix
# Note that mtd is used as a bool... not incrementing is fine.
break
mtd += 1
if mtd > 0:
return True
else :
return False
raise DeviceError(_('Unable to mount the device'))
#
# ------------------------------------------------------
#
# this one is pretty simple:
# just umount each of the previously
# mounted filesystems, using the mount helper
# this one is pretty simple:
# just umount each of the previously
# mounted filesystems, using the stored volume object
#
def eject_freebsd(self):
cmd = '/usr/local/bin/calibre-mount-helper'
cmd = [cmd, 'eject']
import dbus
# There should be some way to access the -v arg...
verbose = False
if self._main_prefix:
#print "umount main:", cmd, self._main_dev, self._main_prefix
if verbose:
print "FBSD: umount main:", self._main_prefix
try:
p = subprocess.Popen(cmd + [self._main_dev, self._main_prefix])
except OSError:
raise DeviceError(
_('Could not find mount helper: %s.')%cmd[0])
while p.poll() is None:
time.sleep(0.1)
self._main_vol.Unmount([])
except dbus.exceptions.DBusException, e:
print 'Unable to eject ', e
if self._card_a_prefix:
#print "umount card a:", cmd, self._card_a_dev, self._card_a_prefix
if verbose:
print "FBSD: umount card a:", self._card_a_prefix
try:
p = subprocess.Popen(cmd + [self._card_a_dev, self._card_a_prefix])
except OSError:
raise DeviceError(
_('Could not find mount helper: %s.')%cmd[0])
while p.poll() is None:
time.sleep(0.1)
self._card_a_vol.Unmount([])
except dbus.exceptions.DBusException, e:
print 'Unable to eject ', e
if self._card_b_prefix:
#print "umount card b:", cmd, self._card_b_dev, self._card_b_prefix
if verbose:
print "FBSD: umount card b:", self._card_b_prefix
try:
p = subprocess.Popen(cmd + [self._card_b_dev, self._card_b_prefix])
except OSError:
raise DeviceError(
_('Could not find mount helper: %s.')%cmd[0])
while p.poll() is None:
time.sleep(0.1)
self._card_b_vol.Unmount([])
except dbus.exceptions.DBusException, e:
print 'Unable to eject ', e
self._main_prefix = None
self._card_a_prefix = None
@ -859,11 +859,10 @@ class Device(DeviceConfig, DevicePlugin):
time.sleep(7)
self.open_linux()
if isfreebsd:
self._main_dev = self._card_a_dev = self._card_b_dev = None
self._main_vol = self._card_a_vol = self._card_b_vol = None
try:
self.open_freebsd()
except DeviceError:
subprocess.Popen(["camcontrol", "rescan", "all"])
time.sleep(2)
self.open_freebsd()
if iswindows:

View File

@ -10,7 +10,7 @@ driver. It is intended to be subclassed with the relevant parts implemented
for a particular device.
'''
import os, re, time, json, uuid, functools, shutil
import os, re, time, json, functools, shutil
from itertools import cycle
from calibre.constants import numeric_version
@ -58,6 +58,7 @@ class USBMS(CLI, Device):
SCAN_FROM_ROOT = False
def _update_driveinfo_record(self, dinfo, prefix, location_code, name=None):
import uuid
if not isinstance(dinfo, dict):
dinfo = {}
if dinfo.get('device_store_uuid', None) is None:

View File

@ -90,6 +90,10 @@ class USER_DEFINED(USBMS):
OPT_CARD_A_FOLDER = 9
# Flag this plugin as needing delayed initialization, then run the base
# USBMS initialization.
def initialize(self):
# The flag is set before delegating to the base class initializer.
self.plugin_needs_delayed_initialization = True
USBMS.initialize(self)
def do_delayed_plugin_initialization(self):
try:
e = self.settings().extra_customization
self.VENDOR_ID = int(e[self.OPT_USB_VENDOR_ID], 16)
@ -107,4 +111,6 @@ class USER_DEFINED(USBMS):
except:
import traceback
traceback.print_exc()
USBMS.initialize(self)
self.plugin_needs_delayed_initialization = False

View File

@ -8,7 +8,6 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, codecs
from chardet import detect
ENCODING_PATS = [
re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
@ -34,8 +33,13 @@ def substitute_entites(raw):
_CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" }
def detect(*args, **kwargs):
    '''
    Forward all positional and keyword arguments to ``chardet.detect`` and
    return its result. chardet is imported only when this function is called.
    '''
    from chardet import detect as chardet_detect
    return chardet_detect(*args, **kwargs)
def force_encoding(raw, verbose, assume_utf8=False):
from calibre.constants import preferred_encoding
try:
chardet = detect(raw[:1024*50])
except:

View File

@ -7,11 +7,10 @@ __docformat__ = 'restructuredtext en'
Based on ideas from comiclrf created by FangornUK.
'''
import os, shutil, traceback, textwrap, time, codecs
import os, traceback, time
from Queue import Empty
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import extract, CurrentDir, prints, walk
from calibre import extract, prints, walk
from calibre.constants import filesystem_encoding
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.ipc.server import Server
@ -273,245 +272,4 @@ def process_pages(pages, opts, update, tdir):
return ans, failures
class ComicInput(InputFormatPlugin):
name = 'Comic Input'
author = 'Kovid Goyal'
description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
file_types = set(['cbz', 'cbr', 'cbc'])
is_image_collection = True
core_usage = -1
options = set([
OptionRecommendation(name='colors', recommended_value=256,
help=_('Number of colors for grayscale image conversion. Default: '
'%default. Values of less than 256 may result in blurred text '
'on your device if you are creating your comics in EPUB format.')),
OptionRecommendation(name='dont_normalize', recommended_value=False,
help=_('Disable normalize (improve contrast) color range '
'for pictures. Default: False')),
OptionRecommendation(name='keep_aspect_ratio', recommended_value=False,
help=_('Maintain picture aspect ratio. Default is to fill the screen.')),
OptionRecommendation(name='dont_sharpen', recommended_value=False,
help=_('Disable sharpening.')),
OptionRecommendation(name='disable_trim', recommended_value=False,
help=_('Disable trimming of comic pages. For some comics, '
'trimming might remove content as well as borders.')),
OptionRecommendation(name='landscape', recommended_value=False,
help=_("Don't split landscape images into two portrait images")),
OptionRecommendation(name='wide', recommended_value=False,
help=_("Keep aspect ratio and scale image using screen height as "
"image width for viewing in landscape mode.")),
OptionRecommendation(name='right2left', recommended_value=False,
help=_('Used for right-to-left publications like manga. '
'Causes landscape pages to be split into portrait pages '
'from right to left.')),
OptionRecommendation(name='despeckle', recommended_value=False,
help=_('Enable Despeckle. Reduces speckle noise. '
'May greatly increase processing time.')),
OptionRecommendation(name='no_sort', recommended_value=False,
help=_("Don't sort the files found in the comic "
"alphabetically by name. Instead use the order they were "
"added to the comic.")),
OptionRecommendation(name='output_format', choices=['png', 'jpg'],
recommended_value='png', help=_('The format that images in the created ebook '
'are converted to. You can experiment to see which format gives '
'you optimal size and look on your device.')),
OptionRecommendation(name='no_process', recommended_value=False,
help=_("Apply no processing to the image")),
OptionRecommendation(name='dont_grayscale', recommended_value=False,
help=_('Do not convert the image to grayscale (black and white)')),
OptionRecommendation(name='comic_image_size', recommended_value=None,
help=_('Specify the image size as widthxheight pixels. Normally,'
' an image size is automatically calculated from the output '
'profile, this option overrides it.')),
OptionRecommendation(name='dont_add_comic_pages_to_toc', recommended_value=False,
help=_('When converting a CBC do not add links to each page to'
' the TOC. Note this only applies if the TOC has more than one'
' section')),
])
recommendations = set([
('margin_left', 0, OptionRecommendation.HIGH),
('margin_top', 0, OptionRecommendation.HIGH),
('margin_right', 0, OptionRecommendation.HIGH),
('margin_bottom', 0, OptionRecommendation.HIGH),
('insert_blank_line', False, OptionRecommendation.HIGH),
('remove_paragraph_spacing', False, OptionRecommendation.HIGH),
('change_justification', 'left', OptionRecommendation.HIGH),
('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
('chapter', None, OptionRecommendation.HIGH),
('page_breaks_brefore', None, OptionRecommendation.HIGH),
('use_auto_toc', False, OptionRecommendation.HIGH),
('page_breaks_before', None, OptionRecommendation.HIGH),
('disable_font_rescaling', True, OptionRecommendation.HIGH),
('linearize_tables', False, OptionRecommendation.HIGH),
])
def get_comics_from_collection(self, stream):
    '''
    Extract a comic collection archive and return the comics it lists.

    The archive is unzipped into a persistent temporary directory and its
    ``comics.txt`` index is read. Each non-empty line has the form
    ``filename:title``; when the title is missing, the file's base name
    without extension is used instead.

    :param stream: Open file object for the collection; ``stream.name`` is
                   used in error messages.
    :return: List of ``[title, absolute_path]`` pairs, one per readable comic.
    :raises ValueError: If ``comics.txt`` is missing or no listed comic is
                        readable.
    '''
    from calibre.libunzip import extract as zipextract
    tdir = PersistentTemporaryDirectory('_comic_collection')
    zipextract(stream, tdir)
    comics = []
    with CurrentDir(tdir):
        if not os.path.exists('comics.txt'):
            raise ValueError((
                '%s is not a valid comic collection'
                ' no comics.txt was found in the file')
                    %stream.name)
        # Close the index file deterministically instead of relying on
        # garbage collection.
        with open('comics.txt', 'rb') as f:
            raw = f.read()
        # Decode according to the byte order mark; the decoded BOM character
        # is dropped with [1:]. Without a BOM the data is treated as UTF-8.
        if raw.startswith(codecs.BOM_UTF16_BE):
            raw = raw.decode('utf-16-be')[1:]
        elif raw.startswith(codecs.BOM_UTF16_LE):
            raw = raw.decode('utf-16-le')[1:]
        elif raw.startswith(codecs.BOM_UTF8):
            raw = raw.decode('utf-8')[1:]
        else:
            raw = raw.decode('utf-8')
        for line in raw.splitlines():
            line = line.strip()
            if not line:
                continue
            # Split once on the first ':' into file name and title.
            fname, sep, title = line.partition(':')
            fname = fname.replace('#', '_')
            fname = os.path.join(tdir, *fname.split('/'))
            if not title:
                title = os.path.basename(fname).rpartition('.')[0]
            # Entries whose file is not readable are silently skipped.
            if os.access(fname, os.R_OK):
                comics.append([title, fname])
    if not comics:
        raise ValueError('%s has no comics'%stream.name)
    return comics
def get_pages(self, comic, tdir2):
    '''
    Extract one comic and return the list of page image paths in `tdir2`.

    When ``self.opts.no_process`` is set the pages are copied unchanged into
    `tdir2`; otherwise they are run through ``process_pages`` and any
    failures are logged as warnings.

    :param comic: Path of the comic archive to extract.
    :param tdir2: Directory that receives the copied/processed pages.
    :return: List of page paths.
    :raises ValueError: If no pages are found, or none survive processing.
    '''
    tdir = extract_comic(comic)
    new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort,
            verbose=self.opts.verbose)
    if not new_pages:
        raise ValueError('Could not find any pages in the comic: %s'
                %comic)
    if self.opts.no_process:
        copied = []
        for page in new_pages:
            copied.append(os.path.join(tdir2, os.path.basename(page)))
            shutil.copyfile(page, copied[-1])
        new_pages = copied
    else:
        new_pages, failures = process_pages(new_pages, self.opts,
                self.report_progress, tdir2)
        if failures:
            self.log.warning('Could not process the following pages '
            '(run with --verbose to see why):')
            for f in failures:
                self.log.warning('\t', f)
        if not new_pages:
            raise ValueError('Could not find any valid pages in comic: %s'
                    % comic)
        # The former 'thumbnail' local was computed here but never used or
        # returned, so that dead code has been removed.
    return new_pages
def get_images(self):
    '''
    Return the ``_images`` attribute of this instance (the same object, not
    a copy).
    '''
    images = self._images
    return images
def convert(self, stream, opts, file_ext, log, accelerators):
    '''
    Convert a comic (or a ``cbc`` collection of comics) into wrapper XHTML
    pages plus ``metadata.opf`` and ``toc.ncx`` in the current directory.

    :param stream: Open input file; it is closed by this method and
                   ``stream.name`` is used for paths and the book title.
    :param opts: Conversion options; stored on ``self.opts``.
    :param file_ext: Input extension; ``'cbc'`` selects collection handling.
    :param log: Logger; stored on ``self.log``.
    :param accelerators: Not used by this method.
    :return: Absolute path of the generated ``metadata.opf``.
    :raises ValueError: If no comic yields any pages.
    '''
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ebooks.metadata.opf2 import OPFCreator
    from calibre.ebooks.metadata.toc import TOC

    self.opts, self.log= opts, log
    if file_ext == 'cbc':
        comics_ = self.get_comics_from_collection(stream)
    else:
        comics_ = [['Comic', os.path.abspath(stream.name)]]
    stream.close()
    comics = []
    for i, x in enumerate(comics_):
        title, fname = x
        # A single comic is unpacked into the current directory, several
        # comics each get their own comic_N sub-directory.
        cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.'
        cdir = os.path.abspath(cdir)
        if not os.path.exists(cdir):
            os.makedirs(cdir)
        pages = self.get_pages(fname, cdir)
        if not pages: continue
        wrappers = self.create_wrappers(pages)
        comics.append((title, pages, wrappers))

    if not comics:
        raise ValueError('No comic pages found in %s'%stream.name)

    mi  = MetaInformation(os.path.basename(stream.name).rpartition('.')[0],
        [_('Unknown')])
    opf = OPFCreator(os.path.abspath('.'), mi)
    entries = []

    def href(x):
        # One comic: bare file name. Several: "<comic dir>/<file name>".
        if len(comics) == 1: return os.path.basename(x)
        return '/'.join(x.split(os.sep)[-2:])

    for comic in comics:
        pages, wrappers = comic[1:]
        entries += [(w, None) for w in map(href, wrappers)] + \
                [(x, None) for x in map(href, pages)]
    opf.create_manifest(entries)
    spine = []
    for comic in comics:
        spine.extend(map(href, comic[2]))
    self._images = []
    for comic in comics:
        self._images.extend(comic[1])
    opf.create_spine(spine)
    toc = TOC()
    if len(comics) == 1:
        wrappers = comics[0][2]
        for i, x in enumerate(wrappers):
            toc.add_item(href(x), None, _('Page')+' %d'%(i+1),
                    play_order=i)
    else:
        po = 0
        for comic in comics:
            po += 1
            wrappers = comic[2]
            stoc = toc.add_item(href(wrappers[0]),
                    None, comic[0], play_order=po)
            if not opts.dont_add_comic_pages_to_toc:
                for i, x in enumerate(wrappers):
                    stoc.add_item(href(x), None,
                            _('Page')+' %d'%(i+1), play_order=po)
                    po += 1
    opf.set_toc(toc)
    # Close both output files explicitly so their contents are flushed to
    # disk before the OPF path is handed back to the caller.
    with open('metadata.opf', 'wb') as m:
        with open('toc.ncx', 'wb') as n:
            opf.render(m, n, 'toc.ncx')
    return os.path.abspath('metadata.opf')
def create_wrappers(self, pages):
    '''
    Write one XHTML wrapper file per page image and return their paths.

    The wrappers are named ``page_N.xhtml`` (N starting at 1) and are
    created in the directory of the first page.

    :param pages: Non-empty list of page image paths.
    :return: List of wrapper file paths, in page order.
    '''
    from calibre.ebooks.oeb.base import XHTML_NS
    # The template lines are kept exactly as they appear in the source text.
    template = textwrap.dedent('''\
<html xmlns="%s">
<head>
<title>Page #%d</title>
<style type="text/css">
@page { margin:0pt; padding: 0pt}
body { margin: 0pt; padding: 0pt}
div { text-align: center }
</style>
</head>
<body>
<div>
<img src="%s" alt="comic page #%d" />
</div>
</body>
</html>
''')
    out_dir = os.path.dirname(pages[0])
    wrappers = []
    for i, page in enumerate(pages):
        number = i + 1
        markup = template%(XHTML_NS, number, os.path.basename(page), number)
        wrapper_path = os.path.join(out_dir, 'page_%d.xhtml'%number)
        # Close each wrapper file as soon as it is written instead of
        # leaving the handle to the garbage collector.
        with open(wrapper_path, 'wb') as f:
            f.write(markup)
        wrappers.append(wrapper_path)
    return wrappers

View File

@ -0,0 +1,11 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@ -7,8 +7,6 @@ __docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.azw4.reader import Reader
class AZW4Input(InputFormatPlugin):
@ -19,6 +17,9 @@ class AZW4Input(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.azw4.reader import Reader
header = PdbHeaderReader(stream)
reader = Reader(header, stream, log, options)
opf = reader.extract_content(os.getcwd())

View File

@ -3,9 +3,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
' and Alex Bramley <a.bramley at gmail.com>.'
import os, uuid
from lxml import html
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
@ -77,7 +75,7 @@ class CHMInput(InputFormatPlugin):
def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
# use HTMLInput plugin to generate book
from calibre.ebooks.html.input import HTMLInput
from calibre.customize.builtins import HTMLInput
opts.breadth_first = True
htmlinput = HTMLInput(None)
oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
@ -85,6 +83,8 @@ class CHMInput(InputFormatPlugin):
def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
import uuid
from lxml import html
from calibre.ebooks.conversion.plumber import create_oebbook
from calibre.ebooks.oeb.base import DirContainer
oeb = create_oebbook(log, None, opts,
@ -142,6 +142,7 @@ class CHMInput(InputFormatPlugin):
return oeb
def _create_html_root(self, hhcpath, log):
from lxml import html
hhcdata = self._read_file(hhcpath)
hhcroot = html.fromstring(hhcdata)
chapters = self._process_nodes(hhcroot)

View File

@ -0,0 +1,259 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Based on ideas from comiclrf created by FangornUK.
'''
import shutil, textwrap, codecs, os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import CurrentDir
from calibre.ptempfile import PersistentTemporaryDirectory
class ComicInput(InputFormatPlugin):
name = 'Comic Input'
author = 'Kovid Goyal'
description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
file_types = set(['cbz', 'cbr', 'cbc'])
is_image_collection = True
core_usage = -1
options = set([
OptionRecommendation(name='colors', recommended_value=256,
help=_('Number of colors for grayscale image conversion. Default: '
'%default. Values of less than 256 may result in blurred text '
'on your device if you are creating your comics in EPUB format.')),
OptionRecommendation(name='dont_normalize', recommended_value=False,
help=_('Disable normalize (improve contrast) color range '
'for pictures. Default: False')),
OptionRecommendation(name='keep_aspect_ratio', recommended_value=False,
help=_('Maintain picture aspect ratio. Default is to fill the screen.')),
OptionRecommendation(name='dont_sharpen', recommended_value=False,
help=_('Disable sharpening.')),
OptionRecommendation(name='disable_trim', recommended_value=False,
help=_('Disable trimming of comic pages. For some comics, '
'trimming might remove content as well as borders.')),
OptionRecommendation(name='landscape', recommended_value=False,
help=_("Don't split landscape images into two portrait images")),
OptionRecommendation(name='wide', recommended_value=False,
help=_("Keep aspect ratio and scale image using screen height as "
"image width for viewing in landscape mode.")),
OptionRecommendation(name='right2left', recommended_value=False,
help=_('Used for right-to-left publications like manga. '
'Causes landscape pages to be split into portrait pages '
'from right to left.')),
OptionRecommendation(name='despeckle', recommended_value=False,
help=_('Enable Despeckle. Reduces speckle noise. '
'May greatly increase processing time.')),
OptionRecommendation(name='no_sort', recommended_value=False,
help=_("Don't sort the files found in the comic "
"alphabetically by name. Instead use the order they were "
"added to the comic.")),
OptionRecommendation(name='output_format', choices=['png', 'jpg'],
recommended_value='png', help=_('The format that images in the created ebook '
'are converted to. You can experiment to see which format gives '
'you optimal size and look on your device.')),
OptionRecommendation(name='no_process', recommended_value=False,
help=_("Apply no processing to the image")),
OptionRecommendation(name='dont_grayscale', recommended_value=False,
help=_('Do not convert the image to grayscale (black and white)')),
OptionRecommendation(name='comic_image_size', recommended_value=None,
help=_('Specify the image size as widthxheight pixels. Normally,'
' an image size is automatically calculated from the output '
'profile, this option overrides it.')),
OptionRecommendation(name='dont_add_comic_pages_to_toc', recommended_value=False,
help=_('When converting a CBC do not add links to each page to'
' the TOC. Note this only applies if the TOC has more than one'
' section')),
])
recommendations = set([
('margin_left', 0, OptionRecommendation.HIGH),
('margin_top', 0, OptionRecommendation.HIGH),
('margin_right', 0, OptionRecommendation.HIGH),
('margin_bottom', 0, OptionRecommendation.HIGH),
('insert_blank_line', False, OptionRecommendation.HIGH),
('remove_paragraph_spacing', False, OptionRecommendation.HIGH),
('change_justification', 'left', OptionRecommendation.HIGH),
('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
('chapter', None, OptionRecommendation.HIGH),
('page_breaks_brefore', None, OptionRecommendation.HIGH),
('use_auto_toc', False, OptionRecommendation.HIGH),
('page_breaks_before', None, OptionRecommendation.HIGH),
('disable_font_rescaling', True, OptionRecommendation.HIGH),
('linearize_tables', False, OptionRecommendation.HIGH),
])
def get_comics_from_collection(self, stream):
    '''
    Extract a comic collection archive and return the comics it lists.

    The archive is unzipped into a persistent temporary directory and its
    ``comics.txt`` index is read. Each non-empty line has the form
    ``filename:title``; when the title is missing, the file's base name
    without extension is used instead.

    :param stream: Open file object for the collection; ``stream.name`` is
                   used in error messages.
    :return: List of ``[title, absolute_path]`` pairs, one per readable comic.
    :raises ValueError: If ``comics.txt`` is missing or no listed comic is
                        readable.
    '''
    from calibre.libunzip import extract as zipextract
    tdir = PersistentTemporaryDirectory('_comic_collection')
    zipextract(stream, tdir)
    comics = []
    with CurrentDir(tdir):
        if not os.path.exists('comics.txt'):
            raise ValueError((
                '%s is not a valid comic collection'
                ' no comics.txt was found in the file')
                    %stream.name)
        # Close the index file deterministically instead of relying on
        # garbage collection.
        with open('comics.txt', 'rb') as f:
            raw = f.read()
        # Decode according to the byte order mark; the decoded BOM character
        # is dropped with [1:]. Without a BOM the data is treated as UTF-8.
        if raw.startswith(codecs.BOM_UTF16_BE):
            raw = raw.decode('utf-16-be')[1:]
        elif raw.startswith(codecs.BOM_UTF16_LE):
            raw = raw.decode('utf-16-le')[1:]
        elif raw.startswith(codecs.BOM_UTF8):
            raw = raw.decode('utf-8')[1:]
        else:
            raw = raw.decode('utf-8')
        for line in raw.splitlines():
            line = line.strip()
            if not line:
                continue
            # Split once on the first ':' into file name and title.
            fname, sep, title = line.partition(':')
            fname = fname.replace('#', '_')
            fname = os.path.join(tdir, *fname.split('/'))
            if not title:
                title = os.path.basename(fname).rpartition('.')[0]
            # Entries whose file is not readable are silently skipped.
            if os.access(fname, os.R_OK):
                comics.append([title, fname])
    if not comics:
        raise ValueError('%s has no comics'%stream.name)
    return comics
def get_pages(self, comic, tdir2):
    '''
    Extract one comic and return the list of page image paths in `tdir2`.

    When ``self.opts.no_process`` is set the pages are copied unchanged into
    `tdir2`; otherwise they are run through ``process_pages`` and any
    failures are logged as warnings.

    :param comic: Path of the comic archive to extract.
    :param tdir2: Directory that receives the copied/processed pages.
    :return: List of page paths.
    :raises ValueError: If no pages are found, or none survive processing.
    '''
    # Imported here rather than at module level, as in the original method.
    from calibre.ebooks.comic.input import (extract_comic, process_pages,
            find_pages)
    tdir = extract_comic(comic)
    new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort,
            verbose=self.opts.verbose)
    if not new_pages:
        raise ValueError('Could not find any pages in the comic: %s'
                %comic)
    if self.opts.no_process:
        copied = []
        for page in new_pages:
            copied.append(os.path.join(tdir2, os.path.basename(page)))
            shutil.copyfile(page, copied[-1])
        new_pages = copied
    else:
        new_pages, failures = process_pages(new_pages, self.opts,
                self.report_progress, tdir2)
        if failures:
            self.log.warning('Could not process the following pages '
            '(run with --verbose to see why):')
            for f in failures:
                self.log.warning('\t', f)
        if not new_pages:
            raise ValueError('Could not find any valid pages in comic: %s'
                    % comic)
        # The former 'thumbnail' local was computed here but never used or
        # returned, so that dead code has been removed.
    return new_pages
def get_images(self):
    '''
    Return the ``_images`` attribute of this instance (the same object, not
    a copy).
    '''
    images = self._images
    return images
def convert(self, stream, opts, file_ext, log, accelerators):
    '''
    Convert a comic (or a ``cbc`` collection of comics) into wrapper XHTML
    pages plus ``metadata.opf`` and ``toc.ncx`` in the current directory.

    :param stream: Open input file; it is closed by this method and
                   ``stream.name`` is used for paths and the book title.
    :param opts: Conversion options; stored on ``self.opts``.
    :param file_ext: Input extension; ``'cbc'`` selects collection handling.
    :param log: Logger; stored on ``self.log``.
    :param accelerators: Not used by this method.
    :return: Absolute path of the generated ``metadata.opf``.
    :raises ValueError: If no comic yields any pages.
    '''
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ebooks.metadata.opf2 import OPFCreator
    from calibre.ebooks.metadata.toc import TOC

    self.opts, self.log= opts, log
    if file_ext == 'cbc':
        comics_ = self.get_comics_from_collection(stream)
    else:
        comics_ = [['Comic', os.path.abspath(stream.name)]]
    stream.close()
    comics = []
    for i, x in enumerate(comics_):
        title, fname = x
        # A single comic is unpacked into the current directory, several
        # comics each get their own comic_N sub-directory.
        cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.'
        cdir = os.path.abspath(cdir)
        if not os.path.exists(cdir):
            os.makedirs(cdir)
        pages = self.get_pages(fname, cdir)
        if not pages: continue
        wrappers = self.create_wrappers(pages)
        comics.append((title, pages, wrappers))

    if not comics:
        raise ValueError('No comic pages found in %s'%stream.name)

    mi  = MetaInformation(os.path.basename(stream.name).rpartition('.')[0],
        [_('Unknown')])
    opf = OPFCreator(os.path.abspath('.'), mi)
    entries = []

    def href(x):
        # One comic: bare file name. Several: "<comic dir>/<file name>".
        if len(comics) == 1: return os.path.basename(x)
        return '/'.join(x.split(os.sep)[-2:])

    for comic in comics:
        pages, wrappers = comic[1:]
        entries += [(w, None) for w in map(href, wrappers)] + \
                [(x, None) for x in map(href, pages)]
    opf.create_manifest(entries)
    spine = []
    for comic in comics:
        spine.extend(map(href, comic[2]))
    self._images = []
    for comic in comics:
        self._images.extend(comic[1])
    opf.create_spine(spine)
    toc = TOC()
    if len(comics) == 1:
        wrappers = comics[0][2]
        for i, x in enumerate(wrappers):
            toc.add_item(href(x), None, _('Page')+' %d'%(i+1),
                    play_order=i)
    else:
        po = 0
        for comic in comics:
            po += 1
            wrappers = comic[2]
            stoc = toc.add_item(href(wrappers[0]),
                    None, comic[0], play_order=po)
            if not opts.dont_add_comic_pages_to_toc:
                for i, x in enumerate(wrappers):
                    stoc.add_item(href(x), None,
                            _('Page')+' %d'%(i+1), play_order=po)
                    po += 1
    opf.set_toc(toc)
    # Close both output files explicitly so their contents are flushed to
    # disk before the OPF path is handed back to the caller.
    with open('metadata.opf', 'wb') as m:
        with open('toc.ncx', 'wb') as n:
            opf.render(m, n, 'toc.ncx')
    return os.path.abspath('metadata.opf')
def create_wrappers(self, pages):
    '''
    Write one XHTML wrapper file per page image and return their paths.

    The wrappers are named ``page_N.xhtml`` (N starting at 1) and are
    created in the directory of the first page.

    :param pages: Non-empty list of page image paths.
    :return: List of wrapper file paths, in page order.
    '''
    from calibre.ebooks.oeb.base import XHTML_NS
    # The template lines are kept exactly as they appear in the source text.
    template = textwrap.dedent('''\
<html xmlns="%s">
<head>
<title>Page #%d</title>
<style type="text/css">
@page { margin:0pt; padding: 0pt}
body { margin: 0pt; padding: 0pt}
div { text-align: center }
</style>
</head>
<body>
<div>
<img src="%s" alt="comic page #%d" />
</div>
</body>
</html>
''')
    out_dir = os.path.dirname(pages[0])
    wrappers = []
    for i, page in enumerate(pages):
        number = i + 1
        markup = template%(XHTML_NS, number, os.path.basename(page), number)
        wrapper_path = os.path.join(out_dir, 'page_%d.xhtml'%number)
        # Close each wrapper file as soon as it is written instead of
        # leaving the handle to the garbage collector.
        with open(wrapper_path, 'wb') as f:
            f.write(markup)
        wrappers.append(wrapper_path)
    return wrappers

View File

@ -12,7 +12,6 @@ from subprocess import Popen, PIPE
from cStringIO import StringIO
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.txt.processor import convert_basic
class DJVUInput(InputFormatPlugin):
@ -28,6 +27,8 @@ class DJVUInput(InputFormatPlugin):
])
def convert(self, stream, options, file_ext, log, accelerators):
from calibre.ebooks.txt.processor import convert_basic
stdout = StringIO()
ppdjvu = True
# using djvutxt is MUCH faster, should make it an option

View File

@ -3,11 +3,9 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, uuid
import os
from itertools import cycle
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
class EPUBInput(InputFormatPlugin):
@ -30,6 +28,8 @@ class EPUBInput(InputFormatPlugin):
f.write(raw[1024:])
def process_encryption(self, encfile, opf, log):
from lxml import etree
import uuid
key = None
for item in opf.identifier_iter():
scheme = None
@ -65,6 +65,7 @@ class EPUBInput(InputFormatPlugin):
return False
def rationalize_cover(self, opf, log):
from lxml import etree
guide_cover, guide_elem = None, None
for guide_elem in opf.iterguide():
if guide_elem.get('type', '').lower() == 'cover':
@ -110,6 +111,7 @@ class EPUBInput(InputFormatPlugin):
renderer)
def find_opf(self):
from lxml import etree
def attr(n, attr):
for k, v in n.attrib.items():
if k.endswith(attr):

View File

@ -8,14 +8,12 @@ __docformat__ = 'restructuredtext en'
import os, shutil, re
from calibre.customize.conversion import OutputFormatPlugin
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre.ptempfile import TemporaryDirectory
from calibre import CurrentDir
from calibre.customize.conversion import OptionRecommendation
from calibre.constants import filesystem_encoding
from lxml import etree
block_level_tags = (
'address',
'body',
@ -289,6 +287,7 @@ class EPUBOutput(OutputFormatPlugin):
# }}}
def condense_ncx(self, ncx_path):
from lxml import etree
if not self.opts.pretty_print:
tree = etree.parse(ncx_path)
for tag in tree.getroot().iter(tag=etree.Element):

View File

@ -6,7 +6,6 @@ Convert .fb2 files to .lrf
"""
import os, re
from base64 import b64decode
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import guess_type
@ -38,6 +37,7 @@ class FB2Input(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
from lxml import etree
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER

View File

@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
from calibre.ebooks.fb2.fb2ml import FB2MLizer
class FB2Output(OutputFormatPlugin):
@ -162,6 +161,7 @@ class FB2Output(OutputFormatPlugin):
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from calibre.ebooks.oeb.transforms.jacket import linearize_jacket
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
from calibre.ebooks.fb2.fb2ml import FB2MLizer
try:
rasterizer = SVGRasterizer()

View File

@ -0,0 +1,283 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, tempfile, os
from functools import partial
from itertools import izip
from urllib import quote
from calibre.constants import islinux, isbsd
from calibre.customize.conversion import (InputFormatPlugin,
OptionRecommendation)
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
class HTMLInput(InputFormatPlugin):
name = 'HTML Input'
author = 'Kovid Goyal'
description = 'Convert HTML and OPF files to an OEB'
file_types = set(['opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'])
options = set([
OptionRecommendation(name='breadth_first',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Traverse links in HTML files breadth first. Normally, '
'they are traversed depth first.'
)
),
OptionRecommendation(name='max_levels',
recommended_value=5, level=OptionRecommendation.LOW,
help=_('Maximum levels of recursion when following links in '
'HTML files. Must be non-negative. 0 implies that no '
'links in the root HTML file are followed. Default is '
'%default.'
)
),
OptionRecommendation(name='dont_package',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Normally this input plugin re-arranges all the input '
'files into a standard folder hierarchy. Only use this option '
'if you know what you are doing as it can result in various '
'nasty side effects in the rest of the conversion pipeline.'
)
),
])
def convert(self, stream, opts, file_ext, log,
        accelerators):
    '''
    Convert an HTML or OPF input into an OEB book.

    For ``opf`` input the book is built directly by the conversion
    plumber's ``create_oebbook``. For any other extension, metadata is read
    from the HTML stream (merged into filename-derived metadata when the
    stream has a name) and the book is built by ``self.create_oebbook``.

    :raises ValueError: If ``opts.dont_package`` is set for non-OPF input.
    '''
    # Reset the cached case-sensitivity answer for this conversion.
    self._is_case_sensitive = None
    basedir = os.getcwd()
    self.opts = opts

    fname = None
    if hasattr(stream, 'name'):
        stream_path = stream.name
        basedir = os.path.dirname(stream_path)
        fname = os.path.basename(stream_path)

    if file_ext == 'opf':
        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, stream.name, opts,
                encoding=opts.input_encoding)

    if opts.dont_package:
        raise ValueError('The --dont-package option is not supported for an HTML input file')

    from calibre.ebooks.metadata.html import get_metadata
    mi = get_metadata(stream)
    if fname:
        from calibre.ebooks.metadata.meta import metadata_from_filename
        # Metadata found in the HTML is merged into the filename metadata.
        from_name = metadata_from_filename(fname)
        from_name.smart_update(mi)
        mi = from_name
    return self.create_oebbook(stream.name, basedir, opts, log, mi)
def is_case_sensitive(self, path):
    '''
    Report whether the filesystem holding `path` is treated as case
    sensitive.

    A previously computed answer stored on ``self._is_case_sensitive`` is
    returned as is. For an empty or non-existent path the answer is
    ``islinux or isbsd`` and nothing is cached. Otherwise the filesystem is
    considered case insensitive only when both the lowercased and the
    uppercased path exist, and that answer is cached.
    '''
    cached = getattr(self, '_is_case_sensitive', None)
    if cached is not None:
        return cached
    if not path or not os.path.exists(path):
        return islinux or isbsd
    both_cases_exist = (os.path.exists(path.lower())
            and os.path.exists(path.upper()))
    self._is_case_sensitive = not both_cases_exist
    return self._is_case_sensitive
# Build an OEB book from the HTML file at htmlpath and the files returned for
# it by get_filelist: fill in metadata, add the HTML files to the manifest
# and spine, rewrite links through self.resource_adder, and generate a TOC
# with one entry per linear spine item. Returns the populated oeb object.
def create_oebbook(self, htmlpath, basedir, opts, log, mi):
import uuid
from calibre.ebooks.conversion.plumber import create_oebbook
from calibre.ebooks.oeb.base import (DirContainer,
rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
xpath)
from calibre import guess_type
from calibre.ebooks.oeb.transforms.metadata import \
meta_info_to_oeb_metadata
from calibre.ebooks.html.input import get_filelist
import cssutils, logging
# Raise the cssutils logger threshold to WARN.
cssutils.log.setLevel(logging.WARN)
self.OEB_STYLES = OEB_STYLES
# Create the book shell without populating it (populate=False).
oeb = create_oebbook(log, None, opts, self,
encoding=opts.input_encoding, populate=False)
self.oeb = oeb
metadata = oeb.metadata
meta_info_to_oeb_metadata(mi, metadata, log)
# Supply defaults (with a warning) for missing language, creator and title.
if not metadata.language:
oeb.logger.warn(u'Language not specified')
metadata.add('language', get_lang().replace('_', '-'))
if not metadata.creator:
oeb.logger.warn('Creator not specified')
metadata.add('creator', self.oeb.translate(__('Unknown')))
if not metadata.title:
oeb.logger.warn('Title not specified')
metadata.add('title', self.oeb.translate(__('Unknown')))
# Always add a freshly generated UUID identifier.
bookid = str(uuid.uuid4())
metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
# NOTE(review): once any identifier carrying an 'id' attribute is found, the
# uid is set to the FIRST identifier, not the matching one - confirm intent.
for ident in metadata.identifier:
if 'id' in ident.attrib:
self.oeb.uid = metadata.identifier[0]
break
# Only non-binary files from the file list are added as HTML spine items.
filelist = get_filelist(htmlpath, basedir, opts, log)
filelist = [f for f in filelist if not f.is_binary]
htmlfile_map = {}
for f in filelist:
path = f.path
# The container is re-pointed at each file's own directory before the file
# is added to the manifest.
oeb.container = DirContainer(os.path.dirname(path), log,
ignore_opf=True)
bname = os.path.basename(path)
id, href = oeb.manifest.generate(id='html',
href=ascii_filename(bname))
htmlfile_map[path] = href
item = oeb.manifest.add(id, href, 'text/html')
# Remember the original (un-normalized) base name on the manifest item.
item.html_input_href = bname
oeb.spine.add(item, True)
self.added_resources = {}
self.log = log
self.log('Normalizing filename cases')
# Paths on filesystems reported as case insensitive are keyed in lower case.
for path, href in htmlfile_map.items():
if not self.is_case_sensitive(path):
path = path.lower()
self.added_resources[path] = href
# Stash the locally imported helpers on self; presumably for use by
# resource_adder and related methods - verify against those methods.
self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
self.urldefrag = urldefrag
self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME
self.log('Rewriting HTML links')
for f in filelist:
path = f.path
dpath = os.path.dirname(path)
# Point the container at this file's directory before item.data is accessed.
oeb.container = DirContainer(dpath, log, ignore_opf=True)
item = oeb.manifest.hrefs[htmlfile_map[path]]
rewrite_links(item.data, partial(self.resource_adder, base=dpath))
# Rewrite URLs inside stylesheet items, resolving them relative to the
# directory the stylesheet was added from (None if it cannot be found).
for item in oeb.manifest.values():
if item.media_type in self.OEB_STYLES:
dpath = None
for path, href in self.added_resources.items():
if href == item.href:
dpath = os.path.dirname(path)
break
cssutils.replaceUrls(item.data,
partial(self.resource_adder, base=dpath))
# Build the TOC from each linear spine item's <title>, or from its first
# heading when the collected titles are not all distinct.
toc = self.oeb.toc
self.oeb.auto_generated_toc = True
titles = []
headers = []
for item in self.oeb.spine:
if not item.linear: continue
html = item.data
title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
# Collapse runs of whitespace to a single space.
title = re.sub(r'\s+', ' ', title.strip())
if title:
titles.append(title)
# Placeholder header, replaced by the first non-empty h1..h5/strong text.
headers.append('(unlabled)')
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
expr = '/h:html/h:body//h:%s[position()=1]/text()'
header = ''.join(xpath(html, expr % tag))
header = re.sub(r'\s+', ' ', header.strip())
if header:
headers[-1] = header
break
use = titles
if len(titles) > len(set(titles)):
use = headers
# NOTE(review): labels are paired with spine items by position; 'titles' only
# holds non-empty titles and both lists skip non-linear items, so the pairing
# may drift when a title is empty or a non-linear item exists - confirm.
for title, item in izip(use, self.oeb.spine):
if not item.linear: continue
toc.add(title, item.href)
# Finally point the container at the current working directory.
oeb.container = DirContainer(os.getcwdu(), oeb.log, ignore_opf=True)
return oeb
def link_to_local_path(self, link_, base=None):
    '''
    Translate a link found in a document into a local path and fragment.

    :param link_: The link text. Anything that is not already ``unicode``
                  is decoded as UTF-8 first.
    :param base: Passed to ``Link`` as the base of the link. When it is
                 empty or ``None`` the current working directory is used.
    :return: A ``(path, fragment)`` tuple. ``(None, None)`` is returned
             when the link cannot be decoded, cannot be processed by
             ``Link``, has no local path, or has an empty path.
    '''
    from calibre.ebooks.html.input import Link
    if not isinstance(link_, unicode):
        try:
            # 'strict' is the real name of the "raise on bad input" error
            # handler. The previous value ('error') is not a registered
            # handler and only behaved like it because the resulting
            # LookupError was swallowed by the except clause below.
            link_ = link_.decode('utf-8', 'strict')
        except Exception:
            self.log.warn('Failed to decode link %r. Ignoring'%link_)
            return None, None
    try:
        l = Link(link_, base if base else os.getcwdu())
    except Exception:
        # Best effort: a link that cannot be processed is logged and
        # ignored instead of aborting the whole conversion.
        self.log.exception('Failed to process link: %r'%link_)
        return None, None
    if l.path is None:
        # Not a local resource
        return None, None
    # Use the separator of the current OS and drop surrounding whitespace
    link = l.path.replace('/', os.sep).strip()
    frag = l.fragment
    if not link:
        return None, None
    return link, frag
def resource_adder(self, link_, base=None):
    '''
    Link rewriting callback: add the local file a link points at to the
    book manifest and return the href the link should be rewritten to.

    :param link_: The link as it appears in the document.
    :param base: Directory that relative links are resolved against.
    :return: ``link_`` unchanged when it does not resolve to a readable
             local file (or resolves to a directory); ``None`` when the
             target is guessed to be a ``text/plain`` file; otherwise the
             manifest href of the resource, with ``#fragment`` appended
             when the link carried a fragment.
    '''
    link, frag = self.link_to_local_path(link_, base=base)
    if link is None:
        return link_
    try:
        if base and not os.path.isabs(link):
            link = os.path.join(base, link)
        link = os.path.abspath(link)
    except Exception:
        # Best effort: leave links we cannot turn into a path untouched
        return link_
    if not os.access(link, os.R_OK):
        return link_
    if os.path.isdir(link):
        self.log.warn(link_, 'is a link to a directory. Ignoring.')
        return link_
    # added_resources is keyed by path; lower-case the key when
    # is_case_sensitive() reports False so that differently cased links
    # to the same file share one entry
    if not self.is_case_sensitive(tempfile.gettempdir()):
        link = link.lower()
    if link not in self.added_resources:
        bhref = os.path.basename(link)
        item_id, href = self.oeb.manifest.generate(id='added',
                href=bhref)
        guessed = self.guess_type(href)[0]
        media_type = guessed or self.BINARY_MIME
        if media_type == 'text/plain':
            self.log.warn('Ignoring link to text file %r'%link_)
            return None
        self.oeb.log.debug('Added', link)
        # Point the container at the directory of the resource before
        # the item is created and its data is accessed
        self.oeb.container = self.DirContainer(os.path.dirname(link),
                self.oeb.log, ignore_opf=True)
        item = self.oeb.manifest.add(item_id, href, media_type)
        # bhref refers to an already existing file. The read() method of
        # DirContainer will call unquote on it before trying to read the
        # file, therefore we quote it here.
        if isinstance(bhref, unicode):
            bhref = bhref.encode('utf-8')
        item.html_input_href = quote(bhref).decode('utf-8')
        if guessed in self.OEB_STYLES:
            item.override_css_fetch = partial(
                self.css_import_handler, os.path.dirname(link))
        # Load into memory (the attribute is accessed only for its side
        # effect; the value itself is not used here)
        item.data
        self.added_resources[link] = href
    nlink = self.added_resources[link]
    if frag:
        nlink = '#'.join((nlink, frag))
    return nlink
def css_import_handler(self, base, href):
    '''
    Fetch callback used for stylesheets added by resource_adder: read a
    local CSS file and return its preprocessed text.

    :param base: Directory used to resolve ``href``.
    :param href: The link to the CSS file as it appears in the stylesheet.
    :return: ``(None, text)`` on success, where ``text`` is the file
             decoded as UTF-8 (invalid bytes replaced) and passed through
             ``self.oeb.css_preprocessor``. ``(None, None)`` when the
             link is not a readable local file or reading/preprocessing
             fails.
    '''
    link = self.link_to_local_path(href, base=base)[0]
    if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
        return (None, None)
    try:
        # Use a context manager so the file handle is closed promptly
        # even when decoding or preprocessing raises
        with open(link, 'rb') as f:
            raw = f.read().decode('utf-8', 'replace')
        raw = self.oeb.css_preprocessor(raw, add_namespace=True)
    except Exception:
        self.log.exception('Failed to read CSS file: %r'%link)
        return (None, None)
    return (None, raw)

View File

@ -4,22 +4,11 @@ __copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
__docformat__ = 'restructuredtext en'
import os, re, shutil
from calibre.utils import zipfile
from os.path import dirname, abspath, relpath, exists, basename
from lxml import etree
from templite import Templite
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
from calibre import CurrentDir
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.zipfile import ZipFile
from urllib import unquote
from calibre.ebooks.html.meta import EasyMeta
class HTMLOutput(OutputFormatPlugin):
@ -50,6 +39,9 @@ class HTMLOutput(OutputFormatPlugin):
'''
Generate table of contents
'''
from lxml import etree
from urllib import unquote
from calibre.ebooks.oeb.base import element
with CurrentDir(output_dir):
def build_node(current_node, parent=None):
@ -72,11 +64,18 @@ class HTMLOutput(OutputFormatPlugin):
return wrap
def generate_html_toc(self, oeb_book, ref_url, output_dir):
from lxml import etree
root = self.generate_toc(oeb_book, ref_url, output_dir)
return etree.tostring(root, pretty_print=True, encoding='utf-8',
xml_declaration=False)
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from lxml import etree
from calibre.utils import zipfile
from templite import Templite
from urllib import unquote
from calibre.ebooks.html.meta import EasyMeta
# read template files
if opts.template_html_index is not None:
@ -192,7 +191,7 @@ class HTMLOutput(OutputFormatPlugin):
f.write(t)
item.unload_data_from_memory(memory=path)
zfile = ZipFile(output_path, "w")
zfile = zipfile.ZipFile(output_path, "w")
zfile.add_dir(output_dir, basename(output_dir))
zfile.write(output_file, basename(output_file), zipfile.ZIP_DEFLATED)

View File

@ -10,9 +10,6 @@ import os
from calibre import guess_type
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata.opf2 import OPF
from calibre.utils.zipfile import ZipFile
class HTMLZInput(InputFormatPlugin):
@ -23,6 +20,10 @@ class HTMLZInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata.opf2 import OPF
from calibre.utils.zipfile import ZipFile
self.log = log
html = u''
top_levels = []

View File

@ -9,13 +9,10 @@ __docformat__ = 'restructuredtext en'
import os
from cStringIO import StringIO
from lxml import etree
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile
class HTMLZOutput(OutputFormatPlugin):
@ -43,7 +40,10 @@ class HTMLZOutput(OutputFormatPlugin):
])
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from lxml import etree
from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
from calibre.utils.zipfile import ZipFile
# HTML
if opts.htmlz_css_type == 'inline':
@ -81,7 +81,7 @@ class HTMLZOutput(OutputFormatPlugin):
fname = os.path.join(tdir, 'images', images[item.href])
with open(fname, 'wb') as img:
img.write(data)
# Cover
cover_path = None
try:

View File

@ -0,0 +1,87 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, sys
from calibre.customize.conversion import InputFormatPlugin
class LRFInput(InputFormatPlugin):
    '''
    Input plugin that converts LRF files by rendering them to XML and then
    transforming that XML with the ``templates/lrf.xsl`` stylesheet.
    '''

    name        = 'LRF Input'
    author      = 'Kovid Goyal'
    description = 'Convert LRF files to HTML'
    file_types  = set(['lrf'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Convert the LRF document in ``stream`` and write ``content.opf``
        into the current working directory.

        :param stream: The LRF data, handed to ``LRFDocument``.
        :param options: Conversion options; only ``options.verbose`` is
                        read here (values above 2 dump the intermediate
                        XML to ``lrs.xml``).
        :param file_ext: Unused by this plugin.
        :param log: Logger used for progress and warnings.
        :param accelerators: Unused by this plugin.
        :return: Absolute path to the generated ``content.opf``.
        '''
        from lxml import etree
        from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock,
                Canvas, ImageBlock, RuledLine)
        self.log = log
        self.log('Generating XML')
        from calibre.ebooks.lrf.lrfparser import LRFDocument
        d = LRFDocument(stream)
        d.parse()
        xml = d.to_xml(write_files=True)
        if options.verbose > 2:
            # Debug dump of the intermediate XML; the context manager
            # makes sure the handle is flushed and closed
            with open('lrs.xml', 'wb') as f:
                f.write(xml.encode('utf-8'))
        parser = etree.XMLParser(no_network=True, huge_tree=True)
        try:
            doc = etree.fromstring(xml, parser=parser)
        except Exception:
            # Fall back to lxml's recovering parser for malformed XML
            self.log.warn('Failed to parse XML. Trying to recover')
            parser = etree.XMLParser(no_network=True, huge_tree=True,
                                    recover=True)
            doc = etree.fromstring(xml, parser=parser)

        # Map the refobj of each CharButton to the '<page>.xhtml#<obj>'
        # target of the JumpTo found inside the object it references
        char_button_map = {}
        for x in doc.xpath('//CharButton[@refobj]'):
            ro = x.get('refobj')
            jump_button = doc.xpath('//*[@objid="%s"]'%ro)
            if jump_button:
                jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]')
                if jump_to:
                    char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'),
                            jump_to[0].get('refobj'))

        # Map the refobj of each Plot to the file of the ImageStream that
        # the referenced Image points at
        plot_map = {}
        for x in doc.xpath('//Plot[@refobj]'):
            ro = x.get('refobj')
            image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro)
            if image:
                imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'%
                    image[0].get('refstream'))
                if imgstr:
                    plot_map[ro] = imgstr[0].get('file')

        self.log('Converting XML to HTML...')
        styledoc = etree.fromstring(P('templates/lrf.xsl', data=True))
        media_type = MediaType()
        styles = Styles()
        text_block = TextBlock(styles, char_button_map, plot_map, log)
        canvas = Canvas(doc, styles, text_block, log)
        image_block = ImageBlock(canvas)
        ruled_line = RuledLine()
        # XSLT extension elements, registered under the 'calibre' namespace
        extensions = {
                ('calibre', 'media-type') : media_type,
                ('calibre', 'text-block') : text_block,
                ('calibre', 'ruled-line') : ruled_line,
                ('calibre', 'styles')     : styles,
                ('calibre', 'canvas')     : canvas,
                ('calibre', 'image-block'): image_block,
                }
        transform = etree.XSLT(styledoc, extensions=extensions)
        try:
            result = transform(doc)
        except RuntimeError:
            # Retry once with a higher recursion limit. Note that the
            # limit is process wide and is not restored afterwards.
            sys.setrecursionlimit(5000)
            result = transform(doc)

        with open('content.opf', 'wb') as f:
            f.write(result)
        styles.write()
        return os.path.abspath('content.opf')

View File

@ -0,0 +1,25 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Convert an ODT file into an Open Ebook
'''
from calibre.customize.conversion import InputFormatPlugin
class ODTInput(InputFormatPlugin):
    '''
    Input plugin for ODT files. The work is delegated to the ``Extract``
    callable from ``calibre.ebooks.odt.input``.
    '''

    name = 'ODT Input'
    author = 'Kovid Goyal'
    description = 'Convert ODT (OpenOffice) files to HTML'
    file_types = set(['odt'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Run ``Extract`` on ``stream`` with the current directory ('.') as
        its second argument and return whatever it returns.

        ``options``, ``file_ext`` and ``accelerators`` are not used.
        '''
        # Imported lazily so the conversion code is only loaded when needed
        from calibre.ebooks.odt.input import Extract
        extractor = Extract()
        return extractor(stream, '.', log)

View File

@ -5,13 +5,10 @@ __docformat__ = 'restructuredtext en'
import os, re
from lxml import etree
from calibre.customize.conversion import OutputFormatPlugin
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre import CurrentDir
from calibre.customize.conversion import OptionRecommendation
from urllib import unquote
class OEBOutput(OutputFormatPlugin):
@ -23,6 +20,9 @@ class OEBOutput(OutputFormatPlugin):
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from urllib import unquote
from lxml import etree
self.log, self.opts = log, opts
if not os.path.exists(output_path):
os.makedirs(output_path)

View File

@ -7,8 +7,6 @@ __docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
class PDBInput(InputFormatPlugin):
@ -19,6 +17,9 @@ class PDBInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
header = PdbHeaderReader(stream)
Reader = get_reader(header.ident)

View File

@ -8,7 +8,7 @@ import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ebooks.pdb import PDBError, get_writer, FORMAT_WRITERS
from calibre.ebooks.pdb import PDBError, get_writer, ALL_FORMAT_WRITERS
class PDBOutput(OutputFormatPlugin):
@ -19,9 +19,9 @@ class PDBOutput(OutputFormatPlugin):
options = set([
OptionRecommendation(name='format', recommended_value='doc',
level=OptionRecommendation.LOW,
short_switch='f', choices=FORMAT_WRITERS.keys(),
short_switch='f', choices=list(ALL_FORMAT_WRITERS),
help=(_('Format to use inside the pdb container. Choices are:')+\
' %s' % FORMAT_WRITERS.keys())),
' %s' % list(ALL_FORMAT_WRITERS))),
OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252',
level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. ' \

View File

@ -7,10 +7,6 @@ __docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.pdf.pdftohtml import pdftohtml
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.constants import plugins
pdfreflow, pdfreflow_err = plugins['pdfreflow']
class PDFInput(InputFormatPlugin):
@ -31,6 +27,9 @@ class PDFInput(InputFormatPlugin):
])
def convert_new(self, stream, accelerators):
from calibre.constants import plugins
pdfreflow, pdfreflow_err = plugins['pdfreflow']
from calibre.ebooks.pdf.reflow import PDFDocument
from calibre.utils.cleantext import clean_ascii_chars
if pdfreflow_err:
@ -43,6 +42,9 @@ class PDFInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.pdf.pdftohtml import pdftohtml
log.debug('Converting file to html...')
# The main html file will be named index.html
self.opts, self.log = options, log

View File

@ -13,10 +13,50 @@ import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.pdf.pageoptions import UNITS, PAPER_SIZES, \
ORIENTATIONS
UNITS = [
'millimeter',
'point',
'inch' ,
'pica' ,
'didot',
'cicero',
'devicepixel',
]
PAPER_SIZES = ['b2',
'a9',
'executive',
'tabloid',
'b4',
'b5',
'b6',
'b7',
'b0',
'b1',
'letter',
'b3',
'a7',
'a8',
'b8',
'b9',
'a3',
'a1',
'folio',
'c5e',
'dle',
'a0',
'ledger',
'legal',
'a6',
'a2',
'b10',
'a5',
'comm10e',
'a4']
ORIENTATIONS = ['portrait', 'landscape']
class PDFOutput(OutputFormatPlugin):
@ -26,23 +66,23 @@ class PDFOutput(OutputFormatPlugin):
options = set([
OptionRecommendation(name='unit', recommended_value='inch',
level=OptionRecommendation.LOW, short_switch='u', choices=UNITS.keys(),
level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
help=_('The unit of measure. Default is inch. Choices '
'are %s '
'Note: This does not override the unit for margins!') % UNITS.keys()),
'Note: This does not override the unit for margins!') % UNITS),
OptionRecommendation(name='paper_size', recommended_value='letter',
level=OptionRecommendation.LOW, choices=PAPER_SIZES.keys(),
level=OptionRecommendation.LOW, choices=PAPER_SIZES,
help=_('The size of the paper. This size will be overridden when a '
'non default output profile is used. Default is letter. Choices '
'are %s') % PAPER_SIZES.keys()),
'are %s') % PAPER_SIZES),
OptionRecommendation(name='custom_size', recommended_value=None,
help=_('Custom size of the document. Use the form widthxheight '
'EG. `123x321` to specify the width and height. '
'This overrides any specified paper-size.')),
OptionRecommendation(name='orientation', recommended_value='portrait',
level=OptionRecommendation.LOW, choices=ORIENTATIONS.keys(),
level=OptionRecommendation.LOW, choices=ORIENTATIONS,
help=_('The orientation of the page. Default is portrait. Choices '
'are %s') % ORIENTATIONS.keys()),
'are %s') % ORIENTATIONS),
OptionRecommendation(name='preserve_cover_aspect_ratio',
recommended_value=False,
help=_('Preserve the aspect ratio of the cover, instead'
@ -105,6 +145,8 @@ class PDFOutput(OutputFormatPlugin):
def convert_text(self, oeb_book):
from calibre.ebooks.pdf.writer import PDFWriter
from calibre.ebooks.metadata.opf2 import OPF
self.log.debug('Serializing oeb input to disk for processing...')
self.get_cover_data()

View File

@ -11,9 +11,6 @@ import shutil
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile
from calibre.ebooks.pml.pmlconverter import PML_HTMLizer
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata.opf2 import OPFCreator
class PMLInput(InputFormatPlugin):
@ -24,6 +21,8 @@ class PMLInput(InputFormatPlugin):
file_types = set(['pml', 'pmlz'])
def process_pml(self, pml_path, html_path, close_all=False):
from calibre.ebooks.pml.pmlconverter import PML_HTMLizer
pclose = False
hclose = False
@ -85,6 +84,9 @@ class PMLInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata.opf2 import OPFCreator
self.options = options
self.log = log
pages, images = [], []

Some files were not shown because too many files have changed in this diff Show More