Merge from trunk

Charles Haley 2012-02-16 10:48:58 +01:00
commit 9c0c9b0378
41 changed files with 898 additions and 393 deletions

View File

@ -0,0 +1,18 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
from calibre.web.feeds.news import BasicNewsRecipe
class AlbertMohlersBlog(BasicNewsRecipe):
title = u'Albert Mohler\'s Blog'
__author__ = 'Peter Grungi'
language = 'en'
oldest_article = 90
max_articles_per_feed = 10
auto_cleanup = True
cover_url = 'http://www.albertmohler.com/wp-content/themes/albert-mohler-v5/img/logo-am-lg.gif'
publisher = 'Albert Mohler'
author = 'Albert Mohler'
feeds = [(u'Albert Mohler\'s Blog', u'http://feeds.feedburner.com/AlbertMohlersBlog?format=xml')]

View File

@ -0,0 +1,51 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.asianreviewofbooks.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class AsianReviewOfBooks(BasicNewsRecipe):
title = 'The Asian Review of Books'
__author__ = 'Darko Miletic'
description = 'In addition to reviewing books about or of relevance to Asia, the Asian Review of Books also features long-format essays by leading Asian writers and thinkers, providing an unparalleled forum for discussion of key contemporary issues by Asians for Asia and a vehicle of intellectual depth and breadth where leading thinkers can write on the books, arts and ideas of the day. Widely quoted and referenced, with an archive of more than one thousand book reviews, it is the only web resource dedicated to Asian books. And now, with the addition of the new premium content, the Asian Review of Books is a must-read publication.'
publisher = 'The Asian Review of Books'
category = 'literature, books, reviews, Asia'
oldest_article = 30
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
language = 'en_HK'
publication_type = 'magazine'
masthead_url = 'http://www.asianreviewofbooks.com/new/images/mob_arb.png'
extra_css = """
body{font-family: serif}
.big {font-size: xx-large}
.bold {font-weight: bold}
.italic {font-style: italic}
.small {font-size: small}
img {display: block}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [dict(name=['object','script','iframe','embed'])]
remove_attributes = ['style', 'onclick']
feeds = [(u'Articles' , u'http://www.asianreviewofbooks.com/new/rss.php')]
def print_version(self, url):
root, sep, artid = url.rpartition('?ID=')
return root + 'getarticle.php?articleID=' + artid + '&stats=web'
def preprocess_raw_html(self, raw, url):
return '<html><head><title>title</title></head><body>' + raw + '</body></html>'
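As a quick illustration (the article URL below is hypothetical, not from the feed), print_version() above rewrites feed links into the print endpoint like so:

# Hypothetical example of the rewrite performed by AsianReviewOfBooks.print_version():
url = 'http://www.asianreviewofbooks.com/new/?ID=1234'
root, sep, artid = url.rpartition('?ID=')
print(root + 'getarticle.php?articleID=' + artid + '&stats=web')
# -> http://www.asianreviewofbooks.com/new/getarticle.php?articleID=1234&stats=web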

View File

@ -1,95 +0,0 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
borba.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Borba(BasicNewsRecipe):
title = 'Borba Online'
__author__ = 'Darko Miletic'
description = 'Dnevne novine Borba Online'
publisher = 'IP Novine Borba'
category = 'news, politics, Serbia'
language = 'sr'
lang = _('sr-Latn-RS')
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
cover_url = 'http://www.borba.rs/images/stories/novine/naslovna_v.jpg'
INDEX = u'http://www.borba.rs/'
extra_css = ' @font-face {font-family: "serif1"; src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .contentheading{font-size: x-large; font-weight: bold} .createdate{font-size: small; font-weight: bold} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'class':'main'})]
remove_tags_after = dict(name='div',attrs={'id':'written_comments_title'})
remove_tags = [
dict(name=['object','link','iframe','base','img'])
,dict(name='div',attrs={'id':'written_comments_title'})
]
feeds = [
(u'Najnovije vesti', u'http://www.borba.rs/content/blogsection/28/105/')
,(u'Prvi plan' , u'http://www.borba.rs/content/blogsection/4/92/' )
,(u'Dogadjaji' , u'http://www.borba.rs/content/blogsection/21/83/' )
,(u'Ekonomija' , u'http://www.borba.rs/content/blogsection/5/35/' )
,(u'Komentari' , u'http://www.borba.rs/content/blogsection/23/94/' )
,(u'Svet' , u'http://www.borba.rs/content/blogsection/7/36/' )
,(u'Sport' , u'http://www.borba.rs/content/blogsection/6/37/' )
,(u'Fama' , u'http://www.borba.rs/content/blogsection/25/89/' )
,(u'B2 Dodatak' , u'http://www.borba.rs/content/blogsection/30/116/')
]
def preprocess_html(self, soup):
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
return soup
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('a', attrs={'class':'contentpagetitle'}):
url = item['href']
title = self.tag_to_string(item)
articles.append({
'title' :title
,'date' :''
,'url' :url
,'description':''
})
totalfeeds.append((feedtitle, articles))
return totalfeeds

View File

@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1328971305(BasicNewsRecipe):
title = u'Catholic Daily Readings'
language = 'en'
__author__ = 'adoucette'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Daily Readings - USCCB', u'http://www.usccb.org/bible/readings/rss/'), (u'Daily Reflection - One Bread One Body', u'http://www.presentationministries.com/general/rss.asp'), (u'Mass Readings - Universalis', u'http://www.universalis.com/atommass3.xml'), (u'Saint Of The Day - CNA', u'http://feeds.feedburner.com/catholicnewsagency/saintoftheday')]

View File

@ -1,38 +1,89 @@
#!/usr/bin/env python
##
## Title: Common Dreams
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Feb 2012: Cleaned up the output to have only the main article
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
'''
commondreams.org
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class CommonDreams(BasicNewsRecipe):
# Identify the recipe
title = u'Common Dreams'
description = u'Progressive news and views'
description = u'Breaking News & Views for the Progressive Community.'
cover_url = 'https://s3.amazonaws.com/s3.commondreams.org/images/common-dreams.png'
__author__ = u'XanthanGum'
language = 'en'
# Format the text
extra_css = '''
body{font-family:verdana,arial,helvetica,geneva,sans-serif ;}
h1{font-size: xx-large;}
h2{font-size: large;}
'''
# Pick no article older than seven days and limit the number of articles per feed to 100
oldest_article = 7
max_articles_per_feed = 100
# Remove everything before the article
no_stylesheets = True
remove_javascript = True
# Flattens all the tables to make it compatible with Nook
conversion_options = {'linearize_tables' : True}
remove_tags_before = dict(name = 'div', attrs = {'id':'node-header'})
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
# Remove everything after the article
# Specify extra CSS - overrides ALL other CSS (i.e. added last).
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
.introduction, .first { font-weight: bold; } \
.cross-head { font-weight: bold; font-size: 125%; } \
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
font-size: 80%; font-style: italic; margin: 1px auto; } \
.story-date, .published { font-size: 80%; } \
table { width: 100%; } \
td img { display: block; margin: 5px auto; } \
ul { padding-top: 10px; } \
ol { padding-top: 10px; } \
li { padding-top: 5px; padding-bottom: 5px; } \
h1 { font-size: 175%; font-weight: bold; } \
h2 { font-size: 150%; font-weight: bold; } \
h3 { font-size: 125%; font-weight: bold; } \
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
# Remove the line breaks and float left/right and picture width/height.
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'float:.*?'), lambda m: ''),
(re.compile(r'width:.*?px'), lambda m: ''),
(re.compile(r'height:.*?px'), lambda m: ''),
(re.compile(r'<a.*?>'), lambda m: ''),
(re.compile(r'</a>'), lambda m: ''),
]
remove_tags_after = dict(name = 'div', attrs = {'class':'copyright-info'})
# Main article is inside this tag
keep_only_tags = [
dict(name='div', attrs={'id':lambda x: x and 'node-' in x}),
]
remove_tags = [
dict(name='div', attrs={'class':'node-links clear-block'}), # remove Share options
]
# Identify the news feeds
feeds = [(u'Headlines', u'http://www.commondreams.org/feed/headlines_rss'),
(u'Further News Articles', u'http://www.commondreams.org/feed/further_rss'),
(u'Views', u'http://www.commondreams.org/feed/views_rss'),
(u'Progressive Newswire', u'http://www.commondreams.org/feed/newswire_rss')]
feeds = [(u'Headlines', u'https://www.commondreams.org/feed/headlines_rss'),
(u'Further News Articles', u'https://www.commondreams.org/feed/further_rss'),
(u'Views', u'https://www.commondreams.org/feed/views_rss'),
(u'Progressive Newswire', u'https://www.commondreams.org/feed/newswire_rss')]
def print_version(self, url):
url = url + '?print'
return url

View File

@ -0,0 +1,71 @@
#!/usr/bin/env python
##
## Title: Consortium News
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Feb 2012: Initial release
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
'''
consortiumnews.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class ConsortiumNews(BasicNewsRecipe):
title = u'Consortium News'
publisher = 'Copyright © 2012 Consortiumnews. All Rights Reserved.'
language = 'en'
__author__ = 'kiavash'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
remove_javascript = True
conversion_options = {'linearize_tables' : True} # Flattens all the tables to make it compatible with Nook
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
# Specify extra CSS - overrides ALL other CSS (i.e. added last).
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
.introduction, .first { font-weight: bold; } \
.cross-head { font-weight: bold; font-size: 125%; } \
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
font-size: 80%; font-style: italic; margin: 1px auto; } \
.story-date, .published { font-size: 80%; } \
table { width: 100%; } \
td img { display: block; margin: 5px auto; } \
ul { padding-top: 10px; } \
ol { padding-top: 10px; } \
li { padding-top: 5px; padding-bottom: 5px; } \
h1 { font-size: 175%; font-weight: bold; } \
h2 { font-size: 150%; font-weight: bold; } \
h3 { font-size: 125%; font-weight: bold; } \
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
# Remove the line breaks and float left/right and picture width/height.
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'float:.*?'), lambda m: ''),
(re.compile(r'width:.*?px'), lambda m: ''),
(re.compile(r'height:.*?px'), lambda m: ''),
(re.compile(r'<a.*?>'), lambda h1: ''),
(re.compile(r'</a>'), lambda h2: ''),
]
# Main article is inside this tag
keep_only_tags = [dict(name='div', attrs={'id':lambda x: x and 'post-' in x})]
remove_tags = [
dict(name='div', attrs={'class':'sociable'}), # remove 'Share this Article'
dict(name='p', attrs={'class':'tags'}), # remove 'Tags: ... '
]
feeds = [(u'Consortium News', u'http://feeds.feedburner.com/Consortiumnewscom')]

View File

@ -5,7 +5,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
description = 'News as provided by The Daily Mirror - UK'
__author__ = 'Dave Asbury'
# last updated 26/12/11
# last updated 11/2/12
language = 'en_GB'
cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
@ -14,35 +14,58 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
oldest_article = 1
max_articles_per_feed = 20
max_articles_per_feed = 5
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
auto_cleanup = True
#conversion_options = { 'linearize_tables' : True }
#keep_only_tags = [
# dict(name='h1'),
# dict(name='div',attrs={'id' : 'body-content'}),
#dict(name='div',atts={'class' : 'article-body'}),
#dict(attrs={'class' : ['article-attr','byline append-1','published']}),
#dict(name='p'),
# ]
#remove_tags_after = [dict (name='div',attrs={'class' : 'related'})]
remove_tags = [
dict(name='title'),
dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),
# dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}),
#dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}),
#dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}),
]
# preprocess_regexps = [
#(re.compile(r'<dl class="q-search">.*?</dl>', re.IGNORECASE | re.DOTALL), lambda match: '')]
preprocess_regexps = [
(re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')]
preprocess_regexps = [
(re.compile(r'Advertisement >>', re.IGNORECASE | re.DOTALL), lambda match: '')]
#preprocess_regexps = [
#(re.compile(r'Sponsored Links', re.IGNORECASE | re.DOTALL), lambda match: '')]
feeds = [
(u'News', u'http://www.mirror.co.uk/news/rss.xml')
,(u'Tech News', u'http://www.mirror.co.uk/news/technology/rss.xml')
,(u'Weird World','http://www.mirror.co.uk/news/weird-world/rss.xml')
,(u'Film Gossip','http://www.mirror.co.uk/celebs/film/rss.xml')
,(u'Music News','http://www.mirror.co.uk/celebs/music/rss.xml')
,(u'Celebs and Tv Gossip','http://www.mirror.co.uk/celebs/tv/rss.xml')
,(u'Sport','http://www.mirror.co.uk/sport/rss.xml')
,(u'Life Style','http://www.mirror.co.uk/life-style/rss.xml')
,(u'Advice','http://www.mirror.co.uk/advice/rss.xml')
,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
(u'UK News', u'http://feed43.com/0287771688643868.xml')
,(u'Tech News', u'http://feed43.com/2455520588350501.xml')
,(u'Weird World','http://feed43.com/0863800333634654.xml')
,(u'Sport','http://feed43.com/7713243036546130.xml')
,(u'Sport : Boxing ','http://feed43.com/0414732220804255.xml')
,(u'Sport : Rugby Union','http://feed43.com/4710138762362383.xml')
,(u'Sport : Other','http://feed43.com/4501416886323415.xml')
,(u'TV and Film','http://feed43.com/5238302853765104.xml')
,(u'Celebs','http://feed43.com/8770061048844683.xml')
,(u'Life Style : Family','http://feed43.com/4356170742410338.xml')
,(u'Travel','http://feed43.com/1436576006476607.xml')
# example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
]

View File

@ -0,0 +1,21 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
from calibre.web.feeds.news import BasicNewsRecipe
class DesiringGodEnglish(BasicNewsRecipe):
title = u'Desiring God'
__author__ = 'Peter Grungi'
language = 'en'
cover_url = 'http://cdn0.desiringgod.org/images/layout/breadcrumbs_dg_mark.png'
masthead_url = 'http://cdn0.desiringgod.org/images/layout/breadcrumbs_dg_mark.png'
oldest_article = 7
max_articles_per_feed = 50
auto_cleanup = True
publisher = 'Desiring God Ministries'
author = 'Desiring God Ministries'
feeds = [(u'Desiring God Blog', u'http://feeds.feedburner.com/DGBlog?format=xml')]

Binary file not shown (new image, 906 B).

Binary file not shown (image changed: 1.1 KiB before, 289 B after).

Binary file not shown (new image, 968 B).

View File

@ -13,9 +13,10 @@ class Kurier(BasicNewsRecipe):
publisher = 'KURIER'
category = 'news, politics, Austria'
oldest_article = 2
max_articles_per_feed = 200
max_articles_per_feed = 100
timeout = 30
encoding = None
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
language = 'de_AT'
remove_empty_feeds = True
@ -29,9 +30,11 @@ class Kurier(BasicNewsRecipe):
, 'language' : language
}
remove_tags = [dict(attrs={'class':['functionsleiste','functions','social_positionierung','contenttabs','drucken','versenden','leserbrief','kommentieren','addthis_button']})]
remove_tags = [ dict(attrs={'id':['artikel_expand_symbol2','imgzoom_close2']}),
dict(attrs={'class':['linkextern','functionsleiste','functions','social_positionierung','contenttabs','drucken','versenden','leserbrief','kommentieren','addthis_button']})
]
keep_only_tags = [dict(attrs={'id':'content'})]
remove_tags_after = dict(attrs={'id':'author'})
remove_tags_after = [dict(attrs={'id':'author'})]
remove_attributes = ['width','height']
feeds = [
@ -41,7 +44,7 @@ class Kurier(BasicNewsRecipe):
,(u'Kultur' , u'http://kurier.at/rss/kultur_kultur_rss.xml' )
,(u'Freizeit' , u'http://kurier.at/rss/freizeit_freizeit_rss.xml' )
,(u'Wetter' , u'http://kurier.at/rss/oewetter_rss.xml' )
,(u'Verkehr' , u'http://kurier.at/rss/verkehr_rss.xml' )
,(u'Sport' , u'http://kurier.at/newsfeed/detail/sport_rss.xml' )
]
def preprocess_html(self, soup):

View File

@ -1,8 +1,8 @@
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini'
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
__author__ = 'Lorenzo Vigentini and Olivier Daigle'
__copyright__ = '2012, Lorenzo Vigentini <l.vigentini at gmail.com>, Olivier Daigle <odaigle _at nuvucameras __dot__ com>'
__version__ = 'v1.01'
__date__ = '14, January 2010'
__date__ = '12, February 2012'
__description__ = 'Canadian Paper '
'''
@ -12,13 +12,14 @@ http://www.ledevoir.com/
import re
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import date
class ledevoir(BasicNewsRecipe):
author = 'Lorenzo Vigentini'
description = 'Canadian Paper. A subscription is optional, with it you get more content'
cover_url = 'http://www.ledevoir.com/images/ul/graphiques/logo_devoir.gif'
title = u'Le Devoir'
title = u'Le Devoir '
publisher = 'leDevoir.com'
category = 'News, finance, economy, politics'
@ -26,11 +27,15 @@ class ledevoir(BasicNewsRecipe):
encoding = 'utf-8'
timefmt = '[%a, %d %b, %Y]'
max_articles_per_feed = 50
oldest_article = 1
max_articles_per_feed = 200
use_embedded_content = False
recursion = 10
needs_subscription = 'optional'
filterDuplicates = False
url_list = []
remove_javascript = True
no_stylesheets = True
@ -38,7 +43,7 @@ class ledevoir(BasicNewsRecipe):
keep_only_tags = [
dict(name='div', attrs={'id':'article'}),
dict(name='ul', attrs={'id':'ariane'})
dict(name='div', attrs={'id':'colonne_principale'})
]
remove_tags = [
@ -51,7 +56,7 @@ class ledevoir(BasicNewsRecipe):
feeds = [
(u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'),
(u'Edition complete', 'http://feeds2.feedburner.com/fluxdudevoir'),
(u'Édition complete', 'http://feeds2.feedburner.com/fluxdudevoir'),
(u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'),
(u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'),
(u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'),
@ -61,7 +66,7 @@ class ledevoir(BasicNewsRecipe):
(u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'),
(u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'),
(u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'),
(u'Loisirs', 'http://www.ledevoir.com/rss/section/loisirs.xml?id=50')
(u'Art de vivre', 'http://www.ledevoir.com/rss/section/art-de-vivre.xml?id=50')
]
extra_css = '''
@ -85,8 +90,16 @@ class ledevoir(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.ledevoir.com')
br.select_form(nr=1)
br['login[courriel]'] = self.username
br['login[password]'] = self.password
br.select_form(nr=0)
br['login_popup[courriel]'] = self.username
br['login_popup[password]'] = self.password
br.submit()
return br
def print_version(self, url):
if self.filterDuplicates:
if url in self.url_list:
return
self.url_list.append(url)
return url

View File

@ -0,0 +1,25 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
from calibre.web.feeds.news import BasicNewsRecipe
class LivingStonesPastorsBlog(BasicNewsRecipe):
title = u'Living Stones Pastors Blog'
__author__ = 'Peter Grungi'
language = 'en'
oldest_article = 90
max_articles_per_feed = 10
auto_cleanup = True
cover_url = 'http://blogs.livingstonesreno.com/wp-content/uploads/2011/08/blogBGRD_norepeat.jpg'
masthead_url = 'http://www.livingstonesreno.com/podcast/LSpodcastnew.jpg'
publisher = 'Living Stones Church of Reno, NV'
author = 'Living Stones Church of Reno, NV'
feeds = [(u'LS Blog', u'http://blogs.livingstonesreno.com/feed?utm_source=calibre&utm_medium=rss')]
def full_version(self, url):
import re
newurl = re.sub(r'\?.*','',url)
return newurl

View File

@ -0,0 +1,217 @@
#!/usr/bin/env python
##
## Title: Microwave and RF
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Feb 2012: Initial release
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
'''
mwrf.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image
class Microwave_and_RF(BasicNewsRecipe):
Convert_Grayscale = False # Convert images to gray scale or not
# Add sections that want to be excluded from the magazine
exclude_sections = []
# Add sections that want to be included from the magazine
include_sections = []
title = u'Microwave and RF'
__author__ = 'kiavash'
description = u'Microwave and RF Monthly Magazine'
publisher = 'Penton Media, Inc.'
publication_type = 'magazine'
site = 'http://mwrf.com'
language = 'en'
asciiize = True
timeout = 120
simultaneous_downloads = 1 # very peaky site!
# Main article is inside this tag
keep_only_tags = [dict(name='table', attrs={'id':'prtContent'})]
no_stylesheets = True
remove_javascript = True
# Flattens all the tables to make it compatible with Nook
conversion_options = {'linearize_tables' : True}
remove_tags = [
dict(name='span', attrs={'class':'body12'}),
]
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
# Specify extra CSS - overrides ALL other CSS (i.e. added last).
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
.introduction, .first { font-weight: bold; } \
.cross-head { font-weight: bold; font-size: 125%; } \
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
font-size: 80%; font-style: italic; margin: 1px auto; } \
.story-date, .published { font-size: 80%; } \
table { width: 100%; } \
td img { display: block; margin: 5px auto; } \
ul { padding-top: 10px; } \
ol { padding-top: 10px; } \
li { padding-top: 5px; padding-bottom: 5px; } \
h1 { font-size: 175%; font-weight: bold; } \
h2 { font-size: 150%; font-weight: bold; } \
h3 { font-size: 125%; font-weight: bold; } \
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
# Remove the line breaks and float left/right and picture width/height.
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
(re.compile(r'float:.*?'), lambda m: ''),
(re.compile(r'width:.*?px'), lambda m: ''),
(re.compile(r'height:.*?px'), lambda m: '')
]
def print_version(self, url):
url = re.sub(r'.html', '', url)
url = re.sub('/ArticleID/.*?/', '/Print.cfm?ArticleID=', url)
return url
# Need to change the user agent to avoid potential download errors
def get_browser(self, *args, **kwargs):
from calibre import browser
kwargs['user_agent'] = 'Mozilla/5.0 (Windows NT 5.1; rv:10.0) Gecko/20100101 Firefox/10.0'
return browser(*args, **kwargs)
def parse_index(self):
# Fetches the main page of Microwave and RF
soup = self.index_to_soup(self.site)
# Searches the site for Issue ID link then returns the href address
# pointing to the latest issue
latest_issue = soup.find('a', attrs={'href':lambda x: x and 'IssueID' in x}).get('href')
# Fetches the index page of the latest issue
soup = self.index_to_soup(latest_issue)
# Finds the main section of the page containing cover, issue date and
# TOC
ts = soup.find('div', attrs={'id':'columnContainer'})
# Finds the issue date
ds = ' '.join(self.tag_to_string(ts.find('span', attrs={'class':'CurrentIssueSectionHead'})).strip().split()[-2:]).capitalize()
self.log('Found Current Issue:', ds)
self.timefmt = ' [%s]'%ds
# Finds the cover image
cover = ts.find('img', src = lambda x: x and 'Cover' in x)
if cover is not None:
self.cover_url = self.site + cover['src']
self.log('Found Cover image:', self.cover_url)
feeds = []
article_info = []
# Finds all the articles (titles and links)
articles = ts.findAll('a', attrs={'class':'commonArticleTitle'})
# Finds all the descriptions
descriptions = ts.findAll('span', attrs={'class':'commonCopy'})
# Find all the sections
sections = ts.findAll('span', attrs={'class':'kicker'})
title_number = 0
# Goes through all the articles one by one and sorts them out
for section in sections:
title_number = title_number + 1
# Removes the unwanted sections
if self.tag_to_string(section) in self.exclude_sections:
continue
# Only includes the wanted sections
if self.include_sections:
if self.tag_to_string(section) not in self.include_sections:
continue
title = self.tag_to_string(articles[title_number])
url = articles[title_number].get('href')
if url.startswith('/'):
url = self.site + url
self.log('\tFound article:', title, 'at', url)
desc = self.tag_to_string(descriptions[title_number])
self.log('\t\t', desc)
article_info.append({'title':title, 'url':url, 'description':desc,
'date':self.timefmt})
if article_info:
feeds.append((self.title, article_info))
#self.log(feeds)
return feeds
def postprocess_html(self, soup, first):
if self.Convert_Grayscale:
#process all the images
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
if img < 0:
raise RuntimeError('Out of memory')
img.type = "GrayscaleType"
img.save(iurl)
return soup
def preprocess_html(self, soup):
# Includes all the figures inside the final ebook
# Finds all the jpg links
for figure in soup.findAll('a', attrs = {'href' : lambda x: x and 'jpg' in x}):
# makes sure that the link points to the absolute web address
if figure['href'].startswith('/'):
figure['href'] = self.site + figure['href']
figure.name = 'img' # converts the links to img
figure['src'] = figure['href'] # with the same address as href
figure['style'] = 'display:block' # renders the image on its own line
del figure['href']
del figure['target']
# Makes the title stand out
for title in soup.findAll('a', attrs = {'class': 'commonSectionTitle'}):
title.name = 'h1'
del title['href']
del title['target']
# Makes the section name more visible
for section_name in soup.findAll('a', attrs = {'class': 'kicker2'}):
section_name.name = 'h5'
del section_name['href']
del section_name['target']
# Removes all unrelated links
for link in soup.findAll('a', attrs = {'href': True}):
link.name = 'font'
del link['href']
del link['target']
return soup

View File

@ -0,0 +1,26 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1329123365(BasicNewsRecipe):
title = u'Novinite.bg'
__author__ = 'M3 Web'
description = 'Real time provider of the latest news from Bulgaria and the world'
category = 'Business, Politics, Society, Sports, Crime, Lifestyle, World, Health'
oldest_article = 7
max_articles_per_feed = 6
language = 'bg'
encoding = 'windows-1251'
no_stylesheets = True
remove_javascript = True
keep_only_tags = [dict(name='div', attrs={'id':'content'})]
remove_tags = [dict(name='div', attrs={'id':'text_options'}),
dict(name='div', attrs={'id':'social_shares_top'})]
remove_tags_after = dict(id='textsize')
feeds = [(u'Business', u'http://novinite.bg/rss.php?category_id=1'),
(u'Politics', u'http://novinite.bg/rss.php?category_id=2'),
(u'Society', u'http://novinite.bg/rss.php?category_id=3'),
(u'Sport', u'http://novinite.bg/rss.php?category_id=4'),
(u'Crime', u'http://novinite.bg/rss.php?category_id=5'),
(u'Lifestyle', u'http://novinite.bg/rss.php?category_id=6'),
(u'Health', u'http://novinite.bg/rss.php?category_id=7'),
(u'Other', u'http://novinite.bg/rss.php?category_id=10'),
(u'World', u'http://novinite.bg/rss.php?category_id=9')]

View File

@ -1,18 +1,18 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
pescanik.net
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Pescanik(BasicNewsRecipe):
title = 'Pescanik'
title = 'Peščanik'
__author__ = 'Darko Miletic'
description = 'Pescanik'
publisher = 'Pescanik'
description = 'Peščanik je udruženje građana osnovano 2006. godine. Glavni proizvod Peščanika je radio emisija koja je emitovana na Radiju B92 od 02.02.2000. do 16.06.2011, a od septembra 2011. se emituje na osam radio stanica u Srbiji, Crnoj Gori i BiH'
publisher = 'Peščanik'
category = 'news, politics, Serbia'
oldest_article = 10
max_articles_per_feed = 100
@ -20,8 +20,13 @@ class Pescanik(BasicNewsRecipe):
use_embedded_content = False
encoding = 'utf-8'
language = 'sr'
publication_type = 'newsportal'
extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body{font-family: Arial,"Lucida Grande",Tahoma,Verdana,sans1,sans-serif} .contentheading{font-size: x-large; font-weight: bold} .small{font-size: small} .createdate{font-size: x-small; font-weight: bold} '
publication_type = 'newsportal'
masthead_url = 'http://pescanik.net/wp-content/uploads/2011/10/logo1.png'
extra_css = """
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
body{font-family: Verdana,Arial,Tahoma,sans1,sans-serif}
#BlogTitle{font-size: xx-large; font-weight: bold}
"""
conversion_options = {
'comment' : description
@ -32,29 +37,12 @@ class Pescanik(BasicNewsRecipe):
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_attributes = ['valign','colspan','width','height','align','alt']
remove_tags = [dict(name=['object','link','meta','script'])]
keep_only_tags = [
dict(attrs={'class':['contentheading','small','createdate']})
,dict(name='td', attrs={'valign':'top','colspan':'2'})
]
feeds = [(u'Pescanik Online', u'http://www.pescanik.net/index.php?option=com_rd_rss&id=12')]
remove_tags = [dict(name=['object','link','meta','script','iframe','embed'])]
keep_only_tags = [dict(attrs={'id':['BlogTitle','BlogDate','BlogContent']})]
feeds = [
(u'Autori' , u'http://pescanik.net/category/autori/feed/'),
(u'Prevodi', u'http://pescanik.net/category/prevodi/feed/')
]
def print_version(self, url):
nurl = url.replace('/index.php','/index2.php')
return nurl + '&pop=1&page=0'
def preprocess_html(self, soup):
st = soup.findAll('td')
for it in st:
it.name='p'
for pt in soup.findAll('img'):
brtag = Tag(soup,'br')
brtag2 = Tag(soup,'br')
pt.append(brtag)
pt.append(brtag2)
return soup
return url + 'print/'

recipes/resurgence.recipe Normal file
View File

@ -0,0 +1,20 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
from calibre.web.feeds.news import BasicNewsRecipe
class TheResurgence(BasicNewsRecipe):
title = u'The Resurgence'
__author__ = 'Peter Grungi'
language = 'en'
oldest_article = 7
max_articles_per_feed = 10
auto_cleanup = True
cover_url = 'http://cdn.theresurgence.com/images/logo.png'
masthead_url = 'http://cdn.theresurgence.com/images/logo.png'
publisher = 'The Resurgence'
author = 'The Resurgence'
feeds = [(u'The Resurgence', u'http://feeds.theresurgence.com/TheResurgence?format=xml')]

View File

@ -10,6 +10,7 @@ class SHaber (BasicNewsRecipe):
oldest_article =2
max_articles_per_feed =100
no_stylesheets = True
auto_cleanup = True
#delay = 1
#use_embedded_content = False
encoding = 'utf-8'
@ -23,15 +24,15 @@ class SHaber (BasicNewsRecipe):
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
}
}
extra_css = ' .Haber-Baslik-Yazisi {font-weight: bold; font-size: 9px} .Haber-Ozet-Yazisi{ font-family:sans-serif;font-weight: normal;font-size: 11px } #Haber{ font-family:sans-serif;font-weight: normal;font-size: 9px }.KirmiziText{ font-weight: normal;font-size: 5px }' #.story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
#extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
keep_only_tags = [dict(name='div', attrs={'class':['Haber-Baslik-Yazisi','Haber-Ozet-Yazisi']}),dict(name='div', attrs={'id':['ctl00_ContentPlaceHolder1_imagenew','Haber']})]#,dict(name='h6', attrs={'class':['KirmiziText',]}) dict(name='div', attrs={'id':['Haber']}),dict(name='div', attrs={'id':['gallery']})]
#remove_tags = [dict(name='img', attrs={'src':[ 'http://medya.aksiyon.com.tr/aksiyon/images/logo/logo.bmp','/aksiyon/images/template/green/baslik0.gif','mobile/home.jpg']}) ],dict(name='h1', attrs={'class':['H1-Haber-DetayBasligi']}),dict(name='h4', attrs={'class':['BrownText']}) ,
#keep_only_tags = [dict(name='div', attrs={'class':['Haber-Baslik-Yazisi','Haber-Ozet-Yazisi']}),dict(name='div', attrs={'id':['ctl00_ContentPlaceHolder1_imagenew','Haber']})]#,dict(name='h6', attrs={'class':['KirmiziText',]}) dict(name='div', attrs={'id':['Haber']}),dict(name='div', attrs={'id':['gallery']})]
#remove_tags = [dict(name='div', attrs={'class':['Haber-Baslik-Yazisi']})]#attrs={'src':[ 'http://medya.aksiyon.com.tr/aksiyon/images/logo/logo.bmp','/aksiyon/images/template/green/baslik0.gif','mobile/home.jpg']}) ],dict(name='h1', attrs={'class':['H1-Haber-DetayBasligi']}),dict(name='h4', attrs={'class':['BrownText']}) ,
cover_img_url = 'http://static.samanyoluhaber.com/Images/resources/images/samanyoluhaber-yazi-logo.png'
masthead_url = 'http://static.samanyoluhaber.com/Images/resources/images/samanyoluhaber-yazi-logo.png'
cover_img_url = 'http://www.samanyoluhaber.com/include/logo.png'
masthead_url = 'http://www.samanyoluhaber.com/include/logo.png'
remove_empty_feeds= True
#remove_attributes = ['width','height']

View File

@ -1,115 +0,0 @@
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class Volkskrant_full(BasicNewsRecipe):
# This recipe will download the Volkskrant newspaper,
# from the subscribers site. It requires a password.
# Known issues are: articles that are spread out over
# multiple pages will appear multiple times. Pages
# that contain only adverts will appear, but empty.
# The supplement 'Volkskrant Magazine' on saturday
# is currently not downloaded.
# You can set a manual date, to download an archived
# newspaper. Volkskrant stores over a month at the
# moment of writing. To do so I suggest you unmark
# the date on the line below, and insert it in the title. Then
# follow the instructions marked further below.
title = 'De Volkskrant (subscription)' # [za, 13 nov 2010]'
__author__ = u'Selcal'
description = u"Volkskrant"
oldest_article = 30
max_articles_per_feed = 100
no_stylesheets = True
language = 'nl'
use_embedded_content = False
simultaneous_downloads = 1
delay = 1
needs_subscription = True
# Set RETRIEVEDATE to 'yyyymmdd' to load an older
# edition. Otherwise keep '%Y%m%d'
# When setting a manual date, unmark and add the date
# to the title above, and unmark the timefmt line to stop
# Calibre from adding today's date in addition.
# timefmt = ''
RETRIEVEDATE = strftime('%Y%m%d')
INDEX_MAIN = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/#text'
INDEX_ARTICLE = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/'
LOGIN = 'http://www.volkskrant.nl/vk/user/loggedIn.do'
remove_tags = [dict(name='address')]
cover_url = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/page.jpg'
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open(self.LOGIN)
br.select_form(nr = 0)
br['username'] = self.username
br['password'] = self.password
br.submit()
return br
def parse_index(self):
krant = []
def strip_title(_title):
i = 0
while ((_title[i] <> ":") and (i <= len(_title))):
i = i + 1
return(_title[0:i])
for temp in range (5):
try:
soup = self.index_to_soup(self.INDEX_MAIN)
break
except:
#print '(Retrying main index load)'
continue
mainsoup = soup.find('td', attrs={'id': 'select_page_top'})
for option in mainsoup.findAll('option'):
articles = []
_INDEX = 'http://www.volkskrant.nl/vk-online/VK/' + self.RETRIEVEDATE + '___/' + option['value'] + '/#text'
_INDEX_ARTICLE = 'http://www.volkskrant.nl/vk-online/VK/' + self.RETRIEVEDATE + '___/' + option['value'] + '/'
#print ''
#print '<------- Processing section: ' + _INDEX + ' ------------------------->'
for temp in range (5):
try:
soup = self.index_to_soup(_INDEX)
break
except:
#print '(Retrying index load)'
continue
for item in soup.findAll('area'):
art_nr = item['class']
attrname = art_nr[0:12] + '_section' + option['value'][0:5] + '_' + art_nr[26:len(art_nr)]
#print '==> Found: ' + attrname;
index_title = soup.find('div', attrs={'class': attrname})
get_title = index_title['title'];
_ARTICLE = _INDEX_ARTICLE + attrname + '.html#text'
title = get_title;
#print '--> Title: ' + title;
#print '--> URL: ' + _ARTICLE;
for temp in range (5):
try:
souparticle = self.index_to_soup(_ARTICLE);
break
except:
print '(Retrying URL load)'
continue
headerurl = souparticle.findAll('frame')[0]['src'];
#print '--> Read frame name for header: ' + headerurl;
url = _INDEX_ARTICLE + headerurl[0:len(headerurl)-12] + '_text.html';
#print '--> Corrected URL: ' + url;
if (get_title <> ''):
title = strip_title(get_title)
date = strftime(' %B %Y')
if (title <> ''):
articles.append({
'title' :title
,'date' :date
,'url' :url
,'description':''
})
krant.append( (option.string, articles))
return krant

View File

@ -5,7 +5,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, re, subprocess, hashlib, shutil, glob, stat, sys, time
import os, subprocess, hashlib, shutil, glob, stat, sys, time
from subprocess import check_call
from tempfile import NamedTemporaryFile, mkdtemp
from zipfile import ZipFile
@ -64,15 +64,11 @@ class ReUpload(Command): # {{{
# Data {{{
def get_google_data():
PASSWORD_FILE = os.path.expanduser('~/.googlecodecalibre')
OFFLINEIMAP = os.path.expanduser('~/work/kde/conf/offlineimap/rc')
with open(os.path.expanduser('~/.googlecodecalibre'), 'rb') as f:
gc_password, ga_un, pw = f.read().strip().split('|')
gc_password = open(PASSWORD_FILE).read().strip()
raw = open(OFFLINEIMAP).read()
pw = re.search(r'(?s)remoteuser = .*@gmail.com.*?remotepass = (\S+)',
raw).group(1).strip()
return {
'username':'kovidgoyal@gmail.com', 'password':pw, 'gc_password':gc_password,
'username':ga_un, 'password':pw, 'gc_password':gc_password,
'path_map_server':'root@kovidgoyal.net',
'path_map_location':'/var/www/status.calibre-ebook.com/googlepaths',
# If you change this remember to change it in the

View File

@ -350,20 +350,20 @@ def get_proxy_info(proxy_scheme, proxy_string):
USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'
def random_user_agent():
def random_user_agent(choose=None):
choices = [
'Mozilla/5.0 (Windows NT 5.2; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20110814 Firefox/6.0',
'Mozilla/5.0 (Windows NT 6.2; rv:9.0.1) Gecko/20100101 Firefox/9.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.78 Safari/532.5',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
]
#return choices[-1]
return choices[random.randint(0, len(choices)-1)]
if choose is None:
choose = random.randint(0, len(choices)-1)
return choices[choose]
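A usage sketch (not part of this diff) of the new choose parameter, with browser() and random_user_agent() imported from the calibre package as elsewhere in this commit:

from calibre import browser, random_user_agent

# Pick a deterministic user agent (index 0 into the choices list) instead of a
# random one, e.g. to reproduce a site-specific download problem while testing.
ua = random_user_agent(choose=0)
br = browser(user_agent=ua)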
def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
'''

View File

@ -70,11 +70,13 @@ class ANDROID(USBMS):
0xd12e : [0x0100],
0xe14f : [0x0226],
0x614f : [0x0226, 0x100],
0x6156 : [0x0226, 0x100],
},
# Google
0x18d1 : {
0x0001 : [0x0223, 0x9999],
0x0003 : [0x0230],
0x4e11 : [0x0100, 0x226, 0x227],
0x4e12 : [0x0100, 0x226, 0x227],
0x4e21 : [0x0100, 0x226, 0x227, 0x231],
@ -101,6 +103,7 @@ class ANDROID(USBMS):
0xc001 : [0x0226],
0xc004 : [0x0226],
0x8801 : [0x0226, 0x0227],
0xe115 : [0x0216], # PocketBook A10
},
# Acer
@ -165,7 +168,8 @@ class ANDROID(USBMS):
'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS',
'TELECHIP', 'HUAWEI', 'T-MOBILE', 'SEMC', 'LGE', 'NVIDIA',
'GENERIC-', 'ZTE', 'MID', 'QUALCOMM', 'PANDIGIT', 'HYSTON',
'VIZIO', 'GOOGLE', 'FREESCAL', 'KOBO_INC', 'LENOVO', 'ROCKCHIP']
'VIZIO', 'GOOGLE', 'FREESCAL', 'KOBO_INC', 'LENOVO', 'ROCKCHIP',
'POCKET', 'ONDA_MID']
WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
'__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
@ -179,13 +183,14 @@ class ANDROID(USBMS):
'ALPANDIGITAL', 'ANDROID_MID', 'VTAB1008', 'EMX51_BBG_ANDROI',
'UMS', '.K080', 'P990', 'LTE', 'MB853', 'GT-S5660_CARD', 'A107',
'GT-I9003_CARD', 'XT912', 'FILE-CD_GADGET', 'RK29_SDK', 'MB855',
'XT910']
'XT910', 'BOOK_A10', 'USB_2.0_DRIVER']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
'__UMS_COMPOSITE', 'SGH-I997_CARD', 'MB870', 'ALPANDIGITAL',
'ANDROID_MID', 'P990_SD_CARD', '.K080', 'LTE_CARD', 'MB853',
'A1-07___C0541A4F', 'XT912', 'MB855', 'XT910']
'A1-07___C0541A4F', 'XT912', 'MB855', 'XT910', 'BOOK_A10_CARD',
'USB_2.0_DRIVER']
OSX_MAIN_MEM = 'Android Device Main Memory'

View File

@ -19,7 +19,12 @@ class APNXBuilder(object):
Create an APNX file using a pseudo page mapping.
'''
def write_apnx(self, mobi_file_path, apnx_path, accurate=True):
def write_apnx(self, mobi_file_path, apnx_path, accurate=True, page_count=0):
'''
If you want a fixed number of pages (such as from a custom column) then
pass in a value to page_count, otherwise a count will be estimated
using either the fast or accurate algorithm.
'''
# Check that this is really a MOBI file.
with open(mobi_file_path, 'rb') as mf:
ident = PdbHeaderReader(mf).identity()
@ -28,16 +33,19 @@ class APNXBuilder(object):
# Get the pages depending on the chosen parser
pages = []
if accurate:
try:
pages = self.get_pages_accurate(mobi_file_path)
except:
# Fall back to the fast parser if we can't
# use the accurate one. Typically this is
# due to the file having DRM.
pages = self.get_pages_fast(mobi_file_path)
if page_count:
pages = self.get_pages_exact(mobi_file_path, page_count)
else:
pages = self.get_pages_fast(mobi_file_path)
if accurate:
try:
pages = self.get_pages_accurate(mobi_file_path)
except:
# Fall back to the fast parser if we can't
# use the accurate one. Typically this is
# due to the file having DRM.
pages = self.get_pages_fast(mobi_file_path)
else:
pages = self.get_pages_fast(mobi_file_path)
if not pages:
raise Exception(_('Could not generate page mapping.'))
@ -77,6 +85,31 @@ class APNXBuilder(object):
return apnx
def get_pages_exact(self, mobi_file_path, page_count):
'''
Given a specified page count (such as from a custom column),
create our array of pages for the apnx file by dividing the
book's text length evenly by the page count.
'''
pages = []
count = 0
with open(mobi_file_path, 'rb') as mf:
phead = PdbHeaderReader(mf)
r0 = phead.section_data(0)
text_length = struct.unpack('>I', r0[4:8])[0]
chars_per_page = int(text_length / page_count)
while count < text_length:
pages.append(count)
count += chars_per_page
if len(pages) > page_count:
# Rounding created extra page entries
pages = pages[:page_count]
return pages
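A usage sketch of the new page_count argument (not part of this diff; the module path and file names are assumptions):

from calibre.devices.kindle.apnx import APNXBuilder

# Force an exact page map instead of the fast/accurate estimate, e.g. using a
# page count taken from a custom column. For a book with 384,000 characters of
# text and page_count=320, get_pages_exact() yields 320 offsets spaced 1200 apart.
APNXBuilder().write_apnx('/tmp/book.mobi', '/tmp/book.apnx',
accurate=True, page_count=320)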
def get_pages_fast(self, mobi_file_path):
'''
2300 characters of uncompressed text per page. This is

View File

@ -302,19 +302,28 @@ class KINDLE2(KINDLE):
' this information to the Kindle when uploading MOBI files by'
' USB. Note that the page numbers do not correspond to any paper'
' book.'),
_('Use slower but more accurate page number generation') +
_('Use slower but more accurate page number calculation') +
':::' +
_('There are two ways to generate the page number information. Using the more accurate '
'generator will produce pages that correspond better to a printed book. '
'However, this method is slower and will slow down sending files '
'to the Kindle.'),
_('Custom column name to retrieve page counts from') +
':::' +
_('If you have a custom column in your library that you use to '
'store the page count of books, you can have calibre use that '
'information, instead of calculating a page count. Specify the '
'name of the custom column here, for example, #pages. '),
]
EXTRA_CUSTOMIZATION_DEFAULT = [
True,
False,
'',
]
OPT_APNX = 0
OPT_APNX_ACCURATE = 1
OPT_APNX_CUST_COL = 2
def books(self, oncard=None, end_session=True):
bl = USBMS.books(self, oncard=oncard, end_session=end_session)
@ -380,10 +389,20 @@ class KINDLE2(KINDLE):
if not os.path.exists(path):
os.makedirs(path)
cust_col_name = opts.extra_customization[self.OPT_APNX_CUST_COL]
custom_page_count = 0
if cust_col_name:
try:
custom_page_count = int(metadata.get(cust_col_name, 0))
except:
pass
apnx_path = '%s.apnx' % os.path.join(path, filename)
apnx_builder = APNXBuilder()
try:
apnx_builder.write_apnx(filepath, apnx_path, accurate=opts.extra_customization[self.OPT_APNX_ACCURATE])
apnx_builder.write_apnx(filepath, apnx_path,
accurate=opts.extra_customization[self.OPT_APNX_ACCURATE],
page_count=custom_page_count)
except:
print 'Failed to generate APNX'
import traceback

View File

@ -160,6 +160,7 @@ def render_cover(opf, opf_path, zf, reader=None):
with open(cpage, 'r+b') as f:
raw = f.read()
f.truncate(0)
f.seek(0)
raw = ffpat.sub(b'', raw)
f.write(raw)
from calibre.ebooks.chardet import xml_to_unicode
@ -174,6 +175,7 @@ def render_cover(opf, opf_path, zf, reader=None):
with open(path, 'r+b') as f:
raw = f.read()
f.truncate(0)
f.seek(0)
raw = ffpat.sub(b'', raw)
f.write(raw)

View File

@ -19,7 +19,7 @@ def get_metadata(stream):
return get_metadata_(src)
def get_meta_regexp_(name):
return re.compile('<meta name=[\'"]' + name + '[\'"] content=[\'"](.+?)[\'"]\s*/?>', re.IGNORECASE)
return re.compile('<meta name=[\'"]' + name + r'[\'"]\s+content=[\'"](.+?)[\'"]\s*/?>', re.IGNORECASE)
def get_metadata_(src, encoding=None):
if not isinstance(src, unicode):
@ -34,6 +34,7 @@ def get_metadata_(src, encoding=None):
# Title
title = None
pat = re.compile(r'<!--.*?TITLE=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
src = src[:150000] # Searching shouldn't take too long
match = pat.search(src)
if match:
title = match.group(2)

View File

@ -233,7 +233,7 @@ def forked_read_metadata(path, tdir):
f.write(mi.cover_data[1])
mi.cover_data = (None, None)
mi.cover = 'cover.jpg'
opf = metadata_to_opf(mi)
opf = metadata_to_opf(mi, default_lang='und')
with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f:
f.write(opf)

View File

@ -13,7 +13,7 @@ from threading import Thread
from Queue import Queue, Empty
from calibre import as_unicode
from calibre import as_unicode, random_user_agent
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
fixauthors)
@ -174,8 +174,8 @@ class Worker(Thread): # Get details {{{
def get_details(self):
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.soupparser import fromstring
from calibre.ebooks.chardet import xml_to_unicode
import html5lib
try:
raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
@ -202,7 +202,8 @@ class Worker(Thread): # Get details {{{
return
try:
root = fromstring(clean_ascii_chars(raw))
root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
namespaceHTMLElements=False)
except:
msg = 'Failed to parse amazon details page: %r'%self.url
self.log.exception(msg)
@ -356,33 +357,46 @@ class Worker(Thread): # Get details {{{
if m is not None:
return float(m.group(1))/float(m.group(3)) * 5
def parse_comments(self, root):
def _render_comments(self, desc):
from calibre.library.comments import sanitize_comments_html
for c in desc.xpath('descendant::noscript'):
c.getparent().remove(c)
for c in desc.xpath('descendant::*[@class="seeAll" or'
' @class="emptyClear" or @id="collapsePS" or'
' @id="expandPS"]'):
c.getparent().remove(c)
for a in desc.xpath('descendant::a[@href]'):
del a.attrib['href']
a.tag = 'span'
desc = self.tostring(desc, method='html', encoding=unicode).strip()
# Encoding bug in Amazon data U+fffd (replacement char)
# in some examples it is present in place of '
desc = desc.replace('\ufffd', "'")
# remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# Collapse whitespace
#desc = re.sub('\n+', '\n', desc)
#desc = re.sub(' +', ' ', desc)
# Remove the notice about text referring to out of print editions
desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
# Remove comments
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
return sanitize_comments_html(desc)
def parse_comments(self, root):
ans = ''
desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
if desc:
ans = self._render_comments(desc[0])
desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
if desc:
desc = desc[0]
for c in desc.xpath('descendant::*[@class="seeAll" or'
' @class="emptyClear"]'):
c.getparent().remove(c)
for a in desc.xpath('descendant::a[@href]'):
del a.attrib['href']
a.tag = 'span'
desc = self.tostring(desc, method='html', encoding=unicode).strip()
# Encoding bug in Amazon data U+fffd (replacement char)
# in some examples it is present in place of '
desc = desc.replace('\ufffd', "'")
# remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# Collapse whitespace
#desc = re.sub('\n+', '\n', desc)
#desc = re.sub(' +', ' ', desc)
# Remove the notice about text referring to out of print editions
desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
# Remove comments
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
return sanitize_comments_html(desc)
ans += self._render_comments(desc[0])
return ans
def parse_cover(self, root):
imgs = root.xpath('//img[@id="prodImage" and @src]')
@ -467,6 +481,28 @@ class Amazon(Source):
Source.__init__(self, *args, **kwargs)
self.set_amazon_id_touched_fields()
def test_fields(self, mi):
'''
Return the first field from self.touched_fields that is null on the
mi object
'''
for key in self.touched_fields:
if key.startswith('identifier:'):
key = key.partition(':')[-1]
if key == 'amazon':
if self.domain != 'com':
key += '_' + self.domain
if not mi.has_identifier(key):
return 'identifier: ' + key
elif mi.is_null(key):
return key
@property
def user_agent(self):
# Pass in an index to random_user_agent() to test with a particular
# user agent
return random_user_agent()
def save_settings(self, *args, **kwargs):
Source.save_settings(self, *args, **kwargs)
self.set_amazon_id_touched_fields()
@ -507,6 +543,9 @@ class Amazon(Source):
@property
def domain(self):
x = getattr(self, 'testing_domain', None)
if x is not None:
return x
domain = self.prefs['domain']
if domain not in self.AMAZON_DOMAINS:
domain = 'com'
@ -599,16 +638,52 @@ class Amazon(Source):
return url
# }}}
def parse_results_page(self, root): # {{{
from lxml.html import tostring
matches = []
def title_ok(title):
title = title.lower()
for x in ('bulk pack', '[audiobook]', '[audio cd]'):
if x in title:
return False
return True
for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
for a in div.xpath(r'descendant::a[@class="title" and @href]'):
title = tostring(a, method='text', encoding=unicode)
if title_ok(title):
matches.append(a.get('href'))
break
if not matches:
# This can happen for some user agents that Amazon thinks are
# mobile/less capable
for td in root.xpath(
r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'):
for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'):
title = tostring(a, method='text', encoding=unicode)
if title_ok(title):
matches.append(a.get('href'))
break
# Keep only the top 5 matches as the matches are sorted by relevance by
# Amazon so lower matches are not likely to be very relevant
return matches[:5]
# }}}
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
identifiers={}, timeout=30):
'''
Note this method will retry without identifiers automatically if no
match is found with identifiers.
'''
from lxml.html import tostring
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.soupparser import fromstring
from calibre.ebooks.chardet import xml_to_unicode
from lxml.html import tostring
import html5lib
query, domain = self.create_query(log, title=title, authors=authors,
identifiers=identifiers)
@ -616,6 +691,8 @@ class Amazon(Source):
log.error('Insufficient metadata to construct query')
return
br = self.browser
if getattr(self, 'running_a_test', False):
print ('Using user agent for amazon: %s'%self.user_agent)
try:
raw = br.open_novisit(query, timeout=timeout).read().strip()
except Exception as e:
@ -634,15 +711,23 @@ class Amazon(Source):
return as_unicode(msg)
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
raw = clean_ascii_chars(xml_to_unicode(raw,
strip_encoding_pats=True, resolve_entities=True)[0])
if getattr(self, 'running_a_test', False):
import tempfile
with tempfile.NamedTemporaryFile(prefix='amazon_results_',
suffix='.html', delete=False) as f:
f.write(raw.encode('utf-8'))
print ('Downloaded html for results page saved in', f.name)
matches = []
found = '<title>404 - ' not in raw
if found:
try:
root = fromstring(clean_ascii_chars(raw))
root = html5lib.parse(raw, treebuilder='lxml',
namespaceHTMLElements=False)
except:
msg = 'Failed to parse amazon page for query: %r'%query
log.exception(msg)
@ -655,30 +740,9 @@ class Amazon(Source):
# The error is almost always a not found error
found = False
if found:
for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
for a in div.xpath(r'descendant::a[@class="title" and @href]'):
title = tostring(a, method='text', encoding=unicode).lower()
if 'bulk pack' not in title:
matches.append(a.get('href'))
break
if not matches:
# This can happen for some user agents that Amazon thinks are
# mobile/less capable
log('Trying alternate results page markup')
for td in root.xpath(
r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'):
for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'):
title = tostring(a, method='text', encoding=unicode).lower()
if ('bulk pack' not in title and '[audiobook]' not in
title and '[audio cd]' not in title):
matches.append(a.get('href'))
break
# Keep only the top 5 matches as the matches are sorted by relevance by
# Amazon so lower matches are not likely to be very relevant
matches = matches[:5]
matches = self.parse_results_page(root)
if abort.is_set():
return
@ -686,7 +750,7 @@ class Amazon(Source):
if not matches:
if identifiers and title and authors:
log('No matches found with identifiers, retrying using only'
' title and authors')
' title and authors. Query: %r'%query)
return self.identify(log, result_queue, abort, title=title,
authors=authors, timeout=timeout)
log.error('No matches found with query: %r'%query)
@ -756,9 +820,18 @@ if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug -e
# src/calibre/ebooks/metadata/sources/amazon.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
isbn_test, title_test, authors_test)
isbn_test, title_test, authors_test, comments_test)
com_tests = [ # {{{
( # Different comments markup, using Book Description section
{'identifiers':{'amazon':'0982514506'}},
[title_test(
"Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy"
, exact=True),
comments_test('Jelena'), comments_test('Leslie'),
]
),
        ( # '#' character in the title
{'title':'Expert C# 2008 Business Objects',
'authors':['Lhotka']},
@ -850,7 +923,17 @@ if __name__ == '__main__': # tests {{{
),
] # }}}
test_identify_plugin(Amazon.name, com_tests)
#test_identify_plugin(Amazon.name, de_tests)
def do_test(domain, start=0, stop=None):
tests = globals().get(domain+'_tests')
if stop is None:
stop = len(tests)
tests = tests[start:stop]
test_identify_plugin(Amazon.name, tests, modify_plugin=lambda
p:setattr(p, 'testing_domain', domain))
do_test('com')
#do_test('de')
# }}}
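
The link extraction that used to live inline in identify() now sits in parse_results_page(): it drops bulk packs and audio editions and keeps only the five most relevant hits. A minimal standalone sketch of that filtering idea (the sample titles and hrefs below are invented for illustration, not real Amazon results):

# Minimal standalone sketch of the filtering done by parse_results_page();
# the candidate list is invented sample data, not real Amazon results.
def title_ok(title):
    title = title.lower()
    for marker in ('bulk pack', '[audiobook]', '[audio cd]'):
        if marker in title:
            return False
    return True

candidates = [
    ("Griffin's Destiny: Book Three", '/gp/product/0982514506'),
    ("Griffin's Destiny [Audiobook]", '/gp/product/B00FAKE000'),
    ('Some Title Bulk Pack of 24', '/gp/product/B00FAKE001'),
]
matches = [href for title, href in candidates if title_ok(title)]
# Amazon sorts results by relevance, so only the top 5 are worth keeping.
matches = matches[:5]
print(matches)  # -> ['/gp/product/0982514506']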

View File

@ -253,10 +253,16 @@ class Source(Plugin):
# Browser {{{
@property
def user_agent(self):
# Pass in an index to random_user_agent() to test with a particular
# user agent
return random_user_agent()
@property
def browser(self):
if self._browser is None:
self._browser = browser(user_agent=random_user_agent())
self._browser = browser(user_agent=self.user_agent)
if self.supports_gzip_transfer_encoding:
self._browser.set_handle_gzip(True)
return self._browser.clone_browser()
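
Routing browser() through the user_agent property means a subclass (or a test) can change the agent in one place and every request the source makes picks it up; per the comment above, random_user_agent() also accepts an index when a specific agent is wanted. A standalone sketch of the pattern (these classes are illustrative, not calibre's):

# Illustrative classes only: the point is that the browser is always built
# from self.user_agent, so overriding one property is enough.
class Base(object):
    @property
    def user_agent(self):
        return 'Mozilla/5.0 (random agent would go here)'

    def browser(self):
        return 'browser using: ' + self.user_agent

class PinnedAgent(Base):
    @property
    def user_agent(self):
        # A test can pin a known agent for reproducible results.
        return 'Mozilla/5.0 (pinned test agent)'

print(PinnedAgent().browser())  # -> browser using: Mozilla/5.0 (pinned test agent)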

View File

@ -84,6 +84,16 @@ def series_test(series, series_index):
return test
def comments_test(sentinel):
def test(mi):
comm = mi.comments.lower() if mi.comments else ''
if sentinel and sentinel.lower() in comm:
return True
prints('comments test failed. %s not in comments'%sentinel)
return False
return test
def init_test(tdir_name):
tdir = tempfile.gettempdir()
lf = os.path.join(tdir, tdir_name.replace(' ', '')+'_identify_test.txt')
@ -157,7 +167,7 @@ def test_identify(tests): # {{{
# }}}
def test_identify_plugin(name, tests): # {{{
def test_identify_plugin(name, tests, modify_plugin=lambda plugin:None): # {{{
'''
:param name: Plugin name
:param tests: List of 2-tuples. Each two tuple is of the form (args,
@ -171,6 +181,7 @@ def test_identify_plugin(name, tests): # {{{
if x.name == name:
plugin = x
break
modify_plugin(plugin)
prints('Testing the identify function of', plugin.name)
prints('Using extra headers:', plugin.browser.addheaders)
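
comments_test() builds a predicate that passes when its sentinel substring occurs in the downloaded comments, and the new modify_plugin hook lets a caller tweak the plugin (for example set testing_domain) before identification starts. A rough usage sketch mirroring the amazon.py tests above; running it performs live lookups, so it is only meant to show the call shapes:

# Rough usage sketch of the updated test harness (mirrors the amazon.py
# tests above; running it would hit Amazon over the network).
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
        title_test, comments_test)
from calibre.ebooks.metadata.sources.amazon import Amazon

tests = [
    ({'identifiers': {'amazon': '0982514506'}},
     [title_test("Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy",
         exact=True),
      comments_test('Jelena'), comments_test('Leslie')]),
]
test_identify_plugin(Amazon.name, tests,
        modify_plugin=lambda p: setattr(p, 'testing_domain', 'com'))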

View File

@ -15,8 +15,8 @@ from lxml.etree import XPath as _XPath
from lxml import etree
from lxml.cssselect import CSSSelector
from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \
urldefrag, rewrite_links, urlunquote, barename, XHTML
from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
urldefrag, rewrite_links, urlunquote, barename, XHTML, urlnormalize)
from calibre.ebooks.epub import rules
XPath = functools.partial(_XPath, namespaces=NAMESPACES)
@ -159,6 +159,7 @@ class Split(object):
except ValueError:
# Unparseable URL
return url
href = urlnormalize(href)
if href in self.map:
anchor_map = self.map[href]
nhref = anchor_map[frag if frag else None]
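
The added urlnormalize(href) call makes the anchor_map lookup tolerant of equivalent spellings of the same link target (for example a literal space versus %20), since both now normalize to the same key. A small standalone illustration of the idea; the local normalize() below stands in for calibre's urlnormalize:

# Standalone illustration; normalize() stands in for calibre's urlnormalize
# and simply re-quotes an href into one canonical spelling.
try:
    from urllib import quote, unquote          # Python 2, as used elsewhere here
except ImportError:
    from urllib.parse import quote, unquote    # Python 3 fallback

def normalize(href):
    return quote(unquote(href), safe='/')

anchor_map = {normalize('chapter%20one.html'): 'chapter_one_split_0.html'}
for href in ('chapter one.html', 'chapter%20one.html'):
    # Both spellings resolve to the same split file after normalization.
    print(anchor_map[normalize(href)])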

View File

@ -69,11 +69,12 @@ def pdftohtml(output_dir, pdf_path, no_images):
raise
logf.flush()
logf.close()
out = open(logf.name, 'rb').read()
out = open(logf.name, 'rb').read().strip()
if ret != 0:
raise ConversionError(out)
print "pdftohtml log:"
print out
if out:
print "pdftohtml log:"
print out
if not os.path.exists(index) or os.stat(index).st_size < 100:
raise DRMError()
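
The pdftohtml change only avoids printing an empty log, but the surrounding pattern (capture the converter's output to a temp file, fail with its contents on a non-zero exit, echo it only when non-empty) can be sketched like this, with a trivial child process standing in for pdftohtml:

# Sketch of the log-handling pattern above; a harmless child process
# stands in for pdftohtml.
import os, sys, subprocess, tempfile

logf = tempfile.NamedTemporaryFile(suffix='_log.txt', delete=False)
ret = subprocess.call([sys.executable, '-c', "print('sample converter output')"],
        stdout=logf, stderr=logf)
logf.close()
out = open(logf.name).read().strip()
os.remove(logf.name)
if ret != 0:
    raise RuntimeError(out)
if out:
    print('converter log:')
    print(out)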

View File

@ -26,7 +26,7 @@ class Worker(Thread):
self.wake_up = Event()
self.path, self.callback = path, callback
self.staging = set()
self.be = frozenset(BOOK_EXTENSIONS)
self.be = frozenset(BOOK_EXTENSIONS) - {'pdr', 'mbp', 'tan'}
def run(self):
self.tdir = PersistentTemporaryDirectory('_auto_adder')
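
Removing 'pdr', 'mbp' and 'tan' from the accepted set up front keeps the auto-adder from picking up files with those extensions while the per-file membership test stays cheap. A standalone sketch of the same check (the short extension list stands in for calibre's BOOK_EXTENSIONS):

# Standalone sketch; this short list stands in for calibre's BOOK_EXTENSIONS.
BOOK_EXTENSIONS = ['epub', 'mobi', 'azw', 'pdf', 'pdr', 'mbp', 'tan']
allowed = frozenset(BOOK_EXTENSIONS) - {'pdr', 'mbp', 'tan'}

for name in ('novel.epub', 'novel.mbp', 'notes.tan'):
    ext = name.rpartition('.')[-1].lower()
    print('%s -> %s' % (name, 'add' if ext in allowed else 'skip'))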

View File

@ -82,6 +82,7 @@ class ConfigWidget(QWidget, Ui_ConfigWidget):
self.opt_extra_customization.append(QLineEdit(self))
l = QLabel(label_text)
l.setToolTip(tt)
self.opt_extra_customization[i].setToolTip(tt)
l.setBuddy(self.opt_extra_customization[i])
l.setWordWrap(True)
self.opt_extra_customization[i].setText(settings.extra_customization[i])

View File

@ -131,9 +131,16 @@ class Metadata(QLabel):
class DoubleSpinBox(QDoubleSpinBox):
def __init__(self, *args, **kwargs):
QDoubleSpinBox.__init__(self, *args, **kwargs)
self.tt = _('Position in book')
self.setToolTip(self.tt)
def set_value(self, val):
self.blockSignals(True)
self.setValue(val)
self.setToolTip(self.tt +
' [{0:.0%}]'.format(float(val)/self.maximum()))
self.blockSignals(False)
class HelpfulLineEdit(QLineEdit):
@ -197,7 +204,6 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
self.metadata = Metadata(self)
self.pos = DoubleSpinBox()
self.pos.setDecimals(1)
self.pos.setToolTip(_('Position in book'))
self.pos.setSuffix('/'+_('Unknown')+' ')
self.pos.setMinimum(1.)
self.pos.setMinimumWidth(150)
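
With this change the position spin box's tooltip carries the current location as a percentage of the maximum, via '{0:.0%}'.format(float(val)/self.maximum()). The formatting itself is easy to check outside Qt:

# Quick check of the tooltip formatting used above (no Qt required).
tt = 'Position in book'
val, maximum = 12.5, 250.0
print(tt + ' [{0:.0%}]'.format(float(val) / maximum))  # -> Position in book [5%]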

View File

@ -61,4 +61,13 @@ def generate_test_db(library_path, # {{{
print 'Time per record:', t/float(num_of_records)
# }}}
def current_library_name():
from calibre.utils.config import prefs
import posixpath
path = prefs['library_path']
if path:
path = path.replace('\\', '/')
while path.endswith('/'):
path = path[:-1]
return posixpath.basename(path)
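
current_library_name() reduces the configured library path to its last component, after unifying separators and trailing slashes so Windows-style paths behave the same as POSIX ones. A standalone sketch of the same steps (the sample path is made up):

# Standalone sketch of the same normalization; the sample path is made up.
import posixpath

def library_name(path):
    if not path:
        return None
    path = path.replace('\\', '/')      # unify Windows separators
    while path.endswith('/'):           # drop any trailing slashes
        path = path[:-1]
    return posixpath.basename(path)

print(library_name('C:\\Books\\My Library\\'))  # -> My Library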

View File

@ -136,6 +136,7 @@ def sanitize_comments_html(html):
text = html2text(html)
md = markdown.Markdown(safe_mode=True)
cleansed = re.sub('\n+', '', md.convert(text))
cleansed = cleansed.replace(markdown.HTML_REMOVED_TEXT, '')
return cleansed
def test():
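
For the comments cleanup above: in safe mode the bundled markdown module replaces raw HTML it strips out with a placeholder token, exposed as markdown.HTML_REMOVED_TEXT, and the added line removes that token from the cleaned text. A rough illustration with a stand-in token (the real constant comes from the markdown package calibre bundles):

# Rough illustration; HTML_REMOVED_TEXT here is a stand-in for the constant
# exposed by the markdown module bundled with calibre.
import re

HTML_REMOVED_TEXT = '[HTML_REMOVED]'

converted = '<p>A fine book. ' + HTML_REMOVED_TEXT + 'Highly recommended.</p>\n\n'
cleansed = re.sub('\n+', '', converted)
cleansed = cleansed.replace(HTML_REMOVED_TEXT, '')
print(cleansed)  # -> <p>A fine book. Highly recommended.</p>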

View File

@ -25,7 +25,7 @@ clean:
html:
mkdir -p .build/html .build/doctrees
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) .build/html
$(SPHINXBUILD) -b html -t online $(ALLSPHINXOPTS) .build/html
@echo
@echo "Build finished. The HTML pages are in .build/html."

View File

@ -17,7 +17,7 @@ To get started with more advanced usage, you should read about the :ref:`Graphic
.. only:: online
An ebook version of this user manual is available in `EPUB format <calibre.epub>`_.
**An ebook version of this user manual is available in** `EPUB format <calibre.epub>`_.
Sections
------------

View File

@ -134,6 +134,7 @@ _extra_lang_codes = {
'en_CZ' : _('English (Czech Republic)'),
'en_PK' : _('English (Pakistan)'),
'en_HR' : _('English (Croatia)'),
'en_HK' : _('English (Hong Kong)'),
'en_ID' : _('English (Indonesia)'),
'en_IL' : _('English (Israel)'),
'en_RU' : _('English (Russia)'),