KG updates

This commit is contained in:
GRiker 2011-05-13 04:31:19 -06:00
commit 0bf6badddd
25 changed files with 1079 additions and 481 deletions

View File

@ -3,7 +3,6 @@ __license__ = 'GPL v3'
'''
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds import Feed
class ReadersDigest(BasicNewsRecipe):
@ -38,151 +37,20 @@ class ReadersDigest(BasicNewsRecipe):
'''
remove_tags = [
dict(name='h4', attrs={'class':'close'}),
dict(name='div', attrs={'class':'fromLine'}),
dict(name='img', attrs={'class':'colorTag'}),
dict(name='div', attrs={'id':'sponsorArticleHeader'}),
dict(name='div', attrs={'class':'horizontalAd'}),
dict(name='div', attrs={'id':'imageCounterLeft'}),
dict(name='div', attrs={'id':'commentsPrint'})
]
feeds = [
('New in RD', 'http://feeds.rd.com/ReadersDigest'),
('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'),
('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'),
('Blogs','http://feeds.rd.com/ReadersDigestBlogs')
('Food', 'http://www.rd.com/food/feed'),
('Health', 'http://www.rd.com/health/feed'),
('Home', 'http://www.rd.com/home/feed'),
('Family', 'http://www.rd.com/family/feed'),
('Money', 'http://www.rd.com/money/feed'),
('Travel', 'http://www.rd.com/travel/feed'),
]
cover_url = 'http://www.rd.com/images/logo-main-rd.gif'
#-------------------------------------------------------------------------------------------------
def print_version(self, url):
# Get the identity number of the current article and append it to the root print URL
if url.find('/article') > 0:
ident = url[url.find('/article')+8:url.find('.html?')-4]
url = 'http://www.rd.com/content/printContent.do?contentId=' + ident
elif url.find('/post') > 0:
# in this case, have to get the page itself to derive the Print page.
soup = self.index_to_soup(url)
newsoup = soup.find('ul',attrs={'class':'printBlock'})
url = 'http://www.rd.com' + newsoup('a')[0]['href']
url = url[0:url.find('&Keep')]
return url
#-------------------------------------------------------------------------------------------------
def parse_index(self):
pages = [
('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}),
# useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}),
('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'})
keep_only_tags = dict(id='main-content')
remove_tags = [
{'class':['post-categories']},
]
feeds = []
for page in pages:
section, url, divider, attrList = page
newArticles = self.page_parse(url, divider, attrList)
feeds.append((section,newArticles))
# after the pages of the site have been processed, parse several RSS feeds for additional sections
newfeeds = Feed()
newfeeds = self.parse_rss()
# The utility code in parse_rss returns a Feed object. Convert each feed/article combination into a form suitable
# for this module (parse_index).
for feed in newfeeds:
newArticles = []
for article in feed.articles:
newArt = {
'title' : article.title,
'url' : article.url,
'date' : article.date,
'description' : article.text_summary
}
newArticles.append(newArt)
# New and Blogs should be the first two feeds.
if feed.title == 'New in RD':
feeds.insert(0,(feed.title,newArticles))
elif feed.title == 'Blogs':
feeds.insert(1,(feed.title,newArticles))
else:
feeds.append((feed.title,newArticles))
return feeds
#-------------------------------------------------------------------------------------------------
def page_parse(self, mainurl, divider, attrList):
articles = []
mainsoup = self.index_to_soup(mainurl)
for item in mainsoup.findAll(attrs=attrList):
newArticle = {
'title' : item('img')[0]['alt'],
'url' : 'http://www.rd.com'+item('a')[0]['href'],
'date' : '',
'description' : ''
}
articles.append(newArticle)
return articles
#-------------------------------------------------------------------------------------------------
def parse_rss (self):
# Do the "official" parse_feeds first
feeds = BasicNewsRecipe.parse_feeds(self)
# Loop thru the articles in all feeds to find articles with "recipe" in it
recipeArticles = []
for curfeed in feeds:
delList = []
for a,curarticle in enumerate(curfeed.articles):
if curarticle.title.upper().find('RECIPE') >= 0:
recipeArticles.append(curarticle)
delList.append(curarticle)
if len(delList)>0:
for d in delList:
index = curfeed.articles.index(d)
curfeed.articles[index:index+1] = []
# If there are any recipes found, create a new Feed object and append.
if len(recipeArticles) > 0:
pfeed = Feed()
pfeed.title = 'Recipes'
pfeed.descrition = 'Recipe Feed (Virtual)'
pfeed.image_url = None
pfeed.oldest_article = 30
pfeed.id_counter = len(recipeArticles)
# Create a new Feed, add the recipe articles, and then append
# to "official" list of feeds
pfeed.articles = recipeArticles[:]
feeds.append(pfeed)
return feeds

View File

@ -33,7 +33,7 @@ class StrategyBusinessRecipe(BasicNewsRecipe):
elif c.name.endswith('_password'):
br[c.name] = self.password
raw = br.submit().read()
if '>Logout' not in raw:
if 'You have been logged in' not in raw:
raise ValueError('Failed to login, check your username and password')
return br

View File

@ -628,8 +628,9 @@ from calibre.ebooks.metadata.sources.amazon import Amazon
from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
from calibre.ebooks.metadata.sources.isbndb import ISBNDB
from calibre.ebooks.metadata.sources.overdrive import OverDrive
from calibre.ebooks.metadata.sources.douban import Douban
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive]
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban]
# }}}

View File

@ -253,7 +253,7 @@ class OutputProfile(Plugin):
periodical_date_in_title = True
#: Characters used in jackets and catalogs
missing_char = u'x'
missing_char = u'x'
ratings_char = u'*'
empty_ratings_char = u' '
read_char = u'+'
@ -293,38 +293,38 @@ class iPadOutput(OutputProfile):
}
]
missing_char = u'\u2715\u200a' # stylized 'x' plus hair space
ratings_char = u'\u2605' # filled star
empty_ratings_char = u'\u2606' # hollow star
read_char = u'\u2713' # check mark
missing_char = u'\u2715\u200a' # stylized 'x' plus hair space
ratings_char = u'\u2605' # filled star
empty_ratings_char = u'\u2606' # hollow star
read_char = u'\u2713' # check mark
touchscreen = True
# touchscreen_news_css {{{
touchscreen_news_css = u'''
/* hr used in articles */
.article_articles_list {
/* hr used in articles */
.article_articles_list {
width:18%;
}
}
.article_link {
color: #593f29;
color: #593f29;
font-style: italic;
}
.article_next {
-webkit-border-top-right-radius:4px;
-webkit-border-bottom-right-radius:4px;
-webkit-border-top-right-radius:4px;
-webkit-border-bottom-right-radius:4px;
font-style: italic;
width:32%;
}
.article_prev {
-webkit-border-top-left-radius:4px;
-webkit-border-bottom-left-radius:4px;
-webkit-border-top-left-radius:4px;
-webkit-border-bottom-left-radius:4px;
font-style: italic;
width:32%;
}
.article_sections_list {
.article_sections_list {
width:18%;
}
}
.articles_link {
font-weight: bold;
}
@ -334,8 +334,8 @@ class iPadOutput(OutputProfile):
.caption_divider {
border:#ccc 1px solid;
}
border:#ccc 1px solid;
}
.touchscreen_navbar {
background:#c3bab2;
@ -357,50 +357,50 @@ class iPadOutput(OutputProfile):
text-align:center;
}
.touchscreen_navbar td a:link {
color: #593f29;
text-decoration: none;
}
.touchscreen_navbar td a:link {
color: #593f29;
text-decoration: none;
}
/* Index formatting */
.publish_date {
text-align:center;
}
.divider {
border-bottom:1em solid white;
border-top:1px solid gray;
}
/* Index formatting */
.publish_date {
text-align:center;
}
.divider {
border-bottom:1em solid white;
border-top:1px solid gray;
}
hr.caption_divider {
border-color:black;
border-style:solid;
border-width:1px;
}
hr.caption_divider {
border-color:black;
border-style:solid;
border-width:1px;
}
/* Feed summary formatting */
.article_summary {
display:inline-block;
}
display:inline-block;
}
.feed {
font-family:sans-serif;
font-weight:bold;
font-size:larger;
}
}
.feed_link {
font-style: italic;
}
.feed_next {
-webkit-border-top-right-radius:4px;
-webkit-border-bottom-right-radius:4px;
-webkit-border-top-right-radius:4px;
-webkit-border-bottom-right-radius:4px;
font-style: italic;
width:40%;
}
.feed_prev {
-webkit-border-top-left-radius:4px;
-webkit-border-bottom-left-radius:4px;
-webkit-border-top-left-radius:4px;
-webkit-border-bottom-left-radius:4px;
font-style: italic;
width:40%;
}
@ -410,24 +410,24 @@ class iPadOutput(OutputProfile):
font-size: 160%;
}
.feed_up {
.feed_up {
font-weight: bold;
width:20%;
}
}
.summary_headline {
font-weight:bold;
text-align:left;
}
}
.summary_byline {
text-align:left;
font-family:monospace;
}
}
.summary_text {
text-align:left;
}
}
'''
# }}}
@ -617,8 +617,8 @@ class KindleOutput(OutputProfile):
supports_mobi_indexing = True
periodical_date_in_title = False
missing_char = u'x\u2009'
empty_ratings_char = u'\u2606'
missing_char = u'x\u2009'
empty_ratings_char = u'\u2606'
ratings_char = u'\u2605'
read_char = u'\u2713'
@ -642,8 +642,8 @@ class KindleDXOutput(OutputProfile):
#comic_screen_size = (741, 1022)
supports_mobi_indexing = True
periodical_date_in_title = False
missing_char = u'x\u2009'
empty_ratings_char = u'\u2606'
missing_char = u'x\u2009'
empty_ratings_char = u'\u2606'
ratings_char = u'\u2605'
read_char = u'\u2713'
mobi_ems_per_blockquote = 2.0

View File

@ -92,7 +92,7 @@ def restore_plugin_state_to_default(plugin_or_name):
config['enabled_plugins'] = ep
default_disabled_plugins = set([
'Overdrive',
'Overdrive', 'Douban Books',
])
def is_disabled(plugin):

View File

@ -103,10 +103,11 @@ class EPUBInput(InputFormatPlugin):
t.set('href', guide_cover)
t.set('title', 'Title Page')
from calibre.ebooks import render_html_svg_workaround
renderer = render_html_svg_workaround(guide_cover, log)
if renderer is not None:
open('calibre_raster_cover.jpg', 'wb').write(
renderer)
if os.path.exists(guide_cover):
renderer = render_html_svg_workaround(guide_cover, log)
if renderer is not None:
open('calibre_raster_cover.jpg', 'wb').write(
renderer)
def find_opf(self):
def attr(n, attr):

View File

@ -280,7 +280,7 @@ class Worker(Thread): # Get details {{{
class Amazon(Source):
name = 'Amazon.com'
description = _('Downloads metadata from Amazon')
description = _('Downloads metadata and covers from Amazon')
capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'identifier:amazon',

View File

@ -0,0 +1,347 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>; 2011, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en'
import time
from urllib import urlencode
from functools import partial
from Queue import Queue, Empty
from lxml import etree
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.date import parse_date, utcnow
from calibre.utils.cleantext import clean_ascii_chars
from calibre import as_unicode
# Prefix -> namespace URI map; the prefixes are the ones used by the XPath
# expressions compiled below.
NAMESPACES = {
'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
'atom' : 'http://www.w3.org/2005/Atom',
'db': 'http://www.douban.com/xmlns/',
'gd': 'http://schemas.google.com/g/2005'
}
# etree.XPath pre-bound to NAMESPACES, so every expression below can use the
# prefixes declared above.
XPath = partial(etree.XPath, namespaces=NAMESPACES)
# Feed-level paging elements ('//' paths match anywhere in the document).
total_results = XPath('//openSearch:totalResults')
start_index = XPath('//openSearch:startIndex')
items_per_page = XPath('//openSearch:itemsPerPage')
# Every atom:entry element in the document.
entry = XPath('//atom:entry')
# Lookups relative to the element they are called with ('descendant::' paths).
entry_id = XPath('descendant::atom:id')
title = XPath('descendant::atom:title')
description = XPath('descendant::atom:summary')
# db:attribute elements, distinguished by their 'name' attribute.
publisher = XPath("descendant::db:attribute[@name='publisher']")
isbn = XPath("descendant::db:attribute[@name='isbn13']")
date = XPath("descendant::db:attribute[@name='pubdate']")
creator = XPath("descendant::db:attribute[@name='author']")
# The next three select attribute values rather than elements; to_metadata
# uses their results directly as strings.
booktag = XPath("descendant::db:tag/attribute::name")
rating = XPath("descendant::gd:rating/attribute::average")
cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
def get_details(browser, url, timeout, retry_delay=2):  # {{{
    '''
    Fetch ``url`` with ``browser`` and return the raw response body.

    If the first attempt fails with an exception whose ``getcode()`` returns
    403 (Douban throttling us), wait ``retry_delay`` seconds and try exactly
    once more. Any other failure, and a failure of the second attempt, is
    propagated to the caller.

    :param browser: object providing ``open_novisit(url, timeout=...)``
    :param url: URL to fetch
    :param timeout: timeout passed through to ``open_novisit``
    :param retry_delay: seconds to sleep before the single retry (default 2,
        the value that used to be hard-coded)
    :return: whatever ``.read()`` on the response returns
    '''
    try:
        raw = browser.open_novisit(url, timeout=timeout).read()
    except Exception as e:
        # Exceptions without a getcode() method are treated as "not a 403"
        gc = getattr(e, 'getcode', lambda: -1)
        if gc() != 403:
            raise
        # Douban is throttling us, wait a little
        time.sleep(retry_delay)
        raw = browser.open_novisit(url, timeout=timeout).read()
    return raw
# }}}
def to_metadata(browser, log, entry_, timeout):  # {{{
    '''
    Build a Metadata object from one atom:entry element of a Douban feed.

    The title, authors and Douban id are read from ``entry_`` itself. The
    entry's id URL is then fetched with :func:`get_details` to fill in
    comments, publisher, ISBNs, tags, publication date, rating and cover URL.

    :param browser: browser object passed through to ``get_details``
    :param log: logger used to report non-fatal parsing problems
    :param entry_: the atom:entry element to convert
    :param timeout: network timeout passed through to ``get_details``
    :return: ``None`` if the entry has no id or no title; otherwise a
        Metadata object. If fetching or parsing the detail document fails,
        the object carries only title, authors and the douban identifier.
    '''

    def get_text(extra, x):
        # Stripped text of the first node matched by the XPath ``x``, or None
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except Exception:
            log.exception('Programming error:')
        return None

    id_nodes = entry_id(entry_)
    id_url = id_nodes[0].text if id_nodes else None
    # Skip title nodes without text so the join cannot fail on None
    title_ = ': '.join([x.text for x in title(entry_) if x.text]).strip()
    # Validate *before* using id_url, and test the extracted title string
    # (title_), not the module-level ``title`` XPath object, which is always
    # truthy.
    if not id_url or not title_:
        # Silently discard this entry
        return None
    douban_id = id_url.split('/')[-1]
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except Exception:
        log.exception('Failed to get additional details for', mi.title)
        return mi
    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = [x for x in (t.text for t in isbn(extra)) if check_isbn(x)]
    if isbns:
        # Prefer the longest valid ISBN found
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            # A single tag attribute may hold several '/'-separated tags
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except Exception:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r'%pubdate)

    # Ratings
    ratings = rating(extra)
    if ratings:
        try:
            # The feed's average is halved before being stored
            mi.rating = float(ratings[0]) / 2.0
        except Exception:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u
    return mi
# }}}
class Douban(Source):
    '''
    Metadata source plugin that identifies books and downloads covers using
    the Douban.com book API.
    '''

    name = 'Douban Books'
    author = 'Li Fanxi'
    version = (2, 0, 0)

    description = _('Downloads metadata and covers from Douban.com')

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset(['title', 'authors', 'tags',
        'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating',
        'identifier:douban']) # language currently disabled
    supports_gzip_transfer_encoding = True
    cached_cover_url_is_reliable = True

    DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
    DOUBAN_BOOK_URL = 'http://book.douban.com/subject/%s/'

    def get_book_url(self, identifiers): # {{{
        '''
        Return ``('douban', id, url)`` for the book page of the douban id in
        ``identifiers``, or ``None`` when no douban id is present.
        '''
        db = identifiers.get('douban', None)
        if db is not None:
            return ('douban', db, self.DOUBAN_BOOK_URL%db)
        return None
    # }}}

    def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
        '''
        Build the API URL used by :meth:`identify`.

        A valid ISBN is preferred, then a douban subject id, then a free text
        search built from the title tokens and the first author's tokens.

        :return: the query URL, or ``None`` if there is nothing to search for
        '''
        SEARCH_URL = 'http://api.douban.com/book/subjects?'
        ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
        SUBJECT_URL = 'http://api.douban.com/book/subject/'

        q = ''
        t = None
        isbn = check_isbn(identifiers.get('isbn', None))
        subject = identifiers.get('douban', None)
        if isbn is not None:
            q = isbn
            t = 'isbn'
        elif subject is not None:
            q = subject
            t = 'subject'
        elif title or authors:
            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                q += ' '.join(title_tokens)
            author_tokens = self.get_author_tokens(authors,
                    only_first_author=True)
            if author_tokens:
                q += (' ' if q != '' else '') + ' '.join(author_tokens)
            t = 'search'
        q = q.strip()
        if isinstance(q, unicode):
            q = q.encode('utf-8')
        if not q:
            return None

        if t == 'isbn':
            url = ISBN_URL + q
        elif t == 'subject':
            url = SUBJECT_URL + q
        else:
            url = SEARCH_URL + urlencode({
                'q': q,
                })
        if self.DOUBAN_API_KEY:
            # The search URL already carries a query string ("?q=..."), so
            # the key must be appended with '&' there; the isbn/subject URLs
            # have no query string yet and need '?'.
            sep = '&' if '?' in url else '?'
            url = url + sep + 'apikey=' + self.DOUBAN_API_KEY
        return url
    # }}}

    def download_cover(self, log, result_queue, abort, # {{{
            title=None, authors=None, identifiers={}, timeout=30):
        '''
        Download the cover for the described book and put ``(self, data)``
        on ``result_queue``.

        If no cover URL is cached for ``identifiers``, :meth:`identify` is
        run first and the cached cover URL of the best ranked result that has
        one is used. Nothing is put on the queue when no cover URL is found,
        when ``abort`` is set, or when the download fails (the failure is
        logged).
        '''
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            # Honour the caller's timeout for the identify step as well
            self.identify(log, rq, abort, title=title, authors=authors,
                    identifiers=identifiers, timeout=timeout)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            if cdata:
                result_queue.put((self, cdata))
        except Exception:
            log.exception('Failed to download cover from:', cached_url)
    # }}}

    def get_cached_cover_url(self, identifiers): # {{{
        '''
        Return the cached cover URL for the book, or ``None``.

        The douban id is taken from ``identifiers`` directly or, failing
        that, looked up in the cache from the ISBN.
        '''
        url = None
        db = identifiers.get('douban', None)
        if db is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                db = self.cached_isbn_to_identifier(isbn)
        if db is not None:
            url = self.cached_identifier_to_cover_url(db)
        return url
    # }}}

    def get_all_details(self, br, log, entries, abort, # {{{
            result_queue, timeout):
        '''
        Convert every feed entry to Metadata, one after the other, and put
        the results on ``result_queue``.

        The position of an entry in ``entries`` becomes its
        ``source_relevance``. ISBN -> douban id and douban id -> cover URL
        mappings are cached along the way. A failure on one entry is logged
        and does not stop the others; processing stops once ``abort`` is set.
        '''
        for relevance, i in enumerate(entries):
            try:
                ans = to_metadata(br, log, i, timeout)
                if isinstance(ans, Metadata):
                    ans.source_relevance = relevance
                    db = ans.identifiers['douban']
                    for isbn in getattr(ans, 'all_isbns', []):
                        self.cache_isbn_to_identifier(isbn, db)
                    if ans.has_douban_cover:
                        self.cache_identifier_to_cover_url(db,
                                ans.has_douban_cover)
                    self.clean_downloaded_metadata(ans)
                    result_queue.put(ans)
            except Exception:
                log.exception(
                    'Failed to get metadata for identify entry:',
                    etree.tostring(i))
            if abort.is_set():
                break
    # }}}

    def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
            identifiers={}, timeout=30):
        '''
        Query Douban and put a Metadata object on ``result_queue`` for every
        matching entry.

        If a query that used identifiers returns no entries and both title
        and authors are available, the query is repeated without the
        identifiers.

        :return: ``None`` on success (or when there is not enough metadata to
            build a query), otherwise a unicode error message
        '''
        query = self.create_query(log, title=title, authors=authors,
                identifiers=identifiers)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception('Failed to make identify query: %r'%query)
            return as_unicode(e)
        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
                strip_encoding_pats=True)[0], parser=parser)
            entries = entry(feed)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)
        if not entries and identifiers and title and authors and \
                not abort.is_set():
            return self.identify(log, result_queue, abort, title=title,
                    authors=authors, timeout=timeout)

        # There is no point running these queries in threads as douban
        # throttles requests returning 403 Forbidden errors
        self.get_all_details(br, log, entries, abort, result_queue, timeout)

        return None
    # }}}
if __name__ == '__main__': # tests {{{
    # To run these tests use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            title_test, authors_test)

    # Query with ISBN, title and authors: exact title match and the author
    # are both checked.
    isbn_case = (
        {'identifiers':{'isbn': '9787536692930'}, 'title':'三体',
            'authors':['刘慈欣']},
        [title_test('三体', exact=True),
            authors_test(['刘慈欣'])]
    )

    # Query with title and authors only: a non-exact title match is checked.
    search_case = (
        {'title': 'Linux内核修炼之道', 'authors':['任桥伟']},
        [title_test('Linux内核修炼之道', exact=False)]
    )

    test_identify_plugin(Douban.name, [isbn_case, search_case])
# }}}

View File

@ -157,7 +157,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
class GoogleBooks(Source):
name = 'Google'
description = _('Downloads metadata from Google Books')
description = _('Downloads metadata and covers from Google Books')
capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',

View File

@ -382,7 +382,7 @@ def identify(log, abort, # {{{
if key not in filter_results:
filtered_results.append(r)
filter_results.add(key)
presults = filtered_results
results[plugin] = presults = filtered_results
plog = logs[plugin].getvalue().strip()
log('\n'+'*'*30, plugin.name, '*'*30)

View File

@ -30,7 +30,7 @@ base_url = 'http://search.overdrive.com/'
class OverDrive(Source):
name = 'Overdrive'
description = _('Downloads metadata from Overdrive\'s Content Reserve')
description = _('Downloads metadata and covers from Overdrive\'s Content Reserve')
capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',

View File

@ -12,7 +12,7 @@ A Humane Web Text Generator
#__date__ = '2009/12/04'
__copyright__ = """
Copyright (c) 2011, Leigh Parry
Copyright (c) 2011, Leigh Parry <leighparry@blueyonder.co.uk>
Copyright (c) 2011, John Schember <john@nachtimwald.com>
Copyright (c) 2009, Jason Samsa, http://jsamsa.com/
Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/
@ -219,14 +219,13 @@ class Textile(object):
]
glyph_defaults = [
(re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2&#215;\3'), # dimension sign
(re.compile(r'(\d+)\'', re.I), r'\1&#8242;'), # prime
(re.compile(r'(\d+)\"', re.I), r'\1&#8243;'), # prime-double
(re.compile(r'(\d+)\'(\s)', re.I), r'\1&#8242;\2'), # prime
(re.compile(r'(\d+)\"(\s)', re.I), r'\1&#8243;\2'), # prime-double
(re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'<acronym title="\2">\1</acronym>'), # 3+ uppercase acronym
(re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'<span class="caps">\1</span>'), # 3+ uppercase
(re.compile(r'\b(\s{0,1})?\.{3}'), r'\1&#8230;'), # ellipsis
(re.compile(r'^[\*_-]{3,}$', re.M), r'<hr />'), # <hr> scene-break
(re.compile(r'\b--\b'), r'&#8212;'), # em dash
(re.compile(r'(\s)--(\s)'), r'\1&#8212;\2'), # em dash
(re.compile(r'(^|[^-])--([^-]|$)'), r'\1&#8212;\2'), # em dash
(re.compile(r'\s-(?:\s|$)'), r' &#8211; '), # en dash
(re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1&#8482;'), # trademark
(re.compile(r'\b( ?)[([]R[])]', re.I), r'\1&#174;'), # registered
@ -706,6 +705,21 @@ class Textile(object):
result.append(line)
return ''.join(result)
def macros_only(self, text):
    """
    Apply only the macro substitutions (self.macro_defaults) to text.

    The text is split on '<...>' spans. Pieces that look like markup are
    passed through untouched; the remaining pieces get the macro rules
    applied, but only if they contain a '{...}' group.
    """
    # fix: hackish
    text = re.sub(r'"\Z', '\" ', text)

    splitter = re.compile(r'(<.*?>)', re.U)
    pieces = []
    for chunk in splitter.split(text):
        looks_like_markup = re.search(r'<.*>', chunk) is not None
        if not looks_like_markup and re.search(r'{.+?}', chunk):
            for pattern, replacement in self.macro_defaults:
                chunk = pattern.sub(replacement, chunk)
        pieces.append(chunk)
    return ''.join(pieces)
def vAlign(self, input):
    """Map a vertical-alignment marker ('^', '-', '~') to its name, or ''."""
    names = {'^':'top', '-':'middle', '~':'bottom'}
    if input in names:
        return names[input]
    return ''
@ -814,6 +828,7 @@ class Textile(object):
'fooobar ... and hello world ...'
"""
text = self.macros_only(text)
punct = '!"#$%&\'*+,-./:;=?@\\^_`|~'
pattern = r'''
@ -1044,4 +1059,3 @@ def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'):
return Textile(restricted=True, lite=lite,
noimage=noimage).textile(text, rel='nofollow',
html_type=html_type)

View File

@ -66,19 +66,26 @@ class TXTOutput(OutputFormatPlugin):
help=_('Do not remove image references within the document. This is only ' \
'useful when paired with a txt-output-formatting option that '
'is not none because links are always removed with plain text output.')),
OptionRecommendation(name='keep_color',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Do not remove font color from output. This is only useful when ' \
'txt-output-formatting is set to textile. Textile is the only ' \
'formatting that supports setting font color. If this option is ' \
'not specified font color will not be set and default to the ' \
'color displayed by the reader (generally this is black).')),
])
def convert(self, oeb_book, output_path, input_plugin, opts, log):
if opts.txt_output_formatting.lower() == 'markdown':
from calibre.ebooks.txt.markdownml import MarkdownMLizer
writer = MarkdownMLizer(log)
self.writer = MarkdownMLizer(log)
elif opts.txt_output_formatting.lower() == 'textile':
from calibre.ebooks.txt.textileml import TextileMLizer
writer = TextileMLizer(log)
self.writer = TextileMLizer(log)
else:
writer = TXTMLizer(log)
self.writer = TXTMLizer(log)
txt = writer.extract_content(oeb_book, opts)
txt = self.writer.extract_content(oeb_book, opts)
txt = clean_ascii_chars(txt)
log.debug('\tReplacing newlines with selected type...')
@ -111,17 +118,28 @@ class TXTZOutput(TXTOutput):
from calibre.ebooks.oeb.base import OEB_IMAGES
with TemporaryDirectory('_txtz_output') as tdir:
# TXT
with TemporaryFile('index.txt') as tf:
txt_name = 'index.txt'
if opts.txt_output_formatting.lower() == 'textile':
txt_name = 'index.text'
with TemporaryFile(txt_name) as tf:
TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
shutil.copy(tf, os.path.join(tdir, 'index.txt'))
shutil.copy(tf, os.path.join(tdir, txt_name))
# Images
for item in oeb_book.manifest:
if item.media_type in OEB_IMAGES:
path = os.path.join(tdir, os.path.dirname(item.href))
if hasattr(self.writer, 'images'):
path = os.path.join(tdir, 'images')
if item.href in self.writer.images:
href = self.writer.images[item.href]
else:
continue
else:
path = os.path.join(tdir, os.path.dirname(item.href))
href = os.path.basename(item.href)
if not os.path.exists(path):
os.makedirs(path)
with open(os.path.join(tdir, item.href), 'wb') as imgf:
with open(os.path.join(path, href), 'wb') as imgf:
imgf.write(item.data)
# Metadata

View File

@ -242,6 +242,8 @@ def detect_formatting_type(txt):
textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
# Links
textile_count += len(re.findall(r'"[^"]*":\S+', txt))
# paragraph blocks
textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt))
# Decide if either markdown or textile is used in the text
# based on the number of unique formatting elements found.

View File

@ -1,62 +1,489 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
__docformat__ = 'restructuredtext en'
'''
Transform OEB content into Textile formatted plain text
'''
import re
from lxml import etree
from functools import partial
from calibre.ebooks.oeb.base import XHTML
from calibre.utils.html2textile import html2textile
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks import unit_convert
from calibre.ebooks.txt.unsmarten import unsmarten
class TextileMLizer(object):
def __init__(self, log):
self.log = log
class TextileMLizer(OEB2HTML):
def extract_content(self, oeb_book, opts):
self.log.info('Converting XHTML to Textile formatted TXT...')
self.oeb_book = oeb_book
self.opts = opts
self.in_pre = False
self.in_table = False
self.links = {}
self.list = []
self.our_links = []
self.in_a_link = False
self.our_ids = []
self.images = {}
self.id_no_text = u''
self.style_embed = []
self.remove_space_after_newline = False
self.base_hrefs = [item.href for item in oeb_book.spine]
self.map_resources(oeb_book)
return self.mlize_spine()
self.style_bold = False
self.style_italic = False
self.style_under = False
self.style_strike = False
self.style_smallcap = False
def mlize_spine(self):
txt = self.mlize_spine(oeb_book)
txt = unsmarten(txt)
# Do some tidying up
txt = self.tidy_up(txt)
return txt
def mlize_spine(self, oeb_book):
output = [u'']
for item in self.oeb_book.spine:
for item in oeb_book.spine:
self.log.debug('Converting %s to Textile formatted TXT...' % item.href)
self.rewrite_ids(item.data, item)
rewrite_links(item.data, partial(self.rewrite_link, page=item))
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile)
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
output.append('\n\n')
return ''.join(output)
html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
def tidy_up(self, text):
# May need tweaking and finetuning
def check_escaping(text, tests):
for t in tests:
# I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged
txt = '%s' % t
if txt != '%':
text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text)
text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text)
text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+')\](\s|[*_\'"?!,.])', r'\1\2\3', text)
return text
if not self.opts.keep_links:
html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
if not self.opts.keep_image_references:
html = re.sub(r'<\s*img[^>]*>', '', html)
# Now tidy up links and ids - remove ones that don't have a corresponding opposite
if self.opts.keep_links:
for i in self.our_links:
if i[0] == '#':
if i not in self.our_ids:
text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text)
for i in self.our_ids:
if i not in self.our_links:
text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
# Remove obvious non-needed escaping, add sub/sup-script ones
text = check_escaping(text, ['\*', '_', '\*'])
# escape the super/sub-scripts if needed
text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text)
# escape the super/sub-scripts if needed
text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text)
text = html2textile(html)
#remove empty spans
text = re.sub(r'%\xa0+', r'%', text)
#remove empty spans - MAY MERGE SOME ?
text = re.sub(r'%%', r'', text)
#remove spans from tagged output
text = re.sub(r'%([_+*-]+)%', r'\1', text)
#remove spaces before a newline
text = re.sub(r' +\n', r'\n', text)
#remove newlines at top of file
text = re.sub(r'^\n+', r'', text)
#correct blockcode paras
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
#correct blockquote paras
text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text)
# Ensure the section ends with at least two new line characters.
# This is to prevent the last paragraph from a section being
# combined into the first paragraph of the next.
end_chars = text[-4:]
# Convert all newlines to \n
end_chars = end_chars.replace('\r\n', '\n')
end_chars = end_chars.replace('\r', '\n')
end_chars = end_chars[-2:]
if not end_chars[1] == '\n':
text += '\n\n'
if end_chars[1] == '\n' and not end_chars[0] == '\n':
text += '\n'
#reduce blank lines
text = re.sub(r'\n{3}', r'\n\np. \n\n', text)
text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text)
#Check span following blank para
text = re.sub(r'\n+ +%', r' %', text)
text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text)
# blank paragraph
text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text)
# blank paragraph
text = re.sub(u'\n\xa0', r'\np. ', text)
# blank paragraph
text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text)
text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
#sort out spaces in tables
text = re.sub(r' {2,}\|', r' |', text)
output += text
# Now put back spaces removed earlier as they're needed here
text = re.sub(r'\np\.\n', r'\np. \n', text)
#reduce blank lines
text = re.sub(r' \n\n\n', r' \n\n', text)
output = u''.join(output)
return text
return output
def remove_newlines(self, text):
    '''
    Flatten text onto a single line.

    Every newline sequence becomes a space, runs of spaces are condensed to
    one space and tabs are removed. If self.remove_space_after_newline is
    set, leading spaces are stripped as well and the flag is reset to False.
    '''
    # \r\n is listed first so a Windows line ending yields one space, not two
    text = re.sub(r'\r\n|\n|\r', ' ', text)
    # Condense redundant spaces created by replacing newlines with spaces.
    text = re.sub(r'[ ]{2,}', ' ', text)
    text = re.sub(r'\t+', '', text)
    if self.remove_space_after_newline:
        text = re.sub(r'^ +', '', text)
        self.remove_space_after_newline = False
    return text
def check_styles(self, style):
    '''
    Return a '{...}' style group with the colour declarations of style.

    Colours are only emitted when the keep_color option is set; a colour of
    'black' is left out. Returns '' when there is nothing to emit.
    '''
    declarations = []
    if self.opts.keep_color:
        if 'color' in style.cssdict() and style['color'] != 'black':
            declarations.append('color:'+style['color']+';')
        if 'background' in style.cssdict():
            declarations.append('background:'+style['background']+';')
    if not declarations:
        return ''
    return '{' + ''.join(declarations) + '}'
def check_halign(self, style):
    '''
    Return the Textile horizontal alignment marker for @style's
    text-align value, or an empty string for any other value.
    '''
    markers = {'left': '<', 'justify': '<>', 'center': '=', 'right': '>'}
    return markers.get(style['text-align'], '')
def check_valign(self, style):
    '''
    Return the Textile vertical alignment marker for @style's
    vertical-align value ('top' or 'bottom'), or an empty string for any
    other value.
    '''
    # A 'middle' -> '-' entry exists in the original only as commented-out code.
    markers = {'top': '^', 'bottom': '~'}
    return markers.get(style['vertical-align'], '')
def check_padding(self, style, stylizer):
    '''
    Translate left/right margin and padding into Textile indent markers.

    For each side, the non-'auto' padding and margin are converted with
    unit_convert, summed, divided by the profile's fbase and rounded. The
    result is the number of '(' (left) or ')' (right) characters emitted.
    '''
    def _converted(prop):
        # Properties that are absent or set to 'auto' contribute nothing.
        if prop in style.cssdict() and style[prop] != 'auto':
            return unit_convert(style[prop], style.width, style.fontSize, stylizer.profile.dpi)
        return 0

    markup = ''
    for side, marker in (('left', '('), ('right', ')')):
        padding = _converted('padding-' + side)
        margin = _converted('margin-' + side)
        ems = int(round((margin + padding) / stylizer.profile.fbase))
        if ems >= 1:
            markup += marker * ems
    return markup
def check_id_tag(self, attribs):
    '''
    Return the Textile "(#id)" attribute for an element that has an id.

    @attribs: The attribute mapping of the element being converted.

    When an id is present it is also recorded (with a leading '#') in
    self.our_ids, and self.id_no_text is set to a non-breaking space.
    Returns an empty string when there is no id.
    '''
    txt = ''
    # Membership test instead of has_key(): has_key() is deprecated and
    # does not exist on Python 3 mappings.
    if 'id' in attribs:
        ident = attribs['id']
        txt = '(#' + ident + ')'
        self.our_ids.append('#' + ident)
        self.id_no_text = u'\xa0'
    return txt
def build_block(self, tag, style, attribs, stylizer):
    '''
    Assemble the opening of a Textile block: a newline, the block @tag,
    then (in this order) the id attribute when keep_links is enabled, the
    padding markers, the horizontal alignment marker and the style
    attribute. The trailing ". " is left to the caller.
    '''
    pieces = ['\n', tag]
    if self.opts.keep_links:
        pieces.append(self.check_id_tag(attribs))
    pieces.append(self.check_padding(style, stylizer))
    pieces.append(self.check_halign(style))
    pieces.append(self.check_styles(style))
    return ''.join(pieces)
def prepare_string_for_textile(self, txt):
    '''
    Wrap @txt in " ==...== " when it contains one of the characters
    * & _ + - ~ @ % | (or "??") with whitespace on exactly one side of it
    and non-whitespace on the other. Otherwise return @txt unchanged.
    '''
    markup_like = re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt)
    if markup_like is None:
        return txt
    return ' ==%s== ' % txt
def dump_text(self, elem, stylizer):
'''
Convert an element, and recursively its children, into Textile.

@elem: The element in the etree that we are working on.
@stylizer: The style information attached to the element.

Returns a list of text fragments: the opening markup for the element,
its text, the fragments produced by its children, the closing markup
and finally the element's tail text.
'''
# We can only process tags. For anything else return the tail text (when
# the parent is an XHTML element) or an empty fragment.
if not isinstance(elem.tag, basestring) \
or namespace(elem.tag) != XHTML_NS:
p = elem.getparent()
if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
and elem.tail:
return [elem.tail]
return ['']
# Setup our variables.
# text collects the output fragments; tags collects the closing markup,
# which is reversed and emitted after the children have been processed.
text = ['']
style = stylizer.style(elem)
tags = []
tag = barename(elem.tag)
attribs = elem.attrib
# Ignore anything that is set to not be displayed.
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
or style['visibility'] == 'hidden':
return ['']
# Soft scene breaks.
# A top margin larger than one font size becomes blank paragraphs
# (a non-breaking space on its own line), one per extra em.
if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
ems = int(round(float(style.marginTop) / style.fontSize) - 1)
if ems >= 1:
text.append(u'\n\n\xa0' * ems)
# Block level elements; a div is written out as a paragraph.
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
if tag == 'div':
tag = 'p'
text.append(self.build_block(tag, style, attribs, stylizer))
text.append('. ')
tags.append('\n')
# Inline styles. Each style is only opened when it is not already active
# (tracked by the self.style_* flags); the opening marker is remembered in
# self.style_embed.
if style['font-style'] == 'italic' or tag in ('i', 'em'):
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
if self.style_italic == False:
if self.in_a_link:
text.append('_')
tags.append('_')
else:
text.append('[_')
tags.append('_]')
self.style_embed.append('_')
self.style_italic = True
if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
if self.style_bold == False:
if self.in_a_link:
text.append('*')
tags.append('*')
else:
text.append('[*')
tags.append('*]')
self.style_embed.append('*')
self.style_bold = True
if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
if tag != 'a':
if self.style_under == False:
text.append('[+')
tags.append('+]')
self.style_embed.append('+')
self.style_under = True
if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
if self.style_strike == False:
text.append('[-')
tags.append('-]')
self.style_embed.append('-')
self.style_strike = True
# Line break: emit the markers of the currently open inline styles in
# reverse order, then the newline, then the markers again in order.
if tag == 'br':
for i in reversed(self.style_embed):
text.append(i)
text.append('\n')
for i in self.style_embed:
text.append(i)
tags.append('')
self.remove_space_after_newline = True
# Tag specific markup.
if tag == 'blockquote':
text.append('\nbq. ')
tags.append('\n')
elif tag in ('abbr', 'acronym'):
text.append('')
# NOTE(review): attribs['title'] is indexed without a presence check.
txt = attribs['title']
tags.append('(' + txt + ')')
elif tag == 'sup':
text.append('^')
tags.append('^')
elif tag == 'sub':
text.append('~')
tags.append('~')
elif tag == 'code':
if self.in_pre:
text.append('\nbc. ')
tags.append('')
else:
text.append('@')
tags.append('@')
elif tag == 'cite':
text.append('??')
tags.append('??')
elif tag == 'hr':
text.append('\n***')
tags.append('\n')
elif tag == 'pre':
self.in_pre = True
text.append('\npre. ')
tags.append('pre\n')
elif tag == 'a':
if self.opts.keep_links:
if attribs.has_key('href'):
text.append('"')
# 'a' is a sentinel for the closing loop, which clears in_a_link
# when it sees it and emits nothing in its place.
tags.append('a')
tags.append('":' + attribs['href'])
self.our_links.append(attribs['href'])
if attribs.has_key('title'):
tags.append('(' + attribs['title'] + ')')
self.in_a_link = True
else:
text.append('%')
tags.append('%')
elif tag == 'img':
if self.opts.keep_image_references:
txt = '!' + self.check_halign(style)
txt += self.check_valign(style)
# NOTE(review): attribs['src'] is indexed without a presence check.
txt += attribs['src']
text.append(txt)
if attribs.has_key('alt'):
txt = attribs['alt']
if txt != '':
text.append('(' + txt + ')')
tags.append('!')
elif tag in ('ol', 'ul'):
# self.list is the stack of currently open lists.
self.list.append({'name': tag, 'num': 0})
text.append('')
tags.append(tag)
elif tag == 'li':
# The number of list markers equals the current nesting depth.
if self.list: li = self.list[-1]
else: li = {'name': 'ul', 'num': 0}
text.append('\n')
if li['name'] == 'ul':
text.append('*' * len(self.list) + ' ')
elif li['name'] == 'ol':
text.append('#' * len(self.list) + ' ')
tags.append('')
elif tag == 'dl':
text.append('\n')
tags.append('')
elif tag == 'dt':
text.append('')
tags.append('\n')
elif tag == 'dd':
text.append(' ')
tags.append('')
# NOTE(review): this branch repeats the 'dd' test of the branch above in
# the same elif chain, so it can never run - confirm which behaviour
# (leading spaces or trailing newline) is intended.
elif tag == 'dd':
text.append('')
tags.append('\n')
elif tag == 'table':
txt = self.build_block(tag, style, attribs, stylizer)
txt += '. \n'
# A table header line without any attributes is reduced to a newline.
if txt != '\ntable. \n':
text.append(txt)
else:
text.append('\n')
tags.append('')
elif tag == 'tr':
txt = self.build_block('', style, attribs, stylizer)
txt += '. '
if txt != '\n. ':
txt = re.sub ('\n', '', txt)
text.append(txt)
tags.append('|\n')
elif tag == 'td':
text.append('|')
txt = ''
txt += self.check_halign(style)
txt += self.check_valign(style)
if attribs.has_key ('colspan'):
txt += '\\' + attribs['colspan']
if attribs.has_key ('rowspan'):
txt += '/' + attribs['rowspan']
txt += self.check_styles(style)
if txt != '':
text.append(txt + '. ')
tags.append('')
elif tag == 'th':
text.append('|_. ')
tags.append('')
elif tag == 'span':
if style['font-variant'] == 'small-caps':
if self.style_smallcap == False:
text.append('&')
tags.append('&')
self.style_smallcap = True
else:
if self.in_a_link == False:
txt = '%'
if self.opts.keep_links:
txt += self.check_id_tag(attribs)
txt += self.check_styles(style)
# A span that carries no attributes is not written out.
if txt != '%':
text.append(txt)
tags.append('%')
# Emit the id for tags that did not already get one above.
if self.opts.keep_links and attribs.has_key('id'):
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'):
text.append(self.check_id_tag(attribs))
# Process the styles for any that we want to keep
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', \
'span', 'table', 'tr', 'td'):
if not self.in_a_link:
text.append(self.check_styles(style))
# Process tags that contain text.
if hasattr(elem, 'text') and elem.text:
txt = elem.text
if not self.in_pre:
txt = self.prepare_string_for_textile(self.remove_newlines(txt))
text.append(txt)
self.id_no_text = u''
# Recurse down into tags within the tag we are in.
for item in elem:
text += self.dump_text(item, stylizer)
# Close all open tags.
tags.reverse()
for t in tags:
if tag in ('pre', 'ul', 'ol', 'li', 'table'):
if tag == 'pre':
self.in_pre = False
elif tag in ('ul', 'ol'):
if self.list: self.list.pop()
if not self.list: text.append('\n')
else:
if t == 'a':
self.in_a_link = False
t = ''
text.append(self.id_no_text)
self.id_no_text = u''
# Clear the flag of the inline style that is being closed.
if t in ('*]', '*'):
self.style_bold = False
elif t in ('_]', '_'):
self.style_italic = False
elif t == '+]':
self.style_under = False
elif t == '-]':
self.style_strike = False
elif t == '&':
self.style_smallcap = False
if t in ('*]', '_]', '+]', '-]', '*', '_'):
# NOTE(review): the popped marker is stored in txt but txt is not
# read again in this function; only the pop itself has an effect.
txt = self.style_embed.pop()
text.append('%s' % t)
# Soft scene breaks.
if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
if ems >= 1:
text.append(u'\n\n\xa0' * ems)
# Add the text that is outside of the tag.
if hasattr(elem, 'tail') and elem.tail:
tail = elem.tail
if not self.in_pre:
tail = self.prepare_string_for_textile(self.remove_newlines(tail))
text.append(tail)
return text

View File

@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
"""unsmarten : html2textile helper function"""
__version__ = '0.1'
__author__ = 'Leigh Parry'
import re
def unsmarten(txt):
    '''
    Replace typographic characters and their HTML entities in @txt with
    plain ASCII or Textile "{..}" character macros.

    Each substitution handles the numeric entity, the named entity and the
    literal character. The substitutions run in the order listed; the
    apostrophe rule must run before the generic single quote rule.

    Returns the converted text.
    '''
    # The en-dash, apostrophe and single quote patterns spell their literal
    # characters as \u escapes. A pattern that ends in an empty alternative
    # ("...|") matches the empty string at every position and would insert
    # the replacement between all characters of the text.
    txt = re.sub(u'&#8211;|&ndash;|\u2013', r'-', txt)  # en-dash
    txt = re.sub(u'&#8212;|&mdash;|—', r'--', txt)  # em-dash
    txt = re.sub(u'&#8230;|&hellip;|…', r'...', txt)  # ellipsis
    txt = re.sub(u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|“|”|″', r'"', txt)  # double quote
    # A right single quote directly after a quote character or whitespace
    # is treated as an apostrophe.
    txt = re.sub(u'(["\'\u2018\u201c]|\\s)\u2019', r"\1{'/}", txt)  # apostrophe
    txt = re.sub(u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|\u2018|\u2019|\u2032', r"'", txt)  # single quote
    txt = re.sub(u'&#162;|&cent;|¢', r'{c\}', txt)  # cent
    txt = re.sub(u'&#163;|&pound;|£', r'{L-}', txt)  # pound
    txt = re.sub(u'&#165;|&yen;|¥', r'{Y=}', txt)  # yen
    txt = re.sub(u'&#169;|&copy;|©', r'{(c)}', txt)  # copyright
    txt = re.sub(u'&#174;|&reg;|®', r'{(r)}', txt)  # registered
    txt = re.sub(u'&#188;|&frac14;|¼', r'{1/4}', txt)  # quarter
    txt = re.sub(u'&#189;|&frac12;|½', r'{1/2}', txt)  # half
    txt = re.sub(u'&#190;|&frac34;|¾', r'{3/4}', txt)  # three-quarter
    txt = re.sub(u'&#192;|&Agrave;|À', r'{A`}', txt)  # A-grave
    txt = re.sub(u'&#193;|&Aacute;|Á', r"{A'}", txt)  # A-acute
    txt = re.sub(u'&#194;|&Acirc;|Â', r'{A^}', txt)  # A-circumflex
    txt = re.sub(u'&#195;|&Atilde;|Ã', r'{A~}', txt)  # A-tilde
    txt = re.sub(u'&#196;|&Auml;|Ä', r'{A"}', txt)  # A-umlaut
    txt = re.sub(u'&#197;|&Aring;|Å', r'{Ao}', txt)  # A-ring
    txt = re.sub(u'&#198;|&AElig;|Æ', r'{AE}', txt)  # AE
    txt = re.sub(u'&#199;|&Ccedil;|Ç', r'{C,}', txt)  # C-cedilla
    txt = re.sub(u'&#200;|&Egrave;|È', r'{E`}', txt)  # E-grave
    txt = re.sub(u'&#201;|&Eacute;|É', r"{E'}", txt)  # E-acute
    txt = re.sub(u'&#202;|&Ecirc;|Ê', r'{E^}', txt)  # E-circumflex
    txt = re.sub(u'&#203;|&Euml;|Ë', r'{E"}', txt)  # E-umlaut
    txt = re.sub(u'&#204;|&Igrave;|Ì', r'{I`}', txt)  # I-grave
    txt = re.sub(u'&#205;|&Iacute;|Í', r"{I'}", txt)  # I-acute
    txt = re.sub(u'&#206;|&Icirc;|Î', r'{I^}', txt)  # I-circumflex
    txt = re.sub(u'&#207;|&Iuml;|Ï', r'{I"}', txt)  # I-umlaut
    txt = re.sub(u'&#208;|&ETH;|Ð', r'{D-}', txt)  # ETH
    txt = re.sub(u'&#209;|&Ntilde;|Ñ', r'{N~}', txt)  # N-tilde
    txt = re.sub(u'&#210;|&Ograve;|Ò', r'{O`}', txt)  # O-grave
    txt = re.sub(u'&#211;|&Oacute;|Ó', r"{O'}", txt)  # O-acute
    txt = re.sub(u'&#212;|&Ocirc;|Ô', r'{O^}', txt)  # O-circumflex
    txt = re.sub(u'&#213;|&Otilde;|Õ', r'{O~}', txt)  # O-tilde
    txt = re.sub(u'&#214;|&Ouml;|Ö', r'{O"}', txt)  # O-umlaut
    txt = re.sub(u'&#215;|&times;|×', r'{x}', txt)  # dimension
    txt = re.sub(u'&#216;|&Oslash;|Ø', r'{O/}', txt)  # O-slash
    txt = re.sub(u'&#217;|&Ugrave;|Ù', r"{U`}", txt)  # U-grave
    txt = re.sub(u'&#218;|&Uacute;|Ú', r"{U'}", txt)  # U-acute
    txt = re.sub(u'&#219;|&Ucirc;|Û', r'{U^}', txt)  # U-circumflex
    txt = re.sub(u'&#220;|&Uuml;|Ü', r'{U"}', txt)  # U-umlaut
    txt = re.sub(u'&#221;|&Yacute;|Ý', r"{Y'}", txt)  # Y-acute
    txt = re.sub(u'&#223;|&szlig;|ß', r'{sz}', txt)  # sharp-s
    txt = re.sub(u'&#224;|&agrave;|à', r'{a`}', txt)  # a-grave
    txt = re.sub(u'&#225;|&aacute;|á', r"{a'}", txt)  # a-acute
    txt = re.sub(u'&#226;|&acirc;|â', r'{a^}', txt)  # a-circumflex
    txt = re.sub(u'&#227;|&atilde;|ã', r'{a~}', txt)  # a-tilde
    txt = re.sub(u'&#228;|&auml;|ä', r'{a"}', txt)  # a-umlaut
    txt = re.sub(u'&#229;|&aring;|å', r'{ao}', txt)  # a-ring
    txt = re.sub(u'&#230;|&aelig;|æ', r'{ae}', txt)  # ae
    txt = re.sub(u'&#231;|&ccedil;|ç', r'{c,}', txt)  # c-cedilla
    txt = re.sub(u'&#232;|&egrave;|è', r'{e`}', txt)  # e-grave
    txt = re.sub(u'&#233;|&eacute;|é', r"{e'}", txt)  # e-acute
    txt = re.sub(u'&#234;|&ecirc;|ê', r'{e^}', txt)  # e-circumflex
    txt = re.sub(u'&#235;|&euml;|ë', r'{e"}', txt)  # e-umlaut
    txt = re.sub(u'&#236;|&igrave;|ì', r'{i`}', txt)  # i-grave
    txt = re.sub(u'&#237;|&iacute;|í', r"{i'}", txt)  # i-acute
    txt = re.sub(u'&#238;|&icirc;|î', r'{i^}', txt)  # i-circumflex
    txt = re.sub(u'&#239;|&iuml;|ï', r'{i"}', txt)  # i-umlaut
    txt = re.sub(u'&#240;|&eth;|ð', r'{d-}', txt)  # eth
    txt = re.sub(u'&#241;|&ntilde;|ñ', r'{n~}', txt)  # n-tilde
    txt = re.sub(u'&#242;|&ograve;|ò', r'{o`}', txt)  # o-grave
    txt = re.sub(u'&#243;|&oacute;|ó', r"{o'}", txt)  # o-acute
    txt = re.sub(u'&#244;|&ocirc;|ô', r'{o^}', txt)  # o-circumflex
    txt = re.sub(u'&#245;|&otilde;|õ', r'{o~}', txt)  # o-tilde
    txt = re.sub(u'&#246;|&ouml;|ö', r'{o"}', txt)  # o-umlaut
    txt = re.sub(u'&#248;|&oslash;|ø', r'{o/}', txt)  # o-stroke
    txt = re.sub(u'&#249;|&ugrave;|ù', r'{u`}', txt)  # u-grave
    txt = re.sub(u'&#250;|&uacute;|ú', r"{u'}", txt)  # u-acute
    txt = re.sub(u'&#251;|&ucirc;|û', r'{u^}', txt)  # u-circumflex
    txt = re.sub(u'&#252;|&uuml;|ü', r'{u"}', txt)  # u-umlaut
    txt = re.sub(u'&#253;|&yacute;|ý', r"{y'}", txt)  # y-acute
    txt = re.sub(u'&#255;|&yuml;|ÿ', r'{y"}', txt)  # y-umlaut
    txt = re.sub(u'&#338;|&OElig;|Œ', r'{OE}', txt)  # OE
    txt = re.sub(u'&#339;|&oelig;|œ', r'{oe}', txt)  # oe
    # NOTE(review): the next two rules pair the S-circumflex code point and
    # literal with the &Scaron;/&scaron; entity names - confirm which
    # character is intended.
    txt = re.sub(u'&#348;|&Scaron;|Ŝ', r'{S^}', txt)  # Scaron
    txt = re.sub(u'&#349;|&scaron;|ŝ', r'{s^}', txt)  # scaron
    txt = re.sub(u'&#8226;|&bull;|•', r'{*}', txt)  # bullet
    txt = re.sub(u'&#8355;|₣', r'{Fr}', txt)  # Franc
    txt = re.sub(u'&#8356;|₤', r'{L=}', txt)  # Lira
    txt = re.sub(u'&#8360;|₨', r'{Rs}', txt)  # Rupee
    txt = re.sub(u'&#8364;|&euro;|€', r'{C=}', txt)  # euro
    txt = re.sub(u'&#8482;|&trade;|™', r'{tm}', txt)  # trademark
    txt = re.sub(u'&#9824;|&spades;|♠', r'{spade}', txt)  # spade
    txt = re.sub(u'&#9827;|&clubs;|♣', r'{club}', txt)  # club
    txt = re.sub(u'&#9829;|&hearts;|♥', r'{heart}', txt)  # heart
    txt = re.sub(u'&#9830;|&diams;|♦', r'{diamond}', txt)  # diamond
    # Move into main code?
    # txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph
    # txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph
    # txt = re.sub(u'\n \n', r'\n<br />\n', txt) # blank paragraph - br tag
    return txt

View File

@ -620,7 +620,11 @@ class Application(QApplication):
self.original_font = QFont(QApplication.font())
fi = gprefs['font']
if fi is not None:
QApplication.setFont(QFont(*fi))
font = QFont(*(fi[:4]))
s = gprefs.get('font_stretch', None)
if s is not None:
font.setStretch(s)
QApplication.setFont(font)
def _send_file_open_events(self):
with self._file_open_lock:

View File

@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form):
Widget.__init__(self, parent,
['newline', 'max_line_length', 'force_max_line_length',
'inline_toc', 'txt_output_formatting', 'keep_links', 'keep_image_references',
'txt_output_encoding'])
'keep_color', 'txt_output_encoding'])
self.db, self.book_id = db, book_id
for x in get_option('newline').option.choices:
self.opt_newline.addItem(x)

View File

@ -122,6 +122,13 @@
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="opt_keep_color">
<property name="text">
<string>Keep text color, when possible</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>

View File

@ -161,7 +161,11 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
def initialize(self):
ConfigWidgetBase.initialize(self)
self.current_font = self.initial_font = gprefs['font']
font = gprefs['font']
if font is not None:
font = list(font)
font.append(gprefs.get('font_stretch', QFont.Unstretched))
self.current_font = self.initial_font = font
self.update_font_display()
self.display_model.initialize()
@ -178,7 +182,8 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
def build_font_obj(self):
    '''
    Build the QFont described by self.current_font.

    self.current_font holds family, point size, weight, italic and
    stretch, in that order. Only the first four are QFont constructor
    arguments; the stretch is applied afterwards with setStretch().
    Falls back to the application's original font when no font is set.
    '''
    font_info = self.current_font
    if font_info is not None:
        # Do not pass the whole list to QFont: the fifth entry (stretch)
        # is not a constructor argument.
        font = QFont(*(font_info[:4]))
        font.setStretch(font_info[4])
    else:
        font = qt_app.original_font
    return font
@ -215,15 +220,18 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
if fd.exec_() == fd.Accepted:
font = fd.selectedFont()
fi = QFontInfo(font)
self.current_font = (unicode(fi.family()), fi.pointSize(),
fi.weight(), fi.italic())
self.current_font = [unicode(fi.family()), fi.pointSize(),
fi.weight(), fi.italic(), font.stretch()]
self.update_font_display()
self.changed_signal.emit()
def commit(self, *args):
rr = ConfigWidgetBase.commit(self, *args)
if self.current_font != self.initial_font:
gprefs['font'] = self.current_font
gprefs['font'] = (self.current_font[:4] if self.current_font else
None)
gprefs['font_stretch'] = (self.current_font[4] if self.current_font
is not None else QFont.Unstretched)
QApplication.setFont(self.font_display.font())
rr = True
self.display_model.commit()

View File

@ -71,9 +71,10 @@ class SourcesModel(QAbstractTableModel): # {{{
plugin.is_configured()):
return QIcon(I('list_remove.png'))
elif role == Qt.ToolTipRole:
base = plugin.description + '\n\n'
if plugin.is_configured():
return _('This source is configured and ready to go')
return _('This source needs configuration')
return base + _('This source is configured and ready to go')
return base + _('This source needs configuration')
return NONE
def setData(self, index, val, role):

View File

@ -29,7 +29,7 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin):
detail_item = self.url + detail_item
if external or self.config.get('open_external', False):
open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url)))
open_url(QUrl(url_slash_cleaner(detail_item)))
else:
d = WebStoreDialog(self.gui, self.url, parent, detail_item)
d.setWindowTitle(self.name)
@ -38,9 +38,9 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin):
def search(self, query, max_results=10, timeout=60):
url = 'http://www.wizardstowerbooks.com/search.html?for=' + urllib.quote(query)
br = browser()
counter = max_results
with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())
@ -60,13 +60,13 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin):
price = price.strip()
if not price:
continue
title = ''.join(data.xpath('.//span[@class="prti"]/a/b/text()'))
author = ''.join(data.xpath('.//p[@class="last"]/text()'))
a, b, author = author.partition(' by ')
counter -= 1
s = SearchResult()
s.cover_url = cover_url
s.title = title.strip()
@ -74,15 +74,15 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin):
s.price = price.strip()
s.detail_item = id.strip()
s.drm = SearchResult.DRM_UNLOCKED
yield s
def get_details(self, search_result, timeout):
    '''
    Fetch the detail page of @search_result and store the available
    formats (the option texts of the "N1_" select, comma separated and
    upper-cased) in search_result.formats. Always returns True.
    '''
    br = browser()
    detail_url = url_slash_cleaner(self.url + search_result.detail_item)
    with closing(br.open(detail_url, timeout=timeout)) as page:
        tree = html.fromstring(page.read())
        option_texts = tree.xpath('//select[@id="N1_"]//option//text()')
        search_result.formats = ', '.join(option_texts).upper()
    return True

View File

@ -633,8 +633,8 @@ class LibraryPage(QWizardPage, LibraryUI):
try:
lang = prefs['language'].lower()[:2]
metadata_plugins = {
'zh' : ('Douban Books', 'Douban.com covers'),
'fr' : ('Nicebooks', 'Nicebooks covers'),
'zh' : ('Douban Books',),
'fr' : ('Nicebooks',),
}.get(lang, [])
from calibre.customize.ui import enable_plugin
for name in metadata_plugins:

View File

@ -869,7 +869,8 @@ class Engine(threading.Thread):
if DEBUG:
traceback.print_exc()
except:
traceback.print_exc()
if DEBUG:
traceback.print_exc()
except:
pass

View File

@ -1,209 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2010, Webreactor - Marcin Lulek <info@webreactor.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the <organization> nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from lxml import etree
from calibre.ebooks.oeb.base import barename
class EchoTarget:
    '''
    Parser target that converts a stream of start/end/data events into
    Textile markup.

    Output fragments are collected in final_output. While a link is open
    (self.block is True) fragments are buffered in haystack instead, and
    written out as the link text when the link is closed.
    '''

    def __init__(self):
        self.final_output = []
        # True while inside an <a> element.
        self.block = False
        # Nesting depth of ordered / unordered lists.
        self.ol_ident = 0
        self.ul_ident = 0
        # Stack of the currently open list tags ('ul' / 'ol').
        self.list_types = []
        # Buffer for the text of the link that is currently open.
        self.haystack = []

    def _emit(self, fragment):
        # Route a fragment to the link buffer while a link is open and to
        # the final output otherwise.
        if self.block:
            self.haystack.append(fragment)
        else:
            self.final_output.append(fragment)

    def start(self, tag, attrib):
        '''
        Emit the opening Textile markup for @tag.

        @tag: The (possibly namespaced) tag name.
        @attrib: The attribute mapping of the element.
        '''
        tag = barename(tag)
        newline = '\n'
        dot = ''
        new_tag = ''
        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            new_tag = tag
            dot = '. '
        elif tag == 'p':
            new_tag = ''
            dot = ''
        elif tag == 'blockquote':
            new_tag = 'bq'
            dot = '. '
        elif tag in ('b', 'strong'):
            new_tag = '*'
            newline = ''
        elif tag in ('em', 'i'):
            new_tag = '_'
            newline = ''
        elif tag == 'cite':
            new_tag = '??'
            newline = ''
        elif tag == 'del':
            new_tag = '-'
            newline = ''
        elif tag == 'ins':
            new_tag = '+'
            newline = ''
        elif tag == 'sup':
            new_tag = '^'
            newline = ''
        elif tag == 'sub':
            new_tag = '~'
            newline = ''
        elif tag == 'span':
            new_tag = ''
            newline = ''
        elif tag == 'a':
            self.block = True
            if 'title' in attrib:
                self.a_part = {'title':attrib.get('title'),
                               'href':attrib.get('href', '')}
            else:
                self.a_part = {'title':None, 'href':attrib.get('href', '')}
            new_tag = ''
            newline = ''
        elif tag == 'img':
            if 'alt' in attrib:
                # Emit the attribute that was tested for; reading 'title'
                # here produced "(None)" for images with only an alt text.
                new_tag = ' !%s(%s)' % (attrib.get('src'), attrib.get('alt'),)
            else:
                new_tag = ' !%s' % attrib.get('src')
            newline = ''
        elif tag in ('ul', 'ol'):
            new_tag = ''
            newline = ''
            self.list_types.append(tag)
            if tag == 'ul':
                self.ul_ident += 1
            else:
                self.ol_ident += 1
        elif tag == 'li':
            indent = self.ul_ident + self.ol_ident
            # An <li> outside of any list is treated as an unordered item
            # instead of raising IndexError on the empty list stack.
            if not self.list_types or self.list_types[-1] == 'ul':
                new_tag = '*' * indent + ' '
            else:
                new_tag = '#' * indent + ' '
            newline = '\n'

        # List containers themselves produce no markup, only their items do.
        if tag not in ('ul', 'ol'):
            textile = '%(newline)s%(tag)s%(dot)s' % \
                {
                    'newline':newline,
                    'tag':new_tag,
                    'dot':dot
                }
            self._emit(textile)

    def end(self, tag):
        '''
        Emit the closing Textile markup for @tag.

        Closing markers are routed the same way as opening markers, so a
        marker closed inside a link stays inside the link text.
        '''
        tag = barename(tag)
        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'):
            self._emit('\n')
        elif tag in ('b', 'strong'):
            self._emit('*')
        elif tag in ('em', 'i'):
            self._emit('_')
        elif tag == 'cite':
            self._emit('??')
        elif tag == 'del':
            self._emit('-')
        elif tag == 'ins':
            self._emit('+')
        elif tag == 'sup':
            self._emit('^')
        elif tag == 'sub':
            self._emit('~')
        elif tag == 'span':
            self._emit('')
        elif tag == 'a':
            link_text = ''.join(self.haystack)
            if self.a_part['title']:
                textilized = ' "%s (%s)":%s ' % (
                    link_text,
                    self.a_part.get('title'),
                    self.a_part.get('href'),
                )
            else:
                textilized = ' "%s":%s ' % (
                    link_text,
                    self.a_part.get('href'),
                )
            self.haystack = []
            self.final_output.append(textilized)
            self.block = False
        elif tag == 'img':
            self._emit('!')
        elif tag in ('ul', 'ol'):
            if tag == 'ul':
                self.ul_ident -= 1
            else:
                self.ol_ident -= 1
            self.list_types.pop()
            # Terminate the list block once the outermost list is closed.
            if not self.list_types:
                self._emit('\n')

    def data(self, data):
        '''Emit character data with all line feeds removed.'''
        # We don't want any line breaks inside our tags.
        self._emit(data.replace('\n',''))

    def comment(self, text):
        '''Comments are dropped.'''
        pass

    def close(self):
        '''Called when parsing is finished; the result is in final_output.'''
        return "closed!"
def html2textile(html):
    '''
    Convert an HTML string to Textile markup.

    The HTML is first parsed leniently and re-serialised as XML with blank
    text removed, then fed through an XML parser that drives an EchoTarget.
    Returns the collected Textile text with surrounding whitespace
    stripped.
    '''
    # Pass 1: clean the whitespace and convert html to xhtml.
    html_tree = etree.fromstring(html, etree.HTMLParser())
    as_xml = etree.tostring(html_tree, method="xml")
    compact_root = etree.XML(as_xml, etree.XMLParser(remove_blank_text=True))
    compact_xml = etree.tostring(compact_root)
    # Pass 2: build the textile output.
    sink = EchoTarget()
    etree.fromstring(compact_xml, etree.XMLParser(target=sink))
    return ''.join(sink.final_output).strip()