mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
KG updates
This commit is contained in:
commit
0bf6badddd
@ -3,7 +3,6 @@ __license__ = 'GPL v3'
|
||||
'''
|
||||
'''
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.web.feeds import Feed
|
||||
|
||||
|
||||
class ReadersDigest(BasicNewsRecipe):
|
||||
@ -38,151 +37,20 @@ class ReadersDigest(BasicNewsRecipe):
|
||||
'''
|
||||
|
||||
|
||||
remove_tags = [
|
||||
dict(name='h4', attrs={'class':'close'}),
|
||||
dict(name='div', attrs={'class':'fromLine'}),
|
||||
dict(name='img', attrs={'class':'colorTag'}),
|
||||
dict(name='div', attrs={'id':'sponsorArticleHeader'}),
|
||||
dict(name='div', attrs={'class':'horizontalAd'}),
|
||||
dict(name='div', attrs={'id':'imageCounterLeft'}),
|
||||
dict(name='div', attrs={'id':'commentsPrint'})
|
||||
]
|
||||
|
||||
|
||||
feeds = [
|
||||
('New in RD', 'http://feeds.rd.com/ReadersDigest'),
|
||||
('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'),
|
||||
('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'),
|
||||
('Blogs','http://feeds.rd.com/ReadersDigestBlogs')
|
||||
('Food', 'http://www.rd.com/food/feed'),
|
||||
('Health', 'http://www.rd.com/health/feed'),
|
||||
('Home', 'http://www.rd.com/home/feed'),
|
||||
('Family', 'http://www.rd.com/family/feed'),
|
||||
('Money', 'http://www.rd.com/money/feed'),
|
||||
('Travel', 'http://www.rd.com/travel/feed'),
|
||||
]
|
||||
|
||||
cover_url = 'http://www.rd.com/images/logo-main-rd.gif'
|
||||
|
||||
|
||||
|
||||
#-------------------------------------------------------------------------------------------------
|
||||
|
||||
def print_version(self, url):
|
||||
|
||||
# Get the identity number of the current article and append it to the root print URL
|
||||
|
||||
if url.find('/article') > 0:
|
||||
ident = url[url.find('/article')+8:url.find('.html?')-4]
|
||||
url = 'http://www.rd.com/content/printContent.do?contentId=' + ident
|
||||
|
||||
elif url.find('/post') > 0:
|
||||
|
||||
# in this case, have to get the page itself to derive the Print page.
|
||||
soup = self.index_to_soup(url)
|
||||
newsoup = soup.find('ul',attrs={'class':'printBlock'})
|
||||
url = 'http://www.rd.com' + newsoup('a')[0]['href']
|
||||
url = url[0:url.find('&Keep')]
|
||||
|
||||
return url
|
||||
|
||||
#-------------------------------------------------------------------------------------------------
|
||||
|
||||
def parse_index(self):
|
||||
|
||||
pages = [
|
||||
('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}),
|
||||
# useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}),
|
||||
('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'})
|
||||
|
||||
keep_only_tags = dict(id='main-content')
|
||||
remove_tags = [
|
||||
{'class':['post-categories']},
|
||||
]
|
||||
|
||||
feeds = []
|
||||
|
||||
for page in pages:
|
||||
section, url, divider, attrList = page
|
||||
newArticles = self.page_parse(url, divider, attrList)
|
||||
feeds.append((section,newArticles))
|
||||
|
||||
# after the pages of the site have been processed, parse several RSS feeds for additional sections
|
||||
newfeeds = Feed()
|
||||
newfeeds = self.parse_rss()
|
||||
|
||||
|
||||
# The utility code in parse_rss returns a Feed object. Convert each feed/article combination into a form suitable
|
||||
# for this module (parse_index).
|
||||
|
||||
for feed in newfeeds:
|
||||
newArticles = []
|
||||
for article in feed.articles:
|
||||
newArt = {
|
||||
'title' : article.title,
|
||||
'url' : article.url,
|
||||
'date' : article.date,
|
||||
'description' : article.text_summary
|
||||
}
|
||||
newArticles.append(newArt)
|
||||
|
||||
|
||||
# New and Blogs should be the first two feeds.
|
||||
if feed.title == 'New in RD':
|
||||
feeds.insert(0,(feed.title,newArticles))
|
||||
elif feed.title == 'Blogs':
|
||||
feeds.insert(1,(feed.title,newArticles))
|
||||
else:
|
||||
feeds.append((feed.title,newArticles))
|
||||
|
||||
|
||||
return feeds
|
||||
|
||||
#-------------------------------------------------------------------------------------------------
|
||||
|
||||
def page_parse(self, mainurl, divider, attrList):
|
||||
|
||||
articles = []
|
||||
mainsoup = self.index_to_soup(mainurl)
|
||||
for item in mainsoup.findAll(attrs=attrList):
|
||||
newArticle = {
|
||||
'title' : item('img')[0]['alt'],
|
||||
'url' : 'http://www.rd.com'+item('a')[0]['href'],
|
||||
'date' : '',
|
||||
'description' : ''
|
||||
}
|
||||
articles.append(newArticle)
|
||||
|
||||
|
||||
|
||||
return articles
|
||||
|
||||
|
||||
|
||||
#-------------------------------------------------------------------------------------------------
|
||||
|
||||
def parse_rss (self):
|
||||
|
||||
# Do the "official" parse_feeds first
|
||||
feeds = BasicNewsRecipe.parse_feeds(self)
|
||||
|
||||
|
||||
# Loop thru the articles in all feeds to find articles with "recipe" in it
|
||||
recipeArticles = []
|
||||
for curfeed in feeds:
|
||||
delList = []
|
||||
for a,curarticle in enumerate(curfeed.articles):
|
||||
if curarticle.title.upper().find('RECIPE') >= 0:
|
||||
recipeArticles.append(curarticle)
|
||||
delList.append(curarticle)
|
||||
if len(delList)>0:
|
||||
for d in delList:
|
||||
index = curfeed.articles.index(d)
|
||||
curfeed.articles[index:index+1] = []
|
||||
|
||||
# If there are any recipes found, create a new Feed object and append.
|
||||
if len(recipeArticles) > 0:
|
||||
pfeed = Feed()
|
||||
pfeed.title = 'Recipes'
|
||||
pfeed.descrition = 'Recipe Feed (Virtual)'
|
||||
pfeed.image_url = None
|
||||
pfeed.oldest_article = 30
|
||||
pfeed.id_counter = len(recipeArticles)
|
||||
# Create a new Feed, add the recipe articles, and then append
|
||||
# to "official" list of feeds
|
||||
pfeed.articles = recipeArticles[:]
|
||||
feeds.append(pfeed)
|
||||
|
||||
return feeds
|
||||
|
||||
|
@ -33,7 +33,7 @@ class StrategyBusinessRecipe(BasicNewsRecipe):
|
||||
elif c.name.endswith('_password'):
|
||||
br[c.name] = self.password
|
||||
raw = br.submit().read()
|
||||
if '>Logout' not in raw:
|
||||
if 'You have been logged in' not in raw:
|
||||
raise ValueError('Failed to login, check your username and password')
|
||||
return br
|
||||
|
||||
|
@ -628,8 +628,9 @@ from calibre.ebooks.metadata.sources.amazon import Amazon
|
||||
from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
|
||||
from calibre.ebooks.metadata.sources.isbndb import ISBNDB
|
||||
from calibre.ebooks.metadata.sources.overdrive import OverDrive
|
||||
from calibre.ebooks.metadata.sources.douban import Douban
|
||||
|
||||
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive]
|
||||
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban]
|
||||
|
||||
# }}}
|
||||
|
||||
|
@ -92,7 +92,7 @@ def restore_plugin_state_to_default(plugin_or_name):
|
||||
config['enabled_plugins'] = ep
|
||||
|
||||
default_disabled_plugins = set([
|
||||
'Overdrive',
|
||||
'Overdrive', 'Douban Books',
|
||||
])
|
||||
|
||||
def is_disabled(plugin):
|
||||
|
@ -103,6 +103,7 @@ class EPUBInput(InputFormatPlugin):
|
||||
t.set('href', guide_cover)
|
||||
t.set('title', 'Title Page')
|
||||
from calibre.ebooks import render_html_svg_workaround
|
||||
if os.path.exists(guide_cover):
|
||||
renderer = render_html_svg_workaround(guide_cover, log)
|
||||
if renderer is not None:
|
||||
open('calibre_raster_cover.jpg', 'wb').write(
|
||||
|
@ -280,7 +280,7 @@ class Worker(Thread): # Get details {{{
|
||||
class Amazon(Source):
|
||||
|
||||
name = 'Amazon.com'
|
||||
description = _('Downloads metadata from Amazon')
|
||||
description = _('Downloads metadata and covers from Amazon')
|
||||
|
||||
capabilities = frozenset(['identify', 'cover'])
|
||||
touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
|
||||
|
347
src/calibre/ebooks/metadata/sources/douban.py
Normal file
347
src/calibre/ebooks/metadata/sources/douban.py
Normal file
@ -0,0 +1,347 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>; 2011, Li Fanxi <lifanxi@freemindworld.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import time
|
||||
from urllib import urlencode
|
||||
from functools import partial
|
||||
from Queue import Queue, Empty
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.metadata import check_isbn
|
||||
from calibre.ebooks.metadata.sources.base import Source
|
||||
from calibre.ebooks.metadata.book.base import Metadata
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.date import parse_date, utcnow
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
from calibre import as_unicode
|
||||
|
||||
NAMESPACES = {
|
||||
'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
|
||||
'atom' : 'http://www.w3.org/2005/Atom',
|
||||
'db': 'http://www.douban.com/xmlns/',
|
||||
'gd': 'http://schemas.google.com/g/2005'
|
||||
}
|
||||
XPath = partial(etree.XPath, namespaces=NAMESPACES)
|
||||
total_results = XPath('//openSearch:totalResults')
|
||||
start_index = XPath('//openSearch:startIndex')
|
||||
items_per_page = XPath('//openSearch:itemsPerPage')
|
||||
entry = XPath('//atom:entry')
|
||||
entry_id = XPath('descendant::atom:id')
|
||||
title = XPath('descendant::atom:title')
|
||||
description = XPath('descendant::atom:summary')
|
||||
publisher = XPath("descendant::db:attribute[@name='publisher']")
|
||||
isbn = XPath("descendant::db:attribute[@name='isbn13']")
|
||||
date = XPath("descendant::db:attribute[@name='pubdate']")
|
||||
creator = XPath("descendant::db:attribute[@name='author']")
|
||||
booktag = XPath("descendant::db:tag/attribute::name")
|
||||
rating = XPath("descendant::gd:rating/attribute::average")
|
||||
cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
|
||||
|
||||
def get_details(browser, url, timeout): # {{{
|
||||
try:
|
||||
raw = browser.open_novisit(url, timeout=timeout).read()
|
||||
except Exception as e:
|
||||
gc = getattr(e, 'getcode', lambda : -1)
|
||||
if gc() != 403:
|
||||
raise
|
||||
# Douban is throttling us, wait a little
|
||||
time.sleep(2)
|
||||
raw = browser.open_novisit(url, timeout=timeout).read()
|
||||
|
||||
return raw
|
||||
# }}}
|
||||
|
||||
def to_metadata(browser, log, entry_, timeout): # {{{
|
||||
def get_text(extra, x):
|
||||
try:
|
||||
ans = x(extra)
|
||||
if ans:
|
||||
ans = ans[0].text
|
||||
if ans and ans.strip():
|
||||
return ans.strip()
|
||||
except:
|
||||
log.exception('Programming error:')
|
||||
return None
|
||||
|
||||
id_url = entry_id(entry_)[0].text
|
||||
douban_id = id_url.split('/')[-1]
|
||||
title_ = ': '.join([x.text for x in title(entry_)]).strip()
|
||||
authors = [x.text.strip() for x in creator(entry_) if x.text]
|
||||
if not authors:
|
||||
authors = [_('Unknown')]
|
||||
if not id_url or not title:
|
||||
# Silently discard this entry
|
||||
return None
|
||||
|
||||
mi = Metadata(title_, authors)
|
||||
mi.identifiers = {'douban':douban_id}
|
||||
try:
|
||||
raw = get_details(browser, id_url, timeout)
|
||||
feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
|
||||
strip_encoding_pats=True)[0])
|
||||
extra = entry(feed)[0]
|
||||
except:
|
||||
log.exception('Failed to get additional details for', mi.title)
|
||||
return mi
|
||||
mi.comments = get_text(extra, description)
|
||||
mi.publisher = get_text(extra, publisher)
|
||||
|
||||
# ISBN
|
||||
isbns = []
|
||||
for x in [t.text for t in isbn(extra)]:
|
||||
if check_isbn(x):
|
||||
isbns.append(x)
|
||||
if isbns:
|
||||
mi.isbn = sorted(isbns, key=len)[-1]
|
||||
mi.all_isbns = isbns
|
||||
|
||||
# Tags
|
||||
try:
|
||||
btags = [x for x in booktag(extra) if x]
|
||||
tags = []
|
||||
for t in btags:
|
||||
atags = [y.strip() for y in t.split('/')]
|
||||
for tag in atags:
|
||||
if tag not in tags:
|
||||
tags.append(tag)
|
||||
except:
|
||||
log.exception('Failed to parse tags:')
|
||||
tags = []
|
||||
if tags:
|
||||
mi.tags = [x.replace(',', ';') for x in tags]
|
||||
|
||||
# pubdate
|
||||
pubdate = get_text(extra, date)
|
||||
if pubdate:
|
||||
try:
|
||||
default = utcnow().replace(day=15)
|
||||
mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
|
||||
except:
|
||||
log.error('Failed to parse pubdate %r'%pubdate)
|
||||
|
||||
# Ratings
|
||||
if rating(extra):
|
||||
try:
|
||||
mi.rating = float(rating(extra)[0]) / 2.0
|
||||
except:
|
||||
log.exception('Failed to parse rating')
|
||||
mi.rating = 0
|
||||
|
||||
# Cover
|
||||
mi.has_douban_cover = None
|
||||
u = cover_url(extra)
|
||||
if u:
|
||||
u = u[0].replace('/spic/', '/lpic/');
|
||||
# If URL contains "book-default", the book doesn't have a cover
|
||||
if u.find('book-default') == -1:
|
||||
mi.has_douban_cover = u
|
||||
return mi
|
||||
# }}}
|
||||
|
||||
class Douban(Source):
|
||||
|
||||
name = 'Douban Books'
|
||||
author = 'Li Fanxi'
|
||||
version = (2, 0, 0)
|
||||
|
||||
description = _('Downloads metadata and covers from Douban.com')
|
||||
|
||||
capabilities = frozenset(['identify', 'cover'])
|
||||
touched_fields = frozenset(['title', 'authors', 'tags',
|
||||
'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating',
|
||||
'identifier:douban']) # language currently disabled
|
||||
supports_gzip_transfer_encoding = True
|
||||
cached_cover_url_is_reliable = True
|
||||
|
||||
DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
|
||||
DOUBAN_BOOK_URL = 'http://book.douban.com/subject/%s/'
|
||||
|
||||
def get_book_url(self, identifiers): # {{{
|
||||
db = identifiers.get('douban', None)
|
||||
if db is not None:
|
||||
return ('douban', db, self.DOUBAN_BOOK_URL%db)
|
||||
# }}}
|
||||
|
||||
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
|
||||
SEARCH_URL = 'http://api.douban.com/book/subjects?'
|
||||
ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
|
||||
SUBJECT_URL = 'http://api.douban.com/book/subject/'
|
||||
|
||||
q = ''
|
||||
t = None
|
||||
isbn = check_isbn(identifiers.get('isbn', None))
|
||||
subject = identifiers.get('douban', None)
|
||||
if isbn is not None:
|
||||
q = isbn
|
||||
t = 'isbn'
|
||||
elif subject is not None:
|
||||
q = subject
|
||||
t = 'subject'
|
||||
elif title or authors:
|
||||
def build_term(prefix, parts):
|
||||
return ' '.join(x for x in parts)
|
||||
title_tokens = list(self.get_title_tokens(title))
|
||||
if title_tokens:
|
||||
q += build_term('title', title_tokens)
|
||||
author_tokens = self.get_author_tokens(authors,
|
||||
only_first_author=True)
|
||||
if author_tokens:
|
||||
q += ((' ' if q != '' else '') +
|
||||
build_term('author', author_tokens))
|
||||
t = 'search'
|
||||
q = q.strip()
|
||||
if isinstance(q, unicode):
|
||||
q = q.encode('utf-8')
|
||||
if not q:
|
||||
return None
|
||||
url = None
|
||||
if t == "isbn":
|
||||
url = ISBN_URL + q
|
||||
elif t == 'subject':
|
||||
url = SUBJECT_URL + q
|
||||
else:
|
||||
url = SEARCH_URL + urlencode({
|
||||
'q': q,
|
||||
})
|
||||
if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '':
|
||||
url = url + "?apikey=" + self.DOUBAN_API_KEY
|
||||
return url
|
||||
# }}}
|
||||
|
||||
def download_cover(self, log, result_queue, abort, # {{{
|
||||
title=None, authors=None, identifiers={}, timeout=30):
|
||||
cached_url = self.get_cached_cover_url(identifiers)
|
||||
if cached_url is None:
|
||||
log.info('No cached cover found, running identify')
|
||||
rq = Queue()
|
||||
self.identify(log, rq, abort, title=title, authors=authors,
|
||||
identifiers=identifiers)
|
||||
if abort.is_set():
|
||||
return
|
||||
results = []
|
||||
while True:
|
||||
try:
|
||||
results.append(rq.get_nowait())
|
||||
except Empty:
|
||||
break
|
||||
results.sort(key=self.identify_results_keygen(
|
||||
title=title, authors=authors, identifiers=identifiers))
|
||||
for mi in results:
|
||||
cached_url = self.get_cached_cover_url(mi.identifiers)
|
||||
if cached_url is not None:
|
||||
break
|
||||
if cached_url is None:
|
||||
log.info('No cover found')
|
||||
return
|
||||
|
||||
if abort.is_set():
|
||||
return
|
||||
br = self.browser
|
||||
log('Downloading cover from:', cached_url)
|
||||
try:
|
||||
cdata = br.open_novisit(cached_url, timeout=timeout).read()
|
||||
if cdata:
|
||||
result_queue.put((self, cdata))
|
||||
except:
|
||||
log.exception('Failed to download cover from:', cached_url)
|
||||
|
||||
# }}}
|
||||
|
||||
def get_cached_cover_url(self, identifiers): # {{{
|
||||
url = None
|
||||
db = identifiers.get('douban', None)
|
||||
if db is None:
|
||||
isbn = identifiers.get('isbn', None)
|
||||
if isbn is not None:
|
||||
db = self.cached_isbn_to_identifier(isbn)
|
||||
if db is not None:
|
||||
url = self.cached_identifier_to_cover_url(db)
|
||||
|
||||
return url
|
||||
# }}}
|
||||
|
||||
def get_all_details(self, br, log, entries, abort, # {{{
|
||||
result_queue, timeout):
|
||||
for relevance, i in enumerate(entries):
|
||||
try:
|
||||
ans = to_metadata(br, log, i, timeout)
|
||||
if isinstance(ans, Metadata):
|
||||
ans.source_relevance = relevance
|
||||
db = ans.identifiers['douban']
|
||||
for isbn in getattr(ans, 'all_isbns', []):
|
||||
self.cache_isbn_to_identifier(isbn, db)
|
||||
if ans.has_douban_cover:
|
||||
self.cache_identifier_to_cover_url(db,
|
||||
ans.has_douban_cover)
|
||||
self.clean_downloaded_metadata(ans)
|
||||
result_queue.put(ans)
|
||||
except:
|
||||
log.exception(
|
||||
'Failed to get metadata for identify entry:',
|
||||
etree.tostring(i))
|
||||
if abort.is_set():
|
||||
break
|
||||
# }}}
|
||||
|
||||
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
|
||||
identifiers={}, timeout=30):
|
||||
query = self.create_query(log, title=title, authors=authors,
|
||||
identifiers=identifiers)
|
||||
if not query:
|
||||
log.error('Insufficient metadata to construct query')
|
||||
return
|
||||
br = self.browser
|
||||
try:
|
||||
raw = br.open_novisit(query, timeout=timeout).read()
|
||||
except Exception as e:
|
||||
log.exception('Failed to make identify query: %r'%query)
|
||||
return as_unicode(e)
|
||||
try:
|
||||
parser = etree.XMLParser(recover=True, no_network=True)
|
||||
feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
|
||||
strip_encoding_pats=True)[0], parser=parser)
|
||||
entries = entry(feed)
|
||||
except Exception as e:
|
||||
log.exception('Failed to parse identify results')
|
||||
return as_unicode(e)
|
||||
if not entries and identifiers and title and authors and \
|
||||
not abort.is_set():
|
||||
return self.identify(log, result_queue, abort, title=title,
|
||||
authors=authors, timeout=timeout)
|
||||
|
||||
# There is no point running these queries in threads as douban
|
||||
# throttles requests returning 403 Forbidden errors
|
||||
self.get_all_details(br, log, entries, abort, result_queue, timeout)
|
||||
|
||||
return None
|
||||
# }}}
|
||||
|
||||
if __name__ == '__main__': # tests {{{
|
||||
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py
|
||||
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
||||
title_test, authors_test)
|
||||
test_identify_plugin(Douban.name,
|
||||
[
|
||||
|
||||
|
||||
(
|
||||
{'identifiers':{'isbn': '9787536692930'}, 'title':'三体',
|
||||
'authors':['刘慈欣']},
|
||||
[title_test('三体', exact=True),
|
||||
authors_test(['刘慈欣'])]
|
||||
),
|
||||
|
||||
(
|
||||
{'title': 'Linux内核修炼之道', 'authors':['任桥伟']},
|
||||
[title_test('Linux内核修炼之道', exact=False)]
|
||||
),
|
||||
])
|
||||
# }}}
|
||||
|
@ -157,7 +157,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
|
||||
class GoogleBooks(Source):
|
||||
|
||||
name = 'Google'
|
||||
description = _('Downloads metadata from Google Books')
|
||||
description = _('Downloads metadata and covers from Google Books')
|
||||
|
||||
capabilities = frozenset(['identify', 'cover'])
|
||||
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
|
||||
|
@ -382,7 +382,7 @@ def identify(log, abort, # {{{
|
||||
if key not in filter_results:
|
||||
filtered_results.append(r)
|
||||
filter_results.add(key)
|
||||
presults = filtered_results
|
||||
results[plugin] = presults = filtered_results
|
||||
|
||||
plog = logs[plugin].getvalue().strip()
|
||||
log('\n'+'*'*30, plugin.name, '*'*30)
|
||||
|
@ -30,7 +30,7 @@ base_url = 'http://search.overdrive.com/'
|
||||
class OverDrive(Source):
|
||||
|
||||
name = 'Overdrive'
|
||||
description = _('Downloads metadata from Overdrive\'s Content Reserve')
|
||||
description = _('Downloads metadata and covers from Overdrive\'s Content Reserve')
|
||||
|
||||
capabilities = frozenset(['identify', 'cover'])
|
||||
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
|
||||
|
@ -12,7 +12,7 @@ A Humane Web Text Generator
|
||||
#__date__ = '2009/12/04'
|
||||
|
||||
__copyright__ = """
|
||||
Copyright (c) 2011, Leigh Parry
|
||||
Copyright (c) 2011, Leigh Parry <leighparry@blueyonder.co.uk>
|
||||
Copyright (c) 2011, John Schember <john@nachtimwald.com>
|
||||
Copyright (c) 2009, Jason Samsa, http://jsamsa.com/
|
||||
Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/
|
||||
@ -219,14 +219,13 @@ class Textile(object):
|
||||
]
|
||||
glyph_defaults = [
|
||||
(re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign
|
||||
(re.compile(r'(\d+)\'', re.I), r'\1′'), # prime
|
||||
(re.compile(r'(\d+)\"', re.I), r'\1″'), # prime-double
|
||||
(re.compile(r'(\d+)\'(\s)', re.I), r'\1′\2'), # prime
|
||||
(re.compile(r'(\d+)\"(\s)', re.I), r'\1″\2'), # prime-double
|
||||
(re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'<acronym title="\2">\1</acronym>'), # 3+ uppercase acronym
|
||||
(re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'<span class="caps">\1</span>'), # 3+ uppercase
|
||||
(re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis
|
||||
(re.compile(r'^[\*_-]{3,}$', re.M), r'<hr />'), # <hr> scene-break
|
||||
(re.compile(r'\b--\b'), r'—'), # em dash
|
||||
(re.compile(r'(\s)--(\s)'), r'\1—\2'), # em dash
|
||||
(re.compile(r'(^|[^-])--([^-]|$)'), r'\1—\2'), # em dash
|
||||
(re.compile(r'\s-(?:\s|$)'), r' – '), # en dash
|
||||
(re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark
|
||||
(re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered
|
||||
@ -706,6 +705,21 @@ class Textile(object):
|
||||
result.append(line)
|
||||
return ''.join(result)
|
||||
|
||||
def macros_only(self, text):
|
||||
# fix: hackish
|
||||
text = re.sub(r'"\Z', '\" ', text)
|
||||
|
||||
result = []
|
||||
for line in re.compile(r'(<.*?>)', re.U).split(text):
|
||||
if not re.search(r'<.*>', line):
|
||||
rules = []
|
||||
if re.search(r'{.+?}', line):
|
||||
rules = self.macro_defaults
|
||||
for s, r in rules:
|
||||
line = s.sub(r, line)
|
||||
result.append(line)
|
||||
return ''.join(result)
|
||||
|
||||
def vAlign(self, input):
|
||||
d = {'^':'top', '-':'middle', '~':'bottom'}
|
||||
return d.get(input, '')
|
||||
@ -814,6 +828,7 @@ class Textile(object):
|
||||
'fooobar ... and hello world ...'
|
||||
"""
|
||||
|
||||
text = self.macros_only(text)
|
||||
punct = '!"#$%&\'*+,-./:;=?@\\^_`|~'
|
||||
|
||||
pattern = r'''
|
||||
@ -1044,4 +1059,3 @@ def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'):
|
||||
return Textile(restricted=True, lite=lite,
|
||||
noimage=noimage).textile(text, rel='nofollow',
|
||||
html_type=html_type)
|
||||
|
||||
|
@ -66,19 +66,26 @@ class TXTOutput(OutputFormatPlugin):
|
||||
help=_('Do not remove image references within the document. This is only ' \
|
||||
'useful when paired with a txt-output-formatting option that '
|
||||
'is not none because links are always removed with plain text output.')),
|
||||
OptionRecommendation(name='keep_color',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Do not remove font color from output. This is only useful when ' \
|
||||
'txt-output-formatting is set to textile. Textile is the only ' \
|
||||
'formatting that supports setting font color. If this option is ' \
|
||||
'not specified font color will not be set and default to the ' \
|
||||
'color displayed by the reader (generally this is black).')),
|
||||
])
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
if opts.txt_output_formatting.lower() == 'markdown':
|
||||
from calibre.ebooks.txt.markdownml import MarkdownMLizer
|
||||
writer = MarkdownMLizer(log)
|
||||
self.writer = MarkdownMLizer(log)
|
||||
elif opts.txt_output_formatting.lower() == 'textile':
|
||||
from calibre.ebooks.txt.textileml import TextileMLizer
|
||||
writer = TextileMLizer(log)
|
||||
self.writer = TextileMLizer(log)
|
||||
else:
|
||||
writer = TXTMLizer(log)
|
||||
self.writer = TXTMLizer(log)
|
||||
|
||||
txt = writer.extract_content(oeb_book, opts)
|
||||
txt = self.writer.extract_content(oeb_book, opts)
|
||||
txt = clean_ascii_chars(txt)
|
||||
|
||||
log.debug('\tReplacing newlines with selected type...')
|
||||
@ -111,17 +118,28 @@ class TXTZOutput(TXTOutput):
|
||||
from calibre.ebooks.oeb.base import OEB_IMAGES
|
||||
with TemporaryDirectory('_txtz_output') as tdir:
|
||||
# TXT
|
||||
with TemporaryFile('index.txt') as tf:
|
||||
txt_name = 'index.txt'
|
||||
if opts.txt_output_formatting.lower() == 'textile':
|
||||
txt_name = 'index.text'
|
||||
with TemporaryFile(txt_name) as tf:
|
||||
TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
|
||||
shutil.copy(tf, os.path.join(tdir, 'index.txt'))
|
||||
shutil.copy(tf, os.path.join(tdir, txt_name))
|
||||
|
||||
# Images
|
||||
for item in oeb_book.manifest:
|
||||
if item.media_type in OEB_IMAGES:
|
||||
if hasattr(self.writer, 'images'):
|
||||
path = os.path.join(tdir, 'images')
|
||||
if item.href in self.writer.images:
|
||||
href = self.writer.images[item.href]
|
||||
else:
|
||||
continue
|
||||
else:
|
||||
path = os.path.join(tdir, os.path.dirname(item.href))
|
||||
href = os.path.basename(item.href)
|
||||
if not os.path.exists(path):
|
||||
os.makedirs(path)
|
||||
with open(os.path.join(tdir, item.href), 'wb') as imgf:
|
||||
with open(os.path.join(path, href), 'wb') as imgf:
|
||||
imgf.write(item.data)
|
||||
|
||||
# Metadata
|
||||
|
@ -242,6 +242,8 @@ def detect_formatting_type(txt):
|
||||
textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
|
||||
# Links
|
||||
textile_count += len(re.findall(r'"[^"]*":\S+', txt))
|
||||
# paragraph blocks
|
||||
textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt))
|
||||
|
||||
# Decide if either markdown or textile is used in the text
|
||||
# based on the number of unique formatting elements found.
|
||||
|
@ -1,62 +1,489 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Transform OEB content into Textile formatted plain text
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from lxml import etree
|
||||
from functools import partial
|
||||
|
||||
from calibre.ebooks.oeb.base import XHTML
|
||||
from calibre.utils.html2textile import html2textile
|
||||
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.ebooks import unit_convert
|
||||
from calibre.ebooks.txt.unsmarten import unsmarten
|
||||
|
||||
class TextileMLizer(object):
|
||||
|
||||
def __init__(self, log):
|
||||
self.log = log
|
||||
class TextileMLizer(OEB2HTML):
|
||||
|
||||
def extract_content(self, oeb_book, opts):
|
||||
self.log.info('Converting XHTML to Textile formatted TXT...')
|
||||
self.oeb_book = oeb_book
|
||||
self.opts = opts
|
||||
self.in_pre = False
|
||||
self.in_table = False
|
||||
self.links = {}
|
||||
self.list = []
|
||||
self.our_links = []
|
||||
self.in_a_link = False
|
||||
self.our_ids = []
|
||||
self.images = {}
|
||||
self.id_no_text = u''
|
||||
self.style_embed = []
|
||||
self.remove_space_after_newline = False
|
||||
self.base_hrefs = [item.href for item in oeb_book.spine]
|
||||
self.map_resources(oeb_book)
|
||||
|
||||
return self.mlize_spine()
|
||||
self.style_bold = False
|
||||
self.style_italic = False
|
||||
self.style_under = False
|
||||
self.style_strike = False
|
||||
self.style_smallcap = False
|
||||
|
||||
def mlize_spine(self):
|
||||
txt = self.mlize_spine(oeb_book)
|
||||
txt = unsmarten(txt)
|
||||
|
||||
# Do some tidying up
|
||||
txt = self.tidy_up(txt)
|
||||
|
||||
return txt
|
||||
|
||||
def mlize_spine(self, oeb_book):
|
||||
output = [u'']
|
||||
|
||||
for item in self.oeb_book.spine:
|
||||
for item in oeb_book.spine:
|
||||
self.log.debug('Converting %s to Textile formatted TXT...' % item.href)
|
||||
self.rewrite_ids(item.data, item)
|
||||
rewrite_links(item.data, partial(self.rewrite_link, page=item))
|
||||
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile)
|
||||
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
|
||||
output.append('\n\n')
|
||||
return ''.join(output)
|
||||
|
||||
html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
|
||||
def tidy_up(self, text):
|
||||
# May need tweaking and finetuning
|
||||
def check_escaping(text, tests):
|
||||
for t in tests:
|
||||
# I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged
|
||||
txt = '%s' % t
|
||||
if txt != '%':
|
||||
text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text)
|
||||
text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text)
|
||||
text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+')\](\s|[*_\'"?!,.])', r'\1\2\3', text)
|
||||
return text
|
||||
|
||||
if not self.opts.keep_links:
|
||||
html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
|
||||
if not self.opts.keep_image_references:
|
||||
html = re.sub(r'<\s*img[^>]*>', '', html)
|
||||
# Now tidyup links and ids - remove ones that don't have a correponding opposite
|
||||
if self.opts.keep_links:
|
||||
for i in self.our_links:
|
||||
if i[0] == '#':
|
||||
if i not in self.our_ids:
|
||||
text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text)
|
||||
for i in self.our_ids:
|
||||
if i not in self.our_links:
|
||||
text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
|
||||
|
||||
text = html2textile(html)
|
||||
# Remove obvious non-needed escaping, add sub/sup-script ones
|
||||
text = check_escaping(text, ['\*', '_', '\*'])
|
||||
# escape the super/sub-scripts if needed
|
||||
text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text)
|
||||
# escape the super/sub-scripts if needed
|
||||
text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text)
|
||||
|
||||
# Ensure the section ends with at least two new line characters.
|
||||
# This is to prevent the last paragraph from a section being
|
||||
# combined into the fist paragraph of the next.
|
||||
end_chars = text[-4:]
|
||||
# Convert all newlines to \n
|
||||
end_chars = end_chars.replace('\r\n', '\n')
|
||||
end_chars = end_chars.replace('\r', '\n')
|
||||
end_chars = end_chars[-2:]
|
||||
if not end_chars[1] == '\n':
|
||||
text += '\n\n'
|
||||
if end_chars[1] == '\n' and not end_chars[0] == '\n':
|
||||
text += '\n'
|
||||
#remove empty spans
|
||||
text = re.sub(r'%\xa0+', r'%', text)
|
||||
#remove empty spans - MAY MERGE SOME ?
|
||||
text = re.sub(r'%%', r'', text)
|
||||
#remove spans from tagged output
|
||||
text = re.sub(r'%([_+*-]+)%', r'\1', text)
|
||||
#remove spaces before a newline
|
||||
text = re.sub(r' +\n', r'\n', text)
|
||||
#remove newlines at top of file
|
||||
text = re.sub(r'^\n+', r'', text)
|
||||
#correct blockcode paras
|
||||
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
|
||||
#correct blockquote paras
|
||||
text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text)
|
||||
|
||||
output += text
|
||||
#reduce blank lines
|
||||
text = re.sub(r'\n{3}', r'\n\np. \n\n', text)
|
||||
text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text)
|
||||
#Check span following blank para
|
||||
text = re.sub(r'\n+ +%', r' %', text)
|
||||
text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text)
|
||||
# blank paragraph
|
||||
text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text)
|
||||
# blank paragraph
|
||||
text = re.sub(u'\n\xa0', r'\np. ', text)
|
||||
# blank paragraph
|
||||
text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text)
|
||||
text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
|
||||
text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
|
||||
#sort out spaces in tables
|
||||
text = re.sub(r' {2,}\|', r' |', text)
|
||||
|
||||
output = u''.join(output)
|
||||
# Now put back spaces removed earlier as they're needed here
|
||||
text = re.sub(r'\np\.\n', r'\np. \n', text)
|
||||
#reduce blank lines
|
||||
text = re.sub(r' \n\n\n', r' \n\n', text)
|
||||
|
||||
return output
|
||||
return text
|
||||
|
||||
def remove_newlines(self, text):
    """Flatten all newline variants in *text* into single spaces.

    Textile treats raw newlines as significant markup, so inline text must
    be collapsed to one line before it is emitted.  Tabs are dropped
    entirely.  If ``self.remove_space_after_newline`` is set (by a
    preceding ``<br>``), leading spaces are stripped once and the flag is
    cleared again.
    """
    # Replace '\r\n' first so a Windows line ending becomes one space, not two.
    text = text.replace('\r\n', ' ')
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    # Condense redundant spaces created by replacing newlines with spaces.
    text = re.sub(r'[ ]{2,}', ' ', text)
    text = re.sub(r'\t+', '', text)
    # Idiomatic truth test (was 'if ... == True').
    if self.remove_space_after_newline:
        text = re.sub(r'^ +', '', text)
        self.remove_space_after_newline = False
    return text
|
||||
|
||||
def check_styles(self, style):
    """Return a Textile style attribute ('{...}') for *style*.

    Only colour information is kept, and only when the user enabled the
    ``keep_color`` option.  Returns the empty string when there is nothing
    worth emitting.
    """
    parts = []
    if self.opts.keep_color:
        # 'black' is the default text colour, so emitting it would be noise.
        if 'color' in style.cssdict() and style['color'] != 'black':
            parts.append('color:' + style['color'] + ';')
        if 'background' in style.cssdict():
            parts.append('background:' + style['background'] + ';')
    if not parts:
        return ''
    return '{%s}' % ''.join(parts)
|
||||
|
||||
def check_halign(self, style):
    """Map the CSS 'text-align' value to its Textile alignment marker.

    Unknown values (including the CSS default) map to the empty string.
    """
    markers = {'left': '<', 'justify': '<>', 'center': '=', 'right': '>'}
    return markers.get(style['text-align'], '')
|
||||
|
||||
def check_valign(self, style):
    """Map the CSS 'vertical-align' value to its Textile marker.

    Only 'top' ('^') and 'bottom' ('~') are supported; 'middle' was
    deliberately left out by the original author.  Anything else yields
    the empty string.
    """
    markers = {'top': '^', 'bottom': '~'}  # 'middle' intentionally unsupported
    return markers.get(style['vertical-align'], '')
|
||||
|
||||
def check_padding(self, style, stylizer):
    """Translate left/right margin+padding into Textile indent markers.

    Each full em of combined left offset becomes one '(' and each full em
    of right offset one ')', which is how Textile expresses block
    indentation.  Offsets below one em produce nothing.
    """
    markers = ''
    for side, marker in (('left', '('), ('right', ')')):
        total_pts = 0
        # Padding first, then margin, mirroring the original evaluation order.
        for prop in ('padding-' + side, 'margin-' + side):
            if prop in style.cssdict() and style[prop] != 'auto':
                total_pts += unit_convert(style[prop], style.width,
                        style.fontSize, stylizer.profile.dpi)
        ems = int(round(total_pts / stylizer.profile.fbase))
        if ems >= 1:
            markers += marker * ems
    return markers
|
||||
|
||||
def check_id_tag(self, attribs):
    """Return a Textile id marker ('(#id)') if *attribs* carries an id.

    The id is also recorded in ``self.our_ids`` so that ids without a
    matching link can be pruned later, and ``self.id_no_text`` is primed
    with a non-breaking space so the id survives even when the element
    produces no text of its own.
    """
    txt = ''
    # 'in' replaces the deprecated dict.has_key() (removed in Python 3).
    if 'id' in attribs:
        txt = '(#'+attribs['id']+ ')'
        self.our_ids.append('#'+attribs['id'])
        self.id_no_text = u'\xa0'
    return txt
|
||||
|
||||
def build_block(self, tag, style, attribs, stylizer):
    """Assemble the opening Textile block signature for *tag*.

    The result is a newline plus the tag name followed by any id,
    indentation, alignment and colour modifiers; the caller terminates it
    with '. '.
    """
    pieces = ['\n', tag]
    if self.opts.keep_links:
        pieces.append(self.check_id_tag(attribs))
    pieces.append(self.check_padding(style, stylizer))
    pieces.append(self.check_halign(style))
    pieces.append(self.check_styles(style))
    return ''.join(pieces)
|
||||
|
||||
def prepare_string_for_textile(self, txt):
    """Wrap *txt* in a Textile no-format span (' ==...== ') when needed.

    Wrapping happens when the text contains a sequence Textile would
    otherwise interpret as markup: a formatting character (or '??') with
    whitespace on one side and a non-space character on the other.
    """
    looks_like_markup = re.search(
        r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt)
    return ' ==%s== ' % txt if looks_like_markup else txt
|
||||
|
||||
def dump_text(self, elem, stylizer):
    '''
    Recursively convert *elem* and its children into a list of Textile
    text fragments (the caller joins and post-processes them).

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    '''

    # We can only processes tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, basestring) \
       or namespace(elem.tag) != XHTML_NS:
        # Non-element nodes (comments, PIs): keep their tail text when the
        # parent is a real XHTML element, otherwise emit nothing.
        p = elem.getparent()
        if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return ['']

    # Setup our variables.
    text = ['']
    style = stylizer.style(elem)
    tags = []  # closing fragments to emit (reversed) when this element ends
    tag = barename(elem.tag)
    attribs = elem.attrib

    # Ignore anything that is set to not be displayed.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
       or style['visibility'] == 'hidden':
        return ['']

    # Soft scene breaks.
    # Approximate a top margin with blank (nbsp) lines, one per em beyond
    # the first.
    if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
        ems = int(round(float(style.marginTop) / style.fontSize) - 1)
        if ems >= 1:
            text.append(u'\n\n\xa0' * ems)

    # Block-level elements open a Textile block signature; <div> is
    # downgraded to a paragraph since Textile has no div concept.
    if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
        if tag == 'div':
            tag = 'p'
        text.append(self.build_block(tag, style, attribs, stylizer))
        text.append('. ')
        tags.append('\n')

    # Inline styles.  Inside a link the bare marker form is used; outside,
    # the bracketed form ('[_' ... '_]') is used and the open marker is
    # remembered in style_embed so <br> can close/reopen it.
    if style['font-style'] == 'italic' or tag in ('i', 'em'):
        if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
            if self.style_italic == False:
                if self.in_a_link:
                    text.append('_')
                    tags.append('_')
                else:
                    text.append('[_')
                    tags.append('_]')
                    self.style_embed.append('_')
                self.style_italic = True
    if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
        if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
            if self.style_bold == False:
                if self.in_a_link:
                    text.append('*')
                    tags.append('*')
                else:
                    text.append('[*')
                    tags.append('*]')
                    self.style_embed.append('*')
                self.style_bold = True
    if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
        if tag != 'a':
            if self.style_under == False:
                text.append('[+')
                tags.append('+]')
                self.style_embed.append('+')
                self.style_under = True
    if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
        if self.style_strike == False:
            text.append('[-')
            tags.append('-]')
            self.style_embed.append('-')
            self.style_strike = True

    # <br>: close every open inline style, emit the newline, then reopen
    # them so styling continues on the next line.
    if tag == 'br':
        for i in reversed(self.style_embed):
            text.append(i)
        text.append('\n')
        for i in self.style_embed:
            text.append(i)
        tags.append('')
        self.remove_space_after_newline = True

    # Per-tag Textile openers; the matching closer goes into 'tags'.
    if tag == 'blockquote':
        text.append('\nbq. ')
        tags.append('\n')
    elif tag in ('abbr', 'acronym'):
        # The expansion (title attribute) is appended after the element text.
        text.append('')
        txt = attribs['title']
        tags.append('(' + txt + ')')
    elif tag == 'sup':
        text.append('^')
        tags.append('^')
    elif tag == 'sub':
        text.append('~')
        tags.append('~')
    elif tag == 'code':
        # Inside <pre> a block-code signature is used, otherwise inline code.
        if self.in_pre:
            text.append('\nbc. ')
            tags.append('')
        else:
            text.append('@')
            tags.append('@')
    elif tag == 'cite':
        text.append('??')
        tags.append('??')
    elif tag == 'hr':
        text.append('\n***')
        tags.append('\n')
    elif tag == 'pre':
        self.in_pre = True
        text.append('\npre. ')
        tags.append('pre\n')
    elif tag == 'a':
        if self.opts.keep_links:
            if attribs.has_key('href'):
                text.append('"')
                tags.append('a')
                tags.append('":' + attribs['href'])
                self.our_links.append(attribs['href'])
                if attribs.has_key('title'):
                    tags.append('(' + attribs['title'] + ')')
                self.in_a_link = True
            else:
                # Anchor without href (id-only target): emit as a span.
                text.append('%')
                tags.append('%')
    elif tag == 'img':
        if self.opts.keep_image_references:
            txt = '!' + self.check_halign(style)
            txt += self.check_valign(style)
            txt += attribs['src']
            text.append(txt)
            if attribs.has_key('alt'):
                txt = attribs['alt']
                if txt != '':
                    text.append('(' + txt + ')')
            tags.append('!')
    elif tag in ('ol', 'ul'):
        # Track list nesting; the markers are emitted by the <li> branch.
        self.list.append({'name': tag, 'num': 0})
        text.append('')
        tags.append(tag)
    elif tag == 'li':
        if self.list: li = self.list[-1]
        else: li = {'name': 'ul', 'num': 0}
        text.append('\n')
        if li['name'] == 'ul':
            text.append('*' * len(self.list) + ' ')
        elif li['name'] == 'ol':
            text.append('#' * len(self.list) + ' ')
        tags.append('')
    elif tag == 'dl':
        text.append('\n')
        tags.append('')
    elif tag == 'dt':
        text.append('')
        tags.append('\n')
    elif tag == 'dd':
        text.append(' ')
        tags.append('')
    elif tag == 'dd':
        # NOTE(review): unreachable — shadowed by the identical 'dd' branch
        # above; kept byte-identical to the original source.
        text.append('')
        tags.append('\n')
    elif tag == 'table':
        txt = self.build_block(tag, style, attribs, stylizer)
        txt += '. \n'
        if txt != '\ntable. \n':
            text.append(txt)
        else:
            # A bare, unstyled table needs no explicit 'table.' signature.
            text.append('\n')
        tags.append('')
    elif tag == 'tr':
        txt = self.build_block('', style, attribs, stylizer)
        txt += '. '
        if txt != '\n. ':
            # Row modifiers must stay on the row's own line.
            txt = re.sub ('\n', '', txt)
            text.append(txt)
        tags.append('|\n')
    elif tag == 'td':
        text.append('|')
        txt = ''
        txt += self.check_halign(style)
        txt += self.check_valign(style)
        if attribs.has_key ('colspan'):
            txt += '\\' + attribs['colspan']
        if attribs.has_key ('rowspan'):
            txt += '/' + attribs['rowspan']
        txt += self.check_styles(style)
        if txt != '':
            text.append(txt + '. ')
        tags.append('')
    elif tag == 'th':
        text.append('|_. ')
        tags.append('')
    elif tag == 'span':
        if style['font-variant'] == 'small-caps':
            if self.style_smallcap == False:
                text.append('&')
                tags.append('&')
                self.style_smallcap = True
        else:
            # Plain spans become Textile '%' spans, but only when they
            # actually carry an id or style worth keeping.
            if self.in_a_link == False:
                txt = '%'
                if self.opts.keep_links:
                    txt += self.check_id_tag(attribs)
                txt += self.check_styles(style)
                if txt != '%':
                    text.append(txt)
                    tags.append('%')

    # Ids on tags not handled above still need to be recorded/emitted.
    if self.opts.keep_links and attribs.has_key('id'):
        if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'):
            text.append(self.check_id_tag(attribs))

    # Process the styles for any that we want to keep
    if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', \
            'span', 'table', 'tr', 'td'):
        if not self.in_a_link:
            text.append(self.check_styles(style))

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        txt = elem.text
        if not self.in_pre:
            txt = self.prepare_string_for_textile(self.remove_newlines(txt))
        text.append(txt)
        self.id_no_text = u''

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer)

    # Close all open tags.
    tags.reverse()
    for t in tags:
        if tag in ('pre', 'ul', 'ol', 'li', 'table'):
            if tag == 'pre':
                self.in_pre = False
            elif tag in ('ul', 'ol'):
                if self.list: self.list.pop()
                if not self.list: text.append('\n')
        else:
            if t == 'a':
                self.in_a_link = False
                t = ''
            # Emit the pending nbsp placeholder for an id with no text.
            text.append(self.id_no_text)
            self.id_no_text = u''
            # Reset the inline-style state for the marker being closed.
            if t in ('*]', '*'):
                self.style_bold = False
            elif t in ('_]', '_'):
                self.style_italic = False
            elif t == '+]':
                self.style_under = False
            elif t == '-]':
                self.style_strike = False
            elif t == '&':
                self.style_smallcap = False
            if t in ('*]', '_]', '+]', '-]', '*', '_'):
                txt = self.style_embed.pop()
            text.append('%s' % t)

    # Soft scene breaks.
    # Approximate a bottom margin the same way as the top margin above.
    if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
        ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
        if ems >= 1:
            text.append(u'\n\n\xa0' * ems)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        tail = elem.tail
        if not self.in_pre:
            tail = self.prepare_string_for_textile(self.remove_newlines(tail))
        text.append(tail)

    return text
|
||||
|
108
src/calibre/ebooks/txt/unsmarten.py
Normal file
108
src/calibre/ebooks/txt/unsmarten.py
Normal file
@ -0,0 +1,108 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""unsmarten : html2textile helper function"""
|
||||
|
||||
__version__ = '0.1'
|
||||
__author__ = 'Leigh Parry'
|
||||
|
||||
import re
|
||||
|
||||
def unsmarten(txt):
    """Replace smart punctuation and accented characters with Textile glyphs.

    Each rule maps a pattern (the literal character together with its
    numeric and named HTML-entity forms) to a plain-text/Textile
    replacement.  Rule order matters: the apostrophe rule must run after
    the double-quote collapse but before the general single-quote collapse.
    """
    rules = (
        (u'–|–|–', r'-'),           # en-dash
        (u'—|—|—', r'--'),          # em-dash
        (u'…|…|…', r'...'),         # ellipsis

        (u'“|”|″|“|”|″|“|”|″', r'"'),   # double quote
        (u'(["\'‘“]|\s)’', r"\1{'/}"),  # apostrophe (after a quote or space)
        (u'‘|’|′|‘|’|′|‘|’|′', r"'"),   # single quote

        (u'¢|¢|¢', r'{c\}'),        # cent
        (u'£|£|£', r'{L-}'),        # pound
        (u'¥|¥|¥', r'{Y=}'),        # yen
        (u'©|©|©', r'{(c)}'),       # copyright
        (u'®|®|®', r'{(r)}'),       # registered
        (u'¼|¼|¼', r'{1/4}'),       # quarter
        (u'½|½|½', r'{1/2}'),       # half
        (u'¾|¾|¾', r'{3/4}'),       # three-quarter
        (u'À|À|À', r'{A`}'),        # A-grave (fixed: stray ')' removed)
        (u'Á|Á|Á', r"{A'}"),        # A-acute
        (u'Â|Â|Â', r'{A^}'),        # A-circumflex
        (u'Ã|Ã|Ã', r'{A~}'),        # A-tilde
        (u'Ä|Ä|Ä', r'{A"}'),        # A-umlaut
        (u'Å|Å|Å', r'{Ao}'),        # A-ring
        (u'Æ|Æ|Æ', r'{AE}'),        # AE
        (u'Ç|Ç|Ç', r'{C,}'),        # C-cedilla
        (u'È|È|È', r'{E`}'),        # E-grave
        (u'É|É|É', r"{E'}"),        # E-acute
        (u'Ê|Ê|Ê', r'{E^}'),        # E-circumflex
        (u'Ë|Ë|Ë', r'{E"}'),        # E-umlaut
        (u'Ì|Ì|Ì', r'{I`}'),        # I-grave
        (u'Í|Í|Í', r"{I'}"),        # I-acute
        (u'Î|Î|Î', r'{I^}'),        # I-circumflex
        (u'Ï|Ï|Ï', r'{I"}'),        # I-umlaut
        (u'Ð|Ð|Ð', r'{D-}'),        # ETH
        (u'Ñ|Ñ|Ñ', r'{N~}'),        # N-tilde
        (u'Ò|Ò|Ò', r'{O`}'),        # O-grave
        (u'Ó|Ó|Ó', r"{O'}"),        # O-acute
        (u'Ô|Ô|Ô', r'{O^}'),        # O-circumflex
        (u'Õ|Õ|Õ', r'{O~}'),        # O-tilde
        (u'Ö|Ö|Ö', r'{O"}'),        # O-umlaut
        (u'×|×|×', r'{x}'),         # dimension
        (u'Ø|Ø|Ø', r'{O/}'),        # O-slash
        (u'Ù|Ù|Ù', r"{U`}"),        # U-grave
        (u'Ú|Ú|Ú', r"{U'}"),        # U-acute
        (u'Û|Û|Û', r'{U^}'),        # U-circumflex
        (u'Ü|Ü|Ü', r'{U"}'),        # U-umlaut
        (u'Ý|Ý|Ý', r"{Y'}"),        # Y-acute (comment fixed: Ý is acute, not grave)
        (u'ß|ß|ß', r'{sz}'),        # sharp-s
        (u'à|à|à', r'{a`}'),        # a-grave
        (u'á|á|á', r"{a'}"),        # a-acute
        (u'â|â|â', r'{a^}'),        # a-circumflex
        (u'ã|ã|ã', r'{a~}'),        # a-tilde
        (u'ä|ä|ä', r'{a"}'),        # a-umlaut
        (u'å|å|å', r'{ao}'),        # a-ring
        (u'æ|æ|æ', r'{ae}'),        # ae
        (u'ç|ç|ç', r'{c,}'),        # c-cedilla
        (u'è|è|è', r'{e`}'),        # e-grave
        (u'é|é|é', r"{e'}"),        # e-acute
        (u'ê|ê|ê', r'{e^}'),        # e-circumflex
        (u'ë|ë|ë', r'{e"}'),        # e-umlaut
        (u'ì|ì|ì', r'{i`}'),        # i-grave
        (u'í|í|í', r"{i'}"),        # i-acute
        (u'î|î|î', r'{i^}'),        # i-circumflex
        (u'ï|ï|ï', r'{i"}'),        # i-umlaut
        (u'ð|ð|ð', r'{d-}'),        # eth
        (u'ñ|ñ|ñ', r'{n~}'),        # n-tilde
        (u'ò|ò|ò', r'{o`}'),        # o-grave
        (u'ó|ó|ó', r"{o'}"),        # o-acute
        (u'ô|ô|ô', r'{o^}'),        # o-circumflex
        (u'õ|õ|õ', r'{o~}'),        # o-tilde
        (u'ö|ö|ö', r'{o"}'),        # o-umlaut
        (u'ø|ø|ø', r'{o/}'),        # o-stroke
        (u'ù|ù|ù', r'{u`}'),        # u-grave
        (u'ú|ú|ú', r"{u'}"),        # u-acute
        (u'û|û|û', r'{u^}'),        # u-circumflex
        (u'ü|ü|ü', r'{u"}'),        # u-umlaut
        (u'ý|ý|ý', r"{y'}"),        # y-acute
        (u'ÿ|ÿ|ÿ', r'{y"}'),        # y-umlaut
        (u'Œ|Œ|Œ', r'{OE}'),        # OE
        (u'œ|œ|œ', r'{oe}'),        # oe
        (u'Ŝ|Š|Ŝ', r'{S^}'),        # Scaron
        (u'ŝ|š|ŝ', r'{s^}'),        # scaron
        (u'•|•|•', r'{*}'),         # bullet
        (u'₣|₣', r'{Fr}'),          # Franc
        (u'₤|₤', r'{L=}'),          # Lira
        (u'₨|₨', r'{Rs}'),          # Rupee
        (u'€|€|€', r'{C=}'),        # euro
        (u'™|™|™', r'{tm}'),        # trademark
        (u'♠|♠|♠', r'{spade}'),     # spade
        (u'♣|♣|♣', r'{club}'),      # club
        (u'♥|♥|♥', r'{heart}'),     # heart
        (u'♦|♦|♦', r'{diamond}'),   # diamond
    )
    for pattern, replacement in rules:
        txt = re.sub(pattern, replacement, txt)

    # Move into main code?
    # txt = re.sub(u'\xa0', r'p. ', txt)                 # blank paragraph
    # txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt)     # blank paragraph
    # txt = re.sub(u'\n \n', r'\n<br />\n', txt)         # blank paragraph - br tag

    return txt
|
@ -620,7 +620,11 @@ class Application(QApplication):
|
||||
self.original_font = QFont(QApplication.font())
|
||||
fi = gprefs['font']
|
||||
if fi is not None:
|
||||
QApplication.setFont(QFont(*fi))
|
||||
font = QFont(*(fi[:4]))
|
||||
s = gprefs.get('font_stretch', None)
|
||||
if s is not None:
|
||||
font.setStretch(s)
|
||||
QApplication.setFont(font)
|
||||
|
||||
def _send_file_open_events(self):
|
||||
with self._file_open_lock:
|
||||
|
@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form):
|
||||
Widget.__init__(self, parent,
|
||||
['newline', 'max_line_length', 'force_max_line_length',
|
||||
'inline_toc', 'txt_output_formatting', 'keep_links', 'keep_image_references',
|
||||
'txt_output_encoding'])
|
||||
'keep_color', 'txt_output_encoding'])
|
||||
self.db, self.book_id = db, book_id
|
||||
for x in get_option('newline').option.choices:
|
||||
self.opt_newline.addItem(x)
|
||||
|
@ -122,6 +122,13 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<widget class="QCheckBox" name="opt_keep_color">
|
||||
<property name="text">
|
||||
<string>Keep text color, when possible</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
</item>
|
||||
|
@ -161,7 +161,11 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
|
||||
|
||||
def initialize(self):
|
||||
ConfigWidgetBase.initialize(self)
|
||||
self.current_font = self.initial_font = gprefs['font']
|
||||
font = gprefs['font']
|
||||
if font is not None:
|
||||
font = list(font)
|
||||
font.append(gprefs.get('font_stretch', QFont.Unstretched))
|
||||
self.current_font = self.initial_font = font
|
||||
self.update_font_display()
|
||||
self.display_model.initialize()
|
||||
|
||||
@ -178,7 +182,8 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
|
||||
def build_font_obj(self):
|
||||
font_info = self.current_font
|
||||
if font_info is not None:
|
||||
font = QFont(*font_info)
|
||||
font = QFont(*(font_info[:4]))
|
||||
font.setStretch(font_info[4])
|
||||
else:
|
||||
font = qt_app.original_font
|
||||
return font
|
||||
@ -215,15 +220,18 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
|
||||
if fd.exec_() == fd.Accepted:
|
||||
font = fd.selectedFont()
|
||||
fi = QFontInfo(font)
|
||||
self.current_font = (unicode(fi.family()), fi.pointSize(),
|
||||
fi.weight(), fi.italic())
|
||||
self.current_font = [unicode(fi.family()), fi.pointSize(),
|
||||
fi.weight(), fi.italic(), font.stretch()]
|
||||
self.update_font_display()
|
||||
self.changed_signal.emit()
|
||||
|
||||
def commit(self, *args):
|
||||
rr = ConfigWidgetBase.commit(self, *args)
|
||||
if self.current_font != self.initial_font:
|
||||
gprefs['font'] = self.current_font
|
||||
gprefs['font'] = (self.current_font[:4] if self.current_font else
|
||||
None)
|
||||
gprefs['font_stretch'] = (self.current_font[4] if self.current_font
|
||||
is not None else QFont.Unstretched)
|
||||
QApplication.setFont(self.font_display.font())
|
||||
rr = True
|
||||
self.display_model.commit()
|
||||
|
@ -71,9 +71,10 @@ class SourcesModel(QAbstractTableModel): # {{{
|
||||
plugin.is_configured()):
|
||||
return QIcon(I('list_remove.png'))
|
||||
elif role == Qt.ToolTipRole:
|
||||
base = plugin.description + '\n\n'
|
||||
if plugin.is_configured():
|
||||
return _('This source is configured and ready to go')
|
||||
return _('This source needs configuration')
|
||||
return base + _('This source is configured and ready to go')
|
||||
return base + _('This source needs configuration')
|
||||
return NONE
|
||||
|
||||
def setData(self, index, val, role):
|
||||
|
@ -29,7 +29,7 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin):
|
||||
detail_item = self.url + detail_item
|
||||
|
||||
if external or self.config.get('open_external', False):
|
||||
open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url)))
|
||||
open_url(QUrl(url_slash_cleaner(detail_item)))
|
||||
else:
|
||||
d = WebStoreDialog(self.gui, self.url, parent, detail_item)
|
||||
d.setWindowTitle(self.name)
|
||||
|
@ -633,8 +633,8 @@ class LibraryPage(QWizardPage, LibraryUI):
|
||||
try:
|
||||
lang = prefs['language'].lower()[:2]
|
||||
metadata_plugins = {
|
||||
'zh' : ('Douban Books', 'Douban.com covers'),
|
||||
'fr' : ('Nicebooks', 'Nicebooks covers'),
|
||||
'zh' : ('Douban Books',),
|
||||
'fr' : ('Nicebooks',),
|
||||
}.get(lang, [])
|
||||
from calibre.customize.ui import enable_plugin
|
||||
for name in metadata_plugins:
|
||||
|
@ -869,6 +869,7 @@ class Engine(threading.Thread):
|
||||
if DEBUG:
|
||||
traceback.print_exc()
|
||||
except:
|
||||
if DEBUG:
|
||||
traceback.print_exc()
|
||||
except:
|
||||
pass
|
||||
|
@ -1,209 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2010, Webreactor - Marcin Lulek <info@webreactor.eu>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the <organization> nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
from lxml import etree
|
||||
from calibre.ebooks.oeb.base import barename
|
||||
|
||||
class EchoTarget:
    """lxml parser target that converts XHTML parse events into Textile.

    ``final_output`` accumulates Textile fragments as events arrive.
    While inside an <a> element (``block`` is True) fragments are
    diverted into ``haystack`` so the whole link text can be wrapped in a
    single Textile link when the element closes.
    """

    def __init__(self):
        self.final_output = []   # Textile fragments, joined by html2textile()
        self.block = False       # True while buffering the contents of an <a>
        self.ol_ident = 0        # current <ol> nesting depth
        self.ul_ident = 0        # current <ul> nesting depth
        self.list_types = []     # stack of open list kinds ('ul'/'ol')
        self.haystack = []       # buffered fragments for the current <a>

    def start(self, tag, attrib):
        # Map the opening tag to its Textile opener.  'newline' and 'dot'
        # shape the fragment: block tags get a leading newline and a '. '
        # terminator, inline tags get neither.
        tag = barename(tag)

        newline = '\n'
        dot = ''
        new_tag = ''

        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            new_tag = tag
            dot = '. '
        elif tag == 'p':
            # Textile's default block is a paragraph; no signature needed.
            new_tag = ''
            dot = ''
        elif tag == 'blockquote':
            new_tag = 'bq'
            dot = '. '
        elif tag in ('b', 'strong'):
            new_tag = '*'
            newline = ''
        elif tag in ('em', 'i'):
            new_tag = '_'
            newline = ''
        elif tag == 'cite':
            new_tag = '??'
            newline = ''
        elif tag == 'del':
            new_tag = '-'
            newline = ''
        elif tag == 'ins':
            new_tag = '+'
            newline = ''
        elif tag == 'sup':
            new_tag = '^'
            newline = ''
        elif tag == 'sub':
            new_tag = '~'
            newline = ''
        elif tag == 'span':
            # Spans carry no Textile markup of their own.
            new_tag = ''
            newline = ''
        elif tag == 'a':
            # Start buffering: the link is assembled as one piece in end().
            self.block = True
            if 'title' in attrib:
                self.a_part = {'title':attrib.get('title'),
                               'href':attrib.get('href', '')}
            else:
                self.a_part = {'title':None, 'href':attrib.get('href', '')}
            new_tag = ''
            newline = ''

        elif tag == 'img':
            # NOTE(review): tests for 'alt' but inserts 'title' into the
            # parentheses — looks like a bug; confirm intended attribute.
            if 'alt' in attrib:
                new_tag = ' !%s(%s)' % (attrib.get('src'), attrib.get('title'),)
            else:
                new_tag = ' !%s' % attrib.get('src')
            newline = ''

        elif tag in ('ul', 'ol'):
            # Lists emit nothing themselves; <li> emits the '*'/'#' markers
            # using the nesting depth tracked here.
            new_tag = ''
            newline = ''
            self.list_types.append(tag)
            if tag == 'ul':
                self.ul_ident += 1
            else:
                self.ol_ident += 1

        elif tag == 'li':
            indent = self.ul_ident + self.ol_ident
            if self.list_types[-1] == 'ul':
                new_tag = '*' * indent + ' '
                newline = '\n'
            else:
                new_tag = '#' * indent + ' '
                newline = '\n'


        if tag not in ('ul', 'ol'):
            textile = '%(newline)s%(tag)s%(dot)s' % \
                    {
                        'newline':newline,
                        'tag':new_tag,
                        'dot':dot
                    }
            if not self.block:
                self.final_output.append(textile)
            else:
                self.haystack.append(textile)

    def end(self, tag):
        # Emit the Textile closer for the tag that just ended.
        tag = barename(tag)

        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'):
            self.final_output.append('\n')
        elif tag in ('b', 'strong'):
            self.final_output.append('*')
        elif tag in ('em', 'i'):
            self.final_output.append('_')
        elif tag == 'cite':
            self.final_output.append('??')
        elif tag == 'del':
            self.final_output.append('-')
        elif tag == 'ins':
            self.final_output.append('+')
        elif tag == 'sup':
            self.final_output.append('^')
        elif tag == 'sub':
            self.final_output.append('~')
        elif tag == 'span':
            self.final_output.append('')
        elif tag == 'a':
            # Flush the buffered link text as one Textile link.
            if self.a_part['title']:
                textilized = ' "%s (%s)":%s ' % (
                                ''.join(self.haystack),
                                self.a_part.get('title'),
                                self.a_part.get('href'),
                             )
                self.haystack = []
            else:
                textilized = ' "%s":%s ' % (
                                ''.join(self.haystack),
                                self.a_part.get('href'),
                             )
                self.haystack = []
            self.final_output.append(textilized)
            self.block = False
        elif tag == 'img':
            self.final_output.append('!')
        elif tag == 'ul':
            self.ul_ident -= 1
            self.list_types.pop()
            if len(self.list_types) == 0:
                self.final_output.append('\n')
        elif tag == 'ol':
            self.ol_ident -= 1
            self.list_types.pop()
            if len(self.list_types) == 0:
                self.final_output.append('\n')

    def data(self, data):
        # We don't want any linebreaks inside our tags.
        node_data = data.replace('\n','')
        if not self.block:
            self.final_output.append(node_data)
        else:
            self.haystack.append(node_data)

    def comment(self, text):
        # HTML comments produce no Textile output.
        pass

    def close(self):
        # Required by the lxml target API; the return value is unused.
        return "closed!"
|
||||
|
||||
|
||||
def html2textile(html):
    """Convert an HTML fragment into Textile markup.

    Pass 1 normalises the input: the forgiving HTML parser turns it into
    well-formed XHTML and blank text nodes are stripped.  Pass 2 streams
    the cleaned markup through the EchoTarget event handler, which emits
    the Textile fragments that are joined into the result.
    """
    # Pass 1: clean the whitespace and convert html to xhtml.
    html_parser = etree.HTMLParser()
    xhtml = etree.tostring(etree.fromstring(html, html_parser), method="xml")
    stripping_parser = etree.XMLParser(remove_blank_text=True)
    cleaned_html = etree.tostring(etree.XML(xhtml, stripping_parser))
    # Pass 2: build the Textile via parser events.
    target = EchoTarget()
    event_parser = etree.XMLParser(target=target)
    etree.fromstring(cleaned_html, event_parser)
    return ''.join(target.final_output).strip()
|
Loading…
x
Reference in New Issue
Block a user