diff --git a/recipes/readers_digest.recipe b/recipes/readers_digest.recipe index 3689ca4c53..caf5cf081d 100644 --- a/recipes/readers_digest.recipe +++ b/recipes/readers_digest.recipe @@ -3,7 +3,6 @@ __license__ = 'GPL v3' ''' ''' from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.web.feeds import Feed class ReadersDigest(BasicNewsRecipe): @@ -38,151 +37,20 @@ class ReadersDigest(BasicNewsRecipe): ''' - remove_tags = [ - dict(name='h4', attrs={'class':'close'}), - dict(name='div', attrs={'class':'fromLine'}), - dict(name='img', attrs={'class':'colorTag'}), - dict(name='div', attrs={'id':'sponsorArticleHeader'}), - dict(name='div', attrs={'class':'horizontalAd'}), - dict(name='div', attrs={'id':'imageCounterLeft'}), - dict(name='div', attrs={'id':'commentsPrint'}) - ] - - feeds = [ - ('New in RD', 'http://feeds.rd.com/ReadersDigest'), - ('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'), - ('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'), - ('Blogs','http://feeds.rd.com/ReadersDigestBlogs') + ('Food', 'http://www.rd.com/food/feed'), + ('Health', 'http://www.rd.com/health/feed'), + ('Home', 'http://www.rd.com/home/feed'), + ('Family', 'http://www.rd.com/family/feed'), + ('Money', 'http://www.rd.com/money/feed'), + ('Travel', 'http://www.rd.com/travel/feed'), ] cover_url = 'http://www.rd.com/images/logo-main-rd.gif' - - -#------------------------------------------------------------------------------------------------- - - def print_version(self, url): - - # Get the identity number of the current article and append it to the root print URL - - if url.find('/article') > 0: - ident = url[url.find('/article')+8:url.find('.html?')-4] - url = 'http://www.rd.com/content/printContent.do?contentId=' + ident - - elif url.find('/post') > 0: - - # in this case, have to get the page itself to derive the Print page. - soup = self.index_to_soup(url) - newsoup = soup.find('ul',attrs={'class':'printBlock'}) - url = 'http://www.rd.com' + newsoup('a')[0]['href'] - url = url[0:url.find('&Keep')] - - return url - -#------------------------------------------------------------------------------------------------- - - def parse_index(self): - - pages = [ - ('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}), - # useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}), - ('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'}) - + keep_only_tags = dict(id='main-content') + remove_tags = [ + {'class':['post-categories']}, ] - feeds = [] - - for page in pages: - section, url, divider, attrList = page - newArticles = self.page_parse(url, divider, attrList) - feeds.append((section,newArticles)) - - # after the pages of the site have been processed, parse several RSS feeds for additional sections - newfeeds = Feed() - newfeeds = self.parse_rss() - - - # The utility code in parse_rss returns a Feed object. Convert each feed/article combination into a form suitable - # for this module (parse_index). - - for feed in newfeeds: - newArticles = [] - for article in feed.articles: - newArt = { - 'title' : article.title, - 'url' : article.url, - 'date' : article.date, - 'description' : article.text_summary - } - newArticles.append(newArt) - - - # New and Blogs should be the first two feeds. 
- if feed.title == 'New in RD': - feeds.insert(0,(feed.title,newArticles)) - elif feed.title == 'Blogs': - feeds.insert(1,(feed.title,newArticles)) - else: - feeds.append((feed.title,newArticles)) - - - return feeds - -#------------------------------------------------------------------------------------------------- - - def page_parse(self, mainurl, divider, attrList): - - articles = [] - mainsoup = self.index_to_soup(mainurl) - for item in mainsoup.findAll(attrs=attrList): - newArticle = { - 'title' : item('img')[0]['alt'], - 'url' : 'http://www.rd.com'+item('a')[0]['href'], - 'date' : '', - 'description' : '' - } - articles.append(newArticle) - - - - return articles - - - -#------------------------------------------------------------------------------------------------- - - def parse_rss (self): - - # Do the "official" parse_feeds first - feeds = BasicNewsRecipe.parse_feeds(self) - - - # Loop thru the articles in all feeds to find articles with "recipe" in it - recipeArticles = [] - for curfeed in feeds: - delList = [] - for a,curarticle in enumerate(curfeed.articles): - if curarticle.title.upper().find('RECIPE') >= 0: - recipeArticles.append(curarticle) - delList.append(curarticle) - if len(delList)>0: - for d in delList: - index = curfeed.articles.index(d) - curfeed.articles[index:index+1] = [] - - # If there are any recipes found, create a new Feed object and append. - if len(recipeArticles) > 0: - pfeed = Feed() - pfeed.title = 'Recipes' - pfeed.descrition = 'Recipe Feed (Virtual)' - pfeed.image_url = None - pfeed.oldest_article = 30 - pfeed.id_counter = len(recipeArticles) - # Create a new Feed, add the recipe articles, and then append - # to "official" list of feeds - pfeed.articles = recipeArticles[:] - feeds.append(pfeed) - - return feeds diff --git a/recipes/strategy-business.recipe b/recipes/strategy-business.recipe index ab58965e98..a4697ecfcd 100644 --- a/recipes/strategy-business.recipe +++ b/recipes/strategy-business.recipe @@ -33,7 +33,7 @@ class StrategyBusinessRecipe(BasicNewsRecipe): elif c.name.endswith('_password'): br[c.name] = self.password raw = br.submit().read() - if '>Logout' not in raw: + if 'You have been logged in' not in raw: raise ValueError('Failed to login, check your username and password') return br diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 1cfc74c4d3..ce8c7f71cb 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -628,8 +628,9 @@ from calibre.ebooks.metadata.sources.amazon import Amazon from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary from calibre.ebooks.metadata.sources.isbndb import ISBNDB from calibre.ebooks.metadata.sources.overdrive import OverDrive +from calibre.ebooks.metadata.sources.douban import Douban -plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive] +plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban] # }}} diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 5c29f1e79b..e04930dd0c 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -253,7 +253,7 @@ class OutputProfile(Plugin): periodical_date_in_title = True #: Characters used in jackets and catalogs - missing_char = u'x' + missing_char = u'x' ratings_char = u'*' empty_ratings_char = u' ' read_char = u'+' @@ -293,38 +293,38 @@ class iPadOutput(OutputProfile): } ] - missing_char = u'\u2715\u200a' # stylized 'x' plus hair space - ratings_char = u'\u2605' # filled star - 
empty_ratings_char = u'\u2606' # hollow star - read_char = u'\u2713' # check mark + missing_char = u'\u2715\u200a' # stylized 'x' plus hair space + ratings_char = u'\u2605' # filled star + empty_ratings_char = u'\u2606' # hollow star + read_char = u'\u2713' # check mark touchscreen = True # touchscreen_news_css {{{ touchscreen_news_css = u''' - /* hr used in articles */ - .article_articles_list { + /* hr used in articles */ + .article_articles_list { width:18%; - } + } .article_link { - color: #593f29; + color: #593f29; font-style: italic; } .article_next { - -webkit-border-top-right-radius:4px; - -webkit-border-bottom-right-radius:4px; + -webkit-border-top-right-radius:4px; + -webkit-border-bottom-right-radius:4px; font-style: italic; width:32%; } .article_prev { - -webkit-border-top-left-radius:4px; - -webkit-border-bottom-left-radius:4px; + -webkit-border-top-left-radius:4px; + -webkit-border-bottom-left-radius:4px; font-style: italic; width:32%; } - .article_sections_list { + .article_sections_list { width:18%; - } + } .articles_link { font-weight: bold; } @@ -334,8 +334,8 @@ class iPadOutput(OutputProfile): .caption_divider { - border:#ccc 1px solid; - } + border:#ccc 1px solid; + } .touchscreen_navbar { background:#c3bab2; @@ -357,50 +357,50 @@ class iPadOutput(OutputProfile): text-align:center; } - .touchscreen_navbar td a:link { - color: #593f29; - text-decoration: none; - } + .touchscreen_navbar td a:link { + color: #593f29; + text-decoration: none; + } - /* Index formatting */ - .publish_date { - text-align:center; - } - .divider { - border-bottom:1em solid white; - border-top:1px solid gray; - } + /* Index formatting */ + .publish_date { + text-align:center; + } + .divider { + border-bottom:1em solid white; + border-top:1px solid gray; + } - hr.caption_divider { - border-color:black; - border-style:solid; - border-width:1px; - } + hr.caption_divider { + border-color:black; + border-style:solid; + border-width:1px; + } /* Feed summary formatting */ .article_summary { - display:inline-block; - } + display:inline-block; + } .feed { font-family:sans-serif; font-weight:bold; font-size:larger; - } + } .feed_link { font-style: italic; } .feed_next { - -webkit-border-top-right-radius:4px; - -webkit-border-bottom-right-radius:4px; + -webkit-border-top-right-radius:4px; + -webkit-border-bottom-right-radius:4px; font-style: italic; width:40%; } .feed_prev { - -webkit-border-top-left-radius:4px; - -webkit-border-bottom-left-radius:4px; + -webkit-border-top-left-radius:4px; + -webkit-border-bottom-left-radius:4px; font-style: italic; width:40%; } @@ -410,24 +410,24 @@ class iPadOutput(OutputProfile): font-size: 160%; } - .feed_up { + .feed_up { font-weight: bold; width:20%; - } + } .summary_headline { font-weight:bold; text-align:left; - } + } .summary_byline { text-align:left; font-family:monospace; - } + } .summary_text { text-align:left; - } + } ''' # }}} @@ -617,8 +617,8 @@ class KindleOutput(OutputProfile): supports_mobi_indexing = True periodical_date_in_title = False - missing_char = u'x\u2009' - empty_ratings_char = u'\u2606' + missing_char = u'x\u2009' + empty_ratings_char = u'\u2606' ratings_char = u'\u2605' read_char = u'\u2713' @@ -642,8 +642,8 @@ class KindleDXOutput(OutputProfile): #comic_screen_size = (741, 1022) supports_mobi_indexing = True periodical_date_in_title = False - missing_char = u'x\u2009' - empty_ratings_char = u'\u2606' + missing_char = u'x\u2009' + empty_ratings_char = u'\u2606' ratings_char = u'\u2605' read_char = u'\u2713' mobi_ems_per_blockquote = 2.0 diff 
--git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index 70a6e104c3..e955336d3f 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -92,7 +92,7 @@ def restore_plugin_state_to_default(plugin_or_name): config['enabled_plugins'] = ep default_disabled_plugins = set([ - 'Overdrive', + 'Overdrive', 'Douban Books', ]) def is_disabled(plugin): diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py index 917c5ad8ae..ac1d61ce59 100644 --- a/src/calibre/ebooks/epub/input.py +++ b/src/calibre/ebooks/epub/input.py @@ -103,10 +103,11 @@ class EPUBInput(InputFormatPlugin): t.set('href', guide_cover) t.set('title', 'Title Page') from calibre.ebooks import render_html_svg_workaround - renderer = render_html_svg_workaround(guide_cover, log) - if renderer is not None: - open('calibre_raster_cover.jpg', 'wb').write( - renderer) + if os.path.exists(guide_cover): + renderer = render_html_svg_workaround(guide_cover, log) + if renderer is not None: + open('calibre_raster_cover.jpg', 'wb').write( + renderer) def find_opf(self): def attr(n, attr): diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index 31d815af63..40cd54cfbd 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -280,7 +280,7 @@ class Worker(Thread): # Get details {{{ class Amazon(Source): name = 'Amazon.com' - description = _('Downloads metadata from Amazon') + description = _('Downloads metadata and covers from Amazon') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset(['title', 'authors', 'identifier:amazon', diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py new file mode 100644 index 0000000000..3c6bb7b6c7 --- /dev/null +++ b/src/calibre/ebooks/metadata/sources/douban.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ; 2011, Li Fanxi ' +__docformat__ = 'restructuredtext en' + +import time +from urllib import urlencode +from functools import partial +from Queue import Queue, Empty + +from lxml import etree + +from calibre.ebooks.metadata import check_isbn +from calibre.ebooks.metadata.sources.base import Source +from calibre.ebooks.metadata.book.base import Metadata +from calibre.ebooks.chardet import xml_to_unicode +from calibre.utils.date import parse_date, utcnow +from calibre.utils.cleantext import clean_ascii_chars +from calibre import as_unicode + +NAMESPACES = { + 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', + 'atom' : 'http://www.w3.org/2005/Atom', + 'db': 'http://www.douban.com/xmlns/', + 'gd': 'http://schemas.google.com/g/2005' + } +XPath = partial(etree.XPath, namespaces=NAMESPACES) +total_results = XPath('//openSearch:totalResults') +start_index = XPath('//openSearch:startIndex') +items_per_page = XPath('//openSearch:itemsPerPage') +entry = XPath('//atom:entry') +entry_id = XPath('descendant::atom:id') +title = XPath('descendant::atom:title') +description = XPath('descendant::atom:summary') +publisher = XPath("descendant::db:attribute[@name='publisher']") +isbn = XPath("descendant::db:attribute[@name='isbn13']") +date = XPath("descendant::db:attribute[@name='pubdate']") +creator = XPath("descendant::db:attribute[@name='author']") +booktag = 
XPath("descendant::db:tag/attribute::name") +rating = XPath("descendant::gd:rating/attribute::average") +cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href") + +def get_details(browser, url, timeout): # {{{ + try: + raw = browser.open_novisit(url, timeout=timeout).read() + except Exception as e: + gc = getattr(e, 'getcode', lambda : -1) + if gc() != 403: + raise + # Douban is throttling us, wait a little + time.sleep(2) + raw = browser.open_novisit(url, timeout=timeout).read() + + return raw +# }}} + +def to_metadata(browser, log, entry_, timeout): # {{{ + def get_text(extra, x): + try: + ans = x(extra) + if ans: + ans = ans[0].text + if ans and ans.strip(): + return ans.strip() + except: + log.exception('Programming error:') + return None + + id_url = entry_id(entry_)[0].text + douban_id = id_url.split('/')[-1] + title_ = ': '.join([x.text for x in title(entry_)]).strip() + authors = [x.text.strip() for x in creator(entry_) if x.text] + if not authors: + authors = [_('Unknown')] + if not id_url or not title: + # Silently discard this entry + return None + + mi = Metadata(title_, authors) + mi.identifiers = {'douban':douban_id} + try: + raw = get_details(browser, id_url, timeout) + feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), + strip_encoding_pats=True)[0]) + extra = entry(feed)[0] + except: + log.exception('Failed to get additional details for', mi.title) + return mi + mi.comments = get_text(extra, description) + mi.publisher = get_text(extra, publisher) + + # ISBN + isbns = [] + for x in [t.text for t in isbn(extra)]: + if check_isbn(x): + isbns.append(x) + if isbns: + mi.isbn = sorted(isbns, key=len)[-1] + mi.all_isbns = isbns + + # Tags + try: + btags = [x for x in booktag(extra) if x] + tags = [] + for t in btags: + atags = [y.strip() for y in t.split('/')] + for tag in atags: + if tag not in tags: + tags.append(tag) + except: + log.exception('Failed to parse tags:') + tags = [] + if tags: + mi.tags = [x.replace(',', ';') for x in tags] + + # pubdate + pubdate = get_text(extra, date) + if pubdate: + try: + default = utcnow().replace(day=15) + mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) + except: + log.error('Failed to parse pubdate %r'%pubdate) + + # Ratings + if rating(extra): + try: + mi.rating = float(rating(extra)[0]) / 2.0 + except: + log.exception('Failed to parse rating') + mi.rating = 0 + + # Cover + mi.has_douban_cover = None + u = cover_url(extra) + if u: + u = u[0].replace('/spic/', '/lpic/'); + # If URL contains "book-default", the book doesn't have a cover + if u.find('book-default') == -1: + mi.has_douban_cover = u + return mi +# }}} + +class Douban(Source): + + name = 'Douban Books' + author = 'Li Fanxi' + version = (2, 0, 0) + + description = _('Downloads metadata and covers from Douban.com') + + capabilities = frozenset(['identify', 'cover']) + touched_fields = frozenset(['title', 'authors', 'tags', + 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating', + 'identifier:douban']) # language currently disabled + supports_gzip_transfer_encoding = True + cached_cover_url_is_reliable = True + + DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' + DOUBAN_BOOK_URL = 'http://book.douban.com/subject/%s/' + + def get_book_url(self, identifiers): # {{{ + db = identifiers.get('douban', None) + if db is not None: + return ('douban', db, self.DOUBAN_BOOK_URL%db) + # }}} + + def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ + SEARCH_URL = 'http://api.douban.com/book/subjects?' 
+ ISBN_URL = 'http://api.douban.com/book/subject/isbn/' + SUBJECT_URL = 'http://api.douban.com/book/subject/' + + q = '' + t = None + isbn = check_isbn(identifiers.get('isbn', None)) + subject = identifiers.get('douban', None) + if isbn is not None: + q = isbn + t = 'isbn' + elif subject is not None: + q = subject + t = 'subject' + elif title or authors: + def build_term(prefix, parts): + return ' '.join(x for x in parts) + title_tokens = list(self.get_title_tokens(title)) + if title_tokens: + q += build_term('title', title_tokens) + author_tokens = self.get_author_tokens(authors, + only_first_author=True) + if author_tokens: + q += ((' ' if q != '' else '') + + build_term('author', author_tokens)) + t = 'search' + q = q.strip() + if isinstance(q, unicode): + q = q.encode('utf-8') + if not q: + return None + url = None + if t == "isbn": + url = ISBN_URL + q + elif t == 'subject': + url = SUBJECT_URL + q + else: + url = SEARCH_URL + urlencode({ + 'q': q, + }) + if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '': + url = url + "?apikey=" + self.DOUBAN_API_KEY + return url + # }}} + + def download_cover(self, log, result_queue, abort, # {{{ + title=None, authors=None, identifiers={}, timeout=30): + cached_url = self.get_cached_cover_url(identifiers) + if cached_url is None: + log.info('No cached cover found, running identify') + rq = Queue() + self.identify(log, rq, abort, title=title, authors=authors, + identifiers=identifiers) + if abort.is_set(): + return + results = [] + while True: + try: + results.append(rq.get_nowait()) + except Empty: + break + results.sort(key=self.identify_results_keygen( + title=title, authors=authors, identifiers=identifiers)) + for mi in results: + cached_url = self.get_cached_cover_url(mi.identifiers) + if cached_url is not None: + break + if cached_url is None: + log.info('No cover found') + return + + if abort.is_set(): + return + br = self.browser + log('Downloading cover from:', cached_url) + try: + cdata = br.open_novisit(cached_url, timeout=timeout).read() + if cdata: + result_queue.put((self, cdata)) + except: + log.exception('Failed to download cover from:', cached_url) + + # }}} + + def get_cached_cover_url(self, identifiers): # {{{ + url = None + db = identifiers.get('douban', None) + if db is None: + isbn = identifiers.get('isbn', None) + if isbn is not None: + db = self.cached_isbn_to_identifier(isbn) + if db is not None: + url = self.cached_identifier_to_cover_url(db) + + return url + # }}} + + def get_all_details(self, br, log, entries, abort, # {{{ + result_queue, timeout): + for relevance, i in enumerate(entries): + try: + ans = to_metadata(br, log, i, timeout) + if isinstance(ans, Metadata): + ans.source_relevance = relevance + db = ans.identifiers['douban'] + for isbn in getattr(ans, 'all_isbns', []): + self.cache_isbn_to_identifier(isbn, db) + if ans.has_douban_cover: + self.cache_identifier_to_cover_url(db, + ans.has_douban_cover) + self.clean_downloaded_metadata(ans) + result_queue.put(ans) + except: + log.exception( + 'Failed to get metadata for identify entry:', + etree.tostring(i)) + if abort.is_set(): + break + # }}} + + def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ + identifiers={}, timeout=30): + query = self.create_query(log, title=title, authors=authors, + identifiers=identifiers) + if not query: + log.error('Insufficient metadata to construct query') + return + br = self.browser + try: + raw = br.open_novisit(query, timeout=timeout).read() + except Exception as e: + log.exception('Failed to make 
identify query: %r'%query) + return as_unicode(e) + try: + parser = etree.XMLParser(recover=True, no_network=True) + feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), + strip_encoding_pats=True)[0], parser=parser) + entries = entry(feed) + except Exception as e: + log.exception('Failed to parse identify results') + return as_unicode(e) + if not entries and identifiers and title and authors and \ + not abort.is_set(): + return self.identify(log, result_queue, abort, title=title, + authors=authors, timeout=timeout) + + # There is no point running these queries in threads as douban + # throttles requests returning 403 Forbidden errors + self.get_all_details(br, log, entries, abort, result_queue, timeout) + + return None + # }}} + +if __name__ == '__main__': # tests {{{ + # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py + from calibre.ebooks.metadata.sources.test import (test_identify_plugin, + title_test, authors_test) + test_identify_plugin(Douban.name, + [ + + + ( + {'identifiers':{'isbn': '9787536692930'}, 'title':'三体', + 'authors':['刘慈欣']}, + [title_test('三体', exact=True), + authors_test(['刘慈欣'])] + ), + + ( + {'title': 'Linux内核修炼之道', 'authors':['任桥伟']}, + [title_test('Linux内核修炼之道', exact=False)] + ), + ]) +# }}} + diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index b479368bac..bd1043b774 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -157,7 +157,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ class GoogleBooks(Source): name = 'Google' - description = _('Downloads metadata from Google Books') + description = _('Downloads metadata and covers from Google Books') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate', diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py index b084f86294..0cc070c3c6 100644 --- a/src/calibre/ebooks/metadata/sources/identify.py +++ b/src/calibre/ebooks/metadata/sources/identify.py @@ -382,7 +382,7 @@ def identify(log, abort, # {{{ if key not in filter_results: filtered_results.append(r) filter_results.add(key) - presults = filtered_results + results[plugin] = presults = filtered_results plog = logs[plugin].getvalue().strip() log('\n'+'*'*30, plugin.name, '*'*30) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 4ee248579e..f52b1f423b 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -30,7 +30,7 @@ base_url = 'http://search.overdrive.com/' class OverDrive(Source): name = 'Overdrive' - description = _('Downloads metadata from Overdrive\'s Content Reserve') + description = _('Downloads metadata and covers from Overdrive\'s Content Reserve') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate', diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 39f793face..e088d264fc 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -12,7 +12,7 @@ A Humane Web Text Generator #__date__ = '2009/12/04' __copyright__ = """ -Copyright (c) 2011, Leigh Parry +Copyright (c) 2011, Leigh Parry Copyright (c) 2011, John Schember Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ Copyright (c) 
2004, Roberto A. F. De Almeida, http://dealmeida.net/ @@ -219,14 +219,13 @@ class Textile(object): ] glyph_defaults = [ (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign - (re.compile(r'(\d+)\'', re.I), r'\1′'), # prime - (re.compile(r'(\d+)\"', re.I), r'\1″'), # prime-double + (re.compile(r'(\d+)\'(\s)', re.I), r'\1′\2'), # prime + (re.compile(r'(\d+)\"(\s)', re.I), r'\1″\2'), # prime-double (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'\1'), # 3+ uppercase acronym (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis (re.compile(r'^[\*_-]{3,}$', re.M), r'
<hr/>'), #
scene-break - (re.compile(r'\b--\b'), r'—'), # em dash - (re.compile(r'(\s)--(\s)'), r'\1—\2'), # em dash + (re.compile(r'(^|[^-])--([^-]|$)'), r'\1—\2'), # em dash (re.compile(r'\s-(?:\s|$)'), r' – '), # en dash (re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark (re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered @@ -706,6 +705,21 @@ class Textile(object): result.append(line) return ''.join(result) + def macros_only(self, text): + # fix: hackish + text = re.sub(r'"\Z', '\" ', text) + + result = [] + for line in re.compile(r'(<.*?>)', re.U).split(text): + if not re.search(r'<.*>', line): + rules = [] + if re.search(r'{.+?}', line): + rules = self.macro_defaults + for s, r in rules: + line = s.sub(r, line) + result.append(line) + return ''.join(result) + def vAlign(self, input): d = {'^':'top', '-':'middle', '~':'bottom'} return d.get(input, '') @@ -814,6 +828,7 @@ class Textile(object): 'fooobar ... and hello world ...' """ + text = self.macros_only(text) punct = '!"#$%&\'*+,-./:;=?@\\^_`|~' pattern = r''' @@ -1044,4 +1059,3 @@ def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'): return Textile(restricted=True, lite=lite, noimage=noimage).textile(text, rel='nofollow', html_type=html_type) - diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index ac63690996..d9c42eb1dc 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -66,19 +66,26 @@ class TXTOutput(OutputFormatPlugin): help=_('Do not remove image references within the document. This is only ' \ 'useful when paired with a txt-output-formatting option that ' 'is not none because links are always removed with plain text output.')), + OptionRecommendation(name='keep_color', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Do not remove font color from output. This is only useful when ' \ + 'txt-output-formatting is set to textile. Textile is the only ' \ + 'formatting that supports setting font color. 
If this option is ' \ + 'not specified font color will not be set and default to the ' \ + 'color displayed by the reader (generally this is black).')), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): if opts.txt_output_formatting.lower() == 'markdown': from calibre.ebooks.txt.markdownml import MarkdownMLizer - writer = MarkdownMLizer(log) + self.writer = MarkdownMLizer(log) elif opts.txt_output_formatting.lower() == 'textile': from calibre.ebooks.txt.textileml import TextileMLizer - writer = TextileMLizer(log) + self.writer = TextileMLizer(log) else: - writer = TXTMLizer(log) + self.writer = TXTMLizer(log) - txt = writer.extract_content(oeb_book, opts) + txt = self.writer.extract_content(oeb_book, opts) txt = clean_ascii_chars(txt) log.debug('\tReplacing newlines with selected type...') @@ -111,17 +118,28 @@ class TXTZOutput(TXTOutput): from calibre.ebooks.oeb.base import OEB_IMAGES with TemporaryDirectory('_txtz_output') as tdir: # TXT - with TemporaryFile('index.txt') as tf: + txt_name = 'index.txt' + if opts.txt_output_formatting.lower() == 'textile': + txt_name = 'index.text' + with TemporaryFile(txt_name) as tf: TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log) - shutil.copy(tf, os.path.join(tdir, 'index.txt')) + shutil.copy(tf, os.path.join(tdir, txt_name)) # Images for item in oeb_book.manifest: if item.media_type in OEB_IMAGES: - path = os.path.join(tdir, os.path.dirname(item.href)) + if hasattr(self.writer, 'images'): + path = os.path.join(tdir, 'images') + if item.href in self.writer.images: + href = self.writer.images[item.href] + else: + continue + else: + path = os.path.join(tdir, os.path.dirname(item.href)) + href = os.path.basename(item.href) if not os.path.exists(path): os.makedirs(path) - with open(os.path.join(tdir, item.href), 'wb') as imgf: + with open(os.path.join(path, href), 'wb') as imgf: imgf.write(item.data) # Metadata diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 7e161f63bd..54369190de 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -242,6 +242,8 @@ def detect_formatting_type(txt): textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt)) # Links textile_count += len(re.findall(r'"[^"]*":\S+', txt)) + # paragraph blocks + textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt)) # Decide if either markdown or textile is used in the text # based on the number of unique formatting elements found. 
diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index d7e11695c5..36dc9952d2 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -1,62 +1,489 @@ # -*- coding: utf-8 -*- __license__ = 'GPL 3' -__copyright__ = '2011, John Schember ' +__copyright__ = '2011, Leigh Parry ' __docformat__ = 'restructuredtext en' ''' Transform OEB content into Textile formatted plain text ''' - import re -from lxml import etree +from functools import partial -from calibre.ebooks.oeb.base import XHTML -from calibre.utils.html2textile import html2textile +from calibre.ebooks.htmlz.oeb2html import OEB2HTML +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links +from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks import unit_convert +from calibre.ebooks.txt.unsmarten import unsmarten -class TextileMLizer(object): - - def __init__(self, log): - self.log = log +class TextileMLizer(OEB2HTML): def extract_content(self, oeb_book, opts): self.log.info('Converting XHTML to Textile formatted TXT...') - self.oeb_book = oeb_book self.opts = opts + self.in_pre = False + self.in_table = False + self.links = {} + self.list = [] + self.our_links = [] + self.in_a_link = False + self.our_ids = [] + self.images = {} + self.id_no_text = u'' + self.style_embed = [] + self.remove_space_after_newline = False + self.base_hrefs = [item.href for item in oeb_book.spine] + self.map_resources(oeb_book) - return self.mlize_spine() + self.style_bold = False + self.style_italic = False + self.style_under = False + self.style_strike = False + self.style_smallcap = False - def mlize_spine(self): + txt = self.mlize_spine(oeb_book) + txt = unsmarten(txt) + + # Do some tidying up + txt = self.tidy_up(txt) + + return txt + + def mlize_spine(self, oeb_book): output = [u''] - - for item in self.oeb_book.spine: + for item in oeb_book.spine: self.log.debug('Converting %s to Textile formatted TXT...' 
% item.href) + self.rewrite_ids(item.data, item) + rewrite_links(item.data, partial(self.rewrite_link, page=item)) + stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output.append('\n\n') + return ''.join(output) - html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) + def tidy_up(self, text): + # May need tweaking and finetuning + def check_escaping(text, tests): + for t in tests: + # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged + txt = '%s' % t + if txt != '%': + text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text) + text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) + text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+')\](\s|[*_\'"?!,.])', r'\1\2\3', text) + return text - if not self.opts.keep_links: - html = re.sub(r'<\s*/*\s*a[^>]*>', '', html) - if not self.opts.keep_image_references: - html = re.sub(r'<\s*img[^>]*>', '', html) + # Now tidyup links and ids - remove ones that don't have a correponding opposite + if self.opts.keep_links: + for i in self.our_links: + if i[0] == '#': + if i not in self.our_ids: + text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text) + for i in self.our_ids: + if i not in self.our_links: + text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text) + + # Remove obvious non-needed escaping, add sub/sup-script ones + text = check_escaping(text, ['\*', '_', '\*']) + # escape the super/sub-scripts if needed + text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) + # escape the super/sub-scripts if needed + text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) - text = html2textile(html) + #remove empty spans + text = re.sub(r'%\xa0+', r'%', text) + #remove empty spans - MAY MERGE SOME ? + text = re.sub(r'%%', r'', text) + #remove spans from tagged output + text = re.sub(r'%([_+*-]+)%', r'\1', text) + #remove spaces before a newline + text = re.sub(r' +\n', r'\n', text) + #remove newlines at top of file + text = re.sub(r'^\n+', r'', text) + #correct blockcode paras + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) + #correct blockquote paras + text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) - # Ensure the section ends with at least two new line characters. - # This is to prevent the last paragraph from a section being - # combined into the fist paragraph of the next. - end_chars = text[-4:] - # Convert all newlines to \n - end_chars = end_chars.replace('\r\n', '\n') - end_chars = end_chars.replace('\r', '\n') - end_chars = end_chars[-2:] - if not end_chars[1] == '\n': - text += '\n\n' - if end_chars[1] == '\n' and not end_chars[0] == '\n': - text += '\n' + #reduce blank lines + text = re.sub(r'\n{3}', r'\n\np. \n\n', text) + text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) + #Check span following blank para + text = re.sub(r'\n+ +%', r' %', text) + text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) + # blank paragraph + text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text) + # blank paragraph + text = re.sub(u'\n\xa0', r'\np. ', text) + # blank paragraph + text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) + text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) + text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text) + #sort out spaces in tables + text = re.sub(r' {2,}\|', r' |', text) - output += text + # Now put back spaces removed earlier as they're needed here + text = re.sub(r'\np\.\n', r'\np. 
\n', text) + #reduce blank lines + text = re.sub(r' \n\n\n', r' \n\n', text) - output = u''.join(output) + return text - return output + def remove_newlines(self, text): + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + # Condense redundant spaces created by replacing newlines with spaces. + text = re.sub(r'[ ]{2,}', ' ', text) + text = re.sub(r'\t+', '', text) + if self.remove_space_after_newline == True: + text = re.sub(r'^ +', '', text) + self.remove_space_after_newline = False + return text + + def check_styles(self, style): + txt = '{' + if self.opts.keep_color: + if 'color' in style.cssdict() and style['color'] != 'black': + txt += 'color:'+style['color']+';' + if 'background' in style.cssdict(): + txt += 'background:'+style['background']+';' + txt += '}' + if txt == '{}': txt = '' + return txt + + def check_halign(self, style): + tests = {'left':'<','justify':'<>','center':'=','right':'>'} + for i in tests: + if style['text-align'] == i: + return tests[i] + return '' + + def check_valign(self, style): + tests = {'top':'^','bottom':'~'} #, 'middle':'-'} + for i in tests: + if style['vertical-align'] == i: + return tests[i] + return '' + + def check_padding(self, style, stylizer): + txt = '' + left_padding_pts = 0 + left_margin_pts = 0 + if 'padding-left' in style.cssdict() and style['padding-left'] != 'auto': + left_padding_pts = unit_convert(style['padding-left'], style.width, style.fontSize, stylizer.profile.dpi) + if 'margin-left' in style.cssdict() and style['margin-left'] != 'auto': + left_margin_pts = unit_convert(style['margin-left'], style.width, style.fontSize, stylizer.profile.dpi) + left = left_margin_pts + left_padding_pts + emleft = int(round(left / stylizer.profile.fbase)) + if emleft >= 1: + txt += '(' * emleft + right_padding_pts = 0 + right_margin_pts = 0 + if 'padding-right' in style.cssdict() and style['padding-right'] != 'auto': + right_padding_pts = unit_convert(style['padding-right'], style.width, style.fontSize, stylizer.profile.dpi) + if 'margin-right' in style.cssdict() and style['margin-right'] != 'auto': + right_margin_pts = unit_convert(style['margin-right'], style.width, style.fontSize, stylizer.profile.dpi) + right = right_margin_pts + right_padding_pts + emright = int(round(right / stylizer.profile.fbase)) + if emright >= 1: + txt += ')' * emright + + return txt + + def check_id_tag(self, attribs): + txt = '' + if attribs.has_key('id'): + txt = '(#'+attribs['id']+ ')' + self.our_ids.append('#'+attribs['id']) + self.id_no_text = u'\xa0' + return txt + + def build_block(self, tag, style, attribs, stylizer): + txt = '\n' + tag + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_padding(style, stylizer) + txt += self.check_halign(style) + txt += self.check_styles(style) + return txt + + def prepare_string_for_textile(self, txt): + if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt): + return ' ==%s== ' % txt + return txt + + def dump_text(self, elem, stylizer): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + ''' + + # We can only processes tags. If there isn't a tag return any text. + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + p = elem.getparent() + if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \ + and elem.tail: + return [elem.tail] + return [''] + + # Setup our variables. 
+ text = [''] + style = stylizer.style(elem) + tags = [] + tag = barename(elem.tag) + attribs = elem.attrib + + # Ignore anything that is set to not be displayed. + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return [''] + + # Soft scene breaks. + if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto': + ems = int(round(float(style.marginTop) / style.fontSize) - 1) + if ems >= 1: + text.append(u'\n\n\xa0' * ems) + + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): + if tag == 'div': + tag = 'p' + text.append(self.build_block(tag, style, attribs, stylizer)) + text.append('. ') + tags.append('\n') + + if style['font-style'] == 'italic' or tag in ('i', 'em'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): + if self.style_italic == False: + if self.in_a_link: + text.append('_') + tags.append('_') + else: + text.append('[_') + tags.append('_]') + self.style_embed.append('_') + self.style_italic = True + if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): + if self.style_bold == False: + if self.in_a_link: + text.append('*') + tags.append('*') + else: + text.append('[*') + tags.append('*]') + self.style_embed.append('*') + self.style_bold = True + if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): + if tag != 'a': + if self.style_under == False: + text.append('[+') + tags.append('+]') + self.style_embed.append('+') + self.style_under = True + if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): + if self.style_strike == False: + text.append('[-') + tags.append('-]') + self.style_embed.append('-') + self.style_strike = True + if tag == 'br': + for i in reversed(self.style_embed): + text.append(i) + text.append('\n') + for i in self.style_embed: + text.append(i) + tags.append('') + self.remove_space_after_newline = True + if tag == 'blockquote': + text.append('\nbq. ') + tags.append('\n') + elif tag in ('abbr', 'acronym'): + text.append('') + txt = attribs['title'] + tags.append('(' + txt + ')') + elif tag == 'sup': + text.append('^') + tags.append('^') + elif tag == 'sub': + text.append('~') + tags.append('~') + elif tag == 'code': + if self.in_pre: + text.append('\nbc. ') + tags.append('') + else: + text.append('@') + tags.append('@') + elif tag == 'cite': + text.append('??') + tags.append('??') + elif tag == 'hr': + text.append('\n***') + tags.append('\n') + elif tag == 'pre': + self.in_pre = True + text.append('\npre. ') + tags.append('pre\n') + elif tag == 'a': + if self.opts.keep_links: + if attribs.has_key('href'): + text.append('"') + tags.append('a') + tags.append('":' + attribs['href']) + self.our_links.append(attribs['href']) + if attribs.has_key('title'): + tags.append('(' + attribs['title'] + ')') + self.in_a_link = True + else: + text.append('%') + tags.append('%') + elif tag == 'img': + if self.opts.keep_image_references: + txt = '!' 
+ self.check_halign(style) + txt += self.check_valign(style) + txt += attribs['src'] + text.append(txt) + if attribs.has_key('alt'): + txt = attribs['alt'] + if txt != '': + text.append('(' + txt + ')') + tags.append('!') + elif tag in ('ol', 'ul'): + self.list.append({'name': tag, 'num': 0}) + text.append('') + tags.append(tag) + elif tag == 'li': + if self.list: li = self.list[-1] + else: li = {'name': 'ul', 'num': 0} + text.append('\n') + if li['name'] == 'ul': + text.append('*' * len(self.list) + ' ') + elif li['name'] == 'ol': + text.append('#' * len(self.list) + ' ') + tags.append('') + elif tag == 'dl': + text.append('\n') + tags.append('') + elif tag == 'dt': + text.append('') + tags.append('\n') + elif tag == 'dd': + text.append(' ') + tags.append('') + elif tag == 'dd': + text.append('') + tags.append('\n') + elif tag == 'table': + txt = self.build_block(tag, style, attribs, stylizer) + txt += '. \n' + if txt != '\ntable. \n': + text.append(txt) + else: + text.append('\n') + tags.append('') + elif tag == 'tr': + txt = self.build_block('', style, attribs, stylizer) + txt += '. ' + if txt != '\n. ': + txt = re.sub ('\n', '', txt) + text.append(txt) + tags.append('|\n') + elif tag == 'td': + text.append('|') + txt = '' + txt += self.check_halign(style) + txt += self.check_valign(style) + if attribs.has_key ('colspan'): + txt += '\\' + attribs['colspan'] + if attribs.has_key ('rowspan'): + txt += '/' + attribs['rowspan'] + txt += self.check_styles(style) + if txt != '': + text.append(txt + '. ') + tags.append('') + elif tag == 'th': + text.append('|_. ') + tags.append('') + elif tag == 'span': + if style['font-variant'] == 'small-caps': + if self.style_smallcap == False: + text.append('&') + tags.append('&') + self.style_smallcap = True + else: + if self.in_a_link == False: + txt = '%' + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_styles(style) + if txt != '%': + text.append(txt) + tags.append('%') + + if self.opts.keep_links and attribs.has_key('id'): + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'): + text.append(self.check_id_tag(attribs)) + + # Process the styles for any that we want to keep + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', \ + 'span', 'table', 'tr', 'td'): + if not self.in_a_link: + text.append(self.check_styles(style)) + + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text: + txt = elem.text + if not self.in_pre: + txt = self.prepare_string_for_textile(self.remove_newlines(txt)) + text.append(txt) + self.id_no_text = u'' + + # Recurse down into tags within the tag we are in. + for item in elem: + text += self.dump_text(item, stylizer) + + # Close all open tags. + tags.reverse() + for t in tags: + if tag in ('pre', 'ul', 'ol', 'li', 'table'): + if tag == 'pre': + self.in_pre = False + elif tag in ('ul', 'ol'): + if self.list: self.list.pop() + if not self.list: text.append('\n') + else: + if t == 'a': + self.in_a_link = False + t = '' + text.append(self.id_no_text) + self.id_no_text = u'' + if t in ('*]', '*'): + self.style_bold = False + elif t in ('_]', '_'): + self.style_italic = False + elif t == '+]': + self.style_under = False + elif t == '-]': + self.style_strike = False + elif t == '&': + self.style_smallcap = False + if t in ('*]', '_]', '+]', '-]', '*', '_'): + txt = self.style_embed.pop() + text.append('%s' % t) + + # Soft scene breaks. 
+ if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto': + ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) + if ems >= 1: + text.append(u'\n\n\xa0' * ems) + + # Add the text that is outside of the tag. + if hasattr(elem, 'tail') and elem.tail: + tail = elem.tail + if not self.in_pre: + tail = self.prepare_string_for_textile(self.remove_newlines(tail)) + text.append(tail) + + return text diff --git a/src/calibre/ebooks/txt/unsmarten.py b/src/calibre/ebooks/txt/unsmarten.py new file mode 100644 index 0000000000..40444ba601 --- /dev/null +++ b/src/calibre/ebooks/txt/unsmarten.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- + +"""unsmarten : html2textile helper function""" + +__version__ = '0.1' +__author__ = 'Leigh Parry' + +import re + +def unsmarten(txt): + txt = re.sub(u'–|–|–', r'-', txt) # en-dash + txt = re.sub(u'—|—|—', r'--', txt) # em-dash + txt = re.sub(u'…|…|…', r'...', txt) # ellipsis + + txt = re.sub(u'“|”|″|“|”|″|“|”|″', r'"', txt) # double quote + txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt) # apostrophe + txt = re.sub(u'‘|’|′|‘|’|′|‘|’|′', r"'", txt) # single quote + + txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent + txt = re.sub(u'£|£|£', r'{L-}', txt) # pound + txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen + txt = re.sub(u'©|©|©', r'{(c)}', txt) # copyright + txt = re.sub(u'®|®|®', r'{(r)}', txt) # registered + txt = re.sub(u'¼|¼|¼', r'{1/4}', txt) # quarter + txt = re.sub(u'½|½|½', r'{1/2}', txt) # half + txt = re.sub(u'¾|¾|¾', r'{3/4}', txt) # three-quarter + txt = re.sub(u'À|À|À', r'{A`)}', txt) # A-grave + txt = re.sub(u'Á|Á|Á', r"{A'}", txt) # A-acute + txt = re.sub(u'Â|Â|Â', r'{A^}', txt) # A-circumflex + txt = re.sub(u'Ã|Ã|Ã', r'{A~}', txt) # A-tilde + txt = re.sub(u'Ä|Ä|Ä', r'{A"}', txt) # A-umlaut + txt = re.sub(u'Å|Å|Å', r'{Ao}', txt) # A-ring + txt = re.sub(u'Æ|Æ|Æ', r'{AE}', txt) # AE + txt = re.sub(u'Ç|Ç|Ç', r'{C,}', txt) # C-cedilla + txt = re.sub(u'È|È|È', r'{E`}', txt) # E-grave + txt = re.sub(u'É|É|É', r"{E'}", txt) # E-acute + txt = re.sub(u'Ê|Ê|Ê', r'{E^}', txt) # E-circumflex + txt = re.sub(u'Ë|Ë|Ë', r'{E"}', txt) # E-umlaut + txt = re.sub(u'Ì|Ì|Ì', r'{I`}', txt) # I-grave + txt = re.sub(u'Í|Í|Í', r"{I'}", txt) # I-acute + txt = re.sub(u'Î|Î|Î', r'{I^}', txt) # I-circumflex + txt = re.sub(u'Ï|Ï|Ï', r'{I"}', txt) # I-umlaut + txt = re.sub(u'Ð|Ð|Ð', r'{D-}', txt) # ETH + txt = re.sub(u'Ñ|Ñ|Ñ', r'{N~}', txt) # N-tilde + txt = re.sub(u'Ò|Ò|Ò', r'{O`}', txt) # O-grave + txt = re.sub(u'Ó|Ó|Ó', r"{O'}", txt) # O-acute + txt = re.sub(u'Ô|Ô|Ô', r'{O^}', txt) # O-circumflex + txt = re.sub(u'Õ|Õ|Õ', r'{O~}', txt) # O-tilde + txt = re.sub(u'Ö|Ö|Ö', r'{O"}', txt) # O-umlaut + txt = re.sub(u'×|×|×', r'{x}', txt) # dimension + txt = re.sub(u'Ø|Ø|Ø', r'{O/}', txt) # O-slash + txt = re.sub(u'Ù|Ù|Ù', r"{U`}", txt) # U-grave + txt = re.sub(u'Ú|Ú|Ú', r"{U'}", txt) # U-acute + txt = re.sub(u'Û|Û|Û', r'{U^}', txt) # U-circumflex + txt = re.sub(u'Ü|Ü|Ü', r'{U"}', txt) # U-umlaut + txt = re.sub(u'Ý|Ý|Ý', r"{Y'}", txt) # Y-grave + txt = re.sub(u'ß|ß|ß', r'{sz}', txt) # sharp-s + txt = re.sub(u'à|à|à', r'{a`}', txt) # a-grave + txt = re.sub(u'á|á|á', r"{a'}", txt) # a-acute + txt = re.sub(u'â|â|â', r'{a^}', txt) # a-circumflex + txt = re.sub(u'ã|ã|ã', r'{a~}', txt) # a-tilde + txt = re.sub(u'ä|ä|ä', r'{a"}', txt) # a-umlaut + txt = re.sub(u'å|å|å', r'{ao}', txt) # a-ring + txt = re.sub(u'æ|æ|æ', r'{ae}', txt) # ae + txt = re.sub(u'ç|ç|ç', r'{c,}', txt) # c-cedilla + txt = re.sub(u'è|è|è', r'{e`}', txt) # e-grave + txt = re.sub(u'é|é|é', r"{e'}", 
txt) # e-acute + txt = re.sub(u'ê|ê|ê', r'{e^}', txt) # e-circumflex + txt = re.sub(u'ë|ë|ë', r'{e"}', txt) # e-umlaut + txt = re.sub(u'ì|ì|ì', r'{i`}', txt) # i-grave + txt = re.sub(u'í|í|í', r"{i'}", txt) # i-acute + txt = re.sub(u'î|î|î', r'{i^}', txt) # i-circumflex + txt = re.sub(u'ï|ï|ï', r'{i"}', txt) # i-umlaut + txt = re.sub(u'ð|ð|ð', r'{d-}', txt) # eth + txt = re.sub(u'ñ|ñ|ñ', r'{n~}', txt) # n-tilde + txt = re.sub(u'ò|ò|ò', r'{o`}', txt) # o-grave + txt = re.sub(u'ó|ó|ó', r"{o'}", txt) # o-acute + txt = re.sub(u'ô|ô|ô', r'{o^}', txt) # o-circumflex + txt = re.sub(u'õ|õ|õ', r'{o~}', txt) # o-tilde + txt = re.sub(u'ö|ö|ö', r'{o"}', txt) # o-umlaut + txt = re.sub(u'ø|ø|ø', r'{o/}', txt) # o-stroke + txt = re.sub(u'ù|ù|ù', r'{u`}', txt) # u-grave + txt = re.sub(u'ú|ú|ú', r"{u'}", txt) # u-acute + txt = re.sub(u'û|û|û', r'{u^}', txt) # u-circumflex + txt = re.sub(u'ü|ü|ü', r'{u"}', txt) # u-umlaut + txt = re.sub(u'ý|ý|ý', r"{y'}", txt) # y-acute + txt = re.sub(u'ÿ|ÿ|ÿ', r'{y"}', txt) # y-umlaut + txt = re.sub(u'Œ|Œ|Œ', r'{OE}', txt) # OE + txt = re.sub(u'œ|œ|œ', r'{oe}', txt) # oe + txt = re.sub(u'Ŝ|Š|Ŝ', r'{S^}', txt) # Scaron + txt = re.sub(u'ŝ|š|ŝ', r'{s^}', txt) # scaron + txt = re.sub(u'•|•|•', r'{*}', txt) # bullet + txt = re.sub(u'₣|₣', r'{Fr}', txt) # Franc + txt = re.sub(u'₤|₤', r'{L=}', txt) # Lira + txt = re.sub(u'₨|₨', r'{Rs}', txt) # Rupee + txt = re.sub(u'€|€|€', r'{C=}', txt) # euro + txt = re.sub(u'™|™|™', r'{tm}', txt) # trademark + txt = re.sub(u'♠|♠|♠', r'{spade}', txt) # spade + txt = re.sub(u'♣|♣|♣', r'{club}', txt) # club + txt = re.sub(u'♥|♥|♥', r'{heart}', txt) # heart + txt = re.sub(u'♦|♦|♦', r'{diamond}', txt) # diamond + + # Move into main code? +# txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph +# txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph +# txt = re.sub(u'\n \n', r'\n
\n', txt) # blank paragraph - br tag + + return txt diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index 1dfe1d8d14..28504f2a31 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -620,7 +620,11 @@ class Application(QApplication): self.original_font = QFont(QApplication.font()) fi = gprefs['font'] if fi is not None: - QApplication.setFont(QFont(*fi)) + font = QFont(*(fi[:4])) + s = gprefs.get('font_stretch', None) + if s is not None: + font.setStretch(s) + QApplication.setFont(font) def _send_file_open_events(self): with self._file_open_lock: diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py index 8427f83824..816e8d7785 100644 --- a/src/calibre/gui2/convert/txt_output.py +++ b/src/calibre/gui2/convert/txt_output.py @@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form): Widget.__init__(self, parent, ['newline', 'max_line_length', 'force_max_line_length', 'inline_toc', 'txt_output_formatting', 'keep_links', 'keep_image_references', - 'txt_output_encoding']) + 'keep_color', 'txt_output_encoding']) self.db, self.book_id = db, book_id for x in get_option('newline').option.choices: self.opt_newline.addItem(x) diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 1ef9e6e6b9..3a62643551 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -122,6 +122,13 @@ + + + + Keep text color, when possible + + + diff --git a/src/calibre/gui2/preferences/look_feel.py b/src/calibre/gui2/preferences/look_feel.py index 620113cc3f..ee2d7a5428 100644 --- a/src/calibre/gui2/preferences/look_feel.py +++ b/src/calibre/gui2/preferences/look_feel.py @@ -161,7 +161,11 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): def initialize(self): ConfigWidgetBase.initialize(self) - self.current_font = self.initial_font = gprefs['font'] + font = gprefs['font'] + if font is not None: + font = list(font) + font.append(gprefs.get('font_stretch', QFont.Unstretched)) + self.current_font = self.initial_font = font self.update_font_display() self.display_model.initialize() @@ -178,7 +182,8 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): def build_font_obj(self): font_info = self.current_font if font_info is not None: - font = QFont(*font_info) + font = QFont(*(font_info[:4])) + font.setStretch(font_info[4]) else: font = qt_app.original_font return font @@ -215,15 +220,18 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): if fd.exec_() == fd.Accepted: font = fd.selectedFont() fi = QFontInfo(font) - self.current_font = (unicode(fi.family()), fi.pointSize(), - fi.weight(), fi.italic()) + self.current_font = [unicode(fi.family()), fi.pointSize(), + fi.weight(), fi.italic(), font.stretch()] self.update_font_display() self.changed_signal.emit() def commit(self, *args): rr = ConfigWidgetBase.commit(self, *args) if self.current_font != self.initial_font: - gprefs['font'] = self.current_font + gprefs['font'] = (self.current_font[:4] if self.current_font else + None) + gprefs['font_stretch'] = (self.current_font[4] if self.current_font + is not None else QFont.Unstretched) QApplication.setFont(self.font_display.font()) rr = True self.display_model.commit() diff --git a/src/calibre/gui2/preferences/metadata_sources.py b/src/calibre/gui2/preferences/metadata_sources.py index 05ff23987d..f7465fb0ee 100644 --- a/src/calibre/gui2/preferences/metadata_sources.py +++ b/src/calibre/gui2/preferences/metadata_sources.py @@ -71,9 +71,10 @@ class 
SourcesModel(QAbstractTableModel): # {{{ plugin.is_configured()): return QIcon(I('list_remove.png')) elif role == Qt.ToolTipRole: + base = plugin.description + '\n\n' if plugin.is_configured(): - return _('This source is configured and ready to go') - return _('This source needs configuration') + return base + _('This source is configured and ready to go') + return base + _('This source needs configuration') return NONE def setData(self, index, val, role): diff --git a/src/calibre/gui2/store/wizards_tower_books_plugin.py b/src/calibre/gui2/store/wizards_tower_books_plugin.py index 56bb00ff7e..c17ea2ca64 100644 --- a/src/calibre/gui2/store/wizards_tower_books_plugin.py +++ b/src/calibre/gui2/store/wizards_tower_books_plugin.py @@ -29,7 +29,7 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin): detail_item = self.url + detail_item if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url))) + open_url(QUrl(url_slash_cleaner(detail_item))) else: d = WebStoreDialog(self.gui, self.url, parent, detail_item) d.setWindowTitle(self.name) @@ -38,9 +38,9 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin): def search(self, query, max_results=10, timeout=60): url = 'http://www.wizardstowerbooks.com/search.html?for=' + urllib.quote(query) - + br = browser() - + counter = max_results with closing(br.open(url, timeout=timeout)) as f: doc = html.fromstring(f.read()) @@ -60,13 +60,13 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin): price = price.strip() if not price: continue - + title = ''.join(data.xpath('.//span[@class="prti"]/a/b/text()')) author = ''.join(data.xpath('.//p[@class="last"]/text()')) a, b, author = author.partition(' by ') - + counter -= 1 - + s = SearchResult() s.cover_url = cover_url s.title = title.strip() @@ -74,15 +74,15 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin): s.price = price.strip() s.detail_item = id.strip() s.drm = SearchResult.DRM_UNLOCKED - + yield s def get_details(self, search_result, timeout): br = browser() with closing(br.open(url_slash_cleaner(self.url + search_result.detail_item), timeout=timeout)) as nf: idata = html.fromstring(nf.read()) - + formats = ', '.join(idata.xpath('//select[@id="N1_"]//option//text()')) search_result.formats = formats.upper() - + return True diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py index 6b1a793fc8..e139cee191 100644 --- a/src/calibre/gui2/wizard/__init__.py +++ b/src/calibre/gui2/wizard/__init__.py @@ -633,8 +633,8 @@ class LibraryPage(QWizardPage, LibraryUI): try: lang = prefs['language'].lower()[:2] metadata_plugins = { - 'zh' : ('Douban Books', 'Douban.com covers'), - 'fr' : ('Nicebooks', 'Nicebooks covers'), + 'zh' : ('Douban Books',), + 'fr' : ('Nicebooks',), }.get(lang, []) from calibre.customize.ui import enable_plugin for name in metadata_plugins: diff --git a/src/calibre/utils/Zeroconf.py b/src/calibre/utils/Zeroconf.py index fbb9b4e71f..2b3661162f 100755 --- a/src/calibre/utils/Zeroconf.py +++ b/src/calibre/utils/Zeroconf.py @@ -869,7 +869,8 @@ class Engine(threading.Thread): if DEBUG: traceback.print_exc() except: - traceback.print_exc() + if DEBUG: + traceback.print_exc() except: pass diff --git a/src/calibre/utils/html2textile.py b/src/calibre/utils/html2textile.py deleted file mode 100644 index 786e912e36..0000000000 --- a/src/calibre/utils/html2textile.py +++ /dev/null @@ -1,209 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (c) 2010, Webreactor - Marcin 
Lulek -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of the nor the -# names of its contributors may be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -from lxml import etree -from calibre.ebooks.oeb.base import barename - -class EchoTarget: - - def __init__(self): - self.final_output = [] - self.block = False - self.ol_ident = 0 - self.ul_ident = 0 - self.list_types = [] - self.haystack = [] - - def start(self, tag, attrib): - tag = barename(tag) - - newline = '\n' - dot = '' - new_tag = '' - - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'): - new_tag = tag - dot = '. ' - elif tag == 'p': - new_tag = '' - dot = '' - elif tag == 'blockquote': - new_tag = 'bq' - dot = '. ' - elif tag in ('b', 'strong'): - new_tag = '*' - newline = '' - elif tag in ('em', 'i'): - new_tag = '_' - newline = '' - elif tag == 'cite': - new_tag = '??' 
- newline = '' - elif tag == 'del': - new_tag = '-' - newline = '' - elif tag == 'ins': - new_tag = '+' - newline = '' - elif tag == 'sup': - new_tag = '^' - newline = '' - elif tag == 'sub': - new_tag = '~' - newline = '' - elif tag == 'span': - new_tag = '' - newline = '' - elif tag == 'a': - self.block = True - if 'title' in attrib: - self.a_part = {'title':attrib.get('title'), - 'href':attrib.get('href', '')} - else: - self.a_part = {'title':None, 'href':attrib.get('href', '')} - new_tag = '' - newline = '' - - elif tag == 'img': - if 'alt' in attrib: - new_tag = ' !%s(%s)' % (attrib.get('src'), attrib.get('title'),) - else: - new_tag = ' !%s' % attrib.get('src') - newline = '' - - elif tag in ('ul', 'ol'): - new_tag = '' - newline = '' - self.list_types.append(tag) - if tag == 'ul': - self.ul_ident += 1 - else: - self.ol_ident += 1 - - elif tag == 'li': - indent = self.ul_ident + self.ol_ident - if self.list_types[-1] == 'ul': - new_tag = '*' * indent + ' ' - newline = '\n' - else: - new_tag = '#' * indent + ' ' - newline = '\n' - - - if tag not in ('ul', 'ol'): - textile = '%(newline)s%(tag)s%(dot)s' % \ - { - 'newline':newline, - 'tag':new_tag, - 'dot':dot - } - if not self.block: - self.final_output.append(textile) - else: - self.haystack.append(textile) - - def end(self, tag): - tag = barename(tag) - - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): - self.final_output.append('\n') - elif tag in ('b', 'strong'): - self.final_output.append('*') - elif tag in ('em', 'i'): - self.final_output.append('_') - elif tag == 'cite': - self.final_output.append('??') - elif tag == 'del': - self.final_output.append('-') - elif tag == 'ins': - self.final_output.append('+') - elif tag == 'sup': - self.final_output.append('^') - elif tag == 'sub': - self.final_output.append('~') - elif tag == 'span': - self.final_output.append('') - elif tag == 'a': - if self.a_part['title']: - textilized = ' "%s (%s)":%s ' % ( - ''.join(self.haystack), - self.a_part.get('title'), - self.a_part.get('href'), - ) - self.haystack = [] - else: - textilized = ' "%s":%s ' % ( - ''.join(self.haystack), - self.a_part.get('href'), - ) - self.haystack = [] - self.final_output.append(textilized) - self.block = False - elif tag == 'img': - self.final_output.append('!') - elif tag == 'ul': - self.ul_ident -= 1 - self.list_types.pop() - if len(self.list_types) == 0: - self.final_output.append('\n') - elif tag == 'ol': - self.ol_ident -= 1 - self.list_types.pop() - if len(self.list_types) == 0: - self.final_output.append('\n') - - def data(self, data): - #we dont want any linebreaks inside our tags - node_data = data.replace('\n','') - if not self.block: - self.final_output.append(node_data) - else: - self.haystack.append(node_data) - - def comment(self, text): - pass - - def close(self): - return "closed!" - - -def html2textile(html): - #1st pass - #clean the whitespace and convert html to xhtml - parser = etree.HTMLParser() - tree = etree.fromstring(html, parser) - xhtml = etree.tostring(tree, method="xml") - parser = etree.XMLParser(remove_blank_text=True) - root = etree.XML(xhtml, parser) - cleaned_html = etree.tostring(root) - #2nd pass build textile - target = EchoTarget() - parser = etree.XMLParser(target=target) - root = etree.fromstring(cleaned_html, parser) - textilized_text = ''.join(target.final_output).lstrip().rstrip() - return textilized_text