From 626f1b25584705f2f0409ab63f52cf04e3324ad6 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 11 Jan 2011 07:49:08 -0500 Subject: [PATCH 1/3] TXT Input: Textile support. --- src/calibre/ebooks/textile/__init__.py | 3 + src/calibre/ebooks/textile/functions.py | 981 ++++++++++++++++++++++++ src/calibre/ebooks/txt/input.py | 10 +- src/calibre/ebooks/txt/processor.py | 19 + 4 files changed, 1012 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/textile/__init__.py create mode 100644 src/calibre/ebooks/textile/functions.py diff --git a/src/calibre/ebooks/textile/__init__.py b/src/calibre/ebooks/textile/__init__.py new file mode 100644 index 0000000000..eeaeb33940 --- /dev/null +++ b/src/calibre/ebooks/textile/__init__.py @@ -0,0 +1,3 @@ +from functions import textile, textile_restricted, Textile + +__all__ = ['textile', 'textile_restricted'] diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py new file mode 100644 index 0000000000..ec70f591eb --- /dev/null +++ b/src/calibre/ebooks/textile/functions.py @@ -0,0 +1,981 @@ +#!/usr/bin/env python +""" +PyTextile + +A Humane Web Text Generator +""" + +__version__ = '2.1.4' + +__date__ = '2009/12/04' + +__copyright__ = """ +Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ +Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ +Copyright (c) 2003, Mark Pilgrim, http://diveintomark.org/ + +Original PHP Version: +Copyright (c) 2003-2004, Dean Allen +All rights reserved. + +Thanks to Carlo Zottmann for refactoring +Textile's procedural code into a class framework + +Additions and fixes Copyright (c) 2006 Alex Shiels http://thresholdstate.com/ + +""" + +__license__ = """ +L I C E N S E +============= +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name Textile nor the names of its contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +""" + +import re +import uuid +from urlparse import urlparse + +def _normalize_newlines(string): + out = re.sub(r'\r\n', '\n', string) + out = re.sub(r'\n{3,}', '\n\n', out) + out = re.sub(r'\n\s*\n', '\n\n', out) + out = re.sub(r'"$', '" ', out) + return out + +def getimagesize(url): + """ + Attempts to determine an image's width and height, and returns a string + suitable for use in an tag, or None in case of failure. + Requires that PIL is installed. + + >>> getimagesize("http://www.google.com/intl/en_ALL/images/logo.gif") + ... #doctest: +ELLIPSIS, +SKIP + 'width="..." height="..."' + + """ + + try: + import ImageFile + import urllib2 + except ImportError: + return None + + try: + p = ImageFile.Parser() + f = urllib2.urlopen(url) + while True: + s = f.read(1024) + if not s: + break + p.feed(s) + if p.image: + return 'width="%i" height="%i"' % p.image.size + except (IOError, ValueError): + return None + +class Textile(object): + hlgn = r'(?:\<(?!>)|(?|\<\>|\=|[()]+(?! ))' + vlgn = r'[\-^~]' + clas = r'(?:\([^)]+\))' + lnge = r'(?:\[[^\]]+\])' + styl = r'(?:\{[^}]+\})' + cspn = r'(?:\\\d+)' + rspn = r'(?:\/\d+)' + a = r'(?:%s|%s)*' % (hlgn, vlgn) + s = r'(?:%s|%s)*' % (cspn, rspn) + c = r'(?:%s)*' % '|'.join([clas, styl, lnge, hlgn]) + + pnct = r'[-!"#$%&()*+,/:;<=>?@\'\[\\\]\.^_`{|}~]' + # urlch = r'[\w"$\-_.+!*\'(),";/?:@=&%#{}|\\^~\[\]`]' + urlch = '[\w"$\-_.+*\'(),";\/?:@=&%#{}|\\^~\[\]`]' + + url_schemes = ('http', 'https', 'ftp', 'mailto') + + btag = ('bq', 'bc', 'notextile', 'pre', 'h[1-6]', 'fn\d+', 'p') + btag_lite = ('bq', 'bc', 'p') + + glyph_defaults = ( + ('txt_quote_single_open', '‘'), + ('txt_quote_single_close', '’'), + ('txt_quote_double_open', '“'), + ('txt_quote_double_close', '”'), + ('txt_apostrophe', '’'), + ('txt_prime', '′'), + ('txt_prime_double', '″'), + ('txt_ellipsis', '…'), + ('txt_emdash', '—'), + ('txt_endash', '–'), + ('txt_dimension', '×'), + ('txt_trademark', '™'), + ('txt_registered', '®'), + ('txt_copyright', '©'), + ) + + def __init__(self, restricted=False, lite=False, noimage=False): + """docstring for __init__""" + self.restricted = restricted + self.lite = lite + self.noimage = noimage + self.get_sizes = False + self.fn = {} + self.urlrefs = {} + self.shelf = {} + self.rel = '' + self.html_type = 'xhtml' + + def textile(self, text, rel=None, head_offset=0, html_type='xhtml'): + """ + >>> import textile + >>> textile.textile('some textile') + u'\\t

<p>some textile</p>
' + """ + self.html_type = html_type + + # text = unicode(text) + text = _normalize_newlines(text) + + if self.restricted: + text = self.encode_html(text, quotes=False) + + if rel: + self.rel = ' rel="%s"' % rel + + text = self.getRefs(text) + + text = self.block(text, int(head_offset)) + + text = self.retrieve(text) + + return text + + def pba(self, input, element=None): + """ + Parse block attributes. + + >>> t = Textile() + >>> t.pba(r'\3') + '' + >>> t.pba(r'\\3', element='td') + ' colspan="3"' + >>> t.pba(r'/4', element='td') + ' rowspan="4"' + >>> t.pba(r'\\3/4', element='td') + ' colspan="3" rowspan="4"' + + >>> t.vAlign('^') + 'top' + + >>> t.pba('^', element='td') + ' style="vertical-align:top;"' + + >>> t.pba('{line-height:18px}') + ' style="line-height:18px;"' + + >>> t.pba('(foo-bar)') + ' class="foo-bar"' + + >>> t.pba('(#myid)') + ' id="myid"' + + >>> t.pba('(foo-bar#myid)') + ' class="foo-bar" id="myid"' + + >>> t.pba('((((') + ' style="padding-left:4em;"' + + >>> t.pba(')))') + ' style="padding-right:3em;"' + + >>> t.pba('[fr]') + ' lang="fr"' + + """ + style = [] + aclass = '' + lang = '' + colspan = '' + rowspan = '' + id = '' + + if not input: + return '' + + matched = input + if element == 'td': + m = re.search(r'\\(\d+)', matched) + if m: + colspan = m.group(1) + + m = re.search(r'/(\d+)', matched) + if m: + rowspan = m.group(1) + + if element == 'td' or element == 'tr': + m = re.search(r'(%s)' % self.vlgn, matched) + if m: + style.append("vertical-align:%s;" % self.vAlign(m.group(1))) + + m = re.search(r'\{([^}]*)\}', matched) + if m: + style.append(m.group(1).rstrip(';') + ';') + matched = matched.replace(m.group(0), '') + + m = re.search(r'\[([^\]]+)\]', matched, re.U) + if m: + lang = m.group(1) + matched = matched.replace(m.group(0), '') + + m = re.search(r'\(([^()]+)\)', matched, re.U) + if m: + aclass = m.group(1) + matched = matched.replace(m.group(0), '') + + m = re.search(r'([(]+)', matched) + if m: + style.append("padding-left:%sem;" % len(m.group(1))) + matched = matched.replace(m.group(0), '') + + m = re.search(r'([)]+)', matched) + if m: + style.append("padding-right:%sem;" % len(m.group(1))) + matched = matched.replace(m.group(0), '') + + m = re.search(r'(%s)' % self.hlgn, matched) + if m: + style.append("text-align:%s;" % self.hAlign(m.group(1))) + + m = re.search(r'^(.*)#(.*)$', aclass) + if m: + id = m.group(2) + aclass = m.group(1) + + if self.restricted: + if lang: + return ' lang="%s"' + else: + return '' + + result = [] + if style: + result.append(' style="%s"' % "".join(style)) + if aclass: + result.append(' class="%s"' % aclass) + if lang: + result.append(' lang="%s"' % lang) + if id: + result.append(' id="%s"' % id) + if colspan: + result.append(' colspan="%s"' % colspan) + if rowspan: + result.append(' rowspan="%s"' % rowspan) + return ''.join(result) + + def hasRawText(self, text): + """ + checks whether the text has text not already enclosed by a block tag + + >>> t = Textile() + >>> t.hasRawText('

<p>foo bar biz baz</p>
') + False + + >>> t.hasRawText(' why yes, yes it does') + True + + """ + r = re.compile(r'<(p|blockquote|div|form|table|ul|ol|pre|h\d)[^>]*?>.*', re.S).sub('', text.strip()).strip() + r = re.compile(r'<(hr|br)[^>]*?/>').sub('', r) + return '' != r + + def table(self, text): + r""" + >>> t = Textile() + >>> t.table('|one|two|three|\n|a|b|c|') + '\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t
onetwothree
abc
\n\n' + """ + text = text + "\n\n" + pattern = re.compile(r'^(?:table(_?%(s)s%(a)s%(c)s)\. ?\n)?^(%(a)s%(c)s\.? ?\|.*\|)\n\n' % {'s':self.s, 'a':self.a, 'c':self.c}, re.S|re.M|re.U) + return pattern.sub(self.fTable, text) + + def fTable(self, match): + tatts = self.pba(match.group(1), 'table') + rows = [] + for row in [ x for x in match.group(2).split('\n') if x]: + rmtch = re.search(r'^(%s%s\. )(.*)' % (self.a, self.c), row.lstrip()) + if rmtch: + ratts = self.pba(rmtch.group(1), 'tr') + row = rmtch.group(2) + else: + ratts = '' + + cells = [] + for cell in row.split('|')[1:-1]: + ctyp = 'd' + if re.search(r'^_', cell): + ctyp = "h" + cmtch = re.search(r'^(_?%s%s%s\. )(.*)' % (self.s, self.a, self.c), cell) + if cmtch: + catts = self.pba(cmtch.group(1), 'td') + cell = cmtch.group(2) + else: + catts = '' + + cell = self.graf(self.span(cell)) + cells.append('\t\t\t%s' % (ctyp, catts, cell, ctyp)) + rows.append("\t\t\n%s\n\t\t" % (ratts, '\n'.join(cells))) + cells = [] + catts = None + return "\t\n%s\n\t\n\n" % (tatts, '\n'.join(rows)) + + def lists(self, text): + """ + >>> t = Textile() + >>> t.lists("* one\\n* two\\n* three") + '\\t
<ul>\\n\\t\\t<li>one</li>\\n\\t\\t<li>two</li>\\n\\t\\t<li>three</li>\\n\\t</ul>
' + """ + pattern = re.compile(r'^([#*]+%s .*)$(?![^#*])' % self.c, re.U|re.M|re.S) + return pattern.sub(self.fList, text) + + def fList(self, match): + text = match.group(0).split("\n") + result = [] + lists = [] + for i, line in enumerate(text): + try: + nextline = text[i+1] + except IndexError: + nextline = '' + + m = re.search(r"^([#*]+)(%s%s) (.*)$" % (self.a, self.c), line, re.S) + if m: + tl, atts, content = m.groups() + nl = '' + nm = re.search(r'^([#*]+)\s.*', nextline) + if nm: + nl = nm.group(1) + if tl not in lists: + lists.append(tl) + atts = self.pba(atts) + line = "\t<%sl%s>\n\t\t
<li>%s" % (self.lT(tl), atts, self.graf(content)) + else: + line = "\t\t
<li>" + self.graf(content) + + if len(nl) <= len(tl): + line = line + "
</li>" + for k in reversed(lists): + if len(k) > len(nl): + line = line + "\n\t</%sl>" % self.lT(k) + if len(k) > 1: + line = line + "</li>" + lists.remove(k) + + result.append(line) + return "\n".join(result) + + def lT(self, input): + if re.search(r'^#+', input): + return 'o' + else: + return 'u' + + def doPBr(self, in_): + return re.compile(r'<(p)([^>]*?)>(.*)(</p>)', re.S).sub(self.doBr, in_) + + def doBr(self, match): + if self.html_type == 'html': + content = re.sub(r'(.+)(?:(?<!<br>)|(?<!<br />))\n(?![#*\s|])', '\\1<br>
', match.group(3)) + else: + content = re.sub(r'(.+)(?:(?<!<br>)|(?<!<br />))\n(?![#*\s|])', '\\1<br />
    ', match.group(3)) + return '<%s%s>%s%s' % (match.group(1), match.group(2), content, match.group(4)) + + def block(self, text, head_offset = 0): + """ + >>> t = Textile() + >>> t.block('h1. foobar baby') + '\\t

<h1>foobar baby</h1>
    ' + """ + if not self.lite: + tre = '|'.join(self.btag) + else: + tre = '|'.join(self.btag_lite) + text = text.split('\n\n') + + tag = 'p' + atts = cite = graf = ext = '' + + out = [] + + anon = False + for line in text: + pattern = r'^(%s)(%s%s)\.(\.?)(?::(\S+))? (.*)$' % (tre, self.a, self.c) + match = re.search(pattern, line, re.S) + if match: + if ext: + out.append(out.pop() + c1) + + tag, atts, ext, cite, graf = match.groups() + h_match = re.search(r'h([1-6])', tag) + if h_match: + head_level, = h_match.groups() + tag = 'h%i' % max(1, + min(int(head_level) + head_offset, + 6)) + o1, o2, content, c2, c1 = self.fBlock(tag, atts, ext, + cite, graf) + # leave off c1 if this block is extended, + # we'll close it at the start of the next block + + if ext: + line = "%s%s%s%s" % (o1, o2, content, c2) + else: + line = "%s%s%s%s%s" % (o1, o2, content, c2, c1) + + else: + anon = True + if ext or not re.search(r'^\s', line): + o1, o2, content, c2, c1 = self.fBlock(tag, atts, ext, + cite, line) + # skip $o1/$c1 because this is part of a continuing + # extended block + if tag == 'p' and not self.hasRawText(content): + line = content + else: + line = "%s%s%s" % (o2, content, c2) + else: + line = self.graf(line) + + line = self.doPBr(line) + if self.html_type == 'xhtml': + line = re.sub(r'
<br>', '<br />
    ', line) + + if ext and anon: + out.append(out.pop() + "\n" + line) + else: + out.append(line) + + if not ext: + tag = 'p' + atts = '' + cite = '' + graf = '' + + if ext: + out.append(out.pop() + c1) + return '\n\n'.join(out) + + def fBlock(self, tag, atts, ext, cite, content): + """ + >>> t = Textile() + >>> t.fBlock("bq", "", None, "", "Hello BlockQuote") + ('\\t
<blockquote>\\n', '\\t\\t<p>', 'Hello BlockQuote', '</p>', '\\n\\t</blockquote>
    ') + + >>> t.fBlock("bq", "", None, "http://google.com", "Hello BlockQuote") + ('\\t
<blockquote cite="http://google.com">\\n', '\\t\\t<p>', 'Hello BlockQuote', '</p>', '\\n\\t</blockquote>
    ') + + >>> t.fBlock("bc", "", None, "", 'printf "Hello, World";') # doctest: +ELLIPSIS + ('
<pre>', '<code>', ..., '</code>', '</pre>
    ') + + >>> t.fBlock("h1", "", None, "", "foobar") + ('', '\\t

<h1>', 'foobar', '</h1>
    ', '') + """ + atts = self.pba(atts) + o1 = o2 = c2 = c1 = '' + + m = re.search(r'fn(\d+)', tag) + if m: + tag = 'p' + if m.group(1) in self.fn: + fnid = self.fn[m.group(1)] + else: + fnid = m.group(1) + atts = atts + ' id="fn%s"' % fnid + if atts.find('class=') < 0: + atts = atts + ' class="footnote"' + content = ('%s' % m.group(1)) + content + + if tag == 'bq': + cite = self.checkRefs(cite) + if cite: + cite = ' cite="%s"' % cite + else: + cite = '' + o1 = "\t\n" % (cite, atts) + o2 = "\t\t" % atts + c2 = "
</p>
    " + c1 = "\n\t" + + elif tag == 'bc': + o1 = "" % atts + o2 = "" % atts + c2 = "" + c1 = "" + content = self.shelve(self.encode_html(content.rstrip("\n") + "\n")) + + elif tag == 'notextile': + content = self.shelve(content) + o1 = o2 = '' + c1 = c2 = '' + + elif tag == 'pre': + content = self.shelve(self.encode_html(content.rstrip("\n") + "\n")) + o1 = "" % atts + o2 = c2 = '' + c1 = '' + + else: + o2 = "\t<%s%s>" % (tag, atts) + c2 = "" % tag + + content = self.graf(content) + return o1, o2, content, c2, c1 + + def footnoteRef(self, text): + """ + >>> t = Textile() + >>> t.footnoteRef('foo[1] ') # doctest: +ELLIPSIS + 'foo1 ' + """ + return re.sub(r'\b\[([0-9]+)\](\s)?', self.footnoteID, text) + + def footnoteID(self, match): + id, t = match.groups() + if id not in self.fn: + self.fn[id] = str(uuid.uuid4()) + fnid = self.fn[id] + if not t: + t = '' + return '%s%s' % (fnid, id, t) + + def glyphs(self, text): + """ + >>> t = Textile() + + >>> t.glyphs("apostrophe's") + 'apostrophe’s' + + >>> t.glyphs("back in '88") + 'back in ’88' + + >>> t.glyphs('foo ...') + 'foo …' + + >>> t.glyphs('--') + '—' + + >>> t.glyphs('FooBar[tm]') + 'FooBar™' + + >>> t.glyphs("

<p><cite>Cat's Cradle</cite> by Vonnegut</p>
    ") + '

<p><cite>Cat’s Cradle</cite> by Vonnegut</p>
    ' + + """ + # fix: hackish + text = re.sub(r'"\Z', '\" ', text) + + glyph_search = ( + re.compile(r"(\w)\'(\w)"), # apostrophe's + re.compile(r'(\s)\'(\d+\w?)\b(?!\')'), # back in '88 + re.compile(r'(\S)\'(?=\s|'+self.pnct+'|<|$)'), # single closing + re.compile(r'\'/'), # single opening + re.compile(r'(\S)\"(?=\s|'+self.pnct+'|<|$)'), # double closing + re.compile(r'"'), # double opening + re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), # 3+ uppercase acronym + re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), # 3+ uppercase + re.compile(r'\b(\s{0,1})?\.{3}'), # ellipsis + re.compile(r'(\s?)--(\s?)'), # em dash + re.compile(r'\s-(?:\s|$)'), # en dash + re.compile(r'(\d+)( ?)x( ?)(?=\d+)'), # dimension sign + re.compile(r'\b ?[([]TM[])]', re.I), # trademark + re.compile(r'\b ?[([]R[])]', re.I), # registered + re.compile(r'\b ?[([]C[])]', re.I), # copyright + ) + + glyph_replace = [x % dict(self.glyph_defaults) for x in ( + r'\1%(txt_apostrophe)s\2', # apostrophe's + r'\1%(txt_apostrophe)s\2', # back in '88 + r'\1%(txt_quote_single_close)s', # single closing + r'%(txt_quote_single_open)s', # single opening + r'\1%(txt_quote_double_close)s', # double closing + r'%(txt_quote_double_open)s', # double opening + r'\1', # 3+ uppercase acronym + r'\1', # 3+ uppercase + r'\1%(txt_ellipsis)s', # ellipsis + r'\1%(txt_emdash)s\2', # em dash + r' %(txt_endash)s ', # en dash + r'\1\2%(txt_dimension)s\3', # dimension sign + r'%(txt_trademark)s', # trademark + r'%(txt_registered)s', # registered + r'%(txt_copyright)s', # copyright + )] + + result = [] + for line in re.compile(r'(<.*?>)', re.U).split(text): + if not re.search(r'<.*>', line): + for s, r in zip(glyph_search, glyph_replace): + line = s.sub(r, line) + result.append(line) + return ''.join(result) + + def vAlign(self, input): + d = {'^':'top', '-':'middle', '~':'bottom'} + return d.get(input, '') + + def hAlign(self, input): + d = {'<':'left', '=':'center', '>':'right', '<>': 'justify'} + return d.get(input, '') + + def getRefs(self, text): + """ + what is this for? + """ + pattern = re.compile(r'(?:(?<=^)|(?<=\s))\[(.+)\]((?:http(?:s?):\/\/|\/)\S+)(?=\s|$)', re.U) + text = pattern.sub(self.refs, text) + return text + + def refs(self, match): + flag, url = match.groups() + self.urlrefs[flag] = url + return '' + + def checkRefs(self, url): + return self.urlrefs.get(url, url) + + def isRelURL(self, url): + """ + Identify relative urls. 
+ + >>> t = Textile() + >>> t.isRelURL("http://www.google.com/") + False + >>> t.isRelURL("/foo") + True + + """ + (scheme, netloc) = urlparse(url)[0:2] + return not scheme and not netloc + + def relURL(self, url): + scheme = urlparse(url)[0] + if self.restricted and scheme and scheme not in self.url_schemes: + return '#' + return url + + def shelve(self, text): + id = str(uuid.uuid4()) + self.shelf[id] = text + return id + + def retrieve(self, text): + """ + >>> t = Textile() + >>> id = t.shelve("foobar") + >>> t.retrieve(id) + 'foobar' + """ + while True: + old = text + for k, v in self.shelf.items(): + text = text.replace(k, v) + if text == old: + break + return text + + def encode_html(self, text, quotes=True): + a = ( + ('&', '&'), + ('<', '<'), + ('>', '>') + ) + + if quotes: + a = a + ( + ("'", '''), + ('"', '"') + ) + + for k, v in a: + text = text.replace(k, v) + return text + + def graf(self, text): + if not self.lite: + text = self.noTextile(text) + text = self.code(text) + + text = self.links(text) + + if not self.noimage: + text = self.image(text) + + if not self.lite: + text = self.lists(text) + text = self.table(text) + + text = self.span(text) + text = self.footnoteRef(text) + text = self.glyphs(text) + + return text.rstrip('\n') + + def links(self, text): + """ + >>> t = Textile() + >>> t.links('fooobar "Google":http://google.com/foobar/ and hello world "flickr":http://flickr.com/photos/jsamsa/ ') # doctest: +ELLIPSIS + 'fooobar ... and hello world ...' + """ + + punct = '!"#$%&\'*+,-./:;=?@\\^_`|~' + + pattern = r''' + (?P
<pre>    [\s\[{(]|[%s]   )?
+            "                          # start
+            (?P<atts>   %s       )
+            (?P<text>   [^"]+?   )
+            \s?
+            (?:   \(([^)]+?)\)(?=")   )?     # $title
+            ":
+            (?P<url>    (?:ftp|https?)? (?: :// )? [-A-Za-z0-9+&@#/?=~_()|!:,.;]*[-A-Za-z0-9+&@#/=~_()|]   )
+            (?P<post>   [^\w\/;]*?   )
    +            (?=<|\s|$)
    +        ''' % (re.escape(punct), self.c)
    +
    +        text = re.compile(pattern, re.X).sub(self.fLink, text)
    +
    +        return text
    +
    +    def fLink(self, match):
    +        pre, atts, text, title, url, post = match.groups()
    +
    +        if pre == None:
    +            pre = ''
    +            
    +        # assume ) at the end of the url is not actually part of the url
    +        # unless the url also contains a (
    +        if url.endswith(')') and not url.find('(') > -1:
    +            post = url[-1] + post
    +            url = url[:-1]
    +
    +        url = self.checkRefs(url)
    +
    +        atts = self.pba(atts)
    +        if title:
    +            atts = atts +  ' title="%s"' % self.encode_html(title)
    +
    +        if not self.noimage:
    +            text = self.image(text)
    +
    +        text = self.span(text)
    +        text = self.glyphs(text)
    +
    +        url = self.relURL(url)
+        out = '<a href="%s"%s%s>%s</a>' % (self.encode_html(url), atts, self.rel, text)
    +        out = self.shelve(out)
    +        return ''.join([pre, out, post])
    +
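A minimal standalone sketch (not part of the patch) of the trailing-parenthesis rule used by fLink above; the URL value here is just an example:

    # A ')' that ends the URL is treated as trailing punctuation unless the
    # URL itself also contains a '(' (mirrors the two lines in fLink above).
    url, post = 'http://example.com/foo)', ''
    if url.endswith(')') and not url.find('(') > -1:
        post = url[-1] + post
        url = url[:-1]
    print(url)   # http://example.com/foo
    print(post)  # )
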
    +    def span(self, text):
    +        """
    +        >>> t = Textile()
    +        >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye")
+        'hello <span class="bob">span <strong>strong</strong> and <b>bold</b></span> goodbye'
    +        """
    +        qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^')
    +        pnct = ".,\"'?!;:"
    +
    +        for qtag in qtags:
    +            pattern = re.compile(r"""
    +                (?:^|(?<=[\s>%(pnct)s])|([\]}]))
    +                (%(qtag)s)(?!%(qtag)s)
    +                (%(c)s)
    +                (?::(\S+))?
    +                ([^\s%(qtag)s]+|\S[^%(qtag)s\n]*[^\s%(qtag)s\n])
    +                ([%(pnct)s]*)
    +                %(qtag)s
    +                (?:$|([\]}])|(?=%(selfpnct)s{1,2}|\s))
    +            """ % {'qtag':qtag, 'c':self.c, 'pnct':pnct,
    +                   'selfpnct':self.pnct}, re.X)
    +            text = pattern.sub(self.fSpan, text)
    +        return text
    +
    +
    +    def fSpan(self, match):
    +        _, tag, atts, cite, content, end, _ = match.groups()
    +
    +        qtags = {
    +            '*': 'strong',
    +            '**': 'b',
    +            '??': 'cite',
    +            '_' : 'em',
    +            '__': 'i',
    +            '-' : 'del',
    +            '%' : 'span',
    +            '+' : 'ins',
    +            '~' : 'sub',
    +            '^' : 'sup'
    +        }
    +        tag = qtags[tag]
    +        atts = self.pba(atts)
    +        if cite:
+            atts = atts + ' cite="%s"' % cite
    +
    +        content = self.span(content)
    +
    +        out = "<%s%s>%s%s" % (tag, atts, content, end, tag)
    +        return out
    +
    +    def image(self, text):
    +        """
    +        >>> t = Textile()
    +        >>> t.image('!/imgs/myphoto.jpg!:http://jsamsa.com')
+        '<a href="http://jsamsa.com" class="img"><img src="/imgs/myphoto.jpg" alt="" /></a>'
    +        """
    +        pattern = re.compile(r"""
    +            (?:[\[{])?          # pre
    +            \!                 # opening !
    +            (%s)               # optional style,class atts
    +            (?:\. )?           # optional dot-space
    +            ([^\s(!]+)         # presume this is the src
    +            \s?                # optional space
    +            (?:\(([^\)]+)\))?  # optional title
    +            \!                 # closing
    +            (?::(\S+))?        # optional href
    +            (?:[\]}]|(?=\s|$)) # lookahead: space or end of string
    +        """ % self.c, re.U|re.X)
    +        return pattern.sub(self.fImage, text)
    +
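For reference, a doctest-style illustration (assumed, not from the original docstring) of the optional title group captured by the pattern above, based on how fImage below builds the tag:

    >>> t = Textile()
    >>> t.image('!/imgs/myphoto.jpg(My Photo)!')
    '<img src="/imgs/myphoto.jpg" title="My Photo" alt="My Photo" />'
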
    +    def fImage(self, match):
    +        # (None, '', '/imgs/myphoto.jpg', None, None)
    +        atts, url, title, href = match.groups()
    +        atts  = self.pba(atts)
    +
    +        if title:
    +            atts = atts + ' title="%s" alt="%s"' % (title, title)
    +        else:
    +            atts = atts + ' alt=""'
    +            
    +        if not self.isRelURL(url) and self.get_sizes:
    +            size = getimagesize(url)
    +            if (size):
    +                atts += " %s" % size
    +
    +        if href:
    +            href = self.checkRefs(href)
    +
    +        url = self.checkRefs(url)
    +        url = self.relURL(url)
    +
    +        out = []
    +        if href:
+            out.append('<a href="%s" class="img">' % href)
+        if self.html_type == 'html':
+            out.append('<img src="%s"%s>' % (url, atts))
+        else:
+            out.append('<img src="%s"%s />' % (url, atts))
+        if href:
+            out.append('</a>')
    +
    +        return ''.join(out)
    +
    +    def code(self, text):
+        text = self.doSpecial(text, '<code>', '</code>', self.fCode)
    +        text = self.doSpecial(text, '@', '@', self.fCode)
    +        text = self.doSpecial(text, '
<pre>', '</pre>', self.fPre) + return text + + def fCode(self, match): + before, text, after = match.groups() + if after == None: + after = '' + # text needs to be escaped + if not self.restricted: + text = self.encode_html(text) + return ''.join([before, self.shelve('<code>%s</code>' % text), after]) + + def fPre(self, match): + before, text, after = match.groups() + if after == None: + after = '' + # text needs to be escaped + if not self.restricted: + text = self.encode_html(text) + return ''.join([before, '<pre>', self.shelve(text), '</pre>
    ', after]) + + def doSpecial(self, text, start, end, method=None): + if method == None: + method = self.fSpecial + pattern = re.compile(r'(^|\s|[\[({>])%s(.*?)%s(\s|$|[\])}])?' % (re.escape(start), re.escape(end)), re.M|re.S) + return pattern.sub(method, text) + + def fSpecial(self, match): + """ + special blocks like notextile or code + """ + before, text, after = match.groups() + if after == None: + after = '' + return ''.join([before, self.shelve(self.encode_html(text)), after]) + + def noTextile(self, text): + text = self.doSpecial(text, '', '', self.fTextile) + return self.doSpecial(text, '==', '==', self.fTextile) + + def fTextile(self, match): + before, notextile, after = match.groups() + if after == None: + after = '' + return ''.join([before, self.shelve(notextile), after]) + + +def textile(text, head_offset=0, html_type='xhtml', encoding=None, output=None): + """ + this function takes additional parameters: + head_offset - offset to apply to heading levels (default: 0) + html_type - 'xhtml' or 'html' style tags (default: 'xhtml') + """ + return Textile().textile(text, head_offset=head_offset, + html_type=html_type) + +def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'): + """ + Restricted version of Textile designed for weblog comments and other + untrusted input. + + Raw HTML is escaped. + Style attributes are disabled. + rel='nofollow' is added to external links. + + When lite=True is set (the default): + Block tags are restricted to p, bq, and bc. + Lists and tables are disabled. + + When noimage=True is set (the default): + Image tags are disabled. + + """ + return Textile(restricted=True, lite=lite, + noimage=noimage).textile(text, rel='nofollow', + html_type=html_type) + diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index aaff8b55c0..73af3acde4 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - convert_heuristic, normalize_line_endings + convert_heuristic, normalize_line_endings, convert_textile from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -41,6 +41,7 @@ class TXTInput(InputFormatPlugin): 'paragraph and no styling is applied.\n' '* heuristic: Process using heuristics to determine formatting such ' 'as chapter headings and italic text.\n' + '* textile: Processing using textile formatting.\n' '* markdown: Processing using markdown formatting. ' 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, @@ -91,6 +92,13 @@ class TXTInput(InputFormatPlugin): except RuntimeError: raise ValueError('This txt file has malformed markup, it cannot be' ' converted by calibre. See http://daringfireball.net/projects/markdown/syntax') + elif options.formatting_type == 'textile': + log.debug('Running text though textile conversion...') + try: + html = convert_textile(txt) + except RuntimeError: + raise ValueError('This txt file has malformed markup, it cannot be' + ' converted by calibre.') else: # Determine the paragraph type of the document. 
if options.paragraph_type == 'auto': diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 6a1a106681..d0526bd9fc 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -8,6 +8,7 @@ import os, re from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.markdown import markdown +from calibre.ebooks.textile import textile from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor from calibre.ebooks.conversion.preprocess import DocAnalysis @@ -80,6 +81,10 @@ def convert_markdown(txt, title='', disable_toc=False): safe_mode=False) return HTML_TEMPLATE % (title, md.convert(txt)) +def convert_textile(txt, title=''): + html = textile(txt, encoding='utf-8') + return HTML_TEMPLATE % (title, html) + def normalize_line_endings(txt): txt = txt.replace('\r\n', '\n') txt = txt.replace('\r', '\n') @@ -176,5 +181,19 @@ def detect_formatting_type(txt): for c in md_escapted_characters: if txt.count('\\'+c) > 10: return 'markdown' + + # Check for textile + # Headings + if len(re.findall(r'h[1-6]\.', txt)) >= 5: + return 'textile' + # Block quote. + if len(re.findall(r'bq\.', txt)) >= 5: + return 'textile' + # Images + if len(re.findall(r'\![^\s]+(:[^\s]+)*', txt)) >= 5: + return 'textile' + # Links + if len(re.findall(r'"(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt)) >= 5: + return 'textile' return 'heuristic' From 9585ba655c810bb9132f3d6d7299455d23d47493 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 11 Jan 2011 18:08:55 -0500 Subject: [PATCH 2/3] TXT Input: remove unnecessary try block. Rework markdown and textile detection. --- src/calibre/ebooks/txt/input.py | 6 +---- src/calibre/ebooks/txt/processor.py | 41 +++++++++++++---------------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 73af3acde4..0b0bd6d570 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -94,11 +94,7 @@ class TXTInput(InputFormatPlugin): ' converted by calibre. See http://daringfireball.net/projects/markdown/syntax') elif options.formatting_type == 'textile': log.debug('Running text though textile conversion...') - try: - html = convert_textile(txt) - except RuntimeError: - raise ValueError('This txt file has malformed markup, it cannot be' - ' converted by calibre.') + html = convert_textile(txt) else: # Determine the paragraph type of the document. if options.paragraph_type == 'auto': diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index d0526bd9fc..d59fd4121a 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -162,38 +162,33 @@ def detect_paragraph_type(txt): def detect_formatting_type(txt): + markdown_count = 0 + textile_count = 0 + # Check for markdown # Headings - if len(re.findall('(?mu)^#+', txt)) >= 5: - return 'markdown' - if len(re.findall('(?mu)^=+$', txt)) >= 5: - return 'markdown' - if len(re.findall('(?mu)^-+$', txt)) >= 5: - return 'markdown' + markdown_count += len(re.findall('(?mu)^#+', txt)) + markdown_count += len(re.findall('(?mu)^=+$', txt)) + markdown_count += len(re.findall('(?mu)^-+$', txt)) # Images - if len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) >= 5: - return 'markdown' + markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) # Links - if len(re.findall('(?u)(^|(?P
<pre>[^!]))\[.*?\]\([^)]+\)', txt)) >= 5:
    -        return 'markdown'
    -    # Escaped characters
    -    md_escapted_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
    -    for c in md_escapted_characters:
    -        if txt.count('\\'+c) > 10:
    -            return 'markdown'
+    markdown_count += len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt))
             
         # Check for textile
         # Headings
    -    if len(re.findall(r'h[1-6]\.', txt)) >= 5:
    -        return 'textile'
    +    textile_count += len(re.findall(r'(?mu)^h[1-6]\.', txt))
         # Block quote.
    -    if len(re.findall(r'bq\.', txt)) >= 5:
    -        return 'textile'
    +    textile_count += len(re.findall(r'(?mu)^bq\.', txt))
         # Images
    -    if len(re.findall(r'\![^\s]+(:[^\s]+)*', txt)) >= 5:
    -        return 'textile'
    +    textile_count += len(re.findall(r'\![^\s]+(:[^\s]+)*', txt))
         # Links
    -    if len(re.findall(r'"(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt)) >= 5:
    -        return 'textile'
    +    textile_count += len(re.findall(r'"(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
    +    
    +    if markdown_count > 5 or textile_count > 5:
    +        if markdown_count > textile_count:
    +            return 'markdown'
    +        else:
    +            return 'textile'
         
         return 'heuristic'
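For reference, a self-contained sketch (not part of the patch) of how the reworked counting heuristic above behaves; it reuses the same regular expressions, with the link patterns omitted for brevity, and the sample text is only a quick sanity check:

    import re

    def detect_formatting_type(txt):
        markdown_count = 0
        textile_count = 0
        # Markdown markers: ATX/Setext headings and images.
        markdown_count += len(re.findall('(?mu)^#+', txt))
        markdown_count += len(re.findall('(?mu)^=+$', txt))
        markdown_count += len(re.findall('(?mu)^-+$', txt))
        markdown_count += len(re.findall(r'(?u)!\[.*?\]\(.+?\)', txt))
        # Textile markers: block signatures at the start of a line.
        textile_count += len(re.findall(r'(?mu)^h[1-6]\.', txt))
        textile_count += len(re.findall(r'(?mu)^bq\.', txt))
        # Whichever style clearly dominates wins; otherwise fall back to heuristics.
        if markdown_count > 5 or textile_count > 5:
            return 'markdown' if markdown_count > textile_count else 'textile'
        return 'heuristic'

    sample = '\n'.join('h%d. Chapter %d' % (i, i) for i in range(1, 7))
    print(detect_formatting_type(sample))  # prints: textile
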
    
    From f058f9adab9b1ae6bbc0673b1d8f5dce4550072f Mon Sep 17 00:00:00 2001
    From: John Schember 
    Date: Tue, 11 Jan 2011 18:56:23 -0500
    Subject: [PATCH 3/3] FB2 Output: Implement #8277, Write cover to coverpage tag
     within metadata.
    
    ---
     src/calibre/ebooks/fb2/fb2ml.py | 68 +++++++++++++++++++--------------
     1 file changed, 39 insertions(+), 29 deletions(-)
    
    diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
    index 4dd6e7c7ae..037a805e74 100644
    --- a/src/calibre/ebooks/fb2/fb2ml.py
    +++ b/src/calibre/ebooks/fb2/fb2ml.py
    @@ -102,6 +102,7 @@ class FB2MLizer(object):
             metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
             metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en'
             metadata['id'] = None
    +        metadata['cover'] = self.get_cover()
     
             author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
             if len(author_parts) == 1:
    @@ -124,7 +125,8 @@ class FB2MLizer(object):
                 metadata['id'] = str(uuid.uuid4()) 
     
             for key, value in metadata.items():
    -            metadata[key] = prepare_string_for_xml(value)
    +            if not key == 'cover':
    +                metadata[key] = prepare_string_for_xml(value)
     
             return u'' \
                     '' \
    @@ -136,6 +138,7 @@ class FB2MLizer(object):
                             '<last-name>%(author_last)s</last-name>' \
                         '</author>' \
                         '<book-title>%(title)s</book-title>' \
+                        '%(cover)s' \
                         '<lang>%(lang)s</lang>' \
                         '' \
                         '' \
    @@ -154,6 +157,41 @@ class FB2MLizer(object):
         def fb2_footer(self):
         return u'</FictionBook>'
     
    +    def get_cover(self):
    +        cover_href = None
    +        
    +        # Get the raster cover if it's available.
    +        if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
    +            id = unicode(self.oeb_book.metadata.cover[0])
    +            cover_item = self.oeb_book.manifest.ids[id]
    +            if cover_item.media_type in OEB_RASTER_IMAGES:
    +                cover_href = cover_item.href
    +        else:
    +            # Figure out if we have a title page or a cover page
    +            page_name = ''
    +            if 'titlepage' in self.oeb_book.guide:
    +                page_name = 'titlepage'
    +            elif 'cover' in self.oeb_book.guide:
    +                page_name = 'cover'
    +
    +            if page_name:
    +                cover_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[page_name].href]
    +                # Get the first image in the page
    +                for img in cover_item.xpath('//img'):
    +                    cover_href = cover_item.abshref(img.get('src'))
    +                    break
    +                
    +        if cover_href:
    +            # Only write the image tag if it is in the manifest.
    +            if cover_href in self.oeb_book.manifest.hrefs.keys():
    +                if cover_href not in self.image_hrefs.keys():
    +                    self.image_hrefs[cover_href] = '_%s.jpg' % len(self.image_hrefs.keys())
+            return u'<coverpage><image xlink:href="#%s" /></coverpage>' % self.image_hrefs[cover_href]
    +        
    +        return u'' 
    +
         def get_text(self):
         text = ['<body>']
             
    @@ -162,23 +200,6 @@ class FB2MLizer(object):
                text.append('<section>
    ') self.section_level += 1 - # Insert the title page / cover into the spine if it is not already referenced. - title_name = u'' - if 'titlepage' in self.oeb_book.guide: - title_name = 'titlepage' - elif 'cover' in self.oeb_book.guide: - title_name = 'cover' - if title_name: - title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href] - if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml': - self.oeb_book.spine.insert(0, title_item, True) - # Create xhtml page to reference cover image so it can be used. - if not title_name and self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids: - id = unicode(self.oeb_book.metadata.cover[0]) - cover_item = self.oeb_book.manifest.ids[id] - if cover_item.media_type in OEB_RASTER_IMAGES: - self.insert_image_cover(cover_item.href) - for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) @@ -203,17 +224,6 @@ class FB2MLizer(object): return ''.join(text) + '' - def insert_image_cover(self, image_href): - from calibre.ebooks.oeb.base import RECOVER_PARSER - try: - root = etree.fromstring(u'' % (XHTML_NS, image_href), parser=RECOVER_PARSER) - except: - root = etree.fromstring(u'', parser=RECOVER_PARSER) - - id, href = self.oeb_book.manifest.generate('fb2_cover', 'fb2_cover.xhtml') - item = self.oeb_book.manifest.add(id, href, guess_type(href)[0], data=root) - self.oeb_book.spine.insert(0, item, True) - def fb2mlize_images(self): ''' This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function.
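For context, a rough sketch of the element get_cover() above is meant to place into metadata['cover'] when a raster cover is found. This is illustrative only; the xlink prefix on the image reference is an assumption here, since the namespace declared by fb2_header is not shown in full in this diff:

    # Example values; the href key and the namespace prefix are assumptions.
    image_hrefs = {'cover.jpg': '_0.jpg'}
    cover_href = 'cover.jpg'
    print(u'<coverpage><image xlink:href="#%s" /></coverpage>' % image_hrefs[cover_href])
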