From 626f1b25584705f2f0409ab63f52cf04e3324ad6 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 11 Jan 2011 07:49:08 -0500 Subject: [PATCH 01/11] TXT Input: Textile support. --- src/calibre/ebooks/textile/__init__.py | 3 + src/calibre/ebooks/textile/functions.py | 981 ++++++++++++++++++++++++ src/calibre/ebooks/txt/input.py | 10 +- src/calibre/ebooks/txt/processor.py | 19 + 4 files changed, 1012 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/textile/__init__.py create mode 100644 src/calibre/ebooks/textile/functions.py diff --git a/src/calibre/ebooks/textile/__init__.py b/src/calibre/ebooks/textile/__init__.py new file mode 100644 index 0000000000..eeaeb33940 --- /dev/null +++ b/src/calibre/ebooks/textile/__init__.py @@ -0,0 +1,3 @@ +from functions import textile, textile_restricted, Textile + +__all__ = ['textile', 'textile_restricted'] diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py new file mode 100644 index 0000000000..ec70f591eb --- /dev/null +++ b/src/calibre/ebooks/textile/functions.py @@ -0,0 +1,981 @@ +#!/usr/bin/env python +""" +PyTextile + +A Humane Web Text Generator +""" + +__version__ = '2.1.4' + +__date__ = '2009/12/04' + +__copyright__ = """ +Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ +Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ +Copyright (c) 2003, Mark Pilgrim, http://diveintomark.org/ + +Original PHP Version: +Copyright (c) 2003-2004, Dean Allen +All rights reserved. + +Thanks to Carlo Zottmann for refactoring +Textile's procedural code into a class framework + +Additions and fixes Copyright (c) 2006 Alex Shiels http://thresholdstate.com/ + +""" + +__license__ = """ +L I C E N S E +============= +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name Textile nor the names of its contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +""" + +import re +import uuid +from urlparse import urlparse + +def _normalize_newlines(string): + out = re.sub(r'\r\n', '\n', string) + out = re.sub(r'\n{3,}', '\n\n', out) + out = re.sub(r'\n\s*\n', '\n\n', out) + out = re.sub(r'"$', '" ', out) + return out + +def getimagesize(url): + """ + Attempts to determine an image's width and height, and returns a string + suitable for use in an tag, or None in case of failure. + Requires that PIL is installed. + + >>> getimagesize("http://www.google.com/intl/en_ALL/images/logo.gif") + ... #doctest: +ELLIPSIS, +SKIP + 'width="..." height="..."' + + """ + + try: + import ImageFile + import urllib2 + except ImportError: + return None + + try: + p = ImageFile.Parser() + f = urllib2.urlopen(url) + while True: + s = f.read(1024) + if not s: + break + p.feed(s) + if p.image: + return 'width="%i" height="%i"' % p.image.size + except (IOError, ValueError): + return None + +class Textile(object): + hlgn = r'(?:\<(?!>)|(?|\<\>|\=|[()]+(?! ))' + vlgn = r'[\-^~]' + clas = r'(?:\([^)]+\))' + lnge = r'(?:\[[^\]]+\])' + styl = r'(?:\{[^}]+\})' + cspn = r'(?:\\\d+)' + rspn = r'(?:\/\d+)' + a = r'(?:%s|%s)*' % (hlgn, vlgn) + s = r'(?:%s|%s)*' % (cspn, rspn) + c = r'(?:%s)*' % '|'.join([clas, styl, lnge, hlgn]) + + pnct = r'[-!"#$%&()*+,/:;<=>?@\'\[\\\]\.^_`{|}~]' + # urlch = r'[\w"$\-_.+!*\'(),";/?:@=&%#{}|\\^~\[\]`]' + urlch = '[\w"$\-_.+*\'(),";\/?:@=&%#{}|\\^~\[\]`]' + + url_schemes = ('http', 'https', 'ftp', 'mailto') + + btag = ('bq', 'bc', 'notextile', 'pre', 'h[1-6]', 'fn\d+', 'p') + btag_lite = ('bq', 'bc', 'p') + + glyph_defaults = ( + ('txt_quote_single_open', '‘'), + ('txt_quote_single_close', '’'), + ('txt_quote_double_open', '“'), + ('txt_quote_double_close', '”'), + ('txt_apostrophe', '’'), + ('txt_prime', '′'), + ('txt_prime_double', '″'), + ('txt_ellipsis', '…'), + ('txt_emdash', '—'), + ('txt_endash', '–'), + ('txt_dimension', '×'), + ('txt_trademark', '™'), + ('txt_registered', '®'), + ('txt_copyright', '©'), + ) + + def __init__(self, restricted=False, lite=False, noimage=False): + """docstring for __init__""" + self.restricted = restricted + self.lite = lite + self.noimage = noimage + self.get_sizes = False + self.fn = {} + self.urlrefs = {} + self.shelf = {} + self.rel = '' + self.html_type = 'xhtml' + + def textile(self, text, rel=None, head_offset=0, html_type='xhtml'): + """ + >>> import textile + >>> textile.textile('some textile') + u'\\t

some textile

' + """ + self.html_type = html_type + + # text = unicode(text) + text = _normalize_newlines(text) + + if self.restricted: + text = self.encode_html(text, quotes=False) + + if rel: + self.rel = ' rel="%s"' % rel + + text = self.getRefs(text) + + text = self.block(text, int(head_offset)) + + text = self.retrieve(text) + + return text + + def pba(self, input, element=None): + """ + Parse block attributes. + + >>> t = Textile() + >>> t.pba(r'\3') + '' + >>> t.pba(r'\\3', element='td') + ' colspan="3"' + >>> t.pba(r'/4', element='td') + ' rowspan="4"' + >>> t.pba(r'\\3/4', element='td') + ' colspan="3" rowspan="4"' + + >>> t.vAlign('^') + 'top' + + >>> t.pba('^', element='td') + ' style="vertical-align:top;"' + + >>> t.pba('{line-height:18px}') + ' style="line-height:18px;"' + + >>> t.pba('(foo-bar)') + ' class="foo-bar"' + + >>> t.pba('(#myid)') + ' id="myid"' + + >>> t.pba('(foo-bar#myid)') + ' class="foo-bar" id="myid"' + + >>> t.pba('((((') + ' style="padding-left:4em;"' + + >>> t.pba(')))') + ' style="padding-right:3em;"' + + >>> t.pba('[fr]') + ' lang="fr"' + + """ + style = [] + aclass = '' + lang = '' + colspan = '' + rowspan = '' + id = '' + + if not input: + return '' + + matched = input + if element == 'td': + m = re.search(r'\\(\d+)', matched) + if m: + colspan = m.group(1) + + m = re.search(r'/(\d+)', matched) + if m: + rowspan = m.group(1) + + if element == 'td' or element == 'tr': + m = re.search(r'(%s)' % self.vlgn, matched) + if m: + style.append("vertical-align:%s;" % self.vAlign(m.group(1))) + + m = re.search(r'\{([^}]*)\}', matched) + if m: + style.append(m.group(1).rstrip(';') + ';') + matched = matched.replace(m.group(0), '') + + m = re.search(r'\[([^\]]+)\]', matched, re.U) + if m: + lang = m.group(1) + matched = matched.replace(m.group(0), '') + + m = re.search(r'\(([^()]+)\)', matched, re.U) + if m: + aclass = m.group(1) + matched = matched.replace(m.group(0), '') + + m = re.search(r'([(]+)', matched) + if m: + style.append("padding-left:%sem;" % len(m.group(1))) + matched = matched.replace(m.group(0), '') + + m = re.search(r'([)]+)', matched) + if m: + style.append("padding-right:%sem;" % len(m.group(1))) + matched = matched.replace(m.group(0), '') + + m = re.search(r'(%s)' % self.hlgn, matched) + if m: + style.append("text-align:%s;" % self.hAlign(m.group(1))) + + m = re.search(r'^(.*)#(.*)$', aclass) + if m: + id = m.group(2) + aclass = m.group(1) + + if self.restricted: + if lang: + return ' lang="%s"' + else: + return '' + + result = [] + if style: + result.append(' style="%s"' % "".join(style)) + if aclass: + result.append(' class="%s"' % aclass) + if lang: + result.append(' lang="%s"' % lang) + if id: + result.append(' id="%s"' % id) + if colspan: + result.append(' colspan="%s"' % colspan) + if rowspan: + result.append(' rowspan="%s"' % rowspan) + return ''.join(result) + + def hasRawText(self, text): + """ + checks whether the text has text not already enclosed by a block tag + + >>> t = Textile() + >>> t.hasRawText('

foo bar biz baz

') + False + + >>> t.hasRawText(' why yes, yes it does') + True + + """ + r = re.compile(r'<(p|blockquote|div|form|table|ul|ol|pre|h\d)[^>]*?>.*', re.S).sub('', text.strip()).strip() + r = re.compile(r'<(hr|br)[^>]*?/>').sub('', r) + return '' != r + + def table(self, text): + r""" + >>> t = Textile() + >>> t.table('|one|two|three|\n|a|b|c|') + '\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t
onetwothree
abc
\n\n' + """ + text = text + "\n\n" + pattern = re.compile(r'^(?:table(_?%(s)s%(a)s%(c)s)\. ?\n)?^(%(a)s%(c)s\.? ?\|.*\|)\n\n' % {'s':self.s, 'a':self.a, 'c':self.c}, re.S|re.M|re.U) + return pattern.sub(self.fTable, text) + + def fTable(self, match): + tatts = self.pba(match.group(1), 'table') + rows = [] + for row in [ x for x in match.group(2).split('\n') if x]: + rmtch = re.search(r'^(%s%s\. )(.*)' % (self.a, self.c), row.lstrip()) + if rmtch: + ratts = self.pba(rmtch.group(1), 'tr') + row = rmtch.group(2) + else: + ratts = '' + + cells = [] + for cell in row.split('|')[1:-1]: + ctyp = 'd' + if re.search(r'^_', cell): + ctyp = "h" + cmtch = re.search(r'^(_?%s%s%s\. )(.*)' % (self.s, self.a, self.c), cell) + if cmtch: + catts = self.pba(cmtch.group(1), 'td') + cell = cmtch.group(2) + else: + catts = '' + + cell = self.graf(self.span(cell)) + cells.append('\t\t\t%s' % (ctyp, catts, cell, ctyp)) + rows.append("\t\t\n%s\n\t\t" % (ratts, '\n'.join(cells))) + cells = [] + catts = None + return "\t\n%s\n\t\n\n" % (tatts, '\n'.join(rows)) + + def lists(self, text): + """ + >>> t = Textile() + >>> t.lists("* one\\n* two\\n* three") + '\\t
    \\n\\t\\t
  • one
  • \\n\\t\\t
  • two
  • \\n\\t\\t
  • three
  • \\n\\t
' + """ + pattern = re.compile(r'^([#*]+%s .*)$(?![^#*])' % self.c, re.U|re.M|re.S) + return pattern.sub(self.fList, text) + + def fList(self, match): + text = match.group(0).split("\n") + result = [] + lists = [] + for i, line in enumerate(text): + try: + nextline = text[i+1] + except IndexError: + nextline = '' + + m = re.search(r"^([#*]+)(%s%s) (.*)$" % (self.a, self.c), line, re.S) + if m: + tl, atts, content = m.groups() + nl = '' + nm = re.search(r'^([#*]+)\s.*', nextline) + if nm: + nl = nm.group(1) + if tl not in lists: + lists.append(tl) + atts = self.pba(atts) + line = "\t<%sl%s>\n\t\t
  • %s" % (self.lT(tl), atts, self.graf(content)) + else: + line = "\t\t
  • " + self.graf(content) + + if len(nl) <= len(tl): + line = line + "
  • " + for k in reversed(lists): + if len(k) > len(nl): + line = line + "\n\t" % self.lT(k) + if len(k) > 1: + line = line + "" + lists.remove(k) + + result.append(line) + return "\n".join(result) + + def lT(self, input): + if re.search(r'^#+', input): + return 'o' + else: + return 'u' + + def doPBr(self, in_): + return re.compile(r'<(p)([^>]*?)>(.*)()', re.S).sub(self.doBr, in_) + + def doBr(self, match): + if self.html_type == 'html': + content = re.sub(r'(.+)(?:(?)|(?))\n(?![#*\s|])', '\\1
    ', match.group(3)) + else: + content = re.sub(r'(.+)(?:(?)|(?))\n(?![#*\s|])', '\\1
    ', match.group(3)) + return '<%s%s>%s%s' % (match.group(1), match.group(2), content, match.group(4)) + + def block(self, text, head_offset = 0): + """ + >>> t = Textile() + >>> t.block('h1. foobar baby') + '\\t

    foobar baby

    ' + """ + if not self.lite: + tre = '|'.join(self.btag) + else: + tre = '|'.join(self.btag_lite) + text = text.split('\n\n') + + tag = 'p' + atts = cite = graf = ext = '' + + out = [] + + anon = False + for line in text: + pattern = r'^(%s)(%s%s)\.(\.?)(?::(\S+))? (.*)$' % (tre, self.a, self.c) + match = re.search(pattern, line, re.S) + if match: + if ext: + out.append(out.pop() + c1) + + tag, atts, ext, cite, graf = match.groups() + h_match = re.search(r'h([1-6])', tag) + if h_match: + head_level, = h_match.groups() + tag = 'h%i' % max(1, + min(int(head_level) + head_offset, + 6)) + o1, o2, content, c2, c1 = self.fBlock(tag, atts, ext, + cite, graf) + # leave off c1 if this block is extended, + # we'll close it at the start of the next block + + if ext: + line = "%s%s%s%s" % (o1, o2, content, c2) + else: + line = "%s%s%s%s%s" % (o1, o2, content, c2, c1) + + else: + anon = True + if ext or not re.search(r'^\s', line): + o1, o2, content, c2, c1 = self.fBlock(tag, atts, ext, + cite, line) + # skip $o1/$c1 because this is part of a continuing + # extended block + if tag == 'p' and not self.hasRawText(content): + line = content + else: + line = "%s%s%s" % (o2, content, c2) + else: + line = self.graf(line) + + line = self.doPBr(line) + if self.html_type == 'xhtml': + line = re.sub(r'
    ', '
    ', line) + + if ext and anon: + out.append(out.pop() + "\n" + line) + else: + out.append(line) + + if not ext: + tag = 'p' + atts = '' + cite = '' + graf = '' + + if ext: + out.append(out.pop() + c1) + return '\n\n'.join(out) + + def fBlock(self, tag, atts, ext, cite, content): + """ + >>> t = Textile() + >>> t.fBlock("bq", "", None, "", "Hello BlockQuote") + ('\\t
    \\n', '\\t\\t

    ', 'Hello BlockQuote', '

    ', '\\n\\t
    ') + + >>> t.fBlock("bq", "", None, "http://google.com", "Hello BlockQuote") + ('\\t
    \\n', '\\t\\t

    ', 'Hello BlockQuote', '

    ', '\\n\\t
    ') + + >>> t.fBlock("bc", "", None, "", 'printf "Hello, World";') # doctest: +ELLIPSIS + ('
    ', '', ..., '', '
    ') + + >>> t.fBlock("h1", "", None, "", "foobar") + ('', '\\t

    ', 'foobar', '

    ', '') + """ + atts = self.pba(atts) + o1 = o2 = c2 = c1 = '' + + m = re.search(r'fn(\d+)', tag) + if m: + tag = 'p' + if m.group(1) in self.fn: + fnid = self.fn[m.group(1)] + else: + fnid = m.group(1) + atts = atts + ' id="fn%s"' % fnid + if atts.find('class=') < 0: + atts = atts + ' class="footnote"' + content = ('%s' % m.group(1)) + content + + if tag == 'bq': + cite = self.checkRefs(cite) + if cite: + cite = ' cite="%s"' % cite + else: + cite = '' + o1 = "\t\n" % (cite, atts) + o2 = "\t\t" % atts + c2 = "

    " + c1 = "\n\t" + + elif tag == 'bc': + o1 = "" % atts + o2 = "" % atts + c2 = "" + c1 = "" + content = self.shelve(self.encode_html(content.rstrip("\n") + "\n")) + + elif tag == 'notextile': + content = self.shelve(content) + o1 = o2 = '' + c1 = c2 = '' + + elif tag == 'pre': + content = self.shelve(self.encode_html(content.rstrip("\n") + "\n")) + o1 = "" % atts + o2 = c2 = '' + c1 = '' + + else: + o2 = "\t<%s%s>" % (tag, atts) + c2 = "" % tag + + content = self.graf(content) + return o1, o2, content, c2, c1 + + def footnoteRef(self, text): + """ + >>> t = Textile() + >>> t.footnoteRef('foo[1] ') # doctest: +ELLIPSIS + 'foo1 ' + """ + return re.sub(r'\b\[([0-9]+)\](\s)?', self.footnoteID, text) + + def footnoteID(self, match): + id, t = match.groups() + if id not in self.fn: + self.fn[id] = str(uuid.uuid4()) + fnid = self.fn[id] + if not t: + t = '' + return '%s%s' % (fnid, id, t) + + def glyphs(self, text): + """ + >>> t = Textile() + + >>> t.glyphs("apostrophe's") + 'apostrophe’s' + + >>> t.glyphs("back in '88") + 'back in ’88' + + >>> t.glyphs('foo ...') + 'foo …' + + >>> t.glyphs('--') + '—' + + >>> t.glyphs('FooBar[tm]') + 'FooBar™' + + >>> t.glyphs("

    Cat's Cradle by Vonnegut

    ") + '

    Cat’s Cradle by Vonnegut

    ' + + """ + # fix: hackish + text = re.sub(r'"\Z', '\" ', text) + + glyph_search = ( + re.compile(r"(\w)\'(\w)"), # apostrophe's + re.compile(r'(\s)\'(\d+\w?)\b(?!\')'), # back in '88 + re.compile(r'(\S)\'(?=\s|'+self.pnct+'|<|$)'), # single closing + re.compile(r'\'/'), # single opening + re.compile(r'(\S)\"(?=\s|'+self.pnct+'|<|$)'), # double closing + re.compile(r'"'), # double opening + re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), # 3+ uppercase acronym + re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), # 3+ uppercase + re.compile(r'\b(\s{0,1})?\.{3}'), # ellipsis + re.compile(r'(\s?)--(\s?)'), # em dash + re.compile(r'\s-(?:\s|$)'), # en dash + re.compile(r'(\d+)( ?)x( ?)(?=\d+)'), # dimension sign + re.compile(r'\b ?[([]TM[])]', re.I), # trademark + re.compile(r'\b ?[([]R[])]', re.I), # registered + re.compile(r'\b ?[([]C[])]', re.I), # copyright + ) + + glyph_replace = [x % dict(self.glyph_defaults) for x in ( + r'\1%(txt_apostrophe)s\2', # apostrophe's + r'\1%(txt_apostrophe)s\2', # back in '88 + r'\1%(txt_quote_single_close)s', # single closing + r'%(txt_quote_single_open)s', # single opening + r'\1%(txt_quote_double_close)s', # double closing + r'%(txt_quote_double_open)s', # double opening + r'\1', # 3+ uppercase acronym + r'\1', # 3+ uppercase + r'\1%(txt_ellipsis)s', # ellipsis + r'\1%(txt_emdash)s\2', # em dash + r' %(txt_endash)s ', # en dash + r'\1\2%(txt_dimension)s\3', # dimension sign + r'%(txt_trademark)s', # trademark + r'%(txt_registered)s', # registered + r'%(txt_copyright)s', # copyright + )] + + result = [] + for line in re.compile(r'(<.*?>)', re.U).split(text): + if not re.search(r'<.*>', line): + for s, r in zip(glyph_search, glyph_replace): + line = s.sub(r, line) + result.append(line) + return ''.join(result) + + def vAlign(self, input): + d = {'^':'top', '-':'middle', '~':'bottom'} + return d.get(input, '') + + def hAlign(self, input): + d = {'<':'left', '=':'center', '>':'right', '<>': 'justify'} + return d.get(input, '') + + def getRefs(self, text): + """ + what is this for? + """ + pattern = re.compile(r'(?:(?<=^)|(?<=\s))\[(.+)\]((?:http(?:s?):\/\/|\/)\S+)(?=\s|$)', re.U) + text = pattern.sub(self.refs, text) + return text + + def refs(self, match): + flag, url = match.groups() + self.urlrefs[flag] = url + return '' + + def checkRefs(self, url): + return self.urlrefs.get(url, url) + + def isRelURL(self, url): + """ + Identify relative urls. + + >>> t = Textile() + >>> t.isRelURL("http://www.google.com/") + False + >>> t.isRelURL("/foo") + True + + """ + (scheme, netloc) = urlparse(url)[0:2] + return not scheme and not netloc + + def relURL(self, url): + scheme = urlparse(url)[0] + if self.restricted and scheme and scheme not in self.url_schemes: + return '#' + return url + + def shelve(self, text): + id = str(uuid.uuid4()) + self.shelf[id] = text + return id + + def retrieve(self, text): + """ + >>> t = Textile() + >>> id = t.shelve("foobar") + >>> t.retrieve(id) + 'foobar' + """ + while True: + old = text + for k, v in self.shelf.items(): + text = text.replace(k, v) + if text == old: + break + return text + + def encode_html(self, text, quotes=True): + a = ( + ('&', '&'), + ('<', '<'), + ('>', '>') + ) + + if quotes: + a = a + ( + ("'", '''), + ('"', '"') + ) + + for k, v in a: + text = text.replace(k, v) + return text + + def graf(self, text): + if not self.lite: + text = self.noTextile(text) + text = self.code(text) + + text = self.links(text) + + if not self.noimage: + text = self.image(text) + + if not self.lite: + text = self.lists(text) + text = self.table(text) + + text = self.span(text) + text = self.footnoteRef(text) + text = self.glyphs(text) + + return text.rstrip('\n') + + def links(self, text): + """ + >>> t = Textile() + >>> t.links('fooobar "Google":http://google.com/foobar/ and hello world "flickr":http://flickr.com/photos/jsamsa/ ') # doctest: +ELLIPSIS + 'fooobar ... and hello world ...' + """ + + punct = '!"#$%&\'*+,-./:;=?@\\^_`|~' + + pattern = r''' + (?P
        [\s\[{(]|[%s]   )?
    +            "                          # start
    +            (?P   %s       )
    +            (?P   [^"]+?   )
    +            \s?
    +            (?:   \(([^)]+?)\)(?=")   )?     # $title
    +            ":
    +            (?P    (?:ftp|https?)? (?: :// )? [-A-Za-z0-9+&@#/?=~_()|!:,.;]*[-A-Za-z0-9+&@#/=~_()|]   )
    +            (?P   [^\w\/;]*?   )
    +            (?=<|\s|$)
    +        ''' % (re.escape(punct), self.c)
    +
    +        text = re.compile(pattern, re.X).sub(self.fLink, text)
    +
    +        return text
    +
    +    def fLink(self, match):
    +        pre, atts, text, title, url, post = match.groups()
    +
    +        if pre == None:
    +            pre = ''
    +            
    +        # assume ) at the end of the url is not actually part of the url
    +        # unless the url also contains a (
    +        if url.endswith(')') and not url.find('(') > -1:
    +            post = url[-1] + post
    +            url = url[:-1]
    +
    +        url = self.checkRefs(url)
    +
    +        atts = self.pba(atts)
    +        if title:
    +            atts = atts +  ' title="%s"' % self.encode_html(title)
    +
    +        if not self.noimage:
    +            text = self.image(text)
    +
    +        text = self.span(text)
    +        text = self.glyphs(text)
    +
    +        url = self.relURL(url)
    +        out = '%s' % (self.encode_html(url), atts, self.rel, text)
    +        out = self.shelve(out)
    +        return ''.join([pre, out, post])
    +
    +    def span(self, text):
    +        """
    +        >>> t = Textile()
    +        >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye")
    +        'hello span strong and bold goodbye'
    +        """
    +        qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^')
    +        pnct = ".,\"'?!;:"
    +
    +        for qtag in qtags:
    +            pattern = re.compile(r"""
    +                (?:^|(?<=[\s>%(pnct)s])|([\]}]))
    +                (%(qtag)s)(?!%(qtag)s)
    +                (%(c)s)
    +                (?::(\S+))?
    +                ([^\s%(qtag)s]+|\S[^%(qtag)s\n]*[^\s%(qtag)s\n])
    +                ([%(pnct)s]*)
    +                %(qtag)s
    +                (?:$|([\]}])|(?=%(selfpnct)s{1,2}|\s))
    +            """ % {'qtag':qtag, 'c':self.c, 'pnct':pnct,
    +                   'selfpnct':self.pnct}, re.X)
    +            text = pattern.sub(self.fSpan, text)
    +        return text
    +
    +
    +    def fSpan(self, match):
    +        _, tag, atts, cite, content, end, _ = match.groups()
    +
    +        qtags = {
    +            '*': 'strong',
    +            '**': 'b',
    +            '??': 'cite',
    +            '_' : 'em',
    +            '__': 'i',
    +            '-' : 'del',
    +            '%' : 'span',
    +            '+' : 'ins',
    +            '~' : 'sub',
    +            '^' : 'sup'
    +        }
    +        tag = qtags[tag]
    +        atts = self.pba(atts)
    +        if cite:
    +            atts = atts + 'cite="%s"' % cite
    +
    +        content = self.span(content)
    +
    +        out = "<%s%s>%s%s" % (tag, atts, content, end, tag)
    +        return out
    +
    +    def image(self, text):
    +        """
    +        >>> t = Textile()
    +        >>> t.image('!/imgs/myphoto.jpg!:http://jsamsa.com')
    +        ''
    +        """
    +        pattern = re.compile(r"""
    +            (?:[\[{])?          # pre
    +            \!                 # opening !
    +            (%s)               # optional style,class atts
    +            (?:\. )?           # optional dot-space
    +            ([^\s(!]+)         # presume this is the src
    +            \s?                # optional space
    +            (?:\(([^\)]+)\))?  # optional title
    +            \!                 # closing
    +            (?::(\S+))?        # optional href
    +            (?:[\]}]|(?=\s|$)) # lookahead: space or end of string
    +        """ % self.c, re.U|re.X)
    +        return pattern.sub(self.fImage, text)
    +
    +    def fImage(self, match):
    +        # (None, '', '/imgs/myphoto.jpg', None, None)
    +        atts, url, title, href = match.groups()
    +        atts  = self.pba(atts)
    +
    +        if title:
    +            atts = atts + ' title="%s" alt="%s"' % (title, title)
    +        else:
    +            atts = atts + ' alt=""'
    +            
    +        if not self.isRelURL(url) and self.get_sizes:
    +            size = getimagesize(url)
    +            if (size):
    +                atts += " %s" % size
    +
    +        if href:
    +            href = self.checkRefs(href)
    +
    +        url = self.checkRefs(url)
    +        url = self.relURL(url)
    +
    +        out = []
    +        if href:
    +            out.append('' % href)
    +        if self.html_type == 'html':
    +            out.append('' % (url, atts))
    +        else:
    +            out.append('' % (url, atts))
    +        if href: 
    +            out.append('')
    +
    +        return ''.join(out)
    +
    +    def code(self, text):
    +        text = self.doSpecial(text, '', '', self.fCode)
    +        text = self.doSpecial(text, '@', '@', self.fCode)
    +        text = self.doSpecial(text, '
    ', '
    ', self.fPre) + return text + + def fCode(self, match): + before, text, after = match.groups() + if after == None: + after = '' + # text needs to be escaped + if not self.restricted: + text = self.encode_html(text) + return ''.join([before, self.shelve('%s' % text), after]) + + def fPre(self, match): + before, text, after = match.groups() + if after == None: + after = '' + # text needs to be escapedd + if not self.restricted: + text = self.encode_html(text) + return ''.join([before, '
    ', self.shelve(text), '
    ', after]) + + def doSpecial(self, text, start, end, method=None): + if method == None: + method = self.fSpecial + pattern = re.compile(r'(^|\s|[\[({>])%s(.*?)%s(\s|$|[\])}])?' % (re.escape(start), re.escape(end)), re.M|re.S) + return pattern.sub(method, text) + + def fSpecial(self, match): + """ + special blocks like notextile or code + """ + before, text, after = match.groups() + if after == None: + after = '' + return ''.join([before, self.shelve(self.encode_html(text)), after]) + + def noTextile(self, text): + text = self.doSpecial(text, '', '', self.fTextile) + return self.doSpecial(text, '==', '==', self.fTextile) + + def fTextile(self, match): + before, notextile, after = match.groups() + if after == None: + after = '' + return ''.join([before, self.shelve(notextile), after]) + + +def textile(text, head_offset=0, html_type='xhtml', encoding=None, output=None): + """ + this function takes additional parameters: + head_offset - offset to apply to heading levels (default: 0) + html_type - 'xhtml' or 'html' style tags (default: 'xhtml') + """ + return Textile().textile(text, head_offset=head_offset, + html_type=html_type) + +def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'): + """ + Restricted version of Textile designed for weblog comments and other + untrusted input. + + Raw HTML is escaped. + Style attributes are disabled. + rel='nofollow' is added to external links. + + When lite=True is set (the default): + Block tags are restricted to p, bq, and bc. + Lists and tables are disabled. + + When noimage=True is set (the default): + Image tags are disabled. + + """ + return Textile(restricted=True, lite=lite, + noimage=noimage).textile(text, rel='nofollow', + html_type=html_type) + diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index aaff8b55c0..73af3acde4 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - convert_heuristic, normalize_line_endings + convert_heuristic, normalize_line_endings, convert_textile from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -41,6 +41,7 @@ class TXTInput(InputFormatPlugin): 'paragraph and no styling is applied.\n' '* heuristic: Process using heuristics to determine formatting such ' 'as chapter headings and italic text.\n' + '* textile: Processing using textile formatting.\n' '* markdown: Processing using markdown formatting. ' 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, @@ -91,6 +92,13 @@ class TXTInput(InputFormatPlugin): except RuntimeError: raise ValueError('This txt file has malformed markup, it cannot be' ' converted by calibre. See http://daringfireball.net/projects/markdown/syntax') + elif options.formatting_type == 'textile': + log.debug('Running text though textile conversion...') + try: + html = convert_textile(txt) + except RuntimeError: + raise ValueError('This txt file has malformed markup, it cannot be' + ' converted by calibre.') else: # Determine the paragraph type of the document. if options.paragraph_type == 'auto': diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 6a1a106681..d0526bd9fc 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -8,6 +8,7 @@ import os, re from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.markdown import markdown +from calibre.ebooks.textile import textile from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor from calibre.ebooks.conversion.preprocess import DocAnalysis @@ -80,6 +81,10 @@ def convert_markdown(txt, title='', disable_toc=False): safe_mode=False) return HTML_TEMPLATE % (title, md.convert(txt)) +def convert_textile(txt, title=''): + html = textile(txt, encoding='utf-8') + return HTML_TEMPLATE % (title, html) + def normalize_line_endings(txt): txt = txt.replace('\r\n', '\n') txt = txt.replace('\r', '\n') @@ -176,5 +181,19 @@ def detect_formatting_type(txt): for c in md_escapted_characters: if txt.count('\\'+c) > 10: return 'markdown' + + # Check for textile + # Headings + if len(re.findall(r'h[1-6]\.', txt)) >= 5: + return 'textile' + # Block quote. + if len(re.findall(r'bq\.', txt)) >= 5: + return 'textile' + # Images + if len(re.findall(r'\![^\s]+(:[^\s]+)*', txt)) >= 5: + return 'textile' + # Links + if len(re.findall(r'"(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt)) >= 5: + return 'textile' return 'heuristic' From 9eb8b031d97e6432683952c3e7bdcdd2a8117b97 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 11 Jan 2011 11:14:19 -0700 Subject: [PATCH 02/11] Code to put downloaded news into the magazines category on the nook color. Commented out, pending testing --- src/calibre/devices/nook/driver.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/calibre/devices/nook/driver.py b/src/calibre/devices/nook/driver.py index 987b90c748..ca05885645 100644 --- a/src/calibre/devices/nook/driver.py +++ b/src/calibre/devices/nook/driver.py @@ -91,3 +91,19 @@ class NOOK_COLOR(NOOK): EBOOK_DIR_MAIN = 'My Files/Books' + ''' + def create_upload_path(self, path, mdata, fname, create_dirs=True): + filepath = NOOK.create_upload_path(self, path, mdata, fname, + create_dirs=create_dirs) + edm = self.EBOOK_DIR_MAIN.replace('/', os.sep) + npath = os.path.join(edm, _('News')) + os.sep + if npath in filepath: + filepath = filepath.replace(npath, os.sep.join('My Files', + 'Magazines')+os.sep) + filedir = os.path.dirname(filepath) + if create_dirs and not os.path.exists(filedir): + os.makedirs(filedir) + + return filepath + ''' + From b61fdf0eac17b578c73ce68c344ec8efe952239a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 11 Jan 2011 13:15:29 -0700 Subject: [PATCH 03/11] Updated NYTimes --- resources/recipes/nytimes.recipe | 25 +++++++++++++++++++++++++ resources/recipes/nytimes_sub.recipe | 23 +++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe index eaa428e731..6f80f4f85f 100644 --- a/resources/recipes/nytimes.recipe +++ b/resources/recipes/nytimes.recipe @@ -685,3 +685,28 @@ class NYTimes(BasicNewsRecipe): divTag.replaceWith(tag) return soup + + def populate_article_metadata(self, article, soup, first): + shortparagraph = "" + try: + if len(article.text_summary.strip()) == 0: + articlebodies = soup.findAll('div',attrs={'class':'articleBody'}) + if articlebodies: + for articlebody in articlebodies: + if articlebody: + paras = articlebody.findAll('p') + for p in paras: + refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip() + #account for blank paragraphs and short paragraphs by appending them to longer ones + if len(refparagraph) > 0: + if len(refparagraph) > 70: #approximately one line of text + article.summary = article.text_summary = shortparagraph + refparagraph + return + else: + shortparagraph = refparagraph + " " + if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"): + shortparagraph = shortparagraph + "- " + except: + self.log("Error creating article descriptions") + return + diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index e56fd9cdec..8ac7c735f7 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -685,4 +685,27 @@ class NYTimes(BasicNewsRecipe): divTag.replaceWith(tag) return soup + def populate_article_metadata(self, article, soup, first): + shortparagraph = "" + try: + if len(article.text_summary.strip()) == 0: + articlebodies = soup.findAll('div',attrs={'class':'articleBody'}) + if articlebodies: + for articlebody in articlebodies: + if articlebody: + paras = articlebody.findAll('p') + for p in paras: + refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip() + #account for blank paragraphs and short paragraphs by appending them to longer ones + if len(refparagraph) > 0: + if len(refparagraph) > 70: #approximately one line of text + article.summary = article.text_summary = shortparagraph + refparagraph + return + else: + shortparagraph = refparagraph + " " + if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"): + shortparagraph = shortparagraph + "- " + except: + self.log("Error creating article descriptions") + return From bfa79729a27f6f2bb504d2137f0bc89d192e5f68 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 11 Jan 2011 13:34:18 -0700 Subject: [PATCH 04/11] ... --- src/calibre/gui2/preferences/plugins.ui | 4 ++++ src/calibre/gui2/shortcuts.py | 7 ++++++- src/calibre/gui2/viewer/documentview.py | 7 +++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/preferences/plugins.ui b/src/calibre/gui2/preferences/plugins.ui index 18f0786a66..83a904eb08 100644 --- a/src/calibre/gui2/preferences/plugins.ui +++ b/src/calibre/gui2/preferences/plugins.ui @@ -76,6 +76,10 @@ &Add a new plugin + + + :/images/plugins.png:/images/plugins.png + diff --git a/src/calibre/gui2/shortcuts.py b/src/calibre/gui2/shortcuts.py index bdd699a69d..5e56435e10 100644 --- a/src/calibre/gui2/shortcuts.py +++ b/src/calibre/gui2/shortcuts.py @@ -150,7 +150,7 @@ class Delegate(QStyledItemDelegate): custom = [] if editor.custom.isChecked(): for x in ('1', '2'): - sc = getattr(editor, 'shortcut'+x) + sc = getattr(editor, 'shortcut'+x, None) if sc is not None: custom.append(sc) @@ -266,6 +266,11 @@ class ShortcutConfig(QWidget): self.view.scrollTo(index) + @property + def is_editing(self): + return self.view.state() == self.view.EditingState + + if __name__ == '__main__': from calibre.gui2 import is_ok_to_use_qt from calibre.gui2.viewer.keys import SHORTCUTS diff --git a/src/calibre/gui2/viewer/documentview.py b/src/calibre/gui2/viewer/documentview.py index 55abae0392..4485e63373 100644 --- a/src/calibre/gui2/viewer/documentview.py +++ b/src/calibre/gui2/viewer/documentview.py @@ -120,6 +120,13 @@ class ConfigDialog(QDialog, Ui_Dialog): def accept(self, *args): + if self.shortcut_config.is_editing: + from calibre.gui2 import info_dialog + info_dialog(self, _('Still editing'), + _('You are in the middle of editing a keyboard shortcut' + ' first complete that, by clicking outside the ' + ' shortcut editing box.'), show=True) + return c = config() c.set('serif_family', unicode(self.serif_family.currentFont().family())) c.set('sans_family', unicode(self.sans_family.currentFont().family())) From 66b870e6d89bc68c02b6c321ed9f6f963e4303ed Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 11 Jan 2011 13:42:04 -0700 Subject: [PATCH 05/11] ... --- src/calibre/ebooks/conversion/plumber.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index b1d760ea2d..9b22fb46ec 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -88,6 +88,7 @@ class Plumber(object): self.ui_reporter = report_progress self.abort_after_input_dump = abort_after_input_dump + # Pipeline options {{{ # Initialize the conversion options that are independent of input and # output formats. The input and output plugins can still disable these # options via recommendations. @@ -527,6 +528,7 @@ OptionRecommendation(name='timestamp', help=_('Set the book timestamp (used by the date column in calibre).')), ] + # }}} input_fmt = os.path.splitext(self.input)[1] if not input_fmt: From 3cd9ffcec6c14cada79b989fbfa92df49db5100c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 11 Jan 2011 15:08:15 -0700 Subject: [PATCH 06/11] Fix #8281 (Error when customizing builtin recipes with same name (e.g. The Nation)) --- src/calibre/gui2/dialogs/user_profiles.py | 69 ++++++++++++++++----- src/calibre/web/feeds/recipes/collection.py | 19 +++++- 2 files changed, 71 insertions(+), 17 deletions(-) diff --git a/src/calibre/gui2/dialogs/user_profiles.py b/src/calibre/gui2/dialogs/user_profiles.py index 71c9ebcd04..04c41f0c5e 100644 --- a/src/calibre/gui2/dialogs/user_profiles.py +++ b/src/calibre/gui2/dialogs/user_profiles.py @@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal ' import time, os from PyQt4.Qt import SIGNAL, QUrl, QAbstractListModel, Qt, \ - QVariant, QInputDialog + QVariant from calibre.web.feeds.recipes import compile_recipe from calibre.web.feeds.news import AutomaticNewsRecipe @@ -256,24 +256,61 @@ class %(classname)s(%(base_class)s): def add_builtin_recipe(self): from calibre.web.feeds.recipes.collection import \ - get_builtin_recipe_by_title, get_builtin_recipe_titles - items = sorted(get_builtin_recipe_titles(), key=sort_key) + get_builtin_recipe_collection, get_builtin_recipe_by_id + from PyQt4.Qt import QDialog, QVBoxLayout, QListWidgetItem, \ + QListWidget, QDialogButtonBox, QSize + d = QDialog(self) + d.l = QVBoxLayout() + d.setLayout(d.l) + d.list = QListWidget(d) + d.list.doubleClicked.connect(lambda x: d.accept()) + d.l.addWidget(d.list) + d.bb = QDialogButtonBox(QDialogButtonBox.Ok|QDialogButtonBox.Cancel, + Qt.Horizontal, d) + d.bb.accepted.connect(d.accept) + d.bb.rejected.connect(d.reject) + d.l.addWidget(d.bb) + d.setWindowTitle(_('Choose builtin recipe')) + items = [] + for r in get_builtin_recipe_collection(): + id_ = r.get('id', '') + title = r.get('title', '') + lang = r.get('language', '') + if id_ and title: + items.append((title + ' [%s]'%lang, id_)) - title, ok = QInputDialog.getItem(self, _('Pick recipe'), _('Pick the recipe to customize'), - items, 0, False) - if ok: - title = unicode(title) - profile = get_builtin_recipe_by_title(title) - if self._model.has_title(title): - if question_dialog(self, _('Replace recipe?'), - _('A custom recipe named %s already exists. Do you want to ' - 'replace it?')%title): - self._model.replace_by_title(title, profile) - else: - return + items.sort(key=lambda x:sort_key(x[0])) + for title, id_ in items: + item = QListWidgetItem(title) + item.setData(Qt.UserRole, id_) + d.list.addItem(item) + + d.resize(QSize(450, 400)) + ret = d.exec_() + d.list.doubleClicked.disconnect() + if ret != d.Accepted: + return + + items = list(d.list.selectedItems()) + if not items: + return + item = items[-1] + id_ = unicode(item.data(Qt.UserRole).toString()) + title = unicode(item.data(Qt.DisplayRole).toString()).rpartition(' [')[0] + profile = get_builtin_recipe_by_id(id_) + if profile is None: + raise Exception('Something weird happened') + + if self._model.has_title(title): + if question_dialog(self, _('Replace recipe?'), + _('A custom recipe named %s already exists. Do you want to ' + 'replace it?')%title): + self._model.replace_by_title(title, profile) else: - self.model.add(title, profile) + return + else: + self.model.add(title, profile) self.clear() diff --git a/src/calibre/web/feeds/recipes/collection.py b/src/calibre/web/feeds/recipes/collection.py index a513cf3880..5dd360213b 100644 --- a/src/calibre/web/feeds/recipes/collection.py +++ b/src/calibre/web/feeds/recipes/collection.py @@ -108,7 +108,6 @@ def download_builtin_recipe(urn): br = browser() return br.open_novisit('http://status.calibre-ebook.com/recipe/'+urn).read() - def get_builtin_recipe_by_title(title, log=None, download_recipe=False): for x in get_builtin_recipe_collection(): if x.get('title') == title: @@ -127,6 +126,24 @@ def get_builtin_recipe_by_title(title, log=None, download_recipe=False): 'Failed to download recipe, using builtin version') return P('recipes/%s.recipe'%urn, data=True) +def get_builtin_recipe_by_id(id_, log=None, download_recipe=False): + for x in get_builtin_recipe_collection(): + if x.get('id') == id_: + urn = x.get('id')[8:] + if download_recipe: + try: + if log is not None: + log('Trying to get latest version of recipe:', urn) + return download_builtin_recipe(urn) + except: + if log is None: + import traceback + traceback.print_exc() + else: + log.exception( + 'Failed to download recipe, using builtin version') + return P('recipes/%s.recipe'%urn, data=True) + class SchedulerConfig(object): def __init__(self): From 9585ba655c810bb9132f3d6d7299455d23d47493 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 11 Jan 2011 18:08:55 -0500 Subject: [PATCH 07/11] TXT Input: remove unnecessary try block. Rework markdown and textile detection. --- src/calibre/ebooks/txt/input.py | 6 +---- src/calibre/ebooks/txt/processor.py | 41 +++++++++++++---------------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 73af3acde4..0b0bd6d570 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -94,11 +94,7 @@ class TXTInput(InputFormatPlugin): ' converted by calibre. See http://daringfireball.net/projects/markdown/syntax') elif options.formatting_type == 'textile': log.debug('Running text though textile conversion...') - try: - html = convert_textile(txt) - except RuntimeError: - raise ValueError('This txt file has malformed markup, it cannot be' - ' converted by calibre.') + html = convert_textile(txt) else: # Determine the paragraph type of the document. if options.paragraph_type == 'auto': diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index d0526bd9fc..d59fd4121a 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -162,38 +162,33 @@ def detect_paragraph_type(txt): def detect_formatting_type(txt): + markdown_count = 0 + textile_count = 0 + # Check for markdown # Headings - if len(re.findall('(?mu)^#+', txt)) >= 5: - return 'markdown' - if len(re.findall('(?mu)^=+$', txt)) >= 5: - return 'markdown' - if len(re.findall('(?mu)^-+$', txt)) >= 5: - return 'markdown' + markdown_count += len(re.findall('(?mu)^#+', txt)) + markdown_count += len(re.findall('(?mu)^=+$', txt)) + markdown_count += len(re.findall('(?mu)^-+$', txt)) # Images - if len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) >= 5: - return 'markdown' + markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) # Links - if len(re.findall('(?u)(^|(?P
    [^!]))\[.*?\]\([^)]+\)', txt)) >= 5:
    -        return 'markdown'
    -    # Escaped characters
    -    md_escapted_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
    -    for c in md_escapted_characters:
    -        if txt.count('\\'+c) > 10:
    -            return 'markdown'
    +    markdown_count += len(re.findall('(?u)(^|(?P
    [^!]))\[.*?\]\([^)]+\)', txt))
             
         # Check for textile
         # Headings
    -    if len(re.findall(r'h[1-6]\.', txt)) >= 5:
    -        return 'textile'
    +    textile_count += len(re.findall(r'(?mu)^h[1-6]\.', txt))
         # Block quote.
    -    if len(re.findall(r'bq\.', txt)) >= 5:
    -        return 'textile'
    +    textile_count += len(re.findall(r'(?mu)^bq\.', txt))
         # Images
    -    if len(re.findall(r'\![^\s]+(:[^\s]+)*', txt)) >= 5:
    -        return 'textile'
    +    textile_count += len(re.findall(r'\![^\s]+(:[^\s]+)*', txt))
         # Links
    -    if len(re.findall(r'"(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt)) >= 5:
    -        return 'textile'
    +    textile_count += len(re.findall(r'"(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
    +    
    +    if markdown_count > 5 or textile_count > 5:
    +        if markdown_count > textile_count:
    +            return 'markdown'
    +        else:
    +            return 'textile'
         
         return 'heuristic'
    
    From f058f9adab9b1ae6bbc0673b1d8f5dce4550072f Mon Sep 17 00:00:00 2001
    From: John Schember 
    Date: Tue, 11 Jan 2011 18:56:23 -0500
    Subject: [PATCH 08/11] FB2 Output: Implement #8277, Write cover to coverpage
     tag within metadata.
    
    ---
     src/calibre/ebooks/fb2/fb2ml.py | 68 +++++++++++++++++++--------------
     1 file changed, 39 insertions(+), 29 deletions(-)
    
    diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
    index 4dd6e7c7ae..037a805e74 100644
    --- a/src/calibre/ebooks/fb2/fb2ml.py
    +++ b/src/calibre/ebooks/fb2/fb2ml.py
    @@ -102,6 +102,7 @@ class FB2MLizer(object):
             metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
             metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en'
             metadata['id'] = None
    +        metadata['cover'] = self.get_cover()
     
             author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
             if len(author_parts) == 1:
    @@ -124,7 +125,8 @@ class FB2MLizer(object):
                 metadata['id'] = str(uuid.uuid4()) 
     
             for key, value in metadata.items():
    -            metadata[key] = prepare_string_for_xml(value)
    +            if not key == 'cover':
    +                metadata[key] = prepare_string_for_xml(value)
     
             return u'' \
                     '' \
    @@ -136,6 +138,7 @@ class FB2MLizer(object):
                                 '%(author_last)s' \
                             '' \
                             '%(title)s' \
    +                        '%(cover)s' \
                             '%(lang)s' \
                         '' \
                         '' \
    @@ -154,6 +157,41 @@ class FB2MLizer(object):
         def fb2_footer(self):
             return u''
     
    +    def get_cover(self):
    +        cover_href = None
    +        
    +        # Get the raster cover if it's available.
    +        if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
    +            id = unicode(self.oeb_book.metadata.cover[0])
    +            cover_item = self.oeb_book.manifest.ids[id]
    +            if cover_item.media_type in OEB_RASTER_IMAGES:
    +                cover_href = cover_item.href
    +            print 1
    +        else:
    +            # Figure out if we have a title page or a cover page
    +            page_name = ''
    +            if 'titlepage' in self.oeb_book.guide:
    +                page_name = 'titlepage'
    +            elif 'cover' in self.oeb_book.guide:
    +                page_name = 'cover'
    +
    +            if page_name:
    +                cover_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[page_name].href]
    +                # Get the first image in the page
    +                for img in cover_item.xpath('//img'):
    +                    cover_href = cover_item.abshref(img.get('src'))
    +                    print cover_href
    +                    break
    +                
    +        if cover_href:
    +            # Only write the image tag if it is in the manifest.
    +            if cover_href in self.oeb_book.manifest.hrefs.keys():
    +                if cover_href not in self.image_hrefs.keys():
    +                    self.image_hrefs[cover_href] = '_%s.jpg' % len(self.image_hrefs.keys())
    +            return u'' % self.image_hrefs[cover_href]
    +        
    +        return u'' 
    +
         def get_text(self):
             text = ['']
             
    @@ -162,23 +200,6 @@ class FB2MLizer(object):
                 text.append('
    ') self.section_level += 1 - # Insert the title page / cover into the spine if it is not already referenced. - title_name = u'' - if 'titlepage' in self.oeb_book.guide: - title_name = 'titlepage' - elif 'cover' in self.oeb_book.guide: - title_name = 'cover' - if title_name: - title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href] - if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml': - self.oeb_book.spine.insert(0, title_item, True) - # Create xhtml page to reference cover image so it can be used. - if not title_name and self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids: - id = unicode(self.oeb_book.metadata.cover[0]) - cover_item = self.oeb_book.manifest.ids[id] - if cover_item.media_type in OEB_RASTER_IMAGES: - self.insert_image_cover(cover_item.href) - for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) @@ -203,17 +224,6 @@ class FB2MLizer(object): return ''.join(text) + '' - def insert_image_cover(self, image_href): - from calibre.ebooks.oeb.base import RECOVER_PARSER - try: - root = etree.fromstring(u'' % (XHTML_NS, image_href), parser=RECOVER_PARSER) - except: - root = etree.fromstring(u'', parser=RECOVER_PARSER) - - id, href = self.oeb_book.manifest.generate('fb2_cover', 'fb2_cover.xhtml') - item = self.oeb_book.manifest.add(id, href, guess_type(href)[0], data=root) - self.oeb_book.spine.insert(0, item, True) - def fb2mlize_images(self): ''' This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. From 0bab82e9b1e0dec31ca0e924fe7e1e72c9de83f6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 11 Jan 2011 17:25:35 -0700 Subject: [PATCH 09/11] RTF Input: Substitute a dummy image for WMF images in the RTF document --- src/calibre/ebooks/rtf/input.py | 28 +++++++++++++++--- src/calibre/utils/wmf/__init__.py | 47 +++++++++++++++++++++++++++++ src/calibre/utils/wmf/wmf.c | 49 ++++++++++++++++++++++++++----- 3 files changed, 112 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 5154373eda..714a5b656f 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -159,11 +159,31 @@ class RTFInput(InputFormatPlugin): return imap def convert_image(self, name): - from calibre.utils.magick import Image - img = Image() - img.open(name) + try: + return self.rasterize_wmf(name) + except: + self.log.exception('Failed to convert WMF image %r'%name) + return self.replace_wmf(name) + + def replace_wmf(self, name): + from calibre.ebooks import calibre_cover + data = calibre_cover('Conversion of WMF images is not supported', + 'Use Microsoft Word or OpenOffice to save this RTF file' + ' as HTML and convert that in calibre.', title_size=36, + author_size=20) name = name.replace('.wmf', '.jpg') - img.save(name) + with open(name, 'wb') as f: + f.write(data) + return name + + def rasterize_wmf(self, name): + from calibre.utils.wmf import extract_raster_image + with open(name, 'rb') as f: + data = f.read() + data = extract_raster_image(data) + name = name.replace('.wmf', '.jpg') + with open(name, 'wb') as f: + f.write(data) return name diff --git a/src/calibre/utils/wmf/__init__.py b/src/calibre/utils/wmf/__init__.py index 68dfb8d2b5..cb7736e06a 100644 --- a/src/calibre/utils/wmf/__init__.py +++ b/src/calibre/utils/wmf/__init__.py @@ -5,5 +5,52 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import glob + +from calibre.constants import plugins, iswindows, filesystem_encoding +from calibre.ptempfile import TemporaryDirectory +from calibre import CurrentDir +from calibre.utils.magick import Image, PixelWand + +class Unavailable(Exception): + pass + +class NoRaster(Exception): + pass + +def extract_raster_image(wmf_data): + try: + wmf, wmf_err = plugins['wmf'] + except KeyError: + raise Unavailable('libwmf not available on this platform') + if wmf_err: + raise Unavailable(wmf_err) + + if iswindows: + import sys, os + appdir = sys.app_dir + if isinstance(appdir, unicode): + appdir = appdir.encode(filesystem_encoding) + fdir = os.path.join(appdir, 'wmffonts') + wmf.set_font_dir(fdir) + + data = '' + + with TemporaryDirectory('wmf2png') as tdir: + with CurrentDir(tdir): + wmf.render(wmf_data) + + images = list(sorted(glob.glob('*.png'))) + if not images: + raise NoRaster('No raster images in WMF') + data = open(images[0], 'rb').read() + + im = Image() + im.load(data) + pw = PixelWand() + pw.color = '#ffffff' + im.rotate(pw, 180) + + return im.export('png') diff --git a/src/calibre/utils/wmf/wmf.c b/src/calibre/utils/wmf/wmf.c index 1f8e8a27f3..74d3ca813f 100644 --- a/src/calibre/utils/wmf/wmf.c +++ b/src/calibre/utils/wmf/wmf.c @@ -4,6 +4,7 @@ #include #include +//#include typedef struct { char *data; @@ -13,7 +14,7 @@ typedef struct { //This code is taken mostly from the Abiword wmf plugin - +// Buffer read {{{ // returns unsigned char cast to int, or EOF static int wmf_WMF_read(void * context) { char c; @@ -22,11 +23,11 @@ static int wmf_WMF_read(void * context) { if (info->pos == info->len) return EOF; - c = info->data[pos]; + c = info->data[info->pos]; info->pos++; - return (int)c; + return (int)((unsigned char)c); } // returns (-1) on error, else 0 @@ -44,8 +45,17 @@ static long wmf_WMF_tell(void * context) { return (long) info->pos; } +// }}} +char _png_name_buf[100]; +char *wmf_png_name(void *ctxt) { + int *num = (int*)ctxt; + *num = *num + 1; + snprintf(_png_name_buf, 90, "%04d.png", *num); + return _png_name_buf; +} + #define CLEANUP if(API) { if (stream) wmf_free(API, stream); wmf_api_destroy(API); }; static PyObject * @@ -66,9 +76,9 @@ wmf_render(PyObject *self, PyObject *args) { unsigned int max_width = 1600; unsigned int max_height = 1200; - unsigned long max_flags = 0; static const char* Default_Description = "wmf2svg"; + int fname_counter = 0; wmf_error_t err; @@ -125,6 +135,8 @@ wmf_render(PyObject *self, PyObject *args) { ddata->Description = (char *)Default_Description; ddata->bbox = bbox; + ddata->image.context = (void *)&fname_counter; + ddata->image.name = wmf_png_name; wmf_display_size(API, &disp_width, &disp_height, 96, 96); @@ -156,9 +168,9 @@ wmf_render(PyObject *self, PyObject *args) { ddata->height = (unsigned int) ceil ((double) wmf_height); } - ddata->flags |= WMF_SVG_INLINE_IMAGES; - - ddata->flags |= WMF_GD_OUTPUT_MEMORY | WMF_GD_OWN_BUFFER; + // Needs GD + //ddata->flags |= WMF_SVG_INLINE_IMAGES; + //ddata->flags |= WMF_GD_OUTPUT_MEMORY | WMF_GD_OWN_BUFFER; err = wmf_play(API, 0, &(bbox)); @@ -178,11 +190,32 @@ wmf_render(PyObject *self, PyObject *args) { return ans; } +#ifdef _WIN32 +void set_libwmf_fontdir(const char *); + +static PyObject * +wmf_setfontdir(PyObject *self, PyObject *args) { + char *path; + if (!PyArg_ParseTuple(args, "s", &path)) + return NULL; + set_libwmf_fontdir(path); + + Py_RETURN_NONE; +} +#endif + + + static PyMethodDef wmf_methods[] = { {"render", wmf_render, METH_VARARGS, - "render(path) -> Render wmf as svg." + "render(data) -> Render wmf as svg." }, +#ifdef _WIN32 + {"set_font_dir", wmf_setfontdir, METH_VARARGS, + "set_font_dir(path) -> Set the path to the fonts dir on windows, must be called at least once before using render()" + }, +#endif {NULL} /* Sentinel */ }; From 06723a07483cfd59d63ffefc58ce1c53efaa5f92 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 11 Jan 2011 19:57:57 -0500 Subject: [PATCH 10/11] ... --- src/calibre/ebooks/fb2/fb2ml.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 037a805e74..7a618ab54a 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -166,7 +166,6 @@ class FB2MLizer(object): cover_item = self.oeb_book.manifest.ids[id] if cover_item.media_type in OEB_RASTER_IMAGES: cover_href = cover_item.href - print 1 else: # Figure out if we have a title page or a cover page page_name = '' @@ -180,7 +179,6 @@ class FB2MLizer(object): # Get the first image in the page for img in cover_item.xpath('//img'): cover_href = cover_item.abshref(img.get('src')) - print cover_href break if cover_href: From d2d65d805663ebfbfd11e723dc94c241bff30ce6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 11 Jan 2011 18:12:29 -0700 Subject: [PATCH 11/11] ... --- resources/recipes/tyzden.recipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/recipes/tyzden.recipe b/resources/recipes/tyzden.recipe index c206244ff6..b8d7389fbe 100644 --- a/resources/recipes/tyzden.recipe +++ b/resources/recipes/tyzden.recipe @@ -28,7 +28,7 @@ class TyzdenRecipe(BasicNewsRecipe): if (weeknum > 1): weeknum -= 1 - title = u'.tyzden ' + str(weeknum) + '/' + str(year) + title = u'tyzden' base_url_path = 'http://www.tyzden.sk/casopis/' + str(year) + '/' + str(weeknum) base_url = base_url_path + '.html'