diff --git a/src/libprs500/ebooks/lrf/txt/convert_from.py b/src/libprs500/ebooks/lrf/txt/convert_from.py
index 00a65f127e..93e4cf3e1f 100644
--- a/src/libprs500/ebooks/lrf/txt/convert_from.py
+++ b/src/libprs500/ebooks/lrf/txt/convert_from.py
@@ -31,10 +31,12 @@ def option_parser():
               'the text in mybook.txt. Default is to try to autodetect.'
     parser.add_option('-e', '--encoding', action='store', type='string', \
                       dest='encoding', help=enchelp, default=None)
+    parser.add_option('--debug-html-generation', action='store_true', default=False,
+                      dest='debug_html_generation', help='Print generated HTML to stdout and quit.')
     return parser
 
-def generate_html(txtfile, encoding):
+def generate_html(txtfile, encoding, logger):
     '''
     Convert txtfile to html and return a PersistentTemporaryFile object pointing
     to the file with the HTML.
@@ -54,12 +56,14 @@ def generate_html(txtfile, encoding):
             raise ConversionError, 'Could not detect encoding of %s'%(txtfile,)
     else:
         txt = codecs.open(txtfile, 'rb', enc).read()
+
+    logger.info('Converting text to HTML...')
     md = markdown.Markdown(txt,
-                           extensions=['footnotes', 'tables', 'toc'],
-                           encoding=enc,
-                           safe_mode=False,
-                           )
+                           extensions=['footnotes', 'tables', 'toc'],
+                           safe_mode=False,
+                           )
     html = md.toString()
+
     p = PersistentTemporaryFile('.html', dir=os.path.dirname(txtfile))
     p.close()
     codecs.open(p.name, 'wb', enc).write(html)
@@ -73,14 +77,19 @@ def process_file(path, options, logger=None):
     txt = os.path.abspath(os.path.expanduser(path))
     if not hasattr(options, 'encoding'):
         options.encoding = None
-    htmlfile = generate_html(txt, options.encoding)
-    options.force_page_break = 'h2'
-    if not options.output:
-        ext = '.lrs' if options.lrs else '.lrf'
-        options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
-    options.output = os.path.abspath(os.path.expanduser(options.output))
+    if not hasattr(options, 'debug_html_generation'):
+        options.debug_html_generation = False
+    htmlfile = generate_html(txt, options.encoding, logger)
+    if not options.debug_html_generation:
+        options.force_page_break = 'h2'
+        if not options.output:
+            ext = '.lrs' if options.lrs else '.lrf'
+            options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
+        options.output = os.path.abspath(os.path.expanduser(options.output))
 
-    html_process_file(htmlfile.name, options, logger)
+        html_process_file(htmlfile.name, options, logger)
+    else:
+        print open(htmlfile.name, 'rb').read()
 
 def main(args=sys.argv, logger=None):
     parser = option_parser()
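A minimal sketch of how the new --debug-html-generation path could be driven from Python; the file name, logger setup and call below are illustrative assumptions, not part of the patch (a logger is needed because generate_html() now logs progress):

    import logging
    from libprs500.ebooks.lrf.txt.convert_from import option_parser, process_file

    logging.basicConfig(level=logging.INFO)
    parser = option_parser()
    options, args = parser.parse_args(['--debug-html-generation', 'mybook.txt'])
    # prints the generated HTML to stdout and skips LRF/LRS creation
    process_file(args[0], options, logging.getLogger('txt2lrf'))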
diff --git a/src/libprs500/ebooks/markdown/markdown.py b/src/libprs500/ebooks/markdown/markdown.py
index 8dfab2d505..1b7f614d1e 100644
--- a/src/libprs500/ebooks/markdown/markdown.py
+++ b/src/libprs500/ebooks/markdown/markdown.py
@@ -1,43 +1,37 @@
 #!/usr/bin/env python
 
-# The following constant specifies the name used in the usage
-# statement displayed for python versions lower than 2.3. (With
-# python2.3 and higher the usage statement is generated by optparse
-# and uses the actual name of the executable called.)
-
-EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
-
-SPEED_TEST = 0
+version = "1.6b"
+version_info = (1,6,2,"rc-2")
+__revision__ = "$Rev$"
 
 """
-====================================================================
-IF YOA ARE LOOKING TO EXTEND MARKDOWN, SEE THE "FOOTNOTES" SECTION
-====================================================================
-
 Python-Markdown
 ===============
 
 Converts Markdown to HTML.
 
 Basic usage as a module:
 
     import markdown
-    html = markdown.markdown(your_text_string)
+    md = Markdown()
+    html = markdown.convert(your_text_string)
+
+See http://www.freewisdom.org/projects/python-markdown/ for more
+information and instructions on how to extend the functionality of the
+script. (You might want to read that before you try modifying this
+file.)
 
 Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
 maintained by [Yuri Takhteyev](http://www.freewisdom.org).
 
-Project website: http://www.freewisdom.org/projects/python-markdown
 Contact: yuri [at] freewisdom.org
 
 License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD
 
-Version: 1.5a (July 9, 2006)
-
-For changelog, see end of file
 """
 
-import re, sys, os, random, codecs
-# set debug level: 3 none, 2 critical, 1 informative, 0 all
+import re, sys, codecs
+
+# Set debug level: 3 none, 2 critical, 1 informative, 0 all
 (VERBOSE, INFO, CRITICAL, NONE) = range(4)
 
 MESSAGE_THRESHOLD = CRITICAL
@@ -49,10 +43,45 @@ def message(level, text) :
 
 # --------------- CONSTANTS YOU MIGHT WANT TO MODIFY -----------------
 
-# all tabs will be expanded to up to this many spaces
-TAB_LENGTH = 4
-ENABLE_ATTRIBUTES = 1
-SMART_EMPHASIS = 1
+TAB_LENGTH = 4            # expand tabs to this many spaces
+ENABLE_ATTRIBUTES = True  # @id = xyz -> <... id="xyz">
+SMART_EMPHASIS = 1        # this_or_that does not become this<i>or</i>that
+HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
+
+RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
+                     # from Hebrew to Nko (includes Arabic, Syriac and Thaana)
+                    (u'\u2D30', u'\u2D7F'),
+                     # Tifinagh
+                    )
+
+# Unicode Reference Table:
+# 0590-05FF - Hebrew
+# 0600-06FF - Arabic
+# 0700-074F - Syriac
+# 0750-077F - Arabic Supplement
+# 0780-07BF - Thaana
+# 07C0-07FF - Nko
+
+BOMS = { 'utf-8' : (unicode(codecs.BOM_UTF8, "utf-8"), ),
+         'utf-16' : (unicode(codecs.BOM_UTF16_LE, "utf-16"),
+                     unicode(codecs.BOM_UTF16_BE, "utf-16")),
+         #'utf-32' : (unicode(codecs.BOM_UTF32_LE, "utf-32"),
+         #            unicode(codecs.BOM_UTF32_BE, "utf-32")),
+         }
+
+def removeBOM(text, encoding):
+    for bom in BOMS[encoding]:
+        if text.startswith(bom):
+            return text.lstrip(bom)
+    return text
+
+# The following constant specifies the name used in the usage
+# statement displayed for python versions lower than 2.3. (With
+# python2.3 and higher the usage statement is generated by optparse
+# and uses the actual name of the executable called.)
+
+EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
+
 
 # --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ----------
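The new removeBOM() helper strips a leading byte-order mark before any other processing. A small standalone illustration of the same check (Python 2; the sample text is an assumption):

    import codecs

    bom = unicode(codecs.BOM_UTF8, 'utf-8')
    text = bom + u'# Title\n\nSome *emphasis*.'
    if text.startswith(bom):          # same test removeBOM() applies per encoding
        text = text.lstrip(bom)
    print repr(text[:7])              # u'# Title' -- the BOM is gone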
@@ -82,14 +111,50 @@ Importantly, NanoDom does not do normalization, which is what we want.
 It also adds extra white space when converting DOM to string
 """
 
+ENTITY_NORMALIZATION_EXPRESSIONS = [ (re.compile("&"), "&amp;"),
+                                     (re.compile("<"), "&lt;"),
+                                     (re.compile(">"), "&gt;"),
+                                     (re.compile("\""), "&quot;")]
+
+ENTITY_NORMALIZATION_EXPRESSIONS_SOFT = [ (re.compile("&(?!\#)"), "&amp;"),
+                                     (re.compile("<"), "&lt;"),
+                                     (re.compile(">"), "&gt;"),
+                                     (re.compile("\""), "&quot;")]
+
+
+def getBidiType(text) :
+
+    if not text : return None
+
+    ch = text[0]
+
+    if not isinstance(ch, unicode) or not ch.isalpha():
+        return None
+
+    else :
+
+        for min, max in RTL_BIDI_RANGES :
+            if ( ch >= min and ch <= max ) :
+                return "rtl"
+        else :
+            return "ltr"
+
 class Document :
 
+    def __init__ (self) :
+        self.bidi = "ltr"
+
     def appendChild(self, child) :
         self.documentElement = child
+        child.isDocumentElement = True
         child.parent = self
         self.entities = {}
 
+    def setBidi(self, bidi) :
+        if bidi :
+            self.bidi = bidi
+
     def createElement(self, tag, textNode=None) :
         el = Element(tag)
         el.doc = self
@@ -107,19 +172,23 @@ class Document :
         self.entities[entity] = EntityReference(entity)
         return self.entities[entity]
 
+    def createCDATA(self, text) :
+        node = CDATA(text)
+        node.doc = self
+        return node
+
     def toxml (self) :
         return self.documentElement.toxml()
 
-    def normalizeEntities(self, text) :
+    def normalizeEntities(self, text, avoidDoubleNormalizing=False) :
 
-        pairs = [ ("&", "&amp;"),
-                  ("<", "&lt;"),
-                  (">", "&gt;"),
-                  ("\"", "&quot;")]
+        if avoidDoubleNormalizing :
+            regexps = ENTITY_NORMALIZATION_EXPRESSIONS_SOFT
+        else :
+            regexps = ENTITY_NORMALIZATION_EXPRESSIONS
 
-
-        for old, new in pairs :
-            text = text.replace(old, new)
+        for regexp, substitution in regexps :
+            text = regexp.sub(substitution, text)
         return text
 
     def find(self, test) :
@@ -130,6 +199,19 @@ class Document :
         self.documentElement = None
 
 
+class CDATA :
+
+    type = "cdata"
+
+    def __init__ (self, text) :
+        self.text = text
+
+    def handleAttributes(self) :
+        pass
+
+    def toxml (self) :
+        return "<![CDATA[" + self.text + "]]>"
+
 class Element :
 
     type = "element"
@@ -140,6 +222,19 @@ class Element :
         self.attributes = []
         self.attribute_values = {}
         self.childNodes = []
+        self.bidi = None
+        self.isDocumentElement = False
+
+    def setBidi(self, bidi) :
+
+        if bidi :
+
+            if not self.bidi or self.isDocumentElement:
+                # Once the bidi is set don't change it (except for doc element)
+                self.bidi = bidi
+                self.parent.setBidi(bidi)
+
 
     def unlink(self) :
         for child in self.childNodes :
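getBidiType() looks only at the first alphabetic character to decide whether a node is right-to-left; that value later becomes a dir attribute on block elements. A standalone restatement of the range test (sample strings are illustrative):

    RTL_RANGES = ((u'\u0590', u'\u07FF'), (u'\u2D30', u'\u2D7F'))

    def bidi_of(text):
        if not text:
            return None
        ch = text[0]
        if not isinstance(ch, unicode) or not ch.isalpha():
            return None
        for lo, hi in RTL_RANGES:
            if lo <= ch <= hi:
                return 'rtl'
        return 'ltr'

    print bidi_of(u'\u05e9\u05dc\u05d5\u05dd')   # 'rtl' (Hebrew)
    print bidi_of(u'hello')                      # 'ltr'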
@@ -186,27 +281,56 @@ class Element :
         if ENABLE_ATTRIBUTES :
             for child in self.childNodes:
                 child.handleAttributes()
+
         buffer = ""
         if self.nodeName in ['h1', 'h2', 'h3', 'h4'] :
             buffer += "\n"
         elif self.nodeName in ['li'] :
             buffer += "\n "
+
+        # Process children FIRST, then do the attributes
+
+        childBuffer = ""
+
+        if self.childNodes or self.nodeName in ['blockquote']:
+            childBuffer += ">"
+            for child in self.childNodes :
+                childBuffer += child.toxml()
+            if self.nodeName == 'p' :
+                childBuffer += "\n"
+            elif self.nodeName == 'li' :
+                childBuffer += "\n "
+            childBuffer += "</%s>" % self.nodeName
+        else :
+            childBuffer += "/>"
+
+
+        buffer += "<" + self.nodeName
+
+        if self.nodeName in ['p', 'li', 'ul', 'ol',
+                             'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] :
+
+            if not self.attribute_values.has_key("dir"):
+                if self.bidi :
+                    bidi = self.bidi
+                else :
+                    bidi = self.doc.bidi
+
+                if bidi=="rtl" :
+                    self.setAttribute("dir", "rtl")
+
         for attr in self.attributes :
             value = self.attribute_values[attr]
-            value = self.doc.normalizeEntities(value)
+            value = self.doc.normalizeEntities(value,
+                                               avoidDoubleNormalizing=True)
             buffer += ' %s="%s"' % (attr, value)
-        if self.childNodes or self.nodeName in ['blockquote']:
-            buffer += ">"
-            for child in self.childNodes :
-                buffer += child.toxml()
-            if self.nodeName == 'p' :
-                buffer += "\n"
-            elif self.nodeName == 'li' :
-                buffer += "\n "
-            buffer += "</%s>" % self.nodeName
-        else :
-            buffer += "/>"
+
+
+        # Now let's actually append the children
+
+        buffer += childBuffer
+
         if self.nodeName in ['p', 'li', 'ul', 'ol',
                              'h1', 'h2', 'h3', 'h4'] :
             buffer += "\n"
@@ -223,13 +347,18 @@ class TextNode :
         self.value = text
 
     def attributeCallback(self, match) :
+
         self.parent.setAttribute(match.group(1), match.group(2))
 
     def handleAttributes(self) :
         self.value = self.attrRegExp.sub(self.attributeCallback, self.value)
 
     def toxml(self) :
+
         text = self.value
+
+        self.parent.setBidi(getBidiType(text))
+
         if not text.startswith(HTML_PLACEHOLDER_PREFIX):
             if self.parent.nodeName == "p" :
                 text = text.replace("\n", "\n   ")
@@ -262,10 +391,10 @@ class EntityReference:
 Preprocessors munge source text before we start doing anything too
 complicated.
 
-Each preprocessor implements a "run" method that takes a pointer to a list of lines of the document,
-modifies it as necessary and returns either the same pointer or a
-pointer to a new list. Preprocessors must extend
-markdown.Preprocessor.
+Each preprocessor implements a "run" method that takes a pointer to a
+list of lines of the document, modifies it as necessary and returns
+either the same pointer or a pointer to a new list. Preprocessors
+must extend markdown.Preprocessor.
 
 """
 
@@ -305,10 +434,6 @@ class HeaderPreprocessor (Preprocessor):
                 lines[i] = "## " + lines[i].strip()
                 lines[i+1] = ""
 
-        #for l in lines :
-        #    print l.encode('utf8')
-        #sys.exit(0)
-
         return lines
 
 HEADER_PREPROCESSOR = HeaderPreprocessor()
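The hunks below make HtmlBlockPreprocessor a text preprocessor: run() now receives and returns the whole source string, stashing raw HTML blocks so they can be re-inserted verbatim after serialization. A hedged round-trip sketch (import path as used elsewhere in this tree; the exact output markup is an expectation, not verified here):

    from libprs500.ebooks.markdown.markdown import Markdown

    md = Markdown()
    src = "<div>\nraw html is stashed, then restored\n</div>\n\nSome *markdown* text."
    # the <div> block should come back unchanged, since safe_mode is now off by default
    print md.convert(src)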
@@ -362,10 +487,13 @@ class HtmlBlockPreprocessor (Preprocessor):
         return block.rstrip()[-len(left_tag)-2:-1].lower()
 
     def _equal_tags(self, left_tag, right_tag):
+
         if left_tag in ['?', '?php', 'div'] : # handle PHP, etc.
             return True
         if ("/" + left_tag) == right_tag:
             return True
+        if (right_tag == "--" and left_tag == "--") :
+            return True
         elif left_tag == right_tag[1:] \
             and right_tag[0] != "<":
             return True
@@ -376,9 +504,10 @@ class HtmlBlockPreprocessor (Preprocessor):
 
         return (tag in ['hr', 'hr/'])
 
-    def run (self, lines) :
+    def run (self, text) :
+
         new_blocks = []
-        text = "\n".join(lines)
+        #text = "\n".join(lines)
         text = text.split("\n\n")
 
         items = []
@@ -417,26 +546,31 @@ class HtmlBlockPreprocessor (Preprocessor):
                     new_blocks.append(
                         self.stash.store(block.strip()))
                     continue
-                elif not block[1] == "!":
+                else: #if not block[1] == "!":
                     # if is block level tag and is not complete
                     items.append(block.strip())
                     in_tag = True
                     continue
-
+
                 new_blocks.append(block)
 
             else:
                 items.append(block.strip())
 
                 right_tag = self._get_right_tag(left_tag, block)
+
                 if self._equal_tags(left_tag, right_tag):
                     # if find closing tag
                     in_tag = False
                     new_blocks.append(
                         self.stash.store('\n\n'.join(items)))
                     items = []
-
-        return "\n\n".join(new_blocks).split("\n")
+
+        if items :
+            new_blocks.append(self.stash.store('\n\n'.join(items)))
+            new_blocks.append('\n')
+
+        return "\n\n".join(new_blocks) #.split("\n")
 
 HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
@@ -609,15 +743,15 @@ class LinkPattern (Pattern):
     def handleMatch(self, m, doc) :
         el = doc.createElement('a')
         el.appendChild(doc.createTextNode(m.group(2)))
-        parts = m.group(9).split()
+        parts = m.group(9).split('"')
         # We should now have [], [href], or [href, title]
         if parts :
-            el.setAttribute('href', parts[0])
+            el.setAttribute('href', parts[0].strip())
         else :
             el.setAttribute('href', "")
         if len(parts) > 1 :
             # we also got a title
-            title = " ".join(parts[1:]).strip()
+            title = '"' + '"'.join(parts[1:]).strip()
             title = dequote(title) #.replace('"', "&quot;")
             el.setAttribute('title', title)
         return el
@@ -645,12 +779,14 @@ class ImagePattern (Pattern):
 class ReferencePattern (Pattern):
 
     def handleMatch(self, m, doc):
+
         if m.group(9) :
             id = m.group(9).lower()
         else :
             # if we got something like "[Google][]"
             # we'll use "google" as the id
             id = m.group(2).lower()
+
         if not self.references.has_key(id) : # ignore undefined refs
             return None
         href, title = self.references[id]
@@ -789,8 +925,7 @@ class BlockGuru :
            remainder of the original list"""
 
         items = []
-        item = -1
-
+        i = 0 # to keep track of where we are
 
         for line in lines :
@@ -908,11 +1043,11 @@ class Markdown:
        Markdown text """
 
-    def __init__(self, source=None,
-                 extensions=None,
+    def __init__(self, source=None,  # deprecated
+                 extensions=[],
                  extension_configs=None,
-                 encoding=None,
-                 safe_mode = True):
+                 encoding="utf-8",
+                 safe_mode = False):
         """Creates a new Markdown instance.
 
            @param source: The text in Markdown format.
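With the constructor change above, passing the source to Markdown() is deprecated, extensions default to an empty list, the encoding defaults to utf-8 and safe_mode is off. A usage sketch under those assumptions (file name is illustrative):

    import codecs
    from libprs500.ebooks.markdown.markdown import Markdown

    md = Markdown(extensions=['footnotes', 'tables', 'toc'])
    text = codecs.open('mybook.txt', 'rb', 'utf-8').read()
    html = md.convert(text)   # preferred; toString()/__str__ remain as deprecated aliases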
@@ -924,10 +1059,14 @@ class Markdown:
         self.blockGuru = BlockGuru()
         self.registeredExtensions = []
         self.stripTopLevelTags = 1
+        self.docType = ""
 
-        self.preprocessors = [ HEADER_PREPROCESSOR,
+
+        self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR]
+
+        self.preprocessors = [
+                               HEADER_PREPROCESSOR,
                                LINE_PREPROCESSOR,
-                               HTML_BLOCK_PREPROCESSOR,
                                LINE_BREAKS_PREPROCESSOR,
                                # A footnote preprocessor will
                                # get inserted here
@@ -979,6 +1118,7 @@
         for ext in extensions :
             extension_module_name = "libprs500.ebooks.markdown.mdx_" + ext
+
             try :
                 module = sys.modules[extension_module_name]
             except :
@@ -991,6 +1131,7 @@
                 configs_for_ext = configs[ext]
             else :
                 configs_for_ext = []
+
             extension = module.makeExtension(configs_for_ext)
             extension.extendMarkdown(self, globals())
@@ -1032,7 +1173,7 @@ class Markdown:
         self.doc.appendChild(self.top_element)
 
         # Fixup the source text
-        text = self.source.strip()
+        text = self.source #.strip()
         text = text.replace("\r\n", "\n").replace("\r", "\n")
         text += "\n\n"
         text = text.expandtabs(TAB_LENGTH)
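The next hunk replaces the recursive call at the end of _processSection() with a while loop over the remaining lines. A stripped-down, standalone illustration of that control flow (paragraph splitting only; the list, quote and code branches are omitted):

    def split_paragraphs(lines):
        paragraphs = []
        while lines:                              # loop instead of recursing on the tail
            para = []
            while lines and lines[0].strip():
                para.append(lines.pop(0))
            if para:
                paragraphs.append(" ".join(para))
            if lines:
                lines = lines[1:]                 # skip the blank separator line
        return paragraphs

    print split_paragraphs(["one", "still one", "", "two"])   # ['one still one', 'two']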
@@ -1085,100 +1226,86 @@ class Markdown:
            @param inList: a level
 
            @returns: None"""
 
-        if not lines :
-            return
-
-        # Check if this section starts with a list, a blockquote or
-        # a code block
-
-        processFn = { 'ul' : self._processUList,
-                      'ol' : self._processOList,
-                      'quoted' : self._processQuote,
-                      'tabbed' : self._processCodeBlock }
-
-        for regexp in ['ul', 'ol', 'quoted', 'tabbed'] :
-            m = RE.regExp[regexp].match(lines[0])
-            if m :
-                try:
-                    processFn[regexp](parent_elem, lines, inList)
-                except RuntimeError:
-                    print 'WARNING: Max recursion depth excedeeded, skipping section'
-                    #print '\n'.join(lines)
-                    #sys.exit()
-                return
-
-        # We are NOT looking at one of the high-level structures like
-        # lists or blockquotes. So, it's just a regular paragraph
-        # (though perhaps nested inside a list or something else). If
-        # we are NOT inside a list, we just need to look for a blank
-        # line to find the end of the block. If we ARE inside a
-        # list, however, we need to consider that a sublist does not
-        # need to be separated by a blank line. Rather, the following
-        # markup is legal:
-        #
-        #    * The top level list item
-        #
-        #      Another paragraph of the list. This is where we are now.
-        #      * Underneath we might have a sublist.
-        #
-
-        if inList :
-
-            start, theRest = self._linesUntil(lines, (lambda line:
-                             RE.regExp['ul'].match(line)
-                             or RE.regExp['ol'].match(line)
-                             or not line.strip()))
-
-            self._processSection(parent_elem, start,
-                                 inList - 1, looseList = looseList)
-            self._processSection(parent_elem, theRest,
-                                 inList - 1, looseList = looseList)
-
-
-        else : # Ok, so it's just a simple block
-
-            paragraph, theRest = self._linesUntil(lines, lambda line:
-                                                  not line.strip())
-
-            if len(paragraph) and paragraph[0].startswith('#') :
-                m = RE.regExp['header'].match(paragraph[0])
-                if m :
-                    level = len(m.group(1))
-                    h = self.doc.createElement("h%d" % level)
-                    parent_elem.appendChild(h)
-                    for item in self._handleInlineWrapper2(m.group(2).strip()) :
-                        h.appendChild(item)
-                else :
-                    message(CRITICAL, "We've got a problem header!")
-
-            elif paragraph :
-
-                list = self._handleInlineWrapper2("\n".join(paragraph))
-
-                if ( parent_elem.nodeName == 'li'
-                     and not (looseList or parent_elem.childNodes)):
-
-                    #and not parent_elem.childNodes) :
-                    # If this is the first paragraph inside "li", don't
-                    # put <p> around it - append the paragraph bits directly
-                    # onto parent_elem
-                    el = parent_elem
-                else :
-                    # Otherwise make a "p" element
-                    el = self.doc.createElement("p")
-                    parent_elem.appendChild(el)
-
-                for item in list :
-                    el.appendChild(item)
-
-            if theRest :
-                theRest = theRest[1:]  # skip the first (blank) line
-
-            try:
-                self._processSection(parent_elem, theRest, inList)
-            except RuntimeError: #Added by Kovid
-                pass
-
+        while lines:
+            # Check if this section starts with a list, a blockquote or
+            # a code block
+
+            processFn = { 'ul' : self._processUList,
+                          'ol' : self._processOList,
+                          'quoted' : self._processQuote,
+                          'tabbed' : self._processCodeBlock }
+
+            for regexp in ['ul', 'ol', 'quoted', 'tabbed'] :
+                m = RE.regExp[regexp].match(lines[0])
+                if m :
+                    processFn[regexp](parent_elem, lines, inList)
+                    return
+
+            # We are NOT looking at one of the high-level structures like
+            # lists or blockquotes. So, it's just a regular paragraph
+            # (though perhaps nested inside a list or something else). If
+            # we are NOT inside a list, we just need to look for a blank
+            # line to find the end of the block. If we ARE inside a
+            # list, however, we need to consider that a sublist does not
+            # need to be separated by a blank line. Rather, the following
+            # markup is legal:
+            #
+            #    * The top level list item
+            #
+            #      Another paragraph of the list. This is where we are now.
+            #      * Underneath we might have a sublist.
+            #
+
+            if inList :
+
+                start, lines = self._linesUntil(lines, (lambda line:
+                                 RE.regExp['ul'].match(line)
+                                 or RE.regExp['ol'].match(line)
+                                 or not line.strip()))
+
+                self._processSection(parent_elem, start,
+                                     inList - 1, looseList = looseList)
+                self._processSection(parent_elem, lines,
+                                     inList - 1, looseList = looseList)
+
+
+            else : # Ok, so it's just a simple block
+
+                paragraph, lines = self._linesUntil(lines, lambda line:
+                                                     not line.strip())
+                if len(paragraph) and paragraph[0].startswith('#') :
+                    m = RE.regExp['header'].match(paragraph[0])
+                    if m :
+                        level = len(m.group(1))
+                        h = self.doc.createElement("h%d" % level)
+                        parent_elem.appendChild(h)
+                        for item in self._handleInlineWrapper(m.group(2).strip()) :
+                            h.appendChild(item)
+                    else :
+                        message(CRITICAL, "We've got a problem header!")
+
+                elif paragraph :
+                    list = self._handleInlineWrapper("\n".join(paragraph))
+
+                    if ( parent_elem.nodeName == 'li'
+                         and not (looseList or parent_elem.childNodes)):
+
+                        #and not parent_elem.childNodes) :
+                        # If this is the first paragraph inside "li", don't
+                        # put <p> around it - append the paragraph bits directly
+                        # onto parent_elem
+                        el = parent_elem
+                    else :
+                        # Otherwise make a "p" element
+                        el = self.doc.createElement("p")
+                        parent_elem.appendChild(el)
+
+                    for item in list :
+                        el.appendChild(item)
+
+                if lines:
+                    lines = lines[1:]  # skip the first (blank) line
+
 
     def _processUList(self, parent_elem, lines, inList) :
@@ -1247,9 +1374,11 @@
             m = RE.regExp[expr].match(line)
             if m :
                 if expr in ['ul', 'ol'] :  # We are looking at a new item
-                    if m.group(1) :
-                        items.append([m.group(1)])
-                        item += 1
+                    #if m.group(1) :
+                    # Removed the check to allow for a blank line
+                    # at the beginning of the list item
+                    items.append([m.group(1)])
+                    item += 1
                 elif expr == 'tabbed' :  # This line needs to be detabbed
                     items[item].append(m.group(4)) #after the 'tab'
@@ -1333,46 +1462,37 @@
         detabbed, theRest = self.blockGuru.detectTabbed(lines)
 
         pre = self.doc.createElement('pre')
-        #code = self.doc.createElement('code')
+        code = self.doc.createElement('code')
         parent_elem.appendChild(pre)
-        #pre.appendChild(code)
+        pre.appendChild(code)
         text = "\n".join(detabbed).rstrip()+"\n"
         #text = text.replace("&", "&amp;")
-        pre.appendChild(self.doc.createTextNode(text))
+        code.appendChild(self.doc.createTextNode(text))
         self._processSection(parent_elem, theRest, inList)
 
-    def _handleInlineWrapper2 (self, line) :
+    def _handleInlineWrapper (self, line) :
 
         parts = [line]
 
-        #if not(line):
-        #    return [self.doc.createTextNode(' ')]
-
         for pattern in self.inlinePatterns :
 
-            #print
-            #print self.inlinePatterns.index(pattern)
-
             i = 0
 
-            #print parts
-
             while i < len(parts) :
 
                 x = parts[i]
-                #print i
+
                 if isinstance(x, (str, unicode)) :
                     result = self._applyPattern(x, pattern)
-                    #print result
-                    #print result
-                    #print parts, i
+
                     if result :
                         i -= 1
                         parts.remove(x)
                         for y in result :
                             parts.insert(i+1,y)
-
+
                 i += 1
 
         for i in range(len(parts)) :
@@ -1383,27 +1503,6 @@
 
         return parts
 
-
-    def _handleInlineWrapper (self, line) :
-
-        # A wrapper around _handleInline to avoid recursion
-
-        parts = [line]
-
-        i = 0
-
-        while i < len(parts) :
-            x = parts[i]
-            if isinstance(x, (str, unicode)) :
-                parts.remove(x)
-                result = self._handleInline(x)
-                for y in result :
-                    parts.insert(i,y)
-            else :
-                i += 1
-
-        return parts
-
     def _handleInline(self, line):
         """Transform a Markdown line with inline elements to an XHTML
            fragment.
@@ -1424,6 +1523,7 @@ class Markdown:
         return [self.doc.createTextNode(line)]
 
     def _applyPattern(self, line, pattern) :
+
         """ Given a pattern name, this function checks if the line
         fits the pattern, creates the necessary elements, and returns
         back a list consisting of NanoDom elements and/or strings.
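The renamed _handleInlineWrapper (formerly _handleInlineWrapper2) expands string fragments in place rather than recursing; the old recursion-avoiding wrapper it replaces is deleted above. A standalone toy version of the same splice-and-revisit loop (the splitter is a stand-in for _applyPattern, not a real inline pattern):

    def expand(parts, split_once):
        i = 0
        while i < len(parts):
            x = parts[i]
            if isinstance(x, (str, unicode)):
                pieces = split_once(x)
                if pieces:
                    parts[i:i+1] = pieces    # splice the pieces in and revisit them
                    continue
            i += 1
        return parts

    split_stars = lambda s: s.split("*", 2) if "*" in s else None
    print expand(["a *b* c"], split_stars)   # ['a ', 'b', ' c']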
@@ -1438,6 +1538,8 @@ class Markdown:
         # match the line to pattern's pre-compiled reg exp.
         # if no match, move on.
+
+
         m = pattern.getCompiledRegExp().match(line)
         if not m :
             return None
@@ -1446,6 +1548,40 @@
         # if it doesn't, move on
         node = pattern.handleMatch(m, self.doc)
 
+        # check if any of this nodes have children that need processing
+
+        if isinstance(node, Element):
+
+            if not node.nodeName in ["code", "pre"] :
+                for child in node.childNodes :
+                    if isinstance(child, TextNode):
+
+                        result = self._handleInlineWrapper(child.value)
+
+                        if result:
+
+                            if result == [child] :
+                                continue
+
+                            result.reverse()
+                            #to make insertion easier
+
+                            position = node.childNodes.index(child)
+
+                            node.removeChild(child)
+
+                            for item in result:
+
+                                if isinstance(item, (str, unicode)):
+                                    if len(item) > 0:
+                                        node.insertChild(position,
+                                             self.doc.createTextNode(item))
+                                else:
+                                    node.insertChild(position, item)
+
+
+
+
         if node :
             # Those are in the reverse order!
             return ( m.groups()[-1], # the string to the left
@@ -1455,7 +1591,7 @@
         else :
             return None
 
-    def __str__(self, source = None):
+    def convert (self, source = None):
         """Return the document in XHTML format.
 
            @returns: A serialized XHTML body."""
@@ -1463,6 +1599,14 @@
         if source :
             self.source = source
+
+        if not self.source :
+            return ""
+
+        self.source = removeBOM(self.source, self.encoding)
+
+        for pp in self.textPreprocessors:
+            self.source = pp.run(self.source)
 
         doc = self._transform()
         xml = doc.toxml()
@@ -1474,8 +1618,8 @@
         for i in range(self.htmlStash.html_counter) :
             html = self.htmlStash.rawHtmlBlocks[i]
-            if self.safeMode :
-                html = "[HTML_REMOVED]"
+            if self.safeMode and html != "
             xml = xml.replace("<p>%s\n</p>" % (HTML_PLACEHOLDER % i),
                               html + "\n")
@@ -1490,10 +1634,13 @@
         for pp in self.textPostprocessors :
             xml = pp.run(xml)
 
-        return xml
+        return (self.docType + xml).strip()
 
-    toString = __str__
+    __str__ = convert   # deprecated - will be changed in 1.7 to report
+                        # information about the MD instance
+
+    toString = __str__  # toString() method is deprecated
 
 
     def __unicode__(self):
@@ -1502,7 +1649,7 @@
 
         return str(self)#.decode(self.encoding)
 
-    toUnicode = __unicode__
+    toUnicode = __unicode__  # deprecated - will be removed in 1.7
 
 
 
@@ -1525,7 +1672,7 @@ def markdownFromFile(input = None,
     if not encoding :
         encoding = "utf-8"
 
-    input_file = codecs.open(input, mode="r", encoding="utf-8")
+    input_file = codecs.open(input, mode="r", encoding=encoding)
     text = input_file.read()
     input_file.close()
 
@@ -1559,23 +1706,21 @@ def markdown(text,
             pairs = [x.split("=") for x in ext[pos+1:-1].split(",")]
             configs = [(x.strip(), y.strip()) for (x, y) in pairs]
             extension_configs[name] = configs
-            #print configs
 
-    md = Markdown(text, extensions=extension_names,
+    md = Markdown(extensions=extension_names,
                   extension_configs=extension_configs,
                   safe_mode = safe_mode)
 
-    return md.toString()
+    return md.convert(text)
 
 
 class Extension :
 
-    def __init__(self, configs={}) :
+    def __init__(self, configs = {}) :
         self.config = configs
 
     def getConfig(self, key) :
         if self.config.has_key(key) :
-            #print self.config[key][0]
             return self.config[key][0]
         else :
             return ""
@@ -1653,10 +1798,8 @@ def parse_options() :
              'extensions' : options.extensions,
             'encoding' : options.encoding }
 
-def main():
+if __name__ == '__main__':
     """ Run Markdown from the command line. """
-    for a in ['-x', 'toc', '-x', 'tables', '-x', 'footnotes']:
-        sys.argv.append(a)
 
     options = parse_options()
 
@@ -1667,8 +1810,7 @@
 
     markdownFromFile(**options)
 
-if __name__ == '__main__':
-    main()
+