Fix txt2lrf processing of long unstructured txt files

This commit is contained in:
Kovid Goyal 2007-08-25 02:51:27 +00:00
parent ae0b5d3168
commit 58c15ac8b7
2 changed files with 382 additions and 231 deletions

View File

@ -31,10 +31,12 @@ def option_parser():
'the text in mybook.txt. Default is to try to autodetect.'
parser.add_option('-e', '--encoding', action='store', type='string', \
dest='encoding', help=enchelp, default=None)
parser.add_option('--debug-html-generation', action='store_true', default=False,
dest='debug_html_generation', help='Print generated HTML to stdout and quit.')
return parser
def generate_html(txtfile, encoding):
def generate_html(txtfile, encoding, logger):
'''
Convert txtfile to html and return a PersistentTemporaryFile object pointing
to the file with the HTML.
@ -54,12 +56,14 @@ def generate_html(txtfile, encoding):
raise ConversionError, 'Could not detect encoding of %s'%(txtfile,)
else:
txt = codecs.open(txtfile, 'rb', enc).read()
logger.info('Converting text to HTML...')
md = markdown.Markdown(txt,
extensions=['footnotes', 'tables', 'toc'],
encoding=enc,
safe_mode=False,
)
html = md.toString()
p = PersistentTemporaryFile('.html', dir=os.path.dirname(txtfile))
p.close()
codecs.open(p.name, 'wb', enc).write(html)
@ -73,7 +77,10 @@ def process_file(path, options, logger=None):
txt = os.path.abspath(os.path.expanduser(path))
if not hasattr(options, 'encoding'):
options.encoding = None
htmlfile = generate_html(txt, options.encoding)
if not hasattr(options, 'debug_html_generation'):
options.debug_html_generation = False
htmlfile = generate_html(txt, options.encoding, logger)
if not options.debug_html_generation:
options.force_page_break = 'h2'
if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
@ -81,6 +88,8 @@ def process_file(path, options, logger=None):
options.output = os.path.abspath(os.path.expanduser(options.output))
html_process_file(htmlfile.name, options, logger)
else:
print open(htmlfile.name, 'rb').read()
def main(args=sys.argv, logger=None):
parser = option_parser()

View File

@ -1,43 +1,37 @@
#!/usr/bin/env python
# The following constant specifies the name used in the usage
# statement displayed for python versions lower than 2.3. (With
# python2.3 and higher the usage statement is generated by optparse
# and uses the actual name of the executable called.)
EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
SPEED_TEST = 0
version = "1.6b"
version_info = (1,6,2,"rc-2")
__revision__ = "$Rev$"
"""
====================================================================
IF YOU ARE LOOKING TO EXTEND MARKDOWN, SEE THE "FOOTNOTES" SECTION
====================================================================
Python-Markdown
===============
Converts Markdown to HTML. Basic usage as a module:
import markdown
html = markdown.markdown(your_text_string)
md = Markdown()
html = markdown.convert(your_text_string)
See http://www.freewisdom.org/projects/python-markdown/ for more
information and instructions on how to extend the functionality of the
script. (You might want to read that before you try modifying this
file.)
Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
maintained by [Yuri Takhteyev](http://www.freewisdom.org).
Project website: http://www.freewisdom.org/projects/python-markdown
Contact: yuri [at] freewisdom.org
License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD
Version: 1.5a (July 9, 2006)
For changelog, see end of file
"""
import re, sys, os, random, codecs
# set debug level: 3 none, 2 critical, 1 informative, 0 all
import re, sys, codecs
# Set debug level: 3 none, 2 critical, 1 informative, 0 all
(VERBOSE, INFO, CRITICAL, NONE) = range(4)
MESSAGE_THRESHOLD = CRITICAL
@ -49,10 +43,45 @@ def message(level, text) :
# --------------- CONSTANTS YOU MIGHT WANT TO MODIFY -----------------
# all tabs will be expanded to up to this many spaces
TAB_LENGTH = 4
ENABLE_ATTRIBUTES = 1
SMART_EMPHASIS = 1
TAB_LENGTH = 4 # expand tabs to this many spaces
ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz">
SMART_EMPHASIS = 1 # this_or_that does not become this<i>or</i>that
HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
# from Hebrew to Nko (includes Arabic, Syriac and Thaana)
(u'\u2D30', u'\u2D7F'),
# Tifinagh
)
# Unicode Reference Table:
# 0590-05FF - Hebrew
# 0600-06FF - Arabic
# 0700-074F - Syriac
# 0750-077F - Arabic Supplement
# 0780-07BF - Thaana
# 07C0-07FF - Nko
BOMS = { 'utf-8' : (unicode(codecs.BOM_UTF8, "utf-8"), ),
'utf-16' : (unicode(codecs.BOM_UTF16_LE, "utf-16"),
unicode(codecs.BOM_UTF16_BE, "utf-16")),
#'utf-32' : (unicode(codecs.BOM_UTF32_LE, "utf-32"),
# unicode(codecs.BOM_UTF32_BE, "utf-32")),
}
def removeBOM(text, encoding):
for bom in BOMS[encoding]:
if text.startswith(bom):
return text.lstrip(bom)
return text
# The following constant specifies the name used in the usage
# statement displayed for python versions lower than 2.3. (With
# python2.3 and higher the usage statement is generated by optparse
# and uses the actual name of the executable called.)
EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
# --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ----------
@ -82,14 +111,50 @@ Importantly, NanoDom does not do normalization, which is what we
want. It also adds extra white space when converting DOM to string
"""
ENTITY_NORMALIZATION_EXPRESSIONS = [ (re.compile("&"), "&amp;"),
(re.compile("<"), "&lt;"),
(re.compile(">"), "&gt;"),
(re.compile("\""), "&quot;")]
ENTITY_NORMALIZATION_EXPRESSIONS_SOFT = [ (re.compile("&(?!\#)"), "&amp;"),
(re.compile("<"), "&lt;"),
(re.compile(">"), "&gt;"),
(re.compile("\""), "&quot;")]
def getBidiType(text) :
if not text : return None
ch = text[0]
if not isinstance(ch, unicode) or not ch.isalpha():
return None
else :
for min, max in RTL_BIDI_RANGES :
if ( ch >= min and ch <= max ) :
return "rtl"
else :
return "ltr"
class Document :
def __init__ (self) :
self.bidi = "ltr"
def appendChild(self, child) :
self.documentElement = child
child.isDocumentElement = True
child.parent = self
self.entities = {}
def setBidi(self, bidi) :
if bidi :
self.bidi = bidi
def createElement(self, tag, textNode=None) :
el = Element(tag)
el.doc = self
@ -107,19 +172,23 @@ class Document :
self.entities[entity] = EntityReference(entity)
return self.entities[entity]
def createCDATA(self, text) :
node = CDATA(text)
node.doc = self
return node
def toxml (self) :
return self.documentElement.toxml()
def normalizeEntities(self, text) :
def normalizeEntities(self, text, avoidDoubleNormalizing=False) :
pairs = [ ("&", "&amp;"),
("<", "&lt;"),
(">", "&gt;"),
("\"", "&quot;")]
if avoidDoubleNormalizing :
regexps = ENTITY_NORMALIZATION_EXPRESSIONS_SOFT
else :
regexps = ENTITY_NORMALIZATION_EXPRESSIONS
for old, new in pairs :
text = text.replace(old, new)
for regexp, substitution in regexps :
text = regexp.sub(substitution, text)
return text
def find(self, test) :
@ -130,6 +199,19 @@ class Document :
self.documentElement = None
class CDATA :
type = "cdata"
def __init__ (self, text) :
self.text = text
def handleAttributes(self) :
pass
def toxml (self) :
return "<![CDATA[" + self.text + "]]>"
class Element :
type = "element"
@ -140,6 +222,19 @@ class Element :
self.attributes = []
self.attribute_values = {}
self.childNodes = []
self.bidi = None
self.isDocumentElement = False
def setBidi(self, bidi) :
if bidi :
if not self.bidi or self.isDocumentElement:
# Once the bidi is set don't change it (except for doc element)
self.bidi = bidi
self.parent.setBidi(bidi)
def unlink(self) :
for child in self.childNodes :
@ -186,27 +281,56 @@ class Element :
if ENABLE_ATTRIBUTES :
for child in self.childNodes:
child.handleAttributes()
buffer = ""
if self.nodeName in ['h1', 'h2', 'h3', 'h4'] :
buffer += "\n"
elif self.nodeName in ['li'] :
buffer += "\n "
# Process children FIRST, then do the attributes
childBuffer = ""
if self.childNodes or self.nodeName in ['blockquote']:
childBuffer += ">"
for child in self.childNodes :
childBuffer += child.toxml()
if self.nodeName == 'p' :
childBuffer += "\n"
elif self.nodeName == 'li' :
childBuffer += "\n "
childBuffer += "</%s>" % self.nodeName
else :
childBuffer += "/>"
buffer += "<" + self.nodeName
if self.nodeName in ['p', 'li', 'ul', 'ol',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] :
if not self.attribute_values.has_key("dir"):
if self.bidi :
bidi = self.bidi
else :
bidi = self.doc.bidi
if bidi=="rtl" :
self.setAttribute("dir", "rtl")
for attr in self.attributes :
value = self.attribute_values[attr]
value = self.doc.normalizeEntities(value)
value = self.doc.normalizeEntities(value,
avoidDoubleNormalizing=True)
buffer += ' %s="%s"' % (attr, value)
if self.childNodes or self.nodeName in ['blockquote']:
buffer += ">"
for child in self.childNodes :
buffer += child.toxml()
if self.nodeName == 'p' :
buffer += "\n"
elif self.nodeName == 'li' :
buffer += "\n "
buffer += "</%s>" % self.nodeName
else :
buffer += "/>"
# Now let's actually append the children
buffer += childBuffer
if self.nodeName in ['p', 'li', 'ul', 'ol',
'h1', 'h2', 'h3', 'h4'] :
buffer += "\n"
@ -223,13 +347,18 @@ class TextNode :
self.value = text
def attributeCallback(self, match) :
self.parent.setAttribute(match.group(1), match.group(2))
def handleAttributes(self) :
self.value = self.attrRegExp.sub(self.attributeCallback, self.value)
def toxml(self) :
text = self.value
self.parent.setBidi(getBidiType(text))
if not text.startswith(HTML_PLACEHOLDER_PREFIX):
if self.parent.nodeName == "p" :
text = text.replace("\n", "\n ")
@ -262,10 +391,10 @@ class EntityReference:
Preprocessors munge source text before we start doing anything too
complicated.
Each preprocessor implements a "run" method that takes a pointer to a list of lines of the document,
modifies it as necessary and returns either the same pointer or a
pointer to a new list. Preprocessors must extend
markdown.Preprocessor.
Each preprocessor implements a "run" method that takes a pointer to a
list of lines of the document, modifies it as necessary and returns
either the same pointer or a pointer to a new list. Preprocessors
must extend markdown.Preprocessor.
"""
@ -305,10 +434,6 @@ class HeaderPreprocessor (Preprocessor):
lines[i] = "## " + lines[i].strip()
lines[i+1] = ""
#for l in lines :
# print l.encode('utf8')
#sys.exit(0)
return lines
HEADER_PREPROCESSOR = HeaderPreprocessor()
@ -362,10 +487,13 @@ class HtmlBlockPreprocessor (Preprocessor):
return block.rstrip()[-len(left_tag)-2:-1].lower()
def _equal_tags(self, left_tag, right_tag):
if left_tag in ['?', '?php', 'div'] : # handle PHP, etc.
return True
if ("/" + left_tag) == right_tag:
return True
if (right_tag == "--" and left_tag == "--") :
return True
elif left_tag == right_tag[1:] \
and right_tag[0] != "<":
return True
@ -376,9 +504,10 @@ class HtmlBlockPreprocessor (Preprocessor):
return (tag in ['hr', 'hr/'])
def run (self, lines) :
def run (self, text) :
new_blocks = []
text = "\n".join(lines)
#text = "\n".join(lines)
text = text.split("\n\n")
items = []
@ -417,7 +546,7 @@ class HtmlBlockPreprocessor (Preprocessor):
new_blocks.append(
self.stash.store(block.strip()))
continue
elif not block[1] == "!":
else: #if not block[1] == "!":
# if is block level tag and is not complete
items.append(block.strip())
in_tag = True
@ -429,6 +558,7 @@ class HtmlBlockPreprocessor (Preprocessor):
items.append(block.strip())
right_tag = self._get_right_tag(left_tag, block)
if self._equal_tags(left_tag, right_tag):
# if find closing tag
in_tag = False
@ -436,7 +566,11 @@ class HtmlBlockPreprocessor (Preprocessor):
self.stash.store('\n\n'.join(items)))
items = []
return "\n\n".join(new_blocks).split("\n")
if items :
new_blocks.append(self.stash.store('\n\n'.join(items)))
new_blocks.append('\n')
return "\n\n".join(new_blocks) #.split("\n")
HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
@ -609,15 +743,15 @@ class LinkPattern (Pattern):
def handleMatch(self, m, doc) :
el = doc.createElement('a')
el.appendChild(doc.createTextNode(m.group(2)))
parts = m.group(9).split()
parts = m.group(9).split('"')
# We should now have [], [href], or [href, title]
if parts :
el.setAttribute('href', parts[0])
el.setAttribute('href', parts[0].strip())
else :
el.setAttribute('href', "")
if len(parts) > 1 :
# we also got a title
title = " ".join(parts[1:]).strip()
title = '"' + '"'.join(parts[1:]).strip()
title = dequote(title) #.replace('"', "&quot;")
el.setAttribute('title', title)
return el
@ -645,12 +779,14 @@ class ImagePattern (Pattern):
class ReferencePattern (Pattern):
def handleMatch(self, m, doc):
if m.group(9) :
id = m.group(9).lower()
else :
# if we got something like "[Google][]"
# we'll use "google" as the id
id = m.group(2).lower()
if not self.references.has_key(id) : # ignore undefined refs
return None
href, title = self.references[id]
@ -789,7 +925,6 @@ class BlockGuru :
remainder of the original list"""
items = []
item = -1
i = 0 # to keep track of where we are
@ -908,11 +1043,11 @@ class Markdown:
Markdown text """
def __init__(self, source=None,
extensions=None,
def __init__(self, source=None, # deprecated
extensions=[],
extension_configs=None,
encoding=None,
safe_mode = True):
encoding="utf-8",
safe_mode = False):
"""Creates a new Markdown instance.
@param source: The text in Markdown format.
@ -924,10 +1059,14 @@ class Markdown:
self.blockGuru = BlockGuru()
self.registeredExtensions = []
self.stripTopLevelTags = 1
self.docType = ""
self.preprocessors = [ HEADER_PREPROCESSOR,
self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR]
self.preprocessors = [
HEADER_PREPROCESSOR,
LINE_PREPROCESSOR,
HTML_BLOCK_PREPROCESSOR,
LINE_BREAKS_PREPROCESSOR,
# A footnote preprocessor will
# get inserted here
@ -979,6 +1118,7 @@ class Markdown:
for ext in extensions :
extension_module_name = "libprs500.ebooks.markdown.mdx_" + ext
try :
module = sys.modules[extension_module_name]
except :
@ -991,6 +1131,7 @@ class Markdown:
configs_for_ext = configs[ext]
else :
configs_for_ext = []
extension = module.makeExtension(configs_for_ext)
extension.extendMarkdown(self, globals())
@ -1032,7 +1173,7 @@ class Markdown:
self.doc.appendChild(self.top_element)
# Fixup the source text
text = self.source.strip()
text = self.source #.strip()
text = text.replace("\r\n", "\n").replace("\r", "\n")
text += "\n\n"
text = text.expandtabs(TAB_LENGTH)
@ -1085,9 +1226,7 @@ class Markdown:
@param inList: a level
@returns: None"""
if not lines :
return
while lines:
# Check if this section starts with a list, a blockquote or
# a code block
@ -1099,12 +1238,7 @@ class Markdown:
for regexp in ['ul', 'ol', 'quoted', 'tabbed'] :
m = RE.regExp[regexp].match(lines[0])
if m :
try:
processFn[regexp](parent_elem, lines, inList)
except RuntimeError:
print 'WARNING: Max recursion depth excedeeded, skipping section'
#print '\n'.join(lines)
#sys.exit()
return
# We are NOT looking at one of the high-level structures like
@ -1124,36 +1258,34 @@ class Markdown:
if inList :
start, theRest = self._linesUntil(lines, (lambda line:
start, lines = self._linesUntil(lines, (lambda line:
RE.regExp['ul'].match(line)
or RE.regExp['ol'].match(line)
or not line.strip()))
self._processSection(parent_elem, start,
inList - 1, looseList = looseList)
self._processSection(parent_elem, theRest,
self._processSection(parent_elem, lines,
inList - 1, looseList = looseList)
else : # Ok, so it's just a simple block
paragraph, theRest = self._linesUntil(lines, lambda line:
paragraph, lines = self._linesUntil(lines, lambda line:
not line.strip())
if len(paragraph) and paragraph[0].startswith('#') :
m = RE.regExp['header'].match(paragraph[0])
if m :
level = len(m.group(1))
h = self.doc.createElement("h%d" % level)
parent_elem.appendChild(h)
for item in self._handleInlineWrapper2(m.group(2).strip()) :
for item in self._handleInlineWrapper(m.group(2).strip()) :
h.appendChild(item)
else :
message(CRITICAL, "We've got a problem header!")
elif paragraph :
list = self._handleInlineWrapper2("\n".join(paragraph))
list = self._handleInlineWrapper("\n".join(paragraph))
if ( parent_elem.nodeName == 'li'
and not (looseList or parent_elem.childNodes)):
@ -1171,13 +1303,8 @@ class Markdown:
for item in list :
el.appendChild(item)
if theRest :
theRest = theRest[1:] # skip the first (blank) line
try:
self._processSection(parent_elem, theRest, inList)
except RuntimeError: #Added by Kovid
pass
if lines:
lines = lines[1:] # skip the first (blank) line
@ -1247,7 +1374,9 @@ class Markdown:
m = RE.regExp[expr].match(line)
if m :
if expr in ['ul', 'ol'] : # We are looking at a new item
if m.group(1) :
#if m.group(1) :
# Removed the check to allow for a blank line
# at the beginning of the list item
items.append([m.group(1)])
item += 1
elif expr == 'tabbed' : # This line needs to be detabbed
@ -1333,40 +1462,31 @@ class Markdown:
detabbed, theRest = self.blockGuru.detectTabbed(lines)
pre = self.doc.createElement('pre')
#code = self.doc.createElement('code')
code = self.doc.createElement('code')
parent_elem.appendChild(pre)
#pre.appendChild(code)
pre.appendChild(code)
text = "\n".join(detabbed).rstrip()+"\n"
#text = text.replace("&", "&amp;")
pre.appendChild(self.doc.createTextNode(text))
code.appendChild(self.doc.createTextNode(text))
self._processSection(parent_elem, theRest, inList)
def _handleInlineWrapper2 (self, line) :
def _handleInlineWrapper (self, line) :
parts = [line]
#if not(line):
# return [self.doc.createTextNode(' ')]
for pattern in self.inlinePatterns :
#print
#print self.inlinePatterns.index(pattern)
i = 0
#print parts
while i < len(parts) :
x = parts[i]
#print i
if isinstance(x, (str, unicode)) :
result = self._applyPattern(x, pattern)
#print result
#print result
#print parts, i
if result :
i -= 1
parts.remove(x)
@ -1383,27 +1503,6 @@ class Markdown:
return parts
def _handleInlineWrapper (self, line) :
# A wrapper around _handleInline to avoid recursion
parts = [line]
i = 0
while i < len(parts) :
x = parts[i]
if isinstance(x, (str, unicode)) :
parts.remove(x)
result = self._handleInline(x)
for y in result :
parts.insert(i,y)
else :
i += 1
return parts
def _handleInline(self, line):
"""Transform a Markdown line with inline elements to an XHTML
fragment.
@ -1424,6 +1523,7 @@ class Markdown:
return [self.doc.createTextNode(line)]
def _applyPattern(self, line, pattern) :
""" Given a pattern name, this function checks if the line
fits the pattern, creates the necessary elements, and returns
back a list consisting of NanoDom elements and/or strings.
@ -1438,6 +1538,8 @@ class Markdown:
# match the line to pattern's pre-compiled reg exp.
# if no match, move on.
m = pattern.getCompiledRegExp().match(line)
if not m :
return None
@ -1446,6 +1548,40 @@ class Markdown:
# if it doesn't, move on
node = pattern.handleMatch(m, self.doc)
# check if any of this nodes have children that need processing
if isinstance(node, Element):
if not node.nodeName in ["code", "pre"] :
for child in node.childNodes :
if isinstance(child, TextNode):
result = self._handleInlineWrapper(child.value)
if result:
if result == [child] :
continue
result.reverse()
#to make insertion easier
position = node.childNodes.index(child)
node.removeChild(child)
for item in result:
if isinstance(item, (str, unicode)):
if len(item) > 0:
node.insertChild(position,
self.doc.createTextNode(item))
else:
node.insertChild(position, item)
if node :
# Those are in the reverse order!
return ( m.groups()[-1], # the string to the left
@ -1455,7 +1591,7 @@ class Markdown:
else :
return None
def __str__(self, source = None):
def convert (self, source = None):
"""Return the document in XHTML format.
@returns: A serialized XHTML body."""
@ -1464,6 +1600,14 @@ class Markdown:
if source :
self.source = source
if not self.source :
return ""
self.source = removeBOM(self.source, self.encoding)
for pp in self.textPreprocessors:
self.source = pp.run(self.source)
doc = self._transform()
xml = doc.toxml()
@ -1474,8 +1618,8 @@ class Markdown:
for i in range(self.htmlStash.html_counter) :
html = self.htmlStash.rawHtmlBlocks[i]
if self.safeMode :
html = "[HTML_REMOVED]"
if self.safeMode and html != "<hr />" and html != "<br />":
html = HTML_REMOVED_TEXT
xml = xml.replace("<p>%s\n</p>" % (HTML_PLACEHOLDER % i),
html + "\n")
@ -1490,10 +1634,13 @@ class Markdown:
for pp in self.textPostprocessors :
xml = pp.run(xml)
return xml
return (self.docType + xml).strip()
toString = __str__
__str__ = convert # deprecated - will be changed in 1.7 to report
# information about the MD instance
toString = __str__ # toString() method is deprecated
def __unicode__(self):
@ -1502,7 +1649,7 @@ class Markdown:
return str(self)#.decode(self.encoding)
toUnicode = __unicode__
toUnicode = __unicode__ # deprecated - will be removed in 1.7
@ -1525,7 +1672,7 @@ def markdownFromFile(input = None,
if not encoding :
encoding = "utf-8"
input_file = codecs.open(input, mode="r", encoding="utf-8")
input_file = codecs.open(input, mode="r", encoding=encoding)
text = input_file.read()
input_file.close()
@ -1559,23 +1706,21 @@ def markdown(text,
pairs = [x.split("=") for x in ext[pos+1:-1].split(",")]
configs = [(x.strip(), y.strip()) for (x, y) in pairs]
extension_configs[name] = configs
#print configs
md = Markdown(text, extensions=extension_names,
md = Markdown(extensions=extension_names,
extension_configs=extension_configs,
safe_mode = safe_mode)
return md.toString()
return md.convert(text)
class Extension :
def __init__(self, configs={}) :
def __init__(self, configs = {}) :
self.config = configs
def getConfig(self, key) :
if self.config.has_key(key) :
#print self.config[key][0]
return self.config[key][0]
else :
return ""
@ -1653,10 +1798,8 @@ def parse_options() :
'extensions' : options.extensions,
'encoding' : options.encoding }
def main():
if __name__ == '__main__':
""" Run Markdown from the command line. """
for a in ['-x', 'toc', '-x', 'tables', '-x', 'footnotes']:
sys.argv.append(a)
options = parse_options()
@ -1667,8 +1810,7 @@ def main():
markdownFromFile(**options)
if __name__ == '__main__':
main()