Fix txt2lrf processing of long unstructured txt files

2025-07-09 03:04:10 -04:00 · 2007-08-25 02:51:27 +00:00 · 2007-08-25 02:51:27 +00:00 · 58c15ac8b7
commit 58c15ac8b7
parent ae0b5d3168
2 changed files with 382 additions and 231 deletions
--- a/src/libprs500/ebooks/lrf/txt/convert_from.py
+++ b/src/libprs500/ebooks/lrf/txt/convert_from.py
@ -31,10 +31,12 @@ def option_parser():
              'the text in mybook.txt. Default is to try to autodetect.'
    parser.add_option('-e', '--encoding', action='store', type='string', \
                      dest='encoding', help=enchelp, default=None)
    parser.add_option('--debug-html-generation', action='store_true', default=False,
                      dest='debug_html_generation', help='Print generated HTML to stdout and quit.')
    return parser
-def generate_html(txtfile, encoding):
+def generate_html(txtfile, encoding, logger):
    '''
    Convert txtfile to html and return a PersistentTemporaryFile object pointing
    to the file with the HTML.
@ -54,12 +56,14 @@ def generate_html(txtfile, encoding):
            raise ConversionError, 'Could not detect encoding of %s'%(txtfile,)
    else:
        txt = codecs.open(txtfile, 'rb', enc).read()
    logger.info('Converting text to HTML...')
    md = markdown.Markdown(txt,
-                           extensions=['footnotes', 'tables', 'toc'],
+                       extensions=['footnotes', 'tables', 'toc'],
-                           encoding=enc,
+                       safe_mode=False,
-                           safe_mode=False,
+                       )
                           )
    html = md.toString()
    p = PersistentTemporaryFile('.html', dir=os.path.dirname(txtfile))
    p.close()
    codecs.open(p.name, 'wb', enc).write(html)
@ -73,14 +77,19 @@ def process_file(path, options, logger=None):
    txt = os.path.abspath(os.path.expanduser(path))
    if not hasattr(options, 'encoding'):
        options.encoding = None 
-    htmlfile = generate_html(txt, options.encoding)
+    if not hasattr(options, 'debug_html_generation'):
-    options.force_page_break = 'h2'
+        options.debug_html_generation = False
-    if not options.output:
+    htmlfile = generate_html(txt, options.encoding, logger)
-        ext = '.lrs' if options.lrs else '.lrf'
+    if not options.debug_html_generation:
-        options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
+        options.force_page_break = 'h2'
-    options.output = os.path.abspath(os.path.expanduser(options.output))                
+        if not options.output:
            ext = '.lrs' if options.lrs else '.lrf'
            options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
        options.output = os.path.abspath(os.path.expanduser(options.output))                
-    html_process_file(htmlfile.name, options, logger)        
+        html_process_file(htmlfile.name, options, logger)
    else:
        print open(htmlfile.name, 'rb').read()        
 def main(args=sys.argv, logger=None):
    parser = option_parser()    
--- a/src/libprs500/ebooks/markdown/markdown.py
+++ b/src/libprs500/ebooks/markdown/markdown.py
@ -1,43 +1,37 @@
 #!/usr/bin/env python
-# The following constant specifies the name used in the usage
+version = "1.6b"
-# statement displayed for python versions lower than 2.3.  (With
+version_info = (1,6,2,"rc-2")
-# python2.3 and higher the usage statement is generated by optparse
+__revision__ = "$Rev$"
 # and uses the actual name of the executable called.)
 EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
 SPEED_TEST = 0
 """
 ====================================================================
 IF YOU ARE LOOKING TO EXTEND MARKDOWN, SEE THE "FOOTNOTES" SECTION
 ====================================================================
 Python-Markdown
 ===============
 Converts Markdown to HTML.  Basic usage as a module:
    import markdown
-    html = markdown.markdown(your_text_string)
+    md = markdown.Markdown()
    html = md.convert(your_text_string)
 See http://www.freewisdom.org/projects/python-markdown/ for more
 information and instructions on how to extend the functionality of the
 script.  (You might want to read that before you try modifying this
 file.)
 Started by [Manfred Stienstra](http://www.dwerg.net/).  Continued and
 maintained  by [Yuri Takhteyev](http://www.freewisdom.org).
 Project website: http://www.freewisdom.org/projects/python-markdown
 Contact: yuri [at] freewisdom.org
 License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD
 Version: 1.5a (July 9, 2006)
 For changelog, see end of file
 """
 import re, sys, os, random, codecs
-# set debug level: 3 none, 2 critical, 1 informative, 0 all
+import re, sys, codecs
 # Set debug level: 3 none, 2 critical, 1 informative, 0 all
 (VERBOSE, INFO, CRITICAL, NONE) = range(4)
 MESSAGE_THRESHOLD = CRITICAL
@ -49,10 +43,45 @@ def message(level, text) :
 # --------------- CONSTANTS YOU MIGHT WANT TO MODIFY -----------------
-# all tabs will be expanded to up to this many spaces
+TAB_LENGTH = 4            # expand tabs to this many spaces
-TAB_LENGTH = 4
+ENABLE_ATTRIBUTES = True  # @id = xyz -> <... id="xyz">
-ENABLE_ATTRIBUTES = 1
+SMART_EMPHASIS = 1        # this_or_that does not become this<i>or</i>that
-SMART_EMPHASIS = 1
+HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
 RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
                    # from Hebrew to Nko (includes Arabic, Syriac and Thaana)
                    (u'\u2D30', u'\u2D7F'),
                    # Tifinagh
                    )
 # Unicode Reference Table:
 # 0590-05FF - Hebrew
 # 0600-06FF - Arabic
 # 0700-074F - Syriac
 # 0750-077F - Arabic Supplement
 # 0780-07BF - Thaana
 # 07C0-07FF - Nko
 BOMS = { 'utf-8' : (unicode(codecs.BOM_UTF8, "utf-8"), ),
         'utf-16' : (unicode(codecs.BOM_UTF16_LE, "utf-16"),
                     unicode(codecs.BOM_UTF16_BE, "utf-16")),
         #'utf-32' : (unicode(codecs.BOM_UTF32_LE, "utf-32"),
         #            unicode(codecs.BOM_UTF32_BE, "utf-32")),
         }
 def removeBOM(text, encoding):
    """Return text with a leading byte-order mark for encoding removed.

    text     -- the source string to clean up.
    encoding -- codec name used as the key into BOMS.

    An encoding with no entry in BOMS has no BOM registered here, so the
    text is returned unchanged rather than raising KeyError.
    """
    # .get() with an empty default keeps conversion working for codecs
    # that are not listed in BOMS.
    for bom in BOMS.get(encoding, ()):
        if text.startswith(bom):
            # lstrip() removes every leading character contained in bom.
            return text.lstrip(bom)
    return text
 # The following constant specifies the name used in the usage
 # statement displayed for python versions lower than 2.3.  (With
 # python2.3 and higher the usage statement is generated by optparse
 # and uses the actual name of the executable called.)
 EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
 # --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ----------
@ -82,14 +111,50 @@ Importantly, NanoDom does not do normalization, which is what we
 want. It also adds extra white space when converting DOM to string
 """
 ENTITY_NORMALIZATION_EXPRESSIONS = [ (re.compile("&"), "&amp;"),
                                     (re.compile("<"), "&lt;"),
                                     (re.compile(">"), "&gt;"),
                                     (re.compile("\""), "&quot;")]
 ENTITY_NORMALIZATION_EXPRESSIONS_SOFT = [ (re.compile("&(?!\#)"), "&amp;"),
                                     (re.compile("<"), "&lt;"),
                                     (re.compile(">"), "&gt;"),
                                     (re.compile("\""), "&quot;")]
 def getBidiType(text) :
    """Classify text as "rtl" or "ltr" from its first character.

    Returns None when text is empty, or when the first character is not
    an alphabetic unicode character.  Returns "rtl" when that character
    falls inside one of RTL_BIDI_RANGES, and "ltr" otherwise.
    """
    if not text :
        return None
    first = text[0]
    # Only alphabetic unicode characters are used to decide direction.
    if not (isinstance(first, unicode) and first.isalpha()) :
        return None
    for low, high in RTL_BIDI_RANGES :
        if low <= first <= high :
            return "rtl"
    return "ltr"
 class Document :
    def __init__ (self) :
        self.bidi = "ltr"
    def appendChild(self, child) :
        self.documentElement = child
        child.isDocumentElement = True
        child.parent = self
        self.entities = {}
    def setBidi(self, bidi) :
        if bidi :
            self.bidi = bidi
    def createElement(self, tag, textNode=None) :
        el = Element(tag)
        el.doc = self
@ -107,19 +172,23 @@ class Document :
            self.entities[entity] = EntityReference(entity)
        return self.entities[entity]
    def createCDATA(self, text) :
        node = CDATA(text)
        node.doc = self
        return node
    def toxml (self) :
        return self.documentElement.toxml()
-    def normalizeEntities(self, text) :
+    def normalizeEntities(self, text, avoidDoubleNormalizing=False) :
-        pairs = [ ("&", "&amp;"),
+        if avoidDoubleNormalizing :
-                  ("<", "&lt;"),
+            regexps = ENTITY_NORMALIZATION_EXPRESSIONS_SOFT
-                  (">", "&gt;"),
+        else :
-                  ("\"", "&quot;")]
+            regexps = ENTITY_NORMALIZATION_EXPRESSIONS
-
+        for regexp, substitution in regexps :
-        for old, new in pairs :
+            text = regexp.sub(substitution, text)
            text = text.replace(old, new)
        return text
    def find(self, test) :
@ -130,6 +199,19 @@ class Document :
        self.documentElement = None
 class CDATA :
    """Node whose text is serialized as a CDATA section."""
    type = "cdata"
    def __init__ (self, text) :
        # Raw payload; toxml() emits it verbatim between the markers.
        self.text = text
    def handleAttributes(self) :
        """Do nothing; Element calls handleAttributes() on every child."""
        return None
    def toxml (self) :
        """Return the stored text wrapped in CDATA start and end markers."""
        return "".join(("<![CDATA[", self.text, "]]>"))
 class Element :
    type = "element"
@ -140,6 +222,19 @@ class Element :
        self.attributes = []
        self.attribute_values = {}
        self.childNodes = []
        self.bidi = None
        self.isDocumentElement = False
    def setBidi(self, bidi) :
        if bidi :
            if not self.bidi or self.isDocumentElement:
                # Once the bidi is set don't change it (except for doc element)
                self.bidi = bidi
                self.parent.setBidi(bidi)
    def unlink(self) :
        for child in self.childNodes :
@ -186,27 +281,56 @@ class Element :
        if ENABLE_ATTRIBUTES :
            for child in self.childNodes:
                child.handleAttributes()
        buffer = ""
        if self.nodeName in ['h1', 'h2', 'h3', 'h4'] :
            buffer += "\n"
        elif self.nodeName in ['li'] :
            buffer += "\n "
        # Process children FIRST, then do the attributes
        childBuffer = ""
        if self.childNodes or self.nodeName in ['blockquote']:
            childBuffer += ">"
            for child in self.childNodes :
                childBuffer += child.toxml()
            if self.nodeName == 'p' :
                childBuffer += "\n"
            elif self.nodeName == 'li' :
                childBuffer += "\n "
            childBuffer += "</%s>" % self.nodeName
        else :
            childBuffer += "/>"
        buffer += "<" + self.nodeName
        if self.nodeName in ['p', 'li', 'ul', 'ol',
                             'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] :
            if not self.attribute_values.has_key("dir"):
                if self.bidi :
                    bidi = self.bidi
                else :
                    bidi = self.doc.bidi
                if bidi=="rtl" :
                    self.setAttribute("dir", "rtl")
        for attr in self.attributes :
            value = self.attribute_values[attr]
-            value = self.doc.normalizeEntities(value)
+            value = self.doc.normalizeEntities(value,
                                               avoidDoubleNormalizing=True)
            buffer += ' %s="%s"' % (attr, value)
-        if self.childNodes or self.nodeName in ['blockquote']:
+
-            buffer += ">"
+
-            for child in self.childNodes :
+        # Now let's actually append the children
-                buffer += child.toxml()
+
-            if self.nodeName == 'p' :
+        buffer += childBuffer
-                buffer += "\n"
+
            elif self.nodeName == 'li' :
                buffer += "\n "
            buffer += "</%s>" % self.nodeName
        else :
            buffer += "/>"
        if self.nodeName in ['p', 'li', 'ul', 'ol',
                             'h1', 'h2', 'h3', 'h4'] :
            buffer += "\n"
@ -223,13 +347,18 @@ class TextNode :
        self.value = text        
    def attributeCallback(self, match) :
        self.parent.setAttribute(match.group(1), match.group(2))
    def handleAttributes(self) :
        self.value = self.attrRegExp.sub(self.attributeCallback, self.value)
    def toxml(self) :
        text = self.value
        self.parent.setBidi(getBidiType(text))
        if not text.startswith(HTML_PLACEHOLDER_PREFIX):
            if self.parent.nodeName == "p" :
                text = text.replace("\n", "\n   ")
@ -262,10 +391,10 @@ class EntityReference:
 Preprocessors munge source text before we start doing anything too
 complicated.
-Each preprocessor implements a "run" method that takes a pointer to a list of lines of the document,
+Each preprocessor implements a "run" method that takes a pointer to a
-modifies it as necessary and returns either the same pointer or a
+list of lines of the document, modifies it as necessary and returns
-pointer to a new list.  Preprocessors must extend
+either the same pointer or a pointer to a new list.  Preprocessors
-markdown.Preprocessor.
+must extend markdown.Preprocessor.
 """
@ -305,10 +434,6 @@ class HeaderPreprocessor (Preprocessor):
                    lines[i] = "## " + lines[i].strip()
                    lines[i+1] = ""
        #for l in lines :
        #    print l.encode('utf8')
        #sys.exit(0)
        return lines
 HEADER_PREPROCESSOR = HeaderPreprocessor()
@ -362,10 +487,13 @@ class HtmlBlockPreprocessor (Preprocessor):
        return block.rstrip()[-len(left_tag)-2:-1].lower()
    def _equal_tags(self, left_tag, right_tag):
        if left_tag in ['?', '?php', 'div'] : # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        if (right_tag == "--" and left_tag == "--") :
            return True
        elif left_tag == right_tag[1:] \
            and right_tag[0] != "<":
            return True
@ -376,9 +504,10 @@ class HtmlBlockPreprocessor (Preprocessor):
        return (tag in ['hr', 'hr/'])
-    def run (self, lines) :
+    def run (self, text) :
        new_blocks = []
-        text = "\n".join(lines)
+        #text = "\n".join(lines)
        text = text.split("\n\n")
        items = []
@ -417,26 +546,31 @@ class HtmlBlockPreprocessor (Preprocessor):
                        new_blocks.append(
                            self.stash.store(block.strip()))
                        continue
-                    elif not block[1] == "!":
+                    else: #if not block[1] == "!":
                        # if is block level tag and is not complete
                        items.append(block.strip())
                        in_tag = True
                        continue
-                    
+
                new_blocks.append(block)
            else:
                items.append(block.strip())
                right_tag = self._get_right_tag(left_tag, block)
                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag
                    in_tag = False
                    new_blocks.append(
                        self.stash.store('\n\n'.join(items)))
                    items = []
-                    
+
-        return "\n\n".join(new_blocks).split("\n")
+        if items :
            new_blocks.append(self.stash.store('\n\n'.join(items)))
            new_blocks.append('\n')
        return "\n\n".join(new_blocks)   #.split("\n")
 HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
@ -609,15 +743,15 @@ class LinkPattern (Pattern):
    def handleMatch(self, m, doc) :
        el = doc.createElement('a')
        el.appendChild(doc.createTextNode(m.group(2)))
-        parts = m.group(9).split()
+        parts = m.group(9).split('"')
        # We should now have [], [href], or [href, title]
        if parts :
-            el.setAttribute('href', parts[0])
+            el.setAttribute('href', parts[0].strip())
        else :
            el.setAttribute('href', "")
        if len(parts) > 1 :
            # we also got a title
-            title = " ".join(parts[1:]).strip()
+            title = '"' + '"'.join(parts[1:]).strip()
            title = dequote(title) #.replace('"', "&quot;")
            el.setAttribute('title', title)
        return el
@ -645,12 +779,14 @@ class ImagePattern (Pattern):
 class ReferencePattern (Pattern):
    def handleMatch(self, m, doc):
        if m.group(9) :
            id = m.group(9).lower()
        else :
            # if we got something like "[Google][]"
            # we'll use "google" as the id
            id = m.group(2).lower()
        if not self.references.has_key(id) : # ignore undefined refs
            return None
        href, title = self.references[id]
@ -789,8 +925,7 @@ class BlockGuru :
                      remainder of the original list"""
        items = []
-        item = -1
+        
        i = 0 # to keep track of where we are
        for line in lines :
@ -908,11 +1043,11 @@ class Markdown:
        Markdown text """
-    def __init__(self, source=None,
+    def __init__(self, source=None,  # deprecated
-                 extensions=None,
+                 extensions=[],
                 extension_configs=None,
-                 encoding=None,
+                 encoding="utf-8",
-                 safe_mode = True):
+                 safe_mode = False):
        """Creates a new Markdown instance.
           @param source: The text in Markdown format.
@ -924,10 +1059,14 @@ class Markdown:
        self.blockGuru = BlockGuru()
        self.registeredExtensions = []
        self.stripTopLevelTags = 1
        self.docType = ""
-        self.preprocessors = [ HEADER_PREPROCESSOR,
+
        self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR]
        self.preprocessors = [ 
                               HEADER_PREPROCESSOR,
                               LINE_PREPROCESSOR,
                               HTML_BLOCK_PREPROCESSOR,
                               LINE_BREAKS_PREPROCESSOR,
                               # A footnote preprocessor will
                               # get inserted here
@ -979,6 +1118,7 @@ class Markdown:
        for ext in extensions :
            extension_module_name = "libprs500.ebooks.markdown.mdx_" + ext
            try :
                module = sys.modules[extension_module_name]
            except :
@ -991,6 +1131,7 @@ class Markdown:
                    configs_for_ext = configs[ext]
                else :
                    configs_for_ext = []
                extension = module.makeExtension(configs_for_ext)    
                extension.extendMarkdown(self, globals())
@ -1032,7 +1173,7 @@ class Markdown:
        self.doc.appendChild(self.top_element)
        # Fixup the source text
-        text = self.source.strip()
+        text = self.source #.strip()
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        text += "\n\n"
        text = text.expandtabs(TAB_LENGTH)
@ -1085,100 +1226,86 @@ class Markdown:
           @param inList: a level
           @returns: None"""
-        if not lines :
+        while lines:
-            return
+            # Check if this section starts with a list, a blockquote or
-
+            # a code block
-        # Check if this section starts with a list, a blockquote or
+    
-        # a code block
+            processFn = { 'ul' :     self._processUList,
-
+                          'ol' :     self._processOList,
-        processFn = { 'ul' :     self._processUList,
+                          'quoted' : self._processQuote,
-                      'ol' :     self._processOList,
+                          'tabbed' : self._processCodeBlock }
-                      'quoted' : self._processQuote,
+    
-                      'tabbed' : self._processCodeBlock }
+            for regexp in ['ul', 'ol', 'quoted', 'tabbed'] :
-
+                m = RE.regExp[regexp].match(lines[0])
        for regexp in ['ul', 'ol', 'quoted', 'tabbed'] :
            m = RE.regExp[regexp].match(lines[0])
            if m :
                try:
                    processFn[regexp](parent_elem, lines, inList)
                except RuntimeError:
                    print 'WARNING: Max recursion depth excedeeded, skipping section'
                    #print '\n'.join(lines)
                    #sys.exit()
                return
        # We are NOT looking at one of the high-level structures like
        # lists or blockquotes.  So, it's just a regular paragraph
        # (though perhaps nested inside a list or something else).  If
        # we are NOT inside a list, we just need to look for a blank
        # line to find the end of the block.  If we ARE inside a
        # list, however, we need to consider that a sublist does not
        # need to be separated by a blank line.  Rather, the following
        # markup is legal:
        #
        # * The top level list item
        #
        #     Another paragraph of the list.  This is where we are now.
        #     * Underneath we might have a sublist.
        #
        if inList :
            start, theRest = self._linesUntil(lines, (lambda line:
                             RE.regExp['ul'].match(line)
                             or RE.regExp['ol'].match(line)
                                              or not line.strip()))
            self._processSection(parent_elem, start,
                                 inList - 1, looseList = looseList)
            self._processSection(parent_elem, theRest,
                                 inList - 1, looseList = looseList)
        else : # Ok, so it's just a simple block
            paragraph, theRest = self._linesUntil(lines, lambda line:
                                                 not line.strip())
            if len(paragraph) and paragraph[0].startswith('#') :
                m = RE.regExp['header'].match(paragraph[0])
                if m :
-                    level = len(m.group(1))
+                    processFn[regexp](parent_elem, lines, inList)
-                    h = self.doc.createElement("h%d" % level)
+                    return
-                    parent_elem.appendChild(h)
+    
-                    for item in self._handleInlineWrapper2(m.group(2).strip()) :
+            # We are NOT looking at one of the high-level structures like
-                        h.appendChild(item)
+            # lists or blockquotes.  So, it's just a regular paragraph
-                else :
+            # (though perhaps nested inside a list or something else).  If
-                    message(CRITICAL, "We've got a problem header!")
+            # we are NOT inside a list, we just need to look for a blank
-
+            # line to find the end of the block.  If we ARE inside a
-            elif paragraph :
+            # list, however, we need to consider that a sublist does not
-
+            # need to be separated by a blank line.  Rather, the following
-                list = self._handleInlineWrapper2("\n".join(paragraph))
+            # markup is legal:
-
+            #
-                if ( parent_elem.nodeName == 'li'
+            # * The top level list item
-                     and not (looseList or parent_elem.childNodes)):
+            #
-
+            #     Another paragraph of the list.  This is where we are now.
-                    #and not parent_elem.childNodes) :
+            #     * Underneath we might have a sublist.
-                    # If this is the first paragraph inside "li", don't
+            #
-                    # put <p> around it - append the paragraph bits directly
+    
-                    # onto parent_elem
+            if inList :
-                    el = parent_elem
+    
-                else :
+                start, lines = self._linesUntil(lines, (lambda line:
-                    # Otherwise make a "p" element
+                                 RE.regExp['ul'].match(line)
-                    el = self.doc.createElement("p")
+                                 or RE.regExp['ol'].match(line)
-                    parent_elem.appendChild(el)
+                                                  or not line.strip()))
-
+    
-                for item in list :
+                self._processSection(parent_elem, start,
-                    el.appendChild(item)
+                                     inList - 1, looseList = looseList)
-
+                self._processSection(parent_elem, lines,
-            if theRest :
+                                     inList - 1, looseList = looseList)
-                theRest = theRest[1:]  # skip the first (blank) line
+    
-
+    
-            try:
+            else : # Ok, so it's just a simple block
-                self._processSection(parent_elem, theRest, inList)
+            
-            except RuntimeError: #Added by Kovid
+                paragraph, lines = self._linesUntil(lines, lambda line:
-                pass
+                                                 not line.strip())
-
+                if len(paragraph) and paragraph[0].startswith('#') :
                    m = RE.regExp['header'].match(paragraph[0])
                    if m :
                        level = len(m.group(1))
                        h = self.doc.createElement("h%d" % level)
                        parent_elem.appendChild(h)
                        for item in self._handleInlineWrapper(m.group(2).strip()) :
                            h.appendChild(item)
                    else :
                        message(CRITICAL, "We've got a problem header!")
                elif paragraph :
                    list = self._handleInlineWrapper("\n".join(paragraph))
                    if ( parent_elem.nodeName == 'li'
                         and not (looseList or parent_elem.childNodes)):
                        #and not parent_elem.childNodes) :
                        # If this is the first paragraph inside "li", don't
                        # put <p> around it - append the paragraph bits directly
                        # onto parent_elem
                        el = parent_elem
                    else :
                        # Otherwise make a "p" element
                        el = self.doc.createElement("p")
                        parent_elem.appendChild(el)
                    for item in list :
                        el.appendChild(item)
                if lines:
                    lines = lines[1:]  # skip the first (blank) line
    def _processUList(self, parent_elem, lines, inList) :
@ -1247,9 +1374,11 @@ class Markdown:
                m = RE.regExp[expr].match(line)
                if m :
                    if expr in ['ul', 'ol'] :  # We are looking at a new item
-                        if m.group(1) :
+                        #if m.group(1) :
-                            items.append([m.group(1)])
+                        # Removed the check to allow for a blank line
-                            item += 1
+                        # at the beginning of the list item
                        items.append([m.group(1)])
                        item += 1
                    elif expr == 'tabbed' :  # This line needs to be detabbed
                        items[item].append(m.group(4)) #after the 'tab'
@ -1333,46 +1462,37 @@ class Markdown:
        detabbed, theRest = self.blockGuru.detectTabbed(lines)
        pre = self.doc.createElement('pre')
-        #code = self.doc.createElement('code')
+        code = self.doc.createElement('code')
        parent_elem.appendChild(pre)
-        #pre.appendChild(code)
+        pre.appendChild(code)
        text = "\n".join(detabbed).rstrip()+"\n"
        #text = text.replace("&", "&amp;")
-        pre.appendChild(self.doc.createTextNode(text))
+        code.appendChild(self.doc.createTextNode(text))
        self._processSection(parent_elem, theRest, inList)
    def _handleInlineWrapper2 (self, line) :
    def _handleInlineWrapper (self, line) :
        parts = [line]
        #if not(line):
        #    return [self.doc.createTextNode(' ')]
        for pattern in self.inlinePatterns :
            #print
            #print self.inlinePatterns.index(pattern)
            i = 0
            #print parts
            while i < len(parts) :
                x = parts[i]
-                #print i
+
                if isinstance(x, (str, unicode)) :
                    result = self._applyPattern(x, pattern)
-                    #print result
+
                    #print result
                    #print parts, i
                    if result :
                        i -= 1
                        parts.remove(x)
                        for y in result :
                            parts.insert(i+1,y)
-                
+
                i += 1
        for i in range(len(parts)) :
@ -1383,27 +1503,6 @@ class Markdown:
        return parts
    def _handleInlineWrapper (self, line) :
        # A wrapper around _handleInline to avoid recursion
        parts = [line]
        i = 0
        while i < len(parts) :
            x = parts[i]
            if isinstance(x, (str, unicode)) :
                parts.remove(x)
                result = self._handleInline(x)
                for y in result :
                    parts.insert(i,y)
            else :
                i += 1
        return parts
    def _handleInline(self,  line):
        """Transform a Markdown line with inline elements to an XHTML
        fragment.
@ -1424,6 +1523,7 @@ class Markdown:
        return [self.doc.createTextNode(line)]
    def _applyPattern(self, line, pattern) :
        """ Given a pattern name, this function checks if the line
        fits the pattern, creates the necessary elements, and returns
        back a list consisting of NanoDom elements and/or strings.
@ -1438,6 +1538,8 @@ class Markdown:
        # match the line to pattern's pre-compiled reg exp.
        # if no match, move on.
        m = pattern.getCompiledRegExp().match(line)
        if not m :
            return None
@ -1446,6 +1548,40 @@ class Markdown:
        # if it doesn't, move on
        node = pattern.handleMatch(m, self.doc)
        # check if any of this nodes have children that need processing
        if isinstance(node, Element):
            if not node.nodeName in ["code", "pre"] :
                for child in node.childNodes :
                    if isinstance(child, TextNode):
                        result = self._handleInlineWrapper(child.value)
                        if result:
                            if result == [child] :
                                continue
                            result.reverse()
                            #to make insertion easier
                            position = node.childNodes.index(child)
                            node.removeChild(child)
                            for item in result:
                                if isinstance(item, (str, unicode)):
                                    if len(item) > 0:
                                        node.insertChild(position,
                                             self.doc.createTextNode(item))
                                else:
                                    node.insertChild(position, item)
        if node :
            # Those are in the reverse order!
            return ( m.groups()[-1], # the string to the left
@ -1455,7 +1591,7 @@ class Markdown:
        else :
            return None
-    def __str__(self, source = None):
+    def convert (self, source = None):
        """Return the document in XHTML format.
        @returns: A serialized XHTML body."""
@ -1463,6 +1599,14 @@ class Markdown:
        if source :
            self.source = source
        if not self.source :
            return ""
        self.source = removeBOM(self.source, self.encoding)
        for pp in self.textPreprocessors:
            self.source = pp.run(self.source)
        doc = self._transform()
        xml = doc.toxml()
@ -1474,8 +1618,8 @@ class Markdown:
        for i in range(self.htmlStash.html_counter) :
            html = self.htmlStash.rawHtmlBlocks[i]
-            if self.safeMode :
+            if self.safeMode and html != "<hr />" and html != "<br />":
-                html = "[HTML_REMOVED]"
+                html = HTML_REMOVED_TEXT
            xml = xml.replace("<p>%s\n</p>" % (HTML_PLACEHOLDER % i),
                              html + "\n")
@ -1490,10 +1634,13 @@ class Markdown:
        for pp in self.textPostprocessors :
            xml = pp.run(xml)
-        return xml
+        return (self.docType + xml).strip()
-    toString = __str__
+    __str__ = convert   # deprecated - will be changed in 1.7 to report
                        # information about the MD instance
    toString = __str__  # toString() method is deprecated
    def __unicode__(self):
@ -1502,7 +1649,7 @@ class Markdown:
        return str(self)#.decode(self.encoding)
-    toUnicode = __unicode__
+    toUnicode = __unicode__  # deprecated - will be removed in 1.7
@ -1525,7 +1672,7 @@ def markdownFromFile(input = None,
    if not encoding :
        encoding = "utf-8"
-    input_file = codecs.open(input, mode="r", encoding="utf-8")
+    input_file = codecs.open(input, mode="r", encoding=encoding)
    text = input_file.read()
    input_file.close()
@ -1559,23 +1706,21 @@ def markdown(text,
            pairs = [x.split("=") for x in ext[pos+1:-1].split(",")]
            configs = [(x.strip(), y.strip()) for (x, y) in pairs]
            extension_configs[name] = configs
            #print configs
-    md = Markdown(text, extensions=extension_names,
+    md = Markdown(extensions=extension_names,
                  extension_configs=extension_configs,
                  safe_mode = safe_mode)
-    return md.toString()
+    return md.convert(text)
 class Extension :
-    def __init__(self, configs={}) :
+    def __init__(self, configs = {}) :
        self.config = configs
    def getConfig(self, key) :
        if self.config.has_key(key) :
            #print self.config[key][0]
            return self.config[key][0]
        else :
            return ""
@ -1653,10 +1798,8 @@ def parse_options() :
            'extensions' : options.extensions,
            'encoding' : options.encoding }
-def main():
+if __name__ == '__main__':
    """ Run Markdown from the command line. """
    for a in ['-x', 'toc', '-x', 'tables', '-x', 'footnotes']:
        sys.argv.append(a)
    options = parse_options()
@ -1667,8 +1810,7 @@ def main():
    markdownFromFile(**options)
-if __name__ == '__main__':
+
    main()