Fix txt2lrf processing of long unstructured txt files

This commit is contained in:
Kovid Goyal 2007-08-25 02:51:27 +00:00
parent ae0b5d3168
commit 58c15ac8b7
2 changed files with 382 additions and 231 deletions

View File

@ -31,10 +31,12 @@ def option_parser():
'the text in mybook.txt. Default is to try to autodetect.' 'the text in mybook.txt. Default is to try to autodetect.'
parser.add_option('-e', '--encoding', action='store', type='string', \ parser.add_option('-e', '--encoding', action='store', type='string', \
dest='encoding', help=enchelp, default=None) dest='encoding', help=enchelp, default=None)
parser.add_option('--debug-html-generation', action='store_true', default=False,
dest='debug_html_generation', help='Print generated HTML to stdout and quit.')
return parser return parser
def generate_html(txtfile, encoding): def generate_html(txtfile, encoding, logger):
''' '''
Convert txtfile to html and return a PersistentTemporaryFile object pointing Convert txtfile to html and return a PersistentTemporaryFile object pointing
to the file with the HTML. to the file with the HTML.
@ -54,12 +56,14 @@ def generate_html(txtfile, encoding):
raise ConversionError, 'Could not detect encoding of %s'%(txtfile,) raise ConversionError, 'Could not detect encoding of %s'%(txtfile,)
else: else:
txt = codecs.open(txtfile, 'rb', enc).read() txt = codecs.open(txtfile, 'rb', enc).read()
logger.info('Converting text to HTML...')
md = markdown.Markdown(txt, md = markdown.Markdown(txt,
extensions=['footnotes', 'tables', 'toc'], extensions=['footnotes', 'tables', 'toc'],
encoding=enc, safe_mode=False,
safe_mode=False, )
)
html = md.toString() html = md.toString()
p = PersistentTemporaryFile('.html', dir=os.path.dirname(txtfile)) p = PersistentTemporaryFile('.html', dir=os.path.dirname(txtfile))
p.close() p.close()
codecs.open(p.name, 'wb', enc).write(html) codecs.open(p.name, 'wb', enc).write(html)
@ -73,14 +77,19 @@ def process_file(path, options, logger=None):
txt = os.path.abspath(os.path.expanduser(path)) txt = os.path.abspath(os.path.expanduser(path))
if not hasattr(options, 'encoding'): if not hasattr(options, 'encoding'):
options.encoding = None options.encoding = None
htmlfile = generate_html(txt, options.encoding) if not hasattr(options, 'debug_html_generation'):
options.force_page_break = 'h2' options.debug_html_generation = False
if not options.output: htmlfile = generate_html(txt, options.encoding, logger)
ext = '.lrs' if options.lrs else '.lrf' if not options.debug_html_generation:
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) options.force_page_break = 'h2'
options.output = os.path.abspath(os.path.expanduser(options.output)) if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
options.output = os.path.abspath(os.path.expanduser(options.output))
html_process_file(htmlfile.name, options, logger) html_process_file(htmlfile.name, options, logger)
else:
print open(htmlfile.name, 'rb').read()
def main(args=sys.argv, logger=None): def main(args=sys.argv, logger=None):
parser = option_parser() parser = option_parser()

View File

@ -1,43 +1,37 @@
#!/usr/bin/env python #!/usr/bin/env python
# The following constant specifies the name used in the usage version = "1.6b"
# statement displayed for python versions lower than 2.3. (With version_info = (1,6,2,"rc-2")
# python2.3 and higher the usage statement is generated by optparse __revision__ = "$Rev$"
# and uses the actual name of the executable called.)
EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
SPEED_TEST = 0
""" """
====================================================================
IF YOA ARE LOOKING TO EXTEND MARKDOWN, SEE THE "FOOTNOTES" SECTION
====================================================================
Python-Markdown Python-Markdown
=============== ===============
Converts Markdown to HTML. Basic usage as a module: Converts Markdown to HTML. Basic usage as a module:
import markdown import markdown
html = markdown.markdown(your_text_string) md = Markdown()
html = md.convert(your_text_string)
See http://www.freewisdom.org/projects/python-markdown/ for more
information and instructions on how to extend the functionality of the
script. (You might want to read that before you try modifying this
file.)
Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
maintained by [Yuri Takhteyev](http://www.freewisdom.org). maintained by [Yuri Takhteyev](http://www.freewisdom.org).
Project website: http://www.freewisdom.org/projects/python-markdown
Contact: yuri [at] freewisdom.org Contact: yuri [at] freewisdom.org
License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD
Version: 1.5a (July 9, 2006)
For changelog, see end of file
""" """
import re, sys, os, random, codecs
# set debug level: 3 none, 2 critical, 1 informative, 0 all import re, sys, codecs
# Set debug level: 3 none, 2 critical, 1 informative, 0 all
(VERBOSE, INFO, CRITICAL, NONE) = range(4) (VERBOSE, INFO, CRITICAL, NONE) = range(4)
MESSAGE_THRESHOLD = CRITICAL MESSAGE_THRESHOLD = CRITICAL
@ -49,10 +43,45 @@ def message(level, text) :
# --------------- CONSTANTS YOU MIGHT WANT TO MODIFY ----------------- # --------------- CONSTANTS YOU MIGHT WANT TO MODIFY -----------------
# all tabs will be expanded to up to this many spaces TAB_LENGTH = 4 # expand tabs to this many spaces
TAB_LENGTH = 4 ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz">
ENABLE_ATTRIBUTES = 1 SMART_EMPHASIS = 1 # this_or_that does not become this<i>or</i>that
SMART_EMPHASIS = 1 HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
# from Hebrew to Nko (includes Arabic, Syriac and Thaana)
(u'\u2D30', u'\u2D7F'),
# Tifinagh
)
# Unicode Reference Table:
# 0590-05FF - Hebrew
# 0600-06FF - Arabic
# 0700-074F - Syriac
# 0750-077F - Arabic Supplement
# 0780-07BF - Thaana
# 07C0-07FF - Nko
# Byte-order marks as they appear in *decoded* text, keyed by encoding name.
# Each mark is decoded with an endianness-specific codec (or plain utf-8):
# the generic "utf-16" codec consumes the BOM while decoding and would leave
# an empty string here, so the entry could never match anything.
BOMS = { 'utf-8' : (codecs.BOM_UTF8.decode("utf-8"), ),
         'utf-16' : (codecs.BOM_UTF16_LE.decode("utf-16-le"),
                     codecs.BOM_UTF16_BE.decode("utf-16-be")),
         #'utf-32' : (codecs.BOM_UTF32_LE.decode("utf-32-le"),
         #            codecs.BOM_UTF32_BE.decode("utf-32-be")),
       }

def removeBOM(text, encoding):
    """Return text with any leading byte-order mark for encoding removed.

    @param text: the already decoded source text.
    @param encoding: key into BOMS; an encoding without an entry has no
                     known mark, so the text is returned unchanged.
    @returns: text without its leading byte-order mark(s).
    """
    # .get() keeps unknown encodings (e.g. "latin-1") from raising KeyError.
    for bom in BOMS.get(encoding, ()):
        if not bom:
            # An empty marker matches every string and removes nothing.
            continue
        if text.startswith(bom):
            # Drop the marker as a prefix.  lstrip() would treat it as a
            # set of characters and could eat real content after the mark.
            while text.startswith(bom):
                text = text[len(bom):]
            return text
    return text
# The following constant specifies the name used in the usage
# statement displayed for python versions lower than 2.3. (With
# python2.3 and higher the usage statement is generated by optparse
# and uses the actual name of the executable called.)
EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
# --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ---------- # --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ----------
@ -82,14 +111,50 @@ Importantly, NanoDom does not do normalization, which is what we
want. It also adds extra white space when converting DOM to string want. It also adds extra white space when converting DOM to string
""" """
ENTITY_NORMALIZATION_EXPRESSIONS = [ (re.compile("&"), "&amp;"),
(re.compile("<"), "&lt;"),
(re.compile(">"), "&gt;"),
(re.compile("\""), "&quot;")]
ENTITY_NORMALIZATION_EXPRESSIONS_SOFT = [ (re.compile("&(?!\#)"), "&amp;"),
(re.compile("<"), "&lt;"),
(re.compile(">"), "&gt;"),
(re.compile("\""), "&quot;")]
def getBidiType(text):
    """Classify the direction of text by looking at its first character.

    @param text: the string to inspect; may be empty or None.
    @returns: None when text is empty or its first character is not an
              alphabetic unicode character, "rtl" when that character lies
              inside one of RTL_BIDI_RANGES, and "ltr" otherwise.
    """
    if not text:
        return None

    first = text[0]
    if not (isinstance(first, unicode) and first.isalpha()):
        return None

    # Each entry of RTL_BIDI_RANGES is an inclusive (lowest, highest) pair.
    for lowest, highest in RTL_BIDI_RANGES:
        if lowest <= first <= highest:
            return "rtl"
    return "ltr"
class Document : class Document :
def __init__ (self) :
self.bidi = "ltr"
def appendChild(self, child) : def appendChild(self, child) :
self.documentElement = child self.documentElement = child
child.isDocumentElement = True
child.parent = self child.parent = self
self.entities = {} self.entities = {}
def setBidi(self, bidi):
    """Store bidi as this object's direction; a falsy value is ignored."""
    if not bidi:
        return
    self.bidi = bidi
def createElement(self, tag, textNode=None) : def createElement(self, tag, textNode=None) :
el = Element(tag) el = Element(tag)
el.doc = self el.doc = self
@ -107,19 +172,23 @@ class Document :
self.entities[entity] = EntityReference(entity) self.entities[entity] = EntityReference(entity)
return self.entities[entity] return self.entities[entity]
def createCDATA(self, text):
    """Create a CDATA node for text and attach this object as its doc.

    @param text: content handed unchanged to the CDATA constructor.
    @returns: the new CDATA node.
    """
    cdata = CDATA(text)
    cdata.doc = self  # back-reference from the node to its creator
    return cdata
def toxml (self) : def toxml (self) :
return self.documentElement.toxml() return self.documentElement.toxml()
def normalizeEntities(self, text) : def normalizeEntities(self, text, avoidDoubleNormalizing=False) :
pairs = [ ("&", "&amp;"), if avoidDoubleNormalizing :
("<", "&lt;"), regexps = ENTITY_NORMALIZATION_EXPRESSIONS_SOFT
(">", "&gt;"), else :
("\"", "&quot;")] regexps = ENTITY_NORMALIZATION_EXPRESSIONS
for regexp, substitution in regexps :
for old, new in pairs : text = regexp.sub(substitution, text)
text = text.replace(old, new)
return text return text
def find(self, test) : def find(self, test) :
@ -130,6 +199,19 @@ class Document :
self.documentElement = None self.documentElement = None
class CDATA:
    """Node whose text is serialized inside a CDATA section."""

    type = "cdata"

    def __init__(self, text):
        # Kept as given; toxml() places it between the CDATA delimiters.
        self.text = text

    def handleAttributes(self):
        """Do nothing; this node has no attribute processing of its own."""

    def toxml(self):
        """Return the text wrapped in "<![CDATA[" and "]]>"."""
        return "".join(("<![CDATA[", self.text, "]]>"))
class Element : class Element :
type = "element" type = "element"
@ -140,6 +222,19 @@ class Element :
self.attributes = [] self.attributes = []
self.attribute_values = {} self.attribute_values = {}
self.childNodes = [] self.childNodes = []
self.bidi = None
self.isDocumentElement = False
def setBidi(self, bidi) :
if bidi :
if not self.bidi or self.isDocumentElement:
# Once the bidi is set don't change it (except for doc element)
self.bidi = bidi
self.parent.setBidi(bidi)
def unlink(self) : def unlink(self) :
for child in self.childNodes : for child in self.childNodes :
@ -186,27 +281,56 @@ class Element :
if ENABLE_ATTRIBUTES : if ENABLE_ATTRIBUTES :
for child in self.childNodes: for child in self.childNodes:
child.handleAttributes() child.handleAttributes()
buffer = "" buffer = ""
if self.nodeName in ['h1', 'h2', 'h3', 'h4'] : if self.nodeName in ['h1', 'h2', 'h3', 'h4'] :
buffer += "\n" buffer += "\n"
elif self.nodeName in ['li'] : elif self.nodeName in ['li'] :
buffer += "\n " buffer += "\n "
# Process children FIRST, then do the attributes
childBuffer = ""
if self.childNodes or self.nodeName in ['blockquote']:
childBuffer += ">"
for child in self.childNodes :
childBuffer += child.toxml()
if self.nodeName == 'p' :
childBuffer += "\n"
elif self.nodeName == 'li' :
childBuffer += "\n "
childBuffer += "</%s>" % self.nodeName
else :
childBuffer += "/>"
buffer += "<" + self.nodeName buffer += "<" + self.nodeName
if self.nodeName in ['p', 'li', 'ul', 'ol',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] :
if not self.attribute_values.has_key("dir"):
if self.bidi :
bidi = self.bidi
else :
bidi = self.doc.bidi
if bidi=="rtl" :
self.setAttribute("dir", "rtl")
for attr in self.attributes : for attr in self.attributes :
value = self.attribute_values[attr] value = self.attribute_values[attr]
value = self.doc.normalizeEntities(value) value = self.doc.normalizeEntities(value,
avoidDoubleNormalizing=True)
buffer += ' %s="%s"' % (attr, value) buffer += ' %s="%s"' % (attr, value)
if self.childNodes or self.nodeName in ['blockquote']:
buffer += ">"
for child in self.childNodes : # Now let's actually append the children
buffer += child.toxml()
if self.nodeName == 'p' : buffer += childBuffer
buffer += "\n"
elif self.nodeName == 'li' :
buffer += "\n "
buffer += "</%s>" % self.nodeName
else :
buffer += "/>"
if self.nodeName in ['p', 'li', 'ul', 'ol', if self.nodeName in ['p', 'li', 'ul', 'ol',
'h1', 'h2', 'h3', 'h4'] : 'h1', 'h2', 'h3', 'h4'] :
buffer += "\n" buffer += "\n"
@ -223,13 +347,18 @@ class TextNode :
self.value = text self.value = text
def attributeCallback(self, match) : def attributeCallback(self, match) :
self.parent.setAttribute(match.group(1), match.group(2)) self.parent.setAttribute(match.group(1), match.group(2))
def handleAttributes(self) : def handleAttributes(self) :
self.value = self.attrRegExp.sub(self.attributeCallback, self.value) self.value = self.attrRegExp.sub(self.attributeCallback, self.value)
def toxml(self) : def toxml(self) :
text = self.value text = self.value
self.parent.setBidi(getBidiType(text))
if not text.startswith(HTML_PLACEHOLDER_PREFIX): if not text.startswith(HTML_PLACEHOLDER_PREFIX):
if self.parent.nodeName == "p" : if self.parent.nodeName == "p" :
text = text.replace("\n", "\n ") text = text.replace("\n", "\n ")
@ -262,10 +391,10 @@ class EntityReference:
Preprocessors munge source text before we start doing anything too Preprocessors munge source text before we start doing anything too
complicated. complicated.
Each preprocessor implements a "run" method that takes a pointer to a list of lines of the document, Each preprocessor implements a "run" method that takes a pointer to a
modifies it as necessary and returns either the same pointer or a list of lines of the document, modifies it as necessary and returns
pointer to a new list. Preprocessors must extend either the same pointer or a pointer to a new list. Preprocessors
markdown.Preprocessor. must extend markdown.Preprocessor.
""" """
@ -305,10 +434,6 @@ class HeaderPreprocessor (Preprocessor):
lines[i] = "## " + lines[i].strip() lines[i] = "## " + lines[i].strip()
lines[i+1] = "" lines[i+1] = ""
#for l in lines :
# print l.encode('utf8')
#sys.exit(0)
return lines return lines
HEADER_PREPROCESSOR = HeaderPreprocessor() HEADER_PREPROCESSOR = HeaderPreprocessor()
@ -362,10 +487,13 @@ class HtmlBlockPreprocessor (Preprocessor):
return block.rstrip()[-len(left_tag)-2:-1].lower() return block.rstrip()[-len(left_tag)-2:-1].lower()
def _equal_tags(self, left_tag, right_tag): def _equal_tags(self, left_tag, right_tag):
if left_tag in ['?', '?php', 'div'] : # handle PHP, etc. if left_tag in ['?', '?php', 'div'] : # handle PHP, etc.
return True return True
if ("/" + left_tag) == right_tag: if ("/" + left_tag) == right_tag:
return True return True
if (right_tag == "--" and left_tag == "--") :
return True
elif left_tag == right_tag[1:] \ elif left_tag == right_tag[1:] \
and right_tag[0] != "<": and right_tag[0] != "<":
return True return True
@ -376,9 +504,10 @@ class HtmlBlockPreprocessor (Preprocessor):
return (tag in ['hr', 'hr/']) return (tag in ['hr', 'hr/'])
def run (self, lines) : def run (self, text) :
new_blocks = [] new_blocks = []
text = "\n".join(lines) #text = "\n".join(lines)
text = text.split("\n\n") text = text.split("\n\n")
items = [] items = []
@ -417,26 +546,31 @@ class HtmlBlockPreprocessor (Preprocessor):
new_blocks.append( new_blocks.append(
self.stash.store(block.strip())) self.stash.store(block.strip()))
continue continue
elif not block[1] == "!": else: #if not block[1] == "!":
# if is block level tag and is not complete # if is block level tag and is not complete
items.append(block.strip()) items.append(block.strip())
in_tag = True in_tag = True
continue continue
new_blocks.append(block) new_blocks.append(block)
else: else:
items.append(block.strip()) items.append(block.strip())
right_tag = self._get_right_tag(left_tag, block) right_tag = self._get_right_tag(left_tag, block)
if self._equal_tags(left_tag, right_tag): if self._equal_tags(left_tag, right_tag):
# if find closing tag # if find closing tag
in_tag = False in_tag = False
new_blocks.append( new_blocks.append(
self.stash.store('\n\n'.join(items))) self.stash.store('\n\n'.join(items)))
items = [] items = []
return "\n\n".join(new_blocks).split("\n") if items :
new_blocks.append(self.stash.store('\n\n'.join(items)))
new_blocks.append('\n')
return "\n\n".join(new_blocks) #.split("\n")
HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor() HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
@ -609,15 +743,15 @@ class LinkPattern (Pattern):
def handleMatch(self, m, doc) : def handleMatch(self, m, doc) :
el = doc.createElement('a') el = doc.createElement('a')
el.appendChild(doc.createTextNode(m.group(2))) el.appendChild(doc.createTextNode(m.group(2)))
parts = m.group(9).split() parts = m.group(9).split('"')
# We should now have [], [href], or [href, title] # We should now have [], [href], or [href, title]
if parts : if parts :
el.setAttribute('href', parts[0]) el.setAttribute('href', parts[0].strip())
else : else :
el.setAttribute('href', "") el.setAttribute('href', "")
if len(parts) > 1 : if len(parts) > 1 :
# we also got a title # we also got a title
title = " ".join(parts[1:]).strip() title = '"' + '"'.join(parts[1:]).strip()
title = dequote(title) #.replace('"', "&quot;") title = dequote(title) #.replace('"', "&quot;")
el.setAttribute('title', title) el.setAttribute('title', title)
return el return el
@ -645,12 +779,14 @@ class ImagePattern (Pattern):
class ReferencePattern (Pattern): class ReferencePattern (Pattern):
def handleMatch(self, m, doc): def handleMatch(self, m, doc):
if m.group(9) : if m.group(9) :
id = m.group(9).lower() id = m.group(9).lower()
else : else :
# if we got something like "[Google][]" # if we got something like "[Google][]"
# we'll use "google" as the id # we'll use "google" as the id
id = m.group(2).lower() id = m.group(2).lower()
if not self.references.has_key(id) : # ignore undefined refs if not self.references.has_key(id) : # ignore undefined refs
return None return None
href, title = self.references[id] href, title = self.references[id]
@ -789,8 +925,7 @@ class BlockGuru :
remainder of the original list""" remainder of the original list"""
items = [] items = []
item = -1
i = 0 # to keep track of where we are i = 0 # to keep track of where we are
for line in lines : for line in lines :
@ -908,11 +1043,11 @@ class Markdown:
Markdown text """ Markdown text """
def __init__(self, source=None, def __init__(self, source=None, # deprecated
extensions=None, extensions=[],
extension_configs=None, extension_configs=None,
encoding=None, encoding="utf-8",
safe_mode = True): safe_mode = False):
"""Creates a new Markdown instance. """Creates a new Markdown instance.
@param source: The text in Markdown format. @param source: The text in Markdown format.
@ -924,10 +1059,14 @@ class Markdown:
self.blockGuru = BlockGuru() self.blockGuru = BlockGuru()
self.registeredExtensions = [] self.registeredExtensions = []
self.stripTopLevelTags = 1 self.stripTopLevelTags = 1
self.docType = ""
self.preprocessors = [ HEADER_PREPROCESSOR,
self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR]
self.preprocessors = [
HEADER_PREPROCESSOR,
LINE_PREPROCESSOR, LINE_PREPROCESSOR,
HTML_BLOCK_PREPROCESSOR,
LINE_BREAKS_PREPROCESSOR, LINE_BREAKS_PREPROCESSOR,
# A footnote preprocessor will # A footnote preprocessor will
# get inserted here # get inserted here
@ -979,6 +1118,7 @@ class Markdown:
for ext in extensions : for ext in extensions :
extension_module_name = "libprs500.ebooks.markdown.mdx_" + ext extension_module_name = "libprs500.ebooks.markdown.mdx_" + ext
try : try :
module = sys.modules[extension_module_name] module = sys.modules[extension_module_name]
except : except :
@ -991,6 +1131,7 @@ class Markdown:
configs_for_ext = configs[ext] configs_for_ext = configs[ext]
else : else :
configs_for_ext = [] configs_for_ext = []
extension = module.makeExtension(configs_for_ext) extension = module.makeExtension(configs_for_ext)
extension.extendMarkdown(self, globals()) extension.extendMarkdown(self, globals())
@ -1032,7 +1173,7 @@ class Markdown:
self.doc.appendChild(self.top_element) self.doc.appendChild(self.top_element)
# Fixup the source text # Fixup the source text
text = self.source.strip() text = self.source #.strip()
text = text.replace("\r\n", "\n").replace("\r", "\n") text = text.replace("\r\n", "\n").replace("\r", "\n")
text += "\n\n" text += "\n\n"
text = text.expandtabs(TAB_LENGTH) text = text.expandtabs(TAB_LENGTH)
@ -1085,100 +1226,86 @@ class Markdown:
@param inList: a level @param inList: a level
@returns: None""" @returns: None"""
if not lines : while lines:
return # Check if this section starts with a list, a blockquote or
# a code block
# Check if this section starts with a list, a blockquote or
# a code block processFn = { 'ul' : self._processUList,
'ol' : self._processOList,
processFn = { 'ul' : self._processUList, 'quoted' : self._processQuote,
'ol' : self._processOList, 'tabbed' : self._processCodeBlock }
'quoted' : self._processQuote,
'tabbed' : self._processCodeBlock } for regexp in ['ul', 'ol', 'quoted', 'tabbed'] :
m = RE.regExp[regexp].match(lines[0])
for regexp in ['ul', 'ol', 'quoted', 'tabbed'] :
m = RE.regExp[regexp].match(lines[0])
if m :
try:
processFn[regexp](parent_elem, lines, inList)
except RuntimeError:
print 'WARNING: Max recursion depth excedeeded, skipping section'
#print '\n'.join(lines)
#sys.exit()
return
# We are NOT looking at one of the high-level structures like
# lists or blockquotes. So, it's just a regular paragraph
# (though perhaps nested inside a list or something else). If
# we are NOT inside a list, we just need to look for a blank
# line to find the end of the block. If we ARE inside a
# list, however, we need to consider that a sublist does not
# need to be separated by a blank line. Rather, the following
# markup is legal:
#
# * The top level list item
#
# Another paragraph of the list. This is where we are now.
# * Underneath we might have a sublist.
#
if inList :
start, theRest = self._linesUntil(lines, (lambda line:
RE.regExp['ul'].match(line)
or RE.regExp['ol'].match(line)
or not line.strip()))
self._processSection(parent_elem, start,
inList - 1, looseList = looseList)
self._processSection(parent_elem, theRest,
inList - 1, looseList = looseList)
else : # Ok, so it's just a simple block
paragraph, theRest = self._linesUntil(lines, lambda line:
not line.strip())
if len(paragraph) and paragraph[0].startswith('#') :
m = RE.regExp['header'].match(paragraph[0])
if m : if m :
level = len(m.group(1)) processFn[regexp](parent_elem, lines, inList)
h = self.doc.createElement("h%d" % level) return
parent_elem.appendChild(h)
for item in self._handleInlineWrapper2(m.group(2).strip()) : # We are NOT looking at one of the high-level structures like
h.appendChild(item) # lists or blockquotes. So, it's just a regular paragraph
else : # (though perhaps nested inside a list or something else). If
message(CRITICAL, "We've got a problem header!") # we are NOT inside a list, we just need to look for a blank
# line to find the end of the block. If we ARE inside a
elif paragraph : # list, however, we need to consider that a sublist does not
# need to be separated by a blank line. Rather, the following
list = self._handleInlineWrapper2("\n".join(paragraph)) # markup is legal:
#
if ( parent_elem.nodeName == 'li' # * The top level list item
and not (looseList or parent_elem.childNodes)): #
# Another paragraph of the list. This is where we are now.
#and not parent_elem.childNodes) : # * Underneath we might have a sublist.
# If this is the first paragraph inside "li", don't #
# put <p> around it - append the paragraph bits directly
# onto parent_elem if inList :
el = parent_elem
else : start, lines = self._linesUntil(lines, (lambda line:
# Otherwise make a "p" element RE.regExp['ul'].match(line)
el = self.doc.createElement("p") or RE.regExp['ol'].match(line)
parent_elem.appendChild(el) or not line.strip()))
for item in list : self._processSection(parent_elem, start,
el.appendChild(item) inList - 1, looseList = looseList)
self._processSection(parent_elem, lines,
if theRest : inList - 1, looseList = looseList)
theRest = theRest[1:] # skip the first (blank) line
try: else : # Ok, so it's just a simple block
self._processSection(parent_elem, theRest, inList)
except RuntimeError: #Added by Kovid paragraph, lines = self._linesUntil(lines, lambda line:
pass not line.strip())
if len(paragraph) and paragraph[0].startswith('#') :
m = RE.regExp['header'].match(paragraph[0])
if m :
level = len(m.group(1))
h = self.doc.createElement("h%d" % level)
parent_elem.appendChild(h)
for item in self._handleInlineWrapper(m.group(2).strip()) :
h.appendChild(item)
else :
message(CRITICAL, "We've got a problem header!")
elif paragraph :
list = self._handleInlineWrapper("\n".join(paragraph))
if ( parent_elem.nodeName == 'li'
and not (looseList or parent_elem.childNodes)):
#and not parent_elem.childNodes) :
# If this is the first paragraph inside "li", don't
# put <p> around it - append the paragraph bits directly
# onto parent_elem
el = parent_elem
else :
# Otherwise make a "p" element
el = self.doc.createElement("p")
parent_elem.appendChild(el)
for item in list :
el.appendChild(item)
if lines:
lines = lines[1:] # skip the first (blank) line
def _processUList(self, parent_elem, lines, inList) : def _processUList(self, parent_elem, lines, inList) :
@ -1247,9 +1374,11 @@ class Markdown:
m = RE.regExp[expr].match(line) m = RE.regExp[expr].match(line)
if m : if m :
if expr in ['ul', 'ol'] : # We are looking at a new item if expr in ['ul', 'ol'] : # We are looking at a new item
if m.group(1) : #if m.group(1) :
items.append([m.group(1)]) # Removed the check to allow for a blank line
item += 1 # at the beginning of the list item
items.append([m.group(1)])
item += 1
elif expr == 'tabbed' : # This line needs to be detabbed elif expr == 'tabbed' : # This line needs to be detabbed
items[item].append(m.group(4)) #after the 'tab' items[item].append(m.group(4)) #after the 'tab'
@ -1333,46 +1462,37 @@ class Markdown:
detabbed, theRest = self.blockGuru.detectTabbed(lines) detabbed, theRest = self.blockGuru.detectTabbed(lines)
pre = self.doc.createElement('pre') pre = self.doc.createElement('pre')
#code = self.doc.createElement('code') code = self.doc.createElement('code')
parent_elem.appendChild(pre) parent_elem.appendChild(pre)
#pre.appendChild(code) pre.appendChild(code)
text = "\n".join(detabbed).rstrip()+"\n" text = "\n".join(detabbed).rstrip()+"\n"
#text = text.replace("&", "&amp;") #text = text.replace("&", "&amp;")
pre.appendChild(self.doc.createTextNode(text)) code.appendChild(self.doc.createTextNode(text))
self._processSection(parent_elem, theRest, inList) self._processSection(parent_elem, theRest, inList)
def _handleInlineWrapper2 (self, line) :
def _handleInlineWrapper (self, line) :
parts = [line] parts = [line]
#if not(line):
# return [self.doc.createTextNode(' ')]
for pattern in self.inlinePatterns : for pattern in self.inlinePatterns :
#print
#print self.inlinePatterns.index(pattern)
i = 0 i = 0
#print parts
while i < len(parts) : while i < len(parts) :
x = parts[i] x = parts[i]
#print i
if isinstance(x, (str, unicode)) : if isinstance(x, (str, unicode)) :
result = self._applyPattern(x, pattern) result = self._applyPattern(x, pattern)
#print result
#print result
#print parts, i
if result : if result :
i -= 1 i -= 1
parts.remove(x) parts.remove(x)
for y in result : for y in result :
parts.insert(i+1,y) parts.insert(i+1,y)
i += 1 i += 1
for i in range(len(parts)) : for i in range(len(parts)) :
@ -1383,27 +1503,6 @@ class Markdown:
return parts return parts
def _handleInlineWrapper (self, line) :
# A wrapper around _handleInline to avoid recursion
parts = [line]
i = 0
while i < len(parts) :
x = parts[i]
if isinstance(x, (str, unicode)) :
parts.remove(x)
result = self._handleInline(x)
for y in result :
parts.insert(i,y)
else :
i += 1
return parts
def _handleInline(self, line): def _handleInline(self, line):
"""Transform a Markdown line with inline elements to an XHTML """Transform a Markdown line with inline elements to an XHTML
fragment. fragment.
@ -1424,6 +1523,7 @@ class Markdown:
return [self.doc.createTextNode(line)] return [self.doc.createTextNode(line)]
def _applyPattern(self, line, pattern) : def _applyPattern(self, line, pattern) :
""" Given a pattern name, this function checks if the line """ Given a pattern name, this function checks if the line
fits the pattern, creates the necessary elements, and returns fits the pattern, creates the necessary elements, and returns
back a list consisting of NanoDom elements and/or strings. back a list consisting of NanoDom elements and/or strings.
@ -1438,6 +1538,8 @@ class Markdown:
# match the line to pattern's pre-compiled reg exp. # match the line to pattern's pre-compiled reg exp.
# if no match, move on. # if no match, move on.
m = pattern.getCompiledRegExp().match(line) m = pattern.getCompiledRegExp().match(line)
if not m : if not m :
return None return None
@ -1446,6 +1548,40 @@ class Markdown:
# if it doesn't, move on # if it doesn't, move on
node = pattern.handleMatch(m, self.doc) node = pattern.handleMatch(m, self.doc)
# check if any of this nodes have children that need processing
if isinstance(node, Element):
if not node.nodeName in ["code", "pre"] :
for child in node.childNodes :
if isinstance(child, TextNode):
result = self._handleInlineWrapper(child.value)
if result:
if result == [child] :
continue
result.reverse()
#to make insertion easier
position = node.childNodes.index(child)
node.removeChild(child)
for item in result:
if isinstance(item, (str, unicode)):
if len(item) > 0:
node.insertChild(position,
self.doc.createTextNode(item))
else:
node.insertChild(position, item)
if node : if node :
# Those are in the reverse order! # Those are in the reverse order!
return ( m.groups()[-1], # the string to the left return ( m.groups()[-1], # the string to the left
@ -1455,7 +1591,7 @@ class Markdown:
else : else :
return None return None
def __str__(self, source = None): def convert (self, source = None):
"""Return the document in XHTML format. """Return the document in XHTML format.
@returns: A serialized XHTML body.""" @returns: A serialized XHTML body."""
@ -1463,6 +1599,14 @@ class Markdown:
if source : if source :
self.source = source self.source = source
if not self.source :
return ""
self.source = removeBOM(self.source, self.encoding)
for pp in self.textPreprocessors:
self.source = pp.run(self.source)
doc = self._transform() doc = self._transform()
xml = doc.toxml() xml = doc.toxml()
@ -1474,8 +1618,8 @@ class Markdown:
for i in range(self.htmlStash.html_counter) : for i in range(self.htmlStash.html_counter) :
html = self.htmlStash.rawHtmlBlocks[i] html = self.htmlStash.rawHtmlBlocks[i]
if self.safeMode : if self.safeMode and html != "<hr />" and html != "<br />":
html = "[HTML_REMOVED]" html = HTML_REMOVED_TEXT
xml = xml.replace("<p>%s\n</p>" % (HTML_PLACEHOLDER % i), xml = xml.replace("<p>%s\n</p>" % (HTML_PLACEHOLDER % i),
html + "\n") html + "\n")
@ -1490,10 +1634,13 @@ class Markdown:
for pp in self.textPostprocessors : for pp in self.textPostprocessors :
xml = pp.run(xml) xml = pp.run(xml)
return xml return (self.docType + xml).strip()
toString = __str__ __str__ = convert # deprecated - will be changed in 1.7 to report
# information about the MD instance
toString = __str__ # toString() method is deprecated
def __unicode__(self): def __unicode__(self):
@ -1502,7 +1649,7 @@ class Markdown:
return str(self)#.decode(self.encoding) return str(self)#.decode(self.encoding)
toUnicode = __unicode__ toUnicode = __unicode__ # deprecated - will be removed in 1.7
@ -1525,7 +1672,7 @@ def markdownFromFile(input = None,
if not encoding : if not encoding :
encoding = "utf-8" encoding = "utf-8"
input_file = codecs.open(input, mode="r", encoding="utf-8") input_file = codecs.open(input, mode="r", encoding=encoding)
text = input_file.read() text = input_file.read()
input_file.close() input_file.close()
@ -1559,23 +1706,21 @@ def markdown(text,
pairs = [x.split("=") for x in ext[pos+1:-1].split(",")] pairs = [x.split("=") for x in ext[pos+1:-1].split(",")]
configs = [(x.strip(), y.strip()) for (x, y) in pairs] configs = [(x.strip(), y.strip()) for (x, y) in pairs]
extension_configs[name] = configs extension_configs[name] = configs
#print configs
md = Markdown(text, extensions=extension_names, md = Markdown(extensions=extension_names,
extension_configs=extension_configs, extension_configs=extension_configs,
safe_mode = safe_mode) safe_mode = safe_mode)
return md.toString() return md.convert(text)
class Extension : class Extension :
def __init__(self, configs={}) : def __init__(self, configs = {}) :
self.config = configs self.config = configs
def getConfig(self, key) : def getConfig(self, key) :
if self.config.has_key(key) : if self.config.has_key(key) :
#print self.config[key][0]
return self.config[key][0] return self.config[key][0]
else : else :
return "" return ""
@ -1653,10 +1798,8 @@ def parse_options() :
'extensions' : options.extensions, 'extensions' : options.extensions,
'encoding' : options.encoding } 'encoding' : options.encoding }
def main(): if __name__ == '__main__':
""" Run Markdown from the command line. """ """ Run Markdown from the command line. """
for a in ['-x', 'toc', '-x', 'tables', '-x', 'footnotes']:
sys.argv.append(a)
options = parse_options() options = parse_options()
@ -1667,8 +1810,7 @@ def main():
markdownFromFile(**options) markdownFromFile(**options)
if __name__ == '__main__':
main()