mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix txt2lrf processing of long unstructured txt files
This commit is contained in:
parent
ae0b5d3168
commit
58c15ac8b7
@ -31,10 +31,12 @@ def option_parser():
|
||||
'the text in mybook.txt. Default is to try to autodetect.'
|
||||
parser.add_option('-e', '--encoding', action='store', type='string', \
|
||||
dest='encoding', help=enchelp, default=None)
|
||||
parser.add_option('--debug-html-generation', action='store_true', default=False,
|
||||
dest='debug_html_generation', help='Print generated HTML to stdout and quit.')
|
||||
return parser
|
||||
|
||||
|
||||
def generate_html(txtfile, encoding):
|
||||
def generate_html(txtfile, encoding, logger):
|
||||
'''
|
||||
Convert txtfile to html and return a PersistentTemporaryFile object pointing
|
||||
to the file with the HTML.
|
||||
@ -54,12 +56,14 @@ def generate_html(txtfile, encoding):
|
||||
raise ConversionError, 'Could not detect encoding of %s'%(txtfile,)
|
||||
else:
|
||||
txt = codecs.open(txtfile, 'rb', enc).read()
|
||||
|
||||
logger.info('Converting text to HTML...')
|
||||
md = markdown.Markdown(txt,
|
||||
extensions=['footnotes', 'tables', 'toc'],
|
||||
encoding=enc,
|
||||
safe_mode=False,
|
||||
)
|
||||
html = md.toString()
|
||||
|
||||
p = PersistentTemporaryFile('.html', dir=os.path.dirname(txtfile))
|
||||
p.close()
|
||||
codecs.open(p.name, 'wb', enc).write(html)
|
||||
@ -73,7 +77,10 @@ def process_file(path, options, logger=None):
|
||||
txt = os.path.abspath(os.path.expanduser(path))
|
||||
if not hasattr(options, 'encoding'):
|
||||
options.encoding = None
|
||||
htmlfile = generate_html(txt, options.encoding)
|
||||
if not hasattr(options, 'debug_html_generation'):
|
||||
options.debug_html_generation = False
|
||||
htmlfile = generate_html(txt, options.encoding, logger)
|
||||
if not options.debug_html_generation:
|
||||
options.force_page_break = 'h2'
|
||||
if not options.output:
|
||||
ext = '.lrs' if options.lrs else '.lrf'
|
||||
@ -81,6 +88,8 @@ def process_file(path, options, logger=None):
|
||||
options.output = os.path.abspath(os.path.expanduser(options.output))
|
||||
|
||||
html_process_file(htmlfile.name, options, logger)
|
||||
else:
|
||||
print open(htmlfile.name, 'rb').read()
|
||||
|
||||
def main(args=sys.argv, logger=None):
|
||||
parser = option_parser()
|
||||
|
@ -1,43 +1,37 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# The following constant specifies the name used in the usage
|
||||
# statement displayed for python versions lower than 2.3. (With
|
||||
# python2.3 and higher the usage statement is generated by optparse
|
||||
# and uses the actual name of the executable called.)
|
||||
|
||||
EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
|
||||
|
||||
SPEED_TEST = 0
|
||||
version = "1.6b"
|
||||
version_info = (1,6,2,"rc-2")
|
||||
__revision__ = "$Rev$"
|
||||
|
||||
"""
|
||||
====================================================================
|
||||
IF YOU ARE LOOKING TO EXTEND MARKDOWN, SEE THE "FOOTNOTES" SECTION
|
||||
====================================================================
|
||||
|
||||
Python-Markdown
|
||||
===============
|
||||
|
||||
Converts Markdown to HTML. Basic usage as a module:
|
||||
|
||||
import markdown
|
||||
html = markdown.markdown(your_text_string)
|
||||
md = markdown.Markdown()
|
||||
html = md.convert(your_text_string)
|
||||
|
||||
See http://www.freewisdom.org/projects/python-markdown/ for more
|
||||
information and instructions on how to extend the functionality of the
|
||||
script. (You might want to read that before you try modifying this
|
||||
file.)
|
||||
|
||||
Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
|
||||
maintained by [Yuri Takhteyev](http://www.freewisdom.org).
|
||||
|
||||
Project website: http://www.freewisdom.org/projects/python-markdown
|
||||
Contact: yuri [at] freewisdom.org
|
||||
|
||||
License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD
|
||||
|
||||
Version: 1.5a (July 9, 2006)
|
||||
|
||||
For changelog, see end of file
|
||||
"""
|
||||
|
||||
import re, sys, os, random, codecs
|
||||
|
||||
# set debug level: 3 none, 2 critical, 1 informative, 0 all
|
||||
import re, sys, codecs
|
||||
|
||||
# Set debug level: 3 none, 2 critical, 1 informative, 0 all
|
||||
(VERBOSE, INFO, CRITICAL, NONE) = range(4)
|
||||
|
||||
MESSAGE_THRESHOLD = CRITICAL
|
||||
@ -49,10 +43,45 @@ def message(level, text) :
|
||||
|
||||
# --------------- CONSTANTS YOU MIGHT WANT TO MODIFY -----------------
|
||||
|
||||
# all tabs will be expanded to up to this many spaces
|
||||
TAB_LENGTH = 4
|
||||
ENABLE_ATTRIBUTES = 1
|
||||
SMART_EMPHASIS = 1
|
||||
TAB_LENGTH = 4 # expand tabs to this many spaces
|
||||
ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz">
|
||||
SMART_EMPHASIS = 1 # this_or_that does not become this<i>or</i>that
|
||||
HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
|
||||
|
||||
RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
|
||||
# from Hebrew to Nko (includes Arabic, Syriac and Thaana)
|
||||
(u'\u2D30', u'\u2D7F'),
|
||||
# Tifinagh
|
||||
)
|
||||
|
||||
# Unicode Reference Table:
|
||||
# 0590-05FF - Hebrew
|
||||
# 0600-06FF - Arabic
|
||||
# 0700-074F - Syriac
|
||||
# 0750-077F - Arabic Supplement
|
||||
# 0780-07BF - Thaana
|
||||
# 07C0-07FF - Nko
|
||||
|
||||
BOMS = { 'utf-8' : (unicode(codecs.BOM_UTF8, "utf-8"), ),
|
||||
'utf-16' : (unicode(codecs.BOM_UTF16_LE, "utf-16"),
|
||||
unicode(codecs.BOM_UTF16_BE, "utf-16")),
|
||||
#'utf-32' : (unicode(codecs.BOM_UTF32_LE, "utf-32"),
|
||||
# unicode(codecs.BOM_UTF32_BE, "utf-32")),
|
||||
}
|
||||
|
||||
def removeBOM(text, encoding):
    """Return text without a leading byte-order mark for encoding.

    @param text: The decoded source text.
    @param encoding: Name of the encoding the text was read with; it is
        looked up in BOMS to find the candidate markers.
    @returns: text with one leading marker removed, or text unchanged
        when no marker is present or the encoding has no BOMS entry.
    """
    # Encodings missing from BOMS have no marker to strip here; a plain
    # BOMS[encoding] lookup would raise KeyError for them.
    for bom in BOMS.get(encoding, ()):
        # Skip empty markers: they match every string and remove nothing.
        if bom and text.startswith(bom):
            # Slice off exactly the matched prefix.  lstrip() interprets
            # its argument as a set of characters, not as a prefix, so it
            # could also eat text that follows the marker.
            return text[len(bom):]
    return text
|
||||
|
||||
# The following constant specifies the name used in the usage
|
||||
# statement displayed for python versions lower than 2.3. (With
|
||||
# python2.3 and higher the usage statement is generated by optparse
|
||||
# and uses the actual name of the executable called.)
|
||||
|
||||
EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
|
||||
|
||||
|
||||
# --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ----------
|
||||
|
||||
@ -82,14 +111,50 @@ Importantly, NanoDom does not do normalization, which is what we
|
||||
want. It also adds extra white space when converting DOM to string
|
||||
"""
|
||||
|
||||
ENTITY_NORMALIZATION_EXPRESSIONS = [ (re.compile("&"), "&"),
|
||||
(re.compile("<"), "<"),
|
||||
(re.compile(">"), ">"),
|
||||
(re.compile("\""), """)]
|
||||
|
||||
ENTITY_NORMALIZATION_EXPRESSIONS_SOFT = [ (re.compile("&(?!\#)"), "&"),
|
||||
(re.compile("<"), "<"),
|
||||
(re.compile(">"), ">"),
|
||||
(re.compile("\""), """)]
|
||||
|
||||
|
||||
def getBidiType(text) :
    """Classify text as right-to-left or left-to-right by its first character.

    @param text: The string to inspect; only text[0] is examined.
    @returns: None when text is empty or its first character is not an
        alphabetic unicode character, "rtl" when that character lies in
        one of RTL_BIDI_RANGES (bounds inclusive), otherwise "ltr".
    """
    if not text :
        return None

    first = text[0]

    # Only alphabetic unicode characters carry a direction here.
    if not (isinstance(first, unicode) and first.isalpha()) :
        return None

    for low, high in RTL_BIDI_RANGES :
        if low <= first <= high :
            return "rtl"

    # No right-to-left range matched.
    return "ltr"
|
||||
|
||||
|
||||
class Document :
|
||||
|
||||
def __init__ (self) :
|
||||
self.bidi = "ltr"
|
||||
|
||||
def appendChild(self, child) :
|
||||
self.documentElement = child
|
||||
child.isDocumentElement = True
|
||||
child.parent = self
|
||||
self.entities = {}
|
||||
|
||||
def setBidi(self, bidi) :
|
||||
if bidi :
|
||||
self.bidi = bidi
|
||||
|
||||
def createElement(self, tag, textNode=None) :
|
||||
el = Element(tag)
|
||||
el.doc = self
|
||||
@ -107,19 +172,23 @@ class Document :
|
||||
self.entities[entity] = EntityReference(entity)
|
||||
return self.entities[entity]
|
||||
|
||||
def createCDATA(self, text) :
|
||||
node = CDATA(text)
|
||||
node.doc = self
|
||||
return node
|
||||
|
||||
def toxml (self) :
|
||||
return self.documentElement.toxml()
|
||||
|
||||
def normalizeEntities(self, text) :
|
||||
def normalizeEntities(self, text, avoidDoubleNormalizing=False) :
|
||||
|
||||
pairs = [ ("&", "&"),
|
||||
("<", "<"),
|
||||
(">", ">"),
|
||||
("\"", """)]
|
||||
if avoidDoubleNormalizing :
|
||||
regexps = ENTITY_NORMALIZATION_EXPRESSIONS_SOFT
|
||||
else :
|
||||
regexps = ENTITY_NORMALIZATION_EXPRESSIONS
|
||||
|
||||
|
||||
for old, new in pairs :
|
||||
text = text.replace(old, new)
|
||||
for regexp, substitution in regexps :
|
||||
text = regexp.sub(substitution, text)
|
||||
return text
|
||||
|
||||
def find(self, test) :
|
||||
@ -130,6 +199,19 @@ class Document :
|
||||
self.documentElement = None
|
||||
|
||||
|
||||
class CDATA :
    """Node whose text is serialized as an XML CDATA section."""

    # Node-kind tag, shared by every instance.
    type = "cdata"

    def __init__ (self, text) :
        """Keep text as the raw content of the section."""
        self.text = text

    def handleAttributes(self) :
        """Do nothing; present so the node can be asked to handle attributes."""
        return None

    def toxml (self) :
        """Return the stored text wrapped in CDATA delimiters."""
        opening = "<![CDATA["
        closing = "]]>"
        return opening + self.text + closing
|
||||
|
||||
class Element :
|
||||
|
||||
type = "element"
|
||||
@ -140,6 +222,19 @@ class Element :
|
||||
self.attributes = []
|
||||
self.attribute_values = {}
|
||||
self.childNodes = []
|
||||
self.bidi = None
|
||||
self.isDocumentElement = False
|
||||
|
||||
def setBidi(self, bidi) :
|
||||
|
||||
if bidi :
|
||||
|
||||
|
||||
if not self.bidi or self.isDocumentElement:
|
||||
# Once the bidi is set don't change it (except for doc element)
|
||||
self.bidi = bidi
|
||||
self.parent.setBidi(bidi)
|
||||
|
||||
|
||||
def unlink(self) :
|
||||
for child in self.childNodes :
|
||||
@ -186,27 +281,56 @@ class Element :
|
||||
if ENABLE_ATTRIBUTES :
|
||||
for child in self.childNodes:
|
||||
child.handleAttributes()
|
||||
|
||||
buffer = ""
|
||||
if self.nodeName in ['h1', 'h2', 'h3', 'h4'] :
|
||||
buffer += "\n"
|
||||
elif self.nodeName in ['li'] :
|
||||
buffer += "\n "
|
||||
|
||||
# Process children FIRST, then do the attributes
|
||||
|
||||
childBuffer = ""
|
||||
|
||||
if self.childNodes or self.nodeName in ['blockquote']:
|
||||
childBuffer += ">"
|
||||
for child in self.childNodes :
|
||||
childBuffer += child.toxml()
|
||||
if self.nodeName == 'p' :
|
||||
childBuffer += "\n"
|
||||
elif self.nodeName == 'li' :
|
||||
childBuffer += "\n "
|
||||
childBuffer += "</%s>" % self.nodeName
|
||||
else :
|
||||
childBuffer += "/>"
|
||||
|
||||
|
||||
|
||||
buffer += "<" + self.nodeName
|
||||
|
||||
if self.nodeName in ['p', 'li', 'ul', 'ol',
|
||||
'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] :
|
||||
|
||||
if not self.attribute_values.has_key("dir"):
|
||||
if self.bidi :
|
||||
bidi = self.bidi
|
||||
else :
|
||||
bidi = self.doc.bidi
|
||||
|
||||
if bidi=="rtl" :
|
||||
self.setAttribute("dir", "rtl")
|
||||
|
||||
for attr in self.attributes :
|
||||
value = self.attribute_values[attr]
|
||||
value = self.doc.normalizeEntities(value)
|
||||
value = self.doc.normalizeEntities(value,
|
||||
avoidDoubleNormalizing=True)
|
||||
buffer += ' %s="%s"' % (attr, value)
|
||||
if self.childNodes or self.nodeName in ['blockquote']:
|
||||
buffer += ">"
|
||||
for child in self.childNodes :
|
||||
buffer += child.toxml()
|
||||
if self.nodeName == 'p' :
|
||||
buffer += "\n"
|
||||
elif self.nodeName == 'li' :
|
||||
buffer += "\n "
|
||||
buffer += "</%s>" % self.nodeName
|
||||
else :
|
||||
buffer += "/>"
|
||||
|
||||
|
||||
# Now let's actually append the children
|
||||
|
||||
buffer += childBuffer
|
||||
|
||||
if self.nodeName in ['p', 'li', 'ul', 'ol',
|
||||
'h1', 'h2', 'h3', 'h4'] :
|
||||
buffer += "\n"
|
||||
@ -223,13 +347,18 @@ class TextNode :
|
||||
self.value = text
|
||||
|
||||
def attributeCallback(self, match) :
|
||||
|
||||
self.parent.setAttribute(match.group(1), match.group(2))
|
||||
|
||||
def handleAttributes(self) :
|
||||
self.value = self.attrRegExp.sub(self.attributeCallback, self.value)
|
||||
|
||||
def toxml(self) :
|
||||
|
||||
text = self.value
|
||||
|
||||
self.parent.setBidi(getBidiType(text))
|
||||
|
||||
if not text.startswith(HTML_PLACEHOLDER_PREFIX):
|
||||
if self.parent.nodeName == "p" :
|
||||
text = text.replace("\n", "\n ")
|
||||
@ -262,10 +391,10 @@ class EntityReference:
|
||||
Preprocessors munge source text before we start doing anything too
|
||||
complicated.
|
||||
|
||||
Each preprocessor implements a "run" method that takes a pointer to a list of lines of the document,
|
||||
modifies it as necessary and returns either the same pointer or a
|
||||
pointer to a new list. Preprocessors must extend
|
||||
markdown.Preprocessor.
|
||||
Each preprocessor implements a "run" method that takes a pointer to a
|
||||
list of lines of the document, modifies it as necessary and returns
|
||||
either the same pointer or a pointer to a new list. Preprocessors
|
||||
must extend markdown.Preprocessor.
|
||||
|
||||
"""
|
||||
|
||||
@ -305,10 +434,6 @@ class HeaderPreprocessor (Preprocessor):
|
||||
lines[i] = "## " + lines[i].strip()
|
||||
lines[i+1] = ""
|
||||
|
||||
#for l in lines :
|
||||
# print l.encode('utf8')
|
||||
#sys.exit(0)
|
||||
|
||||
return lines
|
||||
|
||||
HEADER_PREPROCESSOR = HeaderPreprocessor()
|
||||
@ -362,10 +487,13 @@ class HtmlBlockPreprocessor (Preprocessor):
|
||||
return block.rstrip()[-len(left_tag)-2:-1].lower()
|
||||
|
||||
def _equal_tags(self, left_tag, right_tag):
|
||||
|
||||
if left_tag in ['?', '?php', 'div'] : # handle PHP, etc.
|
||||
return True
|
||||
if ("/" + left_tag) == right_tag:
|
||||
return True
|
||||
if (right_tag == "--" and left_tag == "--") :
|
||||
return True
|
||||
elif left_tag == right_tag[1:] \
|
||||
and right_tag[0] != "<":
|
||||
return True
|
||||
@ -376,9 +504,10 @@ class HtmlBlockPreprocessor (Preprocessor):
|
||||
return (tag in ['hr', 'hr/'])
|
||||
|
||||
|
||||
def run (self, lines) :
|
||||
def run (self, text) :
|
||||
|
||||
new_blocks = []
|
||||
text = "\n".join(lines)
|
||||
#text = "\n".join(lines)
|
||||
text = text.split("\n\n")
|
||||
|
||||
items = []
|
||||
@ -417,7 +546,7 @@ class HtmlBlockPreprocessor (Preprocessor):
|
||||
new_blocks.append(
|
||||
self.stash.store(block.strip()))
|
||||
continue
|
||||
elif not block[1] == "!":
|
||||
else: #if not block[1] == "!":
|
||||
# if is block level tag and is not complete
|
||||
items.append(block.strip())
|
||||
in_tag = True
|
||||
@ -429,6 +558,7 @@ class HtmlBlockPreprocessor (Preprocessor):
|
||||
items.append(block.strip())
|
||||
|
||||
right_tag = self._get_right_tag(left_tag, block)
|
||||
|
||||
if self._equal_tags(left_tag, right_tag):
|
||||
# if find closing tag
|
||||
in_tag = False
|
||||
@ -436,7 +566,11 @@ class HtmlBlockPreprocessor (Preprocessor):
|
||||
self.stash.store('\n\n'.join(items)))
|
||||
items = []
|
||||
|
||||
return "\n\n".join(new_blocks).split("\n")
|
||||
if items :
|
||||
new_blocks.append(self.stash.store('\n\n'.join(items)))
|
||||
new_blocks.append('\n')
|
||||
|
||||
return "\n\n".join(new_blocks) #.split("\n")
|
||||
|
||||
HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
|
||||
|
||||
@ -609,15 +743,15 @@ class LinkPattern (Pattern):
|
||||
def handleMatch(self, m, doc) :
|
||||
el = doc.createElement('a')
|
||||
el.appendChild(doc.createTextNode(m.group(2)))
|
||||
parts = m.group(9).split()
|
||||
parts = m.group(9).split('"')
|
||||
# We should now have [], [href], or [href, title]
|
||||
if parts :
|
||||
el.setAttribute('href', parts[0])
|
||||
el.setAttribute('href', parts[0].strip())
|
||||
else :
|
||||
el.setAttribute('href', "")
|
||||
if len(parts) > 1 :
|
||||
# we also got a title
|
||||
title = " ".join(parts[1:]).strip()
|
||||
title = '"' + '"'.join(parts[1:]).strip()
|
||||
title = dequote(title) #.replace('"', """)
|
||||
el.setAttribute('title', title)
|
||||
return el
|
||||
@ -645,12 +779,14 @@ class ImagePattern (Pattern):
|
||||
class ReferencePattern (Pattern):
|
||||
|
||||
def handleMatch(self, m, doc):
|
||||
|
||||
if m.group(9) :
|
||||
id = m.group(9).lower()
|
||||
else :
|
||||
# if we got something like "[Google][]"
|
||||
# we'll use "google" as the id
|
||||
id = m.group(2).lower()
|
||||
|
||||
if not self.references.has_key(id) : # ignore undefined refs
|
||||
return None
|
||||
href, title = self.references[id]
|
||||
@ -789,7 +925,6 @@ class BlockGuru :
|
||||
remainder of the original list"""
|
||||
|
||||
items = []
|
||||
item = -1
|
||||
|
||||
i = 0 # to keep track of where we are
|
||||
|
||||
@ -908,11 +1043,11 @@ class Markdown:
|
||||
Markdown text """
|
||||
|
||||
|
||||
def __init__(self, source=None,
|
||||
extensions=None,
|
||||
def __init__(self, source=None, # deprecated
|
||||
extensions=[],
|
||||
extension_configs=None,
|
||||
encoding=None,
|
||||
safe_mode = True):
|
||||
encoding="utf-8",
|
||||
safe_mode = False):
|
||||
"""Creates a new Markdown instance.
|
||||
|
||||
@param source: The text in Markdown format.
|
||||
@ -924,10 +1059,14 @@ class Markdown:
|
||||
self.blockGuru = BlockGuru()
|
||||
self.registeredExtensions = []
|
||||
self.stripTopLevelTags = 1
|
||||
self.docType = ""
|
||||
|
||||
self.preprocessors = [ HEADER_PREPROCESSOR,
|
||||
|
||||
self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR]
|
||||
|
||||
self.preprocessors = [
|
||||
HEADER_PREPROCESSOR,
|
||||
LINE_PREPROCESSOR,
|
||||
HTML_BLOCK_PREPROCESSOR,
|
||||
LINE_BREAKS_PREPROCESSOR,
|
||||
# A footnote preprocessor will
|
||||
# get inserted here
|
||||
@ -979,6 +1118,7 @@ class Markdown:
|
||||
for ext in extensions :
|
||||
|
||||
extension_module_name = "libprs500.ebooks.markdown.mdx_" + ext
|
||||
|
||||
try :
|
||||
module = sys.modules[extension_module_name]
|
||||
except :
|
||||
@ -991,6 +1131,7 @@ class Markdown:
|
||||
configs_for_ext = configs[ext]
|
||||
else :
|
||||
configs_for_ext = []
|
||||
|
||||
extension = module.makeExtension(configs_for_ext)
|
||||
extension.extendMarkdown(self, globals())
|
||||
|
||||
@ -1032,7 +1173,7 @@ class Markdown:
|
||||
self.doc.appendChild(self.top_element)
|
||||
|
||||
# Fixup the source text
|
||||
text = self.source.strip()
|
||||
text = self.source #.strip()
|
||||
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
||||
text += "\n\n"
|
||||
text = text.expandtabs(TAB_LENGTH)
|
||||
@ -1085,9 +1226,7 @@ class Markdown:
|
||||
@param inList: a level
|
||||
@returns: None"""
|
||||
|
||||
if not lines :
|
||||
return
|
||||
|
||||
while lines:
|
||||
# Check if this section starts with a list, a blockquote or
|
||||
# a code block
|
||||
|
||||
@ -1099,12 +1238,7 @@ class Markdown:
|
||||
for regexp in ['ul', 'ol', 'quoted', 'tabbed'] :
|
||||
m = RE.regExp[regexp].match(lines[0])
|
||||
if m :
|
||||
try:
|
||||
processFn[regexp](parent_elem, lines, inList)
|
||||
except RuntimeError:
|
||||
print 'WARNING: Max recursion depth excedeeded, skipping section'
|
||||
#print '\n'.join(lines)
|
||||
#sys.exit()
|
||||
return
|
||||
|
||||
# We are NOT looking at one of the high-level structures like
|
||||
@ -1124,36 +1258,34 @@ class Markdown:
|
||||
|
||||
if inList :
|
||||
|
||||
start, theRest = self._linesUntil(lines, (lambda line:
|
||||
start, lines = self._linesUntil(lines, (lambda line:
|
||||
RE.regExp['ul'].match(line)
|
||||
or RE.regExp['ol'].match(line)
|
||||
or not line.strip()))
|
||||
|
||||
self._processSection(parent_elem, start,
|
||||
inList - 1, looseList = looseList)
|
||||
self._processSection(parent_elem, theRest,
|
||||
self._processSection(parent_elem, lines,
|
||||
inList - 1, looseList = looseList)
|
||||
|
||||
|
||||
else : # Ok, so it's just a simple block
|
||||
|
||||
paragraph, theRest = self._linesUntil(lines, lambda line:
|
||||
paragraph, lines = self._linesUntil(lines, lambda line:
|
||||
not line.strip())
|
||||
|
||||
if len(paragraph) and paragraph[0].startswith('#') :
|
||||
m = RE.regExp['header'].match(paragraph[0])
|
||||
if m :
|
||||
level = len(m.group(1))
|
||||
h = self.doc.createElement("h%d" % level)
|
||||
parent_elem.appendChild(h)
|
||||
for item in self._handleInlineWrapper2(m.group(2).strip()) :
|
||||
for item in self._handleInlineWrapper(m.group(2).strip()) :
|
||||
h.appendChild(item)
|
||||
else :
|
||||
message(CRITICAL, "We've got a problem header!")
|
||||
|
||||
elif paragraph :
|
||||
|
||||
list = self._handleInlineWrapper2("\n".join(paragraph))
|
||||
list = self._handleInlineWrapper("\n".join(paragraph))
|
||||
|
||||
if ( parent_elem.nodeName == 'li'
|
||||
and not (looseList or parent_elem.childNodes)):
|
||||
@ -1171,13 +1303,8 @@ class Markdown:
|
||||
for item in list :
|
||||
el.appendChild(item)
|
||||
|
||||
if theRest :
|
||||
theRest = theRest[1:] # skip the first (blank) line
|
||||
|
||||
try:
|
||||
self._processSection(parent_elem, theRest, inList)
|
||||
except RuntimeError: #Added by Kovid
|
||||
pass
|
||||
if lines:
|
||||
lines = lines[1:] # skip the first (blank) line
|
||||
|
||||
|
||||
|
||||
@ -1247,7 +1374,9 @@ class Markdown:
|
||||
m = RE.regExp[expr].match(line)
|
||||
if m :
|
||||
if expr in ['ul', 'ol'] : # We are looking at a new item
|
||||
if m.group(1) :
|
||||
#if m.group(1) :
|
||||
# Removed the check to allow for a blank line
|
||||
# at the beginning of the list item
|
||||
items.append([m.group(1)])
|
||||
item += 1
|
||||
elif expr == 'tabbed' : # This line needs to be detabbed
|
||||
@ -1333,40 +1462,31 @@ class Markdown:
|
||||
detabbed, theRest = self.blockGuru.detectTabbed(lines)
|
||||
|
||||
pre = self.doc.createElement('pre')
|
||||
#code = self.doc.createElement('code')
|
||||
code = self.doc.createElement('code')
|
||||
parent_elem.appendChild(pre)
|
||||
#pre.appendChild(code)
|
||||
pre.appendChild(code)
|
||||
text = "\n".join(detabbed).rstrip()+"\n"
|
||||
#text = text.replace("&", "&")
|
||||
pre.appendChild(self.doc.createTextNode(text))
|
||||
code.appendChild(self.doc.createTextNode(text))
|
||||
self._processSection(parent_elem, theRest, inList)
|
||||
|
||||
|
||||
def _handleInlineWrapper2 (self, line) :
|
||||
|
||||
def _handleInlineWrapper (self, line) :
|
||||
|
||||
parts = [line]
|
||||
|
||||
#if not(line):
|
||||
# return [self.doc.createTextNode(' ')]
|
||||
|
||||
for pattern in self.inlinePatterns :
|
||||
|
||||
#print
|
||||
#print self.inlinePatterns.index(pattern)
|
||||
|
||||
i = 0
|
||||
|
||||
#print parts
|
||||
while i < len(parts) :
|
||||
|
||||
x = parts[i]
|
||||
#print i
|
||||
|
||||
if isinstance(x, (str, unicode)) :
|
||||
result = self._applyPattern(x, pattern)
|
||||
#print result
|
||||
#print result
|
||||
#print parts, i
|
||||
|
||||
if result :
|
||||
i -= 1
|
||||
parts.remove(x)
|
||||
@ -1383,27 +1503,6 @@ class Markdown:
|
||||
return parts
|
||||
|
||||
|
||||
|
||||
def _handleInlineWrapper (self, line) :
|
||||
|
||||
# A wrapper around _handleInline to avoid recursion
|
||||
|
||||
parts = [line]
|
||||
|
||||
i = 0
|
||||
|
||||
while i < len(parts) :
|
||||
x = parts[i]
|
||||
if isinstance(x, (str, unicode)) :
|
||||
parts.remove(x)
|
||||
result = self._handleInline(x)
|
||||
for y in result :
|
||||
parts.insert(i,y)
|
||||
else :
|
||||
i += 1
|
||||
|
||||
return parts
|
||||
|
||||
def _handleInline(self, line):
|
||||
"""Transform a Markdown line with inline elements to an XHTML
|
||||
fragment.
|
||||
@ -1424,6 +1523,7 @@ class Markdown:
|
||||
return [self.doc.createTextNode(line)]
|
||||
|
||||
def _applyPattern(self, line, pattern) :
|
||||
|
||||
""" Given a pattern name, this function checks if the line
|
||||
fits the pattern, creates the necessary elements, and returns
|
||||
back a list consisting of NanoDom elements and/or strings.
|
||||
@ -1438,6 +1538,8 @@ class Markdown:
|
||||
# match the line to pattern's pre-compiled reg exp.
|
||||
# if no match, move on.
|
||||
|
||||
|
||||
|
||||
m = pattern.getCompiledRegExp().match(line)
|
||||
if not m :
|
||||
return None
|
||||
@ -1446,6 +1548,40 @@ class Markdown:
|
||||
# if it doesn't, move on
|
||||
node = pattern.handleMatch(m, self.doc)
|
||||
|
||||
# check if any of these nodes have children that need processing
|
||||
|
||||
if isinstance(node, Element):
|
||||
|
||||
if not node.nodeName in ["code", "pre"] :
|
||||
for child in node.childNodes :
|
||||
if isinstance(child, TextNode):
|
||||
|
||||
result = self._handleInlineWrapper(child.value)
|
||||
|
||||
if result:
|
||||
|
||||
if result == [child] :
|
||||
continue
|
||||
|
||||
result.reverse()
|
||||
#to make insertion easier
|
||||
|
||||
position = node.childNodes.index(child)
|
||||
|
||||
node.removeChild(child)
|
||||
|
||||
for item in result:
|
||||
|
||||
if isinstance(item, (str, unicode)):
|
||||
if len(item) > 0:
|
||||
node.insertChild(position,
|
||||
self.doc.createTextNode(item))
|
||||
else:
|
||||
node.insertChild(position, item)
|
||||
|
||||
|
||||
|
||||
|
||||
if node :
|
||||
# Those are in the reverse order!
|
||||
return ( m.groups()[-1], # the string to the left
|
||||
@ -1455,7 +1591,7 @@ class Markdown:
|
||||
else :
|
||||
return None
|
||||
|
||||
def __str__(self, source = None):
|
||||
def convert (self, source = None):
|
||||
"""Return the document in XHTML format.
|
||||
|
||||
@returns: A serialized XHTML body."""
|
||||
@ -1464,6 +1600,14 @@ class Markdown:
|
||||
if source :
|
||||
self.source = source
|
||||
|
||||
if not self.source :
|
||||
return ""
|
||||
|
||||
self.source = removeBOM(self.source, self.encoding)
|
||||
|
||||
for pp in self.textPreprocessors:
|
||||
self.source = pp.run(self.source)
|
||||
|
||||
doc = self._transform()
|
||||
xml = doc.toxml()
|
||||
|
||||
@ -1474,8 +1618,8 @@ class Markdown:
|
||||
|
||||
for i in range(self.htmlStash.html_counter) :
|
||||
html = self.htmlStash.rawHtmlBlocks[i]
|
||||
if self.safeMode :
|
||||
html = "[HTML_REMOVED]"
|
||||
if self.safeMode and html != "<hr />" and html != "<br />":
|
||||
html = HTML_REMOVED_TEXT
|
||||
|
||||
xml = xml.replace("<p>%s\n</p>" % (HTML_PLACEHOLDER % i),
|
||||
html + "\n")
|
||||
@ -1490,10 +1634,13 @@ class Markdown:
|
||||
for pp in self.textPostprocessors :
|
||||
xml = pp.run(xml)
|
||||
|
||||
return xml
|
||||
return (self.docType + xml).strip()
|
||||
|
||||
|
||||
toString = __str__
|
||||
__str__ = convert # deprecated - will be changed in 1.7 to report
|
||||
# information about the MD instance
|
||||
|
||||
toString = __str__ # toString() method is deprecated
|
||||
|
||||
|
||||
def __unicode__(self):
|
||||
@ -1502,7 +1649,7 @@ class Markdown:
|
||||
return str(self)#.decode(self.encoding)
|
||||
|
||||
|
||||
toUnicode = __unicode__
|
||||
toUnicode = __unicode__ # deprecated - will be removed in 1.7
|
||||
|
||||
|
||||
|
||||
@ -1525,7 +1672,7 @@ def markdownFromFile(input = None,
|
||||
if not encoding :
|
||||
encoding = "utf-8"
|
||||
|
||||
input_file = codecs.open(input, mode="r", encoding="utf-8")
|
||||
input_file = codecs.open(input, mode="r", encoding=encoding)
|
||||
text = input_file.read()
|
||||
input_file.close()
|
||||
|
||||
@ -1559,13 +1706,12 @@ def markdown(text,
|
||||
pairs = [x.split("=") for x in ext[pos+1:-1].split(",")]
|
||||
configs = [(x.strip(), y.strip()) for (x, y) in pairs]
|
||||
extension_configs[name] = configs
|
||||
#print configs
|
||||
|
||||
md = Markdown(text, extensions=extension_names,
|
||||
md = Markdown(extensions=extension_names,
|
||||
extension_configs=extension_configs,
|
||||
safe_mode = safe_mode)
|
||||
|
||||
return md.toString()
|
||||
return md.convert(text)
|
||||
|
||||
|
||||
class Extension :
|
||||
@ -1575,7 +1721,6 @@ class Extension :
|
||||
|
||||
def getConfig(self, key) :
|
||||
if self.config.has_key(key) :
|
||||
#print self.config[key][0]
|
||||
return self.config[key][0]
|
||||
else :
|
||||
return ""
|
||||
@ -1653,10 +1798,8 @@ def parse_options() :
|
||||
'extensions' : options.extensions,
|
||||
'encoding' : options.encoding }
|
||||
|
||||
def main():
|
||||
if __name__ == '__main__':
|
||||
""" Run Markdown from the command line. """
|
||||
for a in ['-x', 'toc', '-x', 'tables', '-x', 'footnotes']:
|
||||
sys.argv.append(a)
|
||||
|
||||
options = parse_options()
|
||||
|
||||
@ -1667,8 +1810,7 @@ def main():
|
||||
|
||||
markdownFromFile(**options)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user