mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
Fix #392
This commit is contained in:
parent
42a90b9515
commit
f571a7b603
@ -58,11 +58,11 @@ def generate_html(txtfile, encoding, logger):
|
|||||||
txt = codecs.open(txtfile, 'rb', enc).read()
|
txt = codecs.open(txtfile, 'rb', enc).read()
|
||||||
|
|
||||||
logger.info('Converting text to HTML...')
|
logger.info('Converting text to HTML...')
|
||||||
md = markdown.Markdown(txt,
|
md = markdown.Markdown(
|
||||||
extensions=['footnotes', 'tables', 'toc'],
|
extensions=['footnotes', 'tables', 'toc'],
|
||||||
safe_mode=False,
|
safe_mode=False,
|
||||||
)
|
)
|
||||||
html = md.toString()
|
html = md.convert(txt)
|
||||||
p = PersistentTemporaryFile('.html', dir=os.path.dirname(txtfile))
|
p = PersistentTemporaryFile('.html', dir=os.path.dirname(txtfile))
|
||||||
p.close()
|
p.close()
|
||||||
codecs.open(p.name, 'wb', 'utf8').write(html)
|
codecs.open(p.name, 'wb', 'utf8').write(html)
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
version = "1.6b"
|
version = "1.7"
|
||||||
version_info = (1,6,2,"rc-2")
|
version_info = (1,7,0,"rc-1")
|
||||||
__revision__ = "$Rev$"
|
__revision__ = "$Rev: 66 $"
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Python-Markdown
|
Python-Markdown
|
||||||
@ -12,7 +12,7 @@ Converts Markdown to HTML. Basic usage as a module:
|
|||||||
|
|
||||||
import markdown
|
import markdown
|
||||||
md = Markdown()
|
md = Markdown()
|
||||||
html = markdown.convert(your_text_string)
|
html = md.convert(your_text_string)
|
||||||
|
|
||||||
See http://www.freewisdom.org/projects/python-markdown/ for more
|
See http://www.freewisdom.org/projects/python-markdown/ for more
|
||||||
information and instructions on how to extend the functionality of the
|
information and instructions on how to extend the functionality of the
|
||||||
@ -20,25 +20,39 @@ script. (You might want to read that before you try modifying this
|
|||||||
file.)
|
file.)
|
||||||
|
|
||||||
Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
|
Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
|
||||||
maintained by [Yuri Takhteyev](http://www.freewisdom.org).
|
maintained by [Yuri Takhteyev](http://www.freewisdom.org) and [Waylan
|
||||||
|
Limberg](http://achinghead.com/).
|
||||||
|
|
||||||
Contact: yuri [at] freewisdom.org
|
Contact: yuri [at] freewisdom.org
|
||||||
|
waylan [at] gmail.com
|
||||||
|
|
||||||
License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD
|
License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
import re, sys, codecs
|
import re, sys, os, random, codecs
|
||||||
|
|
||||||
|
from logging import getLogger, StreamHandler, Formatter, \
|
||||||
|
DEBUG, INFO, WARN, ERROR, CRITICAL
|
||||||
|
|
||||||
# Set debug level: 3 none, 2 critical, 1 informative, 0 all
|
|
||||||
(VERBOSE, INFO, CRITICAL, NONE) = range(4)
|
|
||||||
|
|
||||||
MESSAGE_THRESHOLD = CRITICAL
|
MESSAGE_THRESHOLD = CRITICAL
|
||||||
|
|
||||||
|
|
||||||
|
# Configure debug message logger (the hard way - to support python 2.3)
|
||||||
|
logger = getLogger('MARKDOWN')
|
||||||
|
logger.setLevel(DEBUG) # This is restricted by handlers later
|
||||||
|
console_hndlr = StreamHandler()
|
||||||
|
formatter = Formatter('%(name)s-%(levelname)s: "%(message)s"')
|
||||||
|
console_hndlr.setFormatter(formatter)
|
||||||
|
console_hndlr.setLevel(MESSAGE_THRESHOLD)
|
||||||
|
logger.addHandler(console_hndlr)
|
||||||
|
|
||||||
|
|
||||||
def message(level, text):
|
def message(level, text):
|
||||||
if level >= MESSAGE_THRESHOLD :
|
''' A wrapper method for logging debug messages. '''
|
||||||
print text
|
logger.log(level, text)
|
||||||
|
|
||||||
|
|
||||||
# --------------- CONSTANTS YOU MIGHT WANT TO MODIFY -----------------
|
# --------------- CONSTANTS YOU MIGHT WANT TO MODIFY -----------------
|
||||||
@ -62,15 +76,15 @@ RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
|
|||||||
# 0780-07BF - Thaana
|
# 0780-07BF - Thaana
|
||||||
# 07C0-07FF - Nko
|
# 07C0-07FF - Nko
|
||||||
|
|
||||||
BOMS = { 'utf-8' : (unicode(codecs.BOM_UTF8, "utf-8"), ),
|
BOMS = { 'utf-8': (codecs.BOM_UTF8, ),
|
||||||
'utf-16' : (unicode(codecs.BOM_UTF16_LE, "utf-16"),
|
'utf-16': (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE),
|
||||||
unicode(codecs.BOM_UTF16_BE, "utf-16")),
|
#'utf-32': (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)
|
||||||
#'utf-32' : (unicode(codecs.BOM_UTF32_LE, "utf-32"),
|
|
||||||
# unicode(codecs.BOM_UTF32_BE, "utf-32")),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def removeBOM(text, encoding):
|
def removeBOM(text, encoding):
|
||||||
|
convert = isinstance(text, unicode)
|
||||||
for bom in BOMS[encoding]:
|
for bom in BOMS[encoding]:
|
||||||
|
bom = convert and bom.decode(encoding) or bom
|
||||||
if text.startswith(bom):
|
if text.startswith(bom):
|
||||||
return text.lstrip(bom)
|
return text.lstrip(bom)
|
||||||
return text
|
return text
|
||||||
@ -229,6 +243,7 @@ class Element :
|
|||||||
|
|
||||||
if bidi:
|
if bidi:
|
||||||
|
|
||||||
|
orig_bidi = self.bidi
|
||||||
|
|
||||||
if not self.bidi or self.isDocumentElement:
|
if not self.bidi or self.isDocumentElement:
|
||||||
# Once the bidi is set don't change it (except for doc element)
|
# Once the bidi is set don't change it (except for doc element)
|
||||||
@ -331,7 +346,7 @@ class Element :
|
|||||||
|
|
||||||
buffer += childBuffer
|
buffer += childBuffer
|
||||||
|
|
||||||
if self.nodeName in ['p', 'li', 'ul', 'ol',
|
if self.nodeName in ['p', 'br ', 'li', 'ul', 'ol',
|
||||||
'h1', 'h2', 'h3', 'h4'] :
|
'h1', 'h2', 'h3', 'h4'] :
|
||||||
buffer += "\n"
|
buffer += "\n"
|
||||||
|
|
||||||
@ -441,14 +456,19 @@ HEADER_PREPROCESSOR = HeaderPreprocessor()
|
|||||||
class LinePreprocessor (Preprocessor):
|
class LinePreprocessor (Preprocessor):
|
||||||
"""Deals with HR lines (needs to be done before processing lists)"""
|
"""Deals with HR lines (needs to be done before processing lists)"""
|
||||||
|
|
||||||
|
blockquote_re = re.compile(r'^(> )+')
|
||||||
|
|
||||||
def run (self, lines):
|
def run (self, lines):
|
||||||
for i in range(len(lines)):
|
for i in range(len(lines)):
|
||||||
if self._isLine(lines[i]) :
|
prefix = ''
|
||||||
lines[i] = "<hr />"
|
m = self.blockquote_re.search(lines[i])
|
||||||
|
if m : prefix = m.group(0)
|
||||||
|
if self._isLine(lines[i][len(prefix):]):
|
||||||
|
lines[i] = prefix + self.stash.store("<hr />", safe=True)
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
def _isLine(self, block):
|
def _isLine(self, block):
|
||||||
"""Determines if a block should be replaced with an <HR>"""
|
"""Determines if a block should be replaced with an <HR>"""
|
||||||
if block.startswith("    "): return 0 # a code block
|
if block.startswith("    "): return 0 # a code block
|
||||||
text = "".join([x for x in block if not x.isspace()])
|
text = "".join([x for x in block if not x.isspace()])
|
||||||
if len(text) <= 2:
|
if len(text) <= 2:
|
||||||
@ -463,19 +483,6 @@ class LinePreprocessor (Preprocessor):
|
|||||||
LINE_PREPROCESSOR = LinePreprocessor()
|
LINE_PREPROCESSOR = LinePreprocessor()
|
||||||
|
|
||||||
|
|
||||||
class LineBreaksPreprocessor (Preprocessor):
|
|
||||||
"""Replaces double spaces at the end of the lines with <br/ >."""
|
|
||||||
|
|
||||||
def run (self, lines) :
|
|
||||||
for i in range(len(lines)) :
|
|
||||||
if (lines[i].endswith("  ")
|
|
||||||
and not RE.regExp['tabbed'].match(lines[i]) ):
|
|
||||||
lines[i] += "<br />"
|
|
||||||
return lines
|
|
||||||
|
|
||||||
LINE_BREAKS_PREPROCESSOR = LineBreaksPreprocessor()
|
|
||||||
|
|
||||||
|
|
||||||
class HtmlBlockPreprocessor (Preprocessor):
|
class HtmlBlockPreprocessor (Preprocessor):
|
||||||
"""Removes html blocks from self.lines"""
|
"""Removes html blocks from self.lines"""
|
||||||
|
|
||||||
@ -507,7 +514,6 @@ class HtmlBlockPreprocessor (Preprocessor):
|
|||||||
def run (self, text):
|
def run (self, text):
|
||||||
|
|
||||||
new_blocks = []
|
new_blocks = []
|
||||||
#text = "\n".join(lines)
|
|
||||||
text = text.split("\n\n")
|
text = text.split("\n\n")
|
||||||
|
|
||||||
items = []
|
items = []
|
||||||
@ -570,7 +576,7 @@ class HtmlBlockPreprocessor (Preprocessor):
|
|||||||
new_blocks.append(self.stash.store('\n\n'.join(items)))
|
new_blocks.append(self.stash.store('\n\n'.join(items)))
|
||||||
new_blocks.append('\n')
|
new_blocks.append('\n')
|
||||||
|
|
||||||
return "\n\n".join(new_blocks) #.split("\n")
|
return "\n\n".join(new_blocks)
|
||||||
|
|
||||||
HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
|
HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
|
||||||
|
|
||||||
@ -648,9 +654,10 @@ So, we apply the expressions in the following order:
|
|||||||
|
|
||||||
NOBRACKET = r'[^\]\[]*'
|
NOBRACKET = r'[^\]\[]*'
|
||||||
BRK = ( r'\[('
|
BRK = ( r'\[('
|
||||||
+ (NOBRACKET + r'(\['+NOBRACKET)*6
|
+ (NOBRACKET + r'(\[')*6
|
||||||
+ (NOBRACKET+ r'\])*'+NOBRACKET)*6
|
+ (NOBRACKET+ r'\])*')*6
|
||||||
+ NOBRACKET + r')\]' )
|
+ NOBRACKET + r')\]' )
|
||||||
|
NOIMG = r'(?<!\!)'
|
||||||
|
|
||||||
BACKTICK_RE = r'\`([^\`]*)\`' # `e= m*c^2`
|
BACKTICK_RE = r'\`([^\`]*)\`' # `e= m*c^2`
|
||||||
DOUBLE_BACKTICK_RE = r'\`\`(.*)\`\`' # ``e=f("`")``
|
DOUBLE_BACKTICK_RE = r'\`\`(.*)\`\`' # ``e=f("`")``
|
||||||
@ -667,10 +674,10 @@ else :
|
|||||||
STRONG_2_RE = r'__([^_]*)__' # __strong__
|
STRONG_2_RE = r'__([^_]*)__' # __strong__
|
||||||
STRONG_EM_2_RE = r'___([^_]*)___' # ___strong___
|
STRONG_EM_2_RE = r'___([^_]*)___' # ___strong___
|
||||||
|
|
||||||
LINK_RE = BRK + r'\s*\(([^\)]*)\)' # [text](url)
|
LINK_RE = NOIMG + BRK + r'\s*\(([^\)]*)\)' # [text](url)
|
||||||
LINK_ANGLED_RE = BRK + r'\s*\(<([^\)]*)>\)' # [text](<url>)
|
LINK_ANGLED_RE = NOIMG + BRK + r'\s*\(<([^\)]*)>\)' # [text](<url>)
|
||||||
IMAGE_LINK_RE = r'\!' + BRK + r'\s*\(([^\)]*)\)' # ![alttxt](http://x.com/)
|
IMAGE_LINK_RE = r'\!' + BRK + r'\s*\(([^\)]*)\)' # ![alttxt](http://x.com/)
|
||||||
REFERENCE_RE = BRK+ r'\s*\[([^\]]*)\]' # [Google][3]
|
REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]' # [Google][3]
|
||||||
IMAGE_REFERENCE_RE = r'\!' + BRK + '\s*\[([^\]]*)\]' # ![alt text][2]
|
IMAGE_REFERENCE_RE = r'\!' + BRK + '\s*\[([^\]]*)\]' # ![alt text][2]
|
||||||
NOT_STRONG_RE = r'( \* )' # stand-alone * or _
|
NOT_STRONG_RE = r'( \* )' # stand-alone * or _
|
||||||
AUTOLINK_RE = r'<(http://[^>]*)>' # <http://www.123.com>
|
AUTOLINK_RE = r'<(http://[^>]*)>' # <http://www.123.com>
|
||||||
@ -678,6 +685,8 @@ AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>' # <me@example.com>
|
|||||||
#HTML_RE = r'(\<[^\>]*\>)' # <...>
|
#HTML_RE = r'(\<[^\>]*\>)' # <...>
|
||||||
HTML_RE = r'(\<[a-zA-Z/][^\>]*\>)' # <...>
|
HTML_RE = r'(\<[a-zA-Z/][^\>]*\>)' # <...>
|
||||||
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # &amp;
|
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # &amp;
|
||||||
|
LINE_BREAK_RE = r'  \n' # two spaces at end of line
|
||||||
|
LINE_BREAK_2_RE = r'  $' # two spaces at end of text
|
||||||
|
|
||||||
class Pattern:
|
class Pattern:
|
||||||
|
|
||||||
@ -706,6 +715,11 @@ class SimpleTagPattern (Pattern):
|
|||||||
el.appendChild(doc.createTextNode(m.group(2)))
|
el.appendChild(doc.createTextNode(m.group(2)))
|
||||||
return el
|
return el
|
||||||
|
|
||||||
|
class SubstituteTagPattern (SimpleTagPattern):
|
||||||
|
|
||||||
|
def handleMatch (self, m, doc):
|
||||||
|
return doc.createElement(self.tag)
|
||||||
|
|
||||||
class BacktickPattern (Pattern):
|
class BacktickPattern (Pattern):
|
||||||
|
|
||||||
def __init__ (self, pattern):
|
def __init__ (self, pattern):
|
||||||
@ -734,7 +748,9 @@ class DoubleTagPattern (SimpleTagPattern) :
|
|||||||
class HtmlPattern (Pattern):
|
class HtmlPattern (Pattern):
|
||||||
|
|
||||||
def handleMatch (self, m, doc):
|
def handleMatch (self, m, doc):
|
||||||
place_holder = self.stash.store(m.group(2))
|
rawhtml = m.group(2)
|
||||||
|
inline = True
|
||||||
|
place_holder = self.stash.store(rawhtml)
|
||||||
return doc.createTextNode(place_holder)
|
return doc.createTextNode(place_holder)
|
||||||
|
|
||||||
|
|
||||||
@ -762,7 +778,10 @@ class ImagePattern (Pattern):
|
|||||||
def handleMatch(self, m, doc):
|
def handleMatch(self, m, doc):
|
||||||
el = doc.createElement('img')
|
el = doc.createElement('img')
|
||||||
src_parts = m.group(9).split()
|
src_parts = m.group(9).split()
|
||||||
|
if src_parts:
|
||||||
el.setAttribute('src', src_parts[0])
|
el.setAttribute('src', src_parts[0])
|
||||||
|
else:
|
||||||
|
el.setAttribute('src', "")
|
||||||
if len(src_parts) > 1:
|
if len(src_parts) > 1:
|
||||||
el.setAttribute('title', dequote(" ".join(src_parts[1:])))
|
el.setAttribute('title', dequote(" ".join(src_parts[1:])))
|
||||||
if ENABLE_ATTRIBUTES:
|
if ENABLE_ATTRIBUTES:
|
||||||
@ -849,6 +868,9 @@ EMPHASIS_PATTERN_2 = SimpleTagPattern(EMPHASIS_2_RE, 'em')
|
|||||||
STRONG_EM_PATTERN = DoubleTagPattern(STRONG_EM_RE, 'strong,em')
|
STRONG_EM_PATTERN = DoubleTagPattern(STRONG_EM_RE, 'strong,em')
|
||||||
STRONG_EM_PATTERN_2 = DoubleTagPattern(STRONG_EM_2_RE, 'strong,em')
|
STRONG_EM_PATTERN_2 = DoubleTagPattern(STRONG_EM_2_RE, 'strong,em')
|
||||||
|
|
||||||
|
LINE_BREAK_PATTERN = SubstituteTagPattern(LINE_BREAK_RE, 'br ')
|
||||||
|
LINE_BREAK_PATTERN_2 = SubstituteTagPattern(LINE_BREAK_2_RE, 'br ')
|
||||||
|
|
||||||
LINK_PATTERN = LinkPattern(LINK_RE)
|
LINK_PATTERN = LinkPattern(LINK_RE)
|
||||||
LINK_ANGLED_PATTERN = LinkPattern(LINK_ANGLED_RE)
|
LINK_ANGLED_PATTERN = LinkPattern(LINK_ANGLED_RE)
|
||||||
IMAGE_LINK_PATTERN = ImagePattern(IMAGE_LINK_RE)
|
IMAGE_LINK_PATTERN = ImagePattern(IMAGE_LINK_RE)
|
||||||
@ -882,6 +904,51 @@ class Postprocessor :
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
======================================================================
|
||||||
|
======================== TEXT-POST-PROCESSORS ========================
|
||||||
|
======================================================================
|
||||||
|
|
||||||
|
Markdown also allows text-post-processors, which are similar to
|
||||||
|
textpreprocessors in that they need to implement a "run" method.
|
||||||
|
Unlike post-processors, they take a text string as a parameter and
|
||||||
|
should return a string.
|
||||||
|
|
||||||
|
Text-Post-Processors should extend markdown.Postprocessor.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class RawHtmlTextPostprocessor(Postprocessor):
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def run(self, text):
|
||||||
|
for i in range(self.stash.html_counter):
|
||||||
|
html, safe = self.stash.rawHtmlBlocks[i]
|
||||||
|
if self.safeMode and not safe:
|
||||||
|
if str(self.safeMode).lower() == 'escape':
|
||||||
|
html = self.escape(html)
|
||||||
|
elif str(self.safeMode).lower() == 'remove':
|
||||||
|
html = ''
|
||||||
|
else:
|
||||||
|
html = HTML_REMOVED_TEXT
|
||||||
|
|
||||||
|
text = text.replace("<p>%s\n</p>" % (HTML_PLACEHOLDER % i),
|
||||||
|
html + "\n")
|
||||||
|
text = text.replace(HTML_PLACEHOLDER % i, html)
|
||||||
|
return text
|
||||||
|
|
||||||
|
def escape(self, html):
|
||||||
|
''' Basic html escaping '''
|
||||||
|
html = html.replace('&', '&amp;')
|
||||||
|
html = html.replace('<', '&lt;')
|
||||||
|
html = html.replace('>', '&gt;')
|
||||||
|
return html.replace('"', '&quot;')
|
||||||
|
|
||||||
|
RAWHTMLTEXTPOSTPROCESSOR = RawHtmlTextPostprocessor()
|
||||||
|
|
||||||
"""
|
"""
|
||||||
======================================================================
|
======================================================================
|
||||||
========================== MISC AUXILIARY CLASSES ====================
|
========================== MISC AUXILIARY CLASSES ====================
|
||||||
@ -896,14 +963,16 @@ class HtmlStash :
|
|||||||
self.html_counter = 0 # for counting inline html segments
|
self.html_counter = 0 # for counting inline html segments
|
||||||
self.rawHtmlBlocks=[]
|
self.rawHtmlBlocks=[]
|
||||||
|
|
||||||
def store(self, html) :
|
def store(self, html, safe=False):
|
||||||
"""Saves an HTML segment for later reinsertion. Returns a
|
"""Saves an HTML segment for later reinsertion. Returns a
|
||||||
placeholder string that needs to be inserted into the
|
placeholder string that needs to be inserted into the
|
||||||
document.
|
document.
|
||||||
|
|
||||||
@param html: an html segment
|
@param html: an html segment
|
||||||
|
@param safe: label an html segment as safe for safemode
|
||||||
|
@param inline: label a segment as inline html
|
||||||
@returns : a placeholder string """
|
@returns : a placeholder string """
|
||||||
self.rawHtmlBlocks.append(html)
|
self.rawHtmlBlocks.append((html, safe))
|
||||||
placeholder = HTML_PLACEHOLDER % self.html_counter
|
placeholder = HTML_PLACEHOLDER % self.html_counter
|
||||||
self.html_counter += 1
|
self.html_counter += 1
|
||||||
return placeholder
|
return placeholder
|
||||||
@ -925,6 +994,7 @@ class BlockGuru :
|
|||||||
remainder of the original list"""
|
remainder of the original list"""
|
||||||
|
|
||||||
items = []
|
items = []
|
||||||
|
item = -1
|
||||||
|
|
||||||
i = 0 # to keep track of where we are
|
i = 0 # to keep track of where we are
|
||||||
|
|
||||||
@ -1043,31 +1113,30 @@ class Markdown:
|
|||||||
Markdown text """
|
Markdown text """
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, source=None, # deprecated
|
def __init__(self, source=None, # deprecated
|
||||||
extensions=[],
|
extensions=[],
|
||||||
extension_configs=None,
|
extension_configs=None,
|
||||||
encoding="utf-8",
|
|
||||||
safe_mode = False):
|
safe_mode = False):
|
||||||
"""Creates a new Markdown instance.
|
"""Creates a new Markdown instance.
|
||||||
|
|
||||||
@param source: The text in Markdown format.
|
@param source: The text in Markdown format. Deprecated!
|
||||||
@param encoding: The character encoding of <text>. """
|
@param extensions: A list of extensions.
|
||||||
|
@param extension_configs: Configuration settings for extensions.
|
||||||
|
@param safe_mode: Disallow raw html. """
|
||||||
|
|
||||||
self.safeMode = safe_mode
|
|
||||||
self.encoding = encoding
|
|
||||||
self.source = source
|
self.source = source
|
||||||
|
if source is not None:
|
||||||
|
message(WARN, "The `source` arg of Markdown.__init__() is depreciated and will be removed in the future. Use `instance.convert(source)` instead.")
|
||||||
|
self.safeMode = safe_mode
|
||||||
self.blockGuru = BlockGuru()
|
self.blockGuru = BlockGuru()
|
||||||
self.registeredExtensions = []
|
self.registeredExtensions = []
|
||||||
self.stripTopLevelTags = 1
|
self.stripTopLevelTags = 1
|
||||||
self.docType = ""
|
self.docType = ""
|
||||||
|
|
||||||
|
|
||||||
self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR]
|
self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR]
|
||||||
|
|
||||||
self.preprocessors = [
|
self.preprocessors = [HEADER_PREPROCESSOR,
|
||||||
HEADER_PREPROCESSOR,
|
|
||||||
LINE_PREPROCESSOR,
|
LINE_PREPROCESSOR,
|
||||||
LINE_BREAKS_PREPROCESSOR,
|
|
||||||
# A footnote preprocessor will
|
# A footnote preprocessor will
|
||||||
# get inserted here
|
# get inserted here
|
||||||
REFERENCE_PREPROCESSOR]
|
REFERENCE_PREPROCESSOR]
|
||||||
@ -1076,8 +1145,9 @@ class Markdown:
|
|||||||
self.postprocessors = [] # a footnote postprocessor will get
|
self.postprocessors = [] # a footnote postprocessor will get
|
||||||
# inserted later
|
# inserted later
|
||||||
|
|
||||||
self.textPostprocessors = [] # a footnote postprocessor will get
|
self.textPostprocessors = [# a footnote postprocessor will get
|
||||||
# inserted later
|
# inserted here
|
||||||
|
RAWHTMLTEXTPOSTPROCESSOR]
|
||||||
|
|
||||||
self.prePatterns = []
|
self.prePatterns = []
|
||||||
|
|
||||||
@ -1085,13 +1155,15 @@ class Markdown:
|
|||||||
self.inlinePatterns = [DOUBLE_BACKTICK_PATTERN,
|
self.inlinePatterns = [DOUBLE_BACKTICK_PATTERN,
|
||||||
BACKTICK_PATTERN,
|
BACKTICK_PATTERN,
|
||||||
ESCAPE_PATTERN,
|
ESCAPE_PATTERN,
|
||||||
IMAGE_LINK_PATTERN,
|
|
||||||
IMAGE_REFERENCE_PATTERN,
|
|
||||||
REFERENCE_PATTERN,
|
REFERENCE_PATTERN,
|
||||||
LINK_ANGLED_PATTERN,
|
LINK_ANGLED_PATTERN,
|
||||||
LINK_PATTERN,
|
LINK_PATTERN,
|
||||||
|
IMAGE_LINK_PATTERN,
|
||||||
|
IMAGE_REFERENCE_PATTERN,
|
||||||
AUTOLINK_PATTERN,
|
AUTOLINK_PATTERN,
|
||||||
AUTOMAIL_PATTERN,
|
AUTOMAIL_PATTERN,
|
||||||
|
#LINE_BREAK_PATTERN_2, Removed by Kovid as causes problems with mdx_tables
|
||||||
|
LINE_BREAK_PATTERN,
|
||||||
HTML_PATTERN,
|
HTML_PATTERN,
|
||||||
ENTITY_PATTERN,
|
ENTITY_PATTERN,
|
||||||
NOT_STRONG_PATTERN,
|
NOT_STRONG_PATTERN,
|
||||||
@ -1121,6 +1193,7 @@ class Markdown:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
module = sys.modules[extension_module_name]
|
module = sys.modules[extension_module_name]
|
||||||
|
|
||||||
except:
|
except:
|
||||||
message(CRITICAL,
|
message(CRITICAL,
|
||||||
"couldn't load extension %s (looking for %s module)"
|
"couldn't load extension %s (looking for %s module)"
|
||||||
@ -1149,11 +1222,14 @@ class Markdown:
|
|||||||
self.htmlStash = HtmlStash()
|
self.htmlStash = HtmlStash()
|
||||||
|
|
||||||
HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash
|
HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash
|
||||||
|
LINE_PREPROCESSOR.stash = self.htmlStash
|
||||||
REFERENCE_PREPROCESSOR.references = self.references
|
REFERENCE_PREPROCESSOR.references = self.references
|
||||||
HTML_PATTERN.stash = self.htmlStash
|
HTML_PATTERN.stash = self.htmlStash
|
||||||
ENTITY_PATTERN.stash = self.htmlStash
|
ENTITY_PATTERN.stash = self.htmlStash
|
||||||
REFERENCE_PATTERN.references = self.references
|
REFERENCE_PATTERN.references = self.references
|
||||||
IMAGE_REFERENCE_PATTERN.references = self.references
|
IMAGE_REFERENCE_PATTERN.references = self.references
|
||||||
|
RAWHTMLTEXTPOSTPROCESSOR.stash = self.htmlStash
|
||||||
|
RAWHTMLTEXTPOSTPROCESSOR.safeMode = self.safeMode
|
||||||
|
|
||||||
for extension in self.registeredExtensions:
|
for extension in self.registeredExtensions:
|
||||||
extension.reset()
|
extension.reset()
|
||||||
@ -1173,7 +1249,7 @@ class Markdown:
|
|||||||
self.doc.appendChild(self.top_element)
|
self.doc.appendChild(self.top_element)
|
||||||
|
|
||||||
# Fixup the source text
|
# Fixup the source text
|
||||||
text = self.source #.strip()
|
text = self.source
|
||||||
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
||||||
text += "\n\n"
|
text += "\n\n"
|
||||||
text = text.expandtabs(TAB_LENGTH)
|
text = text.expandtabs(TAB_LENGTH)
|
||||||
@ -1226,7 +1302,9 @@ class Markdown:
|
|||||||
@param inList: a level
|
@param inList: a level
|
||||||
@returns: None"""
|
@returns: None"""
|
||||||
|
|
||||||
|
# Loop through lines until none left.
|
||||||
while lines:
|
while lines:
|
||||||
|
|
||||||
# Check if this section starts with a list, a blockquote or
|
# Check if this section starts with a list, a blockquote or
|
||||||
# a code block
|
# a code block
|
||||||
|
|
||||||
@ -1257,6 +1335,7 @@ class Markdown:
|
|||||||
#
|
#
|
||||||
|
|
||||||
if inList:
|
if inList:
|
||||||
|
|
||||||
start, lines = self._linesUntil(lines, (lambda line:
|
start, lines = self._linesUntil(lines, (lambda line:
|
||||||
RE.regExp['ul'].match(line)
|
RE.regExp['ul'].match(line)
|
||||||
or RE.regExp['ol'].match(line)
|
or RE.regExp['ol'].match(line)
|
||||||
@ -1264,15 +1343,25 @@ class Markdown:
|
|||||||
|
|
||||||
self._processSection(parent_elem, start,
|
self._processSection(parent_elem, start,
|
||||||
inList - 1, looseList = looseList)
|
inList - 1, looseList = looseList)
|
||||||
self._processSection(parent_elem, lines,
|
inList = inList-1
|
||||||
inList - 1, looseList = looseList)
|
|
||||||
|
|
||||||
|
|
||||||
else: # Ok, so it's just a simple block
|
else: # Ok, so it's just a simple block
|
||||||
|
|
||||||
paragraph, lines = self._linesUntil(lines, lambda line:
|
paragraph, lines = self._linesUntil(lines, lambda line:
|
||||||
not line.strip())
|
not line.strip())
|
||||||
|
|
||||||
if len(paragraph) and paragraph[0].startswith('#'):
|
if len(paragraph) and paragraph[0].startswith('#'):
|
||||||
|
self._processHeader(parent_elem, paragraph)
|
||||||
|
|
||||||
|
elif paragraph:
|
||||||
|
self._processParagraph(parent_elem, paragraph,
|
||||||
|
inList, looseList)
|
||||||
|
|
||||||
|
if lines and not lines[0].strip():
|
||||||
|
lines = lines[1:] # skip the first (blank) line
|
||||||
|
|
||||||
|
|
||||||
|
def _processHeader(self, parent_elem, paragraph):
|
||||||
m = RE.regExp['header'].match(paragraph[0])
|
m = RE.regExp['header'].match(paragraph[0])
|
||||||
if m:
|
if m:
|
||||||
level = len(m.group(1))
|
level = len(m.group(1))
|
||||||
@ -1283,13 +1372,13 @@ class Markdown:
|
|||||||
else:
|
else:
|
||||||
message(CRITICAL, "We've got a problem header!")
|
message(CRITICAL, "We've got a problem header!")
|
||||||
|
|
||||||
elif paragraph :
|
|
||||||
|
def _processParagraph(self, parent_elem, paragraph, inList, looseList):
|
||||||
list = self._handleInlineWrapper("\n".join(paragraph))
|
list = self._handleInlineWrapper("\n".join(paragraph))
|
||||||
|
|
||||||
if ( parent_elem.nodeName == 'li'
|
if ( parent_elem.nodeName == 'li'
|
||||||
and not (looseList or parent_elem.childNodes)):
|
and not (looseList or parent_elem.childNodes)):
|
||||||
|
|
||||||
#and not parent_elem.childNodes) :
|
|
||||||
# If this is the first paragraph inside "li", don't
|
# If this is the first paragraph inside "li", don't
|
||||||
# put <p> around it - append the paragraph bits directly
|
# put <p> around it - append the paragraph bits directly
|
||||||
# onto parent_elem
|
# onto parent_elem
|
||||||
@ -1302,10 +1391,6 @@ class Markdown:
|
|||||||
for item in list:
|
for item in list:
|
||||||
el.appendChild(item)
|
el.appendChild(item)
|
||||||
|
|
||||||
if lines and not lines[0].strip():
|
|
||||||
lines = lines[1:] # skip the first (blank) line
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _processUList(self, parent_elem, lines, inList):
|
def _processUList(self, parent_elem, lines, inList):
|
||||||
self._processList(parent_elem, lines, inList,
|
self._processList(parent_elem, lines, inList,
|
||||||
@ -1428,15 +1513,22 @@ class Markdown:
|
|||||||
|
|
||||||
dequoted = []
|
dequoted = []
|
||||||
i = 0
|
i = 0
|
||||||
|
blank_line = False # allow one blank line between paragraphs
|
||||||
for line in lines:
|
for line in lines:
|
||||||
m = RE.regExp['quoted'].match(line)
|
m = RE.regExp['quoted'].match(line)
|
||||||
if m:
|
if m:
|
||||||
dequoted.append(m.group(1))
|
dequoted.append(m.group(1))
|
||||||
i += 1
|
i += 1
|
||||||
|
blank_line = False
|
||||||
|
elif not blank_line and line.strip() != '':
|
||||||
|
dequoted.append(line)
|
||||||
|
i += 1
|
||||||
|
elif not blank_line and line.strip() == '':
|
||||||
|
dequoted.append(line)
|
||||||
|
i += 1
|
||||||
|
blank_line = True
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
else :
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
blockquote = self.doc.createElement('blockquote')
|
blockquote = self.doc.createElement('blockquote')
|
||||||
parent_elem.appendChild(blockquote)
|
parent_elem.appendChild(blockquote)
|
||||||
@ -1471,11 +1563,11 @@ class Markdown:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _handleInlineWrapper (self, line) :
|
def _handleInlineWrapper (self, line, patternIndex=0):
|
||||||
|
|
||||||
parts = [line]
|
parts = [line]
|
||||||
|
|
||||||
for pattern in self.inlinePatterns :
|
while patternIndex < len(self.inlinePatterns):
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
|
|
||||||
@ -1484,7 +1576,9 @@ class Markdown:
|
|||||||
x = parts[i]
|
x = parts[i]
|
||||||
|
|
||||||
if isinstance(x, (str, unicode)):
|
if isinstance(x, (str, unicode)):
|
||||||
result = self._applyPattern(x, pattern)
|
result = self._applyPattern(x, \
|
||||||
|
self.inlinePatterns[patternIndex], \
|
||||||
|
patternIndex)
|
||||||
|
|
||||||
if result:
|
if result:
|
||||||
i -= 1
|
i -= 1
|
||||||
@ -1493,6 +1587,7 @@ class Markdown:
|
|||||||
parts.insert(i+1,y)
|
parts.insert(i+1,y)
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
|
patternIndex += 1
|
||||||
|
|
||||||
for i in range(len(parts)):
|
for i in range(len(parts)):
|
||||||
x = parts[i]
|
x = parts[i]
|
||||||
@ -1521,7 +1616,7 @@ class Markdown:
|
|||||||
|
|
||||||
return [self.doc.createTextNode(line)]
|
return [self.doc.createTextNode(line)]
|
||||||
|
|
||||||
def _applyPattern(self, line, pattern) :
|
def _applyPattern(self, line, pattern, patternIndex=0):
|
||||||
|
|
||||||
""" Given a pattern name, this function checks if the line
|
""" Given a pattern name, this function checks if the line
|
||||||
fits the pattern, creates the necessary elements, and returns
|
fits the pattern, creates the necessary elements, and returns
|
||||||
@ -1555,7 +1650,7 @@ class Markdown:
|
|||||||
for child in node.childNodes:
|
for child in node.childNodes:
|
||||||
if isinstance(child, TextNode):
|
if isinstance(child, TextNode):
|
||||||
|
|
||||||
result = self._handleInlineWrapper(child.value)
|
result = self._handleInlineWrapper(child.value, patternIndex+1)
|
||||||
|
|
||||||
if result:
|
if result:
|
||||||
|
|
||||||
@ -1594,15 +1689,18 @@ class Markdown:
|
|||||||
"""Return the document in XHTML format.
|
"""Return the document in XHTML format.
|
||||||
|
|
||||||
@returns: A serialized XHTML body."""
|
@returns: A serialized XHTML body."""
|
||||||
#try :
|
|
||||||
|
|
||||||
if source :
|
if source is not None: #Allow blank string
|
||||||
self.source = source
|
self.source = source
|
||||||
|
|
||||||
if not self.source:
|
if not self.source:
|
||||||
return ""
|
return u""
|
||||||
|
|
||||||
self.source = removeBOM(self.source, self.encoding)
|
try:
|
||||||
|
self.source = unicode(self.source)
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
|
||||||
|
return u""
|
||||||
|
|
||||||
for pp in self.textPreprocessors:
|
for pp in self.textPreprocessors:
|
||||||
self.source = pp.run(self.source)
|
self.source = pp.run(self.source)
|
||||||
@ -1610,22 +1708,8 @@ class Markdown:
|
|||||||
doc = self._transform()
|
doc = self._transform()
|
||||||
xml = doc.toxml()
|
xml = doc.toxml()
|
||||||
|
|
||||||
#finally:
|
|
||||||
# doc.unlink()
|
|
||||||
|
|
||||||
# Let's stick in all the raw html pieces
|
# Return everything but the top level tag
|
||||||
|
|
||||||
for i in range(self.htmlStash.html_counter) :
|
|
||||||
html = self.htmlStash.rawHtmlBlocks[i]
|
|
||||||
if self.safeMode and html != "<hr />" and html != "<br />":
|
|
||||||
html = HTML_REMOVED_TEXT
|
|
||||||
|
|
||||||
xml = xml.replace("<p>%s\n</p>" % (HTML_PLACEHOLDER % i),
|
|
||||||
html + "\n")
|
|
||||||
xml = xml.replace(HTML_PLACEHOLDER % i,
|
|
||||||
html)
|
|
||||||
|
|
||||||
# And return everything but the top level tag
|
|
||||||
|
|
||||||
if self.stripTopLevelTags:
|
if self.stripTopLevelTags:
|
||||||
xml = xml.strip()[23:-7] + "\n"
|
xml = xml.strip()[23:-7] + "\n"
|
||||||
@ -1636,20 +1720,18 @@ class Markdown:
|
|||||||
return (self.docType + xml).strip()
|
return (self.docType + xml).strip()
|
||||||
|
|
||||||
|
|
||||||
__str__ = convert # deprecated - will be changed in 1.7 to report
|
def __str__(self):
|
||||||
# information about the MD instance
|
''' Report info about instance. Markdown always returns unicode. '''
|
||||||
|
if self.source is None:
|
||||||
|
status = 'in which no source text has been assinged.'
|
||||||
|
else:
|
||||||
|
status = 'which contains %d chars and %d line(s) of source.'%\
|
||||||
|
(len(self.source), self.source.count('\n')+1)
|
||||||
|
return 'An instance of "%s" %s'% (self.__class__, status)
|
||||||
|
|
||||||
toString = __str__ # toString() method is deprecated
|
__unicode__ = convert # markdown should always return a unicode string
|
||||||
|
|
||||||
|
|
||||||
def __unicode__(self):
|
|
||||||
"""Return the document in XHTML format as a Unicode object.
|
|
||||||
"""
|
|
||||||
return str(self)#.decode(self.encoding)
|
|
||||||
|
|
||||||
|
|
||||||
toUnicode = __unicode__ # deprecated - will be removed in 1.7
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -1662,11 +1744,10 @@ def markdownFromFile(input = None,
|
|||||||
message_threshold = CRITICAL,
|
message_threshold = CRITICAL,
|
||||||
safe = False):
|
safe = False):
|
||||||
|
|
||||||
global MESSAGE_THRESHOLD
|
global console_hndlr
|
||||||
MESSAGE_THRESHOLD = message_threshold
|
console_hndlr.setLevel(message_threshold)
|
||||||
|
|
||||||
message(VERBOSE, "input file: %s" % input)
|
|
||||||
|
|
||||||
|
message(DEBUG, "input file: %s" % input)
|
||||||
|
|
||||||
if not encoding:
|
if not encoding:
|
||||||
encoding = "utf-8"
|
encoding = "utf-8"
|
||||||
@ -1675,7 +1756,9 @@ def markdownFromFile(input = None,
|
|||||||
text = input_file.read()
|
text = input_file.read()
|
||||||
input_file.close()
|
input_file.close()
|
||||||
|
|
||||||
new_text = markdown(text, extensions, encoding, safe_mode = safe)
|
text = removeBOM(text, encoding)
|
||||||
|
|
||||||
|
new_text = markdown(text, extensions, safe_mode = safe)
|
||||||
|
|
||||||
if output:
|
if output:
|
||||||
output_file = codecs.open(output, "w", encoding=encoding)
|
output_file = codecs.open(output, "w", encoding=encoding)
|
||||||
@ -1687,10 +1770,9 @@ def markdownFromFile(input = None,
|
|||||||
|
|
||||||
def markdown(text,
|
def markdown(text,
|
||||||
extensions = [],
|
extensions = [],
|
||||||
encoding = None,
|
|
||||||
safe_mode = False):
|
safe_mode = False):
|
||||||
|
|
||||||
message(VERBOSE, "in markdown.markdown(), received text:\n%s" % text)
|
message(DEBUG, "in markdown.markdown(), received text:\n%s" % text)
|
||||||
|
|
||||||
extension_names = []
|
extension_names = []
|
||||||
extension_configs = {}
|
extension_configs = {}
|
||||||
@ -1764,17 +1846,17 @@ def parse_options() :
|
|||||||
parser.add_option("-e", "--encoding", dest="encoding",
|
parser.add_option("-e", "--encoding", dest="encoding",
|
||||||
help="encoding for input and output files",)
|
help="encoding for input and output files",)
|
||||||
parser.add_option("-q", "--quiet", default = CRITICAL,
|
parser.add_option("-q", "--quiet", default = CRITICAL,
|
||||||
action="store_const", const=NONE, dest="verbose",
|
action="store_const", const=60, dest="verbose",
|
||||||
help="suppress all messages")
|
help="suppress all messages")
|
||||||
parser.add_option("-v", "--verbose",
|
parser.add_option("-v", "--verbose",
|
||||||
action="store_const", const=INFO, dest="verbose",
|
action="store_const", const=INFO, dest="verbose",
|
||||||
help="print info messages")
|
help="print info messages")
|
||||||
parser.add_option("-s", "--safe",
|
parser.add_option("-s", "--safe", dest="safe", default=False,
|
||||||
action="store_const", const=True, dest="safe",
|
metavar="SAFE_MODE",
|
||||||
help="same mode (strip user's HTML tag)")
|
help="same mode ('replace', 'remove' or 'escape' user's HTML tag)")
|
||||||
|
|
||||||
parser.add_option("--noisy",
|
parser.add_option("--noisy",
|
||||||
action="store_const", const=VERBOSE, dest="verbose",
|
action="store_const", const=DEBUG, dest="verbose",
|
||||||
help="print debug messages")
|
help="print debug messages")
|
||||||
parser.add_option("-x", "--extension", action="append", dest="extensions",
|
parser.add_option("-x", "--extension", action="append", dest="extensions",
|
||||||
help = "load extension EXTENSION", metavar="EXTENSION")
|
help = "load extension EXTENSION", metavar="EXTENSION")
|
||||||
@ -1799,28 +1881,10 @@ def parse_options() :
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
options = parse_options()
|
options = parse_options()
|
||||||
|
|
||||||
#if os.access(inFile, os.R_OK):
|
|
||||||
|
|
||||||
if not options:
|
if not options:
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
markdownFromFile(**options)
|
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
markdownFromFile(**options)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
""" Run Markdown from the command line. """
|
""" Run Markdown from the command line. """
|
||||||
sys.exit(main())
|
sys.exit(main)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user