Some text with an ABBR and a REF. Ignore REFERENCE and ref.
' + >>> print markdown.markdown(text, ['abbr']) +Some text with an ABBR and a REF. Ignore REFERENCE and ref.
Copyright 2007-2008 * [Waylan Limberg](http://achinghead.com/) @@ -23,14 +23,18 @@ Copyright 2007-2008 ''' +from __future__ import absolute_import +from __future__ import unicode_literals +from . import Extension +from ..preprocessors import Preprocessor +from ..inlinepatterns import Pattern +from ..util import etree import re -import calibre.ebooks.markdown.markdown as markdown -from calibre.ebooks.markdown.markdown import etree # Global Vars ABBR_REF_RE = re.compile(r'[*]\[(?P[^\]]*)\][ ]?:\s*(?PNote
+This is the first line inside the box
+Did you know?
+Another line here.
+Note
` + title = klass.capitalize() + elif title == '': + # an explicit blank title should not be rendered + # e.g.: `!!! warning ""` will *not* render `p` with a title + title = None + return klass, title + + +def makeExtension(configs={}): + return AdmonitionExtension(configs=configs) diff --git a/src/calibre/ebooks/markdown/extensions/attr_list.py b/src/calibre/ebooks/markdown/extensions/attr_list.py new file mode 100644 index 0000000000..c98aa850a6 --- /dev/null +++ b/src/calibre/ebooks/markdown/extensions/attr_list.py @@ -0,0 +1,140 @@ +""" +Attribute List Extension for Python-Markdown +============================================ + +Adds attribute list syntax. Inspired by +[maruku](http://maruku.rubyforge.org/proposal.html#attribute_lists)'s +feature of the same name. + +Copyright 2011 [Waylan Limberg](http://achinghead.com/). + +Contact: markdown@freewisdom.org + +License: BSD (see ../LICENSE.md for details) + +Dependencies: +* [Python 2.4+](http://python.org) +* [Markdown 2.1+](http://packages.python.org/Markdown/) + +""" + +from __future__ import absolute_import +from __future__ import unicode_literals +from . import Extension +from ..treeprocessors import Treeprocessor +from ..util import isBlockLevel +import re + +try: + Scanner = re.Scanner +except AttributeError: + # must be on Python 2.4 + from sre import Scanner + +def _handle_double_quote(s, t): + k, v = t.split('=') + return k, v.strip('"') + +def _handle_single_quote(s, t): + k, v = t.split('=') + return k, v.strip("'") + +def _handle_key_value(s, t): + return t.split('=') + +def _handle_word(s, t): + if t.startswith('.'): + return '.', t[1:] + if t.startswith('#'): + return 'id', t[1:] + return t, t + +_scanner = Scanner([ + (r'[^ ]+=".*?"', _handle_double_quote), + (r"[^ ]+='.*?'", _handle_single_quote), + (r'[^ ]+=[^ ]*', _handle_key_value), + (r'[^ ]+', _handle_word), + (r' ', None) +]) + +def get_attrs(str): + """ Parse attribute list and return a list of attribute tuples. 
""" + return _scanner.scan(str)[0] + +def isheader(elem): + return elem.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] + +class AttrListTreeprocessor(Treeprocessor): + + BASE_RE = r'\{\:?([^\}]*)\}' + HEADER_RE = re.compile(r'[ ]*%s[ ]*$' % BASE_RE) + BLOCK_RE = re.compile(r'\n[ ]*%s[ ]*$' % BASE_RE) + INLINE_RE = re.compile(r'^%s' % BASE_RE) + NAME_RE = re.compile(r'[^A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d' + r'\u037f-\u1fff\u200c-\u200d\u2070-\u218f\u2c00-\u2fef' + r'\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd' + r'\:\-\.0-9\u00b7\u0300-\u036f\u203f-\u2040]+') + + def run(self, doc): + for elem in doc.getiterator(): + if isBlockLevel(elem.tag): + # Block level: check for attrs on last line of text + RE = self.BLOCK_RE + if isheader(elem): + # header: check for attrs at end of line + RE = self.HEADER_RE + if len(elem) and elem[-1].tail: + # has children. Get from tail of last child + m = RE.search(elem[-1].tail) + if m: + self.assign_attrs(elem, m.group(1)) + elem[-1].tail = elem[-1].tail[:m.start()] + if isheader(elem): + # clean up trailing #s + elem[-1].tail = elem[-1].tail.rstrip('#').rstrip() + elif elem.text: + # no children. Get from text. + m = RE.search(elem.text) + if m: + self.assign_attrs(elem, m.group(1)) + elem.text = elem.text[:m.start()] + if isheader(elem): + # clean up trailing #s + elem.text = elem.text.rstrip('#').rstrip() + else: + # inline: check for attrs at start of tail + if elem.tail: + m = self.INLINE_RE.match(elem.tail) + if m: + self.assign_attrs(elem, m.group(1)) + elem.tail = elem.tail[m.end():] + + def assign_attrs(self, elem, attrs): + """ Assign attrs to element. """ + for k, v in get_attrs(attrs): + if k == '.': + # add to class + cls = elem.get('class') + if cls: + elem.set('class', '%s %s' % (cls, v)) + else: + elem.set('class', v) + else: + # assign attr k with v + elem.set(self.sanitize_name(k), v) + + def sanitize_name(self, name): + """ + Sanitize name as 'an XML Name, minus the ":"'. 
+ See http://www.w3.org/TR/REC-xml-names/#NT-NCName + """ + return self.NAME_RE.sub('_', name) + + +class AttrListExtension(Extension): + def extendMarkdown(self, md, md_globals): + md.treeprocessors.add('attr_list', AttrListTreeprocessor(md), '>prettify') + + +def makeExtension(configs={}): + return AttrListExtension(configs=configs) diff --git a/src/calibre/ebooks/markdown/extensions/codehilite.py b/src/calibre/ebooks/markdown/extensions/codehilite.py index 42649ec252..72b40fde78 100644 --- a/src/calibre/ebooks/markdown/extensions/codehilite.py +++ b/src/calibre/ebooks/markdown/extensions/codehilite.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - """ CodeHilite Extension for Python-Markdown ======================================== @@ -8,149 +6,144 @@ Adds code/syntax highlighting to standard Python-Markdown code blocks. Copyright 2006-2008 [Waylan Limberg](http://achinghead.com/). -Project website:%s
\n' - for line in lines: - txt += '\t
- %s
\n'% line - txt += '
%s
\n'% \
+ (self.css_class, class_str, txt)
def _getLang(self):
- """
- Determines language of a code block from shebang lines and whether said
+ """
+ Determines language of a code block from shebang line and whether said
line should be removed or left in place. If the sheband line contains a
- path (even a single /) then it is assumed to be a real shebang lines and
- left alone. However, if no path is given (e.i.: #!python or :::python)
+ path (even a single /) then it is assumed to be a real shebang line and
+ left alone. However, if no path is given (e.i.: #!python or :::python)
then it is assumed to be a mock shebang for language identifitation of a
- code fragment and removed from the code block prior to processing for
- code highlighting. When a mock shebang (e.i: #!python) is found, line
- numbering is turned on. When colons are found in place of a shebang
- (e.i.: :::python), line numbering is left in the current state - off
+ code fragment and removed from the code block prior to processing for
+ code highlighting. When a mock shebang (e.i: #!python) is found, line
+ numbering is turned on. When colons are found in place of a shebang
+ (e.i.: :::python), line numbering is left in the current state - off
by default.
-
+
"""
import re
-
+
#split text into lines
lines = self.src.split("\n")
#pull first line to examine
fl = lines.pop(0)
-
+
c = re.compile(r'''
- (?:(?:::+)|(?PA paragraph before a fenced code block:
\\nFenced code block\\n
'
+ >>> print html
+ A paragraph before a fenced code block:
+Fenced code block
+
Works with safe_mode also (we check this because we are using the HtmlStash):
- >>> markdown.markdown(text, extensions=['fenced_code'], safe_mode='replace')
- u'A paragraph before a fenced code block:
\\nFenced code block\\n
'
-
+ >>> print markdown.markdown(text, extensions=['fenced_code'], safe_mode='replace')
+ A paragraph before a fenced code block:
+Fenced code block
+
+
Include tilde's in a code block and wrap with blank lines:
>>> text = '''
... ~~~~~~~~
- ...
+ ...
... ~~~~
- ...
... ~~~~~~~~'''
- >>> markdown.markdown(text, extensions=['fenced_code'])
- u'\\n~~~~\\n\\n
'
+ >>> print markdown.markdown(text, extensions=['fenced_code'])
+
+ ~~~~
+
-Multiple blocks and language tags:
+Language tags:
>>> text = '''
... ~~~~{.python}
- ... block one
- ... ~~~~
- ...
- ... ~~~~.html
- ... block two
+ ... # Some python code ... ~~~~''' - >>> markdown.markdown(text, extensions=['fenced_code']) - u'block one\\n
\\n\\n<p>block two</p>\\n
'
+ >>> print markdown.markdown(text, extensions=['fenced_code'])
+ # Some python code
+
+
+Optionally backticks instead of tildes as per how github's code block markdown is identified:
+
+ >>> text = '''
+ ... `````
+ ... # Arbitrary code
+ ... ~~~~~ # these tildes will not close the block
+ ... `````'''
+ >>> print markdown.markdown(text, extensions=['fenced_code'])
+ # Arbitrary code
+ ~~~~~ # these tildes will not close the block
+
Copyright 2007-2008 [Waylan Limberg](http://achinghead.com/).
-Project website: .*?)(?P=fence)[ ]*$',
+ r'(?P^(?:~{3,}|`{3,}))[ ]*(\{?\.?(?P[a-zA-Z0-9_+-]*)\}?)?[ ]*\n(?P.*?)(?<=\n)(?P=fence)[ ]*$',
re.MULTILINE|re.DOTALL
)
CODE_WRAP = '%s
'
LANG_TAG = ' class="%s"'
-
-class FencedCodeExtension(markdown.Extension):
+class FencedCodeExtension(Extension):
def extendMarkdown(self, md, md_globals):
""" Add FencedBlockPreprocessor to the Markdown instance. """
+ md.registerExtension(self)
- md.preprocessors.add('fenced_code_block',
- FencedBlockPreprocessor(md),
- "_begin")
+ md.preprocessors.add('fenced_code_block',
+ FencedBlockPreprocessor(md),
+ ">normalize_whitespace")
-class FencedBlockPreprocessor(markdown.preprocessors.Preprocessor):
-
+class FencedBlockPreprocessor(Preprocessor):
+
+ def __init__(self, md):
+ super(FencedBlockPreprocessor, self).__init__(md)
+
+ self.checked_for_codehilite = False
+ self.codehilite_conf = {}
+
def run(self, lines):
""" Match and store Fenced Code Blocks in the HtmlStash. """
+
+ # Check for code hilite extension
+ if not self.checked_for_codehilite:
+ for ext in self.markdown.registeredExtensions:
+ if isinstance(ext, CodeHiliteExtension):
+ self.codehilite_conf = ext.config
+ break
+
+ self.checked_for_codehilite = True
+
text = "\n".join(lines)
while 1:
m = FENCED_BLOCK_RE.search(text)
@@ -93,7 +126,22 @@ class FencedBlockPreprocessor(markdown.preprocessors.Preprocessor):
lang = ''
if m.group('lang'):
lang = LANG_TAG % m.group('lang')
- code = CODE_WRAP % (lang, self._escape(m.group('code')))
+
+ # If config is not empty, then the codehighlite extension
+ # is enabled, so we call it to highlite the code
+ if self.codehilite_conf:
+ highliter = CodeHilite(m.group('code'),
+ linenums=self.codehilite_conf['linenums'][0],
+ guess_lang=self.codehilite_conf['guess_lang'][0],
+ css_class=self.codehilite_conf['css_class'][0],
+ style=self.codehilite_conf['pygments_style'][0],
+ lang=(m.group('lang') or None),
+ noclasses=self.codehilite_conf['noclasses'][0])
+
+ code = highliter.hilite()
+ else:
+ code = CODE_WRAP % (lang, self._escape(m.group('code')))
+
placeholder = self.markdown.htmlStash.store(code, safe=True)
text = '%s\n%s\n%s'% (text[:m.start()], placeholder, text[m.end():])
else:
@@ -110,9 +158,4 @@ class FencedBlockPreprocessor(markdown.preprocessors.Preprocessor):
def makeExtension(configs=None):
- return FencedCodeExtension()
-
-
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
+ return FencedCodeExtension(configs=configs)
diff --git a/src/calibre/ebooks/markdown/extensions/footnotes.py b/src/calibre/ebooks/markdown/extensions/footnotes.py
index df137f5d53..65ed597a7b 100644
--- a/src/calibre/ebooks/markdown/extensions/footnotes.py
+++ b/src/calibre/ebooks/markdown/extensions/footnotes.py
@@ -23,33 +23,55 @@ Example:
"""
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from . import Extension
+from ..preprocessors import Preprocessor
+from ..inlinepatterns import Pattern
+from ..treeprocessors import Treeprocessor
+from ..postprocessors import Postprocessor
+from ..util import etree, text_type
+from ..odict import OrderedDict
import re
-import calibre.ebooks.markdown.markdown as markdown
-from calibre.ebooks.markdown.markdown import etree
FN_BACKLINK_TEXT = "zz1337820767766393qq"
NBSP_PLACEHOLDER = "qq3936677670287331zz"
-DEF_RE = re.compile(r'(\ ?\ ?\ ?)\[\^([^\]]*)\]:\s*(.*)')
+DEF_RE = re.compile(r'[ ]{0,3}\[\^([^\]]*)\]:\s*(.*)')
TABBED_RE = re.compile(r'((\t)|( ))(.*)')
-class FootnoteExtension(markdown.Extension):
+class FootnoteExtension(Extension):
""" Footnote Extension. """
def __init__ (self, configs):
""" Setup configs. """
self.config = {'PLACE_MARKER':
["///Footnotes Go Here///",
- "The text string that marks where the footnotes go"]}
+ "The text string that marks where the footnotes go"],
+ 'UNIQUE_IDS':
+ [False,
+ "Avoid name collisions across "
+ "multiple calls to reset()."],
+ "BACKLINK_TEXT":
+ ["↩",
+ "The text string that links from the footnote to the reader's place."]
+ }
for key, value in configs:
self.config[key][0] = value
+ # In multiple invocations, emit links that don't get tangled.
+ self.unique_prefix = 0
+
self.reset()
def extendMarkdown(self, md, md_globals):
""" Add pieces to Markdown. """
md.registerExtension(self)
self.parser = md.parser
+ self.md = md
+ self.sep = ':'
+ if self.md.output_format in ['html5', 'xhtml5']:
+ self.sep = '-'
# Insert a preprocessor before ReferencePreprocessor
md.preprocessors.add("footnote", FootnotePreprocessor(self),
"amp_substitute")
def reset(self):
- """ Clear the footnotes on reset. """
- self.footnotes = markdown.odict.OrderedDict()
+ """ Clear the footnotes on reset, and prepare for a distinct document. """
+ self.footnotes = OrderedDict()
+ self.unique_prefix += 1
def findFootnotesPlaceholder(self, root):
""" Return ElementTree Element that contains Footnote placeholder. """
@@ -76,13 +99,13 @@ class FootnoteExtension(markdown.Extension):
for child in element:
if child.text:
if child.text.find(self.getConfig("PLACE_MARKER")) > -1:
- return child, True
+ return child, element, True
if child.tail:
if child.tail.find(self.getConfig("PLACE_MARKER")) > -1:
- return (child, element), False
+ return child, element, False
finder(child)
return None
-
+
res = finder(root)
return res
@@ -92,16 +115,22 @@ class FootnoteExtension(markdown.Extension):
def makeFootnoteId(self, id):
""" Return footnote link id. """
- return 'fn:%s' % id
+ if self.getConfig("UNIQUE_IDS"):
+ return 'fn%s%d-%s' % (self.sep, self.unique_prefix, id)
+ else:
+ return 'fn%s%s' % (self.sep, id)
def makeFootnoteRefId(self, id):
""" Return footnote back-link id. """
- return 'fnref:%s' % id
+ if self.getConfig("UNIQUE_IDS"):
+ return 'fnref%s%d-%s' % (self.sep, self.unique_prefix, id)
+ else:
+ return 'fnref%s%s' % (self.sep, id)
def makeFootnotesDiv(self, root):
""" Return div of footnotes as et Element. """
- if not self.footnotes.keys():
+ if not list(self.footnotes.keys()):
return None
div = etree.Element("div")
@@ -115,7 +144,9 @@ class FootnoteExtension(markdown.Extension):
self.parser.parseChunk(li, self.footnotes[id])
backlink = etree.Element("a")
backlink.set("href", "#" + self.makeFootnoteRefId(id))
- backlink.set("rev", "footnote")
+ if self.md.output_format not in ['html5', 'xhtml5']:
+ backlink.set("rev", "footnote") # Invalid in HTML5
+ backlink.set("class", "footnote-backref")
backlink.set("title", "Jump back to footnote %d in the text" % \
(self.footnotes.index(id)+1))
backlink.text = FN_BACKLINK_TEXT
@@ -131,61 +162,39 @@ class FootnoteExtension(markdown.Extension):
return div
-class FootnotePreprocessor(markdown.preprocessors.Preprocessor):
+class FootnotePreprocessor(Preprocessor):
""" Find all footnote references and store for later use. """
def __init__ (self, footnotes):
self.footnotes = footnotes
def run(self, lines):
- lines = self._handleFootnoteDefinitions(lines)
- text = "\n".join(lines)
- return text.split("\n")
-
- def _handleFootnoteDefinitions(self, lines):
"""
- Recursively find all footnote definitions in lines.
+ Loop through lines and find, set, and remove footnote definitions.
Keywords:
* lines: A list of lines of text
- Return: A list of lines with footnote definitions removed.
+ Return: A list of lines of text with footnote definitions removed.
"""
- i, id, footnote = self._findFootnoteDefinition(lines)
-
- if id :
- plain = lines[:i]
- detabbed, theRest = self.detectTabbed(lines[i+1:])
- self.footnotes.setFootnote(id,
- footnote + "\n"
- + "\n".join(detabbed))
- more_plain = self._handleFootnoteDefinitions(theRest)
- return plain + [""] + more_plain
- else :
- return lines
-
- def _findFootnoteDefinition(self, lines):
- """
- Find the parts of a footnote definition.
-
- Keywords:
-
- * lines: A list of lines of text.
-
- Return: A three item tuple containing the index of the first line of a
- footnote definition, the id of the definition and the body of the
- definition.
-
- """
- counter = 0
- for line in lines:
- m = DEF_RE.match(line)
+ newlines = []
+ i = 0
+ while True:
+ m = DEF_RE.match(lines[i])
if m:
- return counter, m.group(2), m.group(3)
- counter += 1
- return counter, None, None
+ fn, _i = self.detectTabbed(lines[i+1:])
+ fn.insert(0, m.group(2))
+ i += _i-1 # skip past footnote
+ self.footnotes.setFootnote(m.group(1), "\n".join(fn))
+ else:
+ newlines.append(lines[i])
+ if len(lines) > i+1:
+ i += 1
+ else:
+ break
+ return newlines
def detectTabbed(self, lines):
""" Find indented text and remove indent before further proccesing.
@@ -194,11 +203,11 @@ class FootnotePreprocessor(markdown.preprocessors.Preprocessor):
* lines: an array of strings
- Returns: a list of post processed items and the unused
- remainder of the original list
+ Returns: a list of post processed items and the index of last line.
"""
items = []
+ blank_line = False # have we encountered a blank line yet?
i = 0 # to keep track of where we are
def detab(line):
@@ -208,15 +217,21 @@ class FootnotePreprocessor(markdown.preprocessors.Preprocessor):
for line in lines:
if line.strip(): # Non-blank line
- line = detab(line)
- if line:
+ detabbed_line = detab(line)
+ if detabbed_line:
+ items.append(detabbed_line)
+ i += 1
+ continue
+ elif not blank_line and not DEF_RE.match(line):
+ # not tabbed but still part of first par.
items.append(line)
i += 1
continue
else:
- return items, lines[i:]
+ return items, i+1
else: # Blank line: _maybe_ we are done.
+ blank_line = True
i += 1 # advance
# Find the next non-blank line
@@ -235,28 +250,33 @@ class FootnotePreprocessor(markdown.preprocessors.Preprocessor):
else:
i += 1
- return items, lines[i:]
+ return items, i
-class FootnotePattern(markdown.inlinepatterns.Pattern):
+class FootnotePattern(Pattern):
""" InlinePattern for footnote markers in a document's body text. """
def __init__(self, pattern, footnotes):
- markdown.inlinepatterns.Pattern.__init__(self, pattern)
+ super(FootnotePattern, self).__init__(pattern)
self.footnotes = footnotes
def handleMatch(self, m):
- sup = etree.Element("sup")
- a = etree.SubElement(sup, "a")
id = m.group(2)
- sup.set('id', self.footnotes.makeFootnoteRefId(id))
- a.set('href', '#' + self.footnotes.makeFootnoteId(id))
- a.set('rel', 'footnote')
- a.text = str(self.footnotes.footnotes.index(id) + 1)
- return sup
+ if id in self.footnotes.footnotes.keys():
+ sup = etree.Element("sup")
+ a = etree.SubElement(sup, "a")
+ sup.set('id', self.footnotes.makeFootnoteRefId(id))
+ a.set('href', '#' + self.footnotes.makeFootnoteId(id))
+ if self.footnotes.md.output_format not in ['html5', 'xhtml5']:
+ a.set('rel', 'footnote') # invalid in HTML5
+ a.set('class', 'footnote-ref')
+ a.text = text_type(self.footnotes.footnotes.index(id) + 1)
+ return sup
+ else:
+ return None
-class FootnoteTreeprocessor(markdown.treeprocessors.Treeprocessor):
+class FootnoteTreeprocessor(Treeprocessor):
""" Build and append footnote div to end of document. """
def __init__ (self, footnotes):
@@ -267,23 +287,24 @@ class FootnoteTreeprocessor(markdown.treeprocessors.Treeprocessor):
if footnotesDiv:
result = self.footnotes.findFootnotesPlaceholder(root)
if result:
- node, isText = result
+ child, parent, isText = result
+ ind = parent.getchildren().index(child)
if isText:
- node.text = None
- node.getchildren().insert(0, footnotesDiv)
+ parent.remove(child)
+ parent.insert(ind, footnotesDiv)
else:
- child, element = node
- ind = element.getchildren().find(child)
- element.getchildren().insert(ind + 1, footnotesDiv)
+ parent.insert(ind + 1, footnotesDiv)
child.tail = None
else:
root.append(footnotesDiv)
-class FootnotePostprocessor(markdown.postprocessors.Postprocessor):
+class FootnotePostprocessor(Postprocessor):
""" Replace placeholders with html entities. """
+ def __init__(self, footnotes):
+ self.footnotes = footnotes
def run(self, text):
- text = text.replace(FN_BACKLINK_TEXT, "↩")
+ text = text.replace(FN_BACKLINK_TEXT, self.footnotes.getConfig("BACKLINK_TEXT"))
return text.replace(NBSP_PLACEHOLDER, " ")
def makeExtension(configs=[]):
diff --git a/src/calibre/ebooks/markdown/extensions/headerid.py b/src/calibre/ebooks/markdown/extensions/headerid.py
index 0ffd91ee93..7681b8d499 100644
--- a/src/calibre/ebooks/markdown/extensions/headerid.py
+++ b/src/calibre/ebooks/markdown/extensions/headerid.py
@@ -1,28 +1,28 @@
-#!/usr/bin/python
-
"""
HeaderID Extension for Python-Markdown
======================================
-Adds ability to set HTML IDs for headers.
+Auto-generate id attributes for HTML headers.
Basic usage:
>>> import markdown
- >>> text = "# Some Header # {#some_id}"
+ >>> text = "# Some Header #"
>>> md = markdown.markdown(text, ['headerid'])
- >>> md
- u'Some Header
'
+ >>> print md
+ Some Header
All header IDs are unique:
>>> text = '''
... #Header
- ... #Another Header {#header}
- ... #Third Header {#header}'''
+ ... #Header
+ ... #Header'''
>>> md = markdown.markdown(text, ['headerid'])
- >>> md
- u'Header
\\nAnother Header
\\nThird Header
'
+ >>> print md
+ Header
+ Header
+ Header
To fit within a html template's hierarchy, set the header base level:
@@ -30,17 +30,26 @@ To fit within a html template's hierarchy, set the header base level:
... #Some Header
... ## Next Level'''
>>> md = markdown.markdown(text, ['headerid(level=3)'])
- >>> md
- u'Some Header
\\nNext Level
'
+ >>> print md
+ Some Header
+ Next Level
+
+Works with inline markup.
+
+ >>> text = '#Some *Header* with [markup](http://example.com).'
+ >>> md = markdown.markdown(text, ['headerid'])
+ >>> print md
+ Some Header with markup.
Turn off auto generated IDs:
>>> text = '''
... # Some Header
- ... # Header with ID # { #foo }'''
+ ... # Another Header'''
>>> md = markdown.markdown(text, ['headerid(forceid=False)'])
- >>> md
- u'Some Header
\\nHeader with ID
'
+ >>> print md
+ Some Header
+ Another Header
Use with MetaData extension:
@@ -49,85 +58,101 @@ Use with MetaData extension:
...
... # A Header'''
>>> md = markdown.markdown(text, ['headerid', 'meta'])
- >>> md
- u'A Header
'
+ >>> print md
+ A Header
-Copyright 2007-2008 [Waylan Limberg](http://achinghead.com/).
+Copyright 2007-2011 [Waylan Limberg](http://achinghead.com/).
-Project website:
+Project website:
Contact: markdown@freewisdom.org
-License: BSD (see ../docs/LICENSE for details)
+License: BSD (see ../docs/LICENSE for details)
Dependencies:
* [Python 2.3+](http://python.org)
-* [Markdown 2.0+](http://www.freewisdom.org/projects/python-markdown/)
+* [Markdown 2.0+](http://packages.python.org/Markdown/)
"""
-import calibre.ebooks.markdown.markdown as markdown
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from . import Extension
+from ..treeprocessors import Treeprocessor
import re
-from string import ascii_lowercase, digits, punctuation
+import logging
+import unicodedata
+
+logger = logging.getLogger('MARKDOWN')
-ID_CHARS = ascii_lowercase + digits + '-_'
IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')
-class HeaderIdProcessor(markdown.blockprocessors.BlockProcessor):
- """ Replacement BlockProcessor for Header IDs. """
+def slugify(value, separator):
+ """ Slugify a string, to make it URL friendly. """
+ value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
+ value = re.sub('[^\w\s-]', '', value.decode('ascii')).strip().lower()
+ return re.sub('[%s\s]+' % separator, separator, value)
- # Detect a header at start of any line in block
- RE = re.compile(r"""(^|\n)
- (?P\#{1,6}) # group('level') = string of hashes
- (?P.*?) # group('header') = Header text
- \#* # optional closing hashes
- (?:[ \t]*\{[ \t]*\#(?P[-_:a-zA-Z0-9]+)[ \t]*\})?
- (\n|$) # ^^ group('id') = id attribute
- """,
- re.VERBOSE)
- IDs = []
-
- def test(self, parent, block):
- return bool(self.RE.search(block))
-
- def run(self, parent, blocks):
- block = blocks.pop(0)
- m = self.RE.search(block)
+def unique(id, ids):
+ """ Ensure id is unique in set of ids. Append '_1', '_2'... if not """
+ while id in ids or not id:
+ m = IDCOUNT_RE.match(id)
if m:
- before = block[:m.start()] # All lines before header
- after = block[m.end():] # All lines after header
- if before:
- # As the header was not the first line of the block and the
- # lines before the header must be parsed first,
- # recursively parse this lines as a block.
- self.parser.parseBlocks(parent, [before])
- # Create header using named groups from RE
- start_level, force_id = self._get_meta()
- level = len(m.group('level')) + start_level
- if level > 6:
- level = 6
- h = markdown.etree.SubElement(parent, 'h%d' % level)
- h.text = m.group('header').strip()
- if m.group('id'):
- h.set('id', self._unique_id(m.group('id')))
- elif force_id:
- h.set('id', self._create_id(m.group('header').strip()))
- if after:
- # Insert remaining lines as first block for future parsing.
- blocks.insert(0, after)
+ id = '%s_%d'% (m.group(1), int(m.group(2))+1)
else:
- # This should never happen, but just in case...
- print ("We've got a problem header!")
+ id = '%s_%d'% (id, 1)
+ ids.add(id)
+ return id
+
+
+def itertext(elem):
+ """ Loop through all children and return text only.
+
+ Reimplements method of same name added to ElementTree in Python 2.7
+
+ """
+ if elem.text:
+ yield elem.text
+ for e in elem:
+ for s in itertext(e):
+ yield s
+ if e.tail:
+ yield e.tail
+
+
+class HeaderIdTreeprocessor(Treeprocessor):
+ """ Assign IDs to headers. """
+
+ IDs = set()
+
+ def run(self, doc):
+ start_level, force_id = self._get_meta()
+ slugify = self.config['slugify']
+ sep = self.config['separator']
+ for elem in doc.getiterator():
+ if elem.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+ if force_id:
+ if "id" in elem.attrib:
+ id = elem.get('id')
+ else:
+ id = slugify(''.join(itertext(elem)), sep)
+ elem.set('id', unique(id, self.IDs))
+ if start_level:
+ level = int(elem.tag[-1]) + start_level
+ if level > 6:
+ level = 6
+ elem.tag = 'h%d' % level
+
def _get_meta(self):
""" Return meta data suported by this ext as a tuple """
- level = int(self.config['level'][0]) - 1
- force = self._str2bool(self.config['forceid'][0])
+ level = int(self.config['level']) - 1
+ force = self._str2bool(self.config['forceid'])
if hasattr(self.md, 'Meta'):
- if self.md.Meta.has_key('header_level'):
+ if 'header_level' in self.md.Meta:
level = int(self.md.Meta['header_level'][0]) - 1
- if self.md.Meta.has_key('header_forceid'):
+ if 'header_forceid' in self.md.Meta:
force = self._str2bool(self.md.Meta['header_forceid'][0])
return level, force
@@ -140,34 +165,15 @@ class HeaderIdProcessor(markdown.blockprocessors.BlockProcessor):
return True
return default
- def _unique_id(self, id):
- """ Ensure ID is unique. Append '_1', '_2'... if not """
- while id in self.IDs:
- m = IDCOUNT_RE.match(id)
- if m:
- id = '%s_%d'% (m.group(1), int(m.group(2))+1)
- else:
- id = '%s_%d'% (id, 1)
- self.IDs.append(id)
- return id
- def _create_id(self, header):
- """ Return ID from Header text. """
- h = ''
- for c in header.lower().replace(' ', '_'):
- if c in ID_CHARS:
- h += c
- elif c not in punctuation:
- h += '+'
- return self._unique_id(h)
-
-
-class HeaderIdExtension (markdown.Extension):
+class HeaderIdExtension(Extension):
def __init__(self, configs):
# set defaults
self.config = {
'level' : ['1', 'Base level for headers.'],
- 'forceid' : ['True', 'Force all headers to have an id.']
+ 'forceid' : ['True', 'Force all headers to have an id.'],
+ 'separator' : ['-', 'Word separator.'],
+ 'slugify' : [slugify, 'Callable to generate anchors'],
}
for key, value in configs:
@@ -175,20 +181,19 @@ class HeaderIdExtension (markdown.Extension):
def extendMarkdown(self, md, md_globals):
md.registerExtension(self)
- self.processor = HeaderIdProcessor(md.parser)
+ self.processor = HeaderIdTreeprocessor()
self.processor.md = md
- self.processor.config = self.config
- # Replace existing hasheader in place.
- md.parser.blockprocessors['hashheader'] = self.processor
+ self.processor.config = self.getConfigs()
+ if 'attr_list' in md.treeprocessors.keys():
+ # insert after attr_list treeprocessor
+ md.treeprocessors.add('headerid', self.processor, '>attr_list')
+ else:
+ # insert after 'prettify' treeprocessor.
+ md.treeprocessors.add('headerid', self.processor, '>prettify')
def reset(self):
- self.processor.IDs = []
+ self.processor.IDs = set()
def makeExtension(configs=None):
return HeaderIdExtension(configs=configs)
-
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
-
diff --git a/src/calibre/ebooks/markdown/extensions/meta.py b/src/calibre/ebooks/markdown/extensions/meta.py
index b33eaa0f71..aaff4365a8 100644
--- a/src/calibre/ebooks/markdown/extensions/meta.py
+++ b/src/calibre/ebooks/markdown/extensions/meta.py
@@ -1,5 +1,3 @@
-#!usr/bin/python
-
"""
Meta Data Extension for Python-Markdown
=======================================
@@ -17,37 +15,41 @@ Basic Usage:
... The body. This is paragraph one.
... '''
>>> md = markdown.Markdown(['meta'])
- >>> md.convert(text)
- u'The body. This is paragraph one.
'
- >>> md.Meta
+ >>> print md.convert(text)
+ The body. This is paragraph one.
+ >>> print md.Meta
{u'blank_data': [u''], u'author': [u'Waylan Limberg', u'John Doe'], u'title': [u'A Test Doc.']}
Make sure text without Meta Data still works (markdown < 1.6b returns a ).
>>> text = ' Some Code - not extra lines of meta data.'
>>> md = markdown.Markdown(['meta'])
- >>> md.convert(text)
- u'
Some Code - not extra lines of meta data.\\n
'
+ >>> print md.convert(text)
+ Some Code - not extra lines of meta data.
+
>>> md.Meta
{}
Copyright 2007-2008 [Waylan Limberg](http://achinghead.com).
-Project website:
+Project website:
Contact: markdown@freewisdom.org
-License: BSD (see ../docs/LICENSE for details)
+License: BSD (see ../LICENSE.md for details)
"""
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from . import Extension
+from ..preprocessors import Preprocessor
import re
-import calibre.ebooks.markdown.markdown as markdown
# Global Vars
META_RE = re.compile(r'^[ ]{0,3}(?P[A-Za-z0-9_-]+):\s*(?P.*)')
META_MORE_RE = re.compile(r'^[ ]{4,}(?P.*)')
-class MetaExtension (markdown.Extension):
+class MetaExtension (Extension):
""" Meta-Data extension for Python-Markdown. """
def extendMarkdown(self, md, md_globals):
@@ -56,7 +58,7 @@ class MetaExtension (markdown.Extension):
md.preprocessors.add("meta", MetaPreprocessor(md), "_begin")
-class MetaPreprocessor(markdown.preprocessors.Preprocessor):
+class MetaPreprocessor(Preprocessor):
""" Get Meta-Data. """
def run(self, lines):
@@ -70,7 +72,11 @@ class MetaPreprocessor(markdown.preprocessors.Preprocessor):
m1 = META_RE.match(line)
if m1:
key = m1.group('key').lower().strip()
- meta[key] = [m1.group('value').strip()]
+ value = m1.group('value').strip()
+ try:
+ meta[key].append(value)
+ except KeyError:
+ meta[key] = [value]
else:
m2 = META_MORE_RE.match(line)
if m2 and key:
@@ -85,7 +91,3 @@ class MetaPreprocessor(markdown.preprocessors.Preprocessor):
def makeExtension(configs={}):
return MetaExtension(configs=configs)
-
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
diff --git a/src/calibre/ebooks/markdown/extensions/nl2br.py b/src/calibre/ebooks/markdown/extensions/nl2br.py
new file mode 100644
index 0000000000..da4b339958
--- /dev/null
+++ b/src/calibre/ebooks/markdown/extensions/nl2br.py
@@ -0,0 +1,38 @@
+"""
+NL2BR Extension
+===============
+
+A Python-Markdown extension to treat newlines as hard breaks; like
+GitHub-flavored Markdown does.
+
+Usage:
+
+ >>> import markdown
+ >>> print markdown.markdown('line 1\\nline 2', extensions=['nl2br'])
+ <p>line 1<br />
+ line 2</p>
+
+Copyright 2011 [Brian Neal](http://deathofagremmie.com/)
+
+Dependencies:
+* [Python 2.4+](http://python.org)
+* [Markdown 2.1+](http://packages.python.org/Markdown/)
+
+"""
+
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from . import Extension
+from ..inlinepatterns import SubstituteTagPattern
+
+BR_RE = r'\n'
+
+class Nl2BrExtension(Extension):
+
+ def extendMarkdown(self, md, md_globals):
+ br_tag = SubstituteTagPattern(BR_RE, 'br')
+ md.inlinePatterns.add('nl', br_tag, '_end')
+
+
+def makeExtension(configs=None):
+ return Nl2BrExtension(configs)
diff --git a/src/calibre/ebooks/markdown/extensions/rss.py b/src/calibre/ebooks/markdown/extensions/rss.py
deleted file mode 100644
index 466c502da0..0000000000
--- a/src/calibre/ebooks/markdown/extensions/rss.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import calibre.ebooks.markdown.markdown as markdown
-from calibre.ebooks.markdown.markdown import etree
-
-DEFAULT_URL = "http://www.freewisdom.org/projects/python-markdown/"
-DEFAULT_CREATOR = "Yuri Takhteyev"
-DEFAULT_TITLE = "Markdown in Python"
-GENERATOR = "http://www.freewisdom.org/projects/python-markdown/markdown2rss"
-
-month_map = { "Jan" : "01",
- "Feb" : "02",
- "March" : "03",
- "April" : "04",
- "May" : "05",
- "June" : "06",
- "July" : "07",
- "August" : "08",
- "September" : "09",
- "October" : "10",
- "November" : "11",
- "December" : "12" }
-
-def get_time(heading):
-
- heading = heading.split("-")[0]
- heading = heading.strip().replace(",", " ").replace(".", " ")
-
- month, date, year = heading.split()
- month = month_map[month]
-
- return rdftime(" ".join((month, date, year, "12:00:00 AM")))
-
-def rdftime(time):
-
- time = time.replace(":", " ")
- time = time.replace("/", " ")
- time = time.split()
- return "%s-%s-%sT%s:%s:%s-08:00" % (time[0], time[1], time[2],
- time[3], time[4], time[5])
-
-
-def get_date(text):
- return "date"
-
-class RssExtension (markdown.Extension):
-
- def extendMarkdown(self, md, md_globals):
-
- self.config = { 'URL' : [DEFAULT_URL, "Main URL"],
- 'CREATOR' : [DEFAULT_CREATOR, "Feed creator's name"],
- 'TITLE' : [DEFAULT_TITLE, "Feed title"] }
-
- md.xml_mode = True
-
- # Insert a tree-processor that would actually add the title tag
- treeprocessor = RssTreeProcessor(md)
- treeprocessor.ext = self
- md.treeprocessors['rss'] = treeprocessor
- md.stripTopLevelTags = 0
- md.docType = '\n'
-
-class RssTreeProcessor(markdown.treeprocessors.Treeprocessor):
-
- def run (self, root):
-
- rss = etree.Element("rss")
- rss.set("version", "2.0")
-
- channel = etree.SubElement(rss, "channel")
-
- for tag, text in (("title", self.ext.getConfig("TITLE")),
- ("link", self.ext.getConfig("URL")),
- ("description", None)):
-
- element = etree.SubElement(channel, tag)
- element.text = text
-
- for child in root:
-
- if child.tag in ["h1", "h2", "h3", "h4", "h5"]:
-
- heading = child.text.strip()
- item = etree.SubElement(channel, "item")
- link = etree.SubElement(item, "link")
- link.text = self.ext.getConfig("URL")
- title = etree.SubElement(item, "title")
- title.text = heading
-
- guid = ''.join([x for x in heading if x.isalnum()])
- guidElem = etree.SubElement(item, "guid")
- guidElem.text = guid
- guidElem.set("isPermaLink", "false")
-
- elif child.tag in ["p"]:
- try:
- description = etree.SubElement(item, "description")
- except UnboundLocalError:
- # Item not defined - moving on
- pass
- else:
- if len(child):
- content = "\n".join([etree.tostring(node)
- for node in child])
- else:
- content = child.text
- pholder = self.markdown.htmlStash.store(
- "" % content)
- description.text = pholder
-
- return rss
-
-
-def makeExtension(configs):
-
- return RssExtension(configs)
diff --git a/src/calibre/ebooks/markdown/extensions/sane_lists.py b/src/calibre/ebooks/markdown/extensions/sane_lists.py
new file mode 100644
index 0000000000..23e9a7f4a6
--- /dev/null
+++ b/src/calibre/ebooks/markdown/extensions/sane_lists.py
@@ -0,0 +1,51 @@
+"""
+Sane List Extension for Python-Markdown
+=======================================
+
+Modify the behavior of Lists in Python-Markdown to act in a sane manner.
+
+In standard Markdown syntax, the following would constitute a single
+ordered list. However, with this extension, the output would include
+two lists, the first an ordered list and the second an unordered list.
+
+ 1. ordered
+ 2. list
+
+ * unordered
+ * list
+
+Copyright 2011 - [Waylan Limberg](http://achinghead.com)
+
+"""
+
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from . import Extension
+from ..blockprocessors import OListProcessor, UListProcessor
+import re
+
+
+class SaneOListProcessor(OListProcessor):
+
+ CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.))[ ]+(.*)')
+ SIBLING_TAGS = ['ol']
+
+
+class SaneUListProcessor(UListProcessor):
+
+ CHILD_RE = re.compile(r'^[ ]{0,3}(([*+-]))[ ]+(.*)')
+ SIBLING_TAGS = ['ul']
+
+
+class SaneListExtension(Extension):
+ """ Add sane lists to Markdown. """
+
+ def extendMarkdown(self, md, md_globals):
+ """ Override existing Processors. """
+ md.parser.blockprocessors['olist'] = SaneOListProcessor(md.parser)
+ md.parser.blockprocessors['ulist'] = SaneUListProcessor(md.parser)
+
+
+def makeExtension(configs={}):
+ return SaneListExtension(configs=configs)
+
diff --git a/src/calibre/ebooks/markdown/extensions/smart_strong.py b/src/calibre/ebooks/markdown/extensions/smart_strong.py
new file mode 100644
index 0000000000..4818cf9ea8
--- /dev/null
+++ b/src/calibre/ebooks/markdown/extensions/smart_strong.py
@@ -0,0 +1,42 @@
+'''
+Smart_Strong Extension for Python-Markdown
+==========================================
+
+This extension adds smarter handling of double underscores within words.
+
+Simple Usage:
+
+ >>> import markdown
+ >>> print markdown.markdown('Text with double__underscore__words.',
+ ... extensions=['smart_strong'])
+ <p>Text with double__underscore__words.</p>
+ >>> print markdown.markdown('__Strong__ still works.',
+ ... extensions=['smart_strong'])
+ <p><strong>Strong</strong> still works.</p>
+ >>> print markdown.markdown('__this__works__too__.',
+ ... extensions=['smart_strong'])
+ <p><strong>this__works__too</strong>.</p>
+
+Copyright 2011
+[Waylan Limberg](http://achinghead.com)
+
+'''
+
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from . import Extension
+from ..inlinepatterns import SimpleTagPattern
+
+SMART_STRONG_RE = r'(?emphasis2')
+
+def makeExtension(configs={}):
+ return SmartEmphasisExtension(configs=dict(configs))
diff --git a/src/calibre/ebooks/markdown/extensions/tables.py b/src/calibre/ebooks/markdown/extensions/tables.py
index f47ec1cc0e..ad52ec11c7 100644
--- a/src/calibre/ebooks/markdown/extensions/tables.py
+++ b/src/calibre/ebooks/markdown/extensions/tables.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env Python
"""
Tables Extension for Python-Markdown
====================================
@@ -14,31 +13,35 @@ A simple example:
Copyright 2009 - [Waylan Limberg](http://achinghead.com)
"""
-import calibre.ebooks.markdown.markdown as markdown
-from calibre.ebooks.markdown.markdown import etree
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from . import Extension
+from ..blockprocessors import BlockProcessor
+from ..util import etree
-class TableProcessor(markdown.blockprocessors.BlockProcessor):
+class TableProcessor(BlockProcessor):
""" Process Tables. """
def test(self, parent, block):
rows = block.split('\n')
return (len(rows) > 2 and '|' in rows[0] and
'|' in rows[1] and '-' in rows[1] and
- rows[1][0] in ['|', ':', '-'])
+ rows[1].strip()[0] in ['|', ':', '-'])
def run(self, parent, blocks):
""" Parse a table block and build table. """
block = blocks.pop(0).split('\n')
- header = block[:2]
+ header = block[0].strip()
+ seperator = block[1].strip()
rows = block[2:]
# Get format type (bordered by pipes or not)
border = False
- if header[0].startswith('|'):
+ if header.startswith('|'):
border = True
# Get alignment of columns
align = []
- for c in self._split_row(header[1], border):
+ for c in self._split_row(seperator, border):
if c.startswith(':') and c.endswith(':'):
align.append('center')
elif c.startswith(':'):
@@ -50,10 +53,10 @@ class TableProcessor(markdown.blockprocessors.BlockProcessor):
# Build table
table = etree.SubElement(parent, 'table')
thead = etree.SubElement(table, 'thead')
- self._build_row(header[0], thead, align, border)
+ self._build_row(header, thead, align, border)
tbody = etree.SubElement(table, 'tbody')
for row in rows:
- self._build_row(row, tbody, align, border)
+ self._build_row(row.strip(), tbody, align, border)
def _build_row(self, row, parent, align, border):
""" Given a row of text, build table cells. """
@@ -83,7 +86,7 @@ class TableProcessor(markdown.blockprocessors.BlockProcessor):
return row.split('|')
-class TableExtension(markdown.Extension):
+class TableExtension(Extension):
""" Add tables to Markdown. """
def extendMarkdown(self, md, md_globals):
diff --git a/src/calibre/ebooks/markdown/extensions/toc.py b/src/calibre/ebooks/markdown/extensions/toc.py
index efa5516624..73b0844517 100644
--- a/src/calibre/ebooks/markdown/extensions/toc.py
+++ b/src/calibre/ebooks/markdown/extensions/toc.py
@@ -5,43 +5,141 @@ Table of Contents Extension for Python-Markdown
(c) 2008 [Jack Miller](http://codezen.org)
Dependencies:
-* [Markdown 2.0+](http://www.freewisdom.org/projects/python-markdown/)
+* [Markdown 2.1+](http://packages.python.org/Markdown/)
"""
-import calibre.ebooks.markdown.markdown as markdown
-from calibre.ebooks.markdown.markdown import etree
+
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from . import Extension
+from ..treeprocessors import Treeprocessor
+from ..util import etree
+from .headerid import slugify, unique, itertext
import re
-class TocTreeprocessor(markdown.treeprocessors.Treeprocessor):
+
+def order_toc_list(toc_list):
+ """Given an unsorted list with errors and skips, return a nested one.
+ [{'level': 1}, {'level': 2}]
+ =>
+ [{'level': 1, 'children': [{'level': 2, 'children': []}]}]
+
+ A wrong list is also converted:
+ [{'level': 2}, {'level': 1}]
+ =>
+ [{'level': 2, 'children': []}, {'level': 1, 'children': []}]
+ """
+
+ def build_correct(remaining_list, prev_elements=[{'level': 1000}]):
+
+ if not remaining_list:
+ return [], []
+
+ current = remaining_list.pop(0)
+ if not 'children' in current.keys():
+ current['children'] = []
+
+ if not prev_elements:
+ # This happens for instance with [8, 1, 1], ie. when some
+ # header level is outside a scope. We treat it as a
+ # top-level
+ next_elements, children = build_correct(remaining_list, [current])
+ current['children'].append(children)
+ return [current] + next_elements, []
+
+ prev_element = prev_elements.pop()
+ children = []
+ next_elements = []
+ # Is current part of the child list or next list?
+ if current['level'] > prev_element['level']:
+ #print "%d is a child of %d" % (current['level'], prev_element['level'])
+ prev_elements.append(prev_element)
+ prev_elements.append(current)
+ prev_element['children'].append(current)
+ next_elements2, children2 = build_correct(remaining_list, prev_elements)
+ children += children2
+ next_elements += next_elements2
+ else:
+ #print "%d is ancestor of %d" % (current['level'], prev_element['level'])
+ if not prev_elements:
+ #print "No previous elements, so appending to the next set"
+ next_elements.append(current)
+ prev_elements = [current]
+ next_elements2, children2 = build_correct(remaining_list, prev_elements)
+ current['children'].extend(children2)
+ else:
+ #print "Previous elements, comparing to those first"
+ remaining_list.insert(0, current)
+ next_elements2, children2 = build_correct(remaining_list, prev_elements)
+ children.extend(children2)
+ next_elements += next_elements2
+
+ return next_elements, children
+
+ ordered_list, __ = build_correct(toc_list)
+ return ordered_list
+
+
+class TocTreeprocessor(Treeprocessor):
+
# Iterator wrapper to get parent and child all at once
def iterparent(self, root):
for parent in root.getiterator():
for child in parent:
yield parent, child
-
- def run(self, doc):
- div = etree.Element("div")
- div.attrib["class"] = "toc"
- last_li = None
-
+
+ def add_anchor(self, c, elem_id): #@ReservedAssignment
+ if self.use_anchors:
+ anchor = etree.Element("a")
+ anchor.text = c.text
+ anchor.attrib["href"] = "#" + elem_id
+ anchor.attrib["class"] = "toclink"
+ c.text = ""
+ for elem in c.getchildren():
+ anchor.append(elem)
+ c.remove(elem)
+ c.append(anchor)
+
+ def build_toc_etree(self, div, toc_list):
# Add title to the div
- if self.config["title"][0]:
+ if self.config["title"]:
header = etree.SubElement(div, "span")
header.attrib["class"] = "toctitle"
- header.text = self.config["title"][0]
+ header.text = self.config["title"]
- level = 0
- list_stack=[div]
+ def build_etree_ul(toc_list, parent):
+ ul = etree.SubElement(parent, "ul")
+ for item in toc_list:
+ # List item link, to be inserted into the toc div
+ li = etree.SubElement(ul, "li")
+ link = etree.SubElement(li, "a")
+ link.text = item.get('name', '')
+ link.attrib["href"] = '#' + item.get('id', '')
+ if item['children']:
+ build_etree_ul(item['children'], li)
+ return ul
+
+ return build_etree_ul(toc_list, div)
+
+ def run(self, doc):
+
+ div = etree.Element("div")
+ div.attrib["class"] = "toc"
header_rgx = re.compile("[Hh][123456]")
-
+
+ self.use_anchors = self.config["anchorlink"] in [1, '1', True, 'True', 'true']
+
# Get a list of id attributes
- used_ids = []
+ used_ids = set()
for c in doc.getiterator():
if "id" in c.attrib:
- used_ids.append(c.attrib["id"])
+ used_ids.add(c.attrib["id"])
+ toc_list = []
+ marker_found = False
for (p, c) in self.iterparent(doc):
- if not c.text:
+ text = ''.join(itertext(c)).strip()
+ if not text:
continue
# To keep the output from screwing up the
@@ -50,69 +148,54 @@ class TocTreeprocessor(markdown.treeprocessors.Treeprocessor):
# We do not allow the marker inside a header as that
# would causes an enless loop of placing a new TOC
# inside previously generated TOC.
-
- if c.text.find(self.config["marker"][0]) > -1 and not header_rgx.match(c.tag):
+ if c.text and c.text.strip() == self.config["marker"] and \
+ not header_rgx.match(c.tag) and c.tag not in ['pre', 'code']:
for i in range(len(p)):
if p[i] == c:
p[i] = div
break
-
+ marker_found = True
+
if header_rgx.match(c.tag):
- tag_level = int(c.tag[-1])
- # Regardless of how many levels we jumped
- # only one list should be created, since
- # empty lists containing lists are illegal.
-
- if tag_level < level:
- list_stack.pop()
- level = tag_level
-
- if tag_level > level:
- newlist = etree.Element("ul")
- if last_li:
- last_li.append(newlist)
- else:
- list_stack[-1].append(newlist)
- list_stack.append(newlist)
- level = tag_level
-
# Do not override pre-existing ids
if not "id" in c.attrib:
- id = self.config["slugify"][0](c.text)
- if id in used_ids:
- ctr = 1
- while "%s_%d" % (id, ctr) in used_ids:
- ctr += 1
- id = "%s_%d" % (id, ctr)
- used_ids.append(id)
- c.attrib["id"] = id
+ elem_id = unique(self.config["slugify"](text, '-'), used_ids)
+ c.attrib["id"] = elem_id
else:
- id = c.attrib["id"]
+ elem_id = c.attrib["id"]
- # List item link, to be inserted into the toc div
- last_li = etree.Element("li")
- link = etree.SubElement(last_li, "a")
- link.text = c.text
- link.attrib["href"] = '#' + id
+ tag_level = int(c.tag[-1])
+
+ toc_list.append({'level': tag_level,
+ 'id': elem_id,
+ 'name': text})
+
+ self.add_anchor(c, elem_id)
+
+ toc_list_nested = order_toc_list(toc_list)
+ self.build_toc_etree(div, toc_list_nested)
+ prettify = self.markdown.treeprocessors.get('prettify')
+ if prettify: prettify.run(div)
+ if not marker_found:
+ # serialize and attach to markdown instance.
+ toc = self.markdown.serializer(div)
+ for pp in self.markdown.postprocessors.values():
+ toc = pp.run(toc)
+ self.markdown.toc = toc
- if int(self.config["anchorlink"][0]):
- anchor = etree.SubElement(c, "a")
- anchor.text = c.text
- anchor.attrib["href"] = "#" + id
- anchor.attrib["class"] = "toclink"
- c.text = ""
- list_stack[-1].append(last_li)
-
-class TocExtension(markdown.Extension):
- def __init__(self, configs):
+class TocExtension(Extension):
+
+ TreeProcessorClass = TocTreeprocessor
+
+ def __init__(self, configs=[]):
self.config = { "marker" : ["[TOC]",
"Text to find and replace with Table of Contents -"
"Defaults to \"[TOC]\""],
- "slugify" : [self.slugify,
+ "slugify" : [slugify,
"Function to generate anchors based on header text-"
- "Defaults to a built in slugify function."],
+ "Defaults to the headerid ext's slugify function."],
"title" : [None,
"Title to insert into TOC - "
"Defaults to None"],
@@ -123,18 +206,16 @@ class TocExtension(markdown.Extension):
for key, value in configs:
self.setConfig(key, value)
- # This is exactly the same as Django's slugify
- def slugify(self, value):
- """ Slugify a string, to make it URL friendly. """
- import unicodedata
- value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
- value = unicode(re.sub('[^\w\s-]', '', value).strip().lower())
- return re.sub('[-\s]+','-',value)
-
def extendMarkdown(self, md, md_globals):
- tocext = TocTreeprocessor(md)
- tocext.config = self.config
- md.treeprocessors.add("toc", tocext, "_begin")
-
+ tocext = self.TreeProcessorClass(md)
+ tocext.config = self.getConfigs()
+ # Headerid ext is set to '>prettify'. With this set to '_end',
+ # it should always come after headerid ext (and honor ids assigned
+ # by the header id extension) if both are used. Same goes for
+ # attr_list extension. This must come last because we don't want
+ # to redefine ids after toc is created. But we do want toc prettified.
+ md.treeprocessors.add("toc", tocext, "_end")
+
+
def makeExtension(configs={}):
return TocExtension(configs=configs)
diff --git a/src/calibre/ebooks/markdown/extensions/wikilinks.py b/src/calibre/ebooks/markdown/extensions/wikilinks.py
index ddb7b5f0d6..877890b8ab 100644
--- a/src/calibre/ebooks/markdown/extensions/wikilinks.py
+++ b/src/calibre/ebooks/markdown/extensions/wikilinks.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python
-
'''
WikiLinks Extension for Python-Markdown
======================================
@@ -11,22 +9,22 @@ Basic usage:
>>> import markdown
>>> text = "Some text with a [[WikiLink]]."
>>> html = markdown.markdown(text, ['wikilinks'])
- >>> html
- u'Some text with a WikiLink.
'
+ >>> print html
+ Some text with a WikiLink.
Whitespace behavior:
- >>> markdown.markdown('[[ foo bar_baz ]]', ['wikilinks'])
- u''
- >>> markdown.markdown('foo [[ ]] bar', ['wikilinks'])
- u'foo bar
'
+ >>> print markdown.markdown('[[ foo bar_baz ]]', ['wikilinks'])
+
+ >>> print markdown.markdown('foo [[ ]] bar', ['wikilinks'])
+ foo bar
To define custom settings the simple way:
- >>> markdown.markdown(text,
+ >>> print markdown.markdown(text,
... ['wikilinks(base_url=/wiki/,end_url=.html,html_class=foo)']
... )
- u'Some text with a WikiLink.
'
+ Some text with a WikiLink.
Custom settings the complex way:
@@ -37,8 +35,8 @@ Custom settings the complex way:
... ('end_url', '.html'),
... ('html_class', '') ]},
... safe_mode = True)
- >>> md.convert(text)
- u'Some text with a WikiLink.
'
+ >>> print md.convert(text)
+ Some text with a WikiLink.
Use MetaData with mdx_meta.py (Note the blank html_class in MetaData):
@@ -48,13 +46,13 @@ Use MetaData with mdx_meta.py (Note the blank html_class in MetaData):
...
... Some text with a [[WikiLink]]."""
>>> md = markdown.Markdown(extensions=['meta', 'wikilinks'])
- >>> md.convert(text)
- u'Some text with a WikiLink.
'
+ >>> print md.convert(text)
+ Some text with a WikiLink.
MetaData should not carry over to next document:
- >>> md.convert("No [[MetaData]] here.")
- u'No MetaData here.
'
+ >>> print md.convert("No [[MetaData]] here.")
+ No MetaData here.
Define a custom URL builder:
@@ -62,8 +60,8 @@ Define a custom URL builder:
... return '/bar/'
>>> md = markdown.Markdown(extensions=['wikilinks'],
... extension_configs={'wikilinks' : [('build_url', my_url_builder)]})
- >>> md.convert('[[foo]]')
- u''
+ >>> print md.convert('[[foo]]')
+
From the command line:
@@ -75,10 +73,14 @@ License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
Dependencies:
* [Python 2.3+](http://python.org)
-* [Markdown 2.0+](http://www.freewisdom.org/projects/python-markdown/)
+* [Markdown 2.0+](http://packages.python.org/Markdown/)
'''
-import calibre.ebooks.markdown.markdown as markdown
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from . import Extension
+from ..inlinepatterns import Pattern
+from ..util import etree
import re
def build_url(label, base, end):
@@ -87,7 +89,7 @@ def build_url(label, base, end):
return '%s%s%s'% (base, clean_label, end)
-class WikiLinkExtension(markdown.Extension):
+class WikiLinkExtension(Extension):
def __init__(self, configs):
# set extension defaults
self.config = {
@@ -105,23 +107,23 @@ class WikiLinkExtension(markdown.Extension):
self.md = md
# append to end of inline patterns
- WIKILINK_RE = r'\[\[([A-Za-z0-9_ -]+)\]\]'
- wikilinkPattern = WikiLinks(WIKILINK_RE, self.config)
+ WIKILINK_RE = r'\[\[([\w0-9_ -]+)\]\]'
+ wikilinkPattern = WikiLinks(WIKILINK_RE, self.getConfigs())
wikilinkPattern.md = md
md.inlinePatterns.add('wikilink', wikilinkPattern, "= "3.0":
- from html import entities as htmlentitydefs
- htmlentitydefs
-else:
- import htmlentitydefs
+try:
+ from urllib.parse import urlparse, urlunparse
+except ImportError:
+ from urlparse import urlparse, urlunparse
+try:
+ from html import entities
+except ImportError:
+ import htmlentitydefs as entities
+
+
+def build_inlinepatterns(md_instance, **kwargs):
+ """ Build the default set of inline patterns for Markdown. """
+ inlinePatterns = odict.OrderedDict()
+ inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE)
+ inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)
+ inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)
+ inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance)
+ inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)
+ inlinePatterns["image_reference"] = \
+ ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)
+ inlinePatterns["short_reference"] = \
+ ReferencePattern(SHORT_REF_RE, md_instance)
+ inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)
+ inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)
+ inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')
+ if md_instance.safeMode != 'escape':
+ inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance)
+ inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)
+ inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)
+ inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'strong,em')
+ inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')
+ inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')
+ if md_instance.smart_emphasis:
+ inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em')
+ else:
+ inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em')
+ return inlinePatterns
"""
The actual regular expressions for patterns
@@ -65,31 +98,27 @@ NOIMG = r'(?|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*)\12)?\)'''
-# [text](url) or [text]()
+r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
+# [text](url) or [text](<url>) or [text](url "title")
IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
#  or 
-REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]' # [Google][3]
-IMAGE_REFERENCE_RE = r'\!' + BRK + '\s*\[([^\]]*)\]' # ![alt text][2]
-NOT_STRONG_RE = r'( \* )' # stand-alone * or _
-AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>' #
+REFERENCE_RE = NOIMG + BRK+ r'\s?\[([^\]]*)\]' # [Google][3]
+SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]' # [Google]
+IMAGE_REFERENCE_RE = r'\!' + BRK + '\s?\[([^\]]*)\]' # ![alt text][2]
+NOT_STRONG_RE = r'((^| )(\*|_)( |$))' # stand-alone * or _
+AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>' #
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>' #
HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)' # <...>
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # &
LINE_BREAK_RE = r' \n' # two spaces at end of line
-LINE_BREAK_2_RE = r' $' # two spaces at end of text
def dequote(string):
@@ -114,10 +143,10 @@ The pattern classes
-----------------------------------------------------------------------------
"""
-class Pattern:
+class Pattern(object):
"""Base class that inline patterns subclass. """
- def __init__ (self, pattern, markdown_instance=None):
+ def __init__(self, pattern, markdown_instance=None):
"""
Create an instant of an inline pattern.
@@ -127,14 +156,15 @@ class Pattern:
"""
self.pattern = pattern
- self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, re.DOTALL)
+ self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
+ re.DOTALL | re.UNICODE)
# Api for Markdown to pass safe_mode into instance
self.safe_mode = False
if markdown_instance:
self.markdown = markdown_instance
- def getCompiledRegExp (self):
+ def getCompiledRegExp(self):
""" Return a compiled regular expression. """
return self.compiled_re
@@ -154,17 +184,57 @@ class Pattern:
""" Return class name, to define pattern type """
return self.__class__.__name__
-BasePattern = Pattern # for backward compatibility
+ def unescape(self, text):
+ """ Return unescaped text given text with an inline placeholder. """
+ try:
+ stash = self.markdown.treeprocessors['inline'].stashed_nodes
+ except KeyError:
+ return text
+ def itertext(el):
+ ' Reimplement Element.itertext for older python versions '
+ tag = el.tag
+ if not isinstance(tag, util.string_type) and tag is not None:
+ return
+ if el.text:
+ yield el.text
+ for e in el:
+ for s in itertext(e):
+ yield s
+ if e.tail:
+ yield e.tail
+ def get_stash(m):
+ id = m.group(1)
+ if id in stash:
+ value = stash.get(id)
+ if isinstance(value, util.string_type):
+ return value
+ else:
+ # An etree Element - return text content only
+ return ''.join(itertext(value))
+ return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
-class SimpleTextPattern (Pattern):
+
+class SimpleTextPattern(Pattern):
""" Return a simple text of group(2) of a Pattern. """
def handleMatch(self, m):
text = m.group(2)
- if text == markdown.INLINE_PLACEHOLDER_PREFIX:
+ if text == util.INLINE_PLACEHOLDER_PREFIX:
return None
return text
-class SimpleTagPattern (Pattern):
+
+class EscapePattern(Pattern):
+ """ Return an escaped character. """
+
+ def handleMatch(self, m):
+ char = m.group(2)
+ if char in self.markdown.ESCAPED_CHARS:
+ return '%s%s%s' % (util.STX, ord(char), util.ETX)
+ else:
+ return '\\%s' % char
+
+
+class SimpleTagPattern(Pattern):
"""
Return element of type `tag` with a text attribute of group(3)
of a Pattern.
@@ -175,30 +245,30 @@ class SimpleTagPattern (Pattern):
self.tag = tag
def handleMatch(self, m):
- el = markdown.etree.Element(self.tag)
+ el = util.etree.Element(self.tag)
el.text = m.group(3)
return el
-class SubstituteTagPattern (SimpleTagPattern):
- """ Return a eLement of type `tag` with no children. """
+class SubstituteTagPattern(SimpleTagPattern):
+ """ Return an element of type `tag` with no children. """
def handleMatch (self, m):
- return markdown.etree.Element(self.tag)
+ return util.etree.Element(self.tag)
-class BacktickPattern (Pattern):
+class BacktickPattern(Pattern):
""" Return a `` element containing the matching text. """
def __init__ (self, pattern):
Pattern.__init__(self, pattern)
self.tag = "code"
def handleMatch(self, m):
- el = markdown.etree.Element(self.tag)
- el.text = markdown.AtomicString(m.group(3).strip())
+ el = util.etree.Element(self.tag)
+ el.text = util.AtomicString(m.group(3).strip())
return el
-class DoubleTagPattern (SimpleTagPattern):
+class DoubleTagPattern(SimpleTagPattern):
"""Return a ElementTree element nested in tag2 nested in tag1.
Useful for strong emphasis etc.
@@ -206,37 +276,54 @@ class DoubleTagPattern (SimpleTagPattern):
"""
def handleMatch(self, m):
tag1, tag2 = self.tag.split(",")
- el1 = markdown.etree.Element(tag1)
- el2 = markdown.etree.SubElement(el1, tag2)
+ el1 = util.etree.Element(tag1)
+ el2 = util.etree.SubElement(el1, tag2)
el2.text = m.group(3)
return el1
-class HtmlPattern (Pattern):
+class HtmlPattern(Pattern):
""" Store raw inline html and return a placeholder. """
def handleMatch (self, m):
- rawhtml = m.group(2)
+ rawhtml = self.unescape(m.group(2))
place_holder = self.markdown.htmlStash.store(rawhtml)
return place_holder
+ def unescape(self, text):
+ """ Return unescaped text given text with an inline placeholder. """
+ try:
+ stash = self.markdown.treeprocessors['inline'].stashed_nodes
+ except KeyError:
+ return text
+ def get_stash(m):
+ id = m.group(1)
+ value = stash.get(id)
+ if value is not None:
+ try:
+ return self.markdown.serializer(value)
+ except:
+ return '\%s' % value
+
+ return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
-class LinkPattern (Pattern):
+
+class LinkPattern(Pattern):
""" Return a link element from the given match. """
def handleMatch(self, m):
- el = markdown.etree.Element("a")
+ el = util.etree.Element("a")
el.text = m.group(2)
- title = m.group(11)
+ title = m.group(13)
href = m.group(9)
if href:
if href[0] == "<":
href = href[1:-1]
- el.set("href", self.sanitize_url(href.strip()))
+ el.set("href", self.sanitize_url(self.unescape(href.strip())))
else:
el.set("href", "")
if title:
- title = dequote(title) #.replace('"', """)
+ title = dequote(self.unescape(title))
el.set("title", title)
return el
@@ -257,54 +344,75 @@ class LinkPattern (Pattern):
`username:password@host:port`.
"""
+ url = url.replace(' ', '%20')
+ if not self.markdown.safeMode:
+ # Return immediately, bypassing parsing.
+ return url
+
+ try:
+ scheme, netloc, path, params, query, fragment = url = urlparse(url)
+ except ValueError:
+ # Bad url - so bad it couldn't be parsed.
+ return ''
+
locless_schemes = ['', 'mailto', 'news']
- scheme, netloc, path, params, query, fragment = url = urlparse(url)
- safe_url = False
- if netloc != '' or scheme in locless_schemes:
- safe_url = True
+ allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
+ if scheme not in allowed_schemes:
+ # Not a known (allowed) scheme. Not safe.
+ return ''
+
+ if netloc == '' and scheme not in locless_schemes:
+ # This should not happen. Treat as suspect.
+ return ''
for part in url[2:]:
if ":" in part:
- safe_url = False
+ # A colon in "path", "parameters", "query" or "fragment" is suspect.
+ return ''
- if self.markdown.safeMode and not safe_url:
- return ''
- else:
- return urlunparse(url)
+ # Url passes all tests. Return url as-is.
+ return urlunparse(url)
class ImagePattern(LinkPattern):
""" Return a img element from the given match. """
def handleMatch(self, m):
- el = markdown.etree.Element("img")
+ el = util.etree.Element("img")
src_parts = m.group(9).split()
if src_parts:
src = src_parts[0]
if src[0] == "<" and src[-1] == ">":
src = src[1:-1]
- el.set('src', self.sanitize_url(src))
+ el.set('src', self.sanitize_url(self.unescape(src)))
else:
el.set('src', "")
if len(src_parts) > 1:
- el.set('title', dequote(" ".join(src_parts[1:])))
+ el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))
- if markdown.ENABLE_ATTRIBUTES:
+ if self.markdown.enable_attributes:
truealt = handleAttributes(m.group(2), el)
else:
truealt = m.group(2)
- el.set('alt', truealt)
+ el.set('alt', self.unescape(truealt))
return el
class ReferencePattern(LinkPattern):
""" Match to a stored reference and return link element. """
+
+ NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)
+
def handleMatch(self, m):
- if m.group(9):
+ try:
id = m.group(9).lower()
- else:
- # if we got something like "[Google][]"
+ except IndexError:
+ id = None
+ if not id:
+ # if we got something like "[Google][]" or "[Google]"
# we'll use "google" as the id
id = m.group(2).lower()
+ # Clean up linebreaks in id
+ id = self.NEWLINE_CLEANUP_RE.sub(' ', id)
if not id in self.markdown.references: # ignore undefined refs
return None
href, title = self.markdown.references[id]
@@ -313,7 +421,7 @@ class ReferencePattern(LinkPattern):
return self.makeTag(href, title, text)
def makeTag(self, href, title, text):
- el = markdown.etree.Element('a')
+ el = util.etree.Element('a')
el.set('href', self.sanitize_url(href))
if title:
@@ -323,48 +431,52 @@ class ReferencePattern(LinkPattern):
return el
-class ImageReferencePattern (ReferencePattern):
+class ImageReferencePattern(ReferencePattern):
""" Match to a stored reference and return img element. """
def makeTag(self, href, title, text):
- el = markdown.etree.Element("img")
+ el = util.etree.Element("img")
el.set("src", self.sanitize_url(href))
if title:
el.set("title", title)
- el.set("alt", text)
+
+ if self.markdown.enable_attributes:
+ text = handleAttributes(text, el)
+
+ el.set("alt", self.unescape(text))
return el
-class AutolinkPattern (Pattern):
+class AutolinkPattern(Pattern):
""" Return a link Element given an autolink (`<http://example/com>`). """
def handleMatch(self, m):
- el = markdown.etree.Element("a")
- el.set('href', m.group(2))
- el.text = markdown.AtomicString(m.group(2))
+ el = util.etree.Element("a")
+ el.set('href', self.unescape(m.group(2)))
+ el.text = util.AtomicString(m.group(2))
return el
-class AutomailPattern (Pattern):
+class AutomailPattern(Pattern):
"""
Return a mailto link Element given an automail link (`<foo@example.com>`).
"""
def handleMatch(self, m):
- el = markdown.etree.Element('a')
- email = m.group(2)
+ el = util.etree.Element('a')
+ email = self.unescape(m.group(2))
if email.startswith("mailto:"):
email = email[len("mailto:"):]
def codepoint2name(code):
"""Return entity definition by code, or the code if not defined."""
- entity = htmlentitydefs.codepoint2name.get(code)
+ entity = entities.codepoint2name.get(code)
if entity:
- return "%s%s;" % (markdown.AMP_SUBSTITUTE, entity)
+ return "%s%s;" % (util.AMP_SUBSTITUTE, entity)
else:
- return "%s#%d;" % (markdown.AMP_SUBSTITUTE, code)
+ return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
letters = [codepoint2name(ord(letter)) for letter in email]
- el.text = markdown.AtomicString(''.join(letters))
+ el.text = util.AtomicString(''.join(letters))
mailto = "mailto:" + email
- mailto = "".join([markdown.AMP_SUBSTITUTE + '#%d;' %
+ mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %
ord(letter) for letter in mailto])
el.set('href', mailto)
return el
diff --git a/src/calibre/ebooks/markdown/markdown.py b/src/calibre/ebooks/markdown/markdown.py
deleted file mode 100644
index 46ac21983c..0000000000
--- a/src/calibre/ebooks/markdown/markdown.py
+++ /dev/null
@@ -1,612 +0,0 @@
-"""
-Python Markdown
-===============
-
-Python Markdown converts Markdown to HTML and can be used as a library or
-called from the command line.
-
-## Basic usage as a module:
-
- import markdown
- md = Markdown()
- html = md.convert(your_text_string)
-
-## Basic use from the command line:
-
- python markdown.py source.txt > destination.html
-
-Run "python markdown.py --help" to see more options.
-
-## Extensions
-
-See <http://www.freewisdom.org/projects/python-markdown/> for more
-information and instructions on how to extend the functionality of
-Python Markdown. Read that before you try modifying this file.
-
-## Authors and License
-
-Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
-maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
-Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
-
-Contact: markdown@freewisdom.org
-
-Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
-Copyright 200? Django Software Foundation (OrderedDict implementation)
-Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
-Copyright 2004 Manfred Stienstra (the original version)
-
-License: BSD (see docs/LICENSE for details).
-"""
-from calibre.ebooks.markdown.commandline import parse_options
-
-version = "2.0"
-version_info = (2,0,0, "Final")
-
-import re
-import codecs
-import sys
-import warnings
-import logging
-from logging import DEBUG, INFO, WARN, ERROR, CRITICAL
-
-
-"""
-CONSTANTS
-=============================================================================
-"""
-
-"""
-Constants you might want to modify
------------------------------------------------------------------------------
-"""
-
-# default logging level for command-line use
-COMMAND_LINE_LOGGING_LEVEL = CRITICAL
-TAB_LENGTH = 4 # expand tabs to this many spaces
-ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz">
-#SMART_EMPHASIS = True # this_or_that does not become thisorthat
-SMART_EMPHASIS = False # this_or_that needs to have _ escaped as \_.
-DEFAULT_OUTPUT_FORMAT = 'xhtml1' # xhtml or html4 output
-HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
-BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
- "|script|noscript|form|fieldset|iframe|math"
- "|ins|del|hr|hr/|style|li|dt|dd|thead|tbody"
- "|tr|th|td")
-DOC_TAG = "div" # Element used to wrap document - later removed
-
-# Placeholders
-STX = u'\u0002' # Use STX ("Start of text") for start-of-placeholder
-ETX = u'\u0003' # Use ETX ("End of text") for end-of-placeholder
-INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
-INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
-AMP_SUBSTITUTE = STX+"amp"+ETX
-
-
-"""
-Constants you probably do not need to change
------------------------------------------------------------------------------
-"""
-
-RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
- # Hebrew (0590-05FF), Arabic (0600-06FF),
- # Syriac (0700-074F), Arabic supplement (0750-077F),
- # Thaana (0780-07BF), Nko (07C0-07FF).
- (u'\u2D30', u'\u2D7F'), # Tifinagh
- )
-
-
-"""
-AUXILIARY GLOBAL FUNCTIONS
-=============================================================================
-"""
-
-
-def message(level, text):
- """ A wrapper method for logging debug messages. """
- logger = logging.getLogger('MARKDOWN')
- if logger.handlers:
- # The logger is configured
- logger.log(level, text)
- if level > WARN:
- sys.exit(0)
- elif level > WARN:
- raise MarkdownException, text
- else:
- warnings.warn(text, MarkdownWarning)
-
-
-def isBlockLevel(tag):
- """Check if the tag is a block level HTML tag."""
- return BLOCK_LEVEL_ELEMENTS.match(tag)
-
-"""
-MISC AUXILIARY CLASSES
-=============================================================================
-"""
-
-class AtomicString(unicode):
- """A string which should not be further processed."""
- pass
-
-
-class MarkdownException(Exception):
- """ A Markdown Exception. """
- pass
-
-
-class MarkdownWarning(Warning):
- """ A Markdown Warning. """
- pass
-
-
-"""
-OVERALL DESIGN
-=============================================================================
-
-Markdown processing takes place in four steps:
-
-1. A bunch of "preprocessors" munge the input text.
-2. BlockParser() parses the high-level structural elements of the
- pre-processed text into an ElementTree.
-3. A bunch of "treeprocessors" are run against the ElementTree. One such
- treeprocessor runs InlinePatterns against the ElementTree, detecting inline
- markup.
-4. Some post-processors are run against the text after the ElementTree has
- been serialized into text.
-5. The output is written to a string.
-
-Those steps are put together by the Markdown() class.
-
-"""
-
-import preprocessors
-import blockprocessors
-import treeprocessors
-import inlinepatterns
-import postprocessors
-import blockparser
-import etree_loader
-import odict
-
-# Extensions should use "markdown.etree" instead of "etree" (or do `from
-# markdown import etree`). Do not import it by yourself.
-
-etree = etree_loader.importETree()
-
-# Adds the ability to output html4
-import html4
-
-
-class Markdown:
- """Convert Markdown to HTML."""
-
- def __init__(self,
- extensions=[],
- extension_configs={},
- safe_mode = False,
- output_format=DEFAULT_OUTPUT_FORMAT):
- """
- Creates a new Markdown instance.
-
- Keyword arguments:
-
- * extensions: A list of extensions.
- If they are of type string, the module mdx_name.py will be loaded.
- If they are a subclass of markdown.Extension, they will be used
- as-is.
- * extension-configs: Configuration setting for extensions.
- * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
- * output_format: Format of output. Supported formats are:
- * "xhtml1": Outputs XHTML 1.x. Default.
- * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
- * "html4": Outputs HTML 4
- * "html": Outputs latest supported version of HTML (currently HTML 4).
- Note that it is suggested that the more specific formats ("xhtml1"
- and "html4") be used as "xhtml" or "html" may change in the future
- if it makes sense at that time.
-
- """
-
- self.safeMode = safe_mode
- self.registeredExtensions = []
- self.docType = ""
- self.stripTopLevelTags = True
-
- # Preprocessors
- self.preprocessors = odict.OrderedDict()
- self.preprocessors["html_block"] = \
- preprocessors.HtmlBlockPreprocessor(self)
- self.preprocessors["reference"] = \
- preprocessors.ReferencePreprocessor(self)
- # footnote preprocessor will be inserted with "amp_substitute"
-
- # Map format keys to serializers
- self.output_formats = {
- 'html' : html4.to_html_string,
- 'html4' : html4.to_html_string,
- 'xhtml' : etree.tostring,
- 'xhtml1': etree.tostring,
- }
-
- self.references = {}
- self.htmlStash = preprocessors.HtmlStash()
- self.registerExtensions(extensions = extensions,
- configs = extension_configs)
- self.set_output_format(output_format)
- self.reset()
-
- def registerExtensions(self, extensions, configs):
- """
- Register extensions with this instance of Markdown.
-
- Keyword aurguments:
-
- * extensions: A list of extensions, which can either
- be strings or objects. See the docstring on Markdown.
- * configs: A dictionary mapping module names to config options.
-
- """
- for ext in extensions:
- if isinstance(ext, basestring):
- ext = load_extension(ext, configs.get(ext, []))
- try:
- ext.extendMarkdown(self, globals())
- except AttributeError:
- message(ERROR, "Incorrect type! Extension '%s' is "
- "neither a string or an Extension." %(repr(ext)))
-
-
- def registerExtension(self, extension):
- """ This gets called by the extension """
- self.registeredExtensions.append(extension)
-
- def reset(self):
- """
- Resets all state variables so that we can start with a new text.
- """
- self.htmlStash.reset()
- self.references.clear()
-
- for extension in self.registeredExtensions:
- extension.reset()
-
- def set_output_format(self, format):
- """ Set the output format for the class instance. """
- try:
- self.serializer = self.output_formats[format.lower()]
- except KeyError:
- message(CRITICAL, 'Invalid Output Format: "%s". Use one of %s.' \
- % (format, self.output_formats.keys()))
-
- def convert(self, source):
- """
- Convert markdown to serialized XHTML or HTML.
-
- Keyword arguments:
-
- * source: Source text as a Unicode string.
-
- """
-
- # Fixup the source text
- if not source.strip():
- return u"" # a blank unicode string
- try:
- source = unicode(source)
- except UnicodeDecodeError:
- message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
- return u""
-
- source = source.replace(STX, "").replace(ETX, "")
- source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
- source = re.sub(r'\n\s+\n', '\n\n', source)
- source = source.expandtabs(TAB_LENGTH)
-
- # Split into lines and run the line preprocessors.
- self.lines = source.split("\n")
- for prep in self.preprocessors.values():
- self.lines = prep.run(self.lines)
-
- # Parse the high-level elements.
- root = self.parser.parseDocument(self.lines).getroot()
-
- # Run the tree-processors
- for treeprocessor in self.treeprocessors.values():
- newRoot = treeprocessor.run(root)
- if newRoot:
- root = newRoot
-
- # Serialize _properly_. Strip top-level tags.
- output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf8"))
- if self.stripTopLevelTags:
- start = output.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2
- end = output.rindex('%s>'%DOC_TAG)
- output = output[start:end].strip()
-
- # Run the text post-processors
- for pp in self.postprocessors.values():
- output = pp.run(output)
-
- return output.strip()
-
- def convertFile(self, input=None, output=None, encoding=None):
- """Converts a markdown file and returns the HTML as a unicode string.
-
- Decodes the file using the provided encoding (defaults to utf-8),
- passes the file content to markdown, and outputs the html to either
- the provided stream or the file with provided name, using the same
- encoding as the source file.
-
- **Note:** This is the only place that decoding and encoding of unicode
- takes place in Python-Markdown. (All other code is unicode-in /
- unicode-out.)
-
- Keyword arguments:
-
- * input: Name of source text file.
- * output: Name of output file. Writes to stdout if `None`.
- * encoding: Encoding of input and output files. Defaults to utf-8.
-
- """
-
- encoding = encoding or "utf-8"
-
- # Read the source
- input_file = codecs.open(input, mode="r", encoding=encoding)
- text = input_file.read()
- input_file.close()
- text = text.lstrip(u'\ufeff') # remove the byte-order mark
-
- # Convert
- html = self.convert(text)
-
- # Write to file or stdout
- if isinstance(output, (str, unicode)):
- output_file = codecs.open(output, "w", encoding=encoding)
- output_file.write(html)
- output_file.close()
- else:
- output.write(html.encode(encoding))
-
-
-"""
-Extensions
------------------------------------------------------------------------------
-"""
-
-class Extension:
- """ Base class for extensions to subclass. """
- def __init__(self, configs = {}):
- """Create an instance of an Extention.
-
- Keyword arguments:
-
- * configs: A dict of configuration setting used by an Extension.
- """
- self.config = configs
-
- def getConfig(self, key):
- """ Return a setting for the given key or an empty string. """
- if key in self.config:
- return self.config[key][0]
- else:
- return ""
-
- def getConfigInfo(self):
- """ Return all config settings as a list of tuples. """
- return [(key, self.config[key][1]) for key in self.config.keys()]
-
- def setConfig(self, key, value):
- """ Set a config setting for `key` with the given `value`. """
- self.config[key][0] = value
-
- def extendMarkdown(self, md, md_globals):
- """
- Add the various proccesors and patterns to the Markdown Instance.
-
- This method must be overriden by every extension.
-
- Keyword arguments:
-
- * md: The Markdown instance.
-
- * md_globals: Global variables in the markdown module namespace.
-
- """
- pass
-
-
-def load_extension(ext_name, configs = []):
- """Load extension by name, then return the module.
-
- The extension name may contain arguments as part of the string in the
- following format: "extname(key1=value1,key2=value2)"
-
- """
-
- # Parse extensions config params (ignore the order)
- configs = dict(configs)
- pos = ext_name.find("(") # find the first "("
- if pos > 0:
- ext_args = ext_name[pos+1:-1]
- ext_name = ext_name[:pos]
- pairs = [x.split("=") for x in ext_args.split(",")]
- configs.update([(x.strip(), y.strip()) for (x, y) in pairs])
-
- # Setup the module names
- ext_module = 'calibre.ebooks.markdown.extensions'
- module_name_new_style = '.'.join([ext_module, ext_name])
- module_name_old_style = '_'.join(['mdx', ext_name])
-
- # Try loading the extention first from one place, then another
- try: # New style (markdown.extensons.)
- module = __import__(module_name_new_style, {}, {}, [ext_module])
- except ImportError:
- try: # Old style (mdx.)
- module = __import__(module_name_old_style)
- except ImportError:
- message(WARN, "Failed loading extension '%s' from '%s' or '%s'"
- % (ext_name, module_name_new_style, module_name_old_style))
- # Return None so we don't try to initiate none-existant extension
- return None
-
- # If the module is loaded successfully, we expect it to define a
- # function called makeExtension()
- try:
- return module.makeExtension(configs.items())
- except AttributeError:
- message(CRITICAL, "Failed to initiate extension '%s'" % ext_name)
-
-
-def load_extensions(ext_names):
- """Loads multiple extensions"""
- extensions = []
- for ext_name in ext_names:
- extension = load_extension(ext_name)
- if extension:
- extensions.append(extension)
- return extensions
-
-
-"""
-EXPORTED FUNCTIONS
-=============================================================================
-
-Those are the two functions we really mean to export: markdown() and
-markdownFromFile().
-"""
-
-def markdown(text,
- extensions = [],
- safe_mode = False,
- output_format = DEFAULT_OUTPUT_FORMAT):
- """Convert a markdown string to HTML and return HTML as a unicode string.
-
- This is a shortcut function for `Markdown` class to cover the most
- basic use case. It initializes an instance of Markdown, loads the
- necessary extensions and runs the parser on the given text.
-
- Keyword arguments:
-
- * text: Markdown formatted text as Unicode or ASCII string.
- * extensions: A list of extensions or extension names (may contain config args).
- * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
- * output_format: Format of output. Supported formats are:
- * "xhtml1": Outputs XHTML 1.x. Default.
- * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
- * "html4": Outputs HTML 4
- * "html": Outputs latest supported version of HTML (currently HTML 4).
- Note that it is suggested that the more specific formats ("xhtml1"
- and "html4") be used as "xhtml" or "html" may change in the future
- if it makes sense at that time.
-
- Returns: An HTML document as a string.
-
- """
- md = Markdown(extensions=load_extensions(extensions),
- safe_mode=safe_mode,
- output_format=output_format)
- return md.convert(text)
-
-
-def markdownFromFile(input = None,
- output = None,
- extensions = [],
- encoding = None,
- safe_mode = False,
- output_format = DEFAULT_OUTPUT_FORMAT):
- """Read markdown code from a file and write it to a file or a stream."""
- md = Markdown(extensions=load_extensions(extensions),
- safe_mode=safe_mode,
- output_format=output_format)
- md.convertFile(input, output, encoding)
-
-
-def main():
- from commandline import run
- run()
-
-
-if __name__ == '__main__':
- sys.exit(main())
- ''' Run Markdown from the command line. '''
diff --git a/src/calibre/ebooks/markdown/odict.py b/src/calibre/ebooks/markdown/odict.py
index 277cd4ebba..8089ece21a 100644
--- a/src/calibre/ebooks/markdown/odict.py
+++ b/src/calibre/ebooks/markdown/odict.py
@@ -1,7 +1,18 @@
+from __future__ import unicode_literals
+from __future__ import absolute_import
+from . import util
+
+from copy import deepcopy
+
+def iteritems_compat(d):
+ """Return an iterator over the (key, value) pairs of a dictionary.
+ Copied from `six` module."""
+ return iter(getattr(d, _iteritems)())
+
class OrderedDict(dict):
"""
A dictionary that keeps its keys in the order in which they're inserted.
-
+
Copied from Django's SortedDict with some modifications.
"""
@@ -11,34 +22,44 @@ class OrderedDict(dict):
return instance
def __init__(self, data=None):
- if data is None:
- data = {}
- super(OrderedDict, self).__init__(data)
- if isinstance(data, dict):
- self.keyOrder = data.keys()
+ if data is None or isinstance(data, dict):
+ data = data or []
+ super(OrderedDict, self).__init__(data)
+ self.keyOrder = list(data) if data else []
else:
- self.keyOrder = []
+ super(OrderedDict, self).__init__()
+ super_set = super(OrderedDict, self).__setitem__
for key, value in data:
- if key not in self.keyOrder:
+ # Take the ordering from first key
+ if key not in self:
self.keyOrder.append(key)
+ # But override with last value in data (dict() does this)
+ super_set(key, value)
def __deepcopy__(self, memo):
- from copy import deepcopy
return self.__class__([(key, deepcopy(value, memo))
- for key, value in self.iteritems()])
+ for key, value in self.items()])
+
+ def __copy__(self):
+ # Python's default copy implementation will alter the state
+ # of self. The reason for this seems complex but is likely related to
+ # subclassing dict.
+ return self.copy()
def __setitem__(self, key, value):
- super(OrderedDict, self).__setitem__(key, value)
- if key not in self.keyOrder:
+ if key not in self:
self.keyOrder.append(key)
+ super(OrderedDict, self).__setitem__(key, value)
def __delitem__(self, key):
super(OrderedDict, self).__delitem__(key)
self.keyOrder.remove(key)
def __iter__(self):
- for k in self.keyOrder:
- yield k
+ return iter(self.keyOrder)
+
+ def __reversed__(self):
+ return reversed(self.keyOrder)
def pop(self, k, *args):
result = super(OrderedDict, self).pop(k, *args)
@@ -54,41 +75,51 @@ class OrderedDict(dict):
self.keyOrder.remove(result[0])
return result
- def items(self):
- return zip(self.keyOrder, self.values())
-
- def iteritems(self):
+ def _iteritems(self):
for key in self.keyOrder:
- yield key, super(OrderedDict, self).__getitem__(key)
+ yield key, self[key]
- def keys(self):
- return self.keyOrder[:]
-
- def iterkeys(self):
- return iter(self.keyOrder)
-
- def values(self):
- return [super(OrderedDict, self).__getitem__(k) for k in self.keyOrder]
-
- def itervalues(self):
+ def _iterkeys(self):
for key in self.keyOrder:
- yield super(OrderedDict, self).__getitem__(key)
+ yield key
+
+ def _itervalues(self):
+ for key in self.keyOrder:
+ yield self[key]
+
+ if util.PY3:
+ items = _iteritems
+ keys = _iterkeys
+ values = _itervalues
+ else:
+ iteritems = _iteritems
+ iterkeys = _iterkeys
+ itervalues = _itervalues
+
+ def items(self):
+ return [(k, self[k]) for k in self.keyOrder]
+
+ def keys(self):
+ return self.keyOrder[:]
+
+ def values(self):
+ return [self[k] for k in self.keyOrder]
def update(self, dict_):
- for k, v in dict_.items():
- self.__setitem__(k, v)
+ for k, v in iteritems_compat(dict_):
+ self[k] = v
def setdefault(self, key, default):
- if key not in self.keyOrder:
+ if key not in self:
self.keyOrder.append(key)
return super(OrderedDict, self).setdefault(key, default)
def value_for_index(self, index):
- """Return the value of the item at the given zero-based index."""
+ """Returns the value of the item at the given zero-based index."""
return self[self.keyOrder[index]]
def insert(self, index, key, value):
- """Insert the key, value pair before the item with the given index."""
+ """Inserts the key, value pair before the item with the given index."""
if key in self.keyOrder:
n = self.keyOrder.index(key)
del self.keyOrder[n]
@@ -98,18 +129,16 @@ class OrderedDict(dict):
super(OrderedDict, self).__setitem__(key, value)
def copy(self):
- """Return a copy of this object."""
+ """Returns a copy of this object."""
# This way of initializing the copy means it works for subclasses, too.
- obj = self.__class__(self)
- obj.keyOrder = self.keyOrder[:]
- return obj
+ return self.__class__(self)
def __repr__(self):
"""
- Replace the normal dict.__repr__ with a version that returns the keys
- in their sorted order.
+ Replaces the normal dict.__repr__ with a version that returns the keys
+ in their insertion order.
"""
- return '{%s}' % ', '.join(['%r: %r' % (k, v) for k, v in self.items()])
+ return '{%s}' % ', '.join(['%r: %r' % (k, v) for k, v in iteritems_compat(self)])
def clear(self):
super(OrderedDict, self).clear()
@@ -117,7 +146,10 @@ class OrderedDict(dict):
def index(self, key):
""" Return the index of a given key. """
- return self.keyOrder.index(key)
+ try:
+ return self.keyOrder.index(key)
+ except ValueError:
+ raise ValueError("Element '%s' was not found in OrderedDict" % key)
def index_for_location(self, location):
""" Return index or None for a given location. """
@@ -150,8 +182,8 @@ class OrderedDict(dict):
""" Change location of an existing item. """
n = self.keyOrder.index(key)
del self.keyOrder[n]
- i = self.index_for_location(location)
try:
+ i = self.index_for_location(location)
if i is not None:
self.keyOrder.insert(i, key)
else:
diff --git a/src/calibre/ebooks/markdown/postprocessors.py b/src/calibre/ebooks/markdown/postprocessors.py
index 80227bb909..5f3f032c15 100644
--- a/src/calibre/ebooks/markdown/postprocessors.py
+++ b/src/calibre/ebooks/markdown/postprocessors.py
@@ -8,15 +8,23 @@ processing.
"""
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from . import util
+from . import odict
+import re
-import markdown
-class Processor:
- def __init__(self, markdown_instance=None):
- if markdown_instance:
- self.markdown = markdown_instance
+def build_postprocessors(md_instance, **kwargs):
+ """ Build the default postprocessors for Markdown. """
+ postprocessors = odict.OrderedDict()
+ postprocessors["raw_html"] = RawHtmlPostprocessor(md_instance)
+ postprocessors["amp_substitute"] = AndSubstitutePostprocessor()
+ postprocessors["unescape"] = UnescapePostprocessor()
+ return postprocessors
-class Postprocessor(Processor):
+
+class Postprocessor(util.Processor):
"""
Postprocessors are run after the ElementTree it converted back into text.
@@ -50,12 +58,12 @@ class RawHtmlPostprocessor(Postprocessor):
elif str(self.markdown.safeMode).lower() == 'remove':
html = ''
else:
- html = markdown.HTML_REMOVED_TEXT
- if safe or not self.markdown.safeMode:
+ html = self.markdown.html_replacement_text
+ if self.isblocklevel(html) and (safe or not self.markdown.safeMode):
text = text.replace("%s
" %
- (markdown.preprocessors.HTML_PLACEHOLDER % i),
+ (self.markdown.htmlStash.get_placeholder(i)),
html + "\n")
- text = text.replace(markdown.preprocessors.HTML_PLACEHOLDER % i,
+ text = text.replace(self.markdown.htmlStash.get_placeholder(i),
html)
return text
@@ -66,12 +74,31 @@ class RawHtmlPostprocessor(Postprocessor):
html = html.replace('>', '>')
return html.replace('"', '"')
+ def isblocklevel(self, html):
+ m = re.match(r'^\<\/?([^ >]+)', html)
+ if m:
+ if m.group(1)[0] in ('!', '?', '@', '%'):
+ # Comment, php etc...
+ return True
+ return util.isBlockLevel(m.group(1))
+ return False
+
class AndSubstitutePostprocessor(Postprocessor):
""" Restore valid entities """
- def __init__(self):
- pass
def run(self, text):
- text = text.replace(markdown.AMP_SUBSTITUTE, "&")
+ text = text.replace(util.AMP_SUBSTITUTE, "&")
return text
+
+
+class UnescapePostprocessor(Postprocessor):
+ """ Restore escaped chars """
+
+ RE = re.compile('%s(\d+)%s' % (util.STX, util.ETX))
+
+ def unescape(self, m):
+ return util.int2str(int(m.group(1)))
+
+ def run(self, text):
+ return self.RE.sub(self.unescape, text)
diff --git a/src/calibre/ebooks/markdown/preprocessors.py b/src/calibre/ebooks/markdown/preprocessors.py
index 712a1e8755..72b2ed6f35 100644
--- a/src/calibre/ebooks/markdown/preprocessors.py
+++ b/src/calibre/ebooks/markdown/preprocessors.py
@@ -1,4 +1,3 @@
-
"""
PRE-PROCESSORS
=============================================================================
@@ -7,18 +6,24 @@ Preprocessors work on source text before we start doing anything too
complicated.
"""
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from . import util
+from . import odict
import re
-import markdown
-HTML_PLACEHOLDER_PREFIX = markdown.STX+"wzxhzdk:"
-HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + markdown.ETX
-class Processor:
- def __init__(self, markdown_instance=None):
- if markdown_instance:
- self.markdown = markdown_instance
+def build_preprocessors(md_instance, **kwargs):
+ """ Build the default set of preprocessors used by Markdown. """
+ preprocessors = odict.OrderedDict()
+ preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance)
+ if md_instance.safeMode != 'escape':
+ preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)
+ preprocessors["reference"] = ReferencePreprocessor(md_instance)
+ return preprocessors
-class Preprocessor (Processor):
+
+class Preprocessor(util.Processor):
"""
Preprocessors are run after the text is broken into lines.
@@ -38,66 +43,95 @@ class Preprocessor (Processor):
"""
pass
-class HtmlStash:
- """
- This class is used for stashing HTML objects that we extract
- in the beginning and replace with place-holders.
- """
- def __init__ (self):
- """ Create a HtmlStash. """
- self.html_counter = 0 # for counting inline html segments
- self.rawHtmlBlocks=[]
+class NormalizeWhitespace(Preprocessor):
+ """ Normalize whitespace for consistent parsing. """
- def store(self, html, safe=False):
- """
- Saves an HTML segment for later reinsertion. Returns a
- placeholder string that needs to be inserted into the
- document.
-
- Keyword arguments:
-
- * html: an html segment
- * safe: label an html segment as safe for safemode
-
- Returns : a placeholder string
-
- """
- self.rawHtmlBlocks.append((html, safe))
- placeholder = HTML_PLACEHOLDER % self.html_counter
- self.html_counter += 1
- return placeholder
-
- def reset(self):
- self.html_counter = 0
- self.rawHtmlBlocks = []
+ def run(self, lines):
+ source = '\n'.join(lines)
+ source = source.replace(util.STX, "").replace(util.ETX, "")
+ source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
+ source = source.expandtabs(self.markdown.tab_length)
+ source = re.sub(r'(?<=\n) +\n', '\n', source)
+ return source.split('\n')
class HtmlBlockPreprocessor(Preprocessor):
"""Remove html blocks from the text and store them for later retrieval."""
right_tag_patterns = ["%s>", "%s>"]
+ attrs_pattern = r"""
+ \s+(?P[^>"'/= ]+)=(?P['"])(?P.*?)(?P=q) # attr="value"
+ | # OR
+ \s+(?P[^>"'/= ]+)=(?P[^> ]+) # attr=value
+ | # OR
+ \s+(?P[^>"'/= ]+) # attr
+ """
+ left_tag_pattern = r'^\<(?P[^> ]+)(?P(%s)*)\s*\/?\>?' % attrs_pattern
+ attrs_re = re.compile(attrs_pattern, re.VERBOSE)
+ left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
+ markdown_in_raw = False
def _get_left_tag(self, block):
- return block[1:].replace(">", " ", 1).split()[0].lower()
+ m = self.left_tag_re.match(block)
+ if m:
+ tag = m.group('tag')
+ raw_attrs = m.group('attrs')
+ attrs = {}
+ if raw_attrs:
+ for ma in self.attrs_re.finditer(raw_attrs):
+ if ma.group('attr'):
+ if ma.group('value'):
+ attrs[ma.group('attr').strip()] = ma.group('value')
+ else:
+ attrs[ma.group('attr').strip()] = ""
+ elif ma.group('attr1'):
+ if ma.group('value1'):
+ attrs[ma.group('attr1').strip()] = ma.group('value1')
+ else:
+ attrs[ma.group('attr1').strip()] = ""
+ elif ma.group('attr2'):
+ attrs[ma.group('attr2').strip()] = ""
+ return tag, len(m.group(0)), attrs
+ else:
+ tag = block[1:].split(">", 1)[0].lower()
+ return tag, len(tag)+2, {}
- def _get_right_tag(self, left_tag, block):
+ def _recursive_tagfind(self, ltag, rtag, start_index, block):
+ while 1:
+ i = block.find(rtag, start_index)
+ if i == -1:
+ return -1
+ j = block.find(ltag, start_index)
+ # if no ltag, or rtag found before another ltag, return index
+ if (j > i or j == -1):
+ return i + len(rtag)
+ # another ltag found before rtag, use end of ltag as starting
+ # point and search again
+ j = block.find('>', j)
+ start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
+ if start_index == -1:
+ # HTML potentially malformed- ltag has no corresponding
+ # rtag
+ return -1
+
+ def _get_right_tag(self, left_tag, left_index, block):
for p in self.right_tag_patterns:
tag = p % left_tag
- i = block.rfind(tag)
+ i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
if i > 2:
- return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag)
- return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)
-
+ return tag.lstrip("<").rstrip(">"), i
+ return block.rstrip()[-left_index:-1].lower(), len(block)
+
def _equal_tags(self, left_tag, right_tag):
- if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
+ if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
return True
if ("/" + left_tag) == right_tag:
return True
if (right_tag == "--" and left_tag == "--"):
return True
elif left_tag == right_tag[1:] \
- and right_tag[0] != "<":
+ and right_tag[0] == "/":
return True
else:
return False
@@ -108,7 +142,7 @@ class HtmlBlockPreprocessor(Preprocessor):
def run(self, lines):
text = "\n".join(lines)
new_blocks = []
- text = text.split("\n\n")
+ text = text.rsplit("\n\n")
items = []
left_tag = ''
right_tag = ''
@@ -124,15 +158,25 @@ class HtmlBlockPreprocessor(Preprocessor):
block = block[1:]
if not in_tag:
- if block.startswith("<"):
- left_tag = self._get_left_tag(block)
- right_tag, data_index = self._get_right_tag(left_tag, block)
+ if block.startswith("<") and len(block.strip()) > 1:
- if data_index < len(block):
+ if block[1] == "!":
+ # is a comment block
+ left_tag, left_index, attrs = "--", 2, {}
+ else:
+ left_tag, left_index, attrs = self._get_left_tag(block)
+ right_tag, data_index = self._get_right_tag(left_tag,
+ left_index,
+ block)
+ # keep checking conditions below and maybe just append
+
+ if data_index < len(block) \
+ and (util.isBlockLevel(left_tag)
+ or left_tag == '--'):
text.insert(0, block[data_index:])
block = block[:data_index]
- if not (markdown.isBlockLevel(left_tag) \
+ if not (util.isBlockLevel(left_tag) \
or block[1] in ["!", "?", "@", "%"]):
new_blocks.append(block)
continue
@@ -141,22 +185,27 @@ class HtmlBlockPreprocessor(Preprocessor):
new_blocks.append(block.strip())
continue
- if block[1] == "!":
- # is a comment block
- left_tag = "--"
- right_tag, data_index = self._get_right_tag(left_tag, block)
- # keep checking conditions below and maybe just append
-
if block.rstrip().endswith(">") \
and self._equal_tags(left_tag, right_tag):
- new_blocks.append(
- self.markdown.htmlStash.store(block.strip()))
+ if self.markdown_in_raw and 'markdown' in attrs.keys():
+ start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
+ '', block[:left_index])
+ end = block[-len(right_tag)-2:]
+ block = block[left_index:-len(right_tag)-2]
+ new_blocks.append(
+ self.markdown.htmlStash.store(start))
+ new_blocks.append(block)
+ new_blocks.append(
+ self.markdown.htmlStash.store(end))
+ else:
+ new_blocks.append(
+ self.markdown.htmlStash.store(block.strip()))
continue
- else: #if not block[1] == "!":
+ else:
# if is block level tag and is not complete
- if markdown.isBlockLevel(left_tag) or left_tag == "--" \
- and not block.rstrip().endswith(">"):
+ if util.isBlockLevel(left_tag) or left_tag == "--" \
+ and not block.rstrip().endswith(">"):
items.append(block.strip())
in_tag = True
else:
@@ -168,19 +217,52 @@ class HtmlBlockPreprocessor(Preprocessor):
new_blocks.append(block)
else:
- items.append(block.strip())
+ items.append(block)
- right_tag, data_index = self._get_right_tag(left_tag, block)
+ right_tag, data_index = self._get_right_tag(left_tag, 0, block)
if self._equal_tags(left_tag, right_tag):
# if find closing tag
+
+ if data_index < len(block):
+ # we have more text after right_tag
+ items[-1] = block[:data_index]
+ text.insert(0, block[data_index:])
+
in_tag = False
- new_blocks.append(
- self.markdown.htmlStash.store('\n\n'.join(items)))
+ if self.markdown_in_raw and 'markdown' in attrs.keys():
+ start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
+ '', items[0][:left_index])
+ items[0] = items[0][left_index:]
+ end = items[-1][-len(right_tag)-2:]
+ items[-1] = items[-1][:-len(right_tag)-2]
+ new_blocks.append(
+ self.markdown.htmlStash.store(start))
+ new_blocks.extend(items)
+ new_blocks.append(
+ self.markdown.htmlStash.store(end))
+ else:
+ new_blocks.append(
+ self.markdown.htmlStash.store('\n\n'.join(items)))
items = []
if items:
- new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
+ if self.markdown_in_raw and 'markdown' in attrs.keys():
+ start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
+ '', items[0][:left_index])
+ items[0] = items[0][left_index:]
+ end = items[-1][-len(right_tag)-2:]
+ items[-1] = items[-1][:-len(right_tag)-2]
+ new_blocks.append(
+ self.markdown.htmlStash.store(start))
+ new_blocks.extend(items)
+ if end.strip():
+ new_blocks.append(
+ self.markdown.htmlStash.store(end))
+ else:
+ new_blocks.append(
+ self.markdown.htmlStash.store('\n\n'.join(items)))
+ #new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
new_blocks.append('\n')
new_text = "\n\n".join(new_blocks)
@@ -190,24 +272,26 @@ class HtmlBlockPreprocessor(Preprocessor):
class ReferencePreprocessor(Preprocessor):
""" Remove reference definitions from text and store for later use. """
- RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)
+ TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
+ RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
+ TITLE_RE = re.compile(r'^%s$' % TITLE)
def run (self, lines):
new_text = [];
- for line in lines:
+ while lines:
+ line = lines.pop(0)
m = self.RE.match(line)
if m:
- id = m.group(2).strip().lower()
- t = m.group(4).strip() # potential title
+ id = m.group(1).strip().lower()
+ link = m.group(2).lstrip('<').rstrip('>')
+ t = m.group(5) or m.group(6) or m.group(7)
if not t:
- self.markdown.references[id] = (m.group(3), t)
- elif (len(t) >= 2
- and (t[0] == t[-1] == "\""
- or t[0] == t[-1] == "\'"
- or (t[0] == "(" and t[-1] == ")") ) ):
- self.markdown.references[id] = (m.group(3), t[1:-1])
- else:
- new_text.append(line)
+ # Check next line for title
+ tm = self.TITLE_RE.match(lines[0])
+ if tm:
+ lines.pop(0)
+ t = tm.group(2) or tm.group(3) or tm.group(4)
+ self.markdown.references[id] = (link, t)
else:
new_text.append(line)
diff --git a/src/calibre/ebooks/markdown/html4.py b/src/calibre/ebooks/markdown/serializers.py
similarity index 71%
rename from src/calibre/ebooks/markdown/html4.py
rename to src/calibre/ebooks/markdown/serializers.py
index 08f241d57a..b19d61c93d 100644
--- a/src/calibre/ebooks/markdown/html4.py
+++ b/src/calibre/ebooks/markdown/serializers.py
@@ -1,6 +1,6 @@
-# markdown/html4.py
+# markdown/serializers.py
#
-# Add html4 serialization to older versions of Elementree
+# Add x/html serialization to ElementTree
# Taken from ElementTree 1.3 preview with slight modifications
#
# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
@@ -37,12 +37,19 @@
# --------------------------------------------------------------------
-import markdown
-ElementTree = markdown.etree.ElementTree
-QName = markdown.etree.QName
-Comment = markdown.etree.Comment
-PI = markdown.etree.PI
-ProcessingInstruction = markdown.etree.ProcessingInstruction
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from . import util
+ElementTree = util.etree.ElementTree
+QName = util.etree.QName
+if hasattr(util.etree, 'test_comment'):
+ Comment = util.etree.test_comment
+else:
+ Comment = util.etree.Comment
+PI = util.etree.PI
+ProcessingInstruction = util.etree.ProcessingInstruction
+
+__all__ = ['to_html_string', 'to_xhtml_string']
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
"img", "input", "isindex", "link", "meta" "param")
@@ -77,7 +84,7 @@ def _encode(text, encoding):
except (TypeError, AttributeError):
_raise_serialization_error(text)
-def _escape_cdata(text, encoding):
+def _escape_cdata(text):
# escape character data
try:
# it's worth avoiding do-nothing calls for strings that are
@@ -89,12 +96,12 @@ def _escape_cdata(text, encoding):
text = text.replace("<", "<")
if ">" in text:
text = text.replace(">", ">")
- return text.encode(encoding, "xmlcharrefreplace")
+ return text
except (TypeError, AttributeError):
_raise_serialization_error(text)
-def _escape_attrib(text, encoding):
+def _escape_attrib(text):
# escape attribute value
try:
if "&" in text:
@@ -107,38 +114,40 @@ def _escape_attrib(text, encoding):
text = text.replace("\"", """)
if "\n" in text:
text = text.replace("\n", "
")
- return text.encode(encoding, "xmlcharrefreplace")
+ return text
except (TypeError, AttributeError):
_raise_serialization_error(text)
-def _escape_attrib_html(text, encoding):
+def _escape_attrib_html(text):
# escape attribute value
try:
if "&" in text:
text = text.replace("&", "&")
+ if "<" in text:
+ text = text.replace("<", "<")
if ">" in text:
text = text.replace(">", ">")
if "\"" in text:
text = text.replace("\"", """)
- return text.encode(encoding, "xmlcharrefreplace")
+ return text
except (TypeError, AttributeError):
_raise_serialization_error(text)
-def _serialize_html(write, elem, encoding, qnames, namespaces):
+def _serialize_html(write, elem, qnames, namespaces, format):
tag = elem.tag
text = elem.text
if tag is Comment:
- write("" % _escape_cdata(text, encoding))
+ write("" % _escape_cdata(text))
elif tag is ProcessingInstruction:
- write("%s?>" % _escape_cdata(text, encoding))
+ write("%s?>" % _escape_cdata(text))
else:
tag = qnames[tag]
if tag is None:
if text:
- write(_escape_cdata(text, encoding))
+ write(_escape_cdata(text))
for e in elem:
- _serialize_html(write, e, encoding, qnames, None)
+ _serialize_html(write, e, qnames, None, format)
else:
write("<" + tag)
items = elem.items()
@@ -150,54 +159,55 @@ def _serialize_html(write, elem, encoding, qnames, namespaces):
if isinstance(v, QName):
v = qnames[v.text]
else:
- v = _escape_attrib_html(v, encoding)
- # FIXME: handle boolean attributes
- write(" %s=\"%s\"" % (qnames[k], v))
+ v = _escape_attrib_html(v)
+ if qnames[k] == v and format == 'html':
+ # handle boolean attributes
+ write(" %s" % v)
+ else:
+ write(" %s=\"%s\"" % (qnames[k], v))
if namespaces:
items = namespaces.items()
items.sort(key=lambda x: x[1]) # sort on prefix
for v, k in items:
if k:
k = ":" + k
- write(" xmlns%s=\"%s\"" % (
- k.encode(encoding),
- _escape_attrib(v, encoding)
- ))
- write(">")
- tag = tag.lower()
- if text:
- if tag == "script" or tag == "style":
- write(_encode(text, encoding))
- else:
- write(_escape_cdata(text, encoding))
- for e in elem:
- _serialize_html(write, e, encoding, qnames, None)
- if tag not in HTML_EMPTY:
- write("" + tag + ">")
+ write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
+ if format == "xhtml" and tag in HTML_EMPTY:
+ write(" />")
+ else:
+ write(">")
+ tag = tag.lower()
+ if text:
+ if tag == "script" or tag == "style":
+ write(text)
+ else:
+ write(_escape_cdata(text))
+ for e in elem:
+ _serialize_html(write, e, qnames, None, format)
+ if tag not in HTML_EMPTY:
+ write("" + tag + ">")
if elem.tail:
- write(_escape_cdata(elem.tail, encoding))
+ write(_escape_cdata(elem.tail))
-def write_html(root, f,
- # keyword arguments
- encoding="us-ascii",
- default_namespace=None):
+def _write_html(root,
+ encoding=None,
+ default_namespace=None,
+ format="html"):
assert root is not None
- if not hasattr(f, "write"):
- f = open(f, "wb")
- write = f.write
- if not encoding:
- encoding = "us-ascii"
- qnames, namespaces = _namespaces(
- root, encoding, default_namespace
- )
- _serialize_html(
- write, root, encoding, qnames, namespaces
- )
+ data = []
+ write = data.append
+ qnames, namespaces = _namespaces(root, default_namespace)
+ _serialize_html(write, root, qnames, namespaces, format)
+ if encoding is None:
+ return "".join(data)
+ else:
+ return _encode("".join(data))
+
# --------------------------------------------------------------------
# serialization support
-def _namespaces(elem, encoding, default_namespace=None):
+def _namespaces(elem, default_namespace=None):
# identify namespaces used in this tree
# maps qnames to *encoded* prefix:local names
@@ -208,9 +218,6 @@ def _namespaces(elem, encoding, default_namespace=None):
if default_namespace:
namespaces[default_namespace] = ""
- def encode(text):
- return text.encode(encoding)
-
def add_qname(qname):
# calculate serialized qname representation
try:
@@ -224,17 +231,16 @@ def _namespaces(elem, encoding, default_namespace=None):
if prefix != "xml":
namespaces[uri] = prefix
if prefix:
- qnames[qname] = encode("%s:%s" % (prefix, tag))
+ qnames[qname] = "%s:%s" % (prefix, tag)
else:
- qnames[qname] = encode(tag) # default element
+ qnames[qname] = tag # default element
else:
if default_namespace:
- # FIXME: can this be handled in XML 1.0?
raise ValueError(
"cannot use non-qualified names with "
"default_namespace option"
)
- qnames[qname] = encode(qname)
+ qnames[qname] = qname
except TypeError:
_raise_serialization_error(qname)
@@ -247,7 +253,7 @@ def _namespaces(elem, encoding, default_namespace=None):
tag = elem.tag
if isinstance(tag, QName) and tag.text not in qnames:
add_qname(tag.text)
- elif isinstance(tag, basestring):
+ elif isinstance(tag, util.string_type):
if tag not in qnames:
add_qname(tag)
elif tag is not None and tag is not Comment and tag is not PI:
@@ -264,11 +270,8 @@ def _namespaces(elem, encoding, default_namespace=None):
add_qname(text.text)
return qnames, namespaces
-def to_html_string(element, encoding=None):
- class dummy:
- pass
- data = []
- file = dummy()
- file.write = data.append
- write_html(ElementTree(element).getroot(),file,encoding)
- return "".join(data)
+def to_html_string(element):
+ return _write_html(ElementTree(element).getroot(), format="html")
+
+def to_xhtml_string(element):
+ return _write_html(ElementTree(element).getroot(), format="xhtml")
diff --git a/src/calibre/ebooks/markdown/treeprocessors.py b/src/calibre/ebooks/markdown/treeprocessors.py
index 0604b1970f..e6d3dc9381 100644
--- a/src/calibre/ebooks/markdown/treeprocessors.py
+++ b/src/calibre/ebooks/markdown/treeprocessors.py
@@ -1,16 +1,26 @@
-import markdown
-import re
+from __future__ import unicode_literals
+from __future__ import absolute_import
+from . import util
+from . import odict
+from . import inlinepatterns
+
+
+def build_treeprocessors(md_instance, **kwargs):
+ """ Build the default treeprocessors for Markdown. """
+ treeprocessors = odict.OrderedDict()
+ treeprocessors["inline"] = InlineProcessor(md_instance)
+ treeprocessors["prettify"] = PrettifyTreeprocessor(md_instance)
+ return treeprocessors
+
def isString(s):
""" Check if it's string """
- return isinstance(s, unicode) or isinstance(s, str)
+ if not isinstance(s, util.AtomicString):
+ return isinstance(s, util.string_type)
+ return False
-class Processor:
- def __init__(self, markdown_instance=None):
- if markdown_instance:
- self.markdown = markdown_instance
-class Treeprocessor(Processor):
+class Treeprocessor(util.Processor):
"""
Treeprocessors are run on the ElementTree object before serialization.
@@ -24,8 +34,8 @@ class Treeprocessor(Processor):
def run(self, root):
"""
Subclasses of Treeprocessor should implement a `run` method, which
- takes a root ElementTree. This method can return another ElementTree
- object, and the existing root ElementTree will be replaced, or it can
+ takes a root ElementTree. This method can return another ElementTree
+ object, and the existing root ElementTree will be replaced, or it can
modify the current tree and return None.
"""
pass
@@ -36,18 +46,18 @@ class InlineProcessor(Treeprocessor):
A Treeprocessor that traverses a tree, applying inline patterns.
"""
- def __init__ (self, md):
- self.__placeholder_prefix = markdown.INLINE_PLACEHOLDER_PREFIX
- self.__placeholder_suffix = markdown.ETX
+ def __init__(self, md):
+ self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
+ self.__placeholder_suffix = util.ETX
self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
+ len(self.__placeholder_suffix)
- self.__placeholder_re = re.compile(markdown.INLINE_PLACEHOLDER % r'([0-9]{4})')
+ self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
self.markdown = md
def __makePlaceholder(self, type):
""" Generate a placeholder """
id = "%04d" % len(self.stashed_nodes)
- hash = markdown.INLINE_PLACEHOLDER % id
+ hash = util.INLINE_PLACEHOLDER % id
return hash, id
def __findPlaceholder(self, data, index):
@@ -60,8 +70,8 @@ class InlineProcessor(Treeprocessor):
* index: index, from which we start search
Returns: placeholder id and string index, after the found placeholder.
+
"""
-
m = self.__placeholder_re.search(data, index)
if m:
return m.group(1), m.end()
@@ -87,7 +97,7 @@ class InlineProcessor(Treeprocessor):
Returns: String with placeholders.
"""
- if not isinstance(data, markdown.AtomicString):
+ if not isinstance(data, util.AtomicString):
startIndex = 0
while patternIndex < len(self.markdown.inlinePatterns):
data, matched, startIndex = self.__applyPattern(
@@ -140,6 +150,7 @@ class InlineProcessor(Treeprocessor):
* parent: Element, which contains processing inline data
Returns: list with ElementTree elements with applied inline patterns.
+
"""
def linkText(text):
if text:
@@ -153,7 +164,6 @@ class InlineProcessor(Treeprocessor):
parent.text += text
else:
parent.text = text
-
result = []
strartIndex = 0
while data:
@@ -172,7 +182,7 @@ class InlineProcessor(Treeprocessor):
for child in [node] + node.getchildren():
if child.tail:
if child.tail.strip():
- self.__processElementText(node, child, False)
+ self.__processElementText(node, child,False)
if child.text:
if child.text.strip():
self.__processElementText(child, child)
@@ -190,6 +200,9 @@ class InlineProcessor(Treeprocessor):
strartIndex = end
else:
text = data[strartIndex:]
+ if isinstance(data, util.AtomicString):
+ # We don't want to lose the AtomicString
+ text = util.AtomicString(text)
linkText(text)
data = ""
@@ -205,7 +218,7 @@ class InlineProcessor(Treeprocessor):
* data: the text to be processed
* pattern: the pattern to be checked
* patternIndex: index of current pattern
- * startIndex: string index, from which we starting search
+ * startIndex: string index, from which we start searching
Returns: String with placeholders instead of ElementTree elements.
@@ -219,14 +232,14 @@ class InlineProcessor(Treeprocessor):
node = pattern.handleMatch(match)
if node is None:
- return data, True, len(leftData) + match.span(len(match.groups()))[0]
+ return data, True, len(leftData)+match.span(len(match.groups()))[0]
if not isString(node):
- if not isinstance(node.text, markdown.AtomicString):
+ if not isinstance(node.text, util.AtomicString):
# We need to process current node too
for child in [node] + node.getchildren():
if not isString(node):
- if child.text:
+ if child.text:
child.text = self.__handleInline(child.text,
patternIndex + 1)
if child.tail:
@@ -244,14 +257,14 @@ class InlineProcessor(Treeprocessor):
Iterate over ElementTree, find elements with inline tag, apply inline
patterns and append newly created Elements to tree. If you don't
- want process your data with inline paterns, instead of normal string,
+ want to process your data with inline patterns, instead of normal string,
use subclass AtomicString:
- node.text = markdown.AtomicString("data won't be processed with inline patterns")
+ node.text = markdown.AtomicString("This will not be processed.")
Arguments:
- * markdownTree: ElementTree object, representing Markdown tree.
+ * tree: ElementTree object, representing Markdown tree.
Returns: ElementTree object with applied inline patterns.
@@ -264,33 +277,46 @@ class InlineProcessor(Treeprocessor):
currElement = stack.pop()
insertQueue = []
for child in currElement.getchildren():
- if child.text and not isinstance(child.text, markdown.AtomicString):
+ if child.text and not isinstance(child.text, util.AtomicString):
text = child.text
child.text = None
lst = self.__processPlaceholders(self.__handleInline(
text), child)
stack += lst
insertQueue.append((child, lst))
-
+ if child.tail:
+ tail = self.__handleInline(child.tail)
+ dumby = util.etree.Element('d')
+ tailResult = self.__processPlaceholders(tail, dumby)
+ if dumby.text:
+ child.tail = dumby.text
+ else:
+ child.tail = None
+ pos = currElement.getchildren().index(child) + 1
+ tailResult.reverse()
+ for newChild in tailResult:
+ currElement.insert(pos, newChild)
if child.getchildren():
stack.append(child)
for element, lst in insertQueue:
- if element.text:
- element.text = \
- markdown.inlinepatterns.handleAttributes(element.text,
- element)
+ if self.markdown.enable_attributes:
+ if element.text and isString(element.text):
+ element.text = \
+ inlinepatterns.handleAttributes(element.text,
+ element)
i = 0
for newChild in lst:
- # Processing attributes
- if newChild.tail:
- newChild.tail = \
- markdown.inlinepatterns.handleAttributes(newChild.tail,
- element)
- if newChild.text:
- newChild.text = \
- markdown.inlinepatterns.handleAttributes(newChild.text,
- newChild)
+ if self.markdown.enable_attributes:
+ # Processing attributes
+ if newChild.tail and isString(newChild.tail):
+ newChild.tail = \
+ inlinepatterns.handleAttributes(newChild.tail,
+ element)
+ if newChild.text and isString(newChild.text):
+ newChild.text = \
+ inlinepatterns.handleAttributes(newChild.text,
+ newChild)
element.insert(i, newChild)
i += 1
return tree
@@ -303,12 +329,12 @@ class PrettifyTreeprocessor(Treeprocessor):
""" Recursively add linebreaks to ElementTree children. """
i = "\n"
- if markdown.isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']:
+ if util.isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']:
if (not elem.text or not elem.text.strip()) \
- and len(elem) and markdown.isBlockLevel(elem[0].tag):
+ and len(elem) and util.isBlockLevel(elem[0].tag):
elem.text = i
for e in elem:
- if markdown.isBlockLevel(e.tag):
+ if util.isBlockLevel(e.tag):
self._prettifyETree(e)
if not elem.tail or not elem.tail.strip():
elem.tail = i
@@ -327,3 +353,8 @@ class PrettifyTreeprocessor(Treeprocessor):
br.tail = '\n'
else:
br.tail = '\n%s' % br.tail
+ # Clean up extra empty lines at end of code blocks.
+ pres = root.getiterator('pre')
+ for pre in pres:
+ if len(pre) and pre[0].tag == 'code':
+ pre[0].text = pre[0].text.rstrip() + '\n'
diff --git a/src/calibre/ebooks/markdown/util.py b/src/calibre/ebooks/markdown/util.py
new file mode 100644
index 0000000000..1036197c47
--- /dev/null
+++ b/src/calibre/ebooks/markdown/util.py
@@ -0,0 +1,136 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+import re
+import sys
+
+
+"""
+Python 3 Stuff
+=============================================================================
+"""
+PY3 = sys.version_info[0] == 3
+
+if PY3:
+ string_type = str
+ text_type = str
+ int2str = chr
+else:
+ string_type = basestring
+ text_type = unicode
+ int2str = unichr
+
+
+"""
+Constants you might want to modify
+-----------------------------------------------------------------------------
+"""
+
+BLOCK_LEVEL_ELEMENTS = re.compile("^(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
+ "|script|noscript|form|fieldset|iframe|math"
+ "|hr|hr/|style|li|dt|dd|thead|tbody"
+ "|tr|th|td|section|footer|header|group|figure"
+ "|figcaption|aside|article|canvas|output"
+ "|progress|video)$", re.IGNORECASE)
+# Placeholders
+STX = '\u0002' # Use STX ("Start of text") for start-of-placeholder
+ETX = '\u0003' # Use ETX ("End of text") for end-of-placeholder
+INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
+INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
+INLINE_PLACEHOLDER_RE = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})')
+AMP_SUBSTITUTE = STX+"amp"+ETX
+
+"""
+Constants you probably do not need to change
+-----------------------------------------------------------------------------
+"""
+
+RTL_BIDI_RANGES = ( ('\u0590', '\u07FF'),
+ # Hebrew (0590-05FF), Arabic (0600-06FF),
+ # Syriac (0700-074F), Arabic supplement (0750-077F),
+ # Thaana (0780-07BF), Nko (07C0-07FF).
+ ('\u2D30', '\u2D7F'), # Tifinagh
+ )
+
+# Extensions should use "markdown.util.etree" instead of "etree" (or do `from
+# markdown.util import etree`). Do not import it by yourself.
+
+try: # Is the C implementation of ElementTree available?
+ import xml.etree.cElementTree as etree
+ from xml.etree.ElementTree import Comment
+ # Serializers (including ours) test with non-c Comment
+ etree.test_comment = Comment
+ if etree.VERSION < "1.0.5":
+ raise RuntimeError("cElementTree version 1.0.5 or higher is required.")
+except (ImportError, RuntimeError):
+ # Use the Python implementation of ElementTree?
+ import xml.etree.ElementTree as etree
+ if etree.VERSION < "1.1":
+ raise RuntimeError("ElementTree version 1.1 or higher is required")
+
+
+"""
+AUXILIARY GLOBAL FUNCTIONS
+=============================================================================
+"""
+
+
+def isBlockLevel(tag):
+ """Check if the tag is a block level HTML tag."""
+ if isinstance(tag, string_type):
+ return BLOCK_LEVEL_ELEMENTS.match(tag)
+ # Some ElementTree tags are not strings, so return False.
+ return False
+
+"""
+MISC AUXILIARY CLASSES
+=============================================================================
+"""
+
+class AtomicString(text_type):
+ """A string which should not be further processed."""
+ pass
+
+
+class Processor(object):
+ def __init__(self, markdown_instance=None):
+ if markdown_instance:
+ self.markdown = markdown_instance
+
+
+class HtmlStash(object):
+ """
+ This class is used for stashing HTML objects that we extract
+ in the beginning and replace with place-holders.
+ """
+
+ def __init__ (self):
+ """ Create a HtmlStash. """
+ self.html_counter = 0 # for counting inline html segments
+ self.rawHtmlBlocks=[]
+
+ def store(self, html, safe=False):
+ """
+ Saves an HTML segment for later reinsertion. Returns a
+ placeholder string that needs to be inserted into the
+ document.
+
+ Keyword arguments:
+
+ * html: an html segment
+ * safe: label an html segment as safe for safemode
+
+ Returns : a placeholder string
+
+ """
+ self.rawHtmlBlocks.append((html, safe))
+ placeholder = self.get_placeholder(self.html_counter)
+ self.html_counter += 1
+ return placeholder
+
+ def reset(self):
+ self.html_counter = 0
+ self.rawHtmlBlocks = []
+
+ def get_placeholder(self, key):
+ return "%swzxhzdk:%d%s" % (STX, key, ETX)
+
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 2420236114..8d6139e1cf 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -97,9 +97,9 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
def convert_markdown(txt, title='', extensions=('footnotes', 'tables', 'toc')):
from calibre.ebooks.conversion.plugins.txt_input import MD_EXTENSIONS
- from calibre.ebooks.markdown import markdown
+ from calibre.ebooks.markdown import Markdown
extensions = [x.lower() for x in extensions if x.lower() in MD_EXTENSIONS]
- md = markdown.Markdown(
+ md = Markdown(
extensions,
safe_mode=False)
return HTML_TEMPLATE % (title, md.convert(txt))