Fix txt2lrf processing of long unstructured txt files

This commit is contained in:
Kovid Goyal 2007-08-25 02:51:27 +00:00
parent ae0b5d3168
commit 58c15ac8b7
2 changed files with 382 additions and 231 deletions

View File

@ -31,10 +31,12 @@ def option_parser():
'the text in mybook.txt. Default is to try to autodetect.'
parser.add_option('-e', '--encoding', action='store', type='string', \
dest='encoding', help=enchelp, default=None)
parser.add_option('--debug-html-generation', action='store_true', default=False,
dest='debug_html_generation', help='Print generated HTML to stdout and quit.')
return parser
def generate_html(txtfile, encoding):
def generate_html(txtfile, encoding, logger):
'''
Convert txtfile to html and return a PersistentTemporaryFile object pointing
to the file with the HTML.
@ -54,12 +56,14 @@ def generate_html(txtfile, encoding):
raise ConversionError, 'Could not detect encoding of %s'%(txtfile,)
else:
txt = codecs.open(txtfile, 'rb', enc).read()
logger.info('Converting text to HTML...')
md = markdown.Markdown(txt,
extensions=['footnotes', 'tables', 'toc'],
encoding=enc,
safe_mode=False,
)
html = md.toString()
p = PersistentTemporaryFile('.html', dir=os.path.dirname(txtfile))
p.close()
codecs.open(p.name, 'wb', enc).write(html)
@ -73,7 +77,10 @@ def process_file(path, options, logger=None):
txt = os.path.abspath(os.path.expanduser(path))
if not hasattr(options, 'encoding'):
options.encoding = None
htmlfile = generate_html(txt, options.encoding)
if not hasattr(options, 'debug_html_generation'):
options.debug_html_generation = False
htmlfile = generate_html(txt, options.encoding, logger)
if not options.debug_html_generation:
options.force_page_break = 'h2'
if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
@ -81,6 +88,8 @@ def process_file(path, options, logger=None):
options.output = os.path.abspath(os.path.expanduser(options.output))
html_process_file(htmlfile.name, options, logger)
else:
print open(htmlfile.name, 'rb').read()
def main(args=sys.argv, logger=None):
parser = option_parser()

View File

@ -1,43 +1,37 @@
#!/usr/bin/env python
# The following constant specifies the name used in the usage
# statement displayed for python versions lower than 2.3. (With
# python2.3 and higher the usage statement is generated by optparse
# and uses the actual name of the executable called.)
EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
SPEED_TEST = 0
version = "1.6b"
version_info = (1,6,2,"rc-2")
__revision__ = "$Rev$"
"""
====================================================================
IF YOU ARE LOOKING TO EXTEND MARKDOWN, SEE THE "FOOTNOTES" SECTION
====================================================================
Python-Markdown
===============
Converts Markdown to HTML. Basic usage as a module:
import markdown
html = markdown.markdown(your_text_string)
md = Markdown()
html = markdown.convert(your_text_string)
See http://www.freewisdom.org/projects/python-markdown/ for more
information and instructions on how to extend the functionality of the
script. (You might want to read that before you try modifying this
file.)
Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
maintained by [Yuri Takhteyev](http://www.freewisdom.org).
Project website: http://www.freewisdom.org/projects/python-markdown
Contact: yuri [at] freewisdom.org
License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD
Version: 1.5a (July 9, 2006)
For changelog, see end of file
"""
import re, sys, os, random, codecs
# set debug level: 3 none, 2 critical, 1 informative, 0 all
import re, sys, codecs
# Set debug level: 3 none, 2 critical, 1 informative, 0 all
(VERBOSE, INFO, CRITICAL, NONE) = range(4)
MESSAGE_THRESHOLD = CRITICAL
@ -49,10 +43,45 @@ def message(level, text) :
# --------------- CONSTANTS YOU MIGHT WANT TO MODIFY -----------------
# all tabs will be expanded to up to this many spaces
TAB_LENGTH = 4
ENABLE_ATTRIBUTES = 1
SMART_EMPHASIS = 1
TAB_LENGTH = 4 # expand tabs to this many spaces
ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz">
SMART_EMPHASIS = 1 # this_or_that does not become this<i>or</i>that
HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
# from Hebrew to Nko (includes Arabic, Syriac and Thaana)
(u'\u2D30', u'\u2D7F'),
# Tifinagh
)
# Unicode Reference Table:
# 0590-05FF - Hebrew
# 0600-06FF - Arabic
# 0700-074F - Syriac
# 0750-077F - Arabic Supplement
# 0780-07BF - Thaana
# 07C0-07FF - Nko
BOMS = { 'utf-8' : (unicode(codecs.BOM_UTF8, "utf-8"), ),
'utf-16' : (unicode(codecs.BOM_UTF16_LE, "utf-16"),
unicode(codecs.BOM_UTF16_BE, "utf-16")),
#'utf-32' : (unicode(codecs.BOM_UTF32_LE, "utf-32"),
# unicode(codecs.BOM_UTF32_BE, "utf-32")),
}
def removeBOM(text, encoding):
for bom in BOMS[encoding]:
if text.startswith(bom):
return text.lstrip(bom)
return text
# The following constant specifies the name used in the usage
# statement displayed for python versions lower than 2.3. (With
# python2.3 and higher the usage statement is generated by optparse
# and uses the actual name of the executable called.)
EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
# --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ----------
@ -82,14 +111,50 @@ Importantly, NanoDom does not do normalization, which is what we
want. It also adds extra white space when converting DOM to string
"""
ENTITY_NORMALIZATION_EXPRESSIONS = [ (re.compile("&"), "&amp;"),
(re.compile("<"), "&lt;"),
(re.compile(">"), "&gt;"),
(re.compile("\""), "&quot;")]
ENTITY_NORMALIZATION_EXPRESSIONS_SOFT = [ (re.compile("&(?!\#)"), "&amp;"),
(re.compile("<"), "&lt;"),
(re.compile(">"), "&gt;"),
(re.compile("\""), "&quot;")]
def getBidiType(text) :
if not text : return None
ch = text[0]
if not isinstance(ch, unicode) or not ch.isalpha():
return None
else :
for min, max in RTL_BIDI_RANGES :
if ( ch >= min and ch <= max ) :
return "rtl"
else :
return "ltr"
class Document :
def __init__ (self) :
self.bidi = "ltr"
def appendChild(self, child) :
self.documentElement = child
child.isDocumentElement = True
child.parent = self
self.entities = {}
def setBidi(self, bidi) :
if bidi :
self.bidi = bidi
def createElement(self, tag, textNode=None) :
el = Element(tag)
el.doc = self
@ -107,19 +172,23 @@ class Document :
self.entities[entity] = EntityReference(entity)
return self.entities[entity]
def createCDATA(self, text) :
node = CDATA(text)
node.doc = self
return node
def toxml (self) :
return self.documentElement.toxml()
def normalizeEntities(self, text) :
def normalizeEntities(self, text, avoidDoubleNormalizing=False) :
pairs = [ ("&", "&amp;"),
("<", "&lt;"),
(">", "&gt;"),
("\"", "&quot;")]
if avoidDoubleNormalizing :
regexps = ENTITY_NORMALIZATION_EXPRESSIONS_SOFT
else :
regexps = ENTITY_NORMALIZATION_EXPRESSIONS
for old, new in pairs :
text = text.replace(old, new)
for regexp, substitution in regexps :
text = regexp.sub(substitution, text)
return text
def find(self, test) :
@ -130,6 +199,19 @@ class Document :
self.documentElement = None
class CDATA :
type = "cdata"
def __init__ (self, text) :
self.text = text
def handleAttributes(self) :
pass
def toxml (self) :
return "<![CDATA[" + self.text + "]]>"
class Element :
type = "element"
@ -140,6 +222,19 @@ class Element :
self.attributes = []
self.attribute_values = {}
self.childNodes = []
self.bidi = None
self.isDocumentElement = False
def setBidi(self, bidi) :
if bidi :
if not self.bidi or self.isDocumentElement:
# Once the bidi is set don't change it (except for doc element)
self.bidi = bidi
self.parent.setBidi(bidi)
def unlink(self) :
for child in self.childNodes :
@ -186,27 +281,56 @@ class Element :
if ENABLE_ATTRIBUTES :
for child in self.childNodes:
child.handleAttributes()
buffer = ""
if self.nodeName in ['h1', 'h2', 'h3', 'h4'] :
buffer += "\n"
elif self.nodeName in ['li'] :
buffer += "\n "
# Process children FIRST, then do the attributes
childBuffer = ""
if self.childNodes or self.nodeName in ['blockquote']:
childBuffer += ">"
for child in self.childNodes :
childBuffer += child.toxml()
if self.nodeName == 'p' :
childBuffer += "\n"
elif self.nodeName == 'li' :
childBuffer += "\n "
childBuffer += "</%s>" % self.nodeName
else :
childBuffer += "/>"
buffer += "<" + self.nodeName
if self.nodeName in ['p', 'li', 'ul', 'ol',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] :
if not self.attribute_values.has_key("dir"):
if self.bidi :
bidi = self.bidi
else :
bidi = self.doc.bidi
if bidi=="rtl" :
self.setAttribute("dir", "rtl")
for attr in self.attributes :
value = self.attribute_values[attr]
value = self.doc.normalizeEntities(value)
value = self.doc.normalizeEntities(value,
avoidDoubleNormalizing=True)
buffer += ' %s="%s"' % (attr, value)
if self.childNodes or self.nodeName in ['blockquote']:
buffer += ">"
for child in self.childNodes :
buffer += child.toxml()
if self.nodeName == 'p' :
buffer += "\n"
elif self.nodeName == 'li' :
buffer += "\n "
buffer += "</%s>" % self.nodeName
else :
buffer += "/>"
# Now let's actually append the children
buffer += childBuffer
if self.nodeName in ['p', 'li', 'ul', 'ol',
'h1', 'h2', 'h3', 'h4'] :
buffer += "\n"
@ -223,13 +347,18 @@ class TextNode :
self.value = text
def attributeCallback(self, match) :
self.parent.setAttribute(match.group(1), match.group(2))
def handleAttributes(self) :
self.value = self.attrRegExp.sub(self.attributeCallback, self.value)
def toxml(self) :
text = self.value
self.parent.setBidi(getBidiType(text))
if not text.startswith(HTML_PLACEHOLDER_PREFIX):
if self.parent.nodeName == "p" :
text = text.replace("\n", "\n ")
@ -262,10 +391,10 @@ class EntityReference:
Preprocessors munge source text before we start doing anything too
complicated.
Each preprocessor implements a "run" method that takes a pointer to a list of lines of the document,
modifies it as necessary and returns either the same pointer or a
pointer to a new list. Preprocessors must extend
markdown.Preprocessor.
Each preprocessor implements a "run" method that takes a pointer to a
list of lines of the document, modifies it as necessary and returns
either the same pointer or a pointer to a new list. Preprocessors
must extend markdown.Preprocessor.
"""
@ -305,10 +434,6 @@ class HeaderPreprocessor (Preprocessor):
lines[i] = "## " + lines[i].strip()
lines[i+1] = ""
#for l in lines :
# print l.encode('utf8')
#sys.exit(0)
return lines
HEADER_PREPROCESSOR = HeaderPreprocessor()
@ -362,10 +487,13 @@ class HtmlBlockPreprocessor (Preprocessor):
return block.rstrip()[-len(left_tag)-2:-1].lower()
def _equal_tags(self, left_tag, right_tag):
if left_tag in ['?', '?php', 'div'] : # handle PHP, etc.
return True
if ("/" + left_tag) == right_tag:
return True
if (right_tag == "--" and left_tag == "--") :
return True
elif left_tag == right_tag[1:] \
and right_tag[0] != "<":
return True
@ -376,9 +504,10 @@ class HtmlBlockPreprocessor (Preprocessor):
return (tag in ['hr', 'hr/'])
def run (self, lines) :
def run (self, text) :
new_blocks = []
text = "\n".join(lines)
#text = "\n".join(lines)
text = text.split("\n\n")
items = []
@ -417,7 +546,7 @@ class HtmlBlockPreprocessor (Preprocessor):
new_blocks.append(
self.stash.store(block.strip()))
continue
elif not block[1] == "!":
else: #if not block[1] == "!":
# if is block level tag and is not complete
items.append(block.strip())
in_tag = True
@ -429,6 +558,7 @@ class HtmlBlockPreprocessor (Preprocessor):
items.append(block.strip())
right_tag = self._get_right_tag(left_tag, block)
if self._equal_tags(left_tag, right_tag):
# if find closing tag
in_tag = False
@ -436,7 +566,11 @@ class HtmlBlockPreprocessor (Preprocessor):
self.stash.store('\n\n'.join(items)))
items = []
return "\n\n".join(new_blocks).split("\n")
if items :
new_blocks.append(self.stash.store('\n\n'.join(items)))
new_blocks.append('\n')
return "\n\n".join(new_blocks) #.split("\n")
HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
@ -609,15 +743,15 @@ class LinkPattern (Pattern):
def handleMatch(self, m, doc) :
el = doc.createElement('a')
el.appendChild(doc.createTextNode(m.group(2)))
parts = m.group(9).split()
parts = m.group(9).split('"')
# We should now have [], [href], or [href, title]
if parts :
el.setAttribute('href', parts[0])
el.setAttribute('href', parts[0].strip())
else :
el.setAttribute('href', "")
if len(parts) > 1 :
# we also got a title
title = " ".join(parts[1:]).strip()
title = '"' + '"'.join(parts[1:]).strip()
title = dequote(title) #.replace('"', "&quot;")
el.setAttribute('title', title)
return el
@ -645,12 +779,14 @@ class ImagePattern (Pattern):
class ReferencePattern (Pattern):
def handleMatch(self, m, doc):
if m.group(9) :
id = m.group(9).lower()
else :
# if we got something like "[Google][]"
# we'll use "google" as the id
id = m.group(2).lower()
if not self.references.has_key(id) : # ignore undefined refs
return None
href, title = self.references[id]
@ -789,7 +925,6 @@ class BlockGuru :
remainder of the original list"""
items = []
item = -1
i = 0 # to keep track of where we are
@ -908,11 +1043,11 @@ class Markdown:
Markdown text """
def __init__(self, source=None,
extensions=None,
def __init__(self, source=None, # deprecated
extensions=[],
extension_configs=None,
encoding=None,
safe_mode = True):
encoding="utf-8",
safe_mode = False):
"""Creates a new Markdown instance.
@param source: The text in Markdown format.
@ -924,10 +1059,14 @@ class Markdown:
self.blockGuru = BlockGuru()
self.registeredExtensions = []
self.stripTopLevelTags = 1
self.docType = ""
self.preprocessors = [ HEADER_PREPROCESSOR,
self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR]
self.preprocessors = [
HEADER_PREPROCESSOR,
LINE_PREPROCESSOR,
HTML_BLOCK_PREPROCESSOR,
LINE_BREAKS_PREPROCESSOR,
# A footnote preprocessor will
# get inserted here
@ -979,6 +1118,7 @@ class Markdown:
for ext in extensions :
extension_module_name = "libprs500.ebooks.markdown.mdx_" + ext
try :
module = sys.modules[extension_module_name]
except :
@ -991,6 +1131,7 @@ class Markdown:
configs_for_ext = configs[ext]
else :
configs_for_ext = []
extension = module.makeExtension(configs_for_ext)
extension.extendMarkdown(self, globals())
@ -1032,7 +1173,7 @@ class Markdown:
self.doc.appendChild(self.top_element)
# Fixup the source text
text = self.source.strip()
text = self.source #.strip()
text = text.replace("\r\n", "\n").replace("\r", "\n")
text += "\n\n"
text = text.expandtabs(TAB_LENGTH)
@ -1085,9 +1226,7 @@ class Markdown:
@param inList: a level
@returns: None"""
if not lines :
return
while lines:
# Check if this section starts with a list, a blockquote or
# a code block
@ -1099,12 +1238,7 @@ class Markdown:
for regexp in ['ul', 'ol', 'quoted', 'tabbed'] :
m = RE.regExp[regexp].match(lines[0])
if m :
try:
processFn[regexp](parent_elem, lines, inList)
except RuntimeError:
print 'WARNING: Max recursion depth excedeeded, skipping section'
#print '\n'.join(lines)
#sys.exit()
return
# We are NOT looking at one of the high-level structures like
@ -1124,36 +1258,34 @@ class Markdown:
if inList :
start, theRest = self._linesUntil(lines, (lambda line:
start, lines = self._linesUntil(lines, (lambda line:
RE.regExp['ul'].match(line)
or RE.regExp['ol'].match(line)
or not line.strip()))
self._processSection(parent_elem, start,
inList - 1, looseList = looseList)
self._processSection(parent_elem, theRest,
self._processSection(parent_elem, lines,
inList - 1, looseList = looseList)
else : # Ok, so it's just a simple block
paragraph, theRest = self._linesUntil(lines, lambda line:
paragraph, lines = self._linesUntil(lines, lambda line:
not line.strip())
if len(paragraph) and paragraph[0].startswith('#') :
m = RE.regExp['header'].match(paragraph[0])
if m :
level = len(m.group(1))
h = self.doc.createElement("h%d" % level)
parent_elem.appendChild(h)
for item in self._handleInlineWrapper2(m.group(2).strip()) :
for item in self._handleInlineWrapper(m.group(2).strip()) :
h.appendChild(item)
else :
message(CRITICAL, "We've got a problem header!")
elif paragraph :
list = self._handleInlineWrapper2("\n".join(paragraph))
list = self._handleInlineWrapper("\n".join(paragraph))
if ( parent_elem.nodeName == 'li'
and not (looseList or parent_elem.childNodes)):
@ -1171,13 +1303,8 @@ class Markdown:
for item in list :
el.appendChild(item)
if theRest :
theRest = theRest[1:] # skip the first (blank) line
try:
self._processSection(parent_elem, theRest, inList)
except RuntimeError: #Added by Kovid
pass
if lines:
lines = lines[1:] # skip the first (blank) line
@ -1247,7 +1374,9 @@ class Markdown:
m = RE.regExp[expr].match(line)
if m :
if expr in ['ul', 'ol'] : # We are looking at a new item
if m.group(1) :
#if m.group(1) :
# Removed the check to allow for a blank line
# at the beginning of the list item
items.append([m.group(1)])
item += 1
elif expr == 'tabbed' : # This line needs to be detabbed
@ -1333,40 +1462,31 @@ class Markdown:
detabbed, theRest = self.blockGuru.detectTabbed(lines)
pre = self.doc.createElement('pre')
#code = self.doc.createElement('code')
code = self.doc.createElement('code')
parent_elem.appendChild(pre)
#pre.appendChild(code)
pre.appendChild(code)
text = "\n".join(detabbed).rstrip()+"\n"
#text = text.replace("&", "&amp;")
pre.appendChild(self.doc.createTextNode(text))
code.appendChild(self.doc.createTextNode(text))
self._processSection(parent_elem, theRest, inList)
def _handleInlineWrapper2 (self, line) :
def _handleInlineWrapper (self, line) :
parts = [line]
#if not(line):
# return [self.doc.createTextNode(' ')]
for pattern in self.inlinePatterns :
#print
#print self.inlinePatterns.index(pattern)
i = 0
#print parts
while i < len(parts) :
x = parts[i]
#print i
if isinstance(x, (str, unicode)) :
result = self._applyPattern(x, pattern)
#print result
#print result
#print parts, i
if result :
i -= 1
parts.remove(x)
@ -1383,27 +1503,6 @@ class Markdown:
return parts
def _handleInlineWrapper (self, line) :
# A wrapper around _handleInline to avoid recursion
parts = [line]
i = 0
while i < len(parts) :
x = parts[i]
if isinstance(x, (str, unicode)) :
parts.remove(x)
result = self._handleInline(x)
for y in result :
parts.insert(i,y)
else :
i += 1
return parts
def _handleInline(self, line):
"""Transform a Markdown line with inline elements to an XHTML
fragment.
@ -1424,6 +1523,7 @@ class Markdown:
return [self.doc.createTextNode(line)]
def _applyPattern(self, line, pattern) :
""" Given a pattern name, this function checks if the line
fits the pattern, creates the necessary elements, and returns
back a list consisting of NanoDom elements and/or strings.
@ -1438,6 +1538,8 @@ class Markdown:
# match the line to pattern's pre-compiled reg exp.
# if no match, move on.
m = pattern.getCompiledRegExp().match(line)
if not m :
return None
@ -1446,6 +1548,40 @@ class Markdown:
# if it doesn't, move on
node = pattern.handleMatch(m, self.doc)
# check if any of this nodes have children that need processing
if isinstance(node, Element):
if not node.nodeName in ["code", "pre"] :
for child in node.childNodes :
if isinstance(child, TextNode):
result = self._handleInlineWrapper(child.value)
if result:
if result == [child] :
continue
result.reverse()
#to make insertion easier
position = node.childNodes.index(child)
node.removeChild(child)
for item in result:
if isinstance(item, (str, unicode)):
if len(item) > 0:
node.insertChild(position,
self.doc.createTextNode(item))
else:
node.insertChild(position, item)
if node :
# Those are in the reverse order!
return ( m.groups()[-1], # the string to the left
@ -1455,7 +1591,7 @@ class Markdown:
else :
return None
def __str__(self, source = None):
def convert (self, source = None):
"""Return the document in XHTML format.
@returns: A serialized XHTML body."""
@ -1464,6 +1600,14 @@ class Markdown:
if source :
self.source = source
if not self.source :
return ""
self.source = removeBOM(self.source, self.encoding)
for pp in self.textPreprocessors:
self.source = pp.run(self.source)
doc = self._transform()
xml = doc.toxml()
@ -1474,8 +1618,8 @@ class Markdown:
for i in range(self.htmlStash.html_counter) :
html = self.htmlStash.rawHtmlBlocks[i]
if self.safeMode :
html = "[HTML_REMOVED]"
if self.safeMode and html != "<hr />" and html != "<br />":
html = HTML_REMOVED_TEXT
xml = xml.replace("<p>%s\n</p>" % (HTML_PLACEHOLDER % i),
html + "\n")
@ -1490,10 +1634,13 @@ class Markdown:
for pp in self.textPostprocessors :
xml = pp.run(xml)
return xml
return (self.docType + xml).strip()
toString = __str__
__str__ = convert # deprecated - will be changed in 1.7 to report
# information about the MD instance
toString = __str__ # toString() method is deprecated
def __unicode__(self):
@ -1502,7 +1649,7 @@ class Markdown:
return str(self)#.decode(self.encoding)
toUnicode = __unicode__
toUnicode = __unicode__ # deprecated - will be removed in 1.7
@ -1525,7 +1672,7 @@ def markdownFromFile(input = None,
if not encoding :
encoding = "utf-8"
input_file = codecs.open(input, mode="r", encoding="utf-8")
input_file = codecs.open(input, mode="r", encoding=encoding)
text = input_file.read()
input_file.close()
@ -1559,23 +1706,21 @@ def markdown(text,
pairs = [x.split("=") for x in ext[pos+1:-1].split(",")]
configs = [(x.strip(), y.strip()) for (x, y) in pairs]
extension_configs[name] = configs
#print configs
md = Markdown(text, extensions=extension_names,
md = Markdown(extensions=extension_names,
extension_configs=extension_configs,
safe_mode = safe_mode)
return md.toString()
return md.convert(text)
class Extension :
def __init__(self, configs={}) :
def __init__(self, configs = {}) :
self.config = configs
def getConfig(self, key) :
if self.config.has_key(key) :
#print self.config[key][0]
return self.config[key][0]
else :
return ""
@ -1653,10 +1798,8 @@ def parse_options() :
'extensions' : options.extensions,
'encoding' : options.encoding }
def main():
if __name__ == '__main__':
""" Run Markdown from the command line. """
for a in ['-x', 'toc', '-x', 'tables', '-x', 'footnotes']:
sys.argv.append(a)
options = parse_options()
@ -1667,8 +1810,7 @@ def main():
markdownFromFile(**options)
if __name__ == '__main__':
main()