Markdown output: Use inline images and links instead of references. This allows multiple files to be appended and still work.

This commit is contained in:
John Schember 2011-03-30 19:18:53 -04:00
parent 7d7b7bd2be
commit 5e2e6a9d30

View File

@ -1,8 +1,14 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
"""html2text: Turn HTML into equivalent Markdown-structured text.""" """html2text: Turn HTML into equivalent Markdown-structured text."""
__version__ = "2.39" # Last upstream version before changes
__author__ = "Aaron Swartz (me@aaronsw.com)" #__version__ = "2.39"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." __license__ = 'GPL 3'
__copyright__ = '''
Copyright (c) 2011, John Schember <john@nachtimwald.com>
(C) 2004-2008 Aaron Swartz <me@aaronsw.com>
'''
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
# TODO: # TODO:
@ -11,7 +17,6 @@ __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
if not hasattr(__builtins__, 'True'): True, False = 1, 0 if not hasattr(__builtins__, 'True'): True, False = 1, 0
import re, sys, urllib, htmlentitydefs, codecs import re, sys, urllib, htmlentitydefs, codecs
import sgmllib import sgmllib
import urlparse
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
try: from textwrap import wrap try: from textwrap import wrap
@ -145,9 +150,7 @@ class _html2text(sgmllib.SGMLParser):
self.outcount = 0 self.outcount = 0
self.start = 1 self.start = 1
self.space = 0 self.space = 0
self.a = []
self.astack = [] self.astack = []
self.acount = 0
self.list = [] self.list = []
self.blockquote = 0 self.blockquote = 0
self.pre = 0 self.pre = 0
@ -181,29 +184,6 @@ class _html2text(sgmllib.SGMLParser):
def unknown_endtag(self, tag): def unknown_endtag(self, tag):
self.handle_tag(tag, None, 0) self.handle_tag(tag, None, 0)
def previousIndex(self, attrs):
    """Return the index in self.a of a link equivalent to attrs.

    Two links are considered equivalent when their 'href' values are equal
    and either neither of them has a 'title' or both have the same 'title'.

    attrs -- dict of link attributes to look for.

    Returns the position of the first equivalent entry in self.a, or None
    when attrs has no 'href' or no entry matches.
    """
    # A link without a target can never match a stored link.
    if 'href' not in attrs:
        return None

    for index, known in enumerate(self.a):
        if 'href' not in known or known['href'] != attrs['href']:
            continue
        if 'title' in known or 'title' in attrs:
            # A title present on only one side, or two different titles,
            # means these are distinct links even though the href agrees.
            if ('title' in known and 'title' in attrs
                    and known['title'] == attrs['title']):
                return index
        else:
            # Same href and no title on either side.
            return index

    return None
def handle_tag(self, tag, attrs, start): def handle_tag(self, tag, attrs, start):
attrs = fixattrs(attrs) attrs = fixattrs(attrs)
@ -268,34 +248,23 @@ class _html2text(sgmllib.SGMLParser):
if self.astack: if self.astack:
a = self.astack.pop() a = self.astack.pop()
if a: if a:
i = self.previousIndex(a) title = ''
if i is not None: if a.has_key('title'):
a = self.a[i] title = ' "%s"' % a['title']
else: self.o('](%s%s)' % (a['href'], title))
self.acount += 1
a['count'] = self.acount
a['outcount'] = self.outcount
self.a.append(a)
self.o("][" + `a['count']` + "]")
if tag == "img" and start: if tag == "img" and start:
attrsD = {} attrsD = {}
for (x, y) in attrs: attrsD[x] = y for (x, y) in attrs: attrsD[x] = y
attrs = attrsD attrs = attrsD
if attrs.has_key('src'): if attrs.has_key('src'):
attrs['href'] = attrs['src']
alt = attrs.get('alt', '') alt = attrs.get('alt', '')
i = self.previousIndex(attrs)
if i is not None:
attrs = self.a[i]
else:
self.acount += 1
attrs['count'] = self.acount
attrs['outcount'] = self.outcount
self.a.append(attrs)
self.o("![") self.o("![")
self.o(alt) self.o(alt)
self.o("]["+`attrs['count']`+"]") title = ''
if attrs.has_key('title'):
title = ' "%s"' % attrs['title']
self.o('](%s%s)' % (attrs['src'], title))
if tag == 'dl' and start: self.p() if tag == 'dl' and start: self.p()
if tag == 'dt' and not start: self.pbr() if tag == 'dt' and not start: self.pbr()
@ -373,7 +342,6 @@ class _html2text(sgmllib.SGMLParser):
self.out("\n") self.out("\n")
self.space = 0 self.space = 0
if self.p_p: if self.p_p:
self.out(('\n'+bq)*self.p_p) self.out(('\n'+bq)*self.p_p)
self.space = 0 self.space = 0
@ -382,22 +350,6 @@ class _html2text(sgmllib.SGMLParser):
if not self.lastWasNL: self.out(' ') if not self.lastWasNL: self.out(' ')
self.space = 0 self.space = 0
if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
if force == "end": self.out("\n")
newa = []
for link in self.a:
if self.outcount > link['outcount']:
self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
if link.has_key('title'): self.out(" ("+link['title']+")")
self.out("\n")
else:
newa.append(link)
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
self.a = newa
if self.abbr_list and force == "end": if self.abbr_list and force == "end":
for abbr, definition in self.abbr_list.items(): for abbr, definition in self.abbr_list.items():
self.out(" *[" + abbr + "]: " + definition + "\n") self.out(" *[" + abbr + "]: " + definition + "\n")