mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Markdown output: Use inline images and links instead of references. This allows multiple files to be appended and still work.
This commit is contained in:
parent
7d7b7bd2be
commit
5e2e6a9d30
@ -1,8 +1,14 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
"""html2text: Turn HTML into equivalent Markdown-structured text."""
|
"""html2text: Turn HTML into equivalent Markdown-structured text."""
|
||||||
__version__ = "2.39"
|
# Last upstream version before changes
|
||||||
__author__ = "Aaron Swartz (me@aaronsw.com)"
|
#__version__ = "2.39"
|
||||||
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '''
|
||||||
|
Copyright (c) 2011, John Schember <john@nachtimwald.com>
|
||||||
|
(C) 2004-2008 Aaron Swartz <me@aaronsw.com>
|
||||||
|
'''
|
||||||
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
|
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
|
||||||
|
|
||||||
# TODO:
|
# TODO:
|
||||||
@ -11,7 +17,6 @@ __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
|
|||||||
if not hasattr(__builtins__, 'True'): True, False = 1, 0
|
if not hasattr(__builtins__, 'True'): True, False = 1, 0
|
||||||
import re, sys, urllib, htmlentitydefs, codecs
|
import re, sys, urllib, htmlentitydefs, codecs
|
||||||
import sgmllib
|
import sgmllib
|
||||||
import urlparse
|
|
||||||
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
|
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
|
||||||
|
|
||||||
try: from textwrap import wrap
|
try: from textwrap import wrap
|
||||||
@ -145,9 +150,7 @@ class _html2text(sgmllib.SGMLParser):
|
|||||||
self.outcount = 0
|
self.outcount = 0
|
||||||
self.start = 1
|
self.start = 1
|
||||||
self.space = 0
|
self.space = 0
|
||||||
self.a = []
|
|
||||||
self.astack = []
|
self.astack = []
|
||||||
self.acount = 0
|
|
||||||
self.list = []
|
self.list = []
|
||||||
self.blockquote = 0
|
self.blockquote = 0
|
||||||
self.pre = 0
|
self.pre = 0
|
||||||
@ -181,29 +184,6 @@ class _html2text(sgmllib.SGMLParser):
|
|||||||
def unknown_endtag(self, tag):
|
def unknown_endtag(self, tag):
|
||||||
self.handle_tag(tag, None, 0)
|
self.handle_tag(tag, None, 0)
|
||||||
|
|
||||||
def previousIndex(self, attrs):
|
|
||||||
""" returns the index of certain set of attributes (of a link) in the
|
|
||||||
self.a list
|
|
||||||
|
|
||||||
If the set of attributes is not found, returns None
|
|
||||||
"""
|
|
||||||
if not attrs.has_key('href'): return None
|
|
||||||
|
|
||||||
i = -1
|
|
||||||
for a in self.a:
|
|
||||||
i += 1
|
|
||||||
match = 0
|
|
||||||
|
|
||||||
if a.has_key('href') and a['href'] == attrs['href']:
|
|
||||||
if a.has_key('title') or attrs.has_key('title'):
|
|
||||||
if (a.has_key('title') and attrs.has_key('title') and
|
|
||||||
a['title'] == attrs['title']):
|
|
||||||
match = True
|
|
||||||
else:
|
|
||||||
match = True
|
|
||||||
|
|
||||||
if match: return i
|
|
||||||
|
|
||||||
def handle_tag(self, tag, attrs, start):
|
def handle_tag(self, tag, attrs, start):
|
||||||
attrs = fixattrs(attrs)
|
attrs = fixattrs(attrs)
|
||||||
|
|
||||||
@ -268,34 +248,23 @@ class _html2text(sgmllib.SGMLParser):
|
|||||||
if self.astack:
|
if self.astack:
|
||||||
a = self.astack.pop()
|
a = self.astack.pop()
|
||||||
if a:
|
if a:
|
||||||
i = self.previousIndex(a)
|
title = ''
|
||||||
if i is not None:
|
if a.has_key('title'):
|
||||||
a = self.a[i]
|
title = ' "%s"' % a['title']
|
||||||
else:
|
self.o('](%s%s)' % (a['href'], title))
|
||||||
self.acount += 1
|
|
||||||
a['count'] = self.acount
|
|
||||||
a['outcount'] = self.outcount
|
|
||||||
self.a.append(a)
|
|
||||||
self.o("][" + `a['count']` + "]")
|
|
||||||
|
|
||||||
if tag == "img" and start:
|
if tag == "img" and start:
|
||||||
attrsD = {}
|
attrsD = {}
|
||||||
for (x, y) in attrs: attrsD[x] = y
|
for (x, y) in attrs: attrsD[x] = y
|
||||||
attrs = attrsD
|
attrs = attrsD
|
||||||
if attrs.has_key('src'):
|
if attrs.has_key('src'):
|
||||||
attrs['href'] = attrs['src']
|
|
||||||
alt = attrs.get('alt', '')
|
alt = attrs.get('alt', '')
|
||||||
i = self.previousIndex(attrs)
|
|
||||||
if i is not None:
|
|
||||||
attrs = self.a[i]
|
|
||||||
else:
|
|
||||||
self.acount += 1
|
|
||||||
attrs['count'] = self.acount
|
|
||||||
attrs['outcount'] = self.outcount
|
|
||||||
self.a.append(attrs)
|
|
||||||
self.o("![")
|
self.o("![")
|
||||||
self.o(alt)
|
self.o(alt)
|
||||||
self.o("]["+`attrs['count']`+"]")
|
title = ''
|
||||||
|
if attrs.has_key('title'):
|
||||||
|
title = ' "%s"' % attrs['title']
|
||||||
|
self.o('](%s%s)' % (attrs['src'], title))
|
||||||
|
|
||||||
if tag == 'dl' and start: self.p()
|
if tag == 'dl' and start: self.p()
|
||||||
if tag == 'dt' and not start: self.pbr()
|
if tag == 'dt' and not start: self.pbr()
|
||||||
@ -373,7 +342,6 @@ class _html2text(sgmllib.SGMLParser):
|
|||||||
self.out("\n")
|
self.out("\n")
|
||||||
self.space = 0
|
self.space = 0
|
||||||
|
|
||||||
|
|
||||||
if self.p_p:
|
if self.p_p:
|
||||||
self.out(('\n'+bq)*self.p_p)
|
self.out(('\n'+bq)*self.p_p)
|
||||||
self.space = 0
|
self.space = 0
|
||||||
@ -382,22 +350,6 @@ class _html2text(sgmllib.SGMLParser):
|
|||||||
if not self.lastWasNL: self.out(' ')
|
if not self.lastWasNL: self.out(' ')
|
||||||
self.space = 0
|
self.space = 0
|
||||||
|
|
||||||
if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
|
|
||||||
if force == "end": self.out("\n")
|
|
||||||
|
|
||||||
newa = []
|
|
||||||
for link in self.a:
|
|
||||||
if self.outcount > link['outcount']:
|
|
||||||
self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
|
|
||||||
if link.has_key('title'): self.out(" ("+link['title']+")")
|
|
||||||
self.out("\n")
|
|
||||||
else:
|
|
||||||
newa.append(link)
|
|
||||||
|
|
||||||
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
|
|
||||||
|
|
||||||
self.a = newa
|
|
||||||
|
|
||||||
if self.abbr_list and force == "end":
|
if self.abbr_list and force == "end":
|
||||||
for abbr, definition in self.abbr_list.items():
|
for abbr, definition in self.abbr_list.items():
|
||||||
self.out(" *[" + abbr + "]: " + definition + "\n")
|
self.out(" *[" + abbr + "]: " + definition + "\n")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user