diff --git a/src/calibre/utils/html2text.py b/src/calibre/utils/html2text.py index 0eb84a3d38..3779c68918 100644 --- a/src/calibre/utils/html2text.py +++ b/src/calibre/utils/html2text.py @@ -1,8 +1,14 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- + """html2text: Turn HTML into equivalent Markdown-structured text.""" -__version__ = "2.39" -__author__ = "Aaron Swartz (me@aaronsw.com)" -__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." +# Last upstream version before changes +#__version__ = "2.39" +__license__ = 'GPL 3' +__copyright__ = ''' +Copyright (c) 2011, John Schember +(C) 2004-2008 Aaron Swartz +''' __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] # TODO: @@ -11,7 +17,6 @@ __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] if not hasattr(__builtins__, 'True'): True, False = 1, 0 import re, sys, urllib, htmlentitydefs, codecs import sgmllib -import urlparse sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') try: from textwrap import wrap @@ -145,9 +150,7 @@ class _html2text(sgmllib.SGMLParser): self.outcount = 0 self.start = 1 self.space = 0 - self.a = [] self.astack = [] - self.acount = 0 self.list = [] self.blockquote = 0 self.pre = 0 @@ -181,29 +184,6 @@ class _html2text(sgmllib.SGMLParser): def unknown_endtag(self, tag): self.handle_tag(tag, None, 0) - def previousIndex(self, attrs): - """ returns the index of certain set of attributes (of a link) in the - self.a list - - If the set of attributes is not found, returns None - """ - if not attrs.has_key('href'): return None - - i = -1 - for a in self.a: - i += 1 - match = 0 - - if a.has_key('href') and a['href'] == attrs['href']: - if a.has_key('title') or attrs.has_key('title'): - if (a.has_key('title') and attrs.has_key('title') and - a['title'] == attrs['title']): - match = True - else: - match = True - - if match: return i - def handle_tag(self, tag, attrs, start): attrs = fixattrs(attrs) @@ -268,34 +248,23 @@ class _html2text(sgmllib.SGMLParser): if self.astack: a = self.astack.pop() if a: - i = self.previousIndex(a) - if i is not None: - a = self.a[i] - else: - self.acount += 1 - a['count'] = self.acount - a['outcount'] = self.outcount - self.a.append(a) - self.o("][" + `a['count']` + "]") + title = '' + if a.has_key('title'): + title = ' "%s"' % a['title'] + self.o('](%s%s)' % (a['href'], title)) if tag == "img" and start: attrsD = {} for (x, y) in attrs: attrsD[x] = y attrs = attrsD if attrs.has_key('src'): - attrs['href'] = attrs['src'] alt = attrs.get('alt', '') - i = self.previousIndex(attrs) - if i is not None: - attrs = self.a[i] - else: - self.acount += 1 - attrs['count'] = self.acount - attrs['outcount'] = self.outcount - self.a.append(attrs) self.o("![") self.o(alt) - self.o("]["+`attrs['count']`+"]") + title = '' + if attrs.has_key('title'): + title = ' "%s"' % attrs['title'] + self.o('](%s%s)' % (attrs['src'], title)) if tag == 'dl' and start: self.p() if tag == 'dt' and not start: self.pbr() @@ -373,7 +342,6 @@ class _html2text(sgmllib.SGMLParser): self.out("\n") self.space = 0 - if self.p_p: self.out(('\n'+bq)*self.p_p) self.space = 0 @@ -382,22 +350,6 @@ class _html2text(sgmllib.SGMLParser): if not self.lastWasNL: self.out(' ') self.space = 0 - if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"): - if force == "end": self.out("\n") - - newa = [] - for link in self.a: - if self.outcount > link['outcount']: - self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href'])) - if link.has_key('title'): self.out(" ("+link['title']+")") - self.out("\n") - else: - newa.append(link) - - if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done. - - self.a = newa - if self.abbr_list and force == "end": for abbr, definition in self.abbr_list.items(): self.out(" *[" + abbr + "]: " + definition + "\n")