mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix an error when converting invalid html with empty tags to text
This commit is contained in:
parent
0626088861
commit
8d70476517
@ -50,6 +50,7 @@ def name2cp(k):
|
|||||||
return int(k[2:-1]) # not in latin-1
|
return int(k[2:-1]) # not in latin-1
|
||||||
return ord(codecs.latin_1_decode(k)[0])
|
return ord(codecs.latin_1_decode(k)[0])
|
||||||
|
|
||||||
|
|
||||||
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
|
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
|
||||||
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
|
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
|
||||||
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
|
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
|
||||||
@ -96,6 +97,7 @@ def replaceEntities(s):
|
|||||||
else:
|
else:
|
||||||
return entityref(s)
|
return entityref(s)
|
||||||
|
|
||||||
|
|
||||||
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
|
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
|
||||||
|
|
||||||
|
|
||||||
@ -150,7 +152,7 @@ def optwrap(text):
|
|||||||
|
|
||||||
|
|
||||||
def hn(tag):
|
def hn(tag):
|
||||||
if tag[0] == 'h' and len(tag) == 2:
|
if tag and tag[0] == 'h' and len(tag) == 2:
|
||||||
try:
|
try:
|
||||||
n = int(tag[1])
|
n = int(tag[1])
|
||||||
if n in range(1, 10):
|
if n in range(1, 10):
|
||||||
@ -364,7 +366,7 @@ class _html2text(sgmllib.SGMLParser):
|
|||||||
|
|
||||||
if not self.quiet:
|
if not self.quiet:
|
||||||
if puredata and not self.pre:
|
if puredata and not self.pre:
|
||||||
data = re.sub('\s+', ' ', data)
|
data = re.sub(r'\s+', ' ', data)
|
||||||
if data and data[0] == ' ':
|
if data and data[0] == ' ':
|
||||||
self.space = 1
|
self.space = 1
|
||||||
data = data[1:]
|
data = data[1:]
|
||||||
@ -435,6 +437,7 @@ def html2text_file(html, out=wrapwrite, baseurl=''):
|
|||||||
def html2text(html, baseurl=''):
|
def html2text(html, baseurl=''):
|
||||||
return optwrap(html2text_file(html, None, baseurl))
|
return optwrap(html2text_file(html, None, baseurl))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
baseurl = ''
|
baseurl = ''
|
||||||
if sys.argv[1:]:
|
if sys.argv[1:]:
|
||||||
@ -461,4 +464,3 @@ if __name__ == "__main__":
|
|||||||
else:
|
else:
|
||||||
data = sys.stdin.read().decode('utf8')
|
data = sys.stdin.read().decode('utf8')
|
||||||
wrapwrite(html2text(data, baseurl))
|
wrapwrite(html2text(data, baseurl))
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user