mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
BeautifulSoup: Fix bug in extract() method that prevented extraction of identical individual tags
This commit is contained in:
parent
a031f04e91
commit
9fa6e8abe2
@ -129,11 +129,16 @@ class PageElement:
|
|||||||
|
|
||||||
def extract(self):
|
def extract(self):
|
||||||
"""Destructively rips this element out of the tree."""
|
"""Destructively rips this element out of the tree."""
|
||||||
|
# Changed by KG as list.remove uses __eq__ which is True for two Tags
|
||||||
|
# with the same name and attributes.
|
||||||
if self.parent:
|
if self.parent:
|
||||||
try:
|
idx = None
|
||||||
self.parent.contents.remove(self)
|
for i, x in enumerate(self.parent.contents):
|
||||||
except ValueError:
|
if x is self:
|
||||||
pass
|
idx = i
|
||||||
|
break
|
||||||
|
if idx is not None:
|
||||||
|
self.parent.contents.pop(idx)
|
||||||
|
|
||||||
#Find the two elements that would be next to each other if
|
#Find the two elements that would be next to each other if
|
||||||
#this element (and any children) hadn't been parsed. Connect
|
#this element (and any children) hadn't been parsed. Connect
|
||||||
@ -1075,7 +1080,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
|
|||||||
self.originalEncoding = None
|
self.originalEncoding = None
|
||||||
else:
|
else:
|
||||||
# Changed detection by Kovid
|
# Changed detection by Kovid
|
||||||
markup, self.originalEncoding = chardet.xml_to_unicode(markup)
|
markup, self.originalEncoding = chardet.xml_to_unicode(markup)
|
||||||
if markup:
|
if markup:
|
||||||
if self.markupMassage:
|
if self.markupMassage:
|
||||||
if not isList(self.markupMassage):
|
if not isList(self.markupMassage):
|
||||||
@ -1090,7 +1095,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
|
|||||||
del(self.markupMassage)
|
del(self.markupMassage)
|
||||||
self.markup = markup
|
self.markup = markup
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
SGMLParser.feed(self, markup)
|
SGMLParser.feed(self, markup)
|
||||||
# Close out any unfinished strings and close all the open tags.
|
# Close out any unfinished strings and close all the open tags.
|
||||||
self.endData()
|
self.endData()
|
||||||
@ -1309,7 +1314,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
|
|||||||
try:
|
try:
|
||||||
data = unichr(int(ref))
|
data = unichr(int(ref))
|
||||||
except ValueError: # Bad numerical entity. Added by Kovid
|
except ValueError: # Bad numerical entity. Added by Kovid
|
||||||
data = u''
|
data = u''
|
||||||
else:
|
else:
|
||||||
data = '&#%s;' % ref
|
data = '&#%s;' % ref
|
||||||
self.handle_data(data)
|
self.handle_data(data)
|
||||||
@ -1663,7 +1668,7 @@ class UnicodeDammit:
|
|||||||
self._detectEncoding(markup)
|
self._detectEncoding(markup)
|
||||||
self.smartQuotesTo = smartQuotesTo
|
self.smartQuotesTo = smartQuotesTo
|
||||||
self.triedEncodings = []
|
self.triedEncodings = []
|
||||||
|
|
||||||
if markup == '' or isinstance(markup, unicode):
|
if markup == '' or isinstance(markup, unicode):
|
||||||
self.originalEncoding = None
|
self.originalEncoding = None
|
||||||
self.unicode = unicode(markup)
|
self.unicode = unicode(markup)
|
||||||
@ -1677,7 +1682,7 @@ class UnicodeDammit:
|
|||||||
for proposedEncoding in (documentEncoding, sniffedEncoding):
|
for proposedEncoding in (documentEncoding, sniffedEncoding):
|
||||||
u = self._convertFrom(proposedEncoding)
|
u = self._convertFrom(proposedEncoding)
|
||||||
if u: break
|
if u: break
|
||||||
|
|
||||||
# If no luck and we have auto-detection library, try that:
|
# If no luck and we have auto-detection library, try that:
|
||||||
if not u and chardet and not isinstance(self.markup, unicode):
|
if not u and chardet and not isinstance(self.markup, unicode):
|
||||||
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
|
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
|
||||||
@ -1751,9 +1756,9 @@ class UnicodeDammit:
|
|||||||
elif data[:4] == '\xff\xfe\x00\x00':
|
elif data[:4] == '\xff\xfe\x00\x00':
|
||||||
encoding = 'utf-32le'
|
encoding = 'utf-32le'
|
||||||
data = data[4:]
|
data = data[4:]
|
||||||
|
|
||||||
newdata = unicode(data, encoding)
|
newdata = unicode(data, encoding)
|
||||||
|
|
||||||
return newdata
|
return newdata
|
||||||
|
|
||||||
def _detectEncoding(self, xml_data):
|
def _detectEncoding(self, xml_data):
|
||||||
@ -1763,9 +1768,9 @@ class UnicodeDammit:
|
|||||||
if xml_data[:4] == '\x4c\x6f\xa7\x94':
|
if xml_data[:4] == '\x4c\x6f\xa7\x94':
|
||||||
# EBCDIC
|
# EBCDIC
|
||||||
xml_data = self._ebcdic_to_ascii(xml_data)
|
xml_data = self._ebcdic_to_ascii(xml_data)
|
||||||
|
|
||||||
# By Kovid commented out all the recoding to UTF-8 of UTF-16 and UTF-32
|
# By Kovid commented out all the recoding to UTF-8 of UTF-16 and UTF-32
|
||||||
# as this doesn't make sense and doesn't work for the test case
|
# as this doesn't make sense and doesn't work for the test case
|
||||||
# BeautifulSoup.UnicodeDammit(u'abcd'.encode('utf-16')).unicode
|
# BeautifulSoup.UnicodeDammit(u'abcd'.encode('utf-16')).unicode
|
||||||
elif xml_data[:4] == '\x00\x3c\x00\x3f':
|
elif xml_data[:4] == '\x00\x3c\x00\x3f':
|
||||||
# UTF-16BE
|
# UTF-16BE
|
||||||
@ -1817,14 +1822,14 @@ class UnicodeDammit:
|
|||||||
xml_encoding_match = None
|
xml_encoding_match = None
|
||||||
if xml_encoding_match:
|
if xml_encoding_match:
|
||||||
xml_encoding = xml_encoding_match.groups()[0].lower()
|
xml_encoding = xml_encoding_match.groups()[0].lower()
|
||||||
|
|
||||||
if sniffed_xml_encoding and \
|
if sniffed_xml_encoding and \
|
||||||
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
|
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
|
||||||
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
|
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
|
||||||
'utf-16', 'utf-32', 'utf_16', 'utf_32',
|
'utf-16', 'utf-32', 'utf_16', 'utf_32',
|
||||||
'utf16', 'u16')):
|
'utf16', 'u16')):
|
||||||
xml_encoding = sniffed_xml_encoding
|
xml_encoding = sniffed_xml_encoding
|
||||||
|
|
||||||
return xml_data, xml_encoding, sniffed_xml_encoding
|
return xml_data, xml_encoding, sniffed_xml_encoding
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user