diff --git a/src/calibre/ebooks/BeautifulSoup.py b/src/calibre/ebooks/BeautifulSoup.py index c19f4c0a11..ecdfec486b 100644 --- a/src/calibre/ebooks/BeautifulSoup.py +++ b/src/calibre/ebooks/BeautifulSoup.py @@ -972,6 +972,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): NESTABLE_TAGS = {} RESET_NESTING_TAGS = {} QUOTE_TAGS = {} + PRESERVE_WHITESPACE_TAGS = frozenset() MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), lambda x: x.group(1) + ' />'), @@ -1155,7 +1156,10 @@ class BeautifulStoneSoup(Tag, SGMLParser): def endData(self, containerClass=NavigableString): if self.currentData: currentData = ''.join(self.currentData) - if not currentData.translate(self.STRIP_ASCII_SPACES): + # Changed by Kovid to not clobber whitespace inside
<pre> tags and the like + if ( (not currentData.translate(self.STRIP_ASCII_SPACES)) and ( + not frozenset(tag.name for tag in self.tagStack).intersection( + self.PRESERVE_WHITESPACE_TAGS))): if '\n' in currentData: currentData = '\n' else: @@ -1443,6 +1447,8 @@ class BeautifulSoup(BeautifulStoneSoup): ['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) + PRESERVE_WHITESPACE_TAGS = frozenset(('pre', 'textarea')) + QUOTE_TAGS = {'script' : None, 'textarea' : None} #According to the HTML standard, each of these inline tags can