From 66162850ca067add8a5ec1999914c65e4f7fc072 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 27 Apr 2010 16:58:02 -0600
Subject: [PATCH] Fix #5237 (Metadata "Comments" don't handle HTML markup
 correctly)

---
 src/calibre/ebooks/html/input.py            |   2 -
 src/calibre/ebooks/oeb/transforms/jacket.py |  20 +++-
 src/calibre/gui2/convert/metadata.py        |   2 +-
 src/calibre/gui2/dialogs/book_info.py       |   3 +
 src/calibre/gui2/dialogs/metadata_single.py |   6 +-
 src/calibre/gui2/status.py                  |   3 +
 src/calibre/library/comments.py             | 114 ++++++++++++++++++++
 7 files changed, 139 insertions(+), 11 deletions(-)
 create mode 100644 src/calibre/library/comments.py
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 9e61d26aa6..d931eb0e98 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -320,7 +320,6 @@ class HTMLInput(InputFormatPlugin):
         if not metadata.title:
             oeb.logger.warn('Title not specified')
             metadata.add('title', self.oeb.translate(__('Unknown')))
-
         bookid = str(uuid.uuid4())
         metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
         for ident in metadata.identifier:
@@ -328,7 +327,6 @@ class HTMLInput(InputFormatPlugin):
                 self.oeb.uid = metadata.identifier[0]
                 break
 
-
         filelist = get_filelist(htmlpath, basedir, opts, log)
         filelist = [f for f in filelist if not f.is_binary]
         htmlfile_map = {}
diff --git a/src/calibre/ebooks/oeb/transforms/jacket.py b/src/calibre/ebooks/oeb/transforms/jacket.py
index 40d7ce33f4..597c6f59cd 100644
--- a/src/calibre/ebooks/oeb/transforms/jacket.py
+++ b/src/calibre/ebooks/oeb/transforms/jacket.py
@@ -14,7 +14,7 @@ from lxml import etree
 
 from calibre.ebooks.oeb.base import XPath, XPNSMAP
 from calibre import guess_type
-
+from calibre.library.comments import comments_to_html
 class Jacket(object):
     '''
     Book jacket manipulation. Remove first image and insert comments at start of
@@ -25,6 +25,7 @@ class Jacket(object):
     <html xmlns="%(xmlns)s">
         <head>
             <title>%(title)s</title>
+            <meta name="calibre-content" content="jacket"/>
         </head>
         <body>
             <div class="calibre_rescale_100">
@@ -83,7 +84,9 @@ class Jacket(object):
                 comments = ''
         if not comments.strip():
             comments = ''
-        comments = comments.replace('\r\n', '\n').replace('\n\n', '<br/><br/>')
+        orig_comments = comments
+        if comments:
+            comments = comments_to_html(comments)
         series = '<b>Series: </b>' + escape(mi.series if mi.series else '')
         if mi.series and mi.series_index is not None:
             series += escape(' [%s]'%mi.format_series_index())
@@ -103,12 +106,19 @@ class Jacket(object):
             title = mi.title if mi.title else unicode(self.oeb.metadata.title[0])
         except:
             title = _('Unknown')
-        html = self.JACKET_TEMPLATE%dict(xmlns=XPNSMAP['h'],
-                title=escape(title), comments=escape(comments),
+
+        def generate_html(comments):
+            return self.JACKET_TEMPLATE%dict(xmlns=XPNSMAP['h'],
+                title=escape(title), comments=comments,
                 jacket=escape(_('Book Jacket')), series=series,
                 tags=tags, rating=self.get_rating(mi.rating))
         id, href = self.oeb.manifest.generate('jacket', 'jacket.xhtml')
-        root = etree.fromstring(html)
+        from calibre.ebooks.oeb.base import RECOVER_PARSER
+        try:
+            root = etree.fromstring(generate_html(comments), parser=RECOVER_PARSER)
+        except:
+            root = etree.fromstring(generate_html(escape(orig_comments)),
+                    parser=RECOVER_PARSER)
         item = self.oeb.manifest.add(id, href, guess_type(href)[0], data=root)
         self.oeb.spine.insert(0, item, True)
 
diff --git a/src/calibre/gui2/convert/metadata.py b/src/calibre/gui2/convert/metadata.py
index 58960a9927..2026f1cee5 100644
--- a/src/calibre/gui2/convert/metadata.py
+++ b/src/calibre/gui2/convert/metadata.py
@@ -71,7 +71,7 @@ class MetadataWidget(Widget, Ui_Form):
         self.author_sort.setText(mi.author_sort if mi.author_sort else '')
         self.tags.setText(', '.join(mi.tags if mi.tags else []))
         self.tags.update_tags_cache(self.db.all_tags())
-        self.comment.setText(mi.comments if mi.comments else '')
+        self.comment.setPlainText(mi.comments if mi.comments else '')
         if mi.series:
             self.series.setCurrentIndex(self.series.findText(mi.series))
         if mi.series_index is not None:
diff --git a/src/calibre/gui2/dialogs/book_info.py b/src/calibre/gui2/dialogs/book_info.py
index efc65eb6f7..05841d9178 100644
--- a/src/calibre/gui2/dialogs/book_info.py
+++ b/src/calibre/gui2/dialogs/book_info.py
@@ -11,6 +11,7 @@ from PyQt4.QtGui import QDialog, QPixmap, QGraphicsScene, QIcon, QDesktopService
 from calibre.gui2.dialogs.book_info_ui import Ui_BookInfo
 from calibre.gui2 import dynamic
 from calibre import fit_image
+from calibre.library.comments import comments_to_html
 
 class BookInfo(QDialog, Ui_BookInfo):
 
@@ -96,6 +97,8 @@ class BookInfo(QDialog, Ui_BookInfo):
         self.setWindowTitle(info[_('Title')])
         self.title.setText('<b>'+info.pop(_('Title')))
         comments = info.pop(_('Comments'), '')
+        if comments:
+            comments = comments_to_html(comments)
         if re.search(r'<[a-zA-Z]+>', comments) is None:
             lines = comments.splitlines()
             lines = [x if x.strip() else '<br><br>' for x in lines]
diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py
index 67f7ac1ab8..f4d5d0034c 100644
--- a/src/calibre/gui2/dialogs/metadata_single.py
+++ b/src/calibre/gui2/dialogs/metadata_single.py
@@ -231,7 +231,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
             if mi.series_index is not None:
                 self.series_index.setValue(float(mi.series_index))
         if mi.comments and mi.comments.strip():
-            self.comments.setText(mi.comments)
+            self.comments.setPlainText(mi.comments)
 
 
     def set_cover(self):
@@ -590,7 +590,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
                             prefix = unicode(self.comments.toPlainText())
                             if prefix:
                                 prefix += '\n'
-                            self.comments.setText(prefix + summ)
+                            self.comments.setPlainText(prefix + summ)
                         if book.rating is not None:
                             self.rating.setValue(int(book.rating))
                         if book.tags:
@@ -654,7 +654,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
             self.db.set_series(self.id,
                     unicode(self.series.currentText()).strip(), notify=False)
             self.db.set_series_index(self.id, self.series_index.value(), notify=False)
-            self.db.set_comment(self.id, qstring_to_unicode(self.comments.toPlainText()), notify=False)
+            self.db.set_comment(self.id, unicode(self.comments.toPlainText()), notify=False)
             d = self.pubdate.date()
             d = qt_to_dt(d)
             self.db.set_pubdate(self.id, d, notify=False)
diff --git a/src/calibre/gui2/status.py b/src/calibre/gui2/status.py
index 28a1bbea6b..bdba768c5f 100644
--- a/src/calibre/gui2/status.py
+++ b/src/calibre/gui2/status.py
@@ -11,6 +11,7 @@ from calibre.gui2.widgets import IMAGE_EXTENSIONS
 from calibre.gui2.progress_indicator import ProgressIndicator
 from calibre.gui2.notify import get_notifier
 from calibre.ebooks import BOOK_EXTENSIONS
+from calibre.library.comments import comments_to_html
 
 class BookInfoDisplay(QWidget):
 
@@ -133,6 +134,8 @@ class BookInfoDisplay(QWidget):
                 key = key.decode(preferred_encoding, 'replace')
             if isinstance(txt, str):
                 txt = txt.decode(preferred_encoding, 'replace')
+            if key == _('Comments'):
+                txt = comments_to_html(txt)
             rows += u'<tr><td><b>%s:</b></td><td>%s</td></tr>'%(key, txt)
         self.book_data.setText(u'<table>'+rows+u'</table>')
 
diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py
new file mode 100644
index 0000000000..1898e78cbf
--- /dev/null
+++ b/src/calibre/library/comments.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre.constants import preferred_encoding
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
+from calibre import prepare_string_for_xml
+
+def comments_to_html(comments):
+    r'''
+    Convert random comment text to a normalized, xml-legal block of <p>s.
+    Every <p> in the result is given class="description".
+
+    'plain text' returns as
+    <p>plain text</p>
+
+    'plain text with <i>minimal</i> <b>markup</b>' returns as
+    <p>plain text with <i>minimal</i> <b>markup</b></p>
+
+    '<p>pre-formatted text</p>' keeps its own block structure
+
+    'A line of text\n\nFollowed by a line of text' returns as
+    <p>A line of text</p>
+    <p>Followed by a line of text</p>
+
+    'A line of text.\nA second line of text.\rA third line of text' returns as
+    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>
+
+    '...end of a paragraph.Somehow the break was lost...' returns as
+    <p>...end of a paragraph.</p>
+    <p>Somehow the break was lost...</p>
+
+    Deprecated HTML returns as HTML via BeautifulSoup()
+
+    :param comments: unicode, or a bytestring decoded with preferred_encoding
+    :return: unicode markup
+    '''
+    if not isinstance(comments, unicode):
+        comments = comments.decode(preferred_encoding, 'replace')
+
+    # A Windows line ending is one line break. Otherwise \r\n would become two
+    # <br />s and \r\n\r\n would never be seen as a paragraph break.
+    comments = comments.replace(u'\r\n', u'\n')
+
+    # Hackish - ignoring sentences ending or beginning in numbers to avoid
+    # confusion with decimal points.
+    # Explode lost CRs to \n\n
+    comments = re.sub(r'([a-z])([.?!])([A-Z])', r'\1\2\n\n\3', comments)
+
+    # Convert \n\n to <p>s
+    if u'\n\n' in comments:
+        soup = BeautifulSoup()
+        for idx, para in enumerate(comments.split(u'\n\n')):
+            para_tag = Tag(soup, 'p')
+            para_tag.insert(0, para)
+            soup.insert(idx, para_tag)
+        comments = soup.renderContents(None)
+
+    # Convert solo returns to <br />
+    comments = re.sub(r'[\r\n]', '<br />', comments)
+
+    # Convert two hyphens to emdash
+    comments = comments.replace('--', '&mdash;')
+
+    soup = BeautifulSoup(comments)
+    result = BeautifulSoup()
+    rtc = 0       # next insert position in result
+    pTag = None   # <p> currently collecting inline content, if any
+
+    for token in list(soup.contents):
+        if type(token) is NavigableString:
+            inline = prepare_string_for_xml(token)
+        elif isinstance(token, NavigableString):
+            # Comment-like NavigableString subclasses have no .name, so the tag
+            # test below would raise AttributeError; they hold no text: drop them
+            continue
+        elif token.name in ('br', 'b', 'i', 'em'):
+            inline = token
+        else:
+            inline = None
+
+        if inline is not None:
+            if pTag is None:
+                pTag = Tag(result, 'p')
+                ptc = 0
+            pTag.insert(ptc, inline)
+            ptc += 1
+            continue
+
+        # Any other tag is block level: flush the open <p> ahead of it
+        if pTag is not None:
+            result.insert(rtc, pTag)
+            rtc += 1
+            pTag = None
+        # Clean up NavigableStrings for xml
+        for sub_token in list(token.contents):
+            if type(sub_token) is NavigableString:
+                sub_token.replaceWith(prepare_string_for_xml(sub_token))
+        result.insert(rtc, token)
+        rtc += 1
+
+    if pTag is not None:
+        result.insert(rtc, pTag)
+
+    for p in result.findAll('p'):
+        p['class'] = 'description'
+
+    return result.renderContents(encoding=None)
+