From 66162850ca067add8a5ec1999914c65e4f7fc072 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 27 Apr 2010 16:58:02 -0600 Subject: [PATCH] Fix #5237 (Metadata "Comments" don't handle HTML markup correctly) --- src/calibre/ebooks/html/input.py | 2 - src/calibre/ebooks/oeb/transforms/jacket.py | 20 +++- src/calibre/gui2/convert/metadata.py | 2 +- src/calibre/gui2/dialogs/book_info.py | 3 + src/calibre/gui2/dialogs/metadata_single.py | 6 +- src/calibre/gui2/status.py | 3 + src/calibre/library/comments.py | 114 ++++++++++++++++++++ 7 files changed, 139 insertions(+), 11 deletions(-) create mode 100644 src/calibre/library/comments.py diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 9e61d26aa6..d931eb0e98 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -320,7 +320,6 @@ class HTMLInput(InputFormatPlugin): if not metadata.title: oeb.logger.warn('Title not specified') metadata.add('title', self.oeb.translate(__('Unknown'))) - bookid = str(uuid.uuid4()) metadata.add('identifier', bookid, id='uuid_id', scheme='uuid') for ident in metadata.identifier: @@ -328,7 +327,6 @@ class HTMLInput(InputFormatPlugin): self.oeb.uid = metadata.identifier[0] break - filelist = get_filelist(htmlpath, basedir, opts, log) filelist = [f for f in filelist if not f.is_binary] htmlfile_map = {} diff --git a/src/calibre/ebooks/oeb/transforms/jacket.py b/src/calibre/ebooks/oeb/transforms/jacket.py index 40d7ce33f4..597c6f59cd 100644 --- a/src/calibre/ebooks/oeb/transforms/jacket.py +++ b/src/calibre/ebooks/oeb/transforms/jacket.py @@ -14,7 +14,7 @@ from lxml import etree from calibre.ebooks.oeb.base import XPath, XPNSMAP from calibre import guess_type - +from calibre.library.comments import comments_to_html class Jacket(object): ''' Book jacket manipulation. Remove first image and insert comments at start of @@ -25,6 +25,7 @@ class Jacket(object): %(title)s +
@@ -83,7 +84,9 @@ class Jacket(object): comments = '' if not comments.strip(): comments = '' - comments = comments.replace('\r\n', '\n').replace('\n\n', '

') + orig_comments = comments + if comments: + comments = comments_to_html(comments) series = 'Series: ' + escape(mi.series if mi.series else '') if mi.series and mi.series_index is not None: series += escape(' [%s]'%mi.format_series_index()) @@ -103,12 +106,19 @@ class Jacket(object): title = mi.title if mi.title else unicode(self.oeb.metadata.title[0]) except: title = _('Unknown') - html = self.JACKET_TEMPLATE%dict(xmlns=XPNSMAP['h'], - title=escape(title), comments=escape(comments), + + def generate_html(comments): + return self.JACKET_TEMPLATE%dict(xmlns=XPNSMAP['h'], + title=escape(title), comments=comments, jacket=escape(_('Book Jacket')), series=series, tags=tags, rating=self.get_rating(mi.rating)) id, href = self.oeb.manifest.generate('jacket', 'jacket.xhtml') - root = etree.fromstring(html) + from calibre.ebooks.oeb.base import RECOVER_PARSER + try: + root = etree.fromstring(generate_html(comments), parser=RECOVER_PARSER) + except: + root = etree.fromstring(generate_html(escape(orig_comments)), + parser=RECOVER_PARSER) item = self.oeb.manifest.add(id, href, guess_type(href)[0], data=root) self.oeb.spine.insert(0, item, True) diff --git a/src/calibre/gui2/convert/metadata.py b/src/calibre/gui2/convert/metadata.py index 58960a9927..2026f1cee5 100644 --- a/src/calibre/gui2/convert/metadata.py +++ b/src/calibre/gui2/convert/metadata.py @@ -71,7 +71,7 @@ class MetadataWidget(Widget, Ui_Form): self.author_sort.setText(mi.author_sort if mi.author_sort else '') self.tags.setText(', '.join(mi.tags if mi.tags else [])) self.tags.update_tags_cache(self.db.all_tags()) - self.comment.setText(mi.comments if mi.comments else '') + self.comment.setPlainText(mi.comments if mi.comments else '') if mi.series: self.series.setCurrentIndex(self.series.findText(mi.series)) if mi.series_index is not None: diff --git a/src/calibre/gui2/dialogs/book_info.py b/src/calibre/gui2/dialogs/book_info.py index efc65eb6f7..05841d9178 100644 --- a/src/calibre/gui2/dialogs/book_info.py +++ b/src/calibre/gui2/dialogs/book_info.py @@ -11,6 +11,7 @@ from PyQt4.QtGui import QDialog, QPixmap, QGraphicsScene, QIcon, QDesktopService from calibre.gui2.dialogs.book_info_ui import Ui_BookInfo from calibre.gui2 import dynamic from calibre import fit_image +from calibre.library.comments import comments_to_html class BookInfo(QDialog, Ui_BookInfo): @@ -96,6 +97,8 @@ class BookInfo(QDialog, Ui_BookInfo): self.setWindowTitle(info[_('Title')]) self.title.setText(''+info.pop(_('Title'))) comments = info.pop(_('Comments'), '') + if comments: + comments = comments_to_html(comments) if re.search(r'<[a-zA-Z]+>', comments) is None: lines = comments.splitlines() lines = [x if x.strip() else '

' for x in lines] diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index 67f7ac1ab8..f4d5d0034c 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -231,7 +231,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): if mi.series_index is not None: self.series_index.setValue(float(mi.series_index)) if mi.comments and mi.comments.strip(): - self.comments.setText(mi.comments) + self.comments.setPlainText(mi.comments) def set_cover(self): @@ -590,7 +590,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): prefix = unicode(self.comments.toPlainText()) if prefix: prefix += '\n' - self.comments.setText(prefix + summ) + self.comments.setPlainText(prefix + summ) if book.rating is not None: self.rating.setValue(int(book.rating)) if book.tags: @@ -654,7 +654,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.db.set_series(self.id, unicode(self.series.currentText()).strip(), notify=False) self.db.set_series_index(self.id, self.series_index.value(), notify=False) - self.db.set_comment(self.id, qstring_to_unicode(self.comments.toPlainText()), notify=False) + self.db.set_comment(self.id, unicode(self.comments.toPlainText()), notify=False) d = self.pubdate.date() d = qt_to_dt(d) self.db.set_pubdate(self.id, d, notify=False) diff --git a/src/calibre/gui2/status.py b/src/calibre/gui2/status.py index 28a1bbea6b..bdba768c5f 100644 --- a/src/calibre/gui2/status.py +++ b/src/calibre/gui2/status.py @@ -11,6 +11,7 @@ from calibre.gui2.widgets import IMAGE_EXTENSIONS from calibre.gui2.progress_indicator import ProgressIndicator from calibre.gui2.notify import get_notifier from calibre.ebooks import BOOK_EXTENSIONS +from calibre.library.comments import comments_to_html class BookInfoDisplay(QWidget): @@ -133,6 +134,8 @@ class BookInfoDisplay(QWidget): key = key.decode(preferred_encoding, 'replace') if isinstance(txt, str): txt = txt.decode(preferred_encoding, 'replace') + if key == _('Comments'): + txt = comments_to_html(txt) rows += u'%s:%s'%(key, txt) self.book_data.setText(u''+rows+u'
') diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py new file mode 100644 index 0000000000..1898e78cbf --- /dev/null +++ b/src/calibre/library/comments.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import re + +from calibre.constants import preferred_encoding +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString +from calibre import prepare_string_for_xml + +def comments_to_html(comments): + ''' + Convert random comment text to normalized, xml-legal block of

s + 'plain text' returns as +

plain text

+ + 'plain text with minimal markup' returns as +

plain text with minimal markup

+ + '

pre-formatted text

returns untouched + + 'A line of text\n\nFollowed by a line of text' returns as +

A line of text

+

Followed by a line of text

+ + 'A line of text.\nA second line of text.\rA third line of text' returns as +

A line of text.
A second line of text.
A third line of text.

+ + '...end of a paragraph.Somehow the break was lost...' returns as +

...end of a paragraph.

+

Somehow the break was lost...

+ + Deprecated HTML returns as HTML via BeautifulSoup() + + ''' + if not isinstance(comments, unicode): + comments = comments.decode(preferred_encoding, 'replace') + + # Hackish - ignoring sentences ending or beginning in numbers to avoid + # confusion with decimal points. + + # Explode lost CRs to \n\n + for lost_cr in re.finditer('([a-z])([\.\?!])([A-Z])', comments): + comments = comments.replace(lost_cr.group(), + '%s%s\n\n%s' % (lost_cr.group(1), + lost_cr.group(2), + lost_cr.group(3))) + + # Convert \n\n to

s + if re.search('\n\n', comments): + soup = BeautifulSoup() + split_ps = comments.split(u'\n\n') + tsc = 0 + for p in split_ps: + pTag = Tag(soup,'p') + pTag.insert(0,p) + soup.insert(tsc,pTag) + tsc += 1 + comments = soup.renderContents(None) + + # Convert solo returns to
+ comments = re.sub('[\r\n]','
', comments) + + # Convert two hyphens to emdash + comments = re.sub('--', '—', comments) + soup = BeautifulSoup(comments) + result = BeautifulSoup() + rtc = 0 + open_pTag = False + + all_tokens = list(soup.contents) + for token in all_tokens: + if type(token) is NavigableString: + if not open_pTag: + pTag = Tag(result,'p') + open_pTag = True + ptc = 0 + pTag.insert(ptc,prepare_string_for_xml(token)) + ptc += 1 + + elif token.name in ['br','b','i','em']: + if not open_pTag: + pTag = Tag(result,'p') + open_pTag = True + ptc = 0 + pTag.insert(ptc, token) + ptc += 1 + + else: + if open_pTag: + result.insert(rtc, pTag) + rtc += 1 + open_pTag = False + ptc = 0 + # Clean up NavigableStrings for xml + sub_tokens = list(token.contents) + for sub_token in sub_tokens: + if type(sub_token) is NavigableString: + sub_token.replaceWith(prepare_string_for_xml(sub_token)) + result.insert(rtc, token) + rtc += 1 + + if open_pTag: + result.insert(rtc, pTag) + + paras = result.findAll('p') + for p in paras: + p['class'] = 'description' + + return result.renderContents(encoding=None) +