From ecbcb38ead93745a82b652d6e3f3194ddc04b62f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 14 Feb 2011 11:23:22 -0700 Subject: [PATCH] MOBI Output: Normalize unicode strings when writing metadata to MOBI files as the Kindle cannot handle non-normalized unicode. Fixes #8229 (Diacritical mark in MOBI title) --- src/calibre/ebooks/__init__.py | 9 +++++++++ src/calibre/ebooks/metadata/mobi.py | 12 +++++++----- src/calibre/ebooks/mobi/writer.py | 13 +++++++------ 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py index 49604ae682..dcd32811b3 100644 --- a/src/calibre/ebooks/__init__.py +++ b/src/calibre/ebooks/__init__.py @@ -152,8 +152,17 @@ def check_ebook_format(stream, current_guess): stream.seek(0) return ans +def normalize(x): + if isinstance(x, unicode): + import unicodedata + x = unicodedata.normalize('NFKC', x) + return x + def calibre_cover(title, author_string, series_string=None, output_format='jpg', title_size=46, author_size=36): + title = normalize(title) + author_string = normalize(author_string) + series_string = normalize(series_string) from calibre.utils.magick.draw import create_cover_page, TextLine lines = [TextLine(title, title_size), TextLine(author_string, author_size)] if series_string: diff --git a/src/calibre/ebooks/metadata/mobi.py b/src/calibre/ebooks/metadata/mobi.py index 30668d70f7..963391dcf8 100644 --- a/src/calibre/ebooks/metadata/mobi.py +++ b/src/calibre/ebooks/metadata/mobi.py @@ -12,6 +12,7 @@ __docformat__ = 'restructuredtext en' from struct import pack, unpack from cStringIO import StringIO +from calibre.ebooks import normalize from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi.writer import rescale_image, MAX_THUMB_DIMEN from calibre.ebooks.mobi.langcodes import iana2mobi @@ -311,6 +312,7 @@ class MetadataUpdater(object): return StreamSlicer(self.stream, start, stop) def update(self, mi): + mi.title = normalize(mi.title) def update_exth_record(rec): recs.append(rec) if rec[0] in self.original_exth_records: @@ -331,12 +333,12 @@ class MetadataUpdater(object): kindle_pdoc = None if mi.author_sort and pas: authors = mi.author_sort - update_exth_record((100, authors.encode(self.codec, 'replace'))) + update_exth_record((100, normalize(authors).encode(self.codec, 'replace'))) elif mi.authors: authors = ';'.join(mi.authors) - update_exth_record((100, authors.encode(self.codec, 'replace'))) + update_exth_record((100, normalize(authors).encode(self.codec, 'replace'))) if mi.publisher: - update_exth_record((101, mi.publisher.encode(self.codec, 'replace'))) + update_exth_record((101, normalize(mi.publisher).encode(self.codec, 'replace'))) if mi.comments: # Strip user annotations a_offset = mi.comments.find('
') @@ -345,12 +347,12 @@ class MetadataUpdater(object): mi.comments = mi.comments[:a_offset] if ad_offset >= 0: mi.comments = mi.comments[:ad_offset] - update_exth_record((103, mi.comments.encode(self.codec, 'replace'))) + update_exth_record((103, normalize(mi.comments).encode(self.codec, 'replace'))) if mi.isbn: update_exth_record((104, mi.isbn.encode(self.codec, 'replace'))) if mi.tags: subjects = '; '.join(mi.tags) - update_exth_record((105, subjects.encode(self.codec, 'replace'))) + update_exth_record((105, normalize(subjects).encode(self.codec, 'replace'))) if kindle_pdoc and kindle_pdoc in mi.tags: update_exth_record((501, str('PDOC'))) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index abba173d69..b3f8160e3a 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -14,8 +14,9 @@ import re from struct import pack import time from urlparse import urldefrag - from cStringIO import StringIO + +from calibre.ebooks import normalize from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.ebooks.mobi.mobiml import MBP_NS from calibre.ebooks.oeb.base import OEB_DOCS @@ -1365,7 +1366,7 @@ class MobiWriter(object): self._text_length, self._text_nrecords-1, RECORD_SIZE, 0, 0)) # 0 - 15 (0x0 - 0xf) uid = random.randint(0, 0xffffffff) - title = unicode(metadata.title[0]).encode('utf-8') + title = normalize(unicode(metadata.title[0])).encode('utf-8') # The MOBI Header # 0x0 - 0x3 @@ -1523,12 +1524,12 @@ class MobiWriter(object): items = oeb.metadata[term] if term == 'creator': if self._prefer_author_sort: - creators = [unicode(c.file_as or c) for c in items] + creators = [normalize(unicode(c.file_as or c)) for c in items] else: - creators = [unicode(c) for c in items] + creators = [normalize(unicode(c)) for c in items] items = ['; '.join(creators)] for item in items: - data = self.COLLAPSE_RE.sub(' ', unicode(item)) + data = self.COLLAPSE_RE.sub(' ', normalize(unicode(item))) if term == 'identifier': if data.lower().startswith('urn:isbn:'): data = data[9:] @@ -1542,7 +1543,7 @@ class MobiWriter(object): nrecs += 1 if term == 'rights' : try: - rights = unicode(oeb.metadata.rights[0]).encode('utf-8') + rights = normalize(unicode(oeb.metadata.rights[0])).encode('utf-8') except: rights = 'Unknown' exth.write(pack('>II', EXTH_CODES['rights'], len(rights) + 8))