# -*- coding: utf-8 -*-
#
# Module: src/calibre/ebooks/pml/__init__.py
#
# Character tables describing which non-ASCII characters can be written in
# PML, plus unipmlcode(), which turns one such character into its PML tag.

__license__ = 'GPL v3'
__copyright__ = '2009, John Schember '
__docformat__ = 'restructuredtext en'


def _span(start, stop):
    # range() only returns a list on Python 2; materialise it explicitly so
    # the tables below can be concatenated with ``+`` on Python 3 as well.
    return list(range(start, stop))


# Uncommon Characters supported by PML. \\a tag codes
# The values are cp1252 byte values (see unipmlcode below).
A_CHARS = _span(160, 256) + _span(130, 136) + _span(138, 141) + \
    _span(145, 152) + _span(153, 157) + [159]

# Extended Unicode characters supported by PML
Latin_ExtendedA = _span(0x0100, 0x0104) + [0x0105, 0x0107, 0x010C, 0x010D,
    0x0112, 0x0113, 0x0115, 0x0117, 0x0119, 0x011B, 0x011D, 0x011F, 0x012A,
    0x012B, 0x012D, 0x012F, 0x0131, 0x0141, 0x0142, 0x0144, 0x0148] + \
    _span(0x014B, 0x014E) + [0x014F, 0x0151, 0x0155] + \
    _span(0x0159, 0x015C) + \
    [0x015F, 0x0163, 0x0169, 0x016B, 0x016D, 0x0177, 0x017A, 0x017D, 0x017E]
Latin_ExtendedB = [0x01BF, 0x01CE, 0x01D0, 0x01D2, 0x01D4, 0x01E1, 0x01E3,
    0x01E7, 0x01EB, 0x01F0, 0x0207, 0x021D, 0x0227, 0x022F, 0x0233]
# NOTE(review): 0x0251 is listed twice; possibly a typo for a neighbouring
# code point -- confirm against the PML reference. Kept as-is so the
# contents of U_CHARS do not change.
IPA_Extensions = [0x0251, 0x0251, 0x0254, 0x0259, 0x025C, 0x0265, 0x026A,
    0x0272, 0x0283, 0x0289, 0x028A, 0x028C, 0x028F, 0x0292, 0x0294, 0x029C]
Spacing_Modifier_Letters = [0x02BE, 0x02BF, 0x02C7, 0x02C8, 0x02CC, 0x02D0,
    0x02D8, 0x02D9]
Greek_and_Coptic = _span(0x0391, 0x03A2) + _span(0x03A3, 0x03AA) + \
    _span(0x03B1, 0x03CA) + [0x03D1, 0x03DD]
Hebrew = _span(0x05D0, 0x05EB)
Latin_Extended_Additional = [0x1E0B, 0x1E0D, 0x1E17, 0x1E22, 0x1E24, 0x1E25,
    0x1E2B, 0x1E33, 0x1E37, 0x1E41, 0x1E43, 0x1E45, 0x1E47, 0x1E53] + \
    _span(0x1E59, 0x1E5C) + [0x1E61, 0x1E63, 0x1E6B, 0x1E6D, 0x1E6F, 0x1E91,
    0x1E93, 0x1E96, 0x1EA1, 0x1ECD, 0x1EF9]
General_Punctuation = [0x2011, 0x2038, 0x203D, 0x2042]
Arrows = [0x2190, 0x2192]
Mathematical_Operators = [0x2202, 0x221A, 0x221E, 0x2225, 0x222B, 0x2260,
    0x2294, 0x2295, 0x22EE]
Enclosed_Alphanumerics = [0x24CA]
Miscellaneous_Symbols = _span(0x261C, 0x2641) + _span(0x2642, 0x2648) + \
    _span(0x2660, 0x2664) + _span(0x266D, 0x2670)
Dingbats = [0x2713, 0x2720]
Private_Use_Area = _span(0xE000, 0xE01D) + _span(0xE01E, 0xE029) + \
    _span(0xE02A, 0xE052)
Alphabetic_Presentation_Forms = [0xFB02, 0xFB2A, 0xFB2B]

# \\U tag codes.
U_CHARS = Latin_ExtendedA + Latin_ExtendedB + IPA_Extensions + \
    Spacing_Modifier_Letters + Greek_and_Coptic + Hebrew + \
    Latin_Extended_Additional + General_Punctuation + Arrows + \
    Mathematical_Operators + Enclosed_Alphanumerics + Miscellaneous_Symbols + \
    Dingbats + Private_Use_Area + Alphabetic_Presentation_Forms

# Hashed copies used only for membership tests; the public lists above keep
# their original type and ordering.
_A_CHAR_SET = frozenset(A_CHARS)
_U_CHAR_SET = frozenset(U_CHARS)


def unipmlcode(char):
    r"""
    Return the PML code for a single character.

    The character is first encoded as cp1252; if the resulting byte value is
    listed in ``A_CHARS`` the result is ``\aNNN`` (decimal byte value).
    Otherwise, if the character's code point is listed in ``U_CHARS`` the
    result is ``\UXXXX`` (upper-case hexadecimal, at least four digits).
    Any other character yields ``'?'``.

    :param char: A string holding exactly one character.
    :return: The PML tag as a string, or ``'?'`` when no tag applies.
    :raises TypeError: From ``ord`` if ``char`` is not a single character.
    """
    try:
        val = ord(char.encode('cp1252'))
    except UnicodeError:
        # Not representable in cp1252: fall through to the \U table.
        pass
    else:
        if val in _A_CHAR_SET:
            return '\\a%i' % val

    val = ord(char)
    if val in _U_CHAR_SET:
        return '\\U%04X' % val
    return '?'


# Call site (src/calibre/ebooks/pml/pmlml.py, class PMLMLizer). The same
# change set imports this function and routes every non-ASCII character
# through it, replacing the previous unconditional '\\U%04x' substitution:
#
#     from calibre.ebooks.pml import unipmlcode
#     ...
#     # Turn all characters that cannot be represented by themselves into
#     # their PML code equivalent
#     text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)