mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
PML Output: Only create \a and \U tags for supported characters.
This commit is contained in:
parent
44ba14e77b
commit
d19848184d
@ -0,0 +1,60 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
# Uncommon Characters supported by PML. \\a tag codes
|
||||
A_CHARS = range(160, 256) + range(130, 136) + range(138, 141) + \
|
||||
range(145, 152) + range(153, 157) + [159]
|
||||
|
||||
# Extended Unicode characters supported by PML
|
||||
Latin_ExtendedA = range(0x0100, 0x0104) + [0x0105, 0x0107, 0x010C, 0x010D,
|
||||
0x0112, 0x0113, 0x0115, 0x0117, 0x0119, 0x011B, 0x011D, 0x011F, 0x012A,
|
||||
0x012B, 0x012D, 0x012F, 0x0131, 0x0141, 0x0142, 0x0144, 0x0148] + \
|
||||
range(0x014B, 0x014E) + [0x014F, 0x0151, 0x0155] + range(0x0159, 0x015C) + \
|
||||
[0x015F, 0x0163, 0x0169, 0x016B, 0x016D, 0x0177, 0x017A, 0x017D, 0x017E]
|
||||
Latin_ExtendedB = [0x01BF, 0x01CE, 0x01D0, 0x01D2, 0x01D4, 0x01E1, 0x01E3,
|
||||
0x01E7, 0x01EB, 0x01F0, 0x0207, 0x021D, 0x0227, 0x022F, 0x0233]
|
||||
IPA_Extensions = [0x0251, 0x0251, 0x0254, 0x0259, 0x025C, 0x0265, 0x026A,
|
||||
0x0272, 0x0283, 0x0289, 0x028A, 0x028C, 0x028F, 0x0292, 0x0294, 0x029C]
|
||||
Spacing_Modifier_Letters = [0x02BE, 0x02BF, 0x02C7, 0x02C8, 0x02CC, 0x02D0,
|
||||
0x02D8, 0x02D9]
|
||||
Greek_and_Coptic = range(0x0391, 0x03A2) + range(0x03A3, 0x03AA) + \
|
||||
range(0x03B1, 0x03CA) + [0x03D1, 0x03DD]
|
||||
Hebrew = range(0x05D0, 0x05EB)
|
||||
Latin_Extended_Additional = [0x1E0B, 0x1E0D, 0x1E17, 0x1E22, 0x1E24, 0x1E25,
|
||||
0x1E2B, 0x1E33, 0x1E37, 0x1E41, 0x1E43, 0x1E45, 0x1E47, 0x1E53] + \
|
||||
range(0x1E59, 0x1E5C) + [0x1E61, 0x1E63, 0x1E6B, 0x1E6D, 0x1E6F, 0x1E91,
|
||||
0x1E93, 0x1E96, 0x1EA1, 0x1ECD, 0x1EF9]
|
||||
General_Punctuation = [0x2011, 0x2038, 0x203D, 0x2042]
|
||||
Arrows = [0x2190, 0x2192]
|
||||
Mathematical_Operators = [0x2202, 0x221A, 0x221E, 0x2225, 0x222B, 0x2260,
|
||||
0x2294, 0x2295, 0x22EE]
|
||||
Enclosed_Alphanumerics = [0x24CA]
|
||||
Miscellaneous_Symbols = range(0x261C, 0x2641) + range(0x2642, 0x2648) + \
|
||||
range(0x2660, 0x2664) + range(0x266D, 0x2670)
|
||||
Dingbats = [0x2713, 0x2720]
|
||||
Private_Use_Area = range(0xE000, 0xE01D) + range(0xE01E, 0xE029) + \
|
||||
range(0xE02A, 0xE052)
|
||||
Alphabetic_Presentation_Forms = [0xFB02, 0xFB2A, 0xFB2B]
|
||||
|
||||
# \\U tag codes.
|
||||
U_CHARS = Latin_ExtendedA + Latin_ExtendedB + IPA_Extensions + \
|
||||
Spacing_Modifier_Letters + Greek_and_Coptic + Hebrew + \
|
||||
Latin_Extended_Additional + General_Punctuation + Arrows + \
|
||||
Mathematical_Operators + Enclosed_Alphanumerics + Miscellaneous_Symbols + \
|
||||
Dingbats + Private_Use_Area + Alphabetic_Presentation_Forms
|
||||
|
||||
def unipmlcode(char):
|
||||
try:
|
||||
val = ord(char.encode('cp1252'))
|
||||
if val in A_CHARS:
|
||||
return '\\a%i' % val
|
||||
except:
|
||||
pass
|
||||
val = ord(char)
|
||||
if val in U_CHARS:
|
||||
return '\\U%04x'.upper() % val
|
||||
else:
|
||||
return '?'
|
@ -13,6 +13,7 @@ import re
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.ebooks.pdb.ereader import image_name
|
||||
from calibre.ebooks.pml import unipmlcode
|
||||
from calibre import entity_to_unicode
|
||||
|
||||
TAG_MAP = {
|
||||
@ -163,8 +164,9 @@ class PMLMLizer(object):
|
||||
mo = re.search('(%s)' % entity[1:-1], text)
|
||||
text = text.replace(entity, entity_to_unicode(mo))
|
||||
|
||||
# Turn all unicode characters into their PML hex equivelent
|
||||
text = re.sub('[^\x00-\x7f]', lambda x: '\\U%04x' % ord(x.group()), text)
|
||||
# Turn all characters that cannot be represented by themself into their
|
||||
# PML code equivelent
|
||||
text = re.sub('[^\x00-\x7f]', lambda x: unipmlcode(x.group()), text)
|
||||
|
||||
# Remove excess spaces at beginning and end of lines
|
||||
text = re.sub('(?m)^[ ]+', '', text)
|
||||
|
Loading…
x
Reference in New Issue
Block a user