Read character styles

This commit is contained in:
Kovid Goyal 2013-05-06 14:16:57 +05:30
parent 3e40c288ff
commit a5549a0fc4
2 changed files with 458 additions and 58 deletions

View File

@ -0,0 +1,233 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
lcid = {
1078: 'af', # Afrikaans - South Africa
1052: 'sq', # Albanian - Albania
1118: 'am', # Amharic - Ethiopia
1025: 'ar', # Arabic - Saudi Arabia
5121: 'ar', # Arabic - Algeria
15361: 'ar', # Arabic - Bahrain
3073: 'ar', # Arabic - Egypt
2049: 'ar', # Arabic - Iraq
11265: 'ar', # Arabic - Jordan
13313: 'ar', # Arabic - Kuwait
12289: 'ar', # Arabic - Lebanon
4097: 'ar', # Arabic - Libya
6145: 'ar', # Arabic - Morocco
8193: 'ar', # Arabic - Oman
16385: 'ar', # Arabic - Qatar
10241: 'ar', # Arabic - Syria
7169: 'ar', # Arabic - Tunisia
14337: 'ar', # Arabic - U.A.E.
9217: 'ar', # Arabic - Yemen
1067: 'hy', # Armenian - Armenia
1101: 'as', # Assamese
2092: 'az', # Azeri (Cyrillic)
1068: 'az', # Azeri (Latin)
1069: 'eu', # Basque
1059: 'be', # Belarusian
1093: 'bn', # Bengali (India)
2117: 'bn', # Bengali (Bangladesh)
5146: 'bs', # Bosnian (Bosnia/Herzegovina)
1026: 'bg', # Bulgarian
1109: 'my', # Burmese
1027: 'ca', # Catalan
1116: 'chr', # Cherokee - United States
2052: 'zh', # Chinese - People's Republic of China
4100: 'zh', # Chinese - Singapore
1028: 'zh', # Chinese - Taiwan
3076: 'zh', # Chinese - Hong Kong SAR
5124: 'zh', # Chinese - Macao SAR
1050: 'hr', # Croatian
4122: 'hr', # Croatian (Bosnia/Herzegovina)
1029: 'cs', # Czech
1030: 'da', # Danish
1125: 'dv', # Divehi
1043: 'nl', # Dutch - Netherlands
2067: 'nl', # Dutch - Belgium
1126: 'bin', # Edo
1033: 'en', # English - United States
2057: 'en', # English - United Kingdom
3081: 'en', # English - Australia
10249: 'en', # English - Belize
4105: 'en', # English - Canada
9225: 'en', # English - Caribbean
15369: 'en', # English - Hong Kong SAR
16393: 'en', # English - India
14345: 'en', # English - Indonesia
6153: 'en', # English - Ireland
8201: 'en', # English - Jamaica
17417: 'en', # English - Malaysia
5129: 'en', # English - New Zealand
13321: 'en', # English - Philippines
18441: 'en', # English - Singapore
7177: 'en', # English - South Africa
11273: 'en', # English - Trinidad
12297: 'en', # English - Zimbabwe
1061: 'et', # Estonian
1080: 'fo', # Faroese
1065: None, # TODO: Farsi
1124: 'fil', # Filipino
1035: 'fi', # Finnish
1036: 'fr', # French - France
2060: 'fr', # French - Belgium
11276: 'fr', # French - Cameroon
3084: 'fr', # French - Canada
9228: 'fr', # French - Democratic Rep. of Congo
12300: 'fr', # French - Cote d'Ivoire
15372: 'fr', # French - Haiti
5132: 'fr', # French - Luxembourg
13324: 'fr', # French - Mali
6156: 'fr', # French - Monaco
14348: 'fr', # French - Morocco
58380: 'fr', # French - North Africa
8204: 'fr', # French - Reunion
10252: 'fr', # French - Senegal
4108: 'fr', # French - Switzerland
7180: 'fr', # French - West Indies
1122: 'fy', # Frisian - Netherlands
1127: None, # TODO: Fulfulde - Nigeria
1071: 'mk', # FYRO Macedonian
2108: 'ga', # Gaelic (Ireland)
1084: 'gd', # Gaelic (Scotland)
1110: 'gl', # Galician
1079: 'ka', # Georgian
1031: 'de', # German - Germany
3079: 'de', # German - Austria
5127: 'de', # German - Liechtenstein
4103: 'de', # German - Luxembourg
2055: 'de', # German - Switzerland
1032: 'el', # Greek
1140: 'gn', # Guarani - Paraguay
1095: 'gu', # Gujarati
1128: 'ha', # Hausa - Nigeria
1141: 'haw', # Hawaiian - United States
1037: 'he', # Hebrew
1081: 'hi', # Hindi
1038: 'hu', # Hungarian
1129: None, # TODO: Ibibio - Nigeria
1039: 'is', # Icelandic
1136: 'ig', # Igbo - Nigeria
1057: 'id', # Indonesian
1117: 'iu', # Inuktitut
1040: 'it', # Italian - Italy
2064: 'it', # Italian - Switzerland
1041: 'ja', # Japanese
1099: 'kn', # Kannada
1137: 'kr', # Kanuri - Nigeria
2144: 'ks', # Kashmiri
1120: 'ks', # Kashmiri (Arabic)
1087: 'kk', # Kazakh
1107: 'km', # Khmer
1111: 'kok', # Konkani
1042: 'ko', # Korean
1088: 'ky', # Kyrgyz (Cyrillic)
1108: 'lo', # Lao
1142: 'la', # Latin
1062: 'lv', # Latvian
1063: 'lt', # Lithuanian
1086: 'ms', # Malay - Malaysia
2110: 'ms', # Malay - Brunei Darussalam
1100: 'ml', # Malayalam
1082: 'mt', # Maltese
1112: 'mni', # Manipuri
1153: 'mi', # Maori - New Zealand
1102: 'mr', # Marathi
1104: 'mn', # Mongolian (Cyrillic)
2128: 'mn', # Mongolian (Mongolian)
1121: 'ne', # Nepali
2145: 'ne', # Nepali - India
1044: 'no', # Norwegian (Bokmᅢᆬl)
2068: 'no', # Norwegian (Nynorsk)
1096: 'or', # Oriya
1138: 'om', # Oromo
1145: 'pap', # Papiamentu
1123: 'ps', # Pashto
1045: 'pl', # Polish
1046: 'pt', # Portuguese - Brazil
2070: 'pt', # Portuguese - Portugal
1094: 'pa', # Punjabi
2118: 'pa', # Punjabi (Pakistan)
1131: 'qu', # Quecha - Bolivia
2155: 'qu', # Quecha - Ecuador
3179: 'qu', # Quecha - Peru
1047: 'rm', # Rhaeto-Romanic
1048: 'ro', # Romanian
2072: 'ro', # Romanian - Moldava
1049: 'ru', # Russian
2073: 'ru', # Russian - Moldava
1083: 'se', # Sami (Lappish)
1103: 'sa', # Sanskrit
1132: 'nso', # Sepedi
3098: 'sr', # Serbian (Cyrillic)
2074: 'sr', # Serbian (Latin)
1113: 'sd', # Sindhi - India
2137: 'sd', # Sindhi - Pakistan
1115: 'si', # Sinhalese - Sri Lanka
1051: 'sk', # Slovak
1060: 'sl', # Slovenian
1143: 'so', # Somali
1070: 'wen', # Sorbian
3082: 'es', # Spanish - Spain (Modern Sort)
1034: 'es', # Spanish - Spain (Traditional Sort)
11274: 'es', # Spanish - Argentina
16394: 'es', # Spanish - Bolivia
13322: 'es', # Spanish - Chile
9226: 'es', # Spanish - Colombia
5130: 'es', # Spanish - Costa Rica
7178: 'es', # Spanish - Dominican Republic
12298: 'es', # Spanish - Ecuador
17418: 'es', # Spanish - El Salvador
4106: 'es', # Spanish - Guatemala
18442: 'es', # Spanish - Honduras
58378: 'es', # Spanish - Latin America
2058: 'es', # Spanish - Mexico
19466: 'es', # Spanish - Nicaragua
6154: 'es', # Spanish - Panama
15370: 'es', # Spanish - Paraguay
10250: 'es', # Spanish - Peru
20490: 'es', # Spanish - Puerto Rico
21514: 'es', # Spanish - United States
14346: 'es', # Spanish - Uruguay
8202: 'es', # Spanish - Venezuela
1072: None, # TODO: Sutu
1089: 'sw', # Swahili
1053: 'sv', # Swedish
2077: 'sv', # Swedish - Finland
1114: 'syr', # Syriac
1064: 'tg', # Tajik
1119: None, # TODO: Tamazight (Arabic)
2143: None, # TODO: Tamazight (Latin)
1097: 'ta', # Tamil
1092: 'tt', # Tatar
1098: 'te', # Telugu
1054: 'th', # Thai
2129: 'bo', # Tibetan - Bhutan
1105: 'bo', # Tibetan - People's Republic of China
2163: 'ti', # Tigrigna - Eritrea
1139: 'ti', # Tigrigna - Ethiopia
1073: 'ts', # Tsonga
1074: 'tn', # Tswana
1055: 'tr', # Turkish
1090: 'tk', # Turkmen
1152: 'ug', # Uighur - China
1058: 'uk', # Ukrainian
1056: 'ur', # Urdu
2080: 'ur', # Urdu - India
2115: 'uz', # Uzbek (Cyrillic)
1091: 'uz', # Uzbek (Latin)
1075: 've', # Venda
1066: 'vi', # Vietnamese
1106: 'cy', # Welsh
1076: 'xh', # Xhosa
1144: 'ii', # Yi
1085: 'yi', # Yiddish
1130: 'yo', # Yoruba
1077: 'zu' # Zulu
}

View File

@ -15,15 +15,15 @@ class Inherit:
inherit = Inherit()
def binary_property(parent, name):
vals = XPath('./w:%s')
vals = XPath('./w:%s' % name)(parent)
if not vals:
return inherit
val = get(vals[0], 'w:val', 'on')
return True if val in {'on', '1', 'true'} else False
def simple_color(col):
def simple_color(col, auto='black'):
if not col or col == 'auto' or len(col) != 6:
return 'black'
return auto
return '#'+col
def simple_float(val, mult=1.0):
@ -66,37 +66,38 @@ LINE_STYLES = { # {{{
'triple': 'double',
} # }}}
def read_border(border, dest):
all_attrs = set()
def read_border(parent, dest):
tvals = {'padding_%s':inherit, 'border_%s_width':inherit,
'border_%s_style':inherit, 'border_%s_color':inherit}
vals = {}
for edge in ('left', 'top', 'right', 'bottom'):
vals = {'padding_%s':inherit, 'border_%s_width':inherit,
'border_%s_style':inherit, 'border_%s_color':inherit}
all_attrs |= {key % edge for key in vals}
for elem in XPath('./w:%s' % edge):
color = get(elem, 'w:color')
if color is not None:
vals['border_%s_color'] = simple_color(color)
style = get(elem, 'w:val')
if style is not None:
vals['border_%s_style'] = LINE_STYLES.get(style, 'solid')
space = get(elem, 'w:space')
if space is not None:
try:
vals['padding_%s'] = float(space)
except (ValueError, TypeError):
pass
sz = get(elem, 'w:space')
if sz is not None:
# we dont care about art borders (they are only used for page borders)
try:
vals['border_%s_width'] = min(96, max(2, float(sz))) * 8
except (ValueError, TypeError):
pass
vals.update({k % edge:v for k, v in tvals.iteritems()})
for key, val in vals.iteritems():
setattr(dest, key % edge, val)
for border in XPath('./w:pBdr')(parent):
for edge in ('left', 'top', 'right', 'bottom'):
for elem in XPath('./w:%s' % edge):
color = get(elem, 'w:color')
if color is not None:
vals['border_%s_color' % edge] = simple_color(color)
style = get(elem, 'w:val')
if style is not None:
vals['border_%s_style' % edge] = LINE_STYLES.get(style, 'solid')
space = get(elem, 'w:space')
if space is not None:
try:
vals['padding_%s' % edge] = float(space)
except (ValueError, TypeError):
pass
sz = get(elem, 'w:sz')
if sz is not None:
# we dont care about art borders (they are only used for page borders)
try:
vals['border_%s_width' % edge] = min(96, max(2, float(sz))) / 8
except (ValueError, TypeError):
pass
return all_attrs
for key, val in vals.iteritems():
setattr(dest, key, val)
def read_indent(parent, dest):
padding_left = padding_right = text_indent = inherit
@ -116,12 +117,11 @@ def read_indent(parent, dest):
ti = (simple_float(hc, 0.01) if hc is not None else simple_float(h, 0.05) if h is not None else
simple_float(flc, 0.01) if flc is not None else simple_float(fl, 0.05) if fl is not None else None)
if ti is not None:
text_indent = '%.3f' % (ti, 'em' if hc is not None or (h is None and flc is not None) else 'pt')
text_indent = '%.3f%s' % (ti, 'em' if hc is not None or (h is None and flc is not None) else 'pt')
setattr(dest, 'padding_left', padding_left)
setattr(dest, 'padding_right', padding_right)
setattr(dest, 'margin_left', padding_left)
setattr(dest, 'margin_right', padding_right)
setattr(dest, 'text_indent', text_indent)
return {'padding_left', 'padding_right', 'text_indent'}
def read_justification(parent, dest):
ans = inherit
@ -134,7 +134,6 @@ def read_justification(parent, dest):
if val in {'left', 'center', 'right',}:
ans = val
setattr(dest, 'text_align', ans)
return {'text_align'}
def read_spacing(parent, dest):
padding_top = padding_bottom = line_height = inherit
@ -154,10 +153,9 @@ def read_spacing(parent, dest):
lh = simple_float(l, 0.05) if lr in {'exactly', 'atLeast'} else simple_float(l, 1/240.0)
line_height = '%.3f%s' % (lh, 'pt' if lr in {'exactly', 'atLeast'} else '')
setattr(dest, 'padding_top', padding_top)
setattr(dest, 'padding_bottom', padding_bottom)
setattr(dest, 'margin_top', padding_top)
setattr(dest, 'margin_bottom', padding_bottom)
setattr(dest, 'line_height', line_height)
return {'padding_top', 'padding_bottom', 'line_height'}
def read_direction(parent, dest):
ans = inherit
@ -168,34 +166,187 @@ def read_direction(parent, dest):
if 'rl' in val.lower():
ans = 'rtl'
setattr(dest, 'direction', ans)
return {'direction'}
def read_shd(parent, dest):
ans = inherit
for shd in XPath('./w:shd[@w:fill]')(parent):
val = get(shd, 'w:fill')
if val:
ans = simple_color(val, auto='transparent')
setattr(dest, 'background_color', ans)
class ParagraphStyle(object):
border_path = XPath('./w:pBdr')
all_properties = (
'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
# Border margins padding
'border_left_width', 'border_left_style', 'border_left_color', 'padding_left',
'border_top_width', 'border_top_style', 'border_top_color', 'padding_top',
'border_right_width', 'border_right_style', 'border_right_color', 'padding_right',
'border_bottom_width', 'border_bottom_style', 'border_bottom_color', 'padding_bottom',
'margin_left', 'margin_top', 'margin_right', 'margin_bottom',
# Misc.
'text_indent', 'text_align', 'line_height', 'direction', 'background_color',
)
def __init__(self, pPr):
self.all_properties = set()
for p in (
'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN',
'bidi', 'contextualSpacing', 'keepLines', 'keepNext',
'mirrorIndents', 'pageBreakBefore', 'snapToGrid',
'suppressLineNumbers', 'suppressOverlap', 'topLinePunct',
'widowControl', 'wordWrap',
'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
):
self.all_properties.add(p)
setattr(p, binary_property(pPr, p))
setattr(self, p, binary_property(pPr, p))
for border in self.border_path(pPr):
self.all_properties |= read_border(border, self)
self.all_properties |= read_indent(pPr, self)
self.all_properties |= read_justification(pPr, self)
self.all_properties |= read_spacing(pPr, self)
self.all_properties |= read_direction(pPr, self)
for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd'):
f = globals()['read_%s' % x]
f(pPr, self)
# TODO: numPr and outlineLvl
def update(self, other):
for prop in self.all_properties:
nval = getattr(other, prop)
if nval is not inherit:
setattr(self, prop, nval)
# }}}
# Character styles {{{
def read_text_border(parent, dest):
border_color = border_style = border_width = padding = inherit
elems = XPath('./w:bdr')(parent)
if elems:
border_color = simple_color('auto')
border_style = 'solid'
border_width = 1
for elem in elems:
color = get(elem, 'w:color')
if color is not None:
border_color = simple_color(color)
style = get(elem, 'w:val')
if style is not None:
border_style = LINE_STYLES.get(style, 'solid')
space = get(elem, 'w:space')
if space is not None:
try:
padding = float(space)
except (ValueError, TypeError):
pass
sz = get(elem, 'w:sz')
if sz is not None:
# we dont care about art borders (they are only used for page borders)
try:
border_width = min(96, max(2, float(sz))) / 8
except (ValueError, TypeError):
pass
setattr(dest, 'border_color', border_color)
setattr(dest, 'border_style', border_style)
setattr(dest, 'border_width', border_width)
setattr(dest, 'padding', padding)
def read_color(parent, dest):
ans = inherit
for col in XPath('./w:color[@w:val]')(parent):
val = get(col, 'w:val')
if not val:
continue
ans = simple_color(val)
setattr(dest, 'color', ans)
def read_highlight(parent, dest):
ans = inherit
for col in XPath('./w:highlight[@w:val]')(parent):
val = get(col, 'w:val')
if not val:
continue
if not val or val == 'none':
val = 'transparent'
ans = val
setattr(dest, 'highlight', ans)
def read_lang(parent, dest):
ans = inherit
for col in XPath('./w:lang[@w:val]')(parent):
val = get(col, 'w:val')
if not val:
continue
try:
code = int(val, 16)
except (ValueError, TypeError):
ans = val
else:
from calibre.ebooks.docx.lcid import lcid
val = lcid.get(code, None)
if val:
ans = val
setattr(dest, 'lang', ans)
def read_letter_spacing(parent, dest):
ans = inherit
for col in XPath('./w:spacing[@w:val]')(parent):
val = simple_float(get(col, 'w:val'), 0.05)
if val:
ans = val
setattr(dest, 'letter_spacing', ans)
def read_sz(parent, dest):
ans = inherit
for col in XPath('./w:sz[@w:val]')(parent):
val = simple_float(get(col, 'w:val'), 0.5)
if val:
ans = val
setattr(dest, 'font_size', ans)
def read_underline(parent, dest):
ans = inherit
for col in XPath('./w:u[@w:val]')(parent):
val = get(col, 'w:val')
if val:
ans = 'underline'
setattr(dest, 'text_decoration', ans)
def read_vert_align(parent, dest):
ans = inherit
for col in XPath('./w:vertAlign[@w:val]')(parent):
val = get(col, 'w:val')
if val and val in {'baseline', 'subscript', 'superscript'}:
ans = val
setattr(dest, 'vert_align', ans)
class RunStyle(object):
all_properties = (
'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow',
'smallCaps', 'strike', 'vanish',
'border_color', 'border_style', 'border_width', 'padding', 'color', 'highlight', 'background-color',
'letter_spacing', 'font_size', 'text_decoration', 'vert_align',
)
def __init__(self, rPr):
for p in (
'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow',
'smallCaps', 'strike', 'vanish',
):
setattr(self, p, binary_property(rPr, p))
for x in ('text_border', 'color', 'highlight', 'shd', 'letter_spacing', 'sz', 'underline', 'vert_align'):
f = globals()['read_%s' % x]
f(rPr, self)
def update(self, other):
for prop in self.all_properties:
nval = getattr(other, prop)
if nval is not inherit:
setattr(self, prop, nval)
# }}}
class Style(object):
@ -218,6 +369,24 @@ class Style(object):
if self.style_type not in {'paragraph', 'character'}:
self.link = None
self.paragraph_style = self.character_style = None
if self.style_type in {'paragraph', 'character'}:
if self.style_type == 'paragraph':
for pPr in XPath('./w:pPr')(elem):
ps = ParagraphStyle(pPr)
if self.paragraph_style is None:
self.paragraph_style = ps
else:
self.paragraph_style.update(ps)
for rPr in XPath('./w:rPr')(elem):
rs = RunStyle(rPr)
if self.character_style is None:
self.character_style = rs
else:
self.character_style.update(rs)
class Styles(object):
@ -259,5 +428,3 @@ class Styles(object):
# TODO: Document defaults (docDefaults)