diff --git a/src/calibre/ebooks/docx/lcid.py b/src/calibre/ebooks/docx/lcid.py new file mode 100644 index 0000000000..293212ab8b --- /dev/null +++ b/src/calibre/ebooks/docx/lcid.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +lcid = { + 1078: 'af', # Afrikaans - South Africa + 1052: 'sq', # Albanian - Albania + 1118: 'am', # Amharic - Ethiopia + 1025: 'ar', # Arabic - Saudi Arabia + 5121: 'ar', # Arabic - Algeria + 15361: 'ar', # Arabic - Bahrain + 3073: 'ar', # Arabic - Egypt + 2049: 'ar', # Arabic - Iraq + 11265: 'ar', # Arabic - Jordan + 13313: 'ar', # Arabic - Kuwait + 12289: 'ar', # Arabic - Lebanon + 4097: 'ar', # Arabic - Libya + 6145: 'ar', # Arabic - Morocco + 8193: 'ar', # Arabic - Oman + 16385: 'ar', # Arabic - Qatar + 10241: 'ar', # Arabic - Syria + 7169: 'ar', # Arabic - Tunisia + 14337: 'ar', # Arabic - U.A.E. 
+ 9217: 'ar', # Arabic - Yemen + 1067: 'hy', # Armenian - Armenia + 1101: 'as', # Assamese + 2092: 'az', # Azeri (Cyrillic) + 1068: 'az', # Azeri (Latin) + 1069: 'eu', # Basque + 1059: 'be', # Belarusian + 1093: 'bn', # Bengali (India) + 2117: 'bn', # Bengali (Bangladesh) + 5146: 'bs', # Bosnian (Bosnia/Herzegovina) + 1026: 'bg', # Bulgarian + 1109: 'my', # Burmese + 1027: 'ca', # Catalan + 1116: 'chr', # Cherokee - United States + 2052: 'zh', # Chinese - People's Republic of China + 4100: 'zh', # Chinese - Singapore + 1028: 'zh', # Chinese - Taiwan + 3076: 'zh', # Chinese - Hong Kong SAR + 5124: 'zh', # Chinese - Macao SAR + 1050: 'hr', # Croatian + 4122: 'hr', # Croatian (Bosnia/Herzegovina) + 1029: 'cs', # Czech + 1030: 'da', # Danish + 1125: 'dv', # Divehi + 1043: 'nl', # Dutch - Netherlands + 2067: 'nl', # Dutch - Belgium + 1126: 'bin', # Edo + 1033: 'en', # English - United States + 2057: 'en', # English - United Kingdom + 3081: 'en', # English - Australia + 10249: 'en', # English - Belize + 4105: 'en', # English - Canada + 9225: 'en', # English - Caribbean + 15369: 'en', # English - Hong Kong SAR + 16393: 'en', # English - India + 14345: 'en', # English - Indonesia + 6153: 'en', # English - Ireland + 8201: 'en', # English - Jamaica + 17417: 'en', # English - Malaysia + 5129: 'en', # English - New Zealand + 13321: 'en', # English - Philippines + 18441: 'en', # English - Singapore + 7177: 'en', # English - South Africa + 11273: 'en', # English - Trinidad + 12297: 'en', # English - Zimbabwe + 1061: 'et', # Estonian + 1080: 'fo', # Faroese + 1065: None, # TODO: Farsi + 1124: 'fil', # Filipino + 1035: 'fi', # Finnish + 1036: 'fr', # French - France + 2060: 'fr', # French - Belgium + 11276: 'fr', # French - Cameroon + 3084: 'fr', # French - Canada + 9228: 'fr', # French - Democratic Rep. 
of Congo + 12300: 'fr', # French - Cote d'Ivoire + 15372: 'fr', # French - Haiti + 5132: 'fr', # French - Luxembourg + 13324: 'fr', # French - Mali + 6156: 'fr', # French - Monaco + 14348: 'fr', # French - Morocco + 58380: 'fr', # French - North Africa + 8204: 'fr', # French - Reunion + 10252: 'fr', # French - Senegal + 4108: 'fr', # French - Switzerland + 7180: 'fr', # French - West Indies + 1122: 'fy', # Frisian - Netherlands + 1127: None, # TODO: Fulfulde - Nigeria + 1071: 'mk', # FYRO Macedonian + 2108: 'ga', # Gaelic (Ireland) + 1084: 'gd', # Gaelic (Scotland) + 1110: 'gl', # Galician + 1079: 'ka', # Georgian + 1031: 'de', # German - Germany + 3079: 'de', # German - Austria + 5127: 'de', # German - Liechtenstein + 4103: 'de', # German - Luxembourg + 2055: 'de', # German - Switzerland + 1032: 'el', # Greek + 1140: 'gn', # Guarani - Paraguay + 1095: 'gu', # Gujarati + 1128: 'ha', # Hausa - Nigeria + 1141: 'haw', # Hawaiian - United States + 1037: 'he', # Hebrew + 1081: 'hi', # Hindi + 1038: 'hu', # Hungarian + 1129: None, # TODO: Ibibio - Nigeria + 1039: 'is', # Icelandic + 1136: 'ig', # Igbo - Nigeria + 1057: 'id', # Indonesian + 1117: 'iu', # Inuktitut + 1040: 'it', # Italian - Italy + 2064: 'it', # Italian - Switzerland + 1041: 'ja', # Japanese + 1099: 'kn', # Kannada + 1137: 'kr', # Kanuri - Nigeria + 2144: 'ks', # Kashmiri + 1120: 'ks', # Kashmiri (Arabic) + 1087: 'kk', # Kazakh + 1107: 'km', # Khmer + 1111: 'kok', # Konkani + 1042: 'ko', # Korean + 1088: 'ky', # Kyrgyz (Cyrillic) + 1108: 'lo', # Lao + 1142: 'la', # Latin + 1062: 'lv', # Latvian + 1063: 'lt', # Lithuanian + 1086: 'ms', # Malay - Malaysia + 2110: 'ms', # Malay - Brunei Darussalam + 1100: 'ml', # Malayalam + 1082: 'mt', # Maltese + 1112: 'mni', # Manipuri + 1153: 'mi', # Maori - New Zealand + 1102: 'mr', # Marathi + 1104: 'mn', # Mongolian (Cyrillic) + 2128: 'mn', # Mongolian (Mongolian) + 1121: 'ne', # Nepali + 2145: 'ne', # Nepali - India + 1044: 'no', # Norwegian (Bokmål) + 2068: 'no', # 
Norwegian (Nynorsk) + 1096: 'or', # Oriya + 1138: 'om', # Oromo + 1145: 'pap', # Papiamentu + 1123: 'ps', # Pashto + 1045: 'pl', # Polish + 1046: 'pt', # Portuguese - Brazil + 2070: 'pt', # Portuguese - Portugal + 1094: 'pa', # Punjabi + 2118: 'pa', # Punjabi (Pakistan) + 1131: 'qu', # Quecha - Bolivia + 2155: 'qu', # Quecha - Ecuador + 3179: 'qu', # Quecha - Peru + 1047: 'rm', # Rhaeto-Romanic + 1048: 'ro', # Romanian + 2072: 'ro', # Romanian - Moldava + 1049: 'ru', # Russian + 2073: 'ru', # Russian - Moldava + 1083: 'se', # Sami (Lappish) + 1103: 'sa', # Sanskrit + 1132: 'nso', # Sepedi + 3098: 'sr', # Serbian (Cyrillic) + 2074: 'sr', # Serbian (Latin) + 1113: 'sd', # Sindhi - India + 2137: 'sd', # Sindhi - Pakistan + 1115: 'si', # Sinhalese - Sri Lanka + 1051: 'sk', # Slovak + 1060: 'sl', # Slovenian + 1143: 'so', # Somali + 1070: 'wen', # Sorbian + 3082: 'es', # Spanish - Spain (Modern Sort) + 1034: 'es', # Spanish - Spain (Traditional Sort) + 11274: 'es', # Spanish - Argentina + 16394: 'es', # Spanish - Bolivia + 13322: 'es', # Spanish - Chile + 9226: 'es', # Spanish - Colombia + 5130: 'es', # Spanish - Costa Rica + 7178: 'es', # Spanish - Dominican Republic + 12298: 'es', # Spanish - Ecuador + 17418: 'es', # Spanish - El Salvador + 4106: 'es', # Spanish - Guatemala + 18442: 'es', # Spanish - Honduras + 58378: 'es', # Spanish - Latin America + 2058: 'es', # Spanish - Mexico + 19466: 'es', # Spanish - Nicaragua + 6154: 'es', # Spanish - Panama + 15370: 'es', # Spanish - Paraguay + 10250: 'es', # Spanish - Peru + 20490: 'es', # Spanish - Puerto Rico + 21514: 'es', # Spanish - United States + 14346: 'es', # Spanish - Uruguay + 8202: 'es', # Spanish - Venezuela + 1072: None, # TODO: Sutu + 1089: 'sw', # Swahili + 1053: 'sv', # Swedish + 2077: 'sv', # Swedish - Finland + 1114: 'syr', # Syriac + 1064: 'tg', # Tajik + 1119: None, # TODO: Tamazight (Arabic) + 2143: None, # TODO: Tamazight (Latin) + 1097: 'ta', # Tamil + 1092: 'tt', # Tatar + 1098: 'te', # Telugu + 
1054: 'th', # Thai + 2129: 'bo', # Tibetan - Bhutan + 1105: 'bo', # Tibetan - People's Republic of China + 2163: 'ti', # Tigrigna - Eritrea + 1139: 'ti', # Tigrigna - Ethiopia + 1073: 'ts', # Tsonga + 1074: 'tn', # Tswana + 1055: 'tr', # Turkish + 1090: 'tk', # Turkmen + 1152: 'ug', # Uighur - China + 1058: 'uk', # Ukrainian + 1056: 'ur', # Urdu + 2080: 'ur', # Urdu - India + 2115: 'uz', # Uzbek (Cyrillic) + 1091: 'uz', # Uzbek (Latin) + 1075: 've', # Venda + 1066: 'vi', # Vietnamese + 1106: 'cy', # Welsh + 1076: 'xh', # Xhosa + 1144: 'ii', # Yi + 1085: 'yi', # Yiddish + 1130: 'yo', # Yoruba + 1077: 'zu' # Zulu +} diff --git a/src/calibre/ebooks/docx/styles.py b/src/calibre/ebooks/docx/styles.py index 5113f4c551..e27602c33d 100644 --- a/src/calibre/ebooks/docx/styles.py +++ b/src/calibre/ebooks/docx/styles.py @@ -15,15 +15,15 @@ class Inherit: inherit = Inherit() def binary_property(parent, name): - vals = XPath('./w:%s') + vals = XPath('./w:%s' % name)(parent) if not vals: return inherit val = get(vals[0], 'w:val', 'on') return True if val in {'on', '1', 'true'} else False -def simple_color(col): +def simple_color(col, auto='black'): if not col or col == 'auto' or len(col) != 6: - return 'black' + return auto return '#'+col def simple_float(val, mult=1.0): @@ -66,37 +66,38 @@ LINE_STYLES = { # {{{ 'triple': 'double', } # }}} -def read_border(border, dest): - all_attrs = set() +def read_border(parent, dest): + tvals = {'padding_%s':inherit, 'border_%s_width':inherit, + 'border_%s_style':inherit, 'border_%s_color':inherit} + vals = {} for edge in ('left', 'top', 'right', 'bottom'): - vals = {'padding_%s':inherit, 'border_%s_width':inherit, - 'border_%s_style':inherit, 'border_%s_color':inherit} - all_attrs |= {key % edge for key in vals} - for elem in XPath('./w:%s' % edge): - color = get(elem, 'w:color') - if color is not None: - vals['border_%s_color'] = simple_color(color) - style = get(elem, 'w:val') - if style is not None: - vals['border_%s_style'] = 
LINE_STYLES.get(style, 'solid') - space = get(elem, 'w:space') - if space is not None: - try: - vals['padding_%s'] = float(space) - except (ValueError, TypeError): - pass - sz = get(elem, 'w:space') - if sz is not None: - # we dont care about art borders (they are only used for page borders) - try: - vals['border_%s_width'] = min(96, max(2, float(sz))) * 8 - except (ValueError, TypeError): - pass + vals.update({k % edge:v for k, v in tvals.iteritems()}) - for key, val in vals.iteritems(): - setattr(dest, key % edge, val) + for border in XPath('./w:pBdr')(parent): + for edge in ('left', 'top', 'right', 'bottom'): + for elem in XPath('./w:%s' % edge): + color = get(elem, 'w:color') + if color is not None: + vals['border_%s_color' % edge] = simple_color(color) + style = get(elem, 'w:val') + if style is not None: + vals['border_%s_style' % edge] = LINE_STYLES.get(style, 'solid') + space = get(elem, 'w:space') + if space is not None: + try: + vals['padding_%s' % edge] = float(space) + except (ValueError, TypeError): + pass + sz = get(elem, 'w:sz') + if sz is not None: + # we dont care about art borders (they are only used for page borders) + try: + vals['border_%s_width' % edge] = min(96, max(2, float(sz))) / 8 + except (ValueError, TypeError): + pass - return all_attrs + for key, val in vals.iteritems(): + setattr(dest, key, val) def read_indent(parent, dest): padding_left = padding_right = text_indent = inherit @@ -116,12 +117,11 @@ def read_indent(parent, dest): ti = (simple_float(hc, 0.01) if hc is not None else simple_float(h, 0.05) if h is not None else simple_float(flc, 0.01) if flc is not None else simple_float(fl, 0.05) if fl is not None else None) if ti is not None: - text_indent = '%.3f' % (ti, 'em' if hc is not None or (h is None and flc is not None) else 'pt') + text_indent = '%.3f%s' % (ti, 'em' if hc is not None or (h is None and flc is not None) else 'pt') - setattr(dest, 'padding_left', padding_left) - setattr(dest, 'padding_right', padding_right) + 
setattr(dest, 'margin_left', padding_left) + setattr(dest, 'margin_right', padding_right) setattr(dest, 'text_indent', text_indent) - return {'padding_left', 'padding_right', 'text_indent'} def read_justification(parent, dest): ans = inherit @@ -134,7 +134,6 @@ def read_justification(parent, dest): if val in {'left', 'center', 'right',}: ans = val setattr(dest, 'text_align', ans) - return {'text_align'} def read_spacing(parent, dest): padding_top = padding_bottom = line_height = inherit @@ -154,10 +153,9 @@ def read_spacing(parent, dest): lh = simple_float(l, 0.05) if lr in {'exactly', 'atLeast'} else simple_float(l, 1/240.0) line_height = '%.3f%s' % (lh, 'pt' if lr in {'exactly', 'atLeast'} else '') - setattr(dest, 'padding_top', padding_top) - setattr(dest, 'padding_bottom', padding_bottom) + setattr(dest, 'margin_top', padding_top) + setattr(dest, 'margin_bottom', padding_bottom) setattr(dest, 'line_height', line_height) - return {'padding_top', 'padding_bottom', 'line_height'} def read_direction(parent, dest): ans = inherit @@ -168,34 +166,187 @@ def read_direction(parent, dest): if 'rl' in val.lower(): ans = 'rtl' setattr(dest, 'direction', ans) - return {'direction'} +def read_shd(parent, dest): + ans = inherit + for shd in XPath('./w:shd[@w:fill]')(parent): + val = get(shd, 'w:fill') + if val: + ans = simple_color(val, auto='transparent') + setattr(dest, 'background_color', ans) class ParagraphStyle(object): - border_path = XPath('./w:pBdr') + all_properties = ( + 'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi', + 'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents', + 'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers', + 'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap', + + # Border margins padding + 'border_left_width', 'border_left_style', 'border_left_color', 'padding_left', + 'border_top_width', 'border_top_style', 'border_top_color', 'padding_top', + 'border_right_width', 'border_right_style', 'border_right_color', 
'padding_right', + 'border_bottom_width', 'border_bottom_style', 'border_bottom_color', 'padding_bottom', + 'margin_left', 'margin_top', 'margin_right', 'margin_bottom', + + # Misc. + 'text_indent', 'text_align', 'line_height', 'direction', 'background_color', + ) def __init__(self, pPr): - self.all_properties = set() for p in ( - 'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', - 'bidi', 'contextualSpacing', 'keepLines', 'keepNext', - 'mirrorIndents', 'pageBreakBefore', 'snapToGrid', - 'suppressLineNumbers', 'suppressOverlap', 'topLinePunct', - 'widowControl', 'wordWrap', + 'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi', + 'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents', + 'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers', + 'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap', ): - self.all_properties.add(p) - setattr(p, binary_property(pPr, p)) + setattr(self, p, binary_property(pPr, p)) - for border in self.border_path(pPr): - self.all_properties |= read_border(border, self) - - self.all_properties |= read_indent(pPr, self) - self.all_properties |= read_justification(pPr, self) - self.all_properties |= read_spacing(pPr, self) - self.all_properties |= read_direction(pPr, self) + for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd'): + f = globals()['read_%s' % x] + f(pPr, self) # TODO: numPr and outlineLvl + + def update(self, other): + for prop in self.all_properties: + nval = getattr(other, prop) + if nval is not inherit: + setattr(self, prop, nval) + +# }}} + +# Character styles {{{ +def read_text_border(parent, dest): + border_color = border_style = border_width = padding = inherit + elems = XPath('./w:bdr')(parent) + if elems: + border_color = simple_color('auto') + border_style = 'solid' + border_width = 1 + for elem in elems: + color = get(elem, 'w:color') + if color is not None: + border_color = simple_color(color) + style = get(elem, 'w:val') + if style is not None: + border_style = 
LINE_STYLES.get(style, 'solid') + space = get(elem, 'w:space') + if space is not None: + try: + padding = float(space) + except (ValueError, TypeError): + pass + sz = get(elem, 'w:sz') + if sz is not None: + # we dont care about art borders (they are only used for page borders) + try: + border_width = min(96, max(2, float(sz))) / 8 + except (ValueError, TypeError): + pass + + setattr(dest, 'border_color', border_color) + setattr(dest, 'border_style', border_style) + setattr(dest, 'border_width', border_width) + setattr(dest, 'padding', padding) + +def read_color(parent, dest): + ans = inherit + for col in XPath('./w:color[@w:val]')(parent): + val = get(col, 'w:val') + if not val: + continue + ans = simple_color(val) + setattr(dest, 'color', ans) + +def read_highlight(parent, dest): + ans = inherit + for col in XPath('./w:highlight[@w:val]')(parent): + val = get(col, 'w:val') + if not val: + continue + if not val or val == 'none': + val = 'transparent' + ans = val + setattr(dest, 'highlight', ans) + +def read_lang(parent, dest): + ans = inherit + for col in XPath('./w:lang[@w:val]')(parent): + val = get(col, 'w:val') + if not val: + continue + try: + code = int(val, 16) + except (ValueError, TypeError): + ans = val + else: + from calibre.ebooks.docx.lcid import lcid + val = lcid.get(code, None) + if val: + ans = val + setattr(dest, 'lang', ans) + +def read_letter_spacing(parent, dest): + ans = inherit + for col in XPath('./w:spacing[@w:val]')(parent): + val = simple_float(get(col, 'w:val'), 0.05) + if val: + ans = val + setattr(dest, 'letter_spacing', ans) + +def read_sz(parent, dest): + ans = inherit + for col in XPath('./w:sz[@w:val]')(parent): + val = simple_float(get(col, 'w:val'), 0.5) + if val: + ans = val + setattr(dest, 'font_size', ans) + +def read_underline(parent, dest): + ans = inherit + for col in XPath('./w:u[@w:val]')(parent): + val = get(col, 'w:val') + if val: + ans = 'underline' + setattr(dest, 'text_decoration', ans) + +def read_vert_align(parent, 
dest): + ans = inherit + for col in XPath('./w:vertAlign[@w:val]')(parent): + val = get(col, 'w:val') + if val and val in {'baseline', 'subscript', 'superscript'}: + ans = val + setattr(dest, 'vert_align', ans) + + +class RunStyle(object): + + all_properties = ( + 'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow', + 'smallCaps', 'strike', 'vanish', + + 'border_color', 'border_style', 'border_width', 'padding', 'color', 'highlight', 'background-color', + 'letter_spacing', 'font_size', 'text_decoration', 'vert_align', + ) + + def __init__(self, rPr): + for p in ( + 'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow', + 'smallCaps', 'strike', 'vanish', + ): + setattr(self, p, binary_property(rPr, p)) + + for x in ('text_border', 'color', 'highlight', 'shd', 'letter_spacing', 'sz', 'underline', 'vert_align'): + f = globals()['read_%s' % x] + f(rPr, self) + + def update(self, other): + for prop in self.all_properties: + nval = getattr(other, prop) + if nval is not inherit: + setattr(self, prop, nval) # }}} class Style(object): @@ -218,6 +369,24 @@ class Style(object): if self.style_type not in {'paragraph', 'character'}: self.link = None + self.paragraph_style = self.character_style = None + + if self.style_type in {'paragraph', 'character'}: + if self.style_type == 'paragraph': + for pPr in XPath('./w:pPr')(elem): + ps = ParagraphStyle(pPr) + if self.paragraph_style is None: + self.paragraph_style = ps + else: + self.paragraph_style.update(ps) + + for rPr in XPath('./w:rPr')(elem): + rs = RunStyle(rPr) + if self.character_style is None: + self.character_style = rs + else: + self.character_style.update(rs) + class Styles(object): @@ -259,5 +428,3 @@ class Styles(object): # TODO: Document defaults (docDefaults) - -