diff --git a/src/calibre/ebooks/htmlsymbols.py b/src/calibre/ebooks/htmlsymbols.py deleted file mode 100644 index d46e4c707a..0000000000 --- a/src/calibre/ebooks/htmlsymbols.py +++ /dev/null @@ -1,312 +0,0 @@ -# -*- coding: utf-8 -*- -''' -Maping of non-acii symbols and their corresponding html entity number and name -''' -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' - -# http://www.w3schools.com/tags/ref_symbols.asp -HTML_SYMBOLS = { - # Math Symbols - u'∀' : ['∀', '∀'], # for all - u'∂' : ['∂', '∂'], # part - u'∃' : ['∃', '&exists;'], # exists - u'∅' : ['∅', '∅'], # empty - u'∇' : ['∇', '∇'], # nabla - u'∈' : ['∈', '∈'], # isin - u'∉' : ['∉', '∉'], # notin - u'∋' : ['∋', '∋'], # ni - u'∏' : ['∏', '∏'], # prod - u'∑' : ['∑', '∑'], # sum - u'−' : ['−', '−'], # minus - u'∗' : ['∗', '∗'], # lowast - u'√' : ['√', '√'], # square root - u'∝' : ['∝', '∝'], # proportional to - u'∞' : ['∞', '∞'], # infinity - u'∠' : ['∠', '∠'], # angle - u'∧' : ['∧', '∧'], # and - u'∨' : ['∨', '∨'], # or - u'∩' : ['∩', '∩'], # cap - u'∪' : ['∪', '∪'], # cup - u'∫' : ['∫', '∫'], # integral - u'∴' : ['∴', '∴'], # therefore - u'∼' : ['∼', '∼'], # simular to - u'≅' : ['≅', '≅'], # approximately equal - u'≈' : ['≈', '≈'], # almost equal - u'≠' : ['≠', '≠'], # not equal - u'≡' : ['≡', '≡'], # equivalent - u'≤' : ['≤', '≤'], # less or equal - u'≥' : ['≥', '≥'], # greater or equal - u'⊂' : ['⊂', '⊂'], # subset of - u'⊃' : ['⊃', '⊃'], # superset of - u'⊄' : ['⊄', '⊄'], # not subset of - u'⊆' : ['⊆', '⊆'], # subset or equal - u'⊇' : ['⊇', '⊇'], # superset or equal - u'⊕' : ['⊕', '⊕'], # circled plus - u'⊗' : ['⊗', '⊗'], # cirled times - u'⊥' : ['⊥', '⊥'], # perpendicular - u'⋅' : ['⋅', '⋅'], # dot operator - # Greek Letters - u'Α' : ['Α', 'Α'], # Alpha - u'Β' : ['Β', 'Β'], # Beta - u'Γ' : ['Γ', 'Γ'], # Gamma - u'Δ' : ['Δ', 'Δ'], # Delta - u'Ε' : ['Ε', 'Ε'], # Epsilon - u'Ζ' : ['Ζ', 'Ζ'], # Zeta - u'Η' : ['Η', 'Η'], # Eta - u'Θ' : ['Θ', 'Θ'], # Theta - u'Ι' : ['Ι', 'Ι'], # Iota - u'Κ' : ['Κ', 'Κ'], # Kappa - u'Λ' : ['Λ', 'Λ'], # Lambda - u'Μ' : ['Μ', 'Μ'], # Mu - u'Ν' : ['Ν', 'Ν'], # Nu - u'Ξ' : ['Ξ', 'Ξ'], # Xi - u'Ο' : ['Ο', 'Ο'], # Omicron - u'Π' : ['Π', 'Π'], # Pi - u'Ρ' : ['Ρ', 'Ρ'], # Rho - u'Σ' : ['Σ', 'Σ'], # Sigma - u'Τ' : ['Τ', 'Τ'], # Tau - u'Υ' : ['Υ', 'Υ'], # Upsilon - u'Φ' : ['Φ', 'Φ'], # Phi - u'Χ' : ['Χ', 'Χ'], # Chi - u'Ψ' : ['Ψ', 'Ψ'], # Psi - u'ω' : ['ω', 'ω'], # omega - u'ϑ' : ['ϑ', 'ϑ'], # theta symbol - u'ϒ' : ['ϒ', 'ϒ'], # upsilon symbol - u'ϖ' : ['ϖ', 'ϖ'], # pi symbol - # Other - u'Œ' : ['Œ', 'Œ'], # capital ligature OE - u'œ' : ['œ', 'œ'], # small ligature oe - u'Š' : ['Š', 'Š'], # capital S with caron - u'š' : ['š', 'š'], # small S with caron - u'Ÿ' : ['Ÿ', 'Ÿ'], # capital Y with diaeres - u'ƒ' : ['ƒ', 'ƒ'], # f with hook - u'ˆ' : ['ˆ', 'ˆ'], # modifier letter circumflex accent - u'˜' : ['˜', '˜'], # small tilde - u'–' : ['–', '–'], # en dash - u'—' : ['—', '—'], # em dash - u'‘' : ['‘', '‘'], # left single quotation mark - u'’' : ['’', '’'], # right single quotation mark - u'‚' : ['‚', '‚'], # single low-9 quotation mark - u'“' : ['“', '“'], # left double quotation mark - u'”' : ['”', '”'], # right double quotation mark - u'„' : ['„', '„'], # double low-9 quotation mark - u'†' : ['†', '†'], # dagger - u'‡' : ['‡', '‡'], # double dagger - u'•' : ['•', '•'], # bullet - u'…' : ['…', '…'], # horizontal ellipsis - u'‰' : ['‰', '‰'], # per mille - u'′' : ['′', '′'], # minutes - u'″' : ['″', '″'], # seconds - u'‹' : ['‹', '‹'], # single left angle quotation - u'›' : ['›', '›'], # single right angle quotation - u'‾' : ['‾', '‾'], # overline - u'€' : ['€', '€'], # euro - u'™' : ['™', '™'], # trademark - u'←' : ['←', '←'], # left arrow - u'↑' : ['↑', '↑'], # up arrow - u'→' : ['→', '→'], # right arrow - u'↓' : ['↓', '↓'], # down arrow - u'↔' : ['↔', '↔'], # left right arrow - u'↵' : ['↵', '↵'], # carriage return arrow - u'⌈' : ['⌈', '⌈'], # left ceiling - u'⌉' : ['⌉', '⌉'], # right ceiling - u'⌊' : ['⌊', '⌊'], # left floor - u'⌋' : ['⌋', '⌋'], # right floor - u'◊' : ['◊', '◊'], # lozenge - u'♠' : ['♠', '♠'], # spade - u'♣' : ['♣', '♣'], # club - u'♥' : ['♥', '♥'], # heart - u'♦' : ['♦', '♦'], # diamond - # Extra http://www.ascii.cl/htmlcodes.htm - u' ' : [' '], # space - u'!' : ['!'], # exclamation point - u'#' : ['#'], # number sign - u'$' : ['$'], # dollar sign - u'%' : ['%'], # percent sign - u'\'' : ['''], # single quote - u'(' : ['('], # opening parenthesis - u')' : [')'], # closing parenthesis - u'*' : ['*'], # asterisk - u'+' : ['+'], # plus sign - u',' : [','], # comma - u'-' : ['-'], # minus sign - hyphen - u'.' : ['.'], # period - u'/' : ['/'], # slash - u'0' : ['0'], # zero - u'1' : ['1'], # one - u'2' : ['2'], # two - u'3' : ['3'], # three - u'4' : ['4'], # four - u'5' : ['5'], # five - u'6' : ['6'], # six - u'7' : ['7'], # seven - u'8' : ['8'], # eight - u'9' : ['9'], # nine - u':' : [':'], # colon - u';' : [';'], # semicolon - u'=' : ['='], # equal sign - u'?' : ['?'], # question mark - u'@' : ['@'], # at symbol - u'A' : ['A'], # - u'B' : ['B'], # - u'C' : ['C'], # - u'D' : ['D'], # - u'E' : ['E'], # - u'F' : ['F'], # - u'G' : ['G'], # - u'H' : ['H'], # - u'I' : ['I'], # - u'J' : ['J'], # - u'K' : ['K'], # - u'L' : ['L'], # - u'M' : ['M'], # - u'N' : ['N'], # - u'O' : ['O'], # - u'P' : ['P'], # - u'Q' : ['Q'], # - u'R' : ['R'], # - u'S' : ['S'], # - u'T' : ['T'], # - u'U' : ['U'], # - u'V' : ['V'], # - u'W' : ['W'], # - u'X' : ['X'], # - u'Y' : ['Y'], # - u'Z' : ['Z'], # - u'[' : ['['], # opening bracket - u'\\' : ['\'], # backslash - u']' : [']'], # closing bracket - u'^' : ['^'], # caret - circumflex - u'_' : ['_'], # underscore - u'`' : ['`'], # grave accent - u'a' : ['a'], # - u'b' : ['b'], # - u'c' : ['c'], # - u'd' : ['d'], # - u'e' : ['e'], # - u'f' : ['f'], # - u'g' : ['g'], # - u'h' : ['h'], # - u'i' : ['i'], # - u'j' : ['j'], # - u'k' : ['k'], # - u'l' : ['l'], # - u'm' : ['m'], # - u'n' : ['n'], # - u'o' : ['o'], # - u'p' : ['p'], # - u'q' : ['q'], # - u'r' : ['r'], # - u's' : ['s'], # - u't' : ['t'], # - u'u' : ['u'], # - u'v' : ['v'], # - u'w' : ['w'], # - u'x' : ['x'], # - u'y' : ['y'], # - u'z' : ['z'], # - u'{' : ['{'], # opening brace - u'|' : ['|'], # vertical bar - u'}' : ['}'], # closing brace - u'~' : ['~'], # equivalency sign - tilde - u'<' : ['<', '<'], # less than sign - u'>' : ['>', '>'], # greater than sign - u'¡' : ['¡', '¡'], # inverted exclamation mark - u'¢' : ['¢', '¢'], # cent sign - u'£' : ['£', '£'], # pound sign - u'¤' : ['¤', '¤'], # currency sign - u'¥' : ['¥', '¥'], # yen sign - u'¦' : ['¦', '¦'], # broken vertical bar - u'§' : ['§', '§'], # section sign - u'¨' : ['¨', '¨'], # spacing diaeresis - umlaut - u'©' : ['©', '©'], # copyright sign - u'ª' : ['ª', 'ª'], # feminine ordinal indicator - u'«' : ['«', '«'], # left double angle quotes - u'¬' : ['¬', '¬'], # not sign - u'®' : ['®', '®'], # registered trade mark sign - u'¯' : ['¯', '¯'], # spacing macron - overline - u'°' : ['°', '°'], # degree sign - u'±' : ['±', '±'], # plus-or-minus sign - u'²' : ['²', '²'], # superscript two - squared - u'³' : ['³', '³'], # superscript three - cubed - u'´' : ['´', '´'], # acute accent - spacing acute - u'µ' : ['µ', 'µ'], # micro sign - u'¶' : ['¶', '¶'], # pilcrow sign - paragraph sign - u'·' : ['·', '·'], # middle dot - Georgian comma - u'¸' : ['¸', '¸'], # spacing cedilla - u'¹' : ['¹', '¹'], # superscript one - u'º' : ['º', 'º'], # masculine ordinal indicator - u'»' : ['»', '»'], # right double angle quotes - u'¼' : ['¼', '¼'], # fraction one quarter - u'½' : ['½', '½'], # fraction one half - u'¾' : ['¾', '¾'], # fraction three quarters - u'¿' : ['¿', '¿'], # inverted question mark - u'À' : ['À', 'À'], # latin capital letter A with grave - u'Á' : ['Á', 'Á'], # latin capital letter A with acute - u'Â' : ['Â', 'Â'], # latin capital letter A with circumflex - u'Ã' : ['Ã', 'Ã'], # latin capital letter A with tilde - u'Ä' : ['Ä', 'Ä'], # latin capital letter A with diaeresis - u'Å' : ['Å', 'Å'], # latin capital letter A with ring above - u'Æ' : ['Æ', 'Æ'], # latin capital letter AE - u'Ç' : ['Ç', 'Ç'], # latin capital letter C with cedilla - u'È' : ['È', 'È'], # latin capital letter E with grave - u'É' : ['É', 'É'], # latin capital letter E with acute - u'Ê' : ['Ê', 'Ê'], # latin capital letter E with circumflex - u'Ë' : ['Ë', 'Ë'], # latin capital letter E with diaeresis - u'Ì' : ['Ì', 'Ì'], # latin capital letter I with grave - u'Í' : ['Í', 'Í'], # latin capital letter I with acute - u'Î' : ['Î', 'Î'], # latin capital letter I with circumflex - u'Ï' : ['Ï', 'Ï'], # latin capital letter I with diaeresis - u'Ð' : ['Ð', 'Ð'], # latin capital letter ETH - u'Ñ' : ['Ñ', 'Ñ'], # latin capital letter N with tilde - u'Ò' : ['Ò', 'Ò'], # latin capital letter O with grave - u'Ó' : ['Ó', 'Ó'], # latin capital letter O with acute - u'Ô' : ['Ô', 'Ô'], # latin capital letter O with circumflex - u'Õ' : ['Õ', 'Õ'], # latin capital letter O with tilde - u'Ö' : ['Ö', 'Ö'], # latin capital letter O with diaeresis - u'×' : ['×', '×'], # multiplication sign - u'Ø' : ['Ø', 'Ø'], # latin capital letter O with slash - u'Ù' : ['Ù', 'Ù'], # latin capital letter U with grave - u'Ú' : ['Ú', 'Ú'], # latin capital letter U with acute - u'Û' : ['Û', 'Û'], # latin capital letter U with circumflex - u'Ü' : ['Ü', 'Ü'], # latin capital letter U with diaeresis - u'Ý' : ['Ý', 'Ý'], # latin capital letter Y with acute - u'Þ' : ['Þ', 'Þ'], # latin capital letter THORN - u'ß' : ['ß', 'ß'], # latin small letter sharp s - ess-zed - u'à' : ['à', 'à'], # latin small letter a with grave - u'á' : ['á', 'á'], # latin small letter a with acute - u'â' : ['â', 'â'], # latin small letter a with circumflex - u'ã' : ['ã', 'ã'], # latin small letter a with tilde - u'ä' : ['ä', 'ä'], # latin small letter a with diaeresis - u'å' : ['å', 'å'], # latin small letter a with ring above - u'æ' : ['æ', 'æ'], # latin small letter ae - u'ç' : ['ç', 'ç'], # latin small letter c with cedilla - u'è' : ['è', 'è'], # latin small letter e with grave - u'é' : ['é', 'é'], # latin small letter e with acute - u'ê' : ['ê', 'ê'], # latin small letter e with circumflex - u'ë' : ['ë', 'ë'], # latin small letter e with diaeresis - u'ì' : ['ì', 'ì'], # latin small letter i with grave - u'í' : ['í', 'í'], # latin small letter i with acute - u'î' : ['î', 'î'], # latin small letter i with circumflex - u'ï' : ['ï', 'ï'], # latin small letter i with diaeresis - u'ð' : ['ð', 'ð'], # latin small letter eth - u'ñ' : ['ñ', 'ñ'], # latin small letter n with tilde - u'ò' : ['ò', 'ò'], # latin small letter o with grave - u'ó' : ['ó', 'ó'], # latin small letter o with acute - u'ô' : ['ô', 'ô'], # latin small letter o with circumflex - u'õ' : ['õ', 'õ'], # latin small letter o with tilde - u'ö' : ['ö', 'ö'], # latin small letter o with diaeresis - u'÷' : ['÷', '÷'], # division sign - u'ø' : ['ø', 'ø'], # latin small letter o with slash - u'ù' : ['ù', 'ù'], # latin small letter u with grave - u'ú' : ['ú', 'ú'], # latin small letter u with acute - u'û' : ['û', 'û'], # latin small letter u with circumflex - u'ü' : ['ü', 'ü'], # latin small letter u with diaeresis - u'ý' : ['ý', 'ý'], # latin small letter y with acute - u'þ' : ['þ', 'þ'], # latin small letter thorn - u'ÿ' : ['ÿ', 'ÿ'], # latin small letter y with diaeresis - # More - u' ' : [' '], - } - diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 0cd7da8e72..48fa1fec5e 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -10,10 +10,9 @@ __docformat__ = 'restructuredtext en' import re -from calibre.ebooks.pdb.ereader import image_name -from calibre.ebooks.htmlsymbols import HTML_SYMBOLS +from htmlentitydefs import codepoint2name -from BeautifulSoup import BeautifulSoup +from calibre.ebooks.pdb.ereader import image_name PML_HTML_RULES = [ (re.compile(r'\\p'), lambda match: '

'), @@ -71,10 +70,12 @@ def pml_to_html(pml): for rule in PML_HTML_RULES: html = rule[0].sub(rule[1], html) - for symbol in HTML_SYMBOLS.keys(): - if ord(symbol) > 128: - html = html.replace(symbol, HTML_SYMBOLS[symbol][len(HTML_SYMBOLS[symbol]) - 1]) - + # Turn special characters into entities. + cps = [ord(c) for c in set(html)] + cps = set(cps).intersection(codepoint2name.keys()).difference([60, 62]) + for cp in cps: + html = html.replace(unichr(cp), '&%s;' % codepoint2name[cp]) + return html def footnote_sidebar_to_html(id, pml): diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index 313250bcf2..7ec561a195 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -12,7 +12,6 @@ import os import re from calibre import entity_to_unicode -from calibre.ebooks.htmlsymbols import HTML_SYMBOLS from BeautifulSoup import BeautifulSoup @@ -82,10 +81,6 @@ class TxtWriter(object): return stripped def replace_html_symbols(self, content): - for symbol in HTML_SYMBOLS: - for code in HTML_SYMBOLS[symbol]: - content = content.replace(code, symbol) - for entity in set(re.findall('&.+?;', content)): mo = re.search('(%s)' % entity[1:-1], content) content = content.replace(entity, entity_to_unicode(mo))