diff --git a/src/calibre/ebooks/htmlsymbols.py b/src/calibre/ebooks/htmlsymbols.py
deleted file mode 100644
index d46e4c707a..0000000000
--- a/src/calibre/ebooks/htmlsymbols.py
+++ /dev/null
@@ -1,312 +0,0 @@
-# -*- coding: utf-8 -*-
-'''
-Maping of non-acii symbols and their corresponding html entity number and name
-'''
-__license__ = 'GPL v3'
-__copyright__ = '2009, John Schember '
-
-# http://www.w3schools.com/tags/ref_symbols.asp
-HTML_SYMBOLS = {
- # Math Symbols
- u'∀' : ['∀', '∀'], # for all
- u'∂' : ['∂', '∂'], # part
- u'∃' : ['∃', '&exists;'], # exists
- u'∅' : ['∅', '∅'], # empty
- u'∇' : ['∇', '∇'], # nabla
- u'∈' : ['∈', '∈'], # isin
- u'∉' : ['∉', '∉'], # notin
- u'∋' : ['∋', '∋'], # ni
- u'∏' : ['∏', '∏'], # prod
- u'∑' : ['∑', '∑'], # sum
- u'−' : ['−', '−'], # minus
- u'∗' : ['∗', '∗'], # lowast
- u'√' : ['√', '√'], # square root
- u'∝' : ['∝', '∝'], # proportional to
- u'∞' : ['∞', '∞'], # infinity
- u'∠' : ['∠', '∠'], # angle
- u'∧' : ['∧', '∧'], # and
- u'∨' : ['∨', '∨'], # or
- u'∩' : ['∩', '∩'], # cap
- u'∪' : ['∪', '∪'], # cup
- u'∫' : ['∫', '∫'], # integral
- u'∴' : ['∴', '∴'], # therefore
- u'∼' : ['∼', '∼'], # simular to
- u'≅' : ['≅', '≅'], # approximately equal
- u'≈' : ['≈', '≈'], # almost equal
- u'≠' : ['≠', '≠'], # not equal
- u'≡' : ['≡', '≡'], # equivalent
- u'≤' : ['≤', '≤'], # less or equal
- u'≥' : ['≥', '≥'], # greater or equal
- u'⊂' : ['⊂', '⊂'], # subset of
- u'⊃' : ['⊃', '⊃'], # superset of
- u'⊄' : ['⊄', '⊄'], # not subset of
- u'⊆' : ['⊆', '⊆'], # subset or equal
- u'⊇' : ['⊇', '⊇'], # superset or equal
- u'⊕' : ['⊕', '⊕'], # circled plus
- u'⊗' : ['⊗', '⊗'], # cirled times
- u'⊥' : ['⊥', '⊥'], # perpendicular
- u'⋅' : ['⋅', '⋅'], # dot operator
- # Greek Letters
- u'Α' : ['Α', 'Α'], # Alpha
- u'Β' : ['Β', 'Β'], # Beta
- u'Γ' : ['Γ', 'Γ'], # Gamma
- u'Δ' : ['Δ', 'Δ'], # Delta
- u'Ε' : ['Ε', 'Ε'], # Epsilon
- u'Ζ' : ['Ζ', 'Ζ'], # Zeta
- u'Η' : ['Η', 'Η'], # Eta
- u'Θ' : ['Θ', 'Θ'], # Theta
- u'Ι' : ['Ι', 'Ι'], # Iota
- u'Κ' : ['Κ', 'Κ'], # Kappa
- u'Λ' : ['Λ', 'Λ'], # Lambda
- u'Μ' : ['Μ', 'Μ'], # Mu
- u'Ν' : ['Ν', 'Ν'], # Nu
- u'Ξ' : ['Ξ', 'Ξ'], # Xi
- u'Ο' : ['Ο', 'Ο'], # Omicron
- u'Π' : ['Π', 'Π'], # Pi
- u'Ρ' : ['Ρ', 'Ρ'], # Rho
- u'Σ' : ['Σ', 'Σ'], # Sigma
- u'Τ' : ['Τ', 'Τ'], # Tau
- u'Υ' : ['Υ', 'Υ'], # Upsilon
- u'Φ' : ['Φ', 'Φ'], # Phi
- u'Χ' : ['Χ', 'Χ'], # Chi
- u'Ψ' : ['Ψ', 'Ψ'], # Psi
- u'ω' : ['ω', 'ω'], # omega
- u'ϑ' : ['ϑ', 'ϑ'], # theta symbol
- u'ϒ' : ['ϒ', 'ϒ'], # upsilon symbol
- u'ϖ' : ['ϖ', 'ϖ'], # pi symbol
- # Other
- u'Œ' : ['Œ', 'Œ'], # capital ligature OE
- u'œ' : ['œ', 'œ'], # small ligature oe
- u'Š' : ['Š', 'Š'], # capital S with caron
- u'š' : ['š', 'š'], # small S with caron
- u'Ÿ' : ['Ÿ', 'Ÿ'], # capital Y with diaeres
- u'ƒ' : ['ƒ', 'ƒ'], # f with hook
- u'ˆ' : ['ˆ', 'ˆ'], # modifier letter circumflex accent
- u'˜' : ['˜', '˜'], # small tilde
- u'–' : ['–', '–'], # en dash
- u'—' : ['—', '—'], # em dash
- u'‘' : ['‘', '‘'], # left single quotation mark
- u'’' : ['’', '’'], # right single quotation mark
- u'‚' : ['‚', '‚'], # single low-9 quotation mark
- u'“' : ['“', '“'], # left double quotation mark
- u'”' : ['”', '”'], # right double quotation mark
- u'„' : ['„', '„'], # double low-9 quotation mark
- u'†' : ['†', '†'], # dagger
- u'‡' : ['‡', '‡'], # double dagger
- u'•' : ['•', '•'], # bullet
- u'…' : ['…', '…'], # horizontal ellipsis
- u'‰' : ['‰', '‰'], # per mille
- u'′' : ['′', '′'], # minutes
- u'″' : ['″', '″'], # seconds
- u'‹' : ['‹', '‹'], # single left angle quotation
- u'›' : ['›', '›'], # single right angle quotation
- u'‾' : ['‾', '‾'], # overline
- u'€' : ['€', '€'], # euro
- u'™' : ['™', '™'], # trademark
- u'←' : ['←', '←'], # left arrow
- u'↑' : ['↑', '↑'], # up arrow
- u'→' : ['→', '→'], # right arrow
- u'↓' : ['↓', '↓'], # down arrow
- u'↔' : ['↔', '↔'], # left right arrow
- u'↵' : ['↵', '↵'], # carriage return arrow
- u'⌈' : ['⌈', '⌈'], # left ceiling
- u'⌉' : ['⌉', '⌉'], # right ceiling
- u'⌊' : ['⌊', '⌊'], # left floor
- u'⌋' : ['⌋', '⌋'], # right floor
- u'◊' : ['◊', '◊'], # lozenge
- u'♠' : ['♠', '♠'], # spade
- u'♣' : ['♣', '♣'], # club
- u'♥' : ['♥', '♥'], # heart
- u'♦' : ['♦', '♦'], # diamond
- # Extra http://www.ascii.cl/htmlcodes.htm
- u' ' : [' '], # space
- u'!' : ['!'], # exclamation point
- u'#' : ['#'], # number sign
- u'$' : ['$'], # dollar sign
- u'%' : ['%'], # percent sign
- u'\'' : ['''], # single quote
- u'(' : ['('], # opening parenthesis
- u')' : [')'], # closing parenthesis
- u'*' : ['*'], # asterisk
- u'+' : ['+'], # plus sign
- u',' : [','], # comma
- u'-' : ['-'], # minus sign - hyphen
- u'.' : ['.'], # period
- u'/' : ['/'], # slash
- u'0' : ['0'], # zero
- u'1' : ['1'], # one
- u'2' : ['2'], # two
- u'3' : ['3'], # three
- u'4' : ['4'], # four
- u'5' : ['5'], # five
- u'6' : ['6'], # six
- u'7' : ['7'], # seven
- u'8' : ['8'], # eight
- u'9' : ['9'], # nine
- u':' : [':'], # colon
- u';' : [';'], # semicolon
- u'=' : ['='], # equal sign
- u'?' : ['?'], # question mark
- u'@' : ['@'], # at symbol
- u'A' : ['A'], #
- u'B' : ['B'], #
- u'C' : ['C'], #
- u'D' : ['D'], #
- u'E' : ['E'], #
- u'F' : ['F'], #
- u'G' : ['G'], #
- u'H' : ['H'], #
- u'I' : ['I'], #
- u'J' : ['J'], #
- u'K' : ['K'], #
- u'L' : ['L'], #
- u'M' : ['M'], #
- u'N' : ['N'], #
- u'O' : ['O'], #
- u'P' : ['P'], #
- u'Q' : ['Q'], #
- u'R' : ['R'], #
- u'S' : ['S'], #
- u'T' : ['T'], #
- u'U' : ['U'], #
- u'V' : ['V'], #
- u'W' : ['W'], #
- u'X' : ['X'], #
- u'Y' : ['Y'], #
- u'Z' : ['Z'], #
- u'[' : ['['], # opening bracket
- u'\\' : ['\'], # backslash
- u']' : [']'], # closing bracket
- u'^' : ['^'], # caret - circumflex
- u'_' : ['_'], # underscore
- u'`' : ['`'], # grave accent
- u'a' : ['a'], #
- u'b' : ['b'], #
- u'c' : ['c'], #
- u'd' : ['d'], #
- u'e' : ['e'], #
- u'f' : ['f'], #
- u'g' : ['g'], #
- u'h' : ['h'], #
- u'i' : ['i'], #
- u'j' : ['j'], #
- u'k' : ['k'], #
- u'l' : ['l'], #
- u'm' : ['m'], #
- u'n' : ['n'], #
- u'o' : ['o'], #
- u'p' : ['p'], #
- u'q' : ['q'], #
- u'r' : ['r'], #
- u's' : ['s'], #
- u't' : ['t'], #
- u'u' : ['u'], #
- u'v' : ['v'], #
- u'w' : ['w'], #
- u'x' : ['x'], #
- u'y' : ['y'], #
- u'z' : ['z'], #
- u'{' : ['{'], # opening brace
- u'|' : ['|'], # vertical bar
- u'}' : ['}'], # closing brace
- u'~' : ['~'], # equivalency sign - tilde
- u'<' : ['<', '<'], # less than sign
- u'>' : ['>', '>'], # greater than sign
- u'¡' : ['¡', '¡'], # inverted exclamation mark
- u'¢' : ['¢', '¢'], # cent sign
- u'£' : ['£', '£'], # pound sign
- u'¤' : ['¤', '¤'], # currency sign
- u'¥' : ['¥', '¥'], # yen sign
- u'¦' : ['¦', '¦'], # broken vertical bar
- u'§' : ['§', '§'], # section sign
- u'¨' : ['¨', '¨'], # spacing diaeresis - umlaut
- u'©' : ['©', '©'], # copyright sign
- u'ª' : ['ª', 'ª'], # feminine ordinal indicator
- u'«' : ['«', '«'], # left double angle quotes
- u'¬' : ['¬', '¬'], # not sign
- u'®' : ['®', '®'], # registered trade mark sign
- u'¯' : ['¯', '¯'], # spacing macron - overline
- u'°' : ['°', '°'], # degree sign
- u'±' : ['±', '±'], # plus-or-minus sign
- u'²' : ['²', '²'], # superscript two - squared
- u'³' : ['³', '³'], # superscript three - cubed
- u'´' : ['´', '´'], # acute accent - spacing acute
- u'µ' : ['µ', 'µ'], # micro sign
- u'¶' : ['¶', '¶'], # pilcrow sign - paragraph sign
- u'·' : ['·', '·'], # middle dot - Georgian comma
- u'¸' : ['¸', '¸'], # spacing cedilla
- u'¹' : ['¹', '¹'], # superscript one
- u'º' : ['º', 'º'], # masculine ordinal indicator
- u'»' : ['»', '»'], # right double angle quotes
- u'¼' : ['¼', '¼'], # fraction one quarter
- u'½' : ['½', '½'], # fraction one half
- u'¾' : ['¾', '¾'], # fraction three quarters
- u'¿' : ['¿', '¿'], # inverted question mark
- u'À' : ['À', 'À'], # latin capital letter A with grave
- u'Á' : ['Á', 'Á'], # latin capital letter A with acute
- u'Â' : ['Â', 'Â'], # latin capital letter A with circumflex
- u'Ã' : ['Ã', 'Ã'], # latin capital letter A with tilde
- u'Ä' : ['Ä', 'Ä'], # latin capital letter A with diaeresis
- u'Å' : ['Å', 'Å'], # latin capital letter A with ring above
- u'Æ' : ['Æ', 'Æ'], # latin capital letter AE
- u'Ç' : ['Ç', 'Ç'], # latin capital letter C with cedilla
- u'È' : ['È', 'È'], # latin capital letter E with grave
- u'É' : ['É', 'É'], # latin capital letter E with acute
- u'Ê' : ['Ê', 'Ê'], # latin capital letter E with circumflex
- u'Ë' : ['Ë', 'Ë'], # latin capital letter E with diaeresis
- u'Ì' : ['Ì', 'Ì'], # latin capital letter I with grave
- u'Í' : ['Í', 'Í'], # latin capital letter I with acute
- u'Î' : ['Î', 'Î'], # latin capital letter I with circumflex
- u'Ï' : ['Ï', 'Ï'], # latin capital letter I with diaeresis
- u'Ð' : ['Ð', 'Ð'], # latin capital letter ETH
- u'Ñ' : ['Ñ', 'Ñ'], # latin capital letter N with tilde
- u'Ò' : ['Ò', 'Ò'], # latin capital letter O with grave
- u'Ó' : ['Ó', 'Ó'], # latin capital letter O with acute
- u'Ô' : ['Ô', 'Ô'], # latin capital letter O with circumflex
- u'Õ' : ['Õ', 'Õ'], # latin capital letter O with tilde
- u'Ö' : ['Ö', 'Ö'], # latin capital letter O with diaeresis
- u'×' : ['×', '×'], # multiplication sign
- u'Ø' : ['Ø', 'Ø'], # latin capital letter O with slash
- u'Ù' : ['Ù', 'Ù'], # latin capital letter U with grave
- u'Ú' : ['Ú', 'Ú'], # latin capital letter U with acute
- u'Û' : ['Û', 'Û'], # latin capital letter U with circumflex
- u'Ü' : ['Ü', 'Ü'], # latin capital letter U with diaeresis
- u'Ý' : ['Ý', 'Ý'], # latin capital letter Y with acute
- u'Þ' : ['Þ', 'Þ'], # latin capital letter THORN
- u'ß' : ['ß', 'ß'], # latin small letter sharp s - ess-zed
- u'à' : ['à', 'à'], # latin small letter a with grave
- u'á' : ['á', 'á'], # latin small letter a with acute
- u'â' : ['â', 'â'], # latin small letter a with circumflex
- u'ã' : ['ã', 'ã'], # latin small letter a with tilde
- u'ä' : ['ä', 'ä'], # latin small letter a with diaeresis
- u'å' : ['å', 'å'], # latin small letter a with ring above
- u'æ' : ['æ', 'æ'], # latin small letter ae
- u'ç' : ['ç', 'ç'], # latin small letter c with cedilla
- u'è' : ['è', 'è'], # latin small letter e with grave
- u'é' : ['é', 'é'], # latin small letter e with acute
- u'ê' : ['ê', 'ê'], # latin small letter e with circumflex
- u'ë' : ['ë', 'ë'], # latin small letter e with diaeresis
- u'ì' : ['ì', 'ì'], # latin small letter i with grave
- u'í' : ['í', 'í'], # latin small letter i with acute
- u'î' : ['î', 'î'], # latin small letter i with circumflex
- u'ï' : ['ï', 'ï'], # latin small letter i with diaeresis
- u'ð' : ['ð', 'ð'], # latin small letter eth
- u'ñ' : ['ñ', 'ñ'], # latin small letter n with tilde
- u'ò' : ['ò', 'ò'], # latin small letter o with grave
- u'ó' : ['ó', 'ó'], # latin small letter o with acute
- u'ô' : ['ô', 'ô'], # latin small letter o with circumflex
- u'õ' : ['õ', 'õ'], # latin small letter o with tilde
- u'ö' : ['ö', 'ö'], # latin small letter o with diaeresis
- u'÷' : ['÷', '÷'], # division sign
- u'ø' : ['ø', 'ø'], # latin small letter o with slash
- u'ù' : ['ù', 'ù'], # latin small letter u with grave
- u'ú' : ['ú', 'ú'], # latin small letter u with acute
- u'û' : ['û', 'û'], # latin small letter u with circumflex
- u'ü' : ['ü', 'ü'], # latin small letter u with diaeresis
- u'ý' : ['ý', 'ý'], # latin small letter y with acute
- u'þ' : ['þ', 'þ'], # latin small letter thorn
- u'ÿ' : ['ÿ', 'ÿ'], # latin small letter y with diaeresis
- # More
- u' ' : [' '],
- }
-
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index 0cd7da8e72..48fa1fec5e 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -10,10 +10,9 @@ __docformat__ = 'restructuredtext en'
import re
-from calibre.ebooks.pdb.ereader import image_name
-from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
+from htmlentitydefs import codepoint2name
-from BeautifulSoup import BeautifulSoup
+from calibre.ebooks.pdb.ereader import image_name
PML_HTML_RULES = [
(re.compile(r'\\p'), lambda match: '
'),
@@ -71,10 +70,12 @@ def pml_to_html(pml):
for rule in PML_HTML_RULES:
html = rule[0].sub(rule[1], html)
- for symbol in HTML_SYMBOLS.keys():
- if ord(symbol) > 128:
- html = html.replace(symbol, HTML_SYMBOLS[symbol][len(HTML_SYMBOLS[symbol]) - 1])
-
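+ # Turn special characters into named entities; '<' (60) and '>' (62) are left alone. NOTE(review): '&' (38) and '"' (34) are not excluded - confirm.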
+ cps = [ord(c) for c in set(html)]
+ cps = set(cps).intersection(codepoint2name.keys()).difference([60, 62])
+ for cp in cps:
+ html = html.replace(unichr(cp), '&%s;' % codepoint2name[cp])
+
return html
def footnote_sidebar_to_html(id, pml):
diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py
index 313250bcf2..7ec561a195 100644
--- a/src/calibre/ebooks/txt/writer.py
+++ b/src/calibre/ebooks/txt/writer.py
@@ -12,7 +12,6 @@ import os
import re
from calibre import entity_to_unicode
-from calibre.ebooks.htmlsymbols import HTML_SYMBOLS
from BeautifulSoup import BeautifulSoup
@@ -82,10 +81,6 @@ class TxtWriter(object):
return stripped
def replace_html_symbols(self, content):
- for symbol in HTML_SYMBOLS:
- for code in HTML_SYMBOLS[symbol]:
- content = content.replace(code, symbol)
-
for entity in set(re.findall('&.+?;', content)):
mo = re.search('(%s)' % entity[1:-1], content)
content = content.replace(entity, entity_to_unicode(mo))