mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add unsmarten functionality throughout TXT output.
This commit is contained in:
parent
6fc606bfa6
commit
d8e86c426f
130
src/calibre/ebooks/textile/unsmarten.py
Normal file
130
src/calibre/ebooks/textile/unsmarten.py
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
# (pattern, replacement) pairs applied in order by unsmarten().
#
# NOTE(review): most patterns list the same character two or three times.
# Presumably the alternatives were once distinct spellings (e.g. HTML entity
# forms) of the same glyph -- confirm against the upstream file before
# simplifying them.
_TEXTILE_GLYPHS = (
    (u'¢|¢|¢', r'{c\}'),  # cent
    (u'£|£|£', r'{L-}'),  # pound
    (u'¥|¥|¥', r'{Y=}'),  # yen
    (u'©|©|©', r'{(c)}'),  # copyright
    (u'®|®|®', r'{(r)}'),  # registered
    (u'¼|¼|¼', r'{1/4}'),  # quarter
    (u'½|½|½', r'{1/2}'),  # half
    (u'¾|¾|¾', r'{3/4}'),  # three-quarter
    # Was r'{A`)}': the stray ')' made this the only grave token that did
    # not follow the '{<letter>`}' shape used by every other grave entry.
    (u'À|À|À', r'{A`}'),  # A-grave
    (u'Á|Á|Á', r"{A'}"),  # A-acute
    (u'Â|Â|Â', r'{A^}'),  # A-circumflex
    (u'Ã|Ã|Ã', r'{A~}'),  # A-tilde
    (u'Ä|Ä|Ä', r'{A"}'),  # A-umlaut
    (u'Å|Å|Å', r'{Ao}'),  # A-ring
    (u'Æ|Æ|Æ', r'{AE}'),  # AE
    (u'Ç|Ç|Ç', r'{C,}'),  # C-cedilla
    (u'È|È|È', r'{E`}'),  # E-grave
    (u'É|É|É', r"{E'}"),  # E-acute
    (u'Ê|Ê|Ê', r'{E^}'),  # E-circumflex
    (u'Ë|Ë|Ë', r'{E"}'),  # E-umlaut
    (u'Ì|Ì|Ì', r'{I`}'),  # I-grave
    (u'Í|Í|Í', r"{I'}"),  # I-acute
    (u'Î|Î|Î', r'{I^}'),  # I-circumflex
    (u'Ï|Ï|Ï', r'{I"}'),  # I-umlaut
    (u'Ð|Ð|Ð', r'{D-}'),  # ETH
    (u'Ñ|Ñ|Ñ', r'{N~}'),  # N-tilde
    (u'Ò|Ò|Ò', r'{O`}'),  # O-grave
    (u'Ó|Ó|Ó', r"{O'}"),  # O-acute
    (u'Ô|Ô|Ô', r'{O^}'),  # O-circumflex
    (u'Õ|Õ|Õ', r'{O~}'),  # O-tilde
    (u'Ö|Ö|Ö', r'{O"}'),  # O-umlaut
    (u'×|×|×', r'{x}'),  # dimension
    (u'Ø|Ø|Ø', r'{O/}'),  # O-slash
    (u'Ù|Ù|Ù', r"{U`}"),  # U-grave
    (u'Ú|Ú|Ú', r"{U'}"),  # U-acute
    (u'Û|Û|Û', r'{U^}'),  # U-circumflex
    (u'Ü|Ü|Ü', r'{U"}'),  # U-umlaut
    (u'Ý|Ý|Ý', r"{Y'}"),  # Y-acute
    (u'ß|ß|ß', r'{sz}'),  # sharp-s
    (u'à|à|à', r'{a`}'),  # a-grave
    (u'á|á|á', r"{a'}"),  # a-acute
    (u'â|â|â', r'{a^}'),  # a-circumflex
    (u'ã|ã|ã', r'{a~}'),  # a-tilde
    (u'ä|ä|ä', r'{a"}'),  # a-umlaut
    (u'å|å|å', r'{ao}'),  # a-ring
    (u'æ|æ|æ', r'{ae}'),  # ae
    (u'ç|ç|ç', r'{c,}'),  # c-cedilla
    (u'è|è|è', r'{e`}'),  # e-grave
    (u'é|é|é', r"{e'}"),  # e-acute
    (u'ê|ê|ê', r'{e^}'),  # e-circumflex
    (u'ë|ë|ë', r'{e"}'),  # e-umlaut
    (u'ì|ì|ì', r'{i`}'),  # i-grave
    (u'í|í|í', r"{i'}"),  # i-acute
    (u'î|î|î', r'{i^}'),  # i-circumflex
    (u'ï|ï|ï', r'{i"}'),  # i-umlaut
    (u'ð|ð|ð', r'{d-}'),  # eth
    (u'ñ|ñ|ñ', r'{n~}'),  # n-tilde
    (u'ò|ò|ò', r'{o`}'),  # o-grave
    (u'ó|ó|ó', r"{o'}"),  # o-acute
    (u'ô|ô|ô', r'{o^}'),  # o-circumflex
    (u'õ|õ|õ', r'{o~}'),  # o-tilde
    (u'ö|ö|ö', r'{o"}'),  # o-umlaut
    (u'ø|ø|ø', r'{o/}'),  # o-stroke
    (u'ù|ù|ù', r'{u`}'),  # u-grave
    (u'ú|ú|ú', r"{u'}"),  # u-acute
    (u'û|û|û', r'{u^}'),  # u-circumflex
    (u'ü|ü|ü', r'{u"}'),  # u-umlaut
    (u'ý|ý|ý', r"{y'}"),  # y-acute
    (u'ÿ|ÿ|ÿ', r'{y"}'),  # y-umlaut

    (u'Č|Č|Č', r'{Cˇ}'),  # C-caron
    (u'č|č|č', r'{cˇ}'),  # c-caron
    (u'Ď|Ď|Ď', r'{Dˇ}'),  # D-caron
    (u'ď|ď|ď', r'{dˇ}'),  # d-caron
    (u'Ě|Ě|Ě', r'{Eˇ}'),  # E-caron
    (u'ě|ě|ě', r'{eˇ}'),  # e-caron
    (u'Ĺ|Ĺ|Ĺ', r"{L'}"),  # L-acute
    (u'ĺ|ĺ|ĺ', r"{l'}"),  # l-acute
    (u'Ľ|Ľ|Ľ', r'{Lˇ}'),  # L-caron
    (u'ľ|ľ|ľ', r'{lˇ}'),  # l-caron
    (u'Ň|Ň|Ň', r'{Nˇ}'),  # N-caron
    (u'ň|ň|ň', r'{nˇ}'),  # n-caron

    (u'Œ|Œ|Œ', r'{OE}'),  # OE
    (u'œ|œ|œ', r'{oe}'),  # oe

    (u'Ŕ|Ŕ|Ŕ', r"{R'}"),  # R-acute
    (u'ŕ|ŕ|ŕ', r"{r'}"),  # r-acute
    (u'Ř|Ř|Ř', r'{Rˇ}'),  # R-caron
    (u'ř|ř|ř', r'{rˇ}'),  # r-caron
    (u'Ŝ|Ŝ', r'{S^}'),  # S-circumflex
    (u'ŝ|ŝ', r'{s^}'),  # s-circumflex
    (u'Š|Š|Š', r'{Sˇ}'),  # S-caron
    (u'š|š|š', r'{sˇ}'),  # s-caron
    (u'Ť|Ť|Ť', r'{Tˇ}'),  # T-caron
    (u'ť|ť|ť', r'{tˇ}'),  # t-caron
    (u'Ů|Ů|Ů', r'{U°}'),  # U-ring
    (u'ů|ů|ů', r'{u°}'),  # u-ring
    (u'Ž|Ž|Ž', r'{Zˇ}'),  # Z-caron
    (u'ž|ž|ž', r'{zˇ}'),  # z-caron

    (u'•|•|•', r'{*}'),  # bullet
    (u'₣|₣', r'{Fr}'),  # Franc
    (u'₤|₤', r'{L=}'),  # Lira
    (u'₨|₨', r'{Rs}'),  # Rupee
    (u'€|€|€', r'{C=}'),  # euro
    (u'™|™|™', r'{tm}'),  # trademark
    (u'♠|♠|♠', r'{spade}'),  # spade
    (u'♣|♣|♣', r'{club}'),  # club
    (u'♥|♥|♥', r'{heart}'),  # heart
    (u'♦|♦|♦', r'{diamond}'),  # diamond
)


def unsmarten(txt):
    """Replace accented letters and symbols in *txt* with brace tokens.

    The text is first passed through ``calibre.ebooks.txt.unsmarten.unsmarten``
    and then every (pattern, replacement) pair in ``_TEXTILE_GLYPHS`` is
    applied with ``re.sub``, e.g. ``é`` becomes ``{e'}`` and ``€`` becomes
    ``{C=}``.

    :param txt: the text to convert.
    :return: the converted text.
    """
    # Imported here, as in the original, rather than at module level.
    from calibre.ebooks.txt.unsmarten import unsmarten as txt_unsmarten
    txt = txt_unsmarten(txt)

    # None of the replacement tokens contains a character matched by another
    # pattern, so the substitutions do not interfere with each other.
    for pattern, replacement in _TEXTILE_GLYPHS:
        txt = re.sub(pattern, replacement, txt)

    # Move into main code?
    # txt = re.sub(u'\xa0', r'p. ', txt)              # blank paragraph
    # txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt)  # blank paragraph
    # txt = re.sub(u'\n \n', r'\n<br />\n', txt)      # blank paragraph - br tag

    return txt
|
@ -56,6 +56,10 @@ class TXTOutput(OutputFormatPlugin):
|
|||||||
'* plain: Produce plain text.\n'
|
'* plain: Produce plain text.\n'
|
||||||
'* markdown: Produce Markdown formatted text.\n'
|
'* markdown: Produce Markdown formatted text.\n'
|
||||||
'* textile: Produce Textile formatted text.')),
|
'* textile: Produce Textile formatted text.')),
|
||||||
|
OptionRecommendation(name='unsmarten_punctuation',
|
||||||
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Convert fancy quotes, dashes and ellipsis to their '
|
||||||
|
'plain equivalents.')),
|
||||||
OptionRecommendation(name='keep_links',
|
OptionRecommendation(name='keep_links',
|
||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
help=_('Do not remove links within the document. This is only ' \
|
help=_('Do not remove links within the document. This is only ' \
|
||||||
|
@ -12,6 +12,8 @@ import re
|
|||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
|
from calibre.ebooks.txt.unsmarten import unsmarten
|
||||||
|
|
||||||
|
|
||||||
BLOCK_TAGS = [
|
BLOCK_TAGS = [
|
||||||
'div',
|
'div',
|
||||||
@ -76,6 +78,8 @@ class TXTMLizer(object):
|
|||||||
output += '\n\n\n\n\n\n'
|
output += '\n\n\n\n\n\n'
|
||||||
output = u''.join(output)
|
output = u''.join(output)
|
||||||
output = u'\n'.join(l.rstrip() for l in output.splitlines())
|
output = u'\n'.join(l.rstrip() for l in output.splitlines())
|
||||||
|
if self.opts.unsmarten_punctuation:
|
||||||
|
output = unsmarten(output)
|
||||||
output = self.cleanup_text(output)
|
output = self.cleanup_text(output)
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
@ -1,9 +1,8 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
"""unsmarten : html2textile helper function"""
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
|
||||||
__version__ = '0.1'
|
__docformat__ = 'restructuredtext en'
|
||||||
__author__ = 'Leigh Parry'
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
@ -16,121 +15,4 @@ def unsmarten(txt):
|
|||||||
txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt) # apostrophe
|
txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt) # apostrophe
|
||||||
txt = re.sub(u'‘|’|′|‘|’|′|‘|’|′', r"'", txt) # single quote
|
txt = re.sub(u'‘|’|′|‘|’|′|‘|’|′', r"'", txt) # single quote
|
||||||
|
|
||||||
txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent
|
|
||||||
txt = re.sub(u'£|£|£', r'{L-}', txt) # pound
|
|
||||||
txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen
|
|
||||||
txt = re.sub(u'©|©|©', r'{(c)}', txt) # copyright
|
|
||||||
txt = re.sub(u'®|®|®', r'{(r)}', txt) # registered
|
|
||||||
txt = re.sub(u'¼|¼|¼', r'{1/4}', txt) # quarter
|
|
||||||
txt = re.sub(u'½|½|½', r'{1/2}', txt) # half
|
|
||||||
txt = re.sub(u'¾|¾|¾', r'{3/4}', txt) # three-quarter
|
|
||||||
txt = re.sub(u'À|À|À', r'{A`)}', txt) # A-grave
|
|
||||||
txt = re.sub(u'Á|Á|Á', r"{A'}", txt) # A-acute
|
|
||||||
txt = re.sub(u'Â|Â|Â', r'{A^}', txt) # A-circumflex
|
|
||||||
txt = re.sub(u'Ã|Ã|Ã', r'{A~}', txt) # A-tilde
|
|
||||||
txt = re.sub(u'Ä|Ä|Ä', r'{A"}', txt) # A-umlaut
|
|
||||||
txt = re.sub(u'Å|Å|Å', r'{Ao}', txt) # A-ring
|
|
||||||
txt = re.sub(u'Æ|Æ|Æ', r'{AE}', txt) # AE
|
|
||||||
txt = re.sub(u'Ç|Ç|Ç', r'{C,}', txt) # C-cedilla
|
|
||||||
txt = re.sub(u'È|È|È', r'{E`}', txt) # E-grave
|
|
||||||
txt = re.sub(u'É|É|É', r"{E'}", txt) # E-acute
|
|
||||||
txt = re.sub(u'Ê|Ê|Ê', r'{E^}', txt) # E-circumflex
|
|
||||||
txt = re.sub(u'Ë|Ë|Ë', r'{E"}', txt) # E-umlaut
|
|
||||||
txt = re.sub(u'Ì|Ì|Ì', r'{I`}', txt) # I-grave
|
|
||||||
txt = re.sub(u'Í|Í|Í', r"{I'}", txt) # I-acute
|
|
||||||
txt = re.sub(u'Î|Î|Î', r'{I^}', txt) # I-circumflex
|
|
||||||
txt = re.sub(u'Ï|Ï|Ï', r'{I"}', txt) # I-umlaut
|
|
||||||
txt = re.sub(u'Ð|Ð|Ð', r'{D-}', txt) # ETH
|
|
||||||
txt = re.sub(u'Ñ|Ñ|Ñ', r'{N~}', txt) # N-tilde
|
|
||||||
txt = re.sub(u'Ò|Ò|Ò', r'{O`}', txt) # O-grave
|
|
||||||
txt = re.sub(u'Ó|Ó|Ó', r"{O'}", txt) # O-acute
|
|
||||||
txt = re.sub(u'Ô|Ô|Ô', r'{O^}', txt) # O-circumflex
|
|
||||||
txt = re.sub(u'Õ|Õ|Õ', r'{O~}', txt) # O-tilde
|
|
||||||
txt = re.sub(u'Ö|Ö|Ö', r'{O"}', txt) # O-umlaut
|
|
||||||
txt = re.sub(u'×|×|×', r'{x}', txt) # dimension
|
|
||||||
txt = re.sub(u'Ø|Ø|Ø', r'{O/}', txt) # O-slash
|
|
||||||
txt = re.sub(u'Ù|Ù|Ù', r"{U`}", txt) # U-grave
|
|
||||||
txt = re.sub(u'Ú|Ú|Ú', r"{U'}", txt) # U-acute
|
|
||||||
txt = re.sub(u'Û|Û|Û', r'{U^}', txt) # U-circumflex
|
|
||||||
txt = re.sub(u'Ü|Ü|Ü', r'{U"}', txt) # U-umlaut
|
|
||||||
txt = re.sub(u'Ý|Ý|Ý', r"{Y'}", txt) # Y-grave
|
|
||||||
txt = re.sub(u'ß|ß|ß', r'{sz}', txt) # sharp-s
|
|
||||||
txt = re.sub(u'à|à|à', r'{a`}', txt) # a-grave
|
|
||||||
txt = re.sub(u'á|á|á', r"{a'}", txt) # a-acute
|
|
||||||
txt = re.sub(u'â|â|â', r'{a^}', txt) # a-circumflex
|
|
||||||
txt = re.sub(u'ã|ã|ã', r'{a~}', txt) # a-tilde
|
|
||||||
txt = re.sub(u'ä|ä|ä', r'{a"}', txt) # a-umlaut
|
|
||||||
txt = re.sub(u'å|å|å', r'{ao}', txt) # a-ring
|
|
||||||
txt = re.sub(u'æ|æ|æ', r'{ae}', txt) # ae
|
|
||||||
txt = re.sub(u'ç|ç|ç', r'{c,}', txt) # c-cedilla
|
|
||||||
txt = re.sub(u'è|è|è', r'{e`}', txt) # e-grave
|
|
||||||
txt = re.sub(u'é|é|é', r"{e'}", txt) # e-acute
|
|
||||||
txt = re.sub(u'ê|ê|ê', r'{e^}', txt) # e-circumflex
|
|
||||||
txt = re.sub(u'ë|ë|ë', r'{e"}', txt) # e-umlaut
|
|
||||||
txt = re.sub(u'ì|ì|ì', r'{i`}', txt) # i-grave
|
|
||||||
txt = re.sub(u'í|í|í', r"{i'}", txt) # i-acute
|
|
||||||
txt = re.sub(u'î|î|î', r'{i^}', txt) # i-circumflex
|
|
||||||
txt = re.sub(u'ï|ï|ï', r'{i"}', txt) # i-umlaut
|
|
||||||
txt = re.sub(u'ð|ð|ð', r'{d-}', txt) # eth
|
|
||||||
txt = re.sub(u'ñ|ñ|ñ', r'{n~}', txt) # n-tilde
|
|
||||||
txt = re.sub(u'ò|ò|ò', r'{o`}', txt) # o-grave
|
|
||||||
txt = re.sub(u'ó|ó|ó', r"{o'}", txt) # o-acute
|
|
||||||
txt = re.sub(u'ô|ô|ô', r'{o^}', txt) # o-circumflex
|
|
||||||
txt = re.sub(u'õ|õ|õ', r'{o~}', txt) # o-tilde
|
|
||||||
txt = re.sub(u'ö|ö|ö', r'{o"}', txt) # o-umlaut
|
|
||||||
txt = re.sub(u'ø|ø|ø', r'{o/}', txt) # o-stroke
|
|
||||||
txt = re.sub(u'ù|ù|ù', r'{u`}', txt) # u-grave
|
|
||||||
txt = re.sub(u'ú|ú|ú', r"{u'}", txt) # u-acute
|
|
||||||
txt = re.sub(u'û|û|û', r'{u^}', txt) # u-circumflex
|
|
||||||
txt = re.sub(u'ü|ü|ü', r'{u"}', txt) # u-umlaut
|
|
||||||
txt = re.sub(u'ý|ý|ý', r"{y'}", txt) # y-acute
|
|
||||||
txt = re.sub(u'ÿ|ÿ|ÿ', r'{y"}', txt) # y-umlaut
|
|
||||||
|
|
||||||
txt = re.sub(u'Č|Č|Č', r'{Cˇ}', txt) # C-caron
|
|
||||||
txt = re.sub(u'č|č|č', r'{cˇ}', txt) # c-caron
|
|
||||||
txt = re.sub(u'Ď|Ď|Ď', r'{Dˇ}', txt) # D-caron
|
|
||||||
txt = re.sub(u'ď|ď|ď', r'{dˇ}', txt) # d-caron
|
|
||||||
txt = re.sub(u'Ě|Ě|Ě', r'{Eˇ}', txt) # E-caron
|
|
||||||
txt = re.sub(u'ě|ě|ě', r'{eˇ}', txt) # e-caron
|
|
||||||
txt = re.sub(u'Ĺ|Ĺ|Ĺ', r"{L'}", txt) # L-acute
|
|
||||||
txt = re.sub(u'ĺ|ĺ|ĺ', r"{l'}", txt) # l-acute
|
|
||||||
txt = re.sub(u'Ľ|Ľ|Ľ', r'{Lˇ}', txt) # L-caron
|
|
||||||
txt = re.sub(u'ľ|ľ|ľ', r'{lˇ}', txt) # l-caron
|
|
||||||
txt = re.sub(u'Ň|Ň|Ň', r'{Nˇ}', txt) # N-caron
|
|
||||||
txt = re.sub(u'ň|ň|ň', r'{nˇ}', txt) # n-caron
|
|
||||||
|
|
||||||
txt = re.sub(u'Œ|Œ|Œ', r'{OE}', txt) # OE
|
|
||||||
txt = re.sub(u'œ|œ|œ', r'{oe}', txt) # oe
|
|
||||||
|
|
||||||
txt = re.sub(u'Ŕ|Ŕ|Ŕ', r"{R'}", txt) # R-acute
|
|
||||||
txt = re.sub(u'ŕ|ŕ|ŕ', r"{r'}", txt) # r-acute
|
|
||||||
txt = re.sub(u'Ř|Ř|Ř', r'{Rˇ}', txt) # R-caron
|
|
||||||
txt = re.sub(u'ř|ř|ř', r'{rˇ}', txt) # r-caron
|
|
||||||
txt = re.sub(u'Ŝ|Ŝ', r'{S^}', txt) # S-circumflex
|
|
||||||
txt = re.sub(u'ŝ|ŝ', r'{s^}', txt) # s-circumflex
|
|
||||||
txt = re.sub(u'Š|Š|Š', r'{Sˇ}', txt) # S-caron
|
|
||||||
txt = re.sub(u'š|š|š', r'{sˇ}', txt) # s-caron
|
|
||||||
txt = re.sub(u'Ť|Ť|Ť', r'{Tˇ}', txt) # T-caron
|
|
||||||
txt = re.sub(u'ť|ť|ť', r'{tˇ}', txt) # t-caron
|
|
||||||
txt = re.sub(u'Ů|Ů|Ů', r'{U°}', txt) # U-ring
|
|
||||||
txt = re.sub(u'ů|ů|ů', r'{u°}', txt) # u-ring
|
|
||||||
txt = re.sub(u'Ž|Ž|Ž', r'{Zˇ}', txt) # Z-caron
|
|
||||||
txt = re.sub(u'ž|ž|ž', r'{zˇ}', txt) # z-caron
|
|
||||||
|
|
||||||
txt = re.sub(u'•|•|•', r'{*}', txt) # bullet
|
|
||||||
txt = re.sub(u'₣|₣', r'{Fr}', txt) # Franc
|
|
||||||
txt = re.sub(u'₤|₤', r'{L=}', txt) # Lira
|
|
||||||
txt = re.sub(u'₨|₨', r'{Rs}', txt) # Rupee
|
|
||||||
txt = re.sub(u'€|€|€', r'{C=}', txt) # euro
|
|
||||||
txt = re.sub(u'™|™|™', r'{tm}', txt) # trademark
|
|
||||||
txt = re.sub(u'♠|♠|♠', r'{spade}', txt) # spade
|
|
||||||
txt = re.sub(u'♣|♣|♣', r'{club}', txt) # club
|
|
||||||
txt = re.sub(u'♥|♥|♥', r'{heart}', txt) # heart
|
|
||||||
txt = re.sub(u'♦|♦|♦', r'{diamond}', txt) # diamond
|
|
||||||
|
|
||||||
# Move into main code?
|
|
||||||
# txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph
|
|
||||||
# txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph
|
|
||||||
# txt = re.sub(u'\n \n', r'\n<br />\n', txt) # blank paragraph - br tag
|
|
||||||
|
|
||||||
return txt
|
return txt
|
||||||
|
Loading…
x
Reference in New Issue
Block a user