mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add new but still wip textile output generator.
This commit is contained in:
parent
9b581963ed
commit
804b248d46
@ -70,16 +70,17 @@ class TXTOutput(OutputFormatPlugin):
|
||||
])
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
print 'New'
|
||||
if opts.txt_output_formatting.lower() == 'markdown':
|
||||
from calibre.ebooks.txt.markdownml import MarkdownMLizer
|
||||
writer = MarkdownMLizer(log)
|
||||
self.writer = MarkdownMLizer(log)
|
||||
elif opts.txt_output_formatting.lower() == 'textile':
|
||||
from calibre.ebooks.txt.textileml import TextileMLizer
|
||||
writer = TextileMLizer(log)
|
||||
self.writer = TextileMLizer(log)
|
||||
else:
|
||||
writer = TXTMLizer(log)
|
||||
self.writer = TXTMLizer(log)
|
||||
|
||||
txt = writer.extract_content(oeb_book, opts)
|
||||
txt = self.writer.extract_content(oeb_book, opts)
|
||||
txt = clean_ascii_chars(txt)
|
||||
|
||||
log.debug('\tReplacing newlines with selected type...')
|
||||
@ -118,10 +119,18 @@ class TXTZOutput(TXTOutput):
|
||||
# Images
|
||||
for item in oeb_book.manifest:
|
||||
if item.media_type in OEB_IMAGES:
|
||||
path = os.path.join(tdir, os.path.dirname(item.href))
|
||||
if hasattr(self.writer, 'images'):
|
||||
path = os.path.join(tdir, 'images')
|
||||
if item.href in self.writer.images:
|
||||
href = self.writer.images[item.href]
|
||||
else:
|
||||
continue
|
||||
else:
|
||||
path = os.path.join(tdir, os.path.dirname(item.href))
|
||||
href = os.path.basename(item.href)
|
||||
if not os.path.exists(path):
|
||||
os.makedirs(path)
|
||||
with open(os.path.join(tdir, item.href), 'wb') as imgf:
|
||||
with open(os.path.join(path, href), 'wb') as imgf:
|
||||
imgf.write(item.data)
|
||||
|
||||
# Metadata
|
||||
|
@ -1,7 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
@ -10,53 +10,328 @@ Transform OEB content into Textile formatted plain text
|
||||
|
||||
import re
|
||||
|
||||
from lxml import etree
|
||||
from functools import partial
|
||||
|
||||
from calibre.ebooks.oeb.base import XHTML
|
||||
from calibre.utils.html2textile import html2textile
|
||||
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.ebooks.txt.unsmarten import unsmarten
|
||||
from operator import itemgetter
|
||||
|
||||
class TextileMLizer(object):
|
||||
|
||||
def __init__(self, log):
|
||||
self.log = log
|
||||
class TextileMLizer(OEB2HTML):
|
||||
|
||||
def extract_content(self, oeb_book, opts):
|
||||
self.log.info('Converting XHTML to Textile formatted TXT...')
|
||||
self.oeb_book = oeb_book
|
||||
self.opts = opts
|
||||
self.in_pre = False
|
||||
self.in_table = False
|
||||
self.links = {}
|
||||
self.list = []
|
||||
self.images = {}
|
||||
self.base_hrefs = [item.href for item in oeb_book.spine]
|
||||
self.map_resources(oeb_book)
|
||||
|
||||
return self.mlize_spine()
|
||||
self.style_bold = False
|
||||
self.style_italic = False
|
||||
self.style_under = False
|
||||
self.style_strike = False
|
||||
self.style_smallcap = False
|
||||
|
||||
def mlize_spine(self):
|
||||
txt = self.mlize_spine(oeb_book)
|
||||
txt = unsmarten(txt)
|
||||
|
||||
# Do some tidying up
|
||||
txt = self.tidy_up(txt)
|
||||
|
||||
return txt
|
||||
|
||||
def mlize_spine(self, oeb_book):
|
||||
output = [u'']
|
||||
|
||||
for item in self.oeb_book.spine:
|
||||
for item in oeb_book.spine:
|
||||
self.log.debug('Converting %s to Textile formatted TXT...' % item.href)
|
||||
self.rewrite_ids(item.data, item)
|
||||
rewrite_links(item.data, partial(self.rewrite_link, page=item))
|
||||
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
|
||||
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
|
||||
output.append('\n\n')
|
||||
return ''.join(output)
|
||||
|
||||
html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
|
||||
def tidy_up(self, text):
|
||||
def check_count(text, tests):
|
||||
x = []
|
||||
for i, t in enumerate(reversed(tests)):
|
||||
x.append((text.count(t), i, t))
|
||||
if x:
|
||||
return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2]
|
||||
return ''
|
||||
|
||||
if not self.opts.keep_links:
|
||||
html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
|
||||
if not self.opts.keep_image_references:
|
||||
html = re.sub(r'<\s*img[^>]*>', '', html)
|
||||
# NEEDS TWEAKING
|
||||
# def check_escaping(text, tests):
|
||||
# for t in tests:
|
||||
# text = re.sub(r'(\S)('+t+'\w+'+t+')', r'\1[\2]', text)
|
||||
# text = re.sub(r'(\s)('+t+'\w+'+t+')(\S)', r'\1[\2]\3', text)
|
||||
# return text
|
||||
|
||||
text = html2textile(html)
|
||||
txt = check_count(text, ['\np<. ', '\np<>. ', '\np. '])
|
||||
text = re.sub(txt+'(\S)', r'\n\1', text)
|
||||
|
||||
# Ensure the section ends with at least two new line characters.
|
||||
# This is to prevent the last paragraph from a section being
|
||||
# combined into the first paragraph of the next.
|
||||
end_chars = text[-4:]
|
||||
# Convert all newlines to \n
|
||||
end_chars = end_chars.replace('\r\n', '\n')
|
||||
end_chars = end_chars.replace('\r', '\n')
|
||||
end_chars = end_chars[-2:]
|
||||
if not end_chars[1] == '\n':
|
||||
text += '\n\n'
|
||||
if end_chars[1] == '\n' and not end_chars[0] == '\n':
|
||||
text += '\n'
|
||||
# text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-'])
|
||||
|
||||
output += text
|
||||
text = re.sub('\npre\. bc\.', '\nbc.', text)
|
||||
text = re.sub('\np=. p. ', '\np. ', text)
|
||||
text = re.sub('\np=. \n', '\n', text)
|
||||
text = re.sub('\n{3,}', '\n\n', text)
|
||||
text = re.sub(' \|', '|', text)
|
||||
|
||||
output = u''.join(output)
|
||||
# started work on trying to fix footnotes
|
||||
# text = re.sub(r'\^"(\d+)":#.+\^', r'[\1]', text)
|
||||
return text
|
||||
|
||||
return output
|
||||
def remove_newlines(self, text):
|
||||
text = text.replace('\r\n', ' ')
|
||||
text = text.replace('\n', ' ')
|
||||
text = text.replace('\r', ' ')
|
||||
# Condense redundant spaces created by replacing newlines with spaces.
|
||||
text = re.sub(r'[ ]{2,}', ' ', text)
|
||||
text = re.sub(r'\t+', '', text)
|
||||
return text
|
||||
|
||||
def remove_leading_ws(self, text):
    '''
    Normalise line breaks and strip indentation from continuation lines.

    All line endings are converted to ``\\n``, consecutive newlines are
    collapsed into one, and spaces/tabs directly following a newline are
    removed. Whitespace at the very start of ``text`` is left alone.
    '''
    unix_breaks = text.replace('\r\n', '\n').replace('\r', '\n')
    single_breaks = re.sub(r'\n+', '\n', unix_breaks)
    return re.sub(r'\n[\t ]+', '\n', single_breaks)
|
||||
|
||||
def check_align(self, style, align, tests):
    '''
    Translate the value of one style property into a Textile marker.

    @style: Mapping-like style object, indexed by property name.
    @align: Name of the property to look up (e.g. 'text-align').
    @tests: Sequence of [value, marker] pairs, checked in order.

    Returns the marker of the first pair whose value equals
    ``style[align]``, or '' when no pair matches.
    '''
    hits = (pair[1] for pair in tests if style[align] == pair[0])
    return next(hits, '')
|
||||
|
||||
def check_padding(self, style, tests):
    '''
    Build Textile padding markers from the element's padding styles.

    @style: Mapping-like style object, indexed by property name; the
            'font-size' entry is used as the size of one em.
    @tests: Sequence of [property, marker] pairs, e.g.
            ['padding-left', '('].

    For every pair the property value is divided by the font size and
    rounded to whole ems; the marker is repeated once per em. Returns the
    concatenated markers, or '' when nothing amounts to at least one em.
    '''
    markers = []
    for entry in tests:
        try:
            # Convert both operands before dividing so that integer values
            # are not truncated by floor division.
            ems = int(round(float(style[entry[0]]) / float(style['font-size'])))
            if ems >= 1:
                markers.append(entry[1] * ems)
        except Exception:
            # Padding is best effort: a missing, non-numeric or zero-sized
            # value just means no marker for this property. Unlike a bare
            # except, this lets KeyboardInterrupt/SystemExit propagate.
            continue
    return ''.join(markers)
|
||||
|
||||
def check_id_tag(self, attribs):
    '''
    Return the Textile id attribute for an element.

    @attribs: Mapping of the element's attributes.

    Returns '(#<id>)' when an 'id' attribute is present, otherwise ''.
    '''
    # Membership test instead of has_key(), which only Python 2 mappings offer.
    if 'id' in attribs:
        return '(#' + attribs['id'] + ')'
    return ''
|
||||
|
||||
def build_block(self, tag, style, attribs, finish):
    '''
    Assemble the opening signature of a Textile block.

    @tag: Block name the signature starts with (e.g. 'p', 'h1').
    @style: Style object handed to check_padding / check_align.
    @attribs: Attribute mapping of the element; its id is only emitted
              when the keep_links option is set.
    @finish: Text that terminates the signature (e.g. '. ').

    The pieces are joined in the order: tag, id, padding, alignment,
    finish.
    '''
    pieces = [tag]
    if self.opts.keep_links:
        pieces.append(self.check_id_tag(attribs))
    pieces.append(self.check_padding(style, [['padding-left','('],['padding-right',')']]))
    pieces.append(self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]))
    pieces.append(finish)
    return ''.join(pieces)
|
||||
|
||||
def dump_text(self, elem, stylizer, page, tag_stack=None):
    '''
    Convert one element, its children and its tail text to Textile.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @page: The spine item being converted; it is only handed on to the
           recursive calls.
    @tag_stack: Closing markers collected by the enclosing elements; it
                is only extended and handed on to the recursive calls.

    Returns a list of text fragments in output order.
    '''
    # Use a fresh list per call rather than a shared mutable default.
    if tag_stack is None:
        tag_stack = []

    # We can only process tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) != XHTML_NS:
        parent = elem.getparent()
        if parent is not None and isinstance(parent.tag, basestring) \
                and namespace(parent.tag) == XHTML_NS and elem.tail:
            return [elem.tail]
        return ['']

    headings = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    halign = [['left', '<'], ['justify', '<>'], ['center', '='], ['right', '>']]
    valign = [['top', '^'], ['bottom', '~']]

    # Setup our variables.
    text = ['']     # fragments produced for this element, in output order
    style = stylizer.style(elem)
    tags = []       # closing markers; written out in reverse order below
    tag = barename(elem.tag)
    attribs = elem.attrib

    # Ignore anything that is set to not be displayed.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        return ['']

    if tag in headings or tag == 'p':
        text.append(self.build_block(tag, style, attribs, '. '))
        tags.append('\n')

    # Inline styles. Each style_* flag records that the marker is already
    # open in an enclosing element, so nested elements do not repeat it.
    if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
        if tag not in headings and tag != 'th':
            if not self.style_bold:
                text.append('*')
                tags.append('*')
                self.style_bold = True
    if style['font-style'] == 'italic' or tag in ('i', 'em'):
        if tag not in headings and tag != 'cite':
            if not self.style_italic:
                text.append('_')
                tags.append('_')
                self.style_italic = True
    if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
        if tag != 'a':
            if not self.style_under:
                text.append('+')
                tags.append('+')
                self.style_under = True
    if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
        if not self.style_strike:
            text.append('-')
            tags.append('-')
            self.style_strike = True
    if style['font-variant'] == 'small-caps':
        if not self.style_smallcap:
            text.append('&')
            tags.append('&')
            self.style_smallcap = True

    if tag == 'br':
        text.append('')
        tags.append('\n')
    elif tag == 'blockquote':
        text.append('bq. ')
        tags.append('\n')
    elif tag in ('abbr', 'acronym'):
        text.append('')
        # The expansion is optional; without a title there is nothing to add.
        if 'title' in attribs:
            tags.append('(' + attribs['title'] + ')')
    elif tag == 'sup':
        text.append('^')
        tags.append('^')
    elif tag == 'sub':
        text.append('~')
        tags.append('~')
    elif tag == 'code':
        if self.in_pre:
            text.append('bc. ')
            tags.append('\n')
        else:
            text.append('@')
            tags.append('@')
    elif tag == 'cite':
        text.append('??')
        tags.append('??')
    elif tag == 'hr':
        text.append('\n***\n')
        tags.append('\n')
    elif tag == 'pre':
        self.in_pre = True
        text.append('pre. ')
        # Structural marker: resets in_pre on close, never written out.
        tags.append('pre')
    elif tag == 'a':
        # Anchors without an href are pure link targets; there is no URL
        # to write, so only their content is kept.
        if self.opts.keep_links and 'href' in attribs:
            text.append('"')
            tags.append('":' + attribs['href'])
            # Appended after the URL so that, once the markers are
            # reversed, the title is written before the closing quote.
            if 'title' in attribs:
                tags.append('(' + attribs['title'] + ')')
    elif tag == 'img':
        if self.opts.keep_image_references and 'src' in attribs:
            text.append('!' + attribs['src'])
            alt = attribs.get('alt', '')
            if alt != '':
                text.append('(' + alt + ')')
            tags.append('!')
    elif tag in ('ol', 'ul'):
        self.list.append({'name': tag, 'num': 0})
        text.append('')
        # Structural marker: pops the list level on close.
        tags.append(tag)
    elif tag == 'li':
        # A stray <li> outside any list is treated as a bullet item.
        if self.list:
            li = self.list[-1]
        else:
            li = {'name': 'ul', 'num': 0}
        # One marker character per nesting level.
        if li['name'] == 'ul':
            text.append('*' * len(self.list) + ' ')
        elif li['name'] == 'ol':
            text.append('#' * len(self.list) + ' ')
    elif tag == 'dl':
        text.append('\n')
        tags.append('')
    elif tag == 'dt':
        text.append('')
        tags.append('\n')
    elif tag == 'dd':
        text.append(' ')
        tags.append('')
    elif tag == 'table':
        self.in_table = True
        text.append('')
        # Structural marker: resets in_table on close.
        tags.append('table')
    elif tag == 'tr':
        text.append('')
        tags.append('|\n')
    elif tag == 'td':
        text.append('|')
        cell = ''
        cell += self.check_align(style, 'text-align', halign)
        cell += self.check_align(style, 'vertical-align', valign)
        if 'colspan' in attribs:
            cell += '\\' + attribs['colspan']
        if 'rowspan' in attribs:
            cell += '/' + attribs['rowspan']
        if cell != '':
            text.append(cell + '. ')
        tags.append('')
    elif tag == 'th':
        text.append('|_. ')
        tags.append('')

    # Block tags get their id through build_block; everything else gets
    # it here.
    if self.opts.keep_links and 'id' in attribs:
        if tag not in ('body', 'div', 'p') and tag not in headings:
            text.append('(#' + attribs['id'] + ')')

    # If wanted process all style tags here - before text in tags is written

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        txt = elem.text
        if not self.in_pre:
            if self.in_table:
                txt = self.remove_newlines(txt)
            else:
                txt = self.remove_leading_ws(txt)
        text.append(txt)

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer, page, tag_stack + tags)

    # Close all open tags, innermost first.
    tags.reverse()
    for t in tags:
        # Decide on the marker itself, not on the element name, so that
        # inline markers opened on pre/list/table elements are still
        # closed and their flags released.
        if t in ('pre', 'ul', 'ol', 'table'):
            if t == 'pre':
                self.in_pre = False
            elif t == 'table':
                self.in_table = False
            elif self.list:
                self.list.pop()
        else:
            text.append('%s' % t)
            if t == '*':
                self.style_bold = False
            if t == '_':
                self.style_italic = False
            if t == '+':
                self.style_under = False
            if t == '-':
                self.style_strike = False
            if t == '&':
                self.style_smallcap = False

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        tail = elem.tail
        if not self.in_pre:
            if self.in_table:
                tail = self.remove_newlines(tail)
            else:
                tail = self.remove_leading_ws(tail)
        text.append(tail)

    return text
|
||||
|
109
src/calibre/ebooks/txt/unsmarten.py
Normal file
109
src/calibre/ebooks/txt/unsmarten.py
Normal file
@ -0,0 +1,109 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
'''
|
||||
|
||||
'''
|
||||
|
||||
__version__ = '0.1'
|
||||
__author__ = 'Leigh Parry'
|
||||
|
||||
import re
|
||||
|
||||
# Literal replacements applied before the apostrophe rule. The curly double
# quotes must already be straight by then, because that rule looks at the
# character in front of the apostrophe.
_UNSMARTEN_BEFORE_APOSTROPHE = (
    (u'–', u'-'),       # en-dash
    (u'—', u'--'),      # em-dash
    (u'…', u'...'),     # ellipsis
    (u'“', u'"'),       # left double quote
    (u'”', u'"'),       # right double quote
    (u'″', u'"'),       # double prime
)

# A right single quote that follows a quote character or whitespace.
_UNSMARTEN_APOSTROPHE = re.compile(u'(["\'‘“]|\\s)’')

# Literal replacements applied after the apostrophe rule, in this order.
_UNSMARTEN_AFTER_APOSTROPHE = (
    (u'‘', u"'"),       # left single quote
    (u'’', u"'"),       # right single quote
    (u'′', u"'"),       # prime

    (u'¢', u'{c\\}'),   # cent
    (u'£', u'{L-}'),    # pound
    (u'¥', u'{Y=}'),    # yen
    (u'©', u'{(c)}'),   # copyright
    (u'®', u'{(r)}'),   # registered
    (u'¼', u'{1/4}'),   # quarter
    (u'½', u'{1/2}'),   # half
    (u'¾', u'{3/4}'),   # three-quarter
    (u'À', u'{A`}'),    # A-grave
    (u'Á', u"{A'}"),    # A-acute
    (u'Â', u'{A^}'),    # A-circumflex
    (u'Ã', u'{A~}'),    # A-tilde
    (u'Ä', u'{A"}'),    # A-umlaut
    (u'Å', u'{Ao}'),    # A-ring
    (u'Æ', u'{AE}'),    # AE
    (u'Ç', u'{C,}'),    # C-cedilla
    (u'È', u'{E`}'),    # E-grave
    (u'É', u"{E'}"),    # E-acute
    (u'Ê', u'{E^}'),    # E-circumflex
    (u'Ë', u'{E"}'),    # E-umlaut
    (u'Ì', u'{I`}'),    # I-grave
    (u'Í', u"{I'}"),    # I-acute
    (u'Î', u'{I^}'),    # I-circumflex
    (u'Ï', u'{I"}'),    # I-umlaut
    (u'Ð', u'{D-}'),    # ETH
    (u'Ñ', u'{N~}'),    # N-tilde
    (u'Ò', u'{O`}'),    # O-grave
    (u'Ó', u"{O'}"),    # O-acute
    (u'Ô', u'{O^}'),    # O-circumflex
    (u'Õ', u'{O~}'),    # O-tilde
    (u'Ö', u'{O"}'),    # O-umlaut
    (u'×', u'{x}'),     # dimension
    (u'Ø', u'{O/}'),    # O-slash
    (u'Ù', u'{U`}'),    # U-grave
    (u'Ú', u"{U'}"),    # U-acute
    (u'Û', u'{U^}'),    # U-circumflex
    (u'Ü', u'{U"}'),    # U-umlaut
    (u'Ý', u"{Y'}"),    # Y-acute
    (u'ß', u'{sz}'),    # sharp-s
    (u'à', u'{a`}'),    # a-grave
    (u'á', u"{a'}"),    # a-acute
    (u'â', u'{a^}'),    # a-circumflex
    (u'ã', u'{a~}'),    # a-tilde
    (u'ä', u'{a"}'),    # a-umlaut
    (u'å', u'{ao}'),    # a-ring
    (u'æ', u'{ae}'),    # ae
    (u'ç', u'{c,}'),    # c-cedilla
    (u'è', u'{e`}'),    # e-grave
    (u'é', u"{e'}"),    # e-acute
    (u'ê', u'{e^}'),    # e-circumflex
    (u'ë', u'{e"}'),    # e-umlaut
    (u'ì', u'{i`}'),    # i-grave
    (u'í', u"{i'}"),    # i-acute
    (u'î', u'{i^}'),    # i-circumflex
    (u'ï', u'{i"}'),    # i-umlaut
    (u'ð', u'{d-}'),    # eth
    (u'ñ', u'{n~}'),    # n-tilde
    (u'ò', u'{o`}'),    # o-grave
    (u'ó', u"{o'}"),    # o-acute
    (u'ô', u'{o^}'),    # o-circumflex
    (u'õ', u'{o~}'),    # o-tilde
    (u'ö', u'{o"}'),    # o-umlaut
    (u'ø', u'{o/}'),    # o-stroke
    (u'ù', u'{u`}'),    # u-grave
    (u'ú', u"{u'}"),    # u-acute
    (u'û', u'{u^}'),    # u-circumflex
    (u'ü', u'{u"}'),    # u-umlaut
    (u'ý', u"{y'}"),    # y-acute
    (u'ÿ', u'{y"}'),    # y-umlaut
    (u'Œ', u'{OE}'),    # OE
    (u'œ', u'{oe}'),    # oe
    (u'Ŝ', u'{S^}'),    # S-circumflex
    (u'Š', u'{S^}'),    # Scaron
    (u'ŝ', u'{s^}'),    # s-circumflex
    (u'š', u'{s^}'),    # scaron
    (u'•', u'{*}'),     # bullet
    (u'₣', u'{Fr}'),    # Franc
    (u'₤', u'{L=}'),    # Lira
    (u'₨', u'{Rs}'),    # Rupee
    (u'€', u'{C=}'),    # euro
    (u'™', u'{tm}'),    # trademark
    (u'♠', u'{spade}'),     # spade
    (u'♣', u'{club}'),      # club
    (u'♥', u'{heart}'),     # heart
    (u'♦', u'{diamond}'),   # diamond

    (u'\xa0', u'p. '),                  # blank paragraph
    (u'\n\n\n\n', u'\n\np. \n\n'),      # blank paragraph
    (u'\n \n', u'\n<br />\n'),          # blank paragraph - br tag
)


def unsmarten(txt):
    '''
    Replace typographic characters in ``txt`` with ASCII stand-ins.

    Dashes, ellipses and curly quotes become their plain ASCII forms;
    accented letters, currency signs and other symbols become brace
    codes such as ``{e'}`` or ``{tm}``. A right single quote that follows
    a quote character or whitespace becomes ``{'/}``. Finally,
    non-breaking spaces and runs of blank lines are turned into blank
    paragraph markers.

    Returns the converted text.
    '''
    # NOTE(review): the original patterns listed each character three
    # times (e.g. X|X|X), which in this view are identical. If the
    # upstream file spells the extra alternatives as HTML entity
    # references, add those spellings to the tables above.
    for old, new in _UNSMARTEN_BEFORE_APOSTROPHE:
        txt = txt.replace(old, new)
    txt = _UNSMARTEN_APOSTROPHE.sub(r"\1{'/}", txt)
    for old, new in _UNSMARTEN_AFTER_APOSTROPHE:
        txt = txt.replace(old, new)
    return txt
|
Loading…
x
Reference in New Issue
Block a user