mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
FB2 Output: Add cover to FB2 metadata. TXT Input: Support for textile markup
This commit is contained in:
commit
ff37f2e9fc
@ -16,7 +16,6 @@ import uuid
|
|||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre import guess_type
|
|
||||||
from calibre import prepare_string_for_xml
|
from calibre import prepare_string_for_xml
|
||||||
from calibre.constants import __appname__, __version__
|
from calibre.constants import __appname__, __version__
|
||||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
||||||
@ -41,7 +40,7 @@ class FB2MLizer(object):
|
|||||||
# in different directories. FB2 images are all in a flat layout so we rename all images
|
# in different directories. FB2 images are all in a flat layout so we rename all images
|
||||||
# into a sequential numbering system to ensure there are no collisions between image names.
|
# into a sequential numbering system to ensure there are no collisions between image names.
|
||||||
self.image_hrefs = {}
|
self.image_hrefs = {}
|
||||||
# Mapping of toc items and their
|
# Mapping of toc items and their
|
||||||
self.toc = {}
|
self.toc = {}
|
||||||
# Used to see whether a new <section> needs to be opened
|
# Used to see whether a new <section> needs to be opened
|
||||||
self.section_level = 0
|
self.section_level = 0
|
||||||
@ -51,7 +50,7 @@ class FB2MLizer(object):
|
|||||||
self.oeb_book = oeb_book
|
self.oeb_book = oeb_book
|
||||||
self.opts = opts
|
self.opts = opts
|
||||||
self.reset_state()
|
self.reset_state()
|
||||||
|
|
||||||
# Used for adding <section>s and <title>s to allow readers
|
# Used for adding <section>s and <title>s to allow readers
|
||||||
# to generate toc from the document.
|
# to generate toc from the document.
|
||||||
if self.opts.sectionize == 'toc':
|
if self.opts.sectionize == 'toc':
|
||||||
@ -75,20 +74,20 @@ class FB2MLizer(object):
|
|||||||
text = re.sub(r'(?miu)<p>\s*</p>', '', text)
|
text = re.sub(r'(?miu)<p>\s*</p>', '', text)
|
||||||
text = re.sub(r'(?miu)\s*</p>', '</p>', text)
|
text = re.sub(r'(?miu)\s*</p>', '</p>', text)
|
||||||
text = re.sub(r'(?miu)</p>\s*<p>', '</p>\n\n<p>', text)
|
text = re.sub(r'(?miu)</p>\s*<p>', '</p>\n\n<p>', text)
|
||||||
|
|
||||||
text = re.sub(r'(?miu)<title>\s*</title>', '', text)
|
text = re.sub(r'(?miu)<title>\s*</title>', '', text)
|
||||||
text = re.sub(r'(?miu)\s+</title>', '</title>', text)
|
text = re.sub(r'(?miu)\s+</title>', '</title>', text)
|
||||||
|
|
||||||
text = re.sub(r'(?miu)<section>\s*</section>', '', text)
|
text = re.sub(r'(?miu)<section>\s*</section>', '', text)
|
||||||
text = re.sub(r'(?miu)\s*</section>', '\n</section>', text)
|
text = re.sub(r'(?miu)\s*</section>', '\n</section>', text)
|
||||||
text = re.sub(r'(?miu)</section>\s*', '</section>\n\n', text)
|
text = re.sub(r'(?miu)</section>\s*', '</section>\n\n', text)
|
||||||
text = re.sub(r'(?miu)\s*<section>', '\n<section>', text)
|
text = re.sub(r'(?miu)\s*<section>', '\n<section>', text)
|
||||||
text = re.sub(r'(?miu)<section>\s*', '<section>\n', text)
|
text = re.sub(r'(?miu)<section>\s*', '<section>\n', text)
|
||||||
text = re.sub(r'(?miu)</section><section>', '</section>\n\n<section>', text)
|
text = re.sub(r'(?miu)</section><section>', '</section>\n\n<section>', text)
|
||||||
|
|
||||||
if self.opts.insert_blank_line:
|
if self.opts.insert_blank_line:
|
||||||
text = re.sub(r'(?miu)</p>', '</p><empty-line />', text)
|
text = re.sub(r'(?miu)</p>', '</p><empty-line />', text)
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def fb2_header(self):
|
def fb2_header(self):
|
||||||
@ -102,6 +101,7 @@ class FB2MLizer(object):
|
|||||||
metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
|
metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
|
||||||
metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en'
|
metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en'
|
||||||
metadata['id'] = None
|
metadata['id'] = None
|
||||||
|
metadata['cover'] = self.get_cover()
|
||||||
|
|
||||||
author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
|
author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
|
||||||
if len(author_parts) == 1:
|
if len(author_parts) == 1:
|
||||||
@ -121,10 +121,11 @@ class FB2MLizer(object):
|
|||||||
break
|
break
|
||||||
if metadata['id'] is None:
|
if metadata['id'] is None:
|
||||||
self.log.warn('No UUID identifier found')
|
self.log.warn('No UUID identifier found')
|
||||||
metadata['id'] = str(uuid.uuid4())
|
metadata['id'] = str(uuid.uuid4())
|
||||||
|
|
||||||
for key, value in metadata.items():
|
for key, value in metadata.items():
|
||||||
metadata[key] = prepare_string_for_xml(value)
|
if not key == 'cover':
|
||||||
|
metadata[key] = prepare_string_for_xml(value)
|
||||||
|
|
||||||
return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \
|
return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \
|
||||||
'<description>' \
|
'<description>' \
|
||||||
@ -136,6 +137,7 @@ class FB2MLizer(object):
|
|||||||
'<last-name>%(author_last)s</last-name>' \
|
'<last-name>%(author_last)s</last-name>' \
|
||||||
'</author>' \
|
'</author>' \
|
||||||
'<book-title>%(title)s</book-title>' \
|
'<book-title>%(title)s</book-title>' \
|
||||||
|
'%(cover)s' \
|
||||||
'<lang>%(lang)s</lang>' \
|
'<lang>%(lang)s</lang>' \
|
||||||
'</title-info>' \
|
'</title-info>' \
|
||||||
'<document-info>' \
|
'<document-info>' \
|
||||||
@ -154,48 +156,66 @@ class FB2MLizer(object):
|
|||||||
def fb2_footer(self):
|
def fb2_footer(self):
|
||||||
return u'</FictionBook>'
|
return u'</FictionBook>'
|
||||||
|
|
||||||
|
def get_cover(self):
    '''
    Build the <coverpage> element that goes into the FB2 title-info.

    The cover is looked up in two ways:

    1. If the metadata names a cover id that exists in the manifest, that
       item is used when its media type is a raster image.
    2. Otherwise the guide's 'titlepage' (preferred) or 'cover' entry is
       consulted and the first <img> found on that page is used.

    The chosen image is registered in self.image_hrefs (if it is not
    already there) so it gets the same flat name as any other image.

    Returns the <coverpage> markup as a unicode string, or u'' when no
    cover image could be found or the image is not in the manifest.
    '''
    book = self.oeb_book
    hrefs = book.manifest.hrefs
    cover_href = None

    # Get the raster cover if it's available.
    if book.metadata.cover and unicode(book.metadata.cover[0]) in book.manifest.ids:
        cover_id = unicode(book.metadata.cover[0])
        cover_item = book.manifest.ids[cover_id]
        if cover_item.media_type in OEB_RASTER_IMAGES:
            cover_href = cover_item.href
    else:
        # Figure out if we have a title page or a cover page
        page_name = ''
        if 'titlepage' in book.guide:
            page_name = 'titlepage'
        elif 'cover' in book.guide:
            page_name = 'cover'

        if page_name:
            page_href = book.guide[page_name].href
            # Skip a guide entry whose target is not in the manifest
            # instead of failing with a KeyError.
            if page_href in hrefs:
                cover_item = hrefs[page_href]
                # Get the first image in the page
                for img in cover_item.xpath('//img'):
                    cover_href = cover_item.abshref(img.get('src'))
                    break

    # Only write the image tag if it is in the manifest.
    if cover_href and cover_href in hrefs:
        if cover_href not in self.image_hrefs:
            self.image_hrefs[cover_href] = '_%s.jpg' % len(self.image_hrefs)
        return u'<coverpage><image xlink:href="#%s" /></coverpage>' % self.image_hrefs[cover_href]

    return u''
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
text = ['<body>']
|
text = ['<body>']
|
||||||
|
|
||||||
# Create main section if there are no others to create
|
# Create main section if there are no others to create
|
||||||
if self.opts.sectionize == 'nothing':
|
if self.opts.sectionize == 'nothing':
|
||||||
text.append('<section>')
|
text.append('<section>')
|
||||||
self.section_level += 1
|
self.section_level += 1
|
||||||
|
|
||||||
# Insert the title page / cover into the spine if it is not already referenced.
|
|
||||||
title_name = u''
|
|
||||||
if 'titlepage' in self.oeb_book.guide:
|
|
||||||
title_name = 'titlepage'
|
|
||||||
elif 'cover' in self.oeb_book.guide:
|
|
||||||
title_name = 'cover'
|
|
||||||
if title_name:
|
|
||||||
title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href]
|
|
||||||
if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml':
|
|
||||||
self.oeb_book.spine.insert(0, title_item, True)
|
|
||||||
# Create xhtml page to reference cover image so it can be used.
|
|
||||||
if not title_name and self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
|
|
||||||
id = unicode(self.oeb_book.metadata.cover[0])
|
|
||||||
cover_item = self.oeb_book.manifest.ids[id]
|
|
||||||
if cover_item.media_type in OEB_RASTER_IMAGES:
|
|
||||||
self.insert_image_cover(cover_item.href)
|
|
||||||
|
|
||||||
for item in self.oeb_book.spine:
|
for item in self.oeb_book.spine:
|
||||||
self.log.debug('Converting %s to FictionBook2 XML' % item.href)
|
self.log.debug('Converting %s to FictionBook2 XML' % item.href)
|
||||||
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
||||||
|
|
||||||
# Start a <section> if we must sectionize each file or if the TOC references this page
|
# Start a <section> if we must sectionize each file or if the TOC references this page
|
||||||
page_section_open = False
|
page_section_open = False
|
||||||
if self.opts.sectionize == 'files' or self.toc.get(item.href) == 'page':
|
if self.opts.sectionize == 'files' or self.toc.get(item.href) == 'page':
|
||||||
text.append('<section>')
|
text.append('<section>')
|
||||||
page_section_open = True
|
page_section_open = True
|
||||||
self.section_level += 1
|
self.section_level += 1
|
||||||
|
|
||||||
text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
|
text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
|
||||||
|
|
||||||
if page_section_open:
|
if page_section_open:
|
||||||
text.append('</section>')
|
text.append('</section>')
|
||||||
self.section_level -= 1
|
self.section_level -= 1
|
||||||
|
|
||||||
# Close any open sections
|
# Close any open sections
|
||||||
while self.section_level > 0:
|
while self.section_level > 0:
|
||||||
text.append('</section>')
|
text.append('</section>')
|
||||||
@ -203,17 +223,6 @@ class FB2MLizer(object):
|
|||||||
|
|
||||||
return ''.join(text) + '</body>'
|
return ''.join(text) + '</body>'
|
||||||
|
|
||||||
def insert_image_cover(self, image_href):
|
|
||||||
from calibre.ebooks.oeb.base import RECOVER_PARSER
|
|
||||||
try:
|
|
||||||
root = etree.fromstring(u'<html xmlns="%s"><body><img src="%s" /></body></html>' % (XHTML_NS, image_href), parser=RECOVER_PARSER)
|
|
||||||
except:
|
|
||||||
root = etree.fromstring(u'', parser=RECOVER_PARSER)
|
|
||||||
|
|
||||||
id, href = self.oeb_book.manifest.generate('fb2_cover', 'fb2_cover.xhtml')
|
|
||||||
item = self.oeb_book.manifest.add(id, href, guess_type(href)[0], data=root)
|
|
||||||
self.oeb_book.spine.insert(0, item, True)
|
|
||||||
|
|
||||||
def fb2mlize_images(self):
|
def fb2mlize_images(self):
|
||||||
'''
|
'''
|
||||||
This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function.
|
This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function.
|
||||||
@ -345,7 +354,7 @@ class FB2MLizer(object):
|
|||||||
self.toc[page.href] = None
|
self.toc[page.href] = None
|
||||||
elif toc_entry and elem_tree.attrib.get('id', None):
|
elif toc_entry and elem_tree.attrib.get('id', None):
|
||||||
newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None)
|
newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None)
|
||||||
|
|
||||||
# Start a new section if necessary
|
# Start a new section if necessary
|
||||||
if newlevel:
|
if newlevel:
|
||||||
if not (newlevel > self.section_level):
|
if not (newlevel > self.section_level):
|
||||||
|
6
src/calibre/ebooks/textile/__init__.py
Normal file
6
src/calibre/ebooks/textile/__init__.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
from functions import textile, textile_restricted, Textile
|
||||||
|
|
||||||
|
if False:
|
||||||
|
textile, textile_restricted, Textile
|
||||||
|
|
||||||
|
__all__ = ['textile', 'textile_restricted']
|
981
src/calibre/ebooks/textile/functions.py
Normal file
981
src/calibre/ebooks/textile/functions.py
Normal file
@ -0,0 +1,981 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
"""
|
||||||
|
PyTextile
|
||||||
|
|
||||||
|
A Humane Web Text Generator
|
||||||
|
"""
|
||||||
|
|
||||||
|
__version__ = '2.1.4'
|
||||||
|
|
||||||
|
__date__ = '2009/12/04'
|
||||||
|
|
||||||
|
__copyright__ = """
|
||||||
|
Copyright (c) 2009, Jason Samsa, http://jsamsa.com/
|
||||||
|
Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/
|
||||||
|
Copyright (c) 2003, Mark Pilgrim, http://diveintomark.org/
|
||||||
|
|
||||||
|
Original PHP Version:
|
||||||
|
Copyright (c) 2003-2004, Dean Allen <dean@textism.com>
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Thanks to Carlo Zottmann <carlo@g-blog.net> for refactoring
|
||||||
|
Textile's procedural code into a class framework
|
||||||
|
|
||||||
|
Additions and fixes Copyright (c) 2006 Alex Shiels http://thresholdstate.com/
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
__license__ = """
|
||||||
|
L I C E N S E
|
||||||
|
=============
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name Textile nor the names of its contributors may be used to
|
||||||
|
endorse or promote products derived from this software without specific
|
||||||
|
prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from urlparse import urlparse
|
||||||
|
|
||||||
|
def _normalize_newlines(string):
|
||||||
|
out = re.sub(r'\r\n', '\n', string)
|
||||||
|
out = re.sub(r'\n{3,}', '\n\n', out)
|
||||||
|
out = re.sub(r'\n\s*\n', '\n\n', out)
|
||||||
|
out = re.sub(r'"$', '" ', out)
|
||||||
|
return out
|
||||||
|
|
||||||
|
def getimagesize(url):
    """
    Attempts to determine an image's width and height, and returns a string
    suitable for use in an <img> tag, or None in case of failure.
    Requires that PIL is installed.

    The image is downloaded in 1024-byte chunks and parsing stops as soon
    as the header has been decoded. None is returned when the required
    modules cannot be imported, when the download or parse raises IOError
    or ValueError, or when the data ends before an image is recognised.

    >>> getimagesize("http://www.google.com/intl/en_ALL/images/logo.gif")
    ... #doctest: +ELLIPSIS, +SKIP
    'width="..." height="..."'

    """

    try:
        import ImageFile
        import urllib2
    except ImportError:
        return None

    try:
        f = urllib2.urlopen(url)
    except (IOError, ValueError):
        return None

    try:
        p = ImageFile.Parser()
        while True:
            s = f.read(1024)
            if not s:
                break
            p.feed(s)
            if p.image:
                return 'width="%i" height="%i"' % p.image.size
    except (IOError, ValueError):
        return None
    finally:
        # Always release the connection, including on the early return.
        f.close()

    # Data ended before an image could be identified.
    return None
|
||||||
|
|
||||||
|
class Textile(object):
|
||||||
|
hlgn = r'(?:\<(?!>)|(?<!<)\>|\<\>|\=|[()]+(?! ))'
|
||||||
|
vlgn = r'[\-^~]'
|
||||||
|
clas = r'(?:\([^)]+\))'
|
||||||
|
lnge = r'(?:\[[^\]]+\])'
|
||||||
|
styl = r'(?:\{[^}]+\})'
|
||||||
|
cspn = r'(?:\\\d+)'
|
||||||
|
rspn = r'(?:\/\d+)'
|
||||||
|
a = r'(?:%s|%s)*' % (hlgn, vlgn)
|
||||||
|
s = r'(?:%s|%s)*' % (cspn, rspn)
|
||||||
|
c = r'(?:%s)*' % '|'.join([clas, styl, lnge, hlgn])
|
||||||
|
|
||||||
|
pnct = r'[-!"#$%&()*+,/:;<=>?@\'\[\\\]\.^_`{|}~]'
|
||||||
|
# urlch = r'[\w"$\-_.+!*\'(),";/?:@=&%#{}|\\^~\[\]`]'
|
||||||
|
urlch = '[\w"$\-_.+*\'(),";\/?:@=&%#{}|\\^~\[\]`]'
|
||||||
|
|
||||||
|
url_schemes = ('http', 'https', 'ftp', 'mailto')
|
||||||
|
|
||||||
|
btag = ('bq', 'bc', 'notextile', 'pre', 'h[1-6]', 'fn\d+', 'p')
|
||||||
|
btag_lite = ('bq', 'bc', 'p')
|
||||||
|
|
||||||
|
glyph_defaults = (
|
||||||
|
('txt_quote_single_open', '‘'),
|
||||||
|
('txt_quote_single_close', '’'),
|
||||||
|
('txt_quote_double_open', '“'),
|
||||||
|
('txt_quote_double_close', '”'),
|
||||||
|
('txt_apostrophe', '’'),
|
||||||
|
('txt_prime', '′'),
|
||||||
|
('txt_prime_double', '″'),
|
||||||
|
('txt_ellipsis', '…'),
|
||||||
|
('txt_emdash', '—'),
|
||||||
|
('txt_endash', '–'),
|
||||||
|
('txt_dimension', '×'),
|
||||||
|
('txt_trademark', '™'),
|
||||||
|
('txt_registered', '®'),
|
||||||
|
('txt_copyright', '©'),
|
||||||
|
)
|
||||||
|
|
||||||
|
def __init__(self, restricted=False, lite=False, noimage=False):
|
||||||
|
"""docstring for __init__"""
|
||||||
|
self.restricted = restricted
|
||||||
|
self.lite = lite
|
||||||
|
self.noimage = noimage
|
||||||
|
self.get_sizes = False
|
||||||
|
self.fn = {}
|
||||||
|
self.urlrefs = {}
|
||||||
|
self.shelf = {}
|
||||||
|
self.rel = ''
|
||||||
|
self.html_type = 'xhtml'
|
||||||
|
|
||||||
|
def textile(self, text, rel=None, head_offset=0, html_type='xhtml'):
    """
    Convert Textile markup in *text* and return the resulting markup.

    html_type is remembered on the instance for the block-level helpers.
    A truthy *rel* is stored as a ready-made ' rel="..."' attribute string.
    head_offset is coerced to int and passed on to block().

    >>> import textile
    >>> textile.textile('some textile')
    u'\\t<p>some textile</p>'
    """
    self.html_type = html_type

    source = _normalize_newlines(text)

    # In restricted mode the input is HTML-encoded before any processing
    if self.restricted:
        source = self.encode_html(source, quotes=False)

    if rel:
        self.rel = ' rel="%s"' % rel

    # Pipeline: collect references, convert blocks, restore shelved text
    with_refs = self.getRefs(source)
    converted = self.block(with_refs, int(head_offset))
    return self.retrieve(converted)
|
||||||
|
|
||||||
|
def pba(self, input, element=None):
|
||||||
|
"""
|
||||||
|
Parse block attributes.
|
||||||
|
|
||||||
|
>>> t = Textile()
|
||||||
|
>>> t.pba(r'\3')
|
||||||
|
''
|
||||||
|
>>> t.pba(r'\\3', element='td')
|
||||||
|
' colspan="3"'
|
||||||
|
>>> t.pba(r'/4', element='td')
|
||||||
|
' rowspan="4"'
|
||||||
|
>>> t.pba(r'\\3/4', element='td')
|
||||||
|
' colspan="3" rowspan="4"'
|
||||||
|
|
||||||
|
>>> t.vAlign('^')
|
||||||
|
'top'
|
||||||
|
|
||||||
|
>>> t.pba('^', element='td')
|
||||||
|
' style="vertical-align:top;"'
|
||||||
|
|
||||||
|
>>> t.pba('{line-height:18px}')
|
||||||
|
' style="line-height:18px;"'
|
||||||
|
|
||||||
|
>>> t.pba('(foo-bar)')
|
||||||
|
' class="foo-bar"'
|
||||||
|
|
||||||
|
>>> t.pba('(#myid)')
|
||||||
|
' id="myid"'
|
||||||
|
|
||||||
|
>>> t.pba('(foo-bar#myid)')
|
||||||
|
' class="foo-bar" id="myid"'
|
||||||
|
|
||||||
|
>>> t.pba('((((')
|
||||||
|
' style="padding-left:4em;"'
|
||||||
|
|
||||||
|
>>> t.pba(')))')
|
||||||
|
' style="padding-right:3em;"'
|
||||||
|
|
||||||
|
>>> t.pba('[fr]')
|
||||||
|
' lang="fr"'
|
||||||
|
|
||||||
|
"""
|
||||||
|
style = []
|
||||||
|
aclass = ''
|
||||||
|
lang = ''
|
||||||
|
colspan = ''
|
||||||
|
rowspan = ''
|
||||||
|
id = ''
|
||||||
|
|
||||||
|
if not input:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
matched = input
|
||||||
|
if element == 'td':
|
||||||
|
m = re.search(r'\\(\d+)', matched)
|
||||||
|
if m:
|
||||||
|
colspan = m.group(1)
|
||||||
|
|
||||||
|
m = re.search(r'/(\d+)', matched)
|
||||||
|
if m:
|
||||||
|
rowspan = m.group(1)
|
||||||
|
|
||||||
|
if element == 'td' or element == 'tr':
|
||||||
|
m = re.search(r'(%s)' % self.vlgn, matched)
|
||||||
|
if m:
|
||||||
|
style.append("vertical-align:%s;" % self.vAlign(m.group(1)))
|
||||||
|
|
||||||
|
m = re.search(r'\{([^}]*)\}', matched)
|
||||||
|
if m:
|
||||||
|
style.append(m.group(1).rstrip(';') + ';')
|
||||||
|
matched = matched.replace(m.group(0), '')
|
||||||
|
|
||||||
|
m = re.search(r'\[([^\]]+)\]', matched, re.U)
|
||||||
|
if m:
|
||||||
|
lang = m.group(1)
|
||||||
|
matched = matched.replace(m.group(0), '')
|
||||||
|
|
||||||
|
m = re.search(r'\(([^()]+)\)', matched, re.U)
|
||||||
|
if m:
|
||||||
|
aclass = m.group(1)
|
||||||
|
matched = matched.replace(m.group(0), '')
|
||||||
|
|
||||||
|
m = re.search(r'([(]+)', matched)
|
||||||
|
if m:
|
||||||
|
style.append("padding-left:%sem;" % len(m.group(1)))
|
||||||
|
matched = matched.replace(m.group(0), '')
|
||||||
|
|
||||||
|
m = re.search(r'([)]+)', matched)
|
||||||
|
if m:
|
||||||
|
style.append("padding-right:%sem;" % len(m.group(1)))
|
||||||
|
matched = matched.replace(m.group(0), '')
|
||||||
|
|
||||||
|
m = re.search(r'(%s)' % self.hlgn, matched)
|
||||||
|
if m:
|
||||||
|
style.append("text-align:%s;" % self.hAlign(m.group(1)))
|
||||||
|
|
||||||
|
m = re.search(r'^(.*)#(.*)$', aclass)
|
||||||
|
if m:
|
||||||
|
id = m.group(2)
|
||||||
|
aclass = m.group(1)
|
||||||
|
|
||||||
|
if self.restricted:
|
||||||
|
if lang:
|
||||||
|
return ' lang="%s"'
|
||||||
|
else:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
result = []
|
||||||
|
if style:
|
||||||
|
result.append(' style="%s"' % "".join(style))
|
||||||
|
if aclass:
|
||||||
|
result.append(' class="%s"' % aclass)
|
||||||
|
if lang:
|
||||||
|
result.append(' lang="%s"' % lang)
|
||||||
|
if id:
|
||||||
|
result.append(' id="%s"' % id)
|
||||||
|
if colspan:
|
||||||
|
result.append(' colspan="%s"' % colspan)
|
||||||
|
if rowspan:
|
||||||
|
result.append(' rowspan="%s"' % rowspan)
|
||||||
|
return ''.join(result)
|
||||||
|
|
||||||
|
def hasRawText(self, text):
    """
    checks whether the text has text not already enclosed by a block tag

    The stripped text has every <p>, <blockquote>, <div>, <form>, <table>,
    <ul>, <ol>, <pre> and <hN> element removed, then self-closing <hr/>
    and <br/> tags; anything left over counts as raw text.

    >>> t = Textile()
    >>> t.hasRawText('<p>foo bar biz baz</p>')
    False

    >>> t.hasRawText(' why yes, yes it does')
    True

    """
    block_element = re.compile(r'<(p|blockquote|div|form|table|ul|ol|pre|h\d)[^>]*?>.*</\1>', re.S)
    void_element = re.compile(r'<(hr|br)[^>]*?/>')

    leftover = block_element.sub('', text.strip()).strip()
    leftover = void_element.sub('', leftover)
    return leftover != ''
|
||||||
|
|
||||||
|
def table(self, text):
|
||||||
|
r"""
|
||||||
|
>>> t = Textile()
|
||||||
|
>>> t.table('|one|two|three|\n|a|b|c|')
|
||||||
|
'\t<table>\n\t\t<tr>\n\t\t\t<td>one</td>\n\t\t\t<td>two</td>\n\t\t\t<td>three</td>\n\t\t</tr>\n\t\t<tr>\n\t\t\t<td>a</td>\n\t\t\t<td>b</td>\n\t\t\t<td>c</td>\n\t\t</tr>\n\t</table>\n\n'
|
||||||
|
"""
|
||||||
|
text = text + "\n\n"
|
||||||
|
pattern = re.compile(r'^(?:table(_?%(s)s%(a)s%(c)s)\. ?\n)?^(%(a)s%(c)s\.? ?\|.*\|)\n\n' % {'s':self.s, 'a':self.a, 'c':self.c}, re.S|re.M|re.U)
|
||||||
|
return pattern.sub(self.fTable, text)
|
||||||
|
|
||||||
|
def fTable(self, match):
|
||||||
|
tatts = self.pba(match.group(1), 'table')
|
||||||
|
rows = []
|
||||||
|
for row in [ x for x in match.group(2).split('\n') if x]:
|
||||||
|
rmtch = re.search(r'^(%s%s\. )(.*)' % (self.a, self.c), row.lstrip())
|
||||||
|
if rmtch:
|
||||||
|
ratts = self.pba(rmtch.group(1), 'tr')
|
||||||
|
row = rmtch.group(2)
|
||||||
|
else:
|
||||||
|
ratts = ''
|
||||||
|
|
||||||
|
cells = []
|
||||||
|
for cell in row.split('|')[1:-1]:
|
||||||
|
ctyp = 'd'
|
||||||
|
if re.search(r'^_', cell):
|
||||||
|
ctyp = "h"
|
||||||
|
cmtch = re.search(r'^(_?%s%s%s\. )(.*)' % (self.s, self.a, self.c), cell)
|
||||||
|
if cmtch:
|
||||||
|
catts = self.pba(cmtch.group(1), 'td')
|
||||||
|
cell = cmtch.group(2)
|
||||||
|
else:
|
||||||
|
catts = ''
|
||||||
|
|
||||||
|
cell = self.graf(self.span(cell))
|
||||||
|
cells.append('\t\t\t<t%s%s>%s</t%s>' % (ctyp, catts, cell, ctyp))
|
||||||
|
rows.append("\t\t<tr%s>\n%s\n\t\t</tr>" % (ratts, '\n'.join(cells)))
|
||||||
|
cells = []
|
||||||
|
catts = None
|
||||||
|
return "\t<table%s>\n%s\n\t</table>\n\n" % (tatts, '\n'.join(rows))
|
||||||
|
|
||||||
|
def lists(self, text):
|
||||||
|
"""
|
||||||
|
>>> t = Textile()
|
||||||
|
>>> t.lists("* one\\n* two\\n* three")
|
||||||
|
'\\t<ul>\\n\\t\\t<li>one</li>\\n\\t\\t<li>two</li>\\n\\t\\t<li>three</li>\\n\\t</ul>'
|
||||||
|
"""
|
||||||
|
pattern = re.compile(r'^([#*]+%s .*)$(?![^#*])' % self.c, re.U|re.M|re.S)
|
||||||
|
return pattern.sub(self.fList, text)
|
||||||
|
|
||||||
|
def fList(self, match):
|
||||||
|
text = match.group(0).split("\n")
|
||||||
|
result = []
|
||||||
|
lists = []
|
||||||
|
for i, line in enumerate(text):
|
||||||
|
try:
|
||||||
|
nextline = text[i+1]
|
||||||
|
except IndexError:
|
||||||
|
nextline = ''
|
||||||
|
|
||||||
|
m = re.search(r"^([#*]+)(%s%s) (.*)$" % (self.a, self.c), line, re.S)
|
||||||
|
if m:
|
||||||
|
tl, atts, content = m.groups()
|
||||||
|
nl = ''
|
||||||
|
nm = re.search(r'^([#*]+)\s.*', nextline)
|
||||||
|
if nm:
|
||||||
|
nl = nm.group(1)
|
||||||
|
if tl not in lists:
|
||||||
|
lists.append(tl)
|
||||||
|
atts = self.pba(atts)
|
||||||
|
line = "\t<%sl%s>\n\t\t<li>%s" % (self.lT(tl), atts, self.graf(content))
|
||||||
|
else:
|
||||||
|
line = "\t\t<li>" + self.graf(content)
|
||||||
|
|
||||||
|
if len(nl) <= len(tl):
|
||||||
|
line = line + "</li>"
|
||||||
|
for k in reversed(lists):
|
||||||
|
if len(k) > len(nl):
|
||||||
|
line = line + "\n\t</%sl>" % self.lT(k)
|
||||||
|
if len(k) > 1:
|
||||||
|
line = line + "</li>"
|
||||||
|
lists.remove(k)
|
||||||
|
|
||||||
|
result.append(line)
|
||||||
|
return "\n".join(result)
|
||||||
|
|
||||||
|
def lT(self, input):
    """
    Return the list-type letter for a marker run: 'o' (ordered) when it
    starts with "#", otherwise 'u' (unordered).
    """
    return 'o' if re.search(r'^#+', input) else 'u'
|
||||||
|
|
||||||
|
def doPBr(self, in_):
|
||||||
|
return re.compile(r'<(p)([^>]*?)>(.*)(</\1>)', re.S).sub(self.doBr, in_)
|
||||||
|
|
||||||
|
def doBr(self, match):
|
||||||
|
if self.html_type == 'html':
|
||||||
|
content = re.sub(r'(.+)(?:(?<!<br>)|(?<!<br />))\n(?![#*\s|])', '\\1<br>', match.group(3))
|
||||||
|
else:
|
||||||
|
content = re.sub(r'(.+)(?:(?<!<br>)|(?<!<br />))\n(?![#*\s|])', '\\1<br />', match.group(3))
|
||||||
|
return '<%s%s>%s%s' % (match.group(1), match.group(2), content, match.group(4))
|
||||||
|
|
||||||
|
def block(self, text, head_offset = 0):
    """
    Convert the blank-line separated blocks of *text* and return them
    re-joined with blank lines.

    A block that starts with a signature such as "h1." or "bq." (the lite
    tag set is used when self.lite is set) is rendered through fBlock.
    Heading levels are shifted by *head_offset* and clamped to 1..6. A
    signature ending in ".." opens an extended block: following unsigned
    blocks are appended to it until the next signature appears.

    >>> t = Textile()
    >>> t.block('h1. foobar baby')
    '\\t<h1>foobar baby</h1>'
    """
    tags = self.btag_lite if self.lite else self.btag
    tre = '|'.join(tags)
    signature = re.compile(r'^(%s)(%s%s)\.(\.?)(?::(\S+))? (.*)$' % (tre, self.a, self.c), re.S)

    tag = 'p'
    atts = cite = graf = ext = c1 = ''
    out = []
    anon = False

    for line in text.split('\n\n'):
        match = signature.search(line)
        if match:
            # A previous extended block ends here; give it its closer
            if ext:
                out.append(out.pop() + c1)

            tag, atts, ext, cite, graf = match.groups()
            h_match = re.search(r'h([1-6])', tag)
            if h_match:
                head_level, = h_match.groups()
                shifted = int(head_level) + head_offset
                tag = 'h%i' % max(1, min(shifted, 6))
            o1, o2, content, c2, c1 = self.fBlock(tag, atts, ext, cite, graf)

            if ext:
                # leave off c1 if this block is extended,
                # we'll close it at the start of the next block
                line = "%s%s%s%s" % (o1, o2, content, c2)
            else:
                line = "%s%s%s%s%s" % (o1, o2, content, c2, c1)
        else:
            anon = True
            if ext or not re.search(r'^\s', line):
                o1, o2, content, c2, c1 = self.fBlock(tag, atts, ext, cite, line)
                # skip o1/c1 because this is part of a continuing
                # extended block
                if tag == 'p' and not self.hasRawText(content):
                    line = content
                else:
                    line = "%s%s%s" % (o2, content, c2)
            else:
                line = self.graf(line)

        line = self.doPBr(line)
        if self.html_type == 'xhtml':
            line = re.sub(r'<br>', '<br />', line)

        if ext and anon:
            out.append(out.pop() + "\n" + line)
        else:
            out.append(line)

        # Outside an extended block the next block starts from defaults
        if not ext:
            tag, atts, cite, graf = 'p', '', '', ''

    if ext:
        out.append(out.pop() + c1)
    return '\n\n'.join(out)
|
||||||
|
|
||||||
|
def fBlock(self, tag, atts, ext, cite, content):
    """
    Build the pieces of one block and return them as the tuple
    (outer open, inner open, content, inner close, outer close).

    *atts* is parsed with pba(). Footnote tags (fnN) become <p> with an
    id and, when no class was given, class "footnote". "bq", "bc",
    "notextile" and "pre" get their special wrappers; any other tag is
    emitted as a plain element. The content always goes through graf()
    last.

    >>> t = Textile()
    >>> t.fBlock("bq", "", None, "", "Hello BlockQuote")
    ('\\t<blockquote>\\n', '\\t\\t<p>', 'Hello BlockQuote', '</p>', '\\n\\t</blockquote>')

    >>> t.fBlock("bq", "", None, "http://google.com", "Hello BlockQuote")
    ('\\t<blockquote cite="http://google.com">\\n', '\\t\\t<p>', 'Hello BlockQuote', '</p>', '\\n\\t</blockquote>')

    >>> t.fBlock("bc", "", None, "", 'printf "Hello, World";') # doctest: +ELLIPSIS
    ('<pre>', '<code>', ..., '</code>', '</pre>')

    >>> t.fBlock("h1", "", None, "", "foobar")
    ('', '\\t<h1>', 'foobar', '</h1>', '')
    """
    atts = self.pba(atts)
    o1 = o2 = c2 = c1 = ''

    footnote = re.search(r'fn(\d+)', tag)
    if footnote:
        number = footnote.group(1)
        tag = 'p'
        # Use the id recorded for this footnote number, if there is one
        fnid = self.fn[number] if number in self.fn else number
        atts = atts + ' id="fn%s"' % fnid
        if 'class=' not in atts:
            atts = atts + ' class="footnote"'
        content = ('<sup>%s</sup>' % number) + content

    if tag == 'bq':
        cite = self.checkRefs(cite)
        cite = ' cite="%s"' % cite if cite else ''
        o1 = "\t<blockquote%s%s>\n" % (cite, atts)
        o2 = "\t\t<p%s>" % atts
        c2 = "</p>"
        c1 = "\n\t</blockquote>"

    elif tag == 'bc':
        o1, o2 = "<pre%s>" % atts, "<code%s>" % atts
        c2, c1 = "</code>", "</pre>"
        content = self.shelve(self.encode_html(content.rstrip("\n") + "\n"))

    elif tag == 'notextile':
        # No wrapper at all: o1, o2, c2 and c1 stay empty
        content = self.shelve(content)

    elif tag == 'pre':
        content = self.shelve(self.encode_html(content.rstrip("\n") + "\n"))
        o1 = "<pre%s>" % atts
        c1 = '</pre>'

    else:
        o2 = "\t<%s%s>" % (tag, atts)
        c2 = "</%s>" % tag

    content = self.graf(content)
    return o1, o2, content, c2, c1
|
||||||
|
|
||||||
|
def footnoteRef(self, text):
    """
    Turn inline footnote markers such as ``[1]`` into superscript links.

    A marker is only recognised when it directly follows a word
    character; the replacement markup for each marker is produced by
    ``self.footnoteID``.

    >>> t = Textile()
    >>> t.footnoteRef('foo[1] ') # doctest: +ELLIPSIS
    'foo<sup class="footnote"><a href="#fn...">1</a></sup> '
    """
    marker = re.compile(r'\b\[([0-9]+)\](\s)?')
    return marker.sub(self.footnoteID, text)
|
||||||
|
|
||||||
|
def footnoteID(self, match):
    """
    Build the superscript link for one marker matched by footnoteRef.

    The first group of ``match`` is the footnote number and the second
    an optional trailing whitespace character, which is re-emitted
    after the link. The first time a number is seen it is mapped to a
    fresh UUID string in ``self.fn``; that string becomes the link
    target (``#fn<uuid>``).
    """
    number, trailing = match.groups()
    if number in self.fn:
        anchor = self.fn[number]
    else:
        anchor = str(uuid.uuid4())
        self.fn[number] = anchor
    return '<sup class="footnote"><a href="#fn%s">%s</a></sup>%s' % (
        anchor, number, trailing or '')
|
||||||
|
|
||||||
|
def glyphs(self, text):
    """
    Replace plain-text punctuation with typographic glyphs.

    The text is split on HTML tags; tag segments are passed through
    untouched and every other segment is run through an ordered list
    of search/replace rules. Replacement glyphs come from
    ``self.glyph_defaults`` and ``self.pnct`` supplies the punctuation
    class used in the closing-quote lookaheads.

    The expected values below assume glyph_defaults holds numeric
    character references -- TODO confirm against the class definition.

    >>> t = Textile()

    >>> t.glyphs("apostrophe's")
    'apostrophe&#8217;s'

    >>> t.glyphs("back in '88")
    'back in &#8217;88'

    >>> t.glyphs('foo ...')
    'foo &#8230;'

    >>> t.glyphs('--')
    '&#8212;'

    >>> t.glyphs('FooBar[tm]')
    'FooBar&#8482;'

    >>> t.glyphs("<p><cite>Cat's Cradle</cite> by Vonnegut</p>")
    '<p><cite>Cat&#8217;s Cradle</cite> by Vonnegut</p>'

    """
    # fix: hackish -- give a trailing double quote something to close against
    text = re.sub(r'"\Z', '\" ', text)

    # Order matters: apostrophes and closing quotes are consumed first so
    # that any quote character still left over is an opening quote.
    glyph_search = (
        re.compile(r"(\w)\'(\w)"),                                  # apostrophe's
        re.compile(r'(\s)\'(\d+\w?)\b(?!\')'),                      # back in '88
        re.compile(r'(\S)\'(?=\s|' + self.pnct + '|<|$)'),          # single closing
        # Previously r'\'/' (apostrophe followed by a slash), which never
        # matched an ordinary opening quote; mirror the double-opening rule.
        re.compile(r"'"),                                           # single opening
        re.compile(r'(\S)\"(?=\s|' + self.pnct + '|<|$)'),          # double closing
        re.compile(r'"'),                                           # double opening
        re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'),    # 3+ uppercase acronym
        re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'),       # 3+ uppercase
        re.compile(r'\b(\s{0,1})?\.{3}'),                           # ellipsis
        re.compile(r'(\s?)--(\s?)'),                                # em dash
        re.compile(r'\s-(?:\s|$)'),                                 # en dash
        re.compile(r'(\d+)( ?)x( ?)(?=\d+)'),                       # dimension sign
        re.compile(r'\b ?[([]TM[])]', re.I),                        # trademark
        re.compile(r'\b ?[([]R[])]', re.I),                         # registered
        re.compile(r'\b ?[([]C[])]', re.I),                         # copyright
    )

    glyph_values = dict(self.glyph_defaults)
    glyph_replace = [template % glyph_values for template in (
        r'\1%(txt_apostrophe)s\2',              # apostrophe's
        r'\1%(txt_apostrophe)s\2',              # back in '88
        r'\1%(txt_quote_single_close)s',        # single closing
        r'%(txt_quote_single_open)s',           # single opening
        r'\1%(txt_quote_double_close)s',        # double closing
        r'%(txt_quote_double_open)s',           # double opening
        r'<acronym title="\2">\1</acronym>',    # 3+ uppercase acronym
        r'<span class="caps">\1</span>',        # 3+ uppercase
        r'\1%(txt_ellipsis)s',                  # ellipsis
        r'\1%(txt_emdash)s\2',                  # em dash
        r' %(txt_endash)s ',                    # en dash
        r'\1\2%(txt_dimension)s\3',             # dimension sign
        r'%(txt_trademark)s',                   # trademark
        r'%(txt_registered)s',                  # registered
        r'%(txt_copyright)s',                   # copyright
    )]

    result = []
    for line in re.compile(r'(<.*?>)', re.U).split(text):
        # Segments that contain a tag are left exactly as they are.
        if not re.search(r'<.*>', line):
            for search, replacement in zip(glyph_search, glyph_replace):
                line = search.sub(replacement, line)
        result.append(line)
    return ''.join(result)
|
||||||
|
|
||||||
|
def vAlign(self, input):
    """
    Translate a vertical-alignment symbol ('^', '-' or '~') into
    'top', 'middle' or 'bottom'. Any other symbol gives ''.
    """
    try:
        return {'^': 'top', '-': 'middle', '~': 'bottom'}[input]
    except KeyError:
        return ''
|
||||||
|
|
||||||
|
def hAlign(self, input):
    """
    Translate a horizontal-alignment symbol ('<', '=', '>' or '<>')
    into 'left', 'center', 'right' or 'justify'. Any other symbol
    gives ''.
    """
    try:
        return {
            '<': 'left',
            '=': 'center',
            '>': 'right',
            '<>': 'justify',
        }[input]
    except KeyError:
        return ''
|
||||||
|
|
||||||
|
def getRefs(self, text):
    """
    Strip URL alias definitions out of the text.

    A definition looks like ``[name]http://...`` (or ``[name]/path``),
    must start at the beginning of the text or after whitespace, and
    must be followed by whitespace or the end of the text. Each match
    is handed to ``self.refs`` and replaced by whatever it returns.
    """
    definition = re.compile(
        r'(?:(?<=^)|(?<=\s))\[(.+)\]((?:http(?:s?):\/\/|\/)\S+)(?=\s|$)',
        re.U)
    return definition.sub(self.refs, text)
|
||||||
|
|
||||||
|
def refs(self, match):
    """
    Record one alias definition found by getRefs.

    The two groups of ``match`` are the alias name and its URL; the
    pair is stored in ``self.urlrefs`` and an empty string is returned
    so that the definition disappears from the text.
    """
    name, target = match.groups()
    self.urlrefs[name] = target
    return ''
|
||||||
|
|
||||||
|
def checkRefs(self, url):
    """
    Resolve ``url`` through ``self.urlrefs``: return the stored target
    when ``url`` is a known alias, otherwise return ``url`` unchanged.
    """
    if url in self.urlrefs:
        return self.urlrefs[url]
    return url
|
||||||
|
|
||||||
|
def isRelURL(self, url):
    """
    Report whether ``url`` is relative, i.e. parses to neither a
    scheme nor a network location.

    >>> t = Textile()
    >>> t.isRelURL("http://www.google.com/")
    False
    >>> t.isRelURL("/foo")
    True

    """
    parts = urlparse(url)
    return not (parts[0] or parts[1])
|
||||||
|
|
||||||
|
def relURL(self, url):
    """
    Return ``url``, or '#' when ``self.restricted`` is set and the URL
    carries a scheme that is not listed in ``self.url_schemes``.
    """
    scheme = urlparse(url)[0]
    blocked = self.restricted and scheme and scheme not in self.url_schemes
    return '#' if blocked else url
|
||||||
|
|
||||||
|
def shelve(self, text):
    """
    Store ``text`` in ``self.shelf`` under a freshly generated UUID
    string and return that string as the placeholder for it.
    """
    placeholder = str(uuid.uuid4())
    self.shelf[placeholder] = text
    return placeholder
|
||||||
|
|
||||||
|
def retrieve(self, text):
    """
    Expand every shelved placeholder in ``text`` back into its content.

    Passes over ``self.shelf`` are repeated until one leaves the text
    unchanged, so placeholders nested inside shelved content are
    expanded as well.

    >>> t = Textile()
    >>> id = t.shelve("foobar")
    >>> t.retrieve(id)
    'foobar'
    """
    changed = True
    while changed:
        before = text
        for placeholder, stored in self.shelf.items():
            text = text.replace(placeholder, stored)
        changed = text != before
    return text
|
||||||
|
|
||||||
|
def encode_html(self, text, quotes=True):
    """
    Escape HTML-significant characters as numeric character references.

    '&', '<' and '>' are always escaped; single and double quotes are
    escaped too unless ``quotes`` is false. Returns the escaped text.
    """
    # '&' must be handled first, otherwise the ampersands introduced by
    # the later replacements would themselves be escaped.
    replacements = [
        ('&', '&#38;'),
        ('<', '&#60;'),
        ('>', '&#62;'),
    ]
    if quotes:
        replacements.extend([
            ("'", '&#39;'),
            ('"', '&#34;'),
        ])

    for char, entity in replacements:
        text = text.replace(char, entity)
    return text
|
||||||
|
|
||||||
|
def graf(self, text):
    """
    Apply the inline formatting passes to one block of text.

    Order of the passes: notextile and code (skipped when ``self.lite``
    is set), links, images (skipped when ``self.noimage`` is set),
    lists and tables (skipped when ``self.lite`` is set), then spans,
    footnote references and glyphs. Trailing newlines are stripped
    from the result.
    """
    if not self.lite:
        for step in (self.noTextile, self.code):
            text = step(text)

    text = self.links(text)

    if not self.noimage:
        text = self.image(text)

    if not self.lite:
        for step in (self.lists, self.table):
            text = step(text)

    for step in (self.span, self.footnoteRef, self.glyphs):
        text = step(text)

    return text.rstrip('\n')
|
||||||
|
|
||||||
|
def links(self, text):
    """
    Find Textile links of the form ``"text (title)":url`` and replace
    each one with the result of ``self.fLink``.

    >>> t = Textile()
    >>> t.links('fooobar "Google":http://google.com/foobar/ and hello world "flickr":http://flickr.com/photos/jsamsa/ ') # doctest: +ELLIPSIS
    'fooobar ... and hello world ...'
    """
    punct = '!"#$%&\'*+,-./:;=?@\\^_`|~'

    # Verbose-mode template: the two %s slots take the escaped
    # punctuation set and the attribute sub-pattern self.c.
    template = r'''
        (?P<pre>    [\s\[{(]|[%s]   )?
        "                          # start
        (?P<atts>   %s       )
        (?P<text>   [^"]+?   )
        \s?
        (?:   \(([^)]+?)\)(?=")   )?     # $title
        ":
        (?P<url>    (?:ftp|https?)? (?: :// )? [-A-Za-z0-9+&@#/?=~_()|!:,.;]*[-A-Za-z0-9+&@#/=~_()|]   )
        (?P<post>   [^\w\/;]*?   )
        (?=<|\s|$)
    '''
    link = re.compile(template % (re.escape(punct), self.c), re.X)
    return link.sub(self.fLink, text)
|
||||||
|
|
||||||
|
def fLink(self, match):
    """
    Render one link matched by ``links`` as an ``<a>`` element.

    The anchor text is run through the image (unless ``self.noimage``),
    span and glyph passes; the URL is resolved through checkRefs and
    relURL and HTML-encoded. The finished element is shelved, and the
    returned string is the text before the link, the shelf placeholder
    and the text after the link.
    """
    pre, atts, text, title, url, post = match.groups()
    if pre is None:
        pre = ''

    # assume ) at the end of the url is not actually part of the url
    # unless the url also contains a (
    if url.endswith(')') and '(' not in url:
        post = url[-1] + post
        url = url[:-1]

    url = self.checkRefs(url)

    atts = self.pba(atts)
    if title:
        atts += ' title="%s"' % self.encode_html(title)

    if not self.noimage:
        text = self.image(text)
    text = self.glyphs(self.span(text))

    url = self.relURL(url)
    anchor = '<a href="%s"%s%s>%s</a>' % (
        self.encode_html(url), atts, self.rel, text)
    return ''.join([pre, self.shelve(anchor), post])
|
||||||
|
|
||||||
|
def span(self, text):
    """
    Convert inline phrase modifiers (``*strong*``, ``_em_``, ``%span%``
    and so on) by passing every match to ``self.fSpan``.

    The modifiers are tried one after another, in the order listed in
    ``qtags``, each over the output of the previous one.

    >>> t = Textile()
    >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye")
    'hello <span class="bob">span <strong>strong</strong> and <b>bold</b></span> goodbye'
    """
    qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^')
    pnct = ".,\"'?!;:"

    template = r"""
        (?:^|(?<=[\s>%(pnct)s])|([\]}]))
        (%(qtag)s)(?!%(qtag)s)
        (%(c)s)
        (?::(\S+))?
        ([^\s%(qtag)s]+|\S[^%(qtag)s\n]*[^\s%(qtag)s\n])
        ([%(pnct)s]*)
        %(qtag)s
        (?:$|([\]}])|(?=%(selfpnct)s{1,2}|\s))
    """

    for qtag in qtags:
        fields = {'qtag': qtag, 'c': self.c, 'pnct': pnct,
                  'selfpnct': self.pnct}
        text = re.compile(template % fields, re.X).sub(self.fSpan, text)
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def fSpan(self, match):
    """
    Render one phrase modifier matched by ``span`` as an HTML element.

    ``match`` must carry seven groups: an ignored leading bracket, the
    modifier, the attribute string, an optional cite URL, the content,
    trailing punctuation and an ignored closing bracket. The content
    is run through ``self.span`` again so nested modifiers are
    converted too.

    Raises KeyError if the modifier is not one of the known tags.
    """
    _, tag, atts, cite, content, end, _ = match.groups()

    qtags = {
        '*': 'strong',
        '**': 'b',
        '??': 'cite',
        '_': 'em',
        '__': 'i',
        '-': 'del',
        '%': 'span',
        '+': 'ins',
        '~': 'sub',
        '^': 'sup',
    }
    tag = qtags[tag]
    atts = self.pba(atts)
    if cite:
        # The leading space keeps the attribute separate from the tag
        # name (or from whatever attributes pba produced).
        atts = atts + ' cite="%s"' % cite

    content = self.span(content)

    return "<%s%s>%s%s</%s>" % (tag, atts, content, end, tag)
|
||||||
|
|
||||||
|
def image(self, text):
    """
    Find Textile images of the form ``!src (title)!:href`` and replace
    each one with the result of ``self.fImage``.

    >>> t = Textile()
    >>> t.image('!/imgs/myphoto.jpg!:http://jsamsa.com')
    '<a href="http://jsamsa.com" class="img"><img src="/imgs/myphoto.jpg" alt="" /></a>'
    """
    template = r"""
        (?:[\[{])?          # pre
        \!                  # opening !
        (%s)                # optional style,class atts
        (?:\. )?            # optional dot-space
        ([^\s(!]+)          # presume this is the src
        \s?                 # optional space
        (?:\(([^\)]+)\))?   # optional title
        \!                  # closing
        (?::(\S+))?         # optional href
        (?:[\]}]|(?=\s|$))  # lookahead: space or end of string
    """
    finder = re.compile(template % self.c, re.U | re.X)
    return finder.sub(self.fImage, text)
|
||||||
|
|
||||||
|
def fImage(self, match):
    """
    Render one image matched by ``image`` as an ``<img>`` element,
    wrapped in ``<a class="img">`` when an href was given.

    ``match`` must carry four groups: attribute string, source URL,
    optional title and optional href, e.g.
    ('', '/imgs/myphoto.jpg', None, None).

    The title is HTML-encoded before it is placed in the ``title`` and
    ``alt`` attributes, as fLink does for link titles, so a quote in
    the title cannot terminate the attribute.
    """
    atts, url, title, href = match.groups()
    atts = self.pba(atts)

    if title:
        safe_title = self.encode_html(title)
        atts = atts + ' title="%s" alt="%s"' % (safe_title, safe_title)
    else:
        atts = atts + ' alt=""'

    # Sizes are only looked up for non-relative URLs, and only when
    # size detection is switched on.
    if not self.isRelURL(url) and self.get_sizes:
        size = getimagesize(url)
        if size:
            atts += " %s" % size

    if href:
        href = self.checkRefs(href)

    url = self.checkRefs(url)
    url = self.relURL(url)

    out = []
    if href:
        out.append('<a href="%s" class="img">' % href)
    if self.html_type == 'html':
        out.append('<img src="%s"%s>' % (url, atts))
    else:
        out.append('<img src="%s"%s />' % (url, atts))
    if href:
        out.append('</a>')

    return ''.join(out)
|
||||||
|
|
||||||
|
def code(self, text):
    """
    Process inline code: ``<code>...</code>`` and ``@...@`` spans are
    handled by ``self.fCode``, ``<pre>...</pre>`` spans by ``self.fPre``.
    """
    passes = (
        ('<code>', '</code>', self.fCode),
        ('@', '@', self.fCode),
        ('<pre>', '</pre>', self.fPre),
    )
    for opener, closer, handler in passes:
        text = self.doSpecial(text, opener, closer, handler)
    return text
|
||||||
|
|
||||||
|
def fCode(self, match):
    """
    Wrap the matched text in ``<code>`` tags and shelve the result.

    The text is HTML-encoded first unless ``self.restricted`` is set.
    Returns the text before the match, the shelf placeholder and the
    text after the match joined together.
    """
    before, text, after = match.groups()
    if after is None:
        after = ''
    # text needs to be escaped
    if not self.restricted:
        text = self.encode_html(text)
    shelved = self.shelve('<code>%s</code>' % text)
    return ''.join([before, shelved, after])
|
||||||
|
|
||||||
|
def fPre(self, match):
    """
    Shelve the matched text and wrap the placeholder in ``<pre>`` tags.

    The text is HTML-encoded first unless ``self.restricted`` is set.
    Unlike fCode, only the inner text is shelved; the ``<pre>`` tags
    themselves stay in the returned string.
    """
    before, text, after = match.groups()
    if after is None:
        after = ''
    # text needs to be escaped
    if not self.restricted:
        text = self.encode_html(text)
    pieces = [before, '<pre>', self.shelve(text), '</pre>', after]
    return ''.join(pieces)
|
||||||
|
|
||||||
|
def doSpecial(self, text, start, end, method=None):
    """
    Substitute every ``start ... end`` delimited span in ``text``.

    A span must begin at the start of a line, after whitespace or
    after one of ``[ ( { >``. Each match exposes three groups (the
    text before the span, its content, and an optional character
    after it) and is passed to ``method``; ``self.fSpecial`` is used
    when no method is given.
    """
    handler = self.fSpecial if method is None else method
    regex = r'(^|\s|[\[({>])%s(.*?)%s(\s|$|[\])}])?' % (
        re.escape(start), re.escape(end))
    return re.compile(regex, re.M | re.S).sub(handler, text)
|
||||||
|
|
||||||
|
def fSpecial(self, match):
    """
    Default handler for special blocks like notextile or code: the
    matched text is HTML-encoded and shelved, and the placeholder is
    returned between the text before and after the match.
    """
    before, text, after = match.groups()
    if after is None:
        after = ''
    placeholder = self.shelve(self.encode_html(text))
    return ''.join([before, placeholder, after])
|
||||||
|
|
||||||
|
def noTextile(self, text):
    """
    Protect ``<notextile>...</notextile>`` and ``==...==`` spans from
    further processing by handing each one to ``self.fTextile``.
    """
    for opener, closer in (('<notextile>', '</notextile>'), ('==', '==')):
        text = self.doSpecial(text, opener, closer, self.fTextile)
    return text
|
||||||
|
|
||||||
|
def fTextile(self, match):
    """
    Shelve the matched text exactly as written (no HTML encoding) and
    return the placeholder between the text before and after the match.
    """
    before, notextile, after = match.groups()
    if after is None:
        after = ''
    placeholder = self.shelve(notextile)
    return ''.join([before, placeholder, after])
|
||||||
|
|
||||||
|
|
||||||
|
def textile(text, head_offset=0, html_type='xhtml', encoding=None, output=None):
    """
    Convert Textile markup with a default-constructed Textile instance.

    this function takes additional parameters:
    head_offset - offset to apply to heading levels (default: 0)
    html_type - 'xhtml' or 'html' style tags (default: 'xhtml')

    encoding and output are accepted but are not used by this function.
    """
    converter = Textile()
    return converter.textile(text, head_offset=head_offset,
                             html_type=html_type)
|
||||||
|
|
||||||
|
def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'):
    """
    Restricted version of Textile designed for weblog comments and other
    untrusted input.

    Raw HTML is escaped.
    Style attributes are disabled.
    rel='nofollow' is added to external links.

    When lite=True is set (the default):
    Block tags are restricted to p, bq, and bc.
    Lists and tables are disabled.

    When noimage=True is set (the default):
    Image tags are disabled.

    """
    converter = Textile(restricted=True, lite=lite, noimage=noimage)
    return converter.textile(text, rel='nofollow', html_type=html_type)
|
||||||
|
|
@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
|
|||||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||||
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||||
convert_heuristic, normalize_line_endings
|
convert_heuristic, normalize_line_endings, convert_textile
|
||||||
from calibre import _ent_pat, xml_entity_to_unicode
|
from calibre import _ent_pat, xml_entity_to_unicode
|
||||||
|
|
||||||
class TXTInput(InputFormatPlugin):
|
class TXTInput(InputFormatPlugin):
|
||||||
@ -41,6 +41,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
'paragraph and no styling is applied.\n'
|
'paragraph and no styling is applied.\n'
|
||||||
'* heuristic: Process using heuristics to determine formatting such '
|
'* heuristic: Process using heuristics to determine formatting such '
|
||||||
'as chapter headings and italic text.\n'
|
'as chapter headings and italic text.\n'
|
||||||
|
'* textile: Processing using textile formatting.\n'
|
||||||
'* markdown: Processing using markdown formatting. '
|
'* markdown: Processing using markdown formatting. '
|
||||||
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||||
@ -91,6 +92,9 @@ class TXTInput(InputFormatPlugin):
|
|||||||
except RuntimeError:
|
except RuntimeError:
|
||||||
raise ValueError('This txt file has malformed markup, it cannot be'
|
raise ValueError('This txt file has malformed markup, it cannot be'
|
||||||
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
|
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
|
||||||
|
elif options.formatting_type == 'textile':
|
||||||
|
log.debug('Running text though textile conversion...')
|
||||||
|
html = convert_textile(txt)
|
||||||
else:
|
else:
|
||||||
# Determine the paragraph type of the document.
|
# Determine the paragraph type of the document.
|
||||||
if options.paragraph_type == 'auto':
|
if options.paragraph_type == 'auto':
|
||||||
|
@ -7,7 +7,6 @@ Read content from txt file.
|
|||||||
import os, re
|
import os, re
|
||||||
|
|
||||||
from calibre import prepare_string_for_xml, isbytestring
|
from calibre import prepare_string_for_xml, isbytestring
|
||||||
from calibre.ebooks.markdown import markdown
|
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
|
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
|
||||||
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
||||||
@ -37,7 +36,7 @@ def clean_txt(txt):
|
|||||||
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
|
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
|
||||||
illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
|
illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
|
||||||
txt = illegal_chars.sub('', txt)
|
txt = illegal_chars.sub('', txt)
|
||||||
|
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
def split_txt(txt, epub_split_size_kb=0):
|
def split_txt(txt, epub_split_size_kb=0):
|
||||||
@ -74,12 +73,18 @@ def convert_heuristic(txt, title='', epub_split_size_kb=0):
|
|||||||
return tp.convert(txt, title, epub_split_size_kb)
|
return tp.convert(txt, title, epub_split_size_kb)
|
||||||
|
|
||||||
def convert_markdown(txt, title='', disable_toc=False):
    '''
    Convert markdown formatted text to HTML and wrap the result in
    HTML_TEMPLATE together with the given title.

    The footnotes, tables and toc extensions are enabled; disable_toc
    is passed to the toc extension's configuration.
    '''
    # Imported here so the markdown package is only loaded when needed.
    from calibre.ebooks.markdown import markdown
    toc_config = {"toc": {"disable_toc": disable_toc}}
    md = markdown.Markdown(
        extensions=['footnotes', 'tables', 'toc'],
        extension_configs=toc_config,
        safe_mode=False)
    body = md.convert(txt)
    return HTML_TEMPLATE % (title, body)
|
||||||
|
|
||||||
|
def convert_textile(txt, title=''):
    '''
    Convert textile formatted text to HTML and wrap the result in
    HTML_TEMPLATE together with the given title.
    '''
    # Imported here so the textile module is only loaded when needed.
    from calibre.ebooks.textile import textile
    body = textile(txt, encoding='utf-8')
    return HTML_TEMPLATE % (title, body)
|
||||||
|
|
||||||
def normalize_line_endings(txt):
|
def normalize_line_endings(txt):
|
||||||
txt = txt.replace('\r\n', '\n')
|
txt = txt.replace('\r\n', '\n')
|
||||||
txt = txt.replace('\r', '\n')
|
txt = txt.replace('\r', '\n')
|
||||||
@ -115,66 +120,75 @@ def split_string_separator(txt, size) :
|
|||||||
def detect_paragraph_type(txt):
    '''
    Tries to determine the formatting of the document.

    block: Paragraphs are separated by a blank line.
    single: Each line is a paragraph.
    print: Each paragraph starts with a 2+ spaces or a tab
           and ends when a new paragraph is reached.
    unformatted: most lines have hard line breaks, few/no blank lines or indents

    returns block, single, print, unformatted
    '''
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    txt_line_count = len(re.findall(r'(?mu)^\s*.+$', txt))

    # Check for hard line breaks - true if 55% of the doc breaks in the same region
    docanalysis = DocAnalysis('txt', txt)
    if not docanalysis.line_histogram(.55):
        # Without hard line breaks every line is taken to be a paragraph.
        return 'single'

    total_lines = float(txt_line_count)

    # Share of lines that start with a tab or 2+ whitespace characters
    tab_line_count = len(re.findall(r'(?mu)^(\t|\s{2,}).+$', txt))
    print_percent = tab_line_count / total_lines

    # Share of empty lines, relative to the number of non-empty lines
    empty_line_count = len(re.findall(r'(?mu)^\s*$', txt))
    block_percent = empty_line_count / total_lines

    # Compare the two types - the type with the larger number of instances wins
    # in cases where only one or the other represents the vast majority of the document neither wins
    if print_percent >= block_percent:
        if .15 <= print_percent <= .75:
            return 'print'
    elif .15 <= block_percent <= .75:
        return 'block'

    # Assume unformatted text with hardbreaks if nothing else matches
    return 'unformatted'
|
||||||
|
|
||||||
|
|
||||||
def detect_formatting_type(txt):
    '''
    Guess which markup the text uses by counting markup constructs.

    Markdown and textile constructs are counted separately. When either
    count exceeds 5, the markup with the higher count is returned, and
    textile wins a tie. Otherwise 'heuristic' is returned.

    returns markdown, textile, heuristic
    '''
    markdown_patterns = (
        r'(?mu)^#+',                                   # Headings
        r'(?mu)^=+$',
        r'(?mu)^-+$',
        r'(?u)!\[.*?\]\(.+?\)',                        # Images
        r'(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)',      # Links
    )
    textile_patterns = (
        r'(?mu)^h[1-6]\.',                             # Headings
        r'(?mu)^bq\.',                                 # Block quote.
        r'\![^\s]+(:[^\s]+)*',                         # Images
        r'"(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+',       # Links
    )

    markdown_count = sum(len(re.findall(p, txt)) for p in markdown_patterns)
    textile_count = sum(len(re.findall(p, txt)) for p in textile_patterns)

    if markdown_count <= 5 and textile_count <= 5:
        return 'heuristic'
    if markdown_count > textile_count:
        return 'markdown'
    return 'textile'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user