Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

Make the comments to HTML transform faster and more robust

Parent: 1483703aa0
Commit: 9235d98428
@@ -8,9 +8,14 @@ __docformat__ = 'restructuredtext en'
 import re
 
 from calibre.constants import preferred_encoding
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
+        CData, Comment, Declaration, ProcessingInstruction
 from calibre import prepare_string_for_xml
 
+# Hackish - ignoring sentences ending or beginning in numbers to avoid
+# confusion with decimal points.
+lost_cr_pat = re.compile('([a-z])([\.\?!])([A-Z])')
+
 def comments_to_html(comments):
     '''
     Convert random comment text to normalized, xml-legal block of <p>s
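The hunk above hoists the lost-CR pattern to module level as lost_cr_pat, so the regular expression is compiled once at import time instead of on every call to comments_to_html. Below is a minimal standalone sketch (not part of the commit) of what that pattern matches; the sample string is made up for illustration, and the pattern is written as a raw string equivalent to the literal used in the diff:

import re

# Lowercase letter, sentence-ending punctuation, then an uppercase letter
# with no space in between, i.e. a paragraph break that got lost upstream.
lost_cr_pat = re.compile(r'([a-z])([\.\?!])([A-Z])')

text = 'end of one sentence.Start of the next'  # hypothetical input
for m in lost_cr_pat.finditer(text):
    # Re-insert the lost break, the same way the function does.
    text = text.replace(m.group(),
                        '%s%s\n\n%s' % (m.group(1), m.group(2), m.group(3)))

print(repr(text))  # 'end of one sentence.\n\nStart of the next'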
@@ -41,36 +46,25 @@ def comments_to_html(comments):
 
     if '<' not in comments:
         comments = prepare_string_for_xml(comments)
-        comments = comments.replace(u'\n', u'<br />')
-        return u'<p>%s</p>'%comments
-    # Hackish - ignoring sentences ending or beginning in numbers to avoid
-    # confusion with decimal points.
+        parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
+                for x in comments.split('\n\n')]
+        return '\n'.join(parts)
 
     # Explode lost CRs to \n\n
-    for lost_cr in re.finditer('([a-z])([\.\?!])([A-Z])', comments):
+    for lost_cr in lost_cr_pat.finditer(comments):
         comments = comments.replace(lost_cr.group(),
                                     '%s%s\n\n%s' % (lost_cr.group(1),
                                                     lost_cr.group(2),
                                                     lost_cr.group(3)))
 
+    comments = comments.replace(u'\r', u'')
     # Convert \n\n to <p>s
-    if re.search('\n\n', comments):
-        soup = BeautifulSoup()
-        split_ps = comments.split(u'\n\n')
-        tsc = 0
-        for p in split_ps:
-            pTag = Tag(soup,'p')
-            pTag.insert(0,p)
-            soup.insert(tsc,pTag)
-            tsc += 1
-        comments = soup.renderContents(None)
-
+    comments = comments.replace(u'\n\n', u'<p>')
     # Convert solo returns to <br />
-    comments = re.sub('[\r\n]','<br />', comments)
-
+    comments = comments.replace(u'\n', '<br />')
     # Convert two hyphens to emdash
-    comments = re.sub('--', '—', comments)
+    comments = comments.replace('--', '—')
 
     soup = BeautifulSoup(comments)
     result = BeautifulSoup()
     rtc = 0
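For comments that contain no markup at all, the rewritten fast path in the hunk above escapes the text, splits it on blank lines and wraps each chunk in <p class="description">, matching the first case of the test() added in the next hunk. A self-contained sketch of just that branch; the helper and function names below are stand-ins for illustration, not calibre's actual implementation:

from xml.sax.saxutils import escape

def prepare_string_for_xml(raw):
    # Stand-in for calibre's helper: make the text XML-safe by escaping &, < and >.
    return escape(raw)

def plain_comments_to_html(comments):
    # Mirrors the new "no '<' in comments" branch: escape, split on blank
    # lines, wrap each paragraph, then join the paragraphs with newlines.
    comments = prepare_string_for_xml(comments)
    parts = ['<p class="description">%s</p>' % x.replace('\n', '<br />')
             for x in comments.split('\n\n')]
    return '\n'.join(parts)

print(plain_comments_to_html('lineone\n\nlinetwo'))
# <p class="description">lineone</p>
# <p class="description">linetwo</p>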
@@ -85,35 +79,52 @@ def comments_to_html(comments):
                 ptc = 0
             pTag.insert(ptc,prepare_string_for_xml(token))
             ptc += 1
-        elif token.name in ['br','b','i','em']:
+        elif type(token) in (CData, Comment, Declaration,
+                ProcessingInstruction):
+            continue
+        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a',
+                'hr']:
             if not open_pTag:
                 pTag = Tag(result,'p')
                 open_pTag = True
                 ptc = 0
             pTag.insert(ptc, token)
             ptc += 1
 
         else:
             if open_pTag:
                 result.insert(rtc, pTag)
                 rtc += 1
                 open_pTag = False
                 ptc = 0
-            # Clean up NavigableStrings for xml
-            sub_tokens = list(token.contents)
-            for sub_token in sub_tokens:
-                if type(sub_token) is NavigableString:
-                    sub_token.replaceWith(prepare_string_for_xml(sub_token))
             result.insert(rtc, token)
             rtc += 1
 
     if open_pTag:
         result.insert(rtc, pTag)
 
-    paras = result.findAll('p')
-    for p in paras:
+    for p in result.findAll('p'):
         p['class'] = 'description'
 
+    for t in result.findAll(text=True):
+        t.replaceWith(prepare_string_for_xml(unicode(t)))
 
     return result.renderContents(encoding=None)
 
+def test():
+    for pat, val in [
+            ('lineone\n\nlinetwo',
+                '<p class="description">lineone</p>\n<p class="description">linetwo</p>'),
+            ('a <b>b&c</b>\nf', '<p class="description">a <b>b&c;</b><br />f</p>'),
+            ('a <?xml asd> b\n\ncd', '<p class="description">a b</p><p class="description">cd</p>'),
+            ]:
+        print
+        print 'Testing: %r'%pat
+        cval = comments_to_html(pat)
+        print 'Value: %r'%cval
+        if comments_to_html(pat) != val:
+            print 'FAILED'
+            break
+
+if __name__ == '__main__':
+    test()
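With a calibre source checkout on the import path, the new behaviour can be exercised directly; the test() function added above covers the same cases. The module path in the sketch below is an assumption about where comments_to_html lives in this tree, not something stated in the diff:

from calibre.library.comments import comments_to_html  # assumed location

print(comments_to_html('lineone\n\nlinetwo'))
# Expected, per the test() added in this commit:
# <p class="description">lineone</p>
# <p class="description">linetwo</p>

print(comments_to_html('a <?xml asd> b\n\ncd'))
# Processing instructions are now skipped rather than copied through:
# <p class="description">a b</p><p class="description">cd</p>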