Normalize white space in titles

This commit is contained in:
Kovid Goyal 2016-06-18 17:40:30 +05:30
parent d6754fe4d8
commit 7f8ff21a91
2 changed files with 16 additions and 1 deletion

View File

@ -5,6 +5,7 @@
from __future__ import (unicode_literals, division, absolute_import,
print_function)
from collections import defaultdict
from functools import wraps
import re
from lxml import etree
@ -70,6 +71,17 @@ def ensure_id(root, elem):
eid = ensure_unique('id', frozenset(XPath('//*/@id')(root)))
elem.set('id', eid)
return eid
def normalize_whitespace(text):
    '''
    Collapse every run of whitespace in ``text`` into a single space and
    remove leading/trailing whitespace.

    Falsy values (``None``, the empty string) are returned unchanged so the
    function can be applied directly to optional text.
    '''
    if text:
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return text
def simple_text(f):
    '''
    Decorator that passes the return value of ``f`` through
    normalize_whitespace() before handing it back to the caller.

    All positional and keyword arguments are forwarded to ``f`` untouched.
    '''
    @wraps(f)
    def wrapper(*args, **kw):
        raw = f(*args, **kw)
        return normalize_whitespace(raw)
    return wrapper
# }}}
# Prefixes {{{
@ -229,10 +241,12 @@ def find_main_title(root, refines, remove_blanks=False):
main_title = first_title
return main_title
@simple_text
def read_title(root, prefixes, refines):
    '''
    Return the stripped text of the main title element located by
    find_main_title(), or None when no main title is found.

    ``prefixes`` is accepted but not used here. The simple_text decorator
    post-processes the returned value.
    '''
    main_title = find_main_title(root, refines)
    if main_title is None:
        return None
    return main_title.text.strip()
@simple_text
def read_title_sort(root, prefixes, refines):
main_title = find_main_title(root, refines)
if main_title is not None:

View File

@ -83,8 +83,9 @@ class TestOPF3(unittest.TestCase):
self.ae(rt(root), 'xxx')
self.ae(st(root, 'abc', 'cba'), 'abc')
self.ae(read_title_sort(root, reserved_prefixes, read_refines(root)), 'cba')
root = self.get_opf('''<dc:title>yyy</dc:title><dc:title id='t'>xxx
root = self.get_opf('''<dc:title>yyy</dc:title><dc:title id='t'>x xx
</dc:title><meta refines='#t' property='title-type'>main</meta><meta name="calibre:title_sort" content="sorted"/>''')
self.ae(rt(root), 'x xx')
self.ae(read_title_sort(root, reserved_prefixes, read_refines(root)), 'sorted')
self.ae(st(root, 'abc'), 'abc')
# }}}