mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Rewrite numeric title sorting, pass <br/> in comments, force empty author to 'Unknown'
This commit is contained in:
parent
1d28cb7ceb
commit
9bc0a9be43
@ -306,17 +306,19 @@ class EPUB_MOBI(CatalogPlugin):
|
|||||||
456 => four hundred fifty-six
|
456 => four hundred fifty-six
|
||||||
4:56 => four fifty-six
|
4:56 => four fifty-six
|
||||||
'''
|
'''
|
||||||
|
ORDINALS = ['zeroth','first','second','third','fourth','fifth','sixth','seventh','eighth','ninth']
|
||||||
lessThanTwenty = ["<zero>","one","two","three","four","five","six","seven","eight","nine",
|
lessThanTwenty = ["<zero>","one","two","three","four","five","six","seven","eight","nine",
|
||||||
"ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen",
|
"ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen",
|
||||||
"eighteen","nineteen"]
|
"eighteen","nineteen"]
|
||||||
tens = ["<zero>","<tens>","twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety"]
|
tens = ["<zero>","<tens>","twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety"]
|
||||||
hundreds = ["<zero>","one","two","three","four","five","six","seven","eight","nine"]
|
hundreds = ["<zero>","one","two","three","four","five","six","seven","eight","nine"]
|
||||||
|
|
||||||
def __init__(self, number):
|
def __init__(self, number, verbose=False):
|
||||||
self.number = number
|
self.number = number
|
||||||
self.number_as_float = 0.0
|
self.number_as_float = 0.0
|
||||||
self.text = ''
|
self.text = ''
|
||||||
|
self.verbose = verbose
|
||||||
|
self.log = Log()
|
||||||
self.numberTranslate()
|
self.numberTranslate()
|
||||||
|
|
||||||
def stringFromInt(self, intToTranslate):
|
def stringFromInt(self, intToTranslate):
|
||||||
@ -324,7 +326,6 @@ class EPUB_MOBI(CatalogPlugin):
|
|||||||
# intToTranslate is a three-digit number
|
# intToTranslate is a three-digit number
|
||||||
|
|
||||||
tensComponentString = ""
|
tensComponentString = ""
|
||||||
|
|
||||||
hundredsComponent = intToTranslate - (intToTranslate % 100)
|
hundredsComponent = intToTranslate - (intToTranslate % 100)
|
||||||
tensComponent = intToTranslate % 100
|
tensComponent = intToTranslate % 100
|
||||||
|
|
||||||
@ -336,8 +337,7 @@ class EPUB_MOBI(CatalogPlugin):
|
|||||||
|
|
||||||
# Build the tens component
|
# Build the tens component
|
||||||
if tensComponent < 20:
|
if tensComponent < 20:
|
||||||
if tensComponent > 0:
|
tensComponentString = self.lessThanTwenty[tensComponent]
|
||||||
tensComponentString = self.lessThanTwenty[tensComponent]
|
|
||||||
else:
|
else:
|
||||||
tensPart = ""
|
tensPart = ""
|
||||||
onesPart = ""
|
onesPart = ""
|
||||||
@ -369,9 +369,27 @@ class EPUB_MOBI(CatalogPlugin):
|
|||||||
hundredsString = ""
|
hundredsString = ""
|
||||||
thousandsString = ""
|
thousandsString = ""
|
||||||
resultString = ""
|
resultString = ""
|
||||||
|
self.suffix = ''
|
||||||
|
|
||||||
|
if self.verbose: self.log("numberTranslate(): %s" % self.number)
|
||||||
|
|
||||||
|
# Special case ordinals
|
||||||
|
if re.search('[st|nd|rd|th]',self.number):
|
||||||
|
self.number = re.sub(',','',self.number)
|
||||||
|
ordinal_suffix = re.search('[\D]', self.number)
|
||||||
|
ordinal_number = re.sub('\D','',re.sub(',','',self.number))
|
||||||
|
if self.verbose: self.log("Ordinal: %s" % ordinal_number)
|
||||||
|
self.number_as_float = ordinal_number
|
||||||
|
self.suffix = self.number[ordinal_suffix.start():]
|
||||||
|
if int(ordinal_number) > 9:
|
||||||
|
# Some typos (e.g., 'twentyth'), acceptable
|
||||||
|
self.text = '%s' % (EPUB_MOBI.NumberToText(ordinal_number).text)
|
||||||
|
else:
|
||||||
|
self.text = '%s' % (self.ORDINALS[int(ordinal_number)])
|
||||||
|
|
||||||
# Test for time
|
# Test for time
|
||||||
if re.search(':',self.number):
|
elif re.search(':',self.number):
|
||||||
|
if self.verbose: self.log("Time: %s" % self.number)
|
||||||
self.number_as_float = re.sub(':','.',self.number)
|
self.number_as_float = re.sub(':','.',self.number)
|
||||||
time_strings = self.number.split(":")
|
time_strings = self.number.split(":")
|
||||||
hours = EPUB_MOBI.NumberToText(time_strings[0]).text
|
hours = EPUB_MOBI.NumberToText(time_strings[0]).text
|
||||||
@ -380,11 +398,13 @@ class EPUB_MOBI(CatalogPlugin):
|
|||||||
|
|
||||||
# Test for %
|
# Test for %
|
||||||
elif re.search('%', self.number):
|
elif re.search('%', self.number):
|
||||||
|
if self.verbose: self.log("Percent: %s" % self.number)
|
||||||
self.number_as_float = self.number.split('%')[0]
|
self.number_as_float = self.number.split('%')[0]
|
||||||
self.text = EPUB_MOBI.NumberToText(self.number.replace('%',' percent')).text
|
self.text = EPUB_MOBI.NumberToText(self.number.replace('%',' percent')).text
|
||||||
|
|
||||||
# Test for decimal
|
# Test for decimal
|
||||||
elif re.search('\.',self.number):
|
elif re.search('\.',self.number):
|
||||||
|
if self.verbose: self.log("Decimal: %s" % self.number)
|
||||||
self.number_as_float = self.number
|
self.number_as_float = self.number
|
||||||
decimal_strings = self.number.split(".")
|
decimal_strings = self.number.split(".")
|
||||||
left = EPUB_MOBI.NumberToText(decimal_strings[0]).text
|
left = EPUB_MOBI.NumberToText(decimal_strings[0]).text
|
||||||
@ -393,6 +413,7 @@ class EPUB_MOBI(CatalogPlugin):
|
|||||||
|
|
||||||
# Test for hyphenated
|
# Test for hyphenated
|
||||||
elif re.search('-', self.number):
|
elif re.search('-', self.number):
|
||||||
|
if self.verbose: self.log("Hyphenated: %s" % self.number)
|
||||||
self.number_as_float = self.number.split('-')[0]
|
self.number_as_float = self.number.split('-')[0]
|
||||||
strings = self.number.split('-')
|
strings = self.number.split('-')
|
||||||
if re.search('[0-9]+', strings[0]):
|
if re.search('[0-9]+', strings[0]):
|
||||||
@ -403,49 +424,54 @@ class EPUB_MOBI(CatalogPlugin):
|
|||||||
right = EPUB_MOBI.NumberToText(strings[1]).text
|
right = EPUB_MOBI.NumberToText(strings[1]).text
|
||||||
self.text = '%s-%s' % (left, right)
|
self.text = '%s-%s' % (left, right)
|
||||||
|
|
||||||
# Test for $xx,xxx
|
# Test for only commas and numbers
|
||||||
elif re.search('[$,]', self.number):
|
elif re.search(',', self.number) and not re.search('[^0-9,]',self.number):
|
||||||
self.number_as_float = re.sub('[$,]','',self.number)
|
if self.verbose: self.log("Comma(s): %s" % self.number)
|
||||||
self.text = EPUB_MOBI.NumberToText(self.number_as_float).text
|
|
||||||
|
|
||||||
# Test for comma
|
|
||||||
elif re.search(',', self.number):
|
|
||||||
self.number_as_float = re.sub(',','',self.number)
|
self.number_as_float = re.sub(',','',self.number)
|
||||||
self.text = EPUB_MOBI.NumberToText(self.number_as_float).text
|
self.text = EPUB_MOBI.NumberToText(self.number_as_float).text
|
||||||
|
|
||||||
# Test for hybrid e.g., 'K2'
|
# Test for hybrid e.g., 'K2', '2nd', '10@10'
|
||||||
elif re.search('[\D]+', self.number):
|
elif re.search('[\D]+', self.number):
|
||||||
result = []
|
if self.verbose: self.log("Hybrid: %s" % self.number)
|
||||||
for char in self.number:
|
# Split the token into number/text
|
||||||
if re.search('[\d]+', char):
|
number_position = re.search('\d',self.number).start()
|
||||||
result.append(EPUB_MOBI.NumberToText(char).text)
|
text_position = re.search('\D',self.number).start()
|
||||||
else:
|
if number_position < text_position:
|
||||||
result.append(char)
|
number = self.number[:text_position]
|
||||||
self.text = ''.join(result)
|
text = self.number[text_position:]
|
||||||
|
self.text = '%s%s' % (EPUB_MOBI.NumberToText(number).text,text)
|
||||||
|
else:
|
||||||
|
text = self.number[:number_position]
|
||||||
|
number = self.number[number_position:]
|
||||||
|
self.text = '%s%s' % (text, EPUB_MOBI.NumberToText(number).text)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
if self.verbose: self.log("Clean: %s" % self.number)
|
||||||
try:
|
try:
|
||||||
self.float_as_number = float(self.number)
|
self.float_as_number = float(self.number)
|
||||||
number = int(self.number)
|
number = int(self.number)
|
||||||
except:
|
except:
|
||||||
return
|
return
|
||||||
|
|
||||||
if number > 1000000:
|
if number > 10**9:
|
||||||
self.text = "%d out of range" % number
|
self.text = "%d out of range" % number
|
||||||
return
|
return
|
||||||
|
|
||||||
if number == 1000000:
|
if number == 10**9:
|
||||||
self.text = "one million"
|
self.text = "one billion"
|
||||||
else :
|
else :
|
||||||
# Strip out the three-digit number groups
|
# Isolate the three-digit number groups
|
||||||
thousandsNumber = number/1000
|
millionsNumber = number/10**6
|
||||||
hundredsNumber = number - (thousandsNumber * 1000)
|
thousandsNumber = (number - (millionsNumber * 10**6))/10**3
|
||||||
|
hundredsNumber = number - (millionsNumber * 10**6) - (thousandsNumber * 10**3)
|
||||||
|
if self.verbose:
|
||||||
|
print "Converting %s %s %s" % (millionsNumber, thousandsNumber, hundredsNumber)
|
||||||
|
|
||||||
# Convert the lower 3 numbers - hundredsNumber
|
# Convert hundredsNumber
|
||||||
if hundredsNumber :
|
if hundredsNumber :
|
||||||
hundredsString = self.stringFromInt(hundredsNumber)
|
hundredsString = self.stringFromInt(hundredsNumber)
|
||||||
|
|
||||||
# Convert the upper 3 numbers - thousandsNumber
|
# Convert thousandsNumber
|
||||||
if thousandsNumber:
|
if thousandsNumber:
|
||||||
if number > 1099 and number < 2000:
|
if number > 1099 and number < 2000:
|
||||||
resultString = '%s %s' % (self.lessThanTwenty[number/100],
|
resultString = '%s %s' % (self.lessThanTwenty[number/100],
|
||||||
@ -455,19 +481,26 @@ class EPUB_MOBI(CatalogPlugin):
|
|||||||
else:
|
else:
|
||||||
thousandsString = self.stringFromInt(thousandsNumber)
|
thousandsString = self.stringFromInt(thousandsNumber)
|
||||||
|
|
||||||
|
# Convert millionsNumber
|
||||||
|
if millionsNumber:
|
||||||
|
millionsString = self.stringFromInt(millionsNumber)
|
||||||
|
|
||||||
# Concatenate the strings
|
# Concatenate the strings
|
||||||
if thousandsNumber and not hundredsNumber:
|
resultString = ''
|
||||||
resultString = "%s thousand" % thousandsString
|
if millionsNumber:
|
||||||
|
resultString += "%s million " % millionsString
|
||||||
|
|
||||||
if thousandsNumber and hundredsNumber:
|
if thousandsNumber:
|
||||||
resultString = "%s thousand %s" % (thousandsString, hundredsString)
|
resultString += "%s thousand " % thousandsString
|
||||||
|
|
||||||
if not thousandsNumber and hundredsNumber:
|
if hundredsNumber:
|
||||||
resultString = "%s" % hundredsString
|
resultString += "%s" % hundredsString
|
||||||
|
|
||||||
if not thousandsNumber and not hundredsNumber:
|
if not millionsNumber and not thousandsNumber and not hundredsNumber:
|
||||||
resultString = "zero"
|
resultString = "zero"
|
||||||
|
|
||||||
|
if self.verbose:
|
||||||
|
self.log(u'resultString: %s' % resultString)
|
||||||
self.text = resultString.strip().capitalize()
|
self.text = resultString.strip().capitalize()
|
||||||
|
|
||||||
class CatalogBuilder(object):
|
class CatalogBuilder(object):
|
||||||
@ -860,7 +893,10 @@ class EPUB_MOBI(CatalogPlugin):
|
|||||||
|
|
||||||
title = this_title['title'] = self.convertHTMLEntities(record['title'])
|
title = this_title['title'] = self.convertHTMLEntities(record['title'])
|
||||||
this_title['title_sort'] = self.generateSortTitle(title)
|
this_title['title_sort'] = self.generateSortTitle(title)
|
||||||
this_title['author'] = " & ".join(record['authors'])
|
if 'authors' in record and len(record['authors']):
|
||||||
|
this_title['author'] = " & ".join(record['authors'])
|
||||||
|
else:
|
||||||
|
this_title['author'] = 'Unknown'
|
||||||
this_title['author_sort'] = record['author_sort'] if len(record['author_sort']) \
|
this_title['author_sort'] = record['author_sort'] if len(record['author_sort']) \
|
||||||
else self.author_to_author_sort(this_title['author'])
|
else self.author_to_author_sort(this_title['author'])
|
||||||
this_title['id'] = record['id']
|
this_title['id'] = record['id']
|
||||||
@ -872,10 +908,14 @@ class EPUB_MOBI(CatalogPlugin):
|
|||||||
this_title['timestamp'] = record['timestamp']
|
this_title['timestamp'] = record['timestamp']
|
||||||
if record['comments']:
|
if record['comments']:
|
||||||
#this_title['description'] = re.sub('&', '&amp;', record['comments'])
|
#this_title['description'] = re.sub('&', '&amp;', record['comments'])
|
||||||
if re.search('<(?P<tag>.+)>.+</(?P=tag)>|<!--.+-->|<.+/>',record['comments']):
|
has_xml = re.search('<(?P<tag>.+)>.+</(?P=tag)>|<!--.+-->|<.+/>',record['comments'])
|
||||||
self.opts.log(" %d: %s (%s) contains suspect metadata" % \
|
if has_xml and not re.search('<br', record['comments']):
|
||||||
|
self.opts.log.warning(" %d: %s (%s) contains suspect markup" % \
|
||||||
(this_title['id'], this_title['title'],this_title['author']))
|
(this_title['id'], this_title['title'],this_title['author']))
|
||||||
this_title['description'] = prepare_string_for_xml(record['comments'])
|
this_title['description'] = prepare_string_for_xml(record['comments'])
|
||||||
|
else:
|
||||||
|
# If <br/> present, take a chance that the markup is valid
|
||||||
|
this_title['description'] = record['comments']
|
||||||
this_title['short_description'] = self.generateShortDescription(this_title['description'])
|
this_title['short_description'] = self.generateShortDescription(this_title['description'])
|
||||||
else:
|
else:
|
||||||
this_title['description'] = None
|
this_title['description'] = None
|
||||||
@ -903,8 +943,10 @@ class EPUB_MOBI(CatalogPlugin):
|
|||||||
key=lambda x:(x['title_sort'].upper(), x['title_sort'].upper()))
|
key=lambda x:(x['title_sort'].upper(), x['title_sort'].upper()))
|
||||||
if False and self.verbose:
|
if False and self.verbose:
|
||||||
self.opts.log.info("fetchBooksByTitle(): %d books" % len(self.booksByTitle))
|
self.opts.log.info("fetchBooksByTitle(): %d books" % len(self.booksByTitle))
|
||||||
|
self.opts.log.info(" %-40s %-40s" % ('title', 'title_sort'))
|
||||||
for title in self.booksByTitle:
|
for title in self.booksByTitle:
|
||||||
self.opts.log.info((u" %-50s %-25s" % (title['title'][0:45], title['title_sort'][0:20])).encode('utf-8'))
|
self.opts.log.info((u" %-40s %-40s" % (title['title'][0:40],
|
||||||
|
title['title_sort'][0:40])).encode('utf-8'))
|
||||||
|
|
||||||
def fetchBooksByAuthor(self):
|
def fetchBooksByAuthor(self):
|
||||||
# Generate a list of titles sorted by author from the database
|
# Generate a list of titles sorted by author from the database
|
||||||
@ -2643,27 +2685,26 @@ class EPUB_MOBI(CatalogPlugin):
|
|||||||
# Leading numbers optionally translated to text equivalent
|
# Leading numbers optionally translated to text equivalent
|
||||||
# Capitalize leading sort word
|
# Capitalize leading sort word
|
||||||
if i==0:
|
if i==0:
|
||||||
if self.opts.numbers_as_text and re.search('[0-9]+',word):
|
if self.opts.numbers_as_text and re.match('[0-9]+',word[0]):
|
||||||
translated.append(EPUB_MOBI.NumberToText(word).text.capitalize())
|
translated.append(EPUB_MOBI.NumberToText(word).text.capitalize())
|
||||||
else:
|
else:
|
||||||
if re.search('-',word):
|
if re.match('[0-9]+',word[0]):
|
||||||
# Split hyphenated words for sorting
|
word = word.replace(',','')
|
||||||
tokens = word.split('-')
|
suffix = re.search('[\D]', word)
|
||||||
title_words[0] = tokens[0]
|
if suffix:
|
||||||
title_words.insert(1,tokens[1])
|
word = '%10.0f%s' % (float(word[:suffix.start()]),word[suffix.start():])
|
||||||
if re.search('[0-9]+',word):
|
else:
|
||||||
# Coerce standard-width strings for numbers for value sorting
|
word = '%10.0f' % (float(word))
|
||||||
# Any non-digit is interpreted as a decimal point
|
|
||||||
# word = '%10.2f' % float(re.sub('[^\d\.]','',word))
|
|
||||||
try:
|
|
||||||
word = '%10.2f' % float(re.sub('[^\d\.]','.',word))
|
|
||||||
except:
|
|
||||||
word = '%10.2f' % float(EPUB_MOBI.NumberToText(word).number_as_float)
|
|
||||||
translated.append(word.capitalize())
|
translated.append(word.capitalize())
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if re.search('[0-9]+',word):
|
if re.search('[0-9]+',word[0]):
|
||||||
# Coerce standard-width strings for numbers
|
word = word.replace(',','')
|
||||||
word = '%10.2f' % float(re.sub('[^\d\.]','',word))
|
suffix = re.search('[\D]', word)
|
||||||
|
if suffix:
|
||||||
|
word = '%10.0f%s' % (float(word[:suffix.start()]),word[suffix.start():])
|
||||||
|
else:
|
||||||
|
word = '%10.0f' % (float(word))
|
||||||
translated.append(word)
|
translated.append(word)
|
||||||
return ' '.join(translated)
|
return ' '.join(translated)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user