Rewrite numeric title sorting, pass <br/> in comments, force empty author to 'Unknown'

This commit is contained in:
GRiker 2010-02-02 07:57:32 -07:00
parent 1d28cb7ceb
commit 9bc0a9be43

View File

@ -306,17 +306,19 @@ class EPUB_MOBI(CatalogPlugin):
456 => four hundred fifty-six 456 => four hundred fifty-six
4:56 => four fifty-six 4:56 => four fifty-six
''' '''
ORDINALS = ['zeroth','first','second','third','fourth','fifth','sixth','seventh','eighth','ninth']
lessThanTwenty = ["<zero>","one","two","three","four","five","six","seven","eight","nine", lessThanTwenty = ["<zero>","one","two","three","four","five","six","seven","eight","nine",
"ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen", "ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen",
"eighteen","nineteen"] "eighteen","nineteen"]
tens = ["<zero>","<tens>","twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety"] tens = ["<zero>","<tens>","twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety"]
hundreds = ["<zero>","one","two","three","four","five","six","seven","eight","nine"] hundreds = ["<zero>","one","two","three","four","five","six","seven","eight","nine"]
def __init__(self, number): def __init__(self, number, verbose=False):
self.number = number self.number = number
self.number_as_float = 0.0 self.number_as_float = 0.0
self.text = '' self.text = ''
self.verbose = verbose
self.log = Log()
self.numberTranslate() self.numberTranslate()
def stringFromInt(self, intToTranslate): def stringFromInt(self, intToTranslate):
@ -324,7 +326,6 @@ class EPUB_MOBI(CatalogPlugin):
# intToTranslate is a three-digit number # intToTranslate is a three-digit number
tensComponentString = "" tensComponentString = ""
hundredsComponent = intToTranslate - (intToTranslate % 100) hundredsComponent = intToTranslate - (intToTranslate % 100)
tensComponent = intToTranslate % 100 tensComponent = intToTranslate % 100
@ -336,8 +337,7 @@ class EPUB_MOBI(CatalogPlugin):
# Build the tens component # Build the tens component
if tensComponent < 20: if tensComponent < 20:
if tensComponent > 0: tensComponentString = self.lessThanTwenty[tensComponent]
tensComponentString = self.lessThanTwenty[tensComponent]
else: else:
tensPart = "" tensPart = ""
onesPart = "" onesPart = ""
@ -369,9 +369,27 @@ class EPUB_MOBI(CatalogPlugin):
hundredsString = "" hundredsString = ""
thousandsString = "" thousandsString = ""
resultString = "" resultString = ""
self.suffix = ''
if self.verbose: self.log("numberTranslate(): %s" % self.number)
# Special case ordinals
if re.search('[st|nd|rd|th]',self.number):
self.number = re.sub(',','',self.number)
ordinal_suffix = re.search('[\D]', self.number)
ordinal_number = re.sub('\D','',re.sub(',','',self.number))
if self.verbose: self.log("Ordinal: %s" % ordinal_number)
self.number_as_float = ordinal_number
self.suffix = self.number[ordinal_suffix.start():]
if int(ordinal_number) > 9:
# Some typos (e.g., 'twentyth'), acceptable
self.text = '%s' % (EPUB_MOBI.NumberToText(ordinal_number).text)
else:
self.text = '%s' % (self.ORDINALS[int(ordinal_number)])
# Test for time # Test for time
if re.search(':',self.number): elif re.search(':',self.number):
if self.verbose: self.log("Time: %s" % self.number)
self.number_as_float = re.sub(':','.',self.number) self.number_as_float = re.sub(':','.',self.number)
time_strings = self.number.split(":") time_strings = self.number.split(":")
hours = EPUB_MOBI.NumberToText(time_strings[0]).text hours = EPUB_MOBI.NumberToText(time_strings[0]).text
@ -380,11 +398,13 @@ class EPUB_MOBI(CatalogPlugin):
# Test for % # Test for %
elif re.search('%', self.number): elif re.search('%', self.number):
if self.verbose: self.log("Percent: %s" % self.number)
self.number_as_float = self.number.split('%')[0] self.number_as_float = self.number.split('%')[0]
self.text = EPUB_MOBI.NumberToText(self.number.replace('%',' percent')).text self.text = EPUB_MOBI.NumberToText(self.number.replace('%',' percent')).text
# Test for decimal # Test for decimal
elif re.search('\.',self.number): elif re.search('\.',self.number):
if self.verbose: self.log("Decimal: %s" % self.number)
self.number_as_float = self.number self.number_as_float = self.number
decimal_strings = self.number.split(".") decimal_strings = self.number.split(".")
left = EPUB_MOBI.NumberToText(decimal_strings[0]).text left = EPUB_MOBI.NumberToText(decimal_strings[0]).text
@ -393,6 +413,7 @@ class EPUB_MOBI(CatalogPlugin):
# Test for hypenated # Test for hypenated
elif re.search('-', self.number): elif re.search('-', self.number):
if self.verbose: self.log("Hyphenated: %s" % self.number)
self.number_as_float = self.number.split('-')[0] self.number_as_float = self.number.split('-')[0]
strings = self.number.split('-') strings = self.number.split('-')
if re.search('[0-9]+', strings[0]): if re.search('[0-9]+', strings[0]):
@ -403,49 +424,54 @@ class EPUB_MOBI(CatalogPlugin):
right = EPUB_MOBI.NumberToText(strings[1]).text right = EPUB_MOBI.NumberToText(strings[1]).text
self.text = '%s-%s' % (left, right) self.text = '%s-%s' % (left, right)
# Test for $xx,xxx # Test for only commas and numbers
elif re.search('[$,]', self.number): elif re.search(',', self.number) and not re.search('[^0-9,]',self.number):
self.number_as_float = re.sub('[$,]','',self.number) if self.verbose: self.log("Comma(s): %s" % self.number)
self.text = EPUB_MOBI.NumberToText(self.number_as_float).text
# Test for comma
elif re.search(',', self.number):
self.number_as_float = re.sub(',','',self.number) self.number_as_float = re.sub(',','',self.number)
self.text = EPUB_MOBI.NumberToText(self.number_as_float).text self.text = EPUB_MOBI.NumberToText(self.number_as_float).text
# Test for hybrid e.g., 'K2' # Test for hybrid e.g., 'K2, 2nd, 10@10'
elif re.search('[\D]+', self.number): elif re.search('[\D]+', self.number):
result = [] if self.verbose: self.log("Hybrid: %s" % self.number)
for char in self.number: # Split the token into number/text
if re.search('[\d]+', char): number_position = re.search('\d',self.number).start()
result.append(EPUB_MOBI.NumberToText(char).text) text_position = re.search('\D',self.number).start()
else: if number_position < text_position:
result.append(char) number = self.number[:text_position]
self.text = ''.join(result) text = self.number[text_position:]
self.text = '%s%s' % (EPUB_MOBI.NumberToText(number).text,text)
else:
text = self.number[:number_position]
number = self.number[number_position:]
self.text = '%s%s' % (text, EPUB_MOBI.NumberToText(number).text)
else: else:
if self.verbose: self.log("Clean: %s" % self.number)
try: try:
self.float_as_number = float(self.number) self.float_as_number = float(self.number)
number = int(self.number) number = int(self.number)
except: except:
return return
if number > 1000000: if number > 10**9:
self.text = "%d out of range" % number self.text = "%d out of range" % number
return return
if number == 1000000: if number == 10**9:
self.text = "one million" self.text = "one billion"
else : else :
# Strip out the three-digit number groups # Isolate the three-digit number groups
thousandsNumber = number/1000 millionsNumber = number/10**6
hundredsNumber = number - (thousandsNumber * 1000) thousandsNumber = (number - (millionsNumber * 10**6))/10**3
hundredsNumber = number - (millionsNumber * 10**6) - (thousandsNumber * 10**3)
if self.verbose:
print "Converting %s %s %s" % (millionsNumber, thousandsNumber, hundredsNumber)
# Convert the lower 3 numbers - hundredsNumber # Convert hundredsNumber
if hundredsNumber : if hundredsNumber :
hundredsString = self.stringFromInt(hundredsNumber) hundredsString = self.stringFromInt(hundredsNumber)
# Convert the upper 3 numbers - thousandsNumber # Convert thousandsNumber
if thousandsNumber: if thousandsNumber:
if number > 1099 and number < 2000: if number > 1099 and number < 2000:
resultString = '%s %s' % (self.lessThanTwenty[number/100], resultString = '%s %s' % (self.lessThanTwenty[number/100],
@ -455,19 +481,26 @@ class EPUB_MOBI(CatalogPlugin):
else: else:
thousandsString = self.stringFromInt(thousandsNumber) thousandsString = self.stringFromInt(thousandsNumber)
# Convert millionsNumber
if millionsNumber:
millionsString = self.stringFromInt(millionsNumber)
# Concatenate the strings # Concatenate the strings
if thousandsNumber and not hundredsNumber: resultString = ''
resultString = "%s thousand" % thousandsString if millionsNumber:
resultString += "%s million " % millionsString
if thousandsNumber and hundredsNumber: if thousandsNumber:
resultString = "%s thousand %s" % (thousandsString, hundredsString) resultString += "%s thousand " % thousandsString
if not thousandsNumber and hundredsNumber: if hundredsNumber:
resultString = "%s" % hundredsString resultString += "%s" % hundredsString
if not thousandsNumber and not hundredsNumber: if not millionsNumber and not thousandsNumber and not hundredsNumber:
resultString = "zero" resultString = "zero"
if self.verbose:
self.log(u'resultString: %s' % resultString)
self.text = resultString.strip().capitalize() self.text = resultString.strip().capitalize()
class CatalogBuilder(object): class CatalogBuilder(object):
@ -860,7 +893,10 @@ class EPUB_MOBI(CatalogPlugin):
title = this_title['title'] = self.convertHTMLEntities(record['title']) title = this_title['title'] = self.convertHTMLEntities(record['title'])
this_title['title_sort'] = self.generateSortTitle(title) this_title['title_sort'] = self.generateSortTitle(title)
this_title['author'] = " &amp; ".join(record['authors']) if 'authors' in record and len(record['authors']):
this_title['author'] = " &amp; ".join(record['authors'])
else:
this_title['author'] = 'Unknown'
this_title['author_sort'] = record['author_sort'] if len(record['author_sort']) \ this_title['author_sort'] = record['author_sort'] if len(record['author_sort']) \
else self.author_to_author_sort(this_title['author']) else self.author_to_author_sort(this_title['author'])
this_title['id'] = record['id'] this_title['id'] = record['id']
@ -872,10 +908,14 @@ class EPUB_MOBI(CatalogPlugin):
this_title['timestamp'] = record['timestamp'] this_title['timestamp'] = record['timestamp']
if record['comments']: if record['comments']:
#this_title['description'] = re.sub('&', '&amp;', record['comments']) #this_title['description'] = re.sub('&', '&amp;', record['comments'])
if re.search('<(?P<tag>.+)>.+</(?P=tag)>|<!--.+-->|<.+/>',record['comments']): has_xml = re.search('<(?P<tag>.+)>.+</(?P=tag)>|<!--.+-->|<.+/>',record['comments'])
self.opts.log(" %d: %s (%s) contains suspect metadata" % \ if has_xml and not re.search('<br', record['comments']):
self.opts.log.warning(" %d: %s (%s) contains suspect markup" % \
(this_title['id'], this_title['title'],this_title['author'])) (this_title['id'], this_title['title'],this_title['author']))
this_title['description'] = prepare_string_for_xml(record['comments']) this_title['description'] = prepare_string_for_xml(record['comments'])
else:
# If <br/> present, take a chance that the markup is valid
this_title['description'] = record['comments']
this_title['short_description'] = self.generateShortDescription(this_title['description']) this_title['short_description'] = self.generateShortDescription(this_title['description'])
else: else:
this_title['description'] = None this_title['description'] = None
@ -903,8 +943,10 @@ class EPUB_MOBI(CatalogPlugin):
key=lambda x:(x['title_sort'].upper(), x['title_sort'].upper())) key=lambda x:(x['title_sort'].upper(), x['title_sort'].upper()))
if False and self.verbose: if False and self.verbose:
self.opts.log.info("fetchBooksByTitle(): %d books" % len(self.booksByTitle)) self.opts.log.info("fetchBooksByTitle(): %d books" % len(self.booksByTitle))
self.opts.log.info(" %-40s %-40s" % ('title', 'title_sort'))
for title in self.booksByTitle: for title in self.booksByTitle:
self.opts.log.info((u" %-50s %-25s" % (title['title'][0:45], title['title_sort'][0:20])).encode('utf-8')) self.opts.log.info((u" %-40s %-40s" % (title['title'][0:40],
title['title_sort'][0:40])).encode('utf-8'))
def fetchBooksByAuthor(self): def fetchBooksByAuthor(self):
# Generate a list of titles sorted by author from the database # Generate a list of titles sorted by author from the database
@ -2643,27 +2685,26 @@ class EPUB_MOBI(CatalogPlugin):
# Leading numbers optionally translated to text equivalent # Leading numbers optionally translated to text equivalent
# Capitalize leading sort word # Capitalize leading sort word
if i==0: if i==0:
if self.opts.numbers_as_text and re.search('[0-9]+',word): if self.opts.numbers_as_text and re.match('[0-9]+',word[0]):
translated.append(EPUB_MOBI.NumberToText(word).text.capitalize()) translated.append(EPUB_MOBI.NumberToText(word).text.capitalize())
else: else:
if re.search('-',word): if re.match('[0-9]+',word[0]):
# Split hyphenated words for sorting word = word.replace(',','')
tokens = word.split('-') suffix = re.search('[\D]', word)
title_words[0] = tokens[0] if suffix:
title_words.insert(1,tokens[1]) word = '%10.0f%s' % (float(word[:suffix.start()]),word[suffix.start():])
if re.search('[0-9]+',word): else:
# Coerce standard-width strings for numbers for value sorting word = '%10.0f' % (float(word))
# Any non-digit is interpreted as a decimal point
# word = '%10.2f' % float(re.sub('[^\d\.]','',word))
try:
word = '%10.2f' % float(re.sub('[^\d\.]','.',word))
except:
word = '%10.2f' % float(EPUB_MOBI.NumberToText(word).number_as_float)
translated.append(word.capitalize()) translated.append(word.capitalize())
else: else:
if re.search('[0-9]+',word): if re.search('[0-9]+',word[0]):
# Coerce standard-width strings for numbers word = word.replace(',','')
word = '%10.2f' % float(re.sub('[^\d\.]','',word)) suffix = re.search('[\D]', word)
if suffix:
word = '%10.0f%s' % (float(word[:suffix.start()]),word[suffix.start():])
else:
word = '%10.0f' % (float(word))
translated.append(word) translated.append(word)
return ' '.join(translated) return ' '.join(translated)