Normalized genre names truncated to available width.

This commit is contained in:
GRiker 2012-11-05 15:10:25 -07:00
parent b306cfb3ae
commit 5cdbd831af
2 changed files with 110 additions and 52 deletions

View File

@ -35,3 +35,4 @@ nbproject/
.settings/ .settings/
*.DS_Store *.DS_Store
calibre_plugins/ calibre_plugins/
./src/calibre/gui2/catalog/catalog_csv_xml.ui.autosave

View File

@ -14,11 +14,12 @@ from calibre.customize.conversion import DummyReporter
from calibre.customize.ui import output_profiles from calibre.customize.ui import output_profiles
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag, NavigableString from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag, NavigableString
from calibre.ebooks.chardet import substitute_entites from calibre.ebooks.chardet import substitute_entites
from calibre.ebooks.metadata import author_to_author_sort
from calibre.library.catalogs import AuthorSortMismatchException, EmptyCatalogException from calibre.library.catalogs import AuthorSortMismatchException, EmptyCatalogException
from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.config import config_dir from calibre.utils.config import config_dir
from calibre.utils.date import format_date, is_date_undefined, now as nowf from calibre.utils.date import format_date, is_date_undefined, now as nowf
from calibre.utils.filenames import ascii_text from calibre.utils.filenames import ascii_text, shorten_components_to
from calibre.utils.icu import capitalize, collation_order, sort_key from calibre.utils.icu import capitalize, collation_order, sort_key
from calibre.utils.magick.draw import thumbnail from calibre.utils.magick.draw import thumbnail
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
@ -109,6 +110,7 @@ class CatalogBuilder(object):
self.stylesheet = stylesheet self.stylesheet = stylesheet
self.cache_dir = os.path.join(config_dir, 'caches', 'catalog') self.cache_dir = os.path.join(config_dir, 'caches', 'catalog')
self.catalog_path = PersistentTemporaryDirectory("_epub_mobi_catalog", prefix='') self.catalog_path = PersistentTemporaryDirectory("_epub_mobi_catalog", prefix='')
self.content_dir = os.path.join(self.catalog_path, "content")
self.excluded_tags = self.get_excluded_tags() self.excluded_tags = self.get_excluded_tags()
self.generate_for_kindle_azw3 = True if (_opts.fmt == 'azw3' and self.generate_for_kindle_azw3 = True if (_opts.fmt == 'azw3' and
_opts.output_profile and _opts.output_profile and
@ -127,12 +129,13 @@ class CatalogBuilder(object):
self.books_by_title = None self.books_by_title = None
self.books_by_title_no_series_prefix = None self.books_by_title_no_series_prefix = None
self.books_to_catalog = None self.books_to_catalog = None
self.content_dir = os.path.join(self.catalog_path, "content")
self.current_step = 0.0 self.current_step = 0.0
self.error = [] self.error = []
self.generate_recently_read = False self.generate_recently_read = False
self.genres = [] self.genres = []
self.genre_tags_dict = None self.genre_tags_dict = \
self.filter_db_tags(max_len = 245 - len("%s/Genre_.html" % self.content_dir)) \
if self.opts.generate_genres else None
self.html_filelist_1 = [] self.html_filelist_1 = []
self.html_filelist_2 = [] self.html_filelist_2 = []
self.merge_comments_rule = dict(zip(['field','position','hr'], self.merge_comments_rule = dict(zip(['field','position','hr'],
@ -505,7 +508,7 @@ class CatalogBuilder(object):
if not os.path.isdir(images_path): if not os.path.isdir(images_path):
os.makedirs(images_path) os.makedirs(images_path)
def detect_author_sort_mismatches(self): def detect_author_sort_mismatches(self, books_to_test):
""" Detect author_sort mismatches. """ Detect author_sort mismatches.
Sort by author, look for inconsistencies in author_sort among Sort by author, look for inconsistencies in author_sort among
@ -513,17 +516,18 @@ class CatalogBuilder(object):
annoyance for EPUB. annoyance for EPUB.
Inputs: Inputs:
self.books_to_catalog (list): list of books to catalog books_by_author (list): list of books to test, possibly unsorted
Output: Output:
self.books_by_author (list): sorted by author (none)
Exceptions: Exceptions:
AuthorSortMismatchException: author_sort mismatch detected AuthorSortMismatchException: author_sort mismatch detected
""" """
self.books_by_author = sorted(list(self.books_to_catalog), key=self._kf_books_by_author_sorter_author) books_by_author = sorted(list(books_to_test), key=self._kf_books_by_author_sorter_author)
authors = [(record['author'], record['author_sort']) for record in self.books_by_author]
authors = [(record['author'], record['author_sort']) for record in books_by_author]
current_author = authors[0] current_author = authors[0]
for (i,author) in enumerate(authors): for (i,author) in enumerate(authors):
if author != current_author and i: if author != current_author and i:
@ -701,6 +705,7 @@ class CatalogBuilder(object):
def fetch_books_by_author(self): def fetch_books_by_author(self):
""" Generate a list of books sorted by author. """ Generate a list of books sorted by author.
For books with multiple authors, relist book with additional authors.
Sort the database by author. Report author_sort inconsistencies as warning when Sort the database by author. Report author_sort inconsistencies as warning when
building EPUB or MOBI, error when building MOBI. Collect a list of unique authors building EPUB or MOBI, error when building MOBI. Collect a list of unique authors
to self.authors. to self.authors.
@ -720,25 +725,29 @@ class CatalogBuilder(object):
self.update_progress_full_step(_("Sorting database")) self.update_progress_full_step(_("Sorting database"))
self.detect_author_sort_mismatches() books_by_author = list(self.books_to_catalog)
self.detect_author_sort_mismatches(books_by_author)
books_by_author = self.relist_multiple_authors(books_by_author)
#books_by_author = sorted(list(books_by_author), key=self._kf_books_by_author_sorter_author)
# Sort authors using sort_key to normalize accented letters
# Determine the longest author_sort length before sorting # Determine the longest author_sort length before sorting
asl = [i['author_sort'] for i in self.books_by_author] asl = [i['author_sort'] for i in books_by_author]
las = max(asl, key=len) las = max(asl, key=len)
self.books_by_author = sorted(self.books_to_catalog,
books_by_author = sorted(books_by_author,
key=lambda x: sort_key(self._kf_books_by_author_sorter_author_sort(x, len(las)))) key=lambda x: sort_key(self._kf_books_by_author_sorter_author_sort(x, len(las))))
if self.DEBUG and self.opts.verbose: if self.DEBUG and self.opts.verbose:
tl = [i['title'] for i in self.books_by_author] tl = [i['title'] for i in books_by_author]
lt = max(tl, key=len) lt = max(tl, key=len)
fs = '{:<6}{:<%d} {:<%d} {!s}' % (len(lt),len(las)) fs = '{:<6}{:<%d} {:<%d} {!s}' % (len(lt),len(las))
print(fs.format('','Title','Author','Series')) print(fs.format('','Title','Author','Series'))
for i in self.books_by_author: for i in books_by_author:
print(fs.format('', i['title'],i['author_sort'],i['series'])) print(fs.format('', i['title'],i['author_sort'],i['series']))
# Build the unique_authors set from existing data # Build the unique_authors set from existing data
authors = [(record['author'], capitalize(record['author_sort'])) for record in self.books_by_author] authors = [(record['author'], capitalize(record['author_sort'])) for record in books_by_author]
# authors[] contains a list of all book authors, with multiple entries for multiple books by author # authors[] contains a list of all book authors, with multiple entries for multiple books by author
# authors[]: (([0]:friendly [1]:sort)) # authors[]: (([0]:friendly [1]:sort))
@ -776,6 +785,7 @@ class CatalogBuilder(object):
author[2])).encode('utf-8')) author[2])).encode('utf-8'))
self.authors = unique_authors self.authors = unique_authors
self.books_by_author = books_by_author
return True return True
def fetch_books_by_title(self): def fetch_books_by_title(self):
@ -863,15 +873,15 @@ class CatalogBuilder(object):
this_title['series_index'] = 0.0 this_title['series_index'] = 0.0
this_title['title_sort'] = self.generate_sort_title(this_title['title']) this_title['title_sort'] = self.generate_sort_title(this_title['title'])
if 'authors' in record:
# from calibre.ebooks.metadata import authors_to_string
# return authors_to_string(self.authors)
if 'authors' in record:
this_title['authors'] = record['authors'] this_title['authors'] = record['authors']
# Synthesize author attribution from authors list
if record['authors']: if record['authors']:
this_title['author'] = " &amp; ".join(record['authors']) this_title['author'] = " &amp; ".join(record['authors'])
else: else:
this_title['author'] = 'Unknown' this_title['author'] = _('Unknown')
this_title['authors'] = [this_title['author']]
if 'author_sort' in record and record['author_sort'].strip(): if 'author_sort' in record and record['author_sort'].strip():
this_title['author_sort'] = record['author_sort'] this_title['author_sort'] = record['author_sort']
@ -1093,7 +1103,7 @@ class CatalogBuilder(object):
self.bookmarked_books = bookmarks self.bookmarked_books = bookmarks
def filter_db_tags(self): def filter_db_tags(self, max_len):
""" Remove excluded tags from data set, return normalized genre list. """ Remove excluded tags from data set, return normalized genre list.
Filter all db tags, removing excluded tags supplied in opts. Filter all db tags, removing excluded tags supplied in opts.
@ -1101,13 +1111,13 @@ class CatalogBuilder(object):
tags are flattened to alphanumeric ascii_text. tags are flattened to alphanumeric ascii_text.
Args: Args:
(none) max_len: maximum length of normalized tag to fit within OS constraints
Return: Return:
genre_tags_dict (dict): dict of filtered, normalized tags in data set genre_tags_dict (dict): dict of filtered, normalized tags in data set
""" """
def _format_tag_list(tags, indent=2, line_break=70, header='Tag list'): def _format_tag_list(tags, indent=1, line_break=70, header='Tag list'):
def _next_tag(sorted_tags): def _next_tag(sorted_tags):
for (i, tag) in enumerate(sorted_tags): for (i, tag) in enumerate(sorted_tags):
if i < len(tags) - 1: if i < len(tags) - 1:
@ -1126,6 +1136,31 @@ class CatalogBuilder(object):
out_str = ' ' * (indent + 1) out_str = ' ' * (indent + 1)
return ans + out_str return ans + out_str
def _normalize_tag(tag, max_len):
    """ Generate an XHTML-legal anchor string from tag, clipped to max_len.

    The tag is passed through ascii_text(), lowercased and stripped of
    all whitespace. Any remaining non-word character is replaced by the
    result of self.generate_unicode_name() for that character (self is
    taken from the enclosing method's scope).

    Args:
     tag (str): tag name possibly containing symbols
     max_len (int): maximum length of the normalized tag

    Return:
     shortened (str): normalized tag after shorten_components_to()
                      has limited it to max_len
    """
    # Raw strings keep the regex escapes from being read as (invalid)
    # string escapes
    massaged = re.sub(r'\s', '', ascii_text(tag).lower())

    if re.search(r'\W', massaged):
        # Only rebuild the string when at least one symbol is present
        normalized = ''.join(
            self.generate_unicode_name(c) if re.search(r'\W', c) else c
            for c in massaged)
    else:
        normalized = massaged

    # shorten_components_to() works on a list of path components;
    # hand it the single component and take it back out
    return shorten_components_to(max_len, [normalized])[0]
# Entry point # Entry point
normalized_tags = [] normalized_tags = []
friendly_tags = [] friendly_tags = []
@ -1144,7 +1179,7 @@ class CatalogBuilder(object):
if tag == ' ': if tag == ' ':
continue continue
normalized_tags.append(self.normalize_tag(tag)) normalized_tags.append(_normalize_tag(tag, max_len))
friendly_tags.append(tag) friendly_tags.append(tag)
genre_tags_dict = dict(zip(friendly_tags,normalized_tags)) genre_tags_dict = dict(zip(friendly_tags,normalized_tags))
@ -1941,8 +1976,6 @@ class CatalogBuilder(object):
self.update_progress_full_step(_("Genres HTML")) self.update_progress_full_step(_("Genres HTML"))
self.genre_tags_dict = self.filter_db_tags()
# Extract books matching filtered_tags # Extract books matching filtered_tags
genre_list = [] genre_list = []
for friendly_tag in sorted(self.genre_tags_dict, key=sort_key): for friendly_tag in sorted(self.genre_tags_dict, key=sort_key):
@ -2024,10 +2057,11 @@ class CatalogBuilder(object):
books_by_current_author += 1 books_by_current_author += 1
# Write the genre book list as an article # Write the genre book list as an article
titles_spanned = self.generate_html_by_genre(genre, True if index==0 else False, outfile = "%s/Genre_%s.html" % (self.content_dir, genre)
genre_tag_set[genre], titles_spanned = self.generate_html_by_genre(genre,
"%s/Genre_%s.html" % (self.content_dir, True if index==0 else False,
genre)) genre_tag_set[genre],
outfile)
tag_file = "content/Genre_%s.html" % genre tag_file = "content/Genre_%s.html" % genre
master_genre_list.append({'tag':genre, master_genre_list.append({'tag':genre,
@ -2549,7 +2583,7 @@ class CatalogBuilder(object):
for (i, tag) in enumerate(sorted(book.get('tags', []))): for (i, tag) in enumerate(sorted(book.get('tags', []))):
aTag = Tag(_soup,'a') aTag = Tag(_soup,'a')
if self.opts.generate_genres: if self.opts.generate_genres:
aTag['href'] = "Genre_%s.html" % self.normalize_tag(tag) aTag['href'] = "Genre_%s.html" % self.genre_tags_dict[tag]
aTag.insert(0,escape(NavigableString(tag))) aTag.insert(0,escape(NavigableString(tag)))
genresTag.insert(gtc, aTag) genresTag.insert(gtc, aTag)
gtc += 1 gtc += 1
@ -4603,28 +4637,6 @@ class CatalogBuilder(object):
return merged return merged
def normalize_tag(self, tag):
    """ Generate an XHTML-legal anchor string from tag.

    The tag is passed through ascii_text(), lowercased and stripped of
    all whitespace. Any remaining non-word character is replaced by the
    result of self.generate_unicode_name() for that character.

    Args:
     tag (str): tag name possibly containing symbols

    Return:
     normalized (str): tag with symbols replaced by their unicode names
    """
    # Raw strings keep the regex escapes from being read as (invalid)
    # string escapes
    massaged = re.sub(r'\s', '', ascii_text(tag).lower())

    if not re.search(r'\W', massaged):
        # Nothing but word characters: usable as-is
        return massaged

    return ''.join(
        self.generate_unicode_name(c) if re.search(r'\W', c) else c
        for c in massaged)
def process_exclusions(self, data_set): def process_exclusions(self, data_set):
""" Filter data_set based on exclusion_rules. """ Filter data_set based on exclusion_rules.
@ -4697,6 +4709,51 @@ class CatalogBuilder(object):
else: else:
return data_set return data_set
def relist_multiple_authors(self, books_by_author):
    """ Create multiple entries for books with multiple authors

    Scan the list for books whose 'authors' entry holds more than one
    name. For each additional author, append a deep copy of the book in
    which the author list has been rotated so that author comes first,
    with 'author', 'authors' and 'author_sort' recomputed to match.
    Books without an 'authors' entry are left untouched.

    Args:
     books_by_author (list): book list possibly containing books
                             with multiple authors; extended in place

    Return:
     (list): the same books_by_author list, with additional entries
             for books with multiple authors
    """
    # Collect the multiple author books up front so the clones appended
    # below are never rescanned. A book may lack 'authors' entirely, so
    # test for the key before measuring it.
    multiple_author_books = [book for book in books_by_author
                             if 'authors' in book and len(book['authors']) > 1]

    for book in multiple_author_books:
        cloned_authors = list(book['authors'])
        # One rotation per additional author: move the current first
        # author to the end so the next author leads the clone
        for _ in range(len(cloned_authors) - 1):
            cloned_authors.append(cloned_authors.pop(0))

            new_book = deepcopy(book)
            # NOTE(review): joined with a bare ' & ' here, while elsewhere
            # in this file author names are joined with ' &amp; ' --
            # confirm which form downstream consumers expect
            new_book['author'] = ' & '.join(cloned_authors)
            new_book['authors'] = list(cloned_authors)
            new_book['author_sort'] = ' & '.join(
                [author_to_author_sort(auth) for auth in cloned_authors])
            books_by_author.append(new_book)

    return books_by_author
def update_progress_full_step(self, description): def update_progress_full_step(self, description):
""" Update calibre's job status UI. """ Update calibre's job status UI.