mirror of
https://github.com/kovidgoyal/calibre.git
synced 2026-03-24 02:17:51 -04:00
264 lines
8.4 KiB
Python
264 lines
8.4 KiB
Python
"""
|
|
Tagging utilities - from user tag input parsing to tag cloud
|
|
calculation.
|
|
"""
|
|
import math
|
|
import types
|
|
|
|
from django.db.models.query import QuerySet
|
|
from django.utils.encoding import force_unicode
|
|
from django.utils.translation import ugettext as _
|
|
|
|
# Python 2.3 compatibility: the ``set`` builtin only exists from
# Python 2.4 onwards, so when the name is undefined fall back to the
# ``Set`` class of the ``sets`` module under the same name.
try:
    set
except NameError:
    from sets import Set as set
|
|
|
|
def parse_tag_input(input):
    """
    Parses tag input, with multiple word input being activated and
    delineated by commas and double quotes. Quotes take precedence, so
    they may contain commas.

    Text enclosed in a closed pair of double quotes always forms a
    single tag. The remaining unquoted text is split on commas if at
    least one unquoted comma was seen anywhere in the input, otherwise
    it is split on spaces. A double quote which is never closed is not
    honoured: the text following it is handled as unquoted text.

    Returns a sorted list of unique tag names. Empty input yields an
    empty list.
    """
    if not input:
        return []

    # Normalise to a unicode string so it can be compared against the
    # unicode literals used below.
    input = force_unicode(input)

    # Special case - if there are no commas or double quotes in the
    # input, there is nothing to parse: we know we only need to split
    # on spaces.
    if u',' not in input and u'"' not in input:
        words = list(set(split_strip(input, u' ')))
        words.sort()
        return words

    # Finished tag names (so far only those taken from quoted sections).
    words = []
    # Characters of the section currently being read.
    buffer = []
    # Defer splitting of non-quoted sections until we know if there are
    # any unquoted commas.
    to_be_split = []
    saw_loose_comma = False
    open_quote = False
    i = iter(input)
    # The loop below has no exit condition of its own; it ends when
    # ``i.next()`` raises StopIteration at the end of the input.
    try:
        while 1:
            c = i.next()
            if c == u'"':
                # An opening quote ends any unquoted section in progress.
                if buffer:
                    to_be_split.append(u''.join(buffer))
                    buffer = []
                # Find the matching quote
                open_quote = True
                c = i.next()
                while c != u'"':
                    buffer.append(c)
                    c = i.next()
                # The quote was closed: its stripped content is a single
                # tag, unless it is empty.
                if buffer:
                    word = u''.join(buffer).strip()
                    if word:
                        words.append(word)
                    buffer = []
                open_quote = False
            else:
                # Remember the first comma found outside of quotes; it
                # selects the delimiter for all unquoted sections.
                if not saw_loose_comma and c == u',':
                    saw_loose_comma = True
                buffer.append(c)
    except StopIteration:
        # If we were parsing an open quote which was never closed treat
        # the buffer as unquoted.
        if buffer:
            # Commas read after the unclosed quote were not counted in
            # the loop above, so account for them here.
            if open_quote and u',' in buffer:
                saw_loose_comma = True
            to_be_split.append(u''.join(buffer))
    if to_be_split:
        if saw_loose_comma:
            delimiter = u','
        else:
            delimiter = u' '
        for chunk in to_be_split:
            words.extend(split_strip(chunk, delimiter))
    # De-duplicate, then sort for a stable result.
    words = list(set(words))
    words.sort()
    return words
|
|
|
|
def split_strip(input, delimiter=u','):
    """
    Breaks ``input`` apart at every occurrence of ``delimiter`` and
    trims surrounding whitespace from each piece.

    Pieces which are empty after trimming are dropped. A false
    ``input`` (empty string, ``None``) produces an empty list.
    """
    pieces = []
    if input:
        for raw in input.split(delimiter):
            trimmed = raw.strip()
            if trimmed:
                pieces.append(trimmed)
    return pieces
|
|
|
|
def edit_string_for_tags(tags):
    """
    Given list of ``Tag`` instances, creates a string representation of
    the list suitable for editing by the user, such that submitting the
    given string representation back without changing it will give the
    same list of tags.

    Tag names which contain commas will be double quoted.

    If any tag name which isn't being quoted contains a space, the
    resulting string of tag names will be comma-delimited, otherwise
    it will be space-delimited.

    If the list consists of a single unquoted tag name which contains
    a space, that name is double quoted as well. Without the quotes
    the string would contain neither a comma nor a quote and would be
    parsed back as several space-delimited tags.

    Each item of ``tags`` only needs a ``name`` attribute. Returns the
    resulting string, which is empty when ``tags`` is empty.
    """
    names = []
    use_commas = False
    for tag in tags:
        name = tag.name
        if u',' in name:
            # Quoted names are self-delimiting, so any space inside
            # them does not force comma delimiting.
            names.append('"%s"' % name)
            continue
        if u' ' in name:
            use_commas = True
        names.append(name)

    if use_commas:
        glue = u', '
    else:
        glue = u' '
    result = glue.join(names)

    # ``use_commas`` with a single name means that name is unquoted and
    # contains a space, but there is no second name to put a comma
    # next to. Quote it so it round-trips as one tag.
    if len(names) == 1 and use_commas:
        result = u'"%s"' % result
    return result
|
|
|
|
def get_queryset_and_model(queryset_or_model):
    """
    Normalises the argument to a two-tuple of (queryset, model).

    An object exposing a ``model`` attribute is taken to be a
    ``QuerySet`` and is returned unchanged alongside that attribute.
    Anything else is taken to be a ``Model``; the queryset is then
    obtained from ``_default_manager.all()`` of the given model.
    """
    try:
        model = queryset_or_model.model
    except AttributeError:
        # No ``model`` attribute, so this must be the model itself.
        model = queryset_or_model
        queryset = model._default_manager.all()
    else:
        queryset = queryset_or_model
    return queryset, model
|
|
|
|
def get_tag_list(tags):
    """
    Accepts tag input in several forms and turns it into a collection
    of tags.

    Returned inside a new single-item list:

       * A ``Tag`` object.

    Used to build a ``Tag`` ``QuerySet``:

       * A string, which may contain multiple tag names.
       * A list or tuple of strings corresponding to tag names.
       * A list or tuple of integers corresponding to tag ids.

    Returned as-is:

       * A list or tuple of ``Tag`` objects.
       * A ``Tag`` ``QuerySet``.
       * An empty list or tuple.

    A ``ValueError`` is raised for any other kind of input, and for a
    list or tuple whose recognised items are not all of one kind.
    """
    from tagging.models import Tag

    if isinstance(tags, Tag):
        return [tags]
    if isinstance(tags, QuerySet) and tags.model is Tag:
        return tags
    if isinstance(tags, types.StringTypes):
        return Tag.objects.filter(name__in=parse_tag_input(tags))
    if not isinstance(tags, (types.ListType, types.TupleType)):
        raise ValueError(_('The tag input given was invalid.'))

    # From here on ``tags`` is a list or tuple.
    if len(tags) == 0:
        return tags

    # Record which of the supported kinds of item occur; items of any
    # other type are not recorded.
    kinds = set()
    for item in tags:
        if isinstance(item, types.StringTypes):
            kinds.add('string')
        elif isinstance(item, Tag):
            kinds.add('tag')
        elif isinstance(item, (types.IntType, types.LongType)):
            kinds.add('int')

    if len(kinds) != 1:
        raise ValueError(_('If a list or tuple of tags is provided, they must all be tag names, Tag objects or Tag ids.'))

    if 'string' in kinds:
        tag_names = [force_unicode(tag) for tag in tags]
        return Tag.objects.filter(name__in=tag_names)
    if 'tag' in kinds:
        return tags
    # The only kind left is 'int'.
    return Tag.objects.filter(id__in=tags)
|
|
|
|
def get_tag(tag):
    """
    Accepts single tag input in several forms and resolves it to a
    ``Tag``.

    A ``Tag`` object is handed back untouched. A string is looked up
    by tag name and an integer by tag id.

    ``None`` is returned when the lookup finds no matching tag, and
    also when the input is of none of the supported types.
    """
    from tagging.models import Tag

    if isinstance(tag, Tag):
        return tag

    # Work out which field to query on, then do a single lookup.
    if isinstance(tag, types.StringTypes):
        lookup = {'name': tag}
    elif isinstance(tag, (types.IntType, types.LongType)):
        lookup = {'id': tag}
    else:
        return None

    try:
        return Tag.objects.get(**lookup)
    except Tag.DoesNotExist:
        return None
|
|
|
|
# Identifiers of the font size distribution algorithms which can be
# passed as the ``distribution`` argument of ``calculate_cloud``.
LOGARITHMIC = 1
LINEAR = 2
|
|
|
|
def _calculate_thresholds(min_weight, max_weight, steps):
    """
    Divides the range from ``min_weight`` to ``max_weight`` into
    ``steps`` equally wide bands and returns the list of their upper
    bounds, in ascending order.
    """
    band_width = (max_weight - min_weight) / float(steps)
    upper_bounds = []
    for band in range(1, steps + 1):
        upper_bounds.append(min_weight + band * band_width)
    return upper_bounds
|
|
|
|
def _calculate_tag_weight(weight, max_weight, distribution):
    """
    Returns the weight used to place a tag in a font size band.

    For the ``LINEAR`` distribution, or whenever ``max_weight`` is 1,
    ``weight`` is returned unchanged. For the ``LOGARITHMIC``
    distribution the weight is rescaled logarithmically. Any other
    ``distribution`` raises a ``ValueError``.

    Logarithmic tag weight calculation is based on code from the
    `Tag Cloud`_ plugin for Mephisto, by Sven Fuchs.

    .. _`Tag Cloud`: http://www.artweb-design.de/projects/mephisto-plugin-tag-cloud
    """
    if max_weight == 1 or distribution == LINEAR:
        return weight
    if distribution != LOGARITHMIC:
        raise ValueError(_('Invalid distribution algorithm specified: %s.') % distribution)
    scaled = math.log(weight) * max_weight
    return scaled / math.log(max_weight)
|
|
|
|
def calculate_cloud(tags, steps=4, distribution=LOGARITHMIC):
    """
    Add a ``font_size`` attribute to each tag according to the
    frequency of its use, as indicated by its ``count``
    attribute.

    ``steps`` defines the range of font sizes - ``font_size`` will
    be an integer between 1 and ``steps`` (inclusive).

    ``distribution`` defines the type of font size distribution
    algorithm which will be used - logarithmic or linear. It must be
    one of ``tagging.utils.LOGARITHMIC`` or ``tagging.utils.LINEAR``.

    Returns ``tags``, whose items have been modified in place. Empty
    input is returned untouched.
    """
    if len(tags) > 0:
        counts = [tag.count for tag in tags]
        min_weight = float(min(counts))
        max_weight = float(max(counts))
        thresholds = _calculate_thresholds(min_weight, max_weight, steps)
        for tag in tags:
            tag_weight = _calculate_tag_weight(tag.count, max_weight, distribution)
            # Start from the largest size. Both the thresholds and the
            # weight are the result of floating point arithmetic, so
            # the weight of the most used tag can in principle end up
            # marginally above the last threshold. Such a tag belongs
            # in the top band rather than being left without a
            # ``font_size`` attribute.
            font_size = steps
            for i in range(steps):
                if tag_weight <= thresholds[i]:
                    # The first band whose upper bound covers the
                    # weight wins.
                    font_size = i + 1
                    break
            tag.font_size = font_size
    return tags
|