Make TVDB name normalizer Unicode-aware

This commit is contained in:
Erik Rigtorp 2020-05-07 18:45:04 -07:00
parent 762e0c8d17
commit 82e8865147

View File

@ -274,16 +274,6 @@ namespace MediaBrowser.Providers.Plugins.TheTvdb
.ToList(); .ToList();
} }
/// <summary>
/// Characters that are dropped from a name entirely when building its comparable form:
/// double quote, apostrophe, exclamation mark, backtick and question mark.
/// </summary>
const string remove = "\"'!`?";
/// <summary>
/// Punctuation characters that are each replaced by a space when building the comparable
/// form of a name; runs of whitespace are collapsed afterwards by the caller.
/// </summary>
const string spacers = "/,.:;\\(){}[]+-_=*"; // (there are two types of dashes, short and long) NOTE(review): only the short dash '-' is visible in this literal; confirm the long dash was not lost
/// <summary> /// <summary>
/// Gets the name of the comparable. /// Gets the name of the comparable.
/// </summary> /// </summary>
@ -293,33 +283,11 @@ namespace MediaBrowser.Providers.Plugins.TheTvdb
{ {
name = name.ToLowerInvariant(); name = name.ToLowerInvariant();
name = name.Normalize(NormalizationForm.FormKD); name = name.Normalize(NormalizationForm.FormKD);
var sb = new StringBuilder(); name = name.Replace(", the", string.Empty).Replace("the ", " ").Replace(" the ", " ");
foreach (var c in name) name = name.Replace("&", " and " );
{ name = Regex.Replace(name, @"[\p{Lm}\p{Mn}]", string.Empty); // Remove diacritics, etc
if (c >= 0x2B0 && c <= 0x0333) name = Regex.Replace(name, @"[\W\p{Pc}]+", " "); // Replace sequences of non-word characters and _ with " "
{ return name.Trim();
// skip char modifier and diacritics
}
else if (remove.IndexOf(c) > -1)
{
// skip chars we are removing
}
else if (spacers.IndexOf(c) > -1)
{
sb.Append(" ");
}
else if (c == '&')
{
sb.Append(" and ");
}
else
{
sb.Append(c);
}
}
sb.Replace(", the", string.Empty).Replace("the ", " ").Replace(" the ", " ");
return Regex.Replace(sb.ToString().Trim(), @"\s+", " ");
} }
private void MapSeriesToResult(MetadataResult<Series> result, TvDbSharper.Dto.Series tvdbSeries, string metadataLanguage) private void MapSeriesToResult(MetadataResult<Series> result, TvDbSharper.Dto.Series tvdbSeries, string metadataLanguage)