Kavita/API/Services/Tasks/ScannerService.cs
Joseph Milazzo d7d7f9b529
Collection Support (#234)
* Readme refactored to be more clean and clear, taking inspiration from wiki.js's readme.

* Initial backend for Collections and basic metadata implemented.

* More build flavors for Raspberry Pi users and updated Install since we don't need users to set their own JWT Token Key. Update a typo in appsettings.json file for prod.

* Fixed #224. Sort before getting a First?Last() chapter

* The rough ability to add and get series metadata and tags.

* Fix a bug on getting metadata for when it doesn't exist.

* Fixed a bug where flattening directories with some unique filenames could cause reading order of images to be out of order.

* Added a seed code to ensure all series have SeriesMetadata

* Ensure all instances of opening an epub are using "using" so we don't lock the file. When we have a malformed html file, log the issues and inform the user we can't open the file.

* Book reader now handles @Import "" statements in CSS and inlines the css into css file that references them. This allows for them to be scoped. In addition, if the html or body tag had classes, we now send back a single div with those classes.

* Fixed GetSeriesDtoForCollectionAsync which was not properly returning series

* Implemented cover image for collection tag. Fixed an issue in metadata update call.

* Add check for user access when resolving series for a collection tag. When asking for all tags, if the user is not an admin, only give promoted tags back.

* Implemented updateTag api

* Implemented the ability to update series the tags have access to.

* Cleanup, sorting, and null check

* More sorting changes

* Ensure we can delete tags when editing a series tags

* Fix order of update to make sure a tag is properly deleted

* Code smells
2021-05-30 17:24:23 -05:00

538 lines
22 KiB
C#

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using API.Comparators;
using API.Data;
using API.Entities;
using API.Entities.Enums;
using API.Extensions;
using API.Interfaces;
using API.Interfaces.Services;
using API.Parser;
using Hangfire;
using Microsoft.Extensions.Logging;
namespace API.Services.Tasks
{
public class ScannerService : IScannerService
{
private readonly IUnitOfWork _unitOfWork;
private readonly ILogger<ScannerService> _logger;
private readonly IArchiveService _archiveService;
private readonly IMetadataService _metadataService;
private readonly IBookService _bookService;
private ConcurrentDictionary<string, List<ParserInfo>> _scannedSeries;
private readonly NaturalSortComparer _naturalSort;
/// <summary>
/// Creates the scanner and stores the services it relies on for data access, logging, archive handling,
/// metadata refresh and book handling.
/// </summary>
/// <param name="unitOfWork">Access to the repositories used while scanning</param>
/// <param name="logger">Logger for scan progress and failures</param>
/// <param name="archiveService">Used to count pages of archive files</param>
/// <param name="metadataService">Used to queue a metadata refresh after a scan</param>
/// <param name="bookService">Used to parse epub files and count their pages</param>
public ScannerService(IUnitOfWork unitOfWork, ILogger<ScannerService> logger, IArchiveService archiveService,
    IMetadataService metadataService, IBookService bookService)
{
    // Comparer used when ordering a chapter's files by path.
    _naturalSort = new NaturalSortComparer();

    _bookService = bookService;
    _metadataService = metadataService;
    _archiveService = archiveService;
    _logger = logger;
    _unitOfWork = unitOfWork;
}
/// <summary>
/// Scans every library returned by the library repository, one after another, without forcing an update.
/// </summary>
[DisableConcurrentExecution(timeoutInSeconds: 360)]
[AutomaticRetry(Attempts = 0, OnAttemptsExceeded = AttemptsExceededAction.Delete)]
public void ScanLibraries()
{
    // The repository call is async; this method is synchronous, so block on the result.
    var allLibraries = Task.Run(() => _unitOfWork.LibraryRepository.GetLibrariesAsync()).Result.ToList();
    allLibraries.ForEach(library => ScanLibrary(library.Id, false));
}
/// <summary>
/// Decides whether a folder can be skipped during a scan. Skipping is currently disabled, so every folder
/// is scanned.
/// </summary>
/// <param name="folder">Folder under consideration; not inspected at the moment</param>
/// <param name="skippedFolders">Running count of skipped folders; left untouched at the moment</param>
/// <returns>Always false</returns>
// NOTE: The only way to skip folders is if Directory hasn't been modified, we aren't doing a forcedUpdate and
// version hasn't changed between scans. The disabled check skipped a folder (logging it and incrementing
// skippedFolders) when no forced update was requested and Directory.GetLastWriteTime(folder.Path) was earlier
// than folder.LastScanned.
private bool ShouldSkipFolderScan(FolderPath folder, ref int skippedFolders) => false;
/// <summary>
/// Scans a single library: collects parsed infos from the library's folders, updates the library's
/// series/volumes/chapters, saves the changes, removes abandoned user progress and queues a metadata refresh.
/// </summary>
/// <param name="libraryId">Id of the library to scan</param>
/// <param name="forceUpdate">Logged when the scan starts and forwarded to the queued metadata refresh</param>
[DisableConcurrentExecution(360)]
[AutomaticRetry(Attempts = 0, OnAttemptsExceeded = AttemptsExceededAction.Delete)]
public void ScanLibrary(int libraryId, bool forceUpdate)
{
    var sw = Stopwatch.StartNew();
    // Start every scan with a fresh collection of parsed infos.
    _scannedSeries = new ConcurrentDictionary<string, List<ParserInfo>>();
    Library library;
    try
    {
        library = Task.Run(() => _unitOfWork.LibraryRepository.GetFullLibraryForIdAsync(libraryId)).GetAwaiter().GetResult();
    }
    catch (Exception ex)
    {
        // This usually only fails if user is not authenticated.
        _logger.LogError(ex, "There was an issue fetching Library {LibraryId}", libraryId);
        return;
    }

    // Guard against a missing library so the scan does not die on a null dereference below.
    if (library == null)
    {
        _logger.LogError("Library {LibraryId} could not be found, skipping scan", libraryId);
        return;
    }

    var series = ScanLibrariesForSeries(forceUpdate, library, sw, out var totalFiles, out var scanElapsedTime);
    UpdateLibrary(library, series);
    _unitOfWork.LibraryRepository.Update(library);
    if (Task.Run(() => _unitOfWork.Complete()).Result)
    {
        // The stopwatch is restarted once the folder scan finishes, so both parts are added for the total.
        _logger.LogInformation("Processed {TotalFiles} files and {ParsedSeriesCount} series in {ElapsedScanTime} milliseconds for {LibraryName}", totalFiles, series.Keys.Count, sw.ElapsedMilliseconds + scanElapsedTime, library.Name);
    }
    else
    {
        _logger.LogCritical("There was a critical error that resulted in a failed scan. Please check logs and rescan");
    }
    CleanupUserProgress();
    BackgroundJob.Enqueue(() => _metadataService.RefreshMetadata(libraryId, forceUpdate));
}
/// <summary>
/// Deletes user progress rows that point at chapters which no longer exist after a library scan removed
/// series/volumes/chapters, and logs how many rows were removed.
/// </summary>
private void CleanupUserProgress()
{
    var cleanupTask = Task.Run(() => _unitOfWork.AppUserProgressRepository.CleanupAbandonedChapters());
    var removedRows = cleanupTask.Result;
    _logger.LogInformation("Removed {Count} abandoned progress rows", removedRows);
}
/// <summary>
/// Walks every folder of the library, parses each matching file into the tracked series collection and
/// returns the series that ended up with at least one parsed info.
/// </summary>
/// <param name="forceUpdate">Only used for logging here</param>
/// <param name="library">Library whose folders are scanned; each scanned folder gets its LastScanned stamped</param>
/// <param name="sw">Stopwatch started by the caller; it is restarted once the folder scan has been timed</param>
/// <param name="totalFiles">Number of files reported by the directory traversal across all folders</param>
/// <param name="scanElapsedTime">Milliseconds on the stopwatch when the folder scan finished</param>
/// <returns>Parsed infos grouped by series key, or an empty dictionary when every folder was skipped</returns>
private Dictionary<string, List<ParserInfo>> ScanLibrariesForSeries(bool forceUpdate, Library library, Stopwatch sw, out int totalFiles,
    out long scanElapsedTime)
{
    _logger.LogInformation("Beginning scan on {LibraryName}. Forcing metadata update: {ForceUpdate}", library.Name,
        forceUpdate);

    var fileCount = 0;
    var skipped = 0;
    foreach (var folder in library.Folders)
    {
        if (ShouldSkipFolderScan(folder, ref skipped))
        {
            continue;
        }

        // NOTE: we can refactor this to allow all filetypes and handle everything in the ProcessFile to allow mixed library types.
        var extensions = library.Type == LibraryType.Book
            ? Parser.Parser.BookFileExtensions
            : Parser.Parser.ArchiveFileExtensions;

        try
        {
            fileCount += DirectoryService.TraverseTreeParallelForEach(folder.Path, (file) =>
            {
                try
                {
                    ProcessFile(file, folder.Path, library.Type);
                }
                catch (FileNotFoundException missingFile)
                {
                    _logger.LogError(missingFile, "The file {Filename} could not be found", file);
                }
            }, extensions, _logger);
        }
        catch (ArgumentException badDirectory)
        {
            _logger.LogError(badDirectory, "The directory '{FolderPath}' does not exist", folder.Path);
        }

        folder.LastScanned = DateTime.Now;
    }

    totalFiles = fileCount;
    scanElapsedTime = sw.ElapsedMilliseconds;
    _logger.LogInformation("Folders Scanned {TotalFiles} files in {ElapsedScanTime} milliseconds", totalFiles,
        scanElapsedTime);
    // From here on the stopwatch times the database update phase.
    sw.Restart();

    if (skipped != library.Folders.Count)
    {
        return SeriesWithInfos(_scannedSeries);
    }

    _logger.LogInformation("All Folders were skipped due to no modifications to the directories");
    _unitOfWork.LibraryRepository.Update(library);
    _scannedSeries = null;
    _logger.LogInformation("Processed {TotalFiles} files in {ElapsedScanTime} milliseconds for {LibraryName}",
        totalFiles, sw.ElapsedMilliseconds, library.Name);
    return new Dictionary<string, List<ParserInfo>>();
}
/// <summary>
/// Filters the scanned series down to the entries that have at least one parsed info.
/// </summary>
/// <param name="scannedSeries">Parsed infos grouped by series key</param>
/// <returns>A new dictionary containing only the entries whose info list is not empty</returns>
private static Dictionary<string, List<ParserInfo>> SeriesWithInfos(IDictionary<string, List<ParserInfo>> scannedSeries)
{
    var populated = from entry in scannedSeries
                    where entry.Value.Count > 0
                    select entry;
    return populated.ToDictionary(entry => entry.Key, entry => entry.Value);
}
/// <summary>
/// Brings the library's series in line with what was parsed from disk: removes series that were not found,
/// adds series that are new, then recalculates volumes and page counts for every remaining series.
/// </summary>
/// <param name="library">Library whose Series collection is updated</param>
/// <param name="parsedSeries">Parsed infos grouped by normalized series key</param>
/// <exception cref="ArgumentNullException">When <paramref name="parsedSeries"/> is null</exception>
private void UpdateLibrary(Library library, Dictionary<string, List<ParserInfo>> parsedSeries)
{
    if (parsedSeries == null) throw new ArgumentNullException(nameof(parsedSeries));

    // First, remove any series that are not in parsedSeries list
    var missingSeries = FindSeriesNotOnDisk(library.Series, parsedSeries).ToList();
    library.Series = RemoveMissingSeries(library.Series, missingSeries, out var removeCount);
    if (removeCount > 0)
    {
        _logger.LogInformation("Removed {RemoveMissingSeries} series that are no longer on disk:", removeCount);
        foreach (var s in missingSeries)
        {
            _logger.LogDebug("Removed {SeriesName}", s.Name);
        }
    }

    // Add new series that have parsedInfos
    foreach (var (key, infos) in parsedSeries)
    {
        // Key is normalized already
        Series existingSeries;
        try
        {
            existingSeries = library.Series.SingleOrDefault(s => s.NormalizedName == key || Parser.Parser.Normalize(s.OriginalName) == key);
        }
        catch (Exception e)
        {
            _logger.LogCritical(e, "There are multiple series that map to normalized key {Key}. You can manually delete the entity via UI and rescan to fix it", key);
            var duplicateSeries = library.Series.Where(s => s.NormalizedName == key || Parser.Parser.Normalize(s.OriginalName) == key).ToList();
            foreach (var series in duplicateSeries)
            {
                _logger.LogCritical("{Key} maps with {Series}", key, series.OriginalName);
            }
            continue;
        }

        if (existingSeries == null)
        {
            existingSeries = DbFactory.Series(infos[0].Series);
            library.Series.Add(existingSeries);
        }

        existingSeries.NormalizedName = Parser.Parser.Normalize(existingSeries.Name);
        existingSeries.OriginalName ??= infos[0].Series;
    }

    // Now, we only have to deal with series that exist on disk. Let's recalculate the volumes for each series
    var librarySeries = library.Series.ToList();
    Parallel.ForEach(librarySeries, (series) =>
    {
        try
        {
            _logger.LogInformation("Processing series {SeriesName}", series.OriginalName);

            // Mirror the matching rule used when pairing keys with series above: a series may have been
            // matched through its OriginalName or through its NormalizedName, so try both before giving up.
            var originalKey = Parser.Parser.Normalize(series.OriginalName);
            if (!parsedSeries.TryGetValue(originalKey, out var seriesInfos)
                && (series.NormalizedName == null || !parsedSeries.TryGetValue(series.NormalizedName, out seriesInfos)))
            {
                _logger.LogError("No parsed infos were found for {SeriesName}, skipping volume update", series.Name);
                return;
            }

            UpdateVolumes(series, seriesInfos.ToArray());
            series.Pages = series.Volumes.Sum(v => v.Pages);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "There was an exception updating volumes for {SeriesName}", series.Name);
        }
    });
}
/// <summary>
/// Finds the existing series whose name is not among the parsed series keys.
/// </summary>
/// <param name="existingSeries">Series currently attached to the library</param>
/// <param name="parsedSeries">Parsed infos grouped by series key</param>
/// <returns>A lazily evaluated sequence of the existing series that were not matched by any parsed key</returns>
public IEnumerable<Series> FindSeriesNotOnDisk(ICollection<Series> existingSeries, Dictionary<string, List<ParserInfo>> parsedSeries)
{
    // Keys are captured eagerly; the filter itself runs when the result is enumerated.
    var namesOnDisk = (from pair in parsedSeries select pair.Key).ToList();
    return from candidate in existingSeries
           where !candidate.NameInList(namesOnDisk)
           select candidate;
}
/// <summary>
/// Builds a new collection holding every series from existingSeries whose NormalizedName does not match the
/// NormalizedName of any series in missingSeries. The collection passed in is not modified; callers must use
/// the returned collection.
/// </summary>
/// <param name="existingSeries">Existing Series in DB</param>
/// <param name="missingSeries">Series not found on disk or can't be parsed</param>
/// <param name="removeCount">Number of series that were left out of the returned collection</param>
/// <returns>A new list with the remaining series</returns>
public static ICollection<Series> RemoveMissingSeries(ICollection<Series> existingSeries, IEnumerable<Series> missingSeries, out int removeCount)
{
    // A set gives constant-time membership checks and tolerates a null NormalizedName on either side.
    var missingNames = new HashSet<string>(missingSeries.Select(m => m.NormalizedName));
    var remaining = existingSeries.Where(s => !missingNames.Contains(s.NormalizedName)).ToList();
    removeCount = existingSeries.Count - remaining.Count;
    return remaining;
}
/// <summary>
/// Synchronizes a series' volumes with the parsed infos: creates volumes that are missing, refreshes the
/// chapters and page count of every parsed volume, and drops volumes that no parsed info refers to.
/// </summary>
/// <param name="series">Series whose Volumes collection is updated</param>
/// <param name="parsedInfos">Parsed infos belonging to the series</param>
private void UpdateVolumes(Series series, ParserInfo[] parsedInfos)
{
    var volumeCountBefore = series.Volumes.Count;

    // Add new volumes and update chapters per volume
    var volumeNumbers = parsedInfos.DistinctVolumes();
    _logger.LogDebug("Updating {DistinctVolumes} volumes on {SeriesName}", volumeNumbers.Count, series.Name);
    foreach (var number in volumeNumbers)
    {
        var current = series.Volumes.SingleOrDefault(candidate => candidate.Name == number);
        if (current == null)
        {
            current = DbFactory.Volume(number);
            series.Volumes.Add(current);
        }

        // NOTE: Instead of creating and adding? Why Not Merge a new volume into an existing, so no matter what, new properties,etc get propagated?
        _logger.LogDebug("Parsing {SeriesName} - Volume {VolumeNumber}", series.Name, current.Name);
        var volumeInfos = parsedInfos.Where(parsed => parsed.Volumes == number).ToArray();
        UpdateChapters(current, volumeInfos);
        current.Pages = current.Chapters.Sum(chapter => chapter.Pages);
    }

    // Remove existing volumes that aren't in parsedInfos
    var keptVolumes = series.Volumes
        .Where(existing => parsedInfos.Any(parsed => parsed.Volumes == existing.Name))
        .ToList();
    var droppedCount = series.Volumes.Count - keptVolumes.Count;
    if (droppedCount != 0)
    {
        _logger.LogDebug("Removed {Count} volumes from {SeriesName} where parsed infos were not mapping with volume name",
            droppedCount, series.Name);
        foreach (var dropped in series.Volumes.Except(keptVolumes))
        {
            // Use the first file of the first chapter as a representative path for logging.
            var firstChapter = dropped.Chapters.FirstOrDefault();
            var samplePath = firstChapter?.Files.FirstOrDefault()?.FilePath ?? "no files";
            if (new FileInfo(samplePath).Exists)
            {
                _logger.LogError("Volume cleanup code was trying to remove a volume with a file still existing on disk. File: {File}", samplePath);
            }

            _logger.LogDebug("Removed {SeriesName} - Volume {Volume}: {File}", series.Name, dropped.Name, samplePath);
        }

        series.Volumes = keptVolumes;
    }

    _logger.LogDebug("Updated {SeriesName} volumes from {StartingVolumeCount} to {VolumeCount}",
        series.Name, volumeCountBefore, series.Volumes.Count);
}
/// <summary>
/// Synchronizes a volume's chapters with the parsed infos in three passes: adds or updates a chapter per
/// info, attaches/updates the file for each info, then removes chapters that have no files or no matching
/// info and prunes/orders the files of the chapters that remain.
/// </summary>
/// <param name="volume">Volume whose Chapters collection is updated</param>
/// <param name="parsedInfos">Parsed infos that belong to this volume</param>
private void UpdateChapters(Volume volume, ParserInfo[] parsedInfos)
{
    // Add new chapters
    foreach (var info in parsedInfos)
    {
        // Specials go into their own chapters with Range being their filename and IsSpecial = True. Non-Specials with Vol and Chap as 0
        // also are treated like specials for UI grouping.
        Chapter chapter;
        try
        {
            chapter = volume.Chapters.GetChapterByRange(info);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "{FileName} mapped as '{Series} - Vol {Volume} Ch {Chapter}' is a duplicate, skipping", info.FullFilePath, info.Series, info.Volumes, info.Chapters);
            continue;
        }

        if (chapter == null)
        {
            _logger.LogDebug(
                "Adding new chapter, {Series} - Vol {Volume} Ch {Chapter}", info.Series, info.Volumes, info.Chapters);
            volume.Chapters.Add(DbFactory.Chapter(info));
        }
        else
        {
            chapter.UpdateFrom(info);
        }
    }

    // Add files
    foreach (var info in parsedInfos)
    {
        var specialTreatment = info.IsSpecialInfo();
        Chapter chapter;
        try
        {
            chapter = volume.Chapters.GetChapterByRange(info);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "There was an exception parsing chapter. Skipping {SeriesName} Vol {VolumeNumber} Chapter {ChapterNumber} - Special treatment: {NeedsSpecialTreatment}", info.Series, volume.Name, info.Chapters, specialTreatment);
            continue;
        }

        if (chapter == null) continue;
        AddOrUpdateFileForChapter(chapter, info);
        chapter.Number = Parser.Parser.MinimumNumberFromRange(info.Chapters) + string.Empty;
        chapter.Range = specialTreatment ? info.Filename : info.Chapters;
    }

    // Remove chapters that aren't in parsedInfos or have no files linked
    // The series name is only needed for logging; read it safely so an empty info array cannot throw.
    var seriesName = parsedInfos.FirstOrDefault()?.Series;
    var existingChapters = volume.Chapters.ToList();
    foreach (var existingChapter in existingChapters)
    {
        if (existingChapter.Files.Count == 0 || !parsedInfos.HasInfo(existingChapter))
        {
            _logger.LogDebug("Removed chapter {Chapter} for Volume {VolumeNumber} on {SeriesName}", existingChapter.Range, volume.Name, seriesName);
            volume.Chapters.Remove(existingChapter);
        }
        else
        {
            // Ensure we remove any files that no longer exist AND order
            existingChapter.Files = existingChapter.Files
                .Where(f => parsedInfos.Any(p => p.FullFilePath == f.FilePath))
                .OrderBy(f => f.FilePath, _naturalSort).ToList();
            existingChapter.Pages = existingChapter.Files.Sum(f => f.Pages);
        }
    }
}
/// <summary>
/// Attempts to either add a new instance of a show mapping to the _scannedSeries bag or adds to an existing.
/// Infos without a series name are ignored.
/// </summary>
/// <param name="info">Parsed info to track; its Series is replaced by the merged name before it is stored</param>
private void TrackSeries(ParserInfo info)
{
    if (string.IsNullOrEmpty(info.Series)) return;

    // Check if normalized info.Series already exists and if so, update info to use that name instead
    info.Series = MergeName(_scannedSeries, info);
    _scannedSeries.AddOrUpdate(Parser.Parser.Normalize(info.Series), new List<ParserInfo>() {info}, (_, oldValue) =>
    {
        oldValue ??= new List<ParserInfo>();
        // ConcurrentDictionary runs this delegate outside of its internal locks and files are processed in
        // parallel, so the shared list must be guarded while it is inspected and mutated.
        lock (oldValue)
        {
            if (!oldValue.Contains(info))
            {
                oldValue.Add(info);
            }
        }

        return oldValue;
    });
}
/// <summary>
/// Looks for an already collected series whose normalized key equals the normalized series name of the
/// given info, so that both end up tracked under one name.
/// </summary>
/// <param name="collectedSeries">Series collected so far, keyed by series name</param>
/// <param name="info">Parsed info whose series name is checked</param>
/// <returns>The key of the matching collected series when one exists, otherwise info.Series unchanged</returns>
public string MergeName(ConcurrentDictionary<string,List<ParserInfo>> collectedSeries, ParserInfo info)
{
    var normalizedSeries = Parser.Parser.Normalize(info.Series);
    _logger.LogDebug("Checking if we can merge {NormalizedSeries}", normalizedSeries);

    // When nothing matches, SingleOrDefault yields a default pair whose Key is null.
    var match = collectedSeries.SingleOrDefault(entry => Parser.Parser.Normalize(entry.Key) == normalizedSeries);
    var collectedName = match.Key;

    // BUG: We are comparing info.Series against a normalized string. They should never match. (This can cause series to not delete or parse correctly after a rename)
    // A stricter check (&& info.Series != existingName) was considered by the original author and left disabled.
    if (string.IsNullOrEmpty(collectedName))
    {
        return info.Series;
    }

    _logger.LogDebug("Found duplicate parsed infos, merged {Original} into {Merged}", info.Series, collectedName);
    return collectedName;
}
/// <summary>
/// Parses a single file found during a library scan and hands the result to series tracking.
/// Populates a collection of <see cref="ParserInfo"/> for DB updates later.
/// </summary>
/// <param name="path">Path of a file</param>
/// <param name="rootPath">Root folder path passed on to the filename parser</param>
/// <param name="type">Library type to determine parsing to perform</param>
private void ProcessFile(string path, string rootPath, LibraryType type)
{
    var isBookLibrary = type == LibraryType.Book;

    // Epubs in book libraries are parsed by the book service; everything else by the filename parser.
    ParserInfo info = isBookLibrary && Parser.Parser.IsEpub(path)
        ? _bookService.ParseInfo(path)
        : Parser.Parser.Parse(path, rootPath, type);

    if (info == null)
    {
        _logger.LogWarning("[Scanner] Could not parse series from {Path}", path);
        return;
    }

    // When the series name of an epub parses to a volume other than "0", re-parse from the filename and
    // merge the book service's info into that result.
    if (isBookLibrary && Parser.Parser.IsEpub(path) && Parser.Parser.ParseVolume(info.Series) != "0")
    {
        info = Parser.Parser.Parse(path, rootPath, type);
        var epubInfo = _bookService.ParseInfo(path);
        info.Merge(epubInfo);
    }

    TrackSeries(info);
}
/// <summary>
/// Builds a <see cref="MangaFile"/> for the parsed info, counting pages with the service that matches the
/// info's format.
/// </summary>
/// <param name="info">Parsed info describing the file</param>
/// <returns>The new file entry, or null when the format is neither Archive nor Book</returns>
private MangaFile CreateMangaFile(ParserInfo info)
{
    var format = info.Format;

    if (format == MangaFormat.Archive)
    {
        return new MangaFile()
        {
            FilePath = info.FullFilePath,
            Format = info.Format,
            Pages = _archiveService.GetNumberOfPagesFromArchive(info.FullFilePath)
        };
    }

    if (format == MangaFormat.Book)
    {
        return new MangaFile()
        {
            FilePath = info.FullFilePath,
            Format = info.Format,
            Pages = _bookService.GetNumberOfPages(info.FullFilePath)
        };
    }

    _logger.LogWarning("[Scanner] Ignoring {Filename}. Non-archives are not supported", info.Filename);
    return null;
}
/// <summary>
/// Makes sure the chapter has a file entry for the parsed info: updates the entry with the same path when
/// one exists, otherwise creates and adds one. The entry's LastModified is then set to the file's last
/// write time on disk.
/// </summary>
/// <param name="chapter">Chapter whose Files collection is updated (created when null)</param>
/// <param name="info">Parsed info describing the file</param>
private void AddOrUpdateFileForChapter(Chapter chapter, ParserInfo info)
{
    chapter.Files ??= new List<MangaFile>();

    var trackedFile = chapter.Files.SingleOrDefault(candidate => candidate.FilePath == info.FullFilePath);
    if (trackedFile == null)
    {
        var created = CreateMangaFile(info);
        if (created == null)
        {
            // Unsupported format: nothing was added, so there is nothing to stamp.
            return;
        }

        chapter.Files.Add(created);
        trackedFile = chapter.Files.Last();
    }
    else
    {
        trackedFile.Format = info.Format;
        // NOTE(review): pages are recounted only when HasFileBeenModified() is false and Pages > 0, which
        // reads as inverted given the method name — confirm against MangaFile.HasFileBeenModified.
        if (!trackedFile.HasFileBeenModified() && trackedFile.Pages > 0)
        {
            if (trackedFile.Format == MangaFormat.Book)
            {
                trackedFile.Pages = _bookService.GetNumberOfPages(info.FullFilePath);
            }
            else
            {
                trackedFile.Pages = _archiveService.GetNumberOfPagesFromArchive(info.FullFilePath);
            }
        }
    }

    trackedFile.LastModified = new FileInfo(trackedFile.FilePath).LastWriteTime;
}
}
}