Kavita/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs
Joe Milazzo 549e52b458
Swipe Issues (#1745)
* Updated theme support to be able to customize the tile color dynamically from a theme via --tile-color. In addition, --theme-color will update apple-mobile-web-app-status-bar-style as well as the non-apple variants

* Removed --manga-reader-bg-color as it wasn't used anywhere. Fixed double pagination on swipe.

* Cleaned up some dead threshold code for swipe.

* Started refactoring tests to use an abstract test class. Stopping because I should do on the .net 7 branch to avoid large merge conflicts. Tests need to be re-designed so they can run in parallel.

* Fixed a bug in reading lists where when deleting an item, order could be miscalculated.

* Started adding new information for stat service. Refactored time spent reading to be more accurate by taking average time against how much of the chapter the user has read.

* Hooked up total time reading at server stat level. Don't show fancy graphs on mobile.

* Added new stats for v0.7

* Added a test for Clearing want to read

* Fixed a few tests that weren't resetting state between runs

* Fixed some broken unit tests

* Ensure all Series queries sort by a case invariant string.

* Added more aggressive caching of images. This will result in a min delay on pages after a cover is changed.

* Fixed a bug where if during new word count calculation, new word count is zero, restoring the old count wasn't working.

* Cleaned up some of the code for getting time estimates

* Fixed a bug where triggering swipe right wasn't working when there was no scroll

* Delete the temp folder for creating a download after a full zip is created.
2023-01-12 17:24:58 -08:00

254 lines
11 KiB
C#

using System;
using System.Diagnostics;
using System.Linq;
using System.Threading.Tasks;
using API.Data;
using API.Data.Repositories;
using API.Entities;
using API.Entities.Enums;
using API.Helpers;
using API.SignalR;
using Hangfire;
using HtmlAgilityPack;
using Microsoft.Extensions.Logging;
using VersOne.Epub;
namespace API.Services.Tasks.Metadata;
public interface IWordCountAnalyzerService
{
[DisableConcurrentExecution(timeoutInSeconds: 60 * 60 * 60)]
[AutomaticRetry(Attempts = 2, OnAttemptsExceeded = AttemptsExceededAction.Delete)]
Task ScanLibrary(int libraryId, bool forceUpdate = false);
Task ScanSeries(int libraryId, int seriesId, bool forceUpdate = true);
}
/// <summary>
/// This service is a metadata task that generates information around time to read
/// </summary>
public class WordCountAnalyzerService : IWordCountAnalyzerService
{
private readonly ILogger<WordCountAnalyzerService> _logger;
private readonly IUnitOfWork _unitOfWork;
private readonly IEventHub _eventHub;
private readonly ICacheHelper _cacheHelper;
private readonly IReaderService _readerService;
public WordCountAnalyzerService(ILogger<WordCountAnalyzerService> logger, IUnitOfWork unitOfWork, IEventHub eventHub,
ICacheHelper cacheHelper, IReaderService readerService)
{
_logger = logger;
_unitOfWork = unitOfWork;
_eventHub = eventHub;
_cacheHelper = cacheHelper;
_readerService = readerService;
}
[DisableConcurrentExecution(timeoutInSeconds: 60 * 60 * 60)]
[AutomaticRetry(Attempts = 2, OnAttemptsExceeded = AttemptsExceededAction.Delete)]
public async Task ScanLibrary(int libraryId, bool forceUpdate = false)
{
var sw = Stopwatch.StartNew();
var library = await _unitOfWork.LibraryRepository.GetLibraryForIdAsync(libraryId, LibraryIncludes.None);
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(libraryId, 0F, ProgressEventType.Started, string.Empty));
var chunkInfo = await _unitOfWork.SeriesRepository.GetChunkInfo(library.Id);
var stopwatch = Stopwatch.StartNew();
_logger.LogInformation("[MetadataService] Refreshing Library {LibraryName}. Total Items: {TotalSize}. Total Chunks: {TotalChunks} with {ChunkSize} size", library.Name, chunkInfo.TotalSize, chunkInfo.TotalChunks, chunkInfo.ChunkSize);
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(library.Id, 0F, ProgressEventType.Started, $"Starting {library.Name}"));
for (var chunk = 1; chunk <= chunkInfo.TotalChunks; chunk++)
{
if (chunkInfo.TotalChunks == 0) continue;
stopwatch.Restart();
_logger.LogInformation("[MetadataService] Processing chunk {ChunkNumber} / {TotalChunks} with size {ChunkSize}. Series ({SeriesStart} - {SeriesEnd}",
chunk, chunkInfo.TotalChunks, chunkInfo.ChunkSize, chunk * chunkInfo.ChunkSize, (chunk + 1) * chunkInfo.ChunkSize);
var nonLibrarySeries = await _unitOfWork.SeriesRepository.GetFullSeriesForLibraryIdAsync(library.Id,
new UserParams()
{
PageNumber = chunk,
PageSize = chunkInfo.ChunkSize
});
_logger.LogDebug("[MetadataService] Fetched {SeriesCount} series for refresh", nonLibrarySeries.Count);
var seriesIndex = 0;
foreach (var series in nonLibrarySeries)
{
var index = chunk * seriesIndex;
var progress = Math.Max(0F, Math.Min(1F, index * 1F / chunkInfo.TotalSize));
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(library.Id, progress, ProgressEventType.Updated, series.Name));
try
{
await ProcessSeries(series, forceUpdate, false);
}
catch (Exception ex)
{
_logger.LogError(ex, "[MetadataService] There was an exception during metadata refresh for {SeriesName}", series.Name);
}
seriesIndex++;
}
if (_unitOfWork.HasChanges())
{
await _unitOfWork.CommitAsync();
}
_logger.LogInformation(
"[MetadataService] Processed {SeriesStart} - {SeriesEnd} out of {TotalSeries} series in {ElapsedScanTime} milliseconds for {LibraryName}",
chunk * chunkInfo.ChunkSize, (chunk * chunkInfo.ChunkSize) + nonLibrarySeries.Count, chunkInfo.TotalSize, stopwatch.ElapsedMilliseconds, library.Name);
}
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(library.Id, 1F, ProgressEventType.Ended, $"Complete"));
_logger.LogInformation("[WordCountAnalyzerService] Updated metadata for {LibraryName} in {ElapsedMilliseconds} milliseconds", library.Name, sw.ElapsedMilliseconds);
}
public async Task ScanSeries(int libraryId, int seriesId, bool forceUpdate = true)
{
var sw = Stopwatch.StartNew();
var series = await _unitOfWork.SeriesRepository.GetFullSeriesForSeriesIdAsync(seriesId);
if (series == null)
{
_logger.LogError("[WordCountAnalyzerService] Series {SeriesId} was not found on Library {LibraryId}", seriesId, libraryId);
return;
}
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(libraryId, 0F, ProgressEventType.Started, series.Name));
await ProcessSeries(series, forceUpdate);
if (_unitOfWork.HasChanges())
{
await _unitOfWork.CommitAsync();
}
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(libraryId, 1F, ProgressEventType.Ended, series.Name));
_logger.LogInformation("[WordCountAnalyzerService] Updated metadata for {SeriesName} in {ElapsedMilliseconds} milliseconds", series.Name, sw.ElapsedMilliseconds);
}
public async Task ProcessSeries(Series series, bool forceUpdate = false, bool useFileName = true)
{
var isEpub = series.Format == MangaFormat.Epub;
var existingWordCount = series.WordCount;
series.WordCount = 0;
foreach (var volume in series.Volumes)
{
volume.WordCount = 0;
foreach (var chapter in volume.Chapters)
{
// This compares if it's changed since a file scan only
var firstFile = chapter.Files.FirstOrDefault();
if (firstFile == null) return;
if (!_cacheHelper.HasFileChangedSinceLastScan(firstFile.LastFileAnalysis, forceUpdate,
firstFile))
continue;
if (series.Format == MangaFormat.Epub)
{
long sum = 0;
var fileCounter = 1;
foreach (var file in chapter.Files)
{
var filePath = file.FilePath;
var pageCounter = 1;
try
{
using var book = await EpubReader.OpenBookAsync(filePath, BookService.BookReaderOptions);
var totalPages = book.Content.Html.Values;
foreach (var bookPage in totalPages)
{
var progress = Math.Max(0F,
Math.Min(1F, (fileCounter * pageCounter) * 1F / (chapter.Files.Count * totalPages.Count)));
await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
MessageFactory.WordCountAnalyzerProgressEvent(series.LibraryId, progress,
ProgressEventType.Updated, useFileName ? filePath : series.Name));
sum += await GetWordCountFromHtml(bookPage);
pageCounter++;
}
fileCounter++;
}
catch (Exception ex)
{
_logger.LogError(ex, "There was an error reading an epub file for word count, series skipped");
await _eventHub.SendMessageAsync(MessageFactory.Error,
MessageFactory.ErrorEvent("There was an issue counting words on an epub",
$"{series.Name} - {file}"));
return;
}
UpdateFileAnalysis(file);
}
chapter.WordCount = sum;
series.WordCount += sum;
volume.WordCount += sum;
}
var est = _readerService.GetTimeEstimate(chapter.WordCount, chapter.Pages, isEpub);
chapter.MinHoursToRead = est.MinHours;
chapter.MaxHoursToRead = est.MaxHours;
chapter.AvgHoursToRead = est.AvgHours;
foreach (var file in chapter.Files)
{
UpdateFileAnalysis(file);
}
_unitOfWork.ChapterRepository.Update(chapter);
}
var volumeEst = _readerService.GetTimeEstimate(volume.WordCount, volume.Pages, isEpub);
volume.MinHoursToRead = volumeEst.MinHours;
volume.MaxHoursToRead = volumeEst.MaxHours;
volume.AvgHoursToRead = volumeEst.AvgHours;
_unitOfWork.VolumeRepository.Update(volume);
}
if (series.WordCount == 0 && existingWordCount != 0) series.WordCount = existingWordCount; // Restore original word count if the file hasn't changed
var seriesEstimate = _readerService.GetTimeEstimate(series.WordCount, series.Pages, isEpub);
series.MinHoursToRead = seriesEstimate.MinHours;
series.MaxHoursToRead = seriesEstimate.MaxHours;
series.AvgHoursToRead = seriesEstimate.AvgHours;
_unitOfWork.SeriesRepository.Update(series);
}
private void UpdateFileAnalysis(MangaFile file)
{
file.LastFileAnalysis = DateTime.Now;
_unitOfWork.MangaFileRepository.Update(file);
}
private static async Task<int> GetWordCountFromHtml(EpubContentFileRef bookFile)
{
var doc = new HtmlDocument();
doc.LoadHtml(await bookFile.ReadContentAsTextAsync());
var textNodes = doc.DocumentNode.SelectNodes("//body//text()[not(parent::script)]");
if (textNodes == null) return 0;
return textNodes
.Select(node => node.InnerText.Split(' ', StringSplitOptions.RemoveEmptyEntries)
.Where(s => char.IsLetter(s[0])))
.Sum(words => words.Count());
}
}