using System;
using System.Diagnostics;
using System.Linq;
using System.Threading.Tasks;
using API.Data;
using API.Data.Repositories;
using API.Entities;
using API.Entities.Enums;
using API.Helpers;
using API.SignalR;
using Hangfire;
using HtmlAgilityPack;
using Microsoft.Extensions.Logging;
using VersOne.Epub;

namespace API.Services.Tasks.Metadata;

public interface IWordCountAnalyzerService
{
    [DisableConcurrentExecution(timeoutInSeconds: 60 * 60 * 60)]
    [AutomaticRetry(Attempts = 2, OnAttemptsExceeded = AttemptsExceededAction.Delete)]
    Task ScanLibrary(int libraryId, bool forceUpdate = false);
    Task ScanSeries(int libraryId, int seriesId, bool forceUpdate = true);
}

/// <summary>
/// This service is a metadata task that generates information around time to read
/// </summary>
public class WordCountAnalyzerService : IWordCountAnalyzerService
{
    private readonly ILogger<WordCountAnalyzerService> _logger;
    private readonly IUnitOfWork _unitOfWork;
    private readonly IEventHub _eventHub;
    private readonly ICacheHelper _cacheHelper;
    private readonly IReaderService _readerService;

    public WordCountAnalyzerService(ILogger<WordCountAnalyzerService> logger, IUnitOfWork unitOfWork, IEventHub eventHub,
        ICacheHelper cacheHelper, IReaderService readerService)
    {
        _logger = logger;
        _unitOfWork = unitOfWork;
        _eventHub = eventHub;
        _cacheHelper = cacheHelper;
        _readerService = readerService;
    }

    [DisableConcurrentExecution(timeoutInSeconds: 60 * 60 * 60)]
    [AutomaticRetry(Attempts = 2, OnAttemptsExceeded = AttemptsExceededAction.Delete)]
    public async Task ScanLibrary(int libraryId, bool forceUpdate = false)
    {
        var sw = Stopwatch.StartNew();
        var library = await _unitOfWork.LibraryRepository.GetLibraryForIdAsync(libraryId, LibraryIncludes.None);

        await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
            MessageFactory.WordCountAnalyzerProgressEvent(libraryId, 0F, ProgressEventType.Started, string.Empty));

        var chunkInfo = await _unitOfWork.SeriesRepository.GetChunkInfo(library.Id);
        var stopwatch = Stopwatch.StartNew();
        _logger.LogInformation("[MetadataService] Refreshing Library {LibraryName}. Total Items: {TotalSize}. Total Chunks: {TotalChunks} with {ChunkSize} size",
            library.Name, chunkInfo.TotalSize, chunkInfo.TotalChunks, chunkInfo.ChunkSize);

        await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
            MessageFactory.WordCountAnalyzerProgressEvent(library.Id, 0F, ProgressEventType.Started, $"Starting {library.Name}"));

        // Process the library in chunks so progress can be reported and changes committed incrementally
        for (var chunk = 1; chunk <= chunkInfo.TotalChunks; chunk++)
        {
            if (chunkInfo.TotalChunks == 0) continue;

            stopwatch.Restart();
            _logger.LogInformation("[MetadataService] Processing chunk {ChunkNumber} / {TotalChunks} with size {ChunkSize}. Series ({SeriesStart} - {SeriesEnd})",
                chunk, chunkInfo.TotalChunks, chunkInfo.ChunkSize, chunk * chunkInfo.ChunkSize, (chunk + 1) * chunkInfo.ChunkSize);

            var nonLibrarySeries = await _unitOfWork.SeriesRepository.GetFullSeriesForLibraryIdAsync(library.Id,
                new UserParams()
                {
                    PageNumber = chunk,
                    PageSize = chunkInfo.ChunkSize
                });
            _logger.LogDebug("[MetadataService] Fetched {SeriesCount} series for refresh", nonLibrarySeries.Count);

            var seriesIndex = 0;
            foreach (var series in nonLibrarySeries)
            {
                var index = chunk * seriesIndex;
                var progress = Math.Max(0F, Math.Min(1F, index * 1F / chunkInfo.TotalSize));

                await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
                    MessageFactory.WordCountAnalyzerProgressEvent(library.Id, progress, ProgressEventType.Updated, series.Name));

                try
                {
                    await ProcessSeries(series, forceUpdate, false);
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "[MetadataService] There was an exception during metadata refresh for {SeriesName}", series.Name);
                }
                seriesIndex++;
            }

            if (_unitOfWork.HasChanges())
            {
                await _unitOfWork.CommitAsync();
            }

            _logger.LogInformation(
                "[MetadataService] Processed {SeriesStart} - {SeriesEnd} out of {TotalSeries} series in {ElapsedScanTime} milliseconds for {LibraryName}",
                chunk * chunkInfo.ChunkSize, (chunk * chunkInfo.ChunkSize) + nonLibrarySeries.Count, chunkInfo.TotalSize,
                stopwatch.ElapsedMilliseconds, library.Name);
        }

        await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
            MessageFactory.WordCountAnalyzerProgressEvent(library.Id, 1F, ProgressEventType.Ended, "Complete"));

        _logger.LogInformation("[WordCountAnalyzerService] Updated metadata for {LibraryName} in {ElapsedMilliseconds} milliseconds",
            library.Name, sw.ElapsedMilliseconds);
    }

    public async Task ScanSeries(int libraryId, int seriesId, bool forceUpdate = true)
    {
        var sw = Stopwatch.StartNew();
        var series = await _unitOfWork.SeriesRepository.GetFullSeriesForSeriesIdAsync(seriesId);
        if (series == null)
        {
            _logger.LogError("[WordCountAnalyzerService] Series {SeriesId} was not found on Library {LibraryId}", seriesId, libraryId);
            return;
        }

        await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
            MessageFactory.WordCountAnalyzerProgressEvent(libraryId, 0F, ProgressEventType.Started, series.Name));

        await ProcessSeries(series, forceUpdate);

        if (_unitOfWork.HasChanges())
        {
            await _unitOfWork.CommitAsync();
        }

        await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
            MessageFactory.WordCountAnalyzerProgressEvent(libraryId, 1F, ProgressEventType.Ended, series.Name));

        _logger.LogInformation("[WordCountAnalyzerService] Updated metadata for {SeriesName} in {ElapsedMilliseconds} milliseconds",
            series.Name, sw.ElapsedMilliseconds);
    }

    private async Task ProcessSeries(Series series, bool forceUpdate = false, bool useFileName = true)
    {
        var isEpub = series.Format == MangaFormat.Epub;
        var existingWordCount = series.WordCount;
        series.WordCount = 0;
        foreach (var volume in series.Volumes)
        {
            volume.WordCount = 0;
            foreach (var chapter in volume.Chapters)
            {
                // Only re-analyze when the file has changed since the last analysis (or forceUpdate is set)
                var firstFile = chapter.Files.FirstOrDefault();
                if (firstFile == null) return;
                if (!_cacheHelper.HasFileChangedSinceLastScan(firstFile.LastFileAnalysis, forceUpdate, firstFile))
                    continue;

                if (series.Format == MangaFormat.Epub)
                {
                    // Sum word counts across every HTML page of every epub file in the chapter
                    long sum = 0;
                    var fileCounter = 1;
                    foreach (var file in chapter.Files)
                    {
                        var filePath = file.FilePath;
                        var pageCounter = 1;
                        try
                        {
                            using var book = await EpubReader.OpenBookAsync(filePath, BookService.BookReaderOptions);

                            var totalPages = book.Content.Html.Values;
                            foreach (var bookPage in totalPages)
                            {
                                var progress = Math.Max(0F,
                                    Math.Min(1F, (fileCounter * pageCounter) * 1F / (chapter.Files.Count * totalPages.Count)));

                                await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
                                    MessageFactory.WordCountAnalyzerProgressEvent(series.LibraryId, progress,
                                        ProgressEventType.Updated, useFileName ? filePath : series.Name));
                                sum += await GetWordCountFromHtml(bookPage);
                                pageCounter++;
                            }

                            fileCounter++;
                        }
                        catch (Exception ex)
                        {
                            _logger.LogError(ex, "There was an error reading an epub file for word count, series skipped");
                            await _eventHub.SendMessageAsync(MessageFactory.Error,
                                MessageFactory.ErrorEvent("There was an issue counting words on an epub",
                                    $"{series.Name} - {file}"));
                            return;
                        }

                        file.LastFileAnalysis = DateTime.Now;
                        _unitOfWork.MangaFileRepository.Update(file);
                    }

                    chapter.WordCount = sum;
                    series.WordCount += sum;
                    volume.WordCount += sum;
                }

                var est = _readerService.GetTimeEstimate(chapter.WordCount, chapter.Pages, isEpub);
                chapter.MinHoursToRead = est.MinHours;
                chapter.MaxHoursToRead = est.MaxHours;
                chapter.AvgHoursToRead = est.AvgHours;
                _unitOfWork.ChapterRepository.Update(chapter);
            }

            var volumeEst = _readerService.GetTimeEstimate(volume.WordCount, volume.Pages, isEpub);
            volume.MinHoursToRead = volumeEst.MinHours;
            volume.MaxHoursToRead = volumeEst.MaxHours;
            volume.AvgHoursToRead = volumeEst.AvgHours;
            _unitOfWork.VolumeRepository.Update(volume);
        }

        var seriesEstimate = _readerService.GetTimeEstimate(series.WordCount, series.Pages, isEpub);
        series.MinHoursToRead = seriesEstimate.MinHours;
        series.MaxHoursToRead = seriesEstimate.MaxHours;
        series.AvgHoursToRead = seriesEstimate.AvgHours;

        if (series.WordCount == 0) series.WordCount = existingWordCount; // Restore original word count if the file hasn't changed
        _unitOfWork.SeriesRepository.Update(series);
    }

    private static async Task<int> GetWordCountFromHtml(EpubContentFileRef bookFile)
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(await bookFile.ReadContentAsTextAsync());

        // Count whitespace-separated tokens that start with a letter, ignoring script content
        var textNodes = doc.DocumentNode.SelectNodes("//body//text()[not(parent::script)]");
        if (textNodes == null) return 0;
        return textNodes
            .Select(node => node.InnerText.Split(' ', StringSplitOptions.RemoveEmptyEntries)
                .Where(s => char.IsLetter(s[0])))
            .Select(words => words.Count())
            .Where(wordCount => wordCount > 0)
            .Sum();
    }
}
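
// Usage sketch (illustrative only, not part of the service): ScanLibrary and ScanSeries are
// intended to run as Hangfire background jobs rather than be awaited inline, which is why the
// interface methods carry [DisableConcurrentExecution] and [AutomaticRetry]. Assuming
// IWordCountAnalyzerService is registered with dependency injection and a libraryId is in scope,
// a caller could enqueue a one-off analysis like this; BackgroundJob.Enqueue is standard Hangfire
// API, though the actual scheduling entry point in this codebase may differ:
//
//     BackgroundJob.Enqueue<IWordCountAnalyzerService>(
//         s => s.ScanLibrary(libraryId, false));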