Kavita/API/Services/Tasks/Metadata/WordCountAnalyzerService.cs

using System;
using System.Diagnostics;
using System.Linq;
using System.Threading.Tasks;
using API.Data;
using API.Data.Repositories;
using API.Entities;
using API.Entities.Enums;
using API.Helpers;
using API.SignalR;
using Hangfire;
using HtmlAgilityPack;
using Microsoft.Extensions.Logging;
using VersOne.Epub;

namespace API.Services.Tasks.Metadata;

/// <summary>
/// Contract for the task that counts words in book files and refreshes time-to-read estimates.
/// </summary>
public interface IWordCountAnalyzerService
{
    /// <summary>
    /// Runs the word count analysis over every series of a library.
    /// </summary>
    /// <param name="libraryId">Id of the library to analyze</param>
    /// <param name="forceUpdate">Forwarded to the "has the file changed since the last scan" check;
    /// presumably true re-analyzes files regardless of their last analysis time (confirm against the cache helper)</param>
    [DisableConcurrentExecution(timeoutInSeconds: 60 * 60 * 60),
     AutomaticRetry(Attempts = 2, OnAttemptsExceeded = AttemptsExceededAction.Delete)]
    Task ScanLibrary(int libraryId, bool forceUpdate = false);

    /// <summary>
    /// Runs the word count analysis over a single series.
    /// </summary>
    /// <param name="libraryId">Id of the library the series belongs to; used for progress events and logging</param>
    /// <param name="seriesId">Id of the series to analyze</param>
    /// <param name="forceUpdate">Forwarded to the "has the file changed since the last scan" check; defaults to true here</param>
    Task ScanSeries(int libraryId, int seriesId, bool forceUpdate = true);
}

/// <summary>
/// Metadata task that counts the words of epub files and derives time-to-read estimates
/// for chapters, volumes and series.
/// </summary>
public class WordCountAnalyzerService : IWordCountAnalyzerService
{
    private readonly ILogger<WordCountAnalyzerService> _logger;
    private readonly IUnitOfWork _unitOfWork;
    private readonly IEventHub _eventHub;
    private readonly ICacheHelper _cacheHelper;
    private readonly IReaderService _readerService;

    public WordCountAnalyzerService(ILogger<WordCountAnalyzerService> logger, IUnitOfWork unitOfWork, IEventHub eventHub,
        ICacheHelper cacheHelper, IReaderService readerService)
    {
        _logger = logger;
        _unitOfWork = unitOfWork;
        _eventHub = eventHub;
        _cacheHelper = cacheHelper;
        _readerService = readerService;
    }


    /// <summary>
    /// Analyzes every series of a library, chunk by chunk, committing pending changes after each chunk.
    /// A failure on one series is logged and does not stop the remaining series.
    /// </summary>
    /// <param name="libraryId">Id of the library to analyze. If no library is found, an error is logged and nothing else happens.</param>
    /// <param name="forceUpdate">Forwarded to the per-chapter "has the file changed since the last scan" check</param>
    [DisableConcurrentExecution(timeoutInSeconds: 60 * 60 * 60)]
    [AutomaticRetry(Attempts = 2, OnAttemptsExceeded = AttemptsExceededAction.Delete)]
    public async Task ScanLibrary(int libraryId, bool forceUpdate = false)
    {
        var sw = Stopwatch.StartNew();
        var library = await _unitOfWork.LibraryRepository.GetLibraryForIdAsync(libraryId, LibraryIncludes.None);
        if (library == null)
        {
            // Mirrors ScanSeries: a missing entity is reported instead of failing with a NullReferenceException
            _logger.LogError("[WordCountAnalyzerService] Library {LibraryId} was not found", libraryId);
            return;
        }

        await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
            MessageFactory.WordCountAnalyzerProgressEvent(libraryId, 0F, ProgressEventType.Started, string.Empty));

        var chunkInfo = await _unitOfWork.SeriesRepository.GetChunkInfo(library.Id);
        var stopwatch = Stopwatch.StartNew();
        _logger.LogInformation("[WordCountAnalyzerService] Refreshing Library {LibraryName}. Total Items: {TotalSize}. Total Chunks: {TotalChunks} with {ChunkSize} size", library.Name, chunkInfo.TotalSize, chunkInfo.TotalChunks, chunkInfo.ChunkSize);

        await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
            MessageFactory.WordCountAnalyzerProgressEvent(library.Id, 0F, ProgressEventType.Started, $"Starting {library.Name}"));

        // Chunks are 1-based because the chunk number doubles as the page number of the paged query
        for (var chunk = 1; chunk <= chunkInfo.TotalChunks; chunk++)
        {
            stopwatch.Restart();

            // Number of series that belong to the chunks before this one
            var seriesBeforeChunk = (chunk - 1) * chunkInfo.ChunkSize;

            _logger.LogInformation("[WordCountAnalyzerService] Processing chunk {ChunkNumber} / {TotalChunks} with size {ChunkSize}. Series ({SeriesStart} - {SeriesEnd})",
                chunk, chunkInfo.TotalChunks, chunkInfo.ChunkSize, seriesBeforeChunk, seriesBeforeChunk + chunkInfo.ChunkSize);

            var nonLibrarySeries = await _unitOfWork.SeriesRepository.GetFullSeriesForLibraryIdAsync(library.Id,
                new UserParams()
                {
                    PageNumber = chunk,
                    PageSize = chunkInfo.ChunkSize
                });
            _logger.LogDebug("[WordCountAnalyzerService] Fetched {SeriesCount} series for refresh", nonLibrarySeries.Count);

            var seriesIndex = 0;
            foreach (var series in nonLibrarySeries)
            {
                // Overall position of this series within the library, so progress keeps growing across chunks
                var index = seriesBeforeChunk + seriesIndex;
                var progress = Math.Max(0F, Math.Min(1F, index * 1F / chunkInfo.TotalSize));

                await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
                    MessageFactory.WordCountAnalyzerProgressEvent(library.Id, progress, ProgressEventType.Updated, series.Name));

                try
                {
                    await ProcessSeries(series, forceUpdate, false);
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "[WordCountAnalyzerService] There was an exception during metadata refresh for {SeriesName}", series.Name);
                }
                seriesIndex++;
            }

            if (_unitOfWork.HasChanges())
            {
                await _unitOfWork.CommitAsync();
            }

            _logger.LogInformation(
                "[WordCountAnalyzerService] Processed {SeriesStart} - {SeriesEnd} out of {TotalSeries} series in {ElapsedScanTime} milliseconds for {LibraryName}",
                seriesBeforeChunk, seriesBeforeChunk + nonLibrarySeries.Count, chunkInfo.TotalSize, stopwatch.ElapsedMilliseconds, library.Name);
        }

        await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
            MessageFactory.WordCountAnalyzerProgressEvent(library.Id, 1F, ProgressEventType.Ended, "Complete"));


        _logger.LogInformation("[WordCountAnalyzerService] Updated metadata for {LibraryName} in {ElapsedMilliseconds} milliseconds", library.Name, sw.ElapsedMilliseconds);

    }

    /// <summary>
    /// Analyzes a single series and commits any pending changes.
    /// </summary>
    /// <param name="libraryId">Id of the owning library; only used for progress events and logging</param>
    /// <param name="seriesId">Id of the series to analyze. If no series is found, an error is logged and nothing else happens.</param>
    /// <param name="forceUpdate">Forwarded to the per-chapter "has the file changed since the last scan" check</param>
    public async Task ScanSeries(int libraryId, int seriesId, bool forceUpdate = true)
    {
        var sw = Stopwatch.StartNew();
        var series = await _unitOfWork.SeriesRepository.GetFullSeriesForSeriesIdAsync(seriesId);
        if (series == null)
        {
            _logger.LogError("[WordCountAnalyzerService] Series {SeriesId} was not found on Library {LibraryId}", seriesId, libraryId);
            return;
        }

        await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
            MessageFactory.WordCountAnalyzerProgressEvent(libraryId, 0F, ProgressEventType.Started, series.Name));

        await ProcessSeries(series, forceUpdate);

        if (_unitOfWork.HasChanges())
        {
            await _unitOfWork.CommitAsync();
        }

        await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
            MessageFactory.WordCountAnalyzerProgressEvent(libraryId, 1F, ProgressEventType.Ended, series.Name));

        _logger.LogInformation("[WordCountAnalyzerService] Updated metadata for {SeriesName} in {ElapsedMilliseconds} milliseconds", series.Name, sw.ElapsedMilliseconds);
    }

    /// <summary>
    /// Recomputes word counts (epub only) and time-to-read estimates for the chapters, volumes and the series itself.
    /// Chapters that are not re-analyzed keep their word count and still contribute it to the volume and series totals.
    /// Totals are written to a volume/series only once all of its chapters were handled, so an aborted run
    /// never leaves a zeroed or partial total behind for the caller to commit.
    /// </summary>
    /// <param name="series">Series to process, with its volumes, chapters and files loaded</param>
    /// <param name="forceUpdate">Forwarded to the "has the file changed since the last scan" check</param>
    /// <param name="useFileName">When true, progress events carry the file path; otherwise the series name</param>
    private async Task ProcessSeries(Series series, bool forceUpdate = false, bool useFileName = true)
    {
        var isEpub = series.Format == MangaFormat.Epub;
        var existingWordCount = series.WordCount;
        long seriesWordCount = 0;

        foreach (var volume in series.Volumes)
        {
            long volumeWordCount = 0;
            foreach (var chapter in volume.Chapters)
            {
                // This compares if it's changed since a file scan only
                var firstFile = chapter.Files.FirstOrDefault();
                var needsAnalysis = firstFile != null &&
                                    _cacheHelper.HasFileChangedSinceLastScan(firstFile.LastFileAnalysis, forceUpdate, firstFile);
                if (!needsAnalysis)
                {
                    // Nothing to analyze (no files or unchanged): carry the known count into the totals
                    volumeWordCount += chapter.WordCount;
                    continue;
                }

                if (isEpub)
                {
                    long sum = 0;
                    var fileCounter = 1;
                    foreach (var file in chapter.Files)
                    {
                        var filePath = file.FilePath;
                        try
                        {
                            using var book = await EpubReader.OpenBookAsync(filePath, BookService.BookReaderOptions);

                            var totalPages = book.Content.Html.Values;
                            var pageCounter = 1;
                            foreach (var bookPage in totalPages)
                            {
                                // Pages of earlier files count as done; assumes roughly equal page counts per file
                                var pagesSoFar = (fileCounter - 1) * totalPages.Count + pageCounter;
                                var progress = Math.Max(0F,
                                    Math.Min(1F, pagesSoFar * 1F / (chapter.Files.Count * totalPages.Count)));

                                await _eventHub.SendMessageAsync(MessageFactory.NotificationProgress,
                                    MessageFactory.WordCountAnalyzerProgressEvent(series.LibraryId, progress,
                                        ProgressEventType.Updated, useFileName ? filePath : series.Name));
                                sum += await GetWordCountFromHtml(bookPage);
                                pageCounter++;
                            }
                        }
                        catch (Exception ex)
                        {
                            _logger.LogError(ex, "There was an error reading an epub file for word count, series skipped");
                            await _eventHub.SendMessageAsync(MessageFactory.Error,
                                MessageFactory.ErrorEvent("There was an issue counting words on an epub",
                                    $"{series.Name} - {filePath}"));
                            // The rest of the series is skipped. The current volume's and the series' totals were
                            // not touched yet, and this chapter's files stay unmarked so a later scan retries them.
                            return;
                        }

                        fileCounter++;
                    }

                    // Mark the files as analyzed only after the whole chapter was read successfully
                    var analyzedAt = DateTime.Now;
                    foreach (var file in chapter.Files)
                    {
                        file.LastFileAnalysis = analyzedAt;
                        _unitOfWork.MangaFileRepository.Update(file);
                    }

                    chapter.WordCount = sum;
                }

                volumeWordCount += chapter.WordCount;

                var est = _readerService.GetTimeEstimate(chapter.WordCount, chapter.Pages, isEpub);
                chapter.MinHoursToRead = est.MinHours;
                chapter.MaxHoursToRead = est.MaxHours;
                chapter.AvgHoursToRead = est.AvgHours;
                _unitOfWork.ChapterRepository.Update(chapter);
            }

            volume.WordCount = volumeWordCount;
            seriesWordCount += volumeWordCount;

            var volumeEst = _readerService.GetTimeEstimate(volume.WordCount, volume.Pages, isEpub);
            volume.MinHoursToRead = volumeEst.MinHours;
            volume.MaxHoursToRead = volumeEst.MaxHours;
            volume.AvgHoursToRead = volumeEst.AvgHours;
            _unitOfWork.VolumeRepository.Update(volume);

        }

        // Keep the previous word count when nothing was counted, and only then derive the estimate from it
        series.WordCount = seriesWordCount == 0 ? existingWordCount : seriesWordCount;
        var seriesEstimate = _readerService.GetTimeEstimate(series.WordCount, series.Pages, isEpub);
        series.MinHoursToRead = seriesEstimate.MinHours;
        series.MaxHoursToRead = seriesEstimate.MaxHours;
        series.AvgHoursToRead = seriesEstimate.AvgHours;
        _unitOfWork.SeriesRepository.Update(series);
    }


    /// <summary>
    /// Counts the words in the text nodes under the body of an epub html file, ignoring text whose parent is a script element.
    /// A word is a whitespace-separated token whose first character is a letter.
    /// </summary>
    /// <param name="bookFile">Content file of the epub to read</param>
    /// <returns>Number of words found; 0 when the document has no matching text nodes</returns>
    private static async Task<int> GetWordCountFromHtml(EpubContentFileRef bookFile)
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(await bookFile.ReadContentAsTextAsync());

        var textNodes = doc.DocumentNode.SelectNodes("//body//text()[not(parent::script)]");
        if (textNodes == null) return 0;

        // An empty separator array makes Split break on any whitespace (spaces, tabs, line breaks),
        // so words separated by a line break are not merged into a single token
        return textNodes.Sum(node => node.InnerText
            .Split(Array.Empty<char>(), StringSplitOptions.RemoveEmptyEntries)
            .Count(word => char.IsLetter(word[0])));
    }


}