Kavita/API.Benchmark/EpubBenchmark.cs
Joseph Milazzo c1490d6e86
Word Count (#1286)
* Adding some code for Robbie

* See more on series detail metadata area is now at the bottom on the section

* Cleaned up subtitle headings to use a single class for offset with actionables

* Added some markup for the new design, waiting for Robbie to finish it off

* styling age-rating badge

* Started hooking up basic analyze file service and hooks in the UI. Basic code to implement the count is implemented and in benchmarks.

* Hooked up analyze ui to backend

* Refactored Series Detail metadata area to use a new icon/title design

* Cleaned up the new design

* Pushing for robbie to do css

* Massive performance improvement to scan series where we only need to scan folders reported that have series in them, rather than the whole library.

* Removed theme page as we no longer need it. Added WordCount to DTOs so the UI can show them. Added new pipe to format numbers in compact mode.

* Hooked up actual reading time based on user's words per hour

* Refactor some magic numbers to consts

* Hooked in progress reporting for series word count

* Hooked up analyze files

* Re-implemented time to read on comics

* Removed the word Last Read

* Show proper language name instead of iso tag on series detail page. Added some error handling on word count code.

* Reworked error handling

* Fixed some security vulnerabilities in npm.

* Handle a case where there are no text nodes and instead of returning an empty list, htmlagilitypack returns null.

* Tweaked the styles a bit on the icon-and-title

* Code cleanup

Co-authored-by: Robbie Davis <robbie@therobbiedavis.com>
2022-05-25 14:53:39 -07:00

69 lines
2.3 KiB
C#

using System;
using System.Linq;
using System.Threading.Tasks;
using API.Services;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Order;
using HtmlAgilityPack;
using VersOne.Epub;
namespace API.Benchmark;
[MemoryDiagnoser]
[Orderer(SummaryOrderPolicy.FastestToSlowest)]
[RankColumn]
[SimpleJob(launchCount: 1, warmupCount: 3, targetCount: 5, invocationCount: 100, id: "Epub"), ShortRunJob]
public class EpubBenchmark
{
[Benchmark]
public async Task GetWordCount_PassByString()
{
using var book = await EpubReader.OpenBookAsync("Data/book-test.epub", BookService.BookReaderOptions);
foreach (var bookFile in book.Content.Html.Values)
{
Console.WriteLine(GetBookWordCount_PassByString(await bookFile.ReadContentAsTextAsync()));
;
}
}
[Benchmark]
public async Task GetWordCount_PassByRef()
{
using var book = await EpubReader.OpenBookAsync("Data/book-test.epub", BookService.BookReaderOptions);
foreach (var bookFile in book.Content.Html.Values)
{
Console.WriteLine(await GetBookWordCount_PassByRef(bookFile));
}
}
private static int GetBookWordCount_PassByString(string fileContents)
{
var doc = new HtmlDocument();
doc.LoadHtml(fileContents);
var delimiter = new char[] {' '};
return doc.DocumentNode.SelectNodes("//body//text()[not(parent::script)]")
.Select(node => node.InnerText)
.Select(text => text.Split(delimiter, StringSplitOptions.RemoveEmptyEntries)
.Where(s => char.IsLetter(s[0])))
.Select(words => words.Count())
.Where(wordCount => wordCount > 0)
.Sum();
}
private static async Task<int> GetBookWordCount_PassByRef(EpubContentFileRef bookFile)
{
var doc = new HtmlDocument();
doc.LoadHtml(await bookFile.ReadContentAsTextAsync());
var delimiter = new char[] {' '};
return doc.DocumentNode.SelectNodes("//body//text()[not(parent::script)]")
.Select(node => node.InnerText)
.Select(text => text.Split(delimiter, StringSplitOptions.RemoveEmptyEntries)
.Where(s => char.IsLetter(s[0])))
.Select(words => words.Count())
.Where(wordCount => wordCount > 0)
.Sum();
}
}