// Translate PDF metadata (See PdfMetadataExtractor.cs) into ComicInfo structure.

// Contributed by https://github.com/microtherion

// All references to the "PDF Spec" (section numbers, etc) refer to the
// PDF 1.7 Specification a.k.a. PDF32000-1:2008
// https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf

using System;
using System.Xml;
using System.Text;
using System.IO;
using System.Diagnostics;
using System.Globalization;
using API.Data.Metadata;
using API.Entities.Enums;
using API.Services;
using API.Services.Tasks.Scanner.Parser;
using Microsoft.Extensions.Logging;
using Nager.ArticleNumber;
using System.Collections.Generic;

namespace API.Helpers;
#nullable enable

/// <summary>
/// Builds a <see cref="ComicInfo"/> from the metadata embedded in a PDF file.
/// </summary>
public interface IPdfComicInfoExtractor
{
    /// <summary>
    /// Reads the PDF metadata of <paramref name="filePath"/> and maps it onto a <see cref="ComicInfo"/>.
    /// </summary>
    /// <param name="filePath">Path of the PDF file to inspect.</param>
    /// <returns>The populated <see cref="ComicInfo"/>, or <c>null</c> when extraction threw.</returns>
    ComicInfo? GetComicInfo(string filePath);
}

/// <summary>
/// Default <see cref="IPdfComicInfoExtractor"/>: obtains a key/value metadata dictionary from
/// <c>PdfMetadataExtractor</c> and translates the known keys into <see cref="ComicInfo"/> fields.
/// </summary>
public class PdfComicInfoExtractor : IPdfComicInfoExtractor
{
    // Date layouts of PDF Spec 7.9.4 after the massaging done in GetDateTimeFromText
    // (apostrophes -> ':' and 'Z' -> '+'), from most to least specific.
    // The "zzz:" entries cover offsets written with a trailing apostrophe (+HH'mm'); the plain
    // "zzz" entries cover producers that omit the trailing apostrophe (+HH'mm), the form used by PDF 2.0.
    private static readonly string[] PdfDateFormats = [
        "D:yyyyMMddHHmmsszzz:",
        "D:yyyyMMddHHmmsszzz",
        "D:yyyyMMddHHmmss+",
        "D:yyyyMMddHHmmss",
        "D:yyyyMMddHHmmzzz:",
        "D:yyyyMMddHHmmzzz",
        "D:yyyyMMddHHmm+",
        "D:yyyyMMddHHmm",
        "D:yyyyMMddHHzzz:",
        "D:yyyyMMddHHzzz",
        "D:yyyyMMddHH+",
        "D:yyyyMMddHH",
        "D:yyyyMMdd",
        "D:yyyyMM",
        "D:yyyy"
    ];

    private readonly ILogger _logger;
    private readonly IMediaErrorService _mediaErrorService;

    /// <summary>
    /// Creates the extractor.
    /// </summary>
    /// <param name="logger">Logger used for diagnostics and handed to the metadata extractor.</param>
    /// <param name="mediaErrorService">Service that records files whose metadata could not be parsed.</param>
    public PdfComicInfoExtractor(ILogger logger, IMediaErrorService mediaErrorService)
    {
        _logger = logger;
        _mediaErrorService = mediaErrorService;
    }

    /// <summary>
    /// Parses a metadata value as a floating point number.
    /// </summary>
    /// <param name="text">Raw metadata text; may be null or empty.</param>
    /// <returns>The parsed value, or <c>null</c> when the text is missing or not a number.</returns>
    private static float? GetFloatFromText(string? text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return null;
        }

        // File metadata is machine-readable, so it must not be interpreted with the server's
        // current culture. Group separators are not accepted: a value such as "1,5" is rejected
        // instead of being silently read as 15.
        return float.TryParse(text, NumberStyles.Float, CultureInfo.InvariantCulture, out var value)
            ? value
            : null;
    }

    /// <summary>
    /// Parses a date taken either from the XMP metadata stream or from the document information dictionary.
    /// </summary>
    /// <param name="text">Raw date text; may be null or empty.</param>
    /// <returns>The parsed date, or <c>null</c> when the text is missing or in no recognised format.</returns>
    private static DateTime? GetDateTimeFromText(string? text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return null;
        }

        // Dates stored in the XMP metadata stream (PDF Spec 14.3.2)
        // are stored in ISO 8601 format, which is handled by C# out of the box
        if (DateTime.TryParse(text, CultureInfo.InvariantCulture, DateTimeStyles.None, out var date))
        {
            return date;
        }

        // Dates stored in the document information directory (PDF Spec 14.3.3)
        // are stored in a proprietary format (PDF Spec 7.9.4) that needs to be
        // massaged slightly to be expressible by a DateTime format.
        if (text[0] != 'D')
        {
            text = "D:" + text;
        }

        text = text.Replace("'", ":");
        text = text.Replace("Z", "+");

        // ':' in a custom format string stands for the culture's time separator, so the
        // invariant culture is required for the literal ':' in the text to match everywhere.
        foreach (var format in PdfDateFormats)
        {
            if (DateTime.TryParseExact(text, format, CultureInfo.InvariantCulture, DateTimeStyles.None, out var pdfDate))
            {
                return pdfDate;
            }
        }

        return null;
    }

    /// <summary>
    /// Looks up a metadata entry without throwing when it is absent.
    /// </summary>
    /// <param name="metadata">Metadata dictionary to search.</param>
    /// <param name="key">Name of the entry.</param>
    /// <returns>The stored value, or <c>null</c> when <paramref name="key"/> is not present.</returns>
    private static string? MaybeGetMetadata(Dictionary<string, string> metadata, string key)
    {
        return metadata.TryGetValue(key, out var value) ? value : null;
    }

    /// <summary>
    /// Maps the extracted metadata entries onto a new <see cref="ComicInfo"/>.
    /// </summary>
    /// <param name="metadata">Metadata entries keyed by name (e.g. "Title", "Author", "CreationDate").</param>
    /// <param name="filePath">Path of the source file; used for volume parsing and log messages.</param>
    /// <returns>The populated and cleaned <see cref="ComicInfo"/>.</returns>
    private ComicInfo? GetComicInfoFromMetadata(Dictionary<string, string> metadata, string filePath)
    {
        var info = new ComicInfo();

        var publicationDate = GetDateTimeFromText(MaybeGetMetadata(metadata, "CreationDate"));
        if (publicationDate != null)
        {
            info.Year = publicationDate.Value.Year;
            info.Month = publicationDate.Value.Month;
            info.Day = publicationDate.Value.Day;
        }

        info.Summary = MaybeGetMetadata(metadata, "Summary") ?? string.Empty;
        info.Publisher = MaybeGetMetadata(metadata, "Publisher") ?? string.Empty;
        info.Writer = MaybeGetMetadata(metadata, "Author") ?? string.Empty;
        info.Title = MaybeGetMetadata(metadata, "Title") ?? string.Empty;
        info.Genre = MaybeGetMetadata(metadata, "Subject") ?? string.Empty;
        info.LanguageISO = BookService.ValidateLanguage(MaybeGetMetadata(metadata, "Language"));

        // Discard anything that is neither a valid ISBN-10 nor a valid ISBN-13.
        info.Isbn = MaybeGetMetadata(metadata, "ISBN") ?? string.Empty;
        if (info.Isbn != string.Empty && !ArticleNumberHelper.IsValidIsbn10(info.Isbn) && !ArticleNumberHelper.IsValidIsbn13(info.Isbn))
        {
            _logger.LogDebug("[BookService] {File} has an invalid ISBN number", filePath);
            info.Isbn = string.Empty;
        }

        info.UserRating = GetFloatFromText(MaybeGetMetadata(metadata, "UserRating")) ?? 0.0f;
        info.TitleSort = MaybeGetMetadata(metadata, "TitleSort") ?? string.Empty;
        info.Series = MaybeGetMetadata(metadata, "Series") ?? info.TitleSort;
        info.SeriesSort = info.Series;

        // A missing or unparsable volume is still written as "0"; the number is formatted with the
        // invariant culture so the decimal separator does not depend on the server locale.
        var volume = GetFloatFromText(MaybeGetMetadata(metadata, "Volume"));
        info.Volume = (volume ?? 0.0f).ToString(CultureInfo.InvariantCulture);

        // If this is a single book and not a collection, set publication status to Completed.
        // The check is made on the parsed metadata value: info.Volume itself is never empty
        // at this point, so testing it would make this branch unreachable.
        if (volume is null && Parser.ParseVolume(filePath, LibraryType.Manga).Equals(Parser.LooseLeafVolume))
        {
            info.Count = 1;
        }

        // Removed as probably unneeded per discussion in https://github.com/Kareadita/Kavita/pull/3108#discussion_r1956747782
        //
        // var hasVolumeInSeries = !Parser.ParseVolume(info.Title, LibraryType.Manga)
        //     .Equals(Parser.LooseLeafVolume);
        // if (string.IsNullOrEmpty(info.Volume) && hasVolumeInSeries && (!info.Series.Equals(info.Title) || string.IsNullOrEmpty(info.Series)))
        // {
        //     // This is likely a light novel for which we can set series from parsed title
        //     info.Series = Parser.ParseSeries(info.Title, LibraryType.Manga);
        //     info.Volume = Parser.ParseVolume(info.Title, LibraryType.Manga);
        // }

        ComicInfo.CleanComicInfo(info);

        return info;
    }

    /// <inheritdoc />
    public ComicInfo? GetComicInfo(string filePath)
    {
        try
        {
            var extractor = new PdfMetadataExtractor(_logger, filePath);

            return GetComicInfoFromMetadata(extractor.GetMetadata(), filePath);
        }
        catch (Exception ex)
        {
            // Best effort: a PDF with broken metadata is logged and reported, and the caller gets null.
            _logger.LogWarning(ex, "[GetComicInfo] There was an exception parsing PDF metadata for {File}", filePath);
            _mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
                "There was an exception parsing PDF metadata", ex);
        }

        return null;
    }
}