mirror of
https://github.com/Kareadita/Kavita.git
synced 2025-05-24 00:52:23 -04:00
Co-authored-by: Robbie Davis <robbie@therobbiedavis.com> Co-authored-by: Fesaa <77553571+Fesaa@users.noreply.github.com>
1638 lines
45 KiB
C#
1638 lines
45 KiB
C#
/**
|
|
* Contributed by https://github.com/microtherion
|
|
*
|
|
* All references to the "PDF Spec" (section numbers, etc) refer to the
|
|
* PDF 1.7 Specification a.k.a. PDF32000-1:2008
|
|
* https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
|
|
*/
|
|
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.IO.Compression;
|
|
using System.Text;
|
|
using System.Xml;
|
|
using System.IO;
|
|
using Microsoft.Extensions.Logging;
|
|
using API.Services;
|
|
|
|
namespace API.Helpers;
|
|
#nullable enable
|
|
|
|
/// <summary>
/// Raised by the PDF metadata extractor when a file cannot be parsed, e.g. because the
/// input ends unexpectedly, is malformed, or uses a feature that is not supported.
/// </summary>
/// <remarks>
/// The extractor parses a PDF file and tries to extract as much metadata as possible.
/// It supports both text based XRef tables and compressed XRef streams (Deflate only),
/// and both UTF-16 and PDFDocEncoding for strings.
/// It lacks support for many PDF configurations that are theoretically possible, but
/// should handle most common cases.
/// </remarks>
public class PdfMetadataExtractorException : Exception
{
    /// <summary>Creates the exception without a message.</summary>
    public PdfMetadataExtractorException() { }

    /// <summary>Creates the exception with a description of the parse failure.</summary>
    public PdfMetadataExtractorException(string message) : base(message) { }

    /// <summary>Creates the exception with a description and the underlying cause.</summary>
    public PdfMetadataExtractorException(string message, Exception inner) : base(message, inner) { }
}
|
|
|
|
/// <summary>
/// Gives access to the metadata collected from a PDF document.
/// </summary>
public interface IPdfMetadataExtractor
{
    /// <summary>
    /// Returns the extracted metadata as key/value pairs.
    /// </summary>
    Dictionary<string, string> GetMetadata();
}
|
|
|
|
/// <summary>
/// Collects the raw bytes of a PDF string object and decodes them into a .NET string.
/// If the first two bytes are the big endian byte order mark 0xFE 0xFF, the remaining bytes
/// are decoded as UTF-16BE; otherwise every byte is mapped through PDFDocEncoding
/// (PDF Spec 7.9.2.1 and D.1).
/// </summary>
class PdfStringBuilder
{
    // PDFDocEncoding (PDF Spec D.1): code points for bytes 0x18..0x1F.
    private static readonly char[] PdfDocMappingLow =
    [
        '\u02D8', '\u02C7', '\u02C6', '\u02D9', '\u02DD', '\u02DB', '\u02DA', '\u02DC'
    ];

    // PDFDocEncoding (PDF Spec D.1): code points for bytes 0x80..0xA0.
    private static readonly char[] PdfDocMappingHigh =
    [
        '\u2022', '\u2020', '\u2021', '\u2026', '\u2014', '\u2013', '\u0192', '\u2044',
        '\u2039', '\u203A', '\u2212', '\u2030', '\u201E', '\u201C', '\u201D', '\u2018',
        '\u2019', '\u201A', '\u2122', '\uFB01', '\uFB02', '\u0141', '\u0152', '\u0160',
        '\u0178', '\u017D', '\u0131', '\u0142', '\u0153', '\u0161', '\u017E', ' ',
        '\u20AC'
    ];

    private readonly StringBuilder _text = new();

    // True once the 0xFE 0xFF byte order mark has been seen at the start of the string.
    private bool _utf16;

    // True while a byte is held back in _heldByte: either a possible first BOM byte,
    // or the high byte of a UTF-16 code unit.
    private bool _byteHeld;
    private byte _heldByte;

    /// <summary>Appends one byte decoded through PDFDocEncoding.</summary>
    private void AppendPdfDocByte(byte b)
    {
        var decoded = b switch
        {
            >= 0x18 and < 0x20 => PdfDocMappingLow[b - 0x18],
            >= 0x80 and < 0xA1 => PdfDocMappingHigh[b - 0x80],
            _ => (char)b,
        };

        _text.Append(decoded);
    }

    /// <summary>Appends an already decoded character, bypassing any byte level decoding.</summary>
    public void Append(char c)
    {
        _text.Append(c);
    }

    /// <summary>
    /// Appends one raw string byte, decoding it according to the detected encoding.
    /// </summary>
    public void AppendByte(byte b)
    {
        if (_utf16)
        {
            // UTF-16BE: combine two consecutive bytes into one code unit.
            if (!_byteHeld)
            {
                _heldByte = b;
                _byteHeld = true;
                return;
            }

            _text.Append((char)((_heldByte << 8) | b));
            _byteHeld = false;
            return;
        }

        if (_text.Length > 0)
        {
            // BOM detection is only attempted while nothing has been emitted yet.
            AppendPdfDocByte(b);
            return;
        }

        if (_byteHeld)
        {
            if (b == 0xFF)
            {
                // 0xFE 0xFF: Unicode strings are prefixed by a big endian BOM \uFEFF.
                _utf16 = true;
                _byteHeld = false;
            }
            else
            {
                // Not a BOM after all: emit the held 0xFE and this byte as PDFDocEncoding.
                AppendPdfDocByte(_heldByte);
                AppendPdfDocByte(b);
            }

            return;
        }

        if (b == 0xFE)
        {
            // Possibly the first half of a BOM; decide when the next byte arrives.
            _byteHeld = true;
            _heldByte = b;
            return;
        }

        AppendPdfDocByte(b);
    }

    /// <summary>
    /// Returns the decoded text. A byte that is still held back while nothing has been
    /// emitted is flushed through PDFDocEncoding first.
    /// </summary>
    public override string ToString()
    {
        if (_byteHeld && _text.Length == 0)
        {
            AppendPdfDocByte(_heldByte);
        }

        return _text.ToString();
    }
}
|
|
|
|
internal class PdfLexer(Stream stream)
|
|
{
|
|
private const int BufferSize = 1024;
|
|
private readonly byte[] _buffer = new byte[BufferSize];
|
|
private int _pos = 0;
|
|
private int _valid = 0;
|
|
|
|
/// <summary>
/// Kinds of tokens produced by the lexer.
/// </summary>
public enum TokenType
{
    /// <summary>Default value; not produced by the scanner methods in this class.</summary>
    None,

    /// <summary>The keywords <c>true</c> / <c>false</c>; the token value is a bool.</summary>
    Bool,

    /// <summary>A number without a decimal point; the token value is a long.</summary>
    Int,

    /// <summary>A number containing a decimal point; the token value is a double.</summary>
    Double,

    /// <summary>A name object (<c>/Name</c>); the token value is the name without the slash.</summary>
    Name,

    /// <summary>A literal or hexadecimal string; the token value is the decoded string.</summary>
    String,

    /// <summary><c>[</c></summary>
    ArrayStart,

    /// <summary><c>]</c></summary>
    ArrayEnd,

    /// <summary><c>&lt;&lt;</c></summary>
    DictionaryStart,

    /// <summary><c>&gt;&gt;</c></summary>
    DictionaryEnd,

    /// <summary>The keyword <c>stream</c>.</summary>
    StreamStart,

    /// <summary>The keyword <c>endstream</c>.</summary>
    StreamEnd,

    /// <summary>An object header (<c>N G obj</c>); the token value is the object number.</summary>
    ObjectStart,

    /// <summary>The keyword <c>endobj</c>.</summary>
    ObjectEnd,

    /// <summary>An indirect reference (<c>N G R</c>); the token value is the object number.</summary>
    ObjectRef,

    /// <summary>Any other alphabetic word; the token value is the word itself.</summary>
    Keyword,

    /// <summary>A line break; only reported when the caller asks for newlines.</summary>
    Newline,
}
|
|
|
|
/// <summary>
/// A lexical token: its kind plus a payload whose runtime type depends on the kind
/// (for example a long for integers or a string for names).
/// </summary>
public struct Token
{
    public TokenType Type;
    public object Value;

    public Token(TokenType type, object value)
    {
        Type = type;
        Value = value;
    }
}
|
|
|
|
/// <summary>
/// Reads the next token from the stream, skipping whitespace and comments.
/// </summary>
/// <param name="reportNewlines">
/// When true, line breaks (LF, CR or CR LF) are returned as <see cref="TokenType.Newline"/>
/// tokens instead of being skipped as whitespace.
/// </param>
/// <returns>The token that was read.</returns>
/// <exception cref="PdfMetadataExtractorException">
/// The stream ended, or a byte that cannot start a token was encountered.
/// </exception>
public Token NextToken(bool reportNewlines = false)
{
    while (true)
    {
        switch ((char)NextByte())
        {
            case '\n' when reportNewlines:
                return new Token(TokenType.Newline, true);

            case '\r' when reportNewlines:
                // Treat CR LF as a single line break.
                if (NextByte() != '\n')
                {
                    PutBack();
                }

                return new Token(TokenType.Newline, true);

            case ' ':
            case '\x00':
            case '\t':
            case '\n':
            case '\f':
            case '\r':
                continue; // Skip whitespace

            case '%':
                SkipComment();
                continue;

            case '+':
            case '-':
            case '.':
            case >= '0' and <= '9':
                return ScanNumber();

            case '/':
                return ScanName();

            case '(':
                return ScanString();

            case '[':
                return new Token(TokenType.ArrayStart, true);

            case ']':
                return new Token(TokenType.ArrayEnd, true);

            case '<':
                // "<<" opens a dictionary, a single "<" opens a hex string.
                if (NextByte() == '<')
                {
                    return new Token(TokenType.DictionaryStart, true);
                }

                PutBack();
                return ScanHexString();

            case '>':
                ExpectByte((byte)'>');
                return new Token(TokenType.DictionaryEnd, true);

            case >= 'a' and <= 'z':
            case >= 'A' and <= 'Z':
                return ScanKeyword();

            default:
                // Interpolated so the message shows the offending byte value rather than
                // the literal placeholder text.
                throw new PdfMetadataExtractorException($"Unexpected byte, got {LastByte()}");
        }
    }
}
|
|
|
|
/// <summary>
/// Discards all buffered bytes so the next read fetches fresh data from the stream.
/// </summary>
public void ResetBuffer()
{
    _valid = 0;
    _pos = 0;
}
|
|
|
|
/// <summary>
/// Peeks at the next byte without consuming it.
/// </summary>
/// <param name="expected">The byte value to compare against.</param>
/// <returns>True if the next byte equals <paramref name="expected"/>.</returns>
public bool TestByte(byte expected)
{
    var actual = NextByte();
    PutBack();

    return actual == expected;
}
|
|
|
|
/// <summary>
/// Consumes optional blanks (space, tab, form feed) followed by exactly one line break
/// (LF, CR or CR LF).
/// </summary>
/// <exception cref="PdfMetadataExtractorException">
/// Anything other than blanks precedes the line break, or the stream ended.
/// </exception>
public void ExpectNewline()
{
    while (true)
    {
        var b = NextByte();

        switch ((char)b)
        {
            case ' ':
            case '\t':
            case '\f':
                continue; // Skip whitespace

            case '\n':
                return;

            case '\r':
                // Swallow the LF of a CR LF pair, if present.
                if (NextByte() != '\n')
                {
                    PutBack();
                }

                return;

            default:
                // Interpolated so the message shows the offending byte value.
                throw new PdfMetadataExtractorException($"Unexpected character, expected newline, got {b}");
        }
    }
}
|
|
|
|
/// <summary>
/// Scans forward for the <c>startxref</c> keyword at the beginning of a line
/// (PDF Spec 7.5.5) and returns the integer that follows it.
/// </summary>
/// <returns>The value following <c>startxref</c>.</returns>
/// <exception cref="PdfMetadataExtractorException">
/// The keyword is not followed by an integer, or the stream ended before it was found.
/// </exception>
public long GetXRefStart()
{
    while (true)
    {
        // Advance to the next line break; everything before it is ignored.
        var current = NextByte();

        if (current != '\r' && current != '\n')
        {
            continue;
        }

        // Swallow further line breaks and blanks so we stop on the first
        // significant byte of a line.
        do
        {
            current = NextByte();
        }
        while (current == '\r' || current == '\n' || current == ' ' || current == '\t' || current == '\f');

        PutBack();

        var candidate = NextToken(true);

        if (candidate.Type != TokenType.Keyword || (string)candidate.Value != "startxref")
        {
            continue;
        }

        var offset = NextToken();

        if (offset.Type != TokenType.Int)
        {
            throw new PdfMetadataExtractorException("Expected integer after startxref keyword");
        }

        return (long)offset.Value;
    }
}
|
|
|
|
/// <summary>
/// Consumes one fixed width (20 byte) cross-reference table entry (PDF Spec 7.5.4).
/// </summary>
/// <param name="obj">
/// Receives the first 10-digit field of the entry, but only if it is currently 0;
/// a non-zero value is left untouched.
/// </param>
/// <param name="generation">Receives the 5-digit field, under the same condition.</param>
/// <returns>
/// False if the entry was parsed and its type flag is not <c>n</c>; true otherwise
/// (including when the entry was skipped because <paramref name="obj"/> was already set).
/// </returns>
/// <exception cref="PdfMetadataExtractorException">Fewer than 20 bytes are available.</exception>
public bool NextXRefEntry(ref long obj, ref int generation)
{
    const int entryLength = 20;

    WantLookahead(entryLength);

    if (_valid - _pos < entryLength)
    {
        throw new PdfMetadataExtractorException("End of stream");
    }

    var inUse = true;

    if (obj == 0)
    {
        // Layout: 10 digit offset, space, 5 digit generation, space, type flag, line end.
        var entry = Encoding.ASCII.GetString(_buffer, _pos, entryLength);

        obj = Convert.ToInt64(entry.Substring(0, 10));
        generation = Convert.ToInt32(entry.Substring(11, 5));
        inUse = entry[17] == 'n';
    }

    _pos += entryLength;

    return inUse;
}
|
|
|
|
/// <summary>
/// Reads the body of a stream object (PDF Spec 7.3.8), positioned just after the
/// <c>stream</c> keyword.
/// At the moment, only uncompressed streams or the FlateDecode (PDF Spec 7.4.1) filter
/// with no parameters are accepted. These cover the vast majority of streams we're interested in.
/// </summary>
/// <param name="length">Number of raw bytes in the stream body.</param>
/// <param name="deflate">True to wrap the data in a zlib decompressor.</param>
/// <returns>A readable stream positioned at the start of the (decoded) data.</returns>
/// <exception cref="PdfMetadataExtractorException">
/// <paramref name="length"/> is negative, or the keyword is not followed by a line break.
/// </exception>
public Stream StreamObject(int length, bool deflate)
{
    // Callers pass -1 when the dictionary had no usable /Length; fail with a parser
    // error instead of an ArgumentOutOfRangeException from the buffer copy below.
    if (length < 0)
    {
        throw new PdfMetadataExtractorException("Invalid stream length");
    }

    var rawData = new MemoryStream();

    ExpectNewline();

    // First drain whatever part of the stream body is already in the lexer buffer.
    if (_pos < _valid)
    {
        var buffered = Math.Min(_valid - _pos, length);

        rawData.Write(_buffer, _pos, buffered);
        length -= buffered;
        _pos += buffered;
    }

    // Then read the remainder directly from the underlying stream.
    while (length > 0)
    {
        var chunk = Math.Min(length, BufferSize);

        stream.ReadExactly(_buffer, 0, chunk);
        rawData.Write(_buffer, 0, chunk);

        // The buffer was used as scratch space, so it no longer holds lookahead data.
        _pos = 0;
        _valid = 0;
        length -= chunk;
    }

    rawData.Seek(0, SeekOrigin.Begin);

    if (deflate)
    {
        return new ZLibStream(rawData, CompressionMode.Decompress, false);
    }

    return rawData;
}
|
|
|
|
/// <summary>
/// Returns the next byte, refilling the buffer from the stream when it is exhausted.
/// </summary>
/// <exception cref="PdfMetadataExtractorException">No more data is available.</exception>
private byte NextByte()
{
    if (_pos < _valid)
    {
        return _buffer[_pos++];
    }

    _pos = 0;
    _valid = stream.Read(_buffer, 0, BufferSize);

    if (_valid <= 0)
    {
        throw new PdfMetadataExtractorException("End of stream");
    }

    return _buffer[_pos++];
}
|
|
|
|
/// <summary>Returns the byte just before the current read position in the buffer.</summary>
private byte LastByte() => _buffer[_pos - 1];
|
|
|
|
/// <summary>Steps the read position back by one byte so that byte is read again.</summary>
private void PutBack()
{
    _pos -= 1;
}
|
|
|
|
/// <summary>Consumes the next byte and verifies that it has the given value.</summary>
/// <param name="expected">The required byte value.</param>
/// <exception cref="PdfMetadataExtractorException">A different byte was read.</exception>
private void ExpectByte(byte expected)
{
    var actual = NextByte();

    if (actual == expected)
    {
        return;
    }

    throw new PdfMetadataExtractorException($"Unexpected character, expected {expected}");
}
|
|
|
|
/// <summary>
/// Tries to make at least <paramref name="length"/> bytes available in the buffer starting
/// at the current position. Unread bytes are moved to the front of the buffer and the rest
/// is topped up from the stream. Fewer bytes may be available if the stream ends first,
/// so callers must still check.
/// </summary>
/// <param name="length">Desired number of contiguous readable bytes.</param>
private void WantLookahead(int length)
{
    if (_pos + length <= _valid)
    {
        return;
    }

    var unread = _valid - _pos;

    Buffer.BlockCopy(_buffer, _pos, _buffer, 0, unread);
    _valid = unread;
    _pos = 0;

    // Stream.Read may return fewer bytes than requested even before the end of the
    // stream, so keep reading until the lookahead is satisfied or no data is left.
    while (_valid < length && _valid < BufferSize)
    {
        var read = stream.Read(_buffer, _valid, BufferSize - _valid);

        if (read <= 0)
        {
            break;
        }

        _valid += read;
    }
}
|
|
|
|
/// <summary>
/// Consumes the rest of the current line, including its line break (LF, CR or CR LF).
/// </summary>
private void SkipComment()
{
    byte current;

    do
    {
        current = NextByte();
    }
    while (current != '\n' && current != '\r');

    // A CR may be followed by an LF belonging to the same line break.
    if (current == '\r' && NextByte() != '\n')
    {
        PutBack();
    }
}
|
|
|
|
/// <summary>
/// Scans a numeric token whose first character has already been consumed. Also recognizes
/// the two constructs that start with an integer: indirect references (<c>N G R</c>,
/// PDF Spec 7.3.10) and object headers (<c>N G obj</c>).
/// </summary>
/// <returns>
/// A <see cref="TokenType.Double"/> if the number contains a decimal point, otherwise an
/// <see cref="TokenType.ObjectRef"/>, <see cref="TokenType.ObjectStart"/> or
/// <see cref="TokenType.Int"/> token carrying a long.
/// </returns>
private Token ScanNumber()
{
    // PDF numbers always use '.' as decimal separator and ASCII signs, so they must be
    // parsed independently of the current thread culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    StringBuilder sb = new();
    var hasDot = LastByte() == '.';
    var followedBySpace = false;

    sb.Append((char)LastByte());

    while (true)
    {
        var b = NextByte();

        if (b == '.' || (b >= '0' && b <= '9'))
        {
            sb.Append((char)b);
            hasDot |= b == '.';
        }
        else
        {
            followedBySpace = b == ' ' || b == '\t';
            PutBack();

            break;
        }
    }

    var text = sb.ToString();

    if (hasDot)
    {
        return new Token(TokenType.Double, double.Parse(text, invariant));
    }

    if (followedBySpace)
    {
        // Look ahead to see if it's an object reference (PDF Spec 7.3.10)
        WantLookahead(32);

        var savedPos = _pos;
        var b = NextByte();

        while (b == ' ' || b == '\t')
        {
            b = NextByte();
        }

        // Generation number (ignored)
        while (b >= '0' && b <= '9')
        {
            b = NextByte();
        }

        while (b == ' ' || b == '\t')
        {
            b = NextByte();
        }

        if (b == 'R')
        {
            return new Token(TokenType.ObjectRef, long.Parse(text, invariant));
        }

        if (b == 'o' && NextByte() == 'b' && NextByte() == 'j')
        {
            return new Token(TokenType.ObjectStart, long.Parse(text, invariant));
        }

        // Plain integer after all: rewind to just after the number.
        _pos = savedPos;
    }

    return new Token(TokenType.Int, long.Parse(text, invariant));
}
|
|
|
|
/// <summary>Converts an ASCII hexadecimal digit to its numeric value (0-15).</summary>
/// <param name="b">The ASCII code of the digit; upper and lower case are accepted.</param>
/// <exception cref="PdfMetadataExtractorException">The byte is not a hexadecimal digit.</exception>
private static int HexDigit(byte b)
{
    return (char) b switch
    {
        >= '0' and <= '9' => b - (byte) '0',
        >= 'a' and <= 'f' => b - (byte) 'a' + 10,
        >= 'A' and <= 'F' => b - (byte) 'A' + 10,
        // Interpolated so the message shows the offending byte value.
        _ => throw new PdfMetadataExtractorException($"Invalid hex digit, got {b}")
    };
}
|
|
|
|
/// <summary>
/// Scans a name object (PDF Spec 7.3.5) whose leading slash has already been consumed.
/// <c>#xx</c> sequences are replaced by the byte they encode.
/// </summary>
/// <returns>A <see cref="TokenType.Name"/> token carrying the name as a string.</returns>
private Token ScanName()
{
    var name = new StringBuilder();

    while (true)
    {
        var b = NextByte();
        var c = (char)b;

        // Whitespace ends the name and is consumed.
        if (c is ' ' or '\t' or '\n' or '\f' or '\r')
        {
            return new Token(TokenType.Name, name.ToString());
        }

        // A delimiter ends the name too, but belongs to the next token.
        if (c is '(' or ')' or '[' or ']' or '{' or '}' or '<' or '>' or '/' or '%')
        {
            PutBack();

            return new Token(TokenType.Name, name.ToString());
        }

        if (c == '#')
        {
            var high = NextByte();
            var low = NextByte();

            b = (byte)((HexDigit(high) << 4) | HexDigit(low));
        }

        name.Append((char)b);
    }
}
|
|
|
|
/// <summary>
/// Scans a literal string (PDF Spec 7.3.4.2) whose opening parenthesis has already been
/// consumed. Balanced nested parentheses are part of the string.
/// </summary>
/// <remarks>
/// Escape handling follows the specification: <c>\n \r \t \b \f</c>, escaped parentheses
/// and backslashes, octal codes with one to three digits, and a backslash at the end of a
/// line as line continuation. For any other character the backslash is ignored and the
/// character itself is kept. All resulting bytes go through the string decoder, so escapes
/// also work inside UTF-16 encoded strings.
/// </remarks>
/// <returns>A <see cref="TokenType.String"/> token carrying the decoded text.</returns>
private Token ScanString()
{
    PdfStringBuilder sb = new();
    var parenLevel = 1;

    while (true)
    {
        var b = NextByte();

        switch ((char)b)
        {
            case '(':
                parenLevel++;
                sb.AppendByte(b);

                break;

            case ')':
                if (--parenLevel == 0)
                {
                    return new Token(TokenType.String, sb.ToString());
                }

                sb.AppendByte(b);

                break;

            case '\\':
                b = NextByte();

                switch ((char)b)
                {
                    case 'b':
                        sb.AppendByte(0x08);

                        break;

                    case 'f':
                        sb.AppendByte(0x0C);

                        break;

                    case 'n':
                        sb.AppendByte(0x0A);

                        break;

                    case 'r':
                        sb.AppendByte(0x0D);

                        break;

                    case 't':
                        sb.AppendByte(0x09);

                        break;

                    case '\r':
                        // Line continuation: drop the line break (CR or CR LF).
                        if (NextByte() != '\n')
                        {
                            PutBack();
                        }

                        break;

                    case '\n':
                        // Line continuation: drop the line break.
                        break;

                    case >= '0' and <= '7':
                    {
                        // Octal code with one, two or three digits.
                        var code = b - '0';

                        for (var digits = 1; digits < 3; ++digits)
                        {
                            var next = NextByte();

                            if (next < '0' || next > '7')
                            {
                                PutBack();

                                break;
                            }

                            code = (code << 3) | (next - '0');
                        }

                        // High-order overflow (codes above 0o377) is ignored.
                        sb.AppendByte((byte)(code & 0xFF));

                        break;
                    }

                    default:
                        // Covers \( \) \\ and unknown escapes: keep the character itself.
                        sb.AppendByte(b);

                        break;
                }

                break;

            default:
                sb.AppendByte(b);

                break;
        }
    }
}
|
|
|
|
/// <summary>
/// Scans a hexadecimal string (PDF Spec 7.3.4.3) whose opening <c>&lt;</c> has already been
/// consumed. Whitespace between digits is ignored, and a missing final digit is
/// treated as 0.
/// </summary>
/// <returns>A <see cref="TokenType.String"/> token carrying the decoded text.</returns>
/// <exception cref="PdfMetadataExtractorException">
/// A byte that is neither a hex digit, whitespace nor <c>&gt;</c> was found.
/// </exception>
private Token ScanHexString()
{
    PdfStringBuilder sb = new();

    // Value of the first digit of the current pair, or -1 if no digit is pending.
    var pendingHigh = -1;

    while (true)
    {
        var b = NextByte();

        switch ((char)b)
        {
            case ' ':
            case '\x00':
            case '\t':
            case '\n':
            case '\f':
            case '\r':
                continue; // White-space characters are ignored inside hex strings

            case (>= '0' and <= '9') or (>= 'a' and <= 'f') or (>= 'A' and <= 'F'):
                if (pendingHigh < 0)
                {
                    pendingHigh = HexDigit(b);
                }
                else
                {
                    sb.AppendByte((byte)(pendingHigh << 4 | HexDigit(b)));
                    pendingHigh = -1;
                }

                break;

            case '>':
                // Odd number of digits: the final digit is assumed to be 0.
                if (pendingHigh >= 0)
                {
                    sb.AppendByte((byte)(pendingHigh << 4));
                }

                return new Token(TokenType.String, sb.ToString());

            default:
                // Interpolated so the message shows the offending byte value.
                throw new PdfMetadataExtractorException($"Invalid hex string, got {b}");
        }
    }
}
|
|
|
|
/// <summary>
/// Scans a run of ASCII letters whose first letter has already been consumed and maps it
/// to a token. Words without a dedicated token type are returned as
/// <see cref="TokenType.Keyword"/>.
/// </summary>
private Token ScanKeyword()
{
    static bool IsAsciiLetter(byte b) => (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z');

    var word = new StringBuilder();

    word.Append((char)LastByte());

    for (var b = NextByte(); IsAsciiLetter(b); b = NextByte())
    {
        word.Append((char)b);
    }

    // The byte that ended the word belongs to the next token.
    PutBack();

    var text = word.ToString();

    return text switch
    {
        "true" => new Token(TokenType.Bool, true),
        "false" => new Token(TokenType.Bool, false),
        "stream" => new Token(TokenType.StreamStart, true),
        "endstream" => new Token(TokenType.StreamEnd, true),
        "endobj" => new Token(TokenType.ObjectEnd, true),
        _ => new Token(TokenType.Keyword, text),
    };
}
|
|
}
|
|
|
|
internal class PdfMetadataExtractor : IPdfMetadataExtractor
|
|
{
|
|
private readonly ILogger<BookService> _logger;
|
|
private readonly PdfLexer _lexer;
|
|
private readonly FileStream _stream;
|
|
private long[] _objectOffsets = new long[0];
|
|
private readonly Dictionary<string, string> _metadata = [];
|
|
private readonly Stack<MetadataRef> _metadataRef = new();
|
|
|
|
/// <summary>
/// Object numbers of the two metadata sources named by one trailer: the document catalog
/// (/Root) and the document information dictionary (/Info). -1 means "not present".
/// </summary>
private struct MetadataRef
{
    public long Root;
    public long Info;

    public MetadataRef(long root, long info)
    {
        Root = root;
        Info = info;
    }
}
|
|
|
|
/// <summary>
/// One subsection of a cross-reference stream: the first object number it covers
/// and the number of consecutive entries.
/// </summary>
private struct XRefSection
{
    public readonly long First;
    public readonly long Count;

    public XRefSection(long first, long count)
    {
        First = first;
        Count = count;
    }
}
|
|
|
|
/// <summary>
/// Opens the given PDF file and immediately extracts its metadata; the result is
/// available through <see cref="GetMetadata"/> afterwards.
/// </summary>
/// <param name="logger">Logger used for diagnostic output.</param>
/// <param name="filename">Path of the PDF file to read.</param>
/// <exception cref="PdfMetadataExtractorException">The file could not be parsed.</exception>
public PdfMetadataExtractor(ILogger<BookService> logger, string filename)
{
    _logger = logger;
    _stream = File.OpenRead(filename);
    _lexer = new PdfLexer(_stream);

    try
    {
        ReadObjectOffsets();
        ReadMetadata(filename);
    }
    finally
    {
        // All parsing happens here in the constructor, so the file handle is not needed
        // afterwards. Release it on success and on failure instead of leaking it.
        _stream.Dispose();
    }
}
|
|
|
|
/// <summary>
/// Returns the metadata collected while the file was parsed, keyed by field name
/// (for example "Title" or "Author").
/// </summary>
public Dictionary<string, string> GetMetadata() => _metadata;
|
|
|
|
/// <summary>Writes all collected metadata entries to the trace log.</summary>
/// <param name="filename">Path of the file the metadata belongs to (logged as context).</param>
private void LogMetadata(string filename)
{
    _logger.LogTrace("Metadata for {Path}:", filename);

    foreach (var (key, value) in _metadata)
    {
        _logger.LogTrace(" {Key:0,-5} : {Value:1}", key, value);
    }
}
|
|
|
|
/// <summary>
/// Locates the <c>startxref</c> entry at the end of the file and reads the
/// cross-reference data it points to.
/// </summary>
private void ReadObjectOffsets()
{
    // Look for file trailer (PDF Spec 7.5.5).
    // Spec says trailer must be strictly at end of file.
    // Adobe software accepts trailer within last 1K of EOF,
    // but in practice, virtually all PDFs have trailer at end.
    // Hence only the final 32 bytes are searched.
    _stream.Seek(-32, SeekOrigin.End);

    ReadXRefAndTrailer(_lexer.GetXRefStart());
}
|
|
|
|
// Offsets of cross-reference sections that have already been processed. Used to break
// cycles in /Prev and /XRefStm chains of corrupt files, which would otherwise recurse
// until the stack overflows.
private readonly HashSet<long> _visitedXRefOffsets = new();

/// <summary>
/// Reads the cross-reference information at the given file offset, either a classic
/// cross-reference table followed by a trailer (PDF Spec 7.5.4) or a cross-reference
/// stream (PDF Spec 7.5.8), and records the object offsets found there.
/// Offsets that are already known are not overwritten.
/// </summary>
/// <param name="xrefOffset">File offset of the cross-reference section.</param>
/// <exception cref="PdfMetadataExtractorException">The section is malformed.</exception>
private void ReadXRefAndTrailer(long xrefOffset)
{
    if (!_visitedXRefOffsets.Add(xrefOffset))
    {
        // Already processed: the file's cross-reference chain loops back on itself.
        return;
    }

    _stream.Seek(xrefOffset, SeekOrigin.Begin);
    _lexer.ResetBuffer();

    if (!_lexer.TestByte((byte)'x'))
    {
        // Cross-reference stream (PDF Spec 7.5.8)
        ReadXRefStream();

        return;
    }

    // Cross-reference table (PDF Spec 7.5.4)
    var token = _lexer.NextToken();

    if (token.Type != PdfLexer.TokenType.Keyword || (string)token.Value != "xref")
    {
        throw new PdfMetadataExtractorException("Expected xref keyword");
    }

    while (true)
    {
        token = _lexer.NextToken();

        if (token.Type == PdfLexer.TokenType.Keyword && (string)token.Value == "trailer")
        {
            break;
        }

        if (token.Type != PdfLexer.TokenType.Int)
        {
            throw new PdfMetadataExtractorException("Unexpected token in xref");
        }

        // Subsection header: first object number and number of entries.
        var startObj = (long)token.Value;

        token = _lexer.NextToken();

        if (token.Type != PdfLexer.TokenType.Int)
        {
            throw new PdfMetadataExtractorException("Expected number of objects in xref subsection");
        }

        var numObj = (long)token.Value;

        // Reject ranges that are negative or do not fit an array index.
        if (startObj < 0 || numObj < 0 || numObj > int.MaxValue - startObj)
        {
            throw new PdfMetadataExtractorException("Invalid xref subsection range");
        }

        var endObj = startObj + numObj;

        if (_objectOffsets.Length < endObj)
        {
            Array.Resize(ref _objectOffsets, (int)endObj);
        }

        _lexer.ExpectNewline();

        var generation = 0;

        for (var obj = startObj; obj < endObj; ++obj)
        {
            var inUse = _lexer.NextXRefEntry(ref _objectOffsets[obj], ref generation);

            if (!inUse)
            {
                _objectOffsets[obj] = 0;
            }
        }
    }

    ReadTrailerDictionary();
}
|
|
|
|
/// <summary>
/// Reads a cross-reference stream object (PDF Spec 7.5.8) at the current position:
/// parses its dictionary, records the offsets of uncompressed (type 1) objects that are
/// not yet known, remembers the /Root and /Info references, and then follows /Prev to
/// older cross-reference data.
/// </summary>
/// <exception cref="PdfMetadataExtractorException">
/// The object is malformed, truncated, encrypted, or uses an unsupported filter.
/// </exception>
private void ReadXRefStream()
{
    if (_lexer.NextToken().Type != PdfLexer.TokenType.ObjectStart)
    {
        throw new PdfMetadataExtractorException("Expected obj keyword");
    }

    long length = -1;
    long size = -1;
    var deflate = false;
    long prev = -1;
    long typeWidth = -1;
    long offsetWidth = -1;
    long generationWidth = -1;
    Queue<XRefSection> sections = new();
    var meta = new MetadataRef(-1, -1);

    // Cross-reference stream dictionary (PDF Spec 7.5.8.2)
    ParseDictionary((key, value) =>
    {
        switch (key)
        {
            case "Type":
                if (value.Type != PdfLexer.TokenType.Name || (string)value.Value != "XRef")
                {
                    throw new PdfMetadataExtractorException("Expected /Type to be /XRef");
                }

                return true;

            case "Length":
                if (value.Type != PdfLexer.TokenType.Int)
                {
                    throw new PdfMetadataExtractorException("Expected integer after /Length");
                }

                length = (long)value.Value;

                return true;

            case "Size":
                if (value.Type != PdfLexer.TokenType.Int)
                {
                    throw new PdfMetadataExtractorException("Expected integer after /Size");
                }

                size = (long)value.Value;

                return true;

            case "Prev":
                if (value.Type != PdfLexer.TokenType.Int)
                {
                    throw new PdfMetadataExtractorException("Expected offset after /Prev");
                }

                prev = (long)value.Value;

                return true;

            case "Index":
                if (value.Type != PdfLexer.TokenType.ArrayStart)
                {
                    throw new PdfMetadataExtractorException("Expected array after /Index");
                }

                // Pairs of (first object number, entry count) until the array closes.
                while (true)
                {
                    var firstToken = _lexer.NextToken();

                    if (firstToken.Type == PdfLexer.TokenType.ArrayEnd)
                    {
                        break;
                    }

                    if (firstToken.Type != PdfLexer.TokenType.Int)
                    {
                        throw new PdfMetadataExtractorException("Expected integer in /Index array");
                    }

                    var countToken = _lexer.NextToken();

                    if (countToken.Type != PdfLexer.TokenType.Int)
                    {
                        throw new PdfMetadataExtractorException("Expected integer pair in /Index array");
                    }

                    sections.Enqueue(new XRefSection((long)firstToken.Value, (long)countToken.Value));
                }

                return true;

            case "W":
                if (value.Type != PdfLexer.TokenType.ArrayStart)
                {
                    throw new PdfMetadataExtractorException("Expected array after /W");
                }

                // Byte widths of the three fields of every entry.
                var widths = new long[3];

                for (var i = 0; i < 3; ++i)
                {
                    var widthToken = _lexer.NextToken();

                    if (widthToken.Type != PdfLexer.TokenType.Int)
                    {
                        throw new PdfMetadataExtractorException("Expected integer in /W array");
                    }

                    widths[i] = (long)widthToken.Value;
                }

                if (_lexer.NextToken().Type != PdfLexer.TokenType.ArrayEnd)
                {
                    throw new PdfMetadataExtractorException("Unclosed array after /W");
                }

                typeWidth = widths[0];
                offsetWidth = widths[1];
                generationWidth = widths[2];

                return true;

            case "Filter":
                if (value.Type != PdfLexer.TokenType.Name)
                {
                    throw new PdfMetadataExtractorException("Expected name after /Filter");
                }

                if ((string)value.Value != "FlateDecode")
                {
                    throw new PdfMetadataExtractorException("Unsupported filter, only FlateDecode is supported");
                }

                deflate = true;

                return true;

            case "Root":
                if (value.Type != PdfLexer.TokenType.ObjectRef)
                {
                    throw new PdfMetadataExtractorException("Expected object reference after /Root");
                }

                meta.Root = (long)value.Value;

                return true;

            case "Info":
                if (value.Type != PdfLexer.TokenType.ObjectRef)
                {
                    throw new PdfMetadataExtractorException("Expected object reference after /Info");
                }

                meta.Info = (long)value.Value;

                return true;

            case "Encrypt":
                // Same policy as for classic trailers: encrypted documents are rejected.
                throw new PdfMetadataExtractorException("Encryption not supported");

            default:
                return false;
        }
    });

    if (_lexer.NextToken().Type != PdfLexer.TokenType.StreamStart)
    {
        throw new PdfMetadataExtractorException("Expected xref stream after dictionary");
    }

    if (length < 0 || length > int.MaxValue)
    {
        throw new PdfMetadataExtractorException("Missing or invalid /Length in xref stream");
    }

    if (size > int.MaxValue)
    {
        throw new PdfMetadataExtractorException("Invalid /Size in xref stream");
    }

    using (var xrefData = _lexer.StreamObject((int)length, deflate))
    {
        // Without /Index the stream covers objects 0 .. Size-1.
        if (sections.Count == 0)
        {
            sections.Enqueue(new XRefSection(0, size));
        }

        if (_objectOffsets.Length < size)
        {
            Array.Resize(ref _objectOffsets, (int)size);
        }

        while (sections.Count > 0)
        {
            var section = sections.Dequeue();

            if (section.Count <= 0)
            {
                continue;
            }

            if (section.First < 0 || section.Count > int.MaxValue - section.First)
            {
                throw new PdfMetadataExtractorException("Invalid /Index range in xref stream");
            }

            var sectionEnd = section.First + section.Count;

            // /Index may name objects beyond /Size in damaged files; grow instead of
            // indexing out of range.
            if (_objectOffsets.Length < sectionEnd)
            {
                Array.Resize(ref _objectOffsets, (int)sectionEnd);
            }

            for (var i = section.First; i < sectionEnd; ++i)
            {
                // A zero width type field means the default type 1.
                var type = typeWidth == 0 ? 1L : ReadField(xrefData, typeWidth);
                var offset = ReadField(xrefData, offsetWidth);

                // The generation field is not needed, but must be consumed.
                ReadField(xrefData, generationWidth);

                if (type == 1 && _objectOffsets[i] == 0)
                {
                    _objectOffsets[i] = offset;
                }
            }
        }
    }

    // Register this (newer) trailer before following /Prev, exactly like
    // ReadTrailerDictionary does, so that newer metadata is applied last and wins.
    PushMetadataRef(meta);

    if (prev > -1)
    {
        ReadXRefAndTrailer(prev);
    }

    // Reads one big endian field of the given byte width.
    static long ReadField(Stream source, long width)
    {
        long result = 0;

        for (var j = 0; j < width; ++j)
        {
            var next = source.ReadByte();

            if (next < 0)
            {
                throw new PdfMetadataExtractorException("Unexpected end of xref stream");
            }

            result = (result << 8) | (uint)next;
        }

        return result;
    }
}
|
|
|
|
/// <summary>
/// Remembers the metadata references of a trailer for later processing. References that
/// equal those of the most recently pushed entry are blanked out, and an entry without
/// any remaining reference is not pushed at all.
/// </summary>
/// <param name="meta">The /Root and /Info object numbers (-1 if absent).</param>
private void PushMetadataRef(MetadataRef meta)
{
    if (_metadataRef.TryPeek(out var latest))
    {
        if (meta.Root == latest.Root)
        {
            meta.Root = -1;
        }

        if (meta.Info == latest.Info)
        {
            meta.Info = -1;
        }
    }

    if (meta.Root == -1 && meta.Info == -1)
    {
        return;
    }

    _metadataRef.Push(meta);
}
|
|
|
|
/// <summary>
/// Reads the trailer dictionary (PDF Spec 7.5.5) that follows a cross-reference table,
/// remembers its /Root and /Info references, and then follows /XRefStm and /Prev to
/// further cross-reference data.
/// </summary>
/// <exception cref="PdfMetadataExtractorException">
/// An entry has an unexpected type, or the document is encrypted.
/// </exception>
private void ReadTrailerDictionary()
{
    // Returns the numeric payload of a token after checking its type.
    static long Require(PdfLexer.Token value, PdfLexer.TokenType expected, string error)
    {
        if (value.Type != expected)
        {
            throw new PdfMetadataExtractorException(error);
        }

        return (long)value.Value;
    }

    long prev = -1;
    long xrefStm = -1;
    MetadataRef meta = new(-1, -1);

    ParseDictionary((key, value) =>
    {
        if (key == "Root")
        {
            meta.Root = Require(value, PdfLexer.TokenType.ObjectRef, "Expected object reference after /Root");
            return true;
        }

        if (key == "Prev")
        {
            prev = Require(value, PdfLexer.TokenType.Int, "Expected offset after /Prev");
            return true;
        }

        if (key == "Info")
        {
            meta.Info = Require(value, PdfLexer.TokenType.ObjectRef, "Expected object reference after /Info");
            return true;
        }

        if (key == "XRefStm")
        {
            // Prefer encoded xref stream over xref table
            xrefStm = Require(value, PdfLexer.TokenType.Int, "Expected offset after /XRefStm");
            return true;
        }

        if (key == "Encrypt")
        {
            throw new PdfMetadataExtractorException("Encryption not supported");
        }

        return false;
    });

    PushMetadataRef(meta);

    if (xrefStm != -1)
    {
        ReadXRefAndTrailer(xrefStm);
    }

    if (prev != -1)
    {
        ReadXRefAndTrailer(prev);
    }
}
|
|
|
|
/// <summary>
/// Processes all remembered metadata references. For each one the document information
/// dictionary is read first and the XMP metadata stream second, so values from the
/// stream replace values from the dictionary.
/// </summary>
/// <param name="filename">Path of the file being read; currently not used by this method.</param>
private void ReadMetadata(string filename)
{
    // We read potential metadata sources in backwards historical order, so
    // we can overwrite to our heart's content
    while (_metadataRef.TryPop(out var meta))
    {
        ReadMetadataFromInfo(meta.Info);

        var metadataObj = MetadataObjInObjectCatalog(meta.Root);

        ReadMetadataFromXml(metadataObj);
    }
}
|
|
|
|
/// <summary>
/// Reads the document information dictionary (PDF Spec 14.3.3) and stores its standard
/// text entries in the metadata. Values given as indirect references are resolved.
/// We treat this source as less authoritative than the Metadata stream.
/// </summary>
/// <param name="infoObj">
/// Object number of the information dictionary; numbers without a known offset are ignored.
/// </param>
/// <exception cref="PdfMetadataExtractorException">The dictionary is malformed.</exception>
private void ReadMetadataFromInfo(long infoObj)
{
    if (infoObj < 1 || infoObj >= _objectOffsets.Length || _objectOffsets[infoObj] == 0)
    {
        return;
    }

    _stream.Seek(_objectOffsets[infoObj], SeekOrigin.Begin);
    _lexer.ResetBuffer();

    var token = _lexer.NextToken();

    if (token.Type != PdfLexer.TokenType.ObjectStart)
    {
        throw new PdfMetadataExtractorException("Expected object header");
    }

    // Entries whose value is stored in a separate object; resolved after the dictionary
    // has been read completely, because resolving moves the file position.
    Dictionary<string, long> indirectObjects = [];

    ParseDictionary(delegate(string key, PdfLexer.Token value)
    {
        switch (key)
        {
            case "Title":
            case "Author":
            case "Subject":
            case "Keywords":
            case "Creator":
            case "Producer":
            case "CreationDate":
            case "ModDate":
                if (value.Type == PdfLexer.TokenType.ObjectRef)
                {
                    indirectObjects[key] = (long)value.Value;
                }
                else if (value.Type != PdfLexer.TokenType.String)
                {
                    throw new PdfMetadataExtractorException("Expected string value");
                }
                else
                {
                    _metadata[key] = (string)value.Value;
                }

                return true;

            default:
                return false;
        }
    });

    // Resolve indirectly referenced values
    foreach (var (key, objNumber) in indirectObjects)
    {
        // A reference to an object that is not defined is not an error (it denotes the
        // null object), so such entries are skipped instead of indexing out of range.
        if (objNumber < 1 || objNumber >= _objectOffsets.Length || _objectOffsets[objNumber] == 0)
        {
            continue;
        }

        _stream.Seek(_objectOffsets[objNumber], SeekOrigin.Begin);
        _lexer.ResetBuffer();

        token = _lexer.NextToken();

        if (token.Type != PdfLexer.TokenType.ObjectStart)
        {
            throw new PdfMetadataExtractorException("Expected object here");
        }

        token = _lexer.NextToken();

        if (token.Type != PdfLexer.TokenType.String)
        {
            throw new PdfMetadataExtractorException("Expected string");
        }

        _metadata[key] = (string)token.Value;
    }
}
|
|
|
|
/// <summary>
/// Looks up the /Metadata entry of the document catalog (PDF Spec 7.7.2).
/// </summary>
/// <param name="rootObj">Object number of the document catalog.</param>
/// <returns>
/// The object number of the metadata stream, or -1 if the catalog has no known offset
/// or contains no /Metadata entry.
/// </returns>
/// <exception cref="PdfMetadataExtractorException">The catalog object is malformed.</exception>
private long MetadataObjInObjectCatalog(long rootObj)
{
    var known = rootObj >= 1 && rootObj < _objectOffsets.Length && _objectOffsets[rootObj] != 0;

    if (!known)
    {
        return -1;
    }

    _stream.Seek(_objectOffsets[rootObj], SeekOrigin.Begin);
    _lexer.ResetBuffer();

    if (_lexer.NextToken().Type != PdfLexer.TokenType.ObjectStart)
    {
        throw new PdfMetadataExtractorException("Expected object header");
    }

    long metadataObj = -1;

    ParseDictionary((key, value) =>
    {
        if (key != "Metadata")
        {
            return false;
        }

        if (value.Type != PdfLexer.TokenType.ObjectRef)
        {
            throw new PdfMetadataExtractorException("Expected object number after /Metadata");
        }

        metadataObj = (long)value.Value;

        return true;
    });

    return metadataObj;
}
|
|
|
|
// Obtain metadata from XMP stream object
// See XMP specification: https://developer.adobe.com/xmp/docs/XMPSpecifications/
// and Dublin Core: https://www.dublincore.org/specifications/dublin-core/

/// <summary>
/// Returns the text of the first <c>rdf:li</c> element below the node selected by
/// <paramref name="path"/>, or the text of that node itself if it has no such descendant.
/// </summary>
/// <param name="doc">The parsed XMP document.</param>
/// <param name="ns">Namespace manager that resolves the prefixes used in the path.</param>
/// <param name="path">XPath expression selecting the property.</param>
/// <returns>The text, or null if nothing matches.</returns>
private static string? GetTextFromXmlNode(XmlDocument doc, XmlNamespaceManager ns, string path)
{
    var root = doc.DocumentElement;

    if (root is null)
    {
        return null;
    }

    var node = root.SelectSingleNode(path + "//rdf:li", ns) ?? root.SelectSingleNode(path, ns);

    return node?.InnerText;
}
|
|
|
|
/// <summary>
/// Collects the text of all <c>rdf:li</c> elements below the nodes selected by
/// <paramref name="path"/> into one comma separated string.
/// </summary>
/// <param name="doc">The parsed XMP document.</param>
/// <param name="ns">Namespace manager that resolves the prefixes used in the path.</param>
/// <param name="path">XPath expression selecting the property.</param>
/// <returns>The joined text, or null if there is no document element or no text was found.</returns>
private static string? GetListFromXmlNode(XmlDocument doc, XmlNamespaceManager ns, string path)
{
    var items = doc.DocumentElement?.SelectNodes(path + "//rdf:li", ns);

    if (items is null)
    {
        return null;
    }

    var joined = new StringBuilder();

    foreach (XmlNode item in items)
    {
        // The separator is only added once some text has been collected.
        if (joined.Length != 0)
        {
            joined.Append(',');
        }

        joined.Append(item.InnerText);
    }

    if (joined.Length == 0)
    {
        return null;
    }

    return joined.ToString();
}
|
|
|
|
/// <summary>Stores a metadata value under the given key; null values are ignored.</summary>
/// <param name="key">Name of the metadata field.</param>
/// <param name="value">The value to store, or null to leave the field untouched.</param>
private void SetMetadata(string key, string? value)
{
    if (value is not null)
    {
        _metadata[key] = value;
    }
}
|
|
|
|
/// <summary>
/// Reads the XMP metadata stream (PDF Spec 14.3.2) with the given object number and
/// copies the supported properties into the metadata, replacing existing values.
/// </summary>
/// <param name="meta">
/// Object number of the metadata stream; numbers without a known offset are ignored.
/// </param>
/// <exception cref="PdfMetadataExtractorException">
/// The stream object is malformed or uses an unsupported filter.
/// </exception>
private void ReadMetadataFromXml(long meta)
{
    if (meta < 1 || meta >= _objectOffsets.Length || _objectOffsets[meta] == 0) return;

    _stream.Seek(_objectOffsets[meta], SeekOrigin.Begin);
    _lexer.ResetBuffer();

    var token = _lexer.NextToken();

    if (token.Type != PdfLexer.TokenType.ObjectStart)
    {
        throw new PdfMetadataExtractorException("Expected object header");
    }

    long length = -1;
    var deflate = false;

    // Metadata stream dictionary (PDF Spec 14.3.2)
    ParseDictionary(delegate(string key, PdfLexer.Token value)
    {
        switch (key)
        {
            case "Type":
                if (value.Type != PdfLexer.TokenType.Name || (string)value.Value != "Metadata")
                {
                    throw new PdfMetadataExtractorException("Expected /Type to be /Metadata");
                }

                return true;

            case "Subtype":
                if (value.Type != PdfLexer.TokenType.Name || (string)value.Value != "XML")
                {
                    throw new PdfMetadataExtractorException("Expected /Subtype to be /XML");
                }

                return true;

            case "Length":
                if (value.Type != PdfLexer.TokenType.Int)
                {
                    throw new PdfMetadataExtractorException("Expected integer after /Length");
                }

                length = (long)value.Value;

                return true;

            case "Filter":
                if (value.Type != PdfLexer.TokenType.Name)
                {
                    throw new PdfMetadataExtractorException("Expected name after /Filter");
                }

                if ((string)value.Value != "FlateDecode")
                {
                    throw new PdfMetadataExtractorException("Unsupported filter, only FlateDecode is supported");
                }

                deflate = true;

                return true;

            default:
                return false;
        }
    });

    token = _lexer.NextToken();

    if (token.Type != PdfLexer.TokenType.StreamStart)
    {
        throw new PdfMetadataExtractorException("Expected stream after metadata dictionary");
    }

    if (length < 0 || length > int.MaxValue)
    {
        throw new PdfMetadataExtractorException("Missing or invalid /Length in metadata stream");
    }

    using var xmlStream = _lexer.StreamObject((int)length, deflate);

    // Skip XMP header: everything up to and including the first "?>".
    while (true)
    {
        var b = xmlStream.ReadByte();

        if (b < 0)
        {
            throw new PdfMetadataExtractorException("Reached EOF in XMP header");
        }

        if (b == '?')
        {
            while (b == '?')
            {
                b = xmlStream.ReadByte();
            }

            if (b == '>')
            {
                break;
            }
        }
    }

    // The XML comes from an untrusted file: refuse DTDs and never resolve external
    // resources, so entity expansion or external entity tricks cannot be used.
    var readerSettings = new XmlReaderSettings
    {
        DtdProcessing = DtdProcessing.Prohibit,
        XmlResolver = null,
    };

    var metaDoc = new XmlDocument { XmlResolver = null };

    using (var reader = XmlReader.Create(xmlStream, readerSettings))
    {
        metaDoc.Load(reader);
    }

    var ns = new XmlNamespaceManager(metaDoc.NameTable);
    ns.AddNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
    ns.AddNamespace("dc", "http://purl.org/dc/elements/1.1/");
    ns.AddNamespace("calibreSI", "http://calibre-ebook.com/xmp-namespace-series-index");
    ns.AddNamespace("calibre", "http://calibre-ebook.com/xmp-namespace");
    ns.AddNamespace("pdfx", "http://ns.adobe.com/pdfx/1.3/");
    ns.AddNamespace("prism", "http://prismstandard.org/namespaces/basic/2.0/");
    ns.AddNamespace("xmp", "http://ns.adobe.com/xap/1.0/");

    SetMetadata("CreationDate",
        GetTextFromXmlNode(metaDoc, ns, "//dc:date")
        ?? GetTextFromXmlNode(metaDoc, ns, "//xmp:CreateDate"));
    SetMetadata("Summary", GetTextFromXmlNode(metaDoc, ns, "//dc:description"));
    SetMetadata("Publisher", GetTextFromXmlNode(metaDoc, ns, "//dc:publisher"));
    SetMetadata("Author", GetListFromXmlNode(metaDoc, ns, "//dc:creator"));
    SetMetadata("Title", GetTextFromXmlNode(metaDoc, ns, "//dc:title"));
    SetMetadata("Subject", GetListFromXmlNode(metaDoc, ns, "//dc:subject"));
    SetMetadata("Language", GetTextFromXmlNode(metaDoc, ns, "//dc:language"));
    SetMetadata("ISBN", GetTextFromXmlNode(metaDoc, ns, "//pdfx:isbn") ?? GetTextFromXmlNode(metaDoc, ns, "//prism:isbn"));
    SetMetadata("UserRating", GetTextFromXmlNode(metaDoc, ns, "//calibre:rating"));
    SetMetadata("TitleSort", GetTextFromXmlNode(metaDoc, ns, "//calibre:title_sort"));
    SetMetadata("Series", GetTextFromXmlNode(metaDoc, ns, "//calibre:series/rdf:value"));
    SetMetadata("Volume", GetTextFromXmlNode(metaDoc, ns, "//calibreSI:series_index"));
}
|
|
|
|
/// <summary>
/// Callback invoked for every entry of a dictionary. Returns true if the handler has
/// dealt with the value (including consuming any further tokens that belong to it),
/// false if the value should be skipped.
/// </summary>
private delegate bool DictionaryHandler(string key, PdfLexer.Token value);

/// <summary>
/// Parses a dictionary at the current position and passes each key, together with the
/// first token of its value, to the handler. Values the handler declines are skipped.
/// </summary>
/// <param name="handler">Callback deciding what to do with each entry.</param>
/// <exception cref="PdfMetadataExtractorException">
/// No dictionary starts here, or a key is not a name.
/// </exception>
private void ParseDictionary(DictionaryHandler handler)
{
    if (_lexer.NextToken().Type != PdfLexer.TokenType.DictionaryStart)
    {
        throw new PdfMetadataExtractorException("Expected dictionary");
    }

    for (var key = _lexer.NextToken(); key.Type != PdfLexer.TokenType.DictionaryEnd; key = _lexer.NextToken())
    {
        if (key.Type != PdfLexer.TokenType.Name)
        {
            throw new PdfMetadataExtractorException("Improper token in dictionary");
        }

        var value = _lexer.NextToken();
        var handled = handler((string)key.Value, value);

        if (!handled)
        {
            SkipValue(value);
        }
    }
}
|
|
|
|
/// <summary>
/// Skips one complete value: a simple object, or an entire array or dictionary
/// including everything nested inside it.
/// </summary>
/// <param name="existingToken">
/// The first token of the value if the caller has already read it; otherwise the next
/// token is read from the lexer.
/// </param>
/// <exception cref="PdfMetadataExtractorException">The token cannot start a value.</exception>
private void SkipValue(PdfLexer.Token? existingToken = null)
{
    var token = existingToken ?? _lexer.NextToken();

    switch (token.Type)
    {
        case PdfLexer.TokenType.Bool:
        case PdfLexer.TokenType.Int:
        case PdfLexer.TokenType.Double:
        case PdfLexer.TokenType.Name:
        case PdfLexer.TokenType.String:
        case PdfLexer.TokenType.ObjectRef:
            break;

        // The null object (PDF Spec 7.3.9) is a legal value; the lexer reports it
        // as the keyword "null".
        case PdfLexer.TokenType.Keyword when token.Value is "null":
            break;

        case PdfLexer.TokenType.ArrayStart:
        {
            SkipArray();
            break;
        }

        case PdfLexer.TokenType.DictionaryStart:
        {
            SkipDictionary();
            break;
        }

        default:
            throw new PdfMetadataExtractorException("Unexpected token in SkipValue");
    }
}
|
|
|
|
/// <summary>
/// Skips the remaining elements of an array whose opening bracket has already been
/// consumed, up to and including the closing bracket.
/// </summary>
private void SkipArray()
{
    for (var element = _lexer.NextToken(); element.Type != PdfLexer.TokenType.ArrayEnd; element = _lexer.NextToken())
    {
        SkipValue(element);
    }
}
|
|
|
|
/// <summary>
/// Skips the remaining entries of a dictionary whose opening <c>&lt;&lt;</c> has already been
/// consumed, up to and including the closing <c>&gt;&gt;</c>.
/// </summary>
/// <exception cref="PdfMetadataExtractorException">A key is not a name.</exception>
private void SkipDictionary()
{
    for (var key = _lexer.NextToken(); key.Type != PdfLexer.TokenType.DictionaryEnd; key = _lexer.NextToken())
    {
        if (key.Type != PdfLexer.TokenType.Name)
        {
            throw new PdfMetadataExtractorException("Expected name in dictionary");
        }

        SkipValue();
    }
}
|
|
}
|