Implement serialization of tokens

2025-07-09 03:04:10 -04:00 · 2021-03-19 15:19:16 +05:30 · 2021-03-19 15:19:16 +05:30 · f1e4095225
commit f1e4095225
parent 94756715ef
1 changed files with 120 additions and 14 deletions
--- a/src/calibre/srv/fast_css_transform.cpp
+++ b/src/calibre/srv/fast_css_transform.cpp
@ -32,7 +32,7 @@ is_surrogate(char32_t ch) {
 }
 // Report whether ch is an ASCII hexadecimal digit: 0-9, a-f or A-F.
 static inline bool
 is_hex_digit(char32_t ch) {
    // Every range needs both of its bounds joined with &&. Joining the
    // upper-case bounds with || would accept every code point.
    return ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f') || ('A' <= ch && ch <= 'F');
 }
@ -41,14 +41,19 @@ is_letter(char32_t ch) {
    return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z');
 }
 // True only for the ASCII decimal digits '0' through '9'.
 static inline bool
 is_digit(char32_t ch) {
    const bool below_zero = ch < U'0';
    const bool above_nine = ch > U'9';
    return !(below_zero || above_nine);
 }
 // A character may begin a name when it is a letter, an underscore, or any
 // code point from 0x80 upward.
 static inline bool
 is_name_start(char32_t ch) {
    if (ch >= 0x80) return true;
    if (ch == '_') return true;
    return is_letter(ch);
 }
 static inline bool
-is_digit(char32_t ch) {
+is_name_body(char32_t ch) {
-    return '0' <= ch && ch <= '9';
+    return is_name_start(ch) || is_digit(ch) || ch == '-';
 }
 static inline bool
@ -220,7 +225,6 @@ enum class TokenType : unsigned int {
    whitespace,
    delimiter,
    ident,
    function,
    at_keyword,
    hash,
    string,
@ -228,7 +232,6 @@ enum class TokenType : unsigned int {
    function_start,
    number,
    dimension,
    percentage,
    comment,
    cdo,
    cdc
@ -236,6 +239,8 @@ enum class TokenType : unsigned int {
 class Token {
    // Position tracker used by serialize_ident() while it walks the token text:
    // start = nothing written yet, one_hyphen = a leading '-' has been written,
    // body = the name has begun and name-body characters may pass through.
    enum class NameSerializeState : unsigned { start, one_hyphen, body };
    private:
        TokenType type;
        std::u32string text;
@ -247,6 +252,55 @@ class Token {
            unit_at = 0;
        }
        // Append a backslash escape for ch to out.
        //
        // Characters for which is_whitespace() or is_hex_digit() is true are
        // written as their code point in lower-case hexadecimal followed by one
        // space (the format string is "%x "). Every other character is written
        // literally after the backslash.
        //
        // Throws std::runtime_error if the code point cannot be formatted into
        // the local buffer. The leading backslash has already been appended to
        // out when that happens.
        void serialize_escaped_char(const char32_t ch, std::u32string &out) const {
            out.push_back('\\');
            if (is_whitespace(ch) || is_hex_digit(ch)) {
                // Six hex digits, the trailing space and the terminating NUL.
                char buf[8];
                // %x expects an unsigned int, so convert explicitly.
                const int num = std::snprintf(buf, sizeof(buf), "%x ", static_cast<unsigned int>(ch));
                // snprintf returns the length it wanted to write; a value that
                // does not fit means buf was truncated and must not be copied.
                if (num <= 0 || static_cast<size_t>(num) >= sizeof(buf))
                    throw std::runtime_error("Failed to convert character to hexadecimal escape");
                out.append(buf, buf + num);
            } else out.push_back(ch);
        }
        // Append this token's text to out in identifier form.
        //
        // The first character must satisfy is_name_start() or be '-'; anything
        // else throws std::logic_error. After a leading '-', the next character
        // is written as-is when it is a name-start character or another '-',
        // and escaped otherwise. All remaining characters are written as-is
        // when is_name_body() accepts them and escaped otherwise.
        void serialize_ident(std::u32string &out) const {
            NameSerializeState state = NameSerializeState::start;
            for (const auto ch : text) {
                switch(state) {
                    case NameSerializeState::start:
                        if (is_name_start(ch)) { out.push_back(ch); state = NameSerializeState::body; }
                        else if (ch == '-') { out.push_back(ch); state = NameSerializeState::one_hyphen; }
                        else throw std::logic_error("Unable to serialize ident because of invalid start character");
                        break;
                    case NameSerializeState::one_hyphen:
                        if (is_name_start(ch) || ch == '-') out.push_back(ch);
                        else serialize_escaped_char(ch, out);
                        // Literal or escaped, the character after the hyphen starts
                        // the name proper. Staying in this state would escape every
                        // following digit or other non-name-start character too.
                        state = NameSerializeState::body;
                        break;
                    case NameSerializeState::body:
                        if (is_name_body(ch)) out.push_back(ch);
                        else serialize_escaped_char(ch, out);
                        break;
                }
            }
        }
        // Append this token's text to out for use in a hash token: characters
        // accepted by is_name_body() are copied unchanged, all others go
        // through serialize_escaped_char(). No '#' is written here.
        void serialize_hash(std::u32string &out) const {
            for (auto it = text.cbegin(); it != text.cend(); ++it) {
                const char32_t c = *it;
                if (!is_name_body(c)) {
                    serialize_escaped_char(c, out);
                    continue;
                }
                out.push_back(c);
            }
        }
        // Append this token's text to out as a quoted string.
        //
        // A double quote is used as the delimiter unless the text itself
        // contains one, in which case a single quote is used. The delimiter is
        // written before and after the content; without it the escaping below
        // would be meaningless and the output would not be a string at all.
        // Inside the content, a newline is written as a backslash followed by
        // the newline, and the delimiter and backslash are escaped through
        // serialize_escaped_char().
        void serialize_string(std::u32string &out) const {
            const char32_t delim = text.find('"') == std::u32string::npos ? '"' : '\'';
            out.push_back(delim);
            for (const auto ch : text) {
                if (ch == '\n') out.append({'\\', '\n'});
                else if (ch == delim || ch == '\\') serialize_escaped_char(ch, out);
                else out.push_back(ch);
            }
            out.push_back(delim);
        }
    public:
        Token() :
 			type(TokenType::whitespace), text(), unit_at(0), out_pos(0) {
@ -279,6 +333,7 @@ class Token {
 		// Return the token's current type.
 		TokenType get_type() const { return type; }
        // Replace the token's type with q; nothing else is modified.
        void set_type(const TokenType q) { type = q; }
        // Return the output position stored in this token (out_pos).
        size_t get_output_position() const { return out_pos; }
 		// Store val as this token's output position.
 		void set_output_position(const size_t val) { out_pos = val; }
        // True when the token's type equals q.
        bool is_type(const TokenType q) const { return type == q; }
        // True when this is a delimiter token whose text is exactly the single character ch.
        bool is_delimiter(const char32_t ch) const { return type == TokenType::delimiter && text.size() == 1 && text[0] == ch; }
@ -374,6 +429,53 @@ class Token {
            set_text(scratch);
            return true;
        }
        // Append the textual form of this token to out, choosing the format
        // from the token's type.
        void serialize(std::u32string &out) const {
            switch (type) {
                // These token types store text that is emitted verbatim.
                case TokenType::whitespace:
                case TokenType::delimiter:
                case TokenType::number:
                case TokenType::dimension:
                    out.append(text);
                    break;

                // Name-like tokens: optional prefix or suffix around the name.
                case TokenType::ident:
                    serialize_ident(out);
                    break;
                case TokenType::function_start:
                    serialize_ident(out);
                    out.push_back('(');
                    break;
                case TokenType::at_keyword:
                    out.push_back('@');
                    serialize_ident(out);
                    break;
                case TokenType::hash:
                    out.push_back('#');
                    serialize_hash(out);
                    break;

                // String-like tokens.
                case TokenType::string:
                    serialize_string(out);
                    break;
                case TokenType::url:
                    out.append(U"url(");
                    serialize_string(out);
                    out.push_back(')');
                    break;

                // Tokens with fixed surrounding or fixed complete text.
                case TokenType::comment:
                    out.append(U"/*");
                    out.append(text);
                    out.append(U"*/");
                    break;
                case TokenType::cdo:
                    out.append(U"<!--");
                    break;
                case TokenType::cdc:
                    out.append(U"-->");
                    break;
            }
        }
 };
 class TokenQueue {
@ -384,6 +486,8 @@ class TokenQueue {
 		std::string scratch, scratch2;
 		pyobject_raii url_callback;
 		// Current size of out; new_token() passes this value along when it creates a token.
 		size_t current_output_position() const { return out.size(); }
        void new_token(const TokenType type, const char32_t ch = 0) {
            if (pool.empty()) queue.emplace_back(type, ch, current_output_position());
            else {
@ -394,8 +498,6 @@ class TokenQueue {
            }
        }
 		size_t current_output_position() const { return out.size(); }
        void add_char_of_type(const TokenType type, const char32_t ch) {
            if (!queue.empty() && queue.back().is_type(type)) queue.back().add_char(ch);
            else new_token(type, ch);
@ -552,16 +654,16 @@ class TokenQueue {
        // Request a new delimiter token from new_token(), passing ch as its character.
        void add_delimiter(const char32_t ch) { new_token(TokenType::delimiter, ch); }
-        void add_hash() { new_token(TokenType::hash, '#'); }
+        void add_hash() { new_token(TokenType::hash); }
-        void add_at_keyword() { new_token(TokenType::at_keyword, '@'); }
+        void add_at_keyword() { new_token(TokenType::at_keyword); }
        // Request a new number token from new_token(), passing ch as its character.
        void add_number(const char32_t ch) { new_token(TokenType::number, ch); }
        // Request a new ident token from new_token(), passing ch as its character.
        void add_ident(const char32_t ch) { new_token(TokenType::ident, ch); }
-        void add_cdc() { new_token(TokenType::cdc, '-'); queue.back().add_char('-'); queue.back().add_char('>'); }
+        void add_cdc() { new_token(TokenType::cdc); }
-        void add_cdo() { new_token(TokenType::cdo, '<'); queue.back().add_char('-'); queue.back().add_char('-'); }
+        void add_cdo() { new_token(TokenType::cdo); }
        void mark_unit() {
            if (queue.empty()) throw std::logic_error("Attempting to mark unit with no token present");
@ -596,6 +698,10 @@ class TokenQueue {
 			} else {
 				if (process_declaration()) changed = true;
 			}
            if (changed && queue.size()) {
                out.resize(queue[0].get_output_position());
                for (auto tok : queue) tok.serialize(out);
            }
 			return_tokens_to_pool();
 		}
 };
@ -761,7 +867,7 @@ class Parser {
        void handle_escape() {
            if (!escape_buf_pos) {
                if (ch == '\n') { reconsume(); states.pop(); return; }
-                if (!is_hex_char(ch)) {
+                if (!is_hex_digit(ch)) {
                    states.pop();
                    token_queue.add_char(ch);
                    return;
@ -769,7 +875,7 @@ class Parser {
                escape_buf[escape_buf_pos++] = (char)ch;
                return;
            }
-            if (is_hex_char(ch) && escape_buf_pos < 6) { escape_buf[escape_buf_pos++] = (char)ch; return; }
+            if (is_hex_digit(ch) && escape_buf_pos < 6) { escape_buf[escape_buf_pos++] = (char)ch; return; }
            if (is_whitespace(ch)) return;  // a single whitespace character is absorbed into escape
            reconsume();
            states.pop();
@ -994,7 +1100,7 @@ class Parser {
                    else token_queue.add_delimiter(ch);
                    break;
                case '<':
-                    if (is_top_level() && peek() == '-' && peek(1) == '-') { token_queue.add_cdo(); write_to_output(input.next()); write_to_output(input.next()); }
+                    if (is_top_level() && peek() == '!' && peek(1) == '-' && peek(2) == '-') { token_queue.add_cdo(); write_to_output(input.next()); write_to_output(input.next()); }
                    else token_queue.add_delimiter(ch);
                    break;
                case '@':