More work on fast CSS parser

This commit is contained in:
Kovid Goyal 2021-03-08 09:33:08 +05:30
parent 692b31aee3
commit bc1fda2a1f
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -14,38 +14,192 @@
#include <vector> #include <vector>
#include <stack> #include <stack>
#include <exception> #include <exception>
#include <stdexcept>
#include <string>
#define arraysz(x) (sizeof((x))/sizeof((x)[0])) // character classes {{{
typedef uint32_t char_type;
static inline bool static inline bool
is_whitespace(char_type ch) { is_whitespace(char32_t ch) {
return ch == ' ' || ch == '\n' || ch == '\t'; return ch == ' ' || ch == '\n' || ch == '\t';
} }
static inline bool static inline bool
is_surrogate(char_type ch) { is_surrogate(char32_t ch) {
return 0xd800 <= ch && ch <= 0xdfff; return 0xd800 <= ch && ch <= 0xdfff;
} }
// Report whether ch is an ASCII hexadecimal digit: 0-9, a-f or A-F.
static inline bool
is_hex_char(char32_t ch) {
    // Both bounds of the upper-case range must hold; joining them with `||`
    // makes the whole test true for every code point.
    return ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f') || ('A' <= ch && ch <= 'F');
}
typedef enum { static inline bool
BLOCK, is_letter(char32_t ch) {
return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z');
}
ESCAPE, static inline bool
COMMENT, is_name_start(char32_t ch) {
STRING, return is_letter(ch) || ch == '_' || ch >= 0x80;
}
QUALIFIED_RULE, static inline bool
AT_RULE, is_digit(char32_t ch) {
KEY, return '0' <= ch && ch <= '9';
VALUE }
} ParseStates;
// True when ch may appear inside a name: a hyphen, a digit, or anything
// accepted by is_name_start().
static inline bool
is_name(char32_t ch) {
    if (ch == '-') return true;
    if (is_digit(ch)) return true;
    return is_name_start(ch);
}
// }}}
// Kinds of token produced by the tokenizer. Enumerator values are implicit
// and sequential starting at zero, so the declaration order matters.
enum class TokenType : unsigned int {
    whitespace,
    delimiter,
    ident,
    function,
    at_keyword,
    hash,
    string,
    url,
    function_start,
    number,
    dimension,
    percentage,
    comment,
    cdo,
    cdc
};
// NOTE(review): this is a namespace-scope *variable* of type TokenType, not a
// type alias. It looks like a leftover of the `typedef enum {...} Name;`
// idiom; confirm nothing refers to it before removing it.
TokenType TokenTypes;
// A single token: its type, its text as UTF-32 code points, and the offset
// recorded by mark_unit().
class Token {
    private:
        TokenType type;
        std::u32string text;
        unsigned unit_at;  // text.size() at the time mark_unit() was last called

        // Reset to an empty whitespace token.
        void clear() {
            type = TokenType::whitespace;
            text.clear();
            unit_at = 0;
        }

    public:
        Token() : type(TokenType::whitespace), text(), unit_at(0) { text.reserve(16); }
        // Note: ch is appended unconditionally, including when it is 0.
        Token(const TokenType type, const char32_t ch) : type(type), text(), unit_at(0) { text.reserve(16); text.push_back(ch); }
        Token(const Token& other) : type(other.type), text(other.text), unit_at(other.unit_at) {} // copy constructor
        // The move operations are noexcept so that std::vector<Token> moves
        // (rather than copies) its elements when it reallocates.
        Token(Token&& other) noexcept : type(other.type), text(std::move(other.text)), unit_at(other.unit_at) {} // move constructor
        Token& operator=(const Token& other) { type = other.type; text = other.text; unit_at = other.unit_at; return *this; } // copy assignment
        Token& operator=(Token&& other) noexcept { type = other.type; text = std::move(other.text); unit_at = other.unit_at; return *this; } // move assignment

        void set_type(const TokenType q) { type = q; }
        bool is_type(const TokenType q) const { return type == q; }
        void add_char(const char32_t ch) { text.push_back(ch); }
        void mark_unit() { unit_at = text.size(); }
        void clear_text() { text.clear(); }

        // Compare the token text with a NUL-terminated string that the caller
        // supplies already lower-cased. Only the ASCII letters A-Z in the
        // token text are folded to lower case; every other code point must
        // match exactly. The whole text is compared (by length), so an
        // embedded NUL code point does not cut the comparison short.
        bool text_equals_case_insensitive(const char *lowercase_text) const {
            const unsigned char* q = reinterpret_cast<const unsigned char*>(lowercase_text);
            const size_t sz = text.size();
            for (size_t i = 0; i < sz; i++) {
                if (!q[i]) return false;  // token text is longer than lowercase_text
                char32_t c = text[i];
                if ('A' <= c && c <= 'Z') c += 'a' - 'A';
                if (c != q[i]) return false;
            }
            return q[sz] == 0;  // equal only if lowercase_text ends here too
        }

        // Remove trailing code points for which is_whitespace() is true.
        void trim_trailing_whitespace() {
            while(text.size() && is_whitespace(text.back())) text.pop_back();
        }
};
class TokenQueue {
private:
std::stack<Token> pool;
std::vector<Token> queue;
void new_token(const TokenType type, const char32_t ch = 0) {
if (pool.empty()) queue.emplace_back(type, ch);
else {
queue.push_back(std::move(pool.top())); pool.pop();
queue.back().set_type(type);
if (ch) queue.back().add_char(ch);
}
}
void add_char_of_type(const TokenType type, const char32_t ch) {
if (!queue.empty() && queue.back().is_type(type)) queue.back().add_char(ch);
else new_token(type, ch);
}
public:
TokenQueue() : pool(), queue() {}
bool current_token_text_equals_case_insensitive(const char *lowercase_text) const {
if (queue.empty()) return false;
return queue.back().text_equals_case_insensitive(lowercase_text);
}
void add_whitespace(const char32_t ch) { add_char_of_type(TokenType::whitespace, ch); }
void start_string() {
if (queue.empty() || !queue.back().is_type(TokenType::string)) new_token(TokenType::string);
}
void add_comment(const char32_t ch) {
new_token(TokenType::comment, ch);
}
void add_char(const char32_t ch) {
if (queue.empty()) throw std::logic_error("Attempting to add char to non-existent token");
queue.back().add_char(ch);
}
void make_function_start(bool is_url = false) {
if (queue.empty()) throw std::logic_error("Attempting to make function start with non-existent token");
queue.back().set_type(is_url ? TokenType::url : TokenType::function_start);
if (is_url) queue.back().clear_text();
}
void add_delimiter(const char32_t ch) { new_token(TokenType::delimiter, ch); }
void add_hash() { new_token(TokenType::hash, '#'); }
void add_at_keyword() { new_token(TokenType::at_keyword, '@'); }
void add_number(const char32_t ch) { new_token(TokenType::number, ch); }
void add_ident(const char32_t ch) { new_token(TokenType::ident, ch); }
void add_cdc() { new_token(TokenType::cdc, '-'); queue.back().add_char('-'); queue.back().add_char('>'); }
void add_cdo() { new_token(TokenType::cdo, '<'); queue.back().add_char('-'); queue.back().add_char('-'); }
void mark_unit() {
if (queue.empty()) throw std::logic_error("Attempting to mark unit with no token present");
queue.back().mark_unit();
queue.back().set_type(TokenType::dimension);
}
void trim_trailing_whitespace() {
if (!queue.empty()) queue.back().trim_trailing_whitespace();
}
};
// States of the tokenizer state machine. Values are implicit and sequential
// starting at zero.
enum class ParseState : unsigned int {
    normal,
    escape,
    comment,
    string,
    hash,
    number,
    digits,
    dimension,
    ident,
    // url( ... ) handling
    url,
    url_start,
    url_string,
    url_after_string,
    at_keyword,
};
typedef enum { typedef enum {
DECLARATIONS_ALLOWED, DECLARATIONS_ALLOWED,
@ -56,43 +210,125 @@ typedef enum {
typedef std::bitset<NUM_OF_BLOCK_TYPE_FLAGS> BlockTypeFlags; typedef std::bitset<NUM_OF_BLOCK_TYPE_FLAGS> BlockTypeFlags;
// Read-only cursor over a UTF-32 buffer. Characters are normalised as they
// are read: form feed, CR and CR LF each become a single '\n'; NUL and
// surrogate code points become U+FFFD. A returned 0 means end of input.
class InputStream {
    private:
        const char32_t *src;
        size_t pos;
        const size_t src_sz;

        // Normalised character at index `at`. *consumed receives how many
        // source code points it spans (0 at end of input, 2 for CR LF).
        char32_t peek_one(size_t at, unsigned *consumed) const {
            if (at >= src_sz) { *consumed = 0; return 0; }
            *consumed = 1;
            const char32_t raw = src[at];
            switch (raw) {
                case 0xc:
                    return '\n';
                case '\r':
                    if (at + 1 < src_sz && src[at + 1] == '\n') *consumed = 2;
                    return '\n';
                case 0:
                    return 0xfffd;
                default:
                    return is_surrogate(raw) ? 0xfffd : raw;
            }
        }

    public:
        InputStream(const char32_t *src, size_t sz) : src(src), pos(0), src_sz(sz) { }

        // Return the next normalised character and advance past it.
        char32_t next() {
            unsigned step;
            const char32_t ans = peek_one(pos, &step);
            pos += step;
            return ans;
        }

        // Step back over the previous character (both code points of a CR LF
        // pair). Throws std::logic_error when already at the start.
        void rewind() {
            if (pos == 0) throw std::logic_error("Cannot rewind already at start of stream");
            const bool crlf = pos >= 2 && src[pos-1] == '\n' && src[pos-2] == '\r';
            pos -= crlf ? 2 : 1;
        }

        // Normalised character `amt` characters ahead of the cursor without
        // moving it; 0 if the input ends first.
        char32_t peek(unsigned amt = 0) const {
            size_t at = pos;
            unsigned step;
            char32_t ans = peek_one(at, &step);
            while (amt && ans) {
                at += step;
                amt--;
                ans = peek_one(at, &step);
            }
            return ans;
        }
}; // end InputStream
class Parser { class Parser {
private: private:
ParseStates state, state_before_comment, state_before_string, state_before_escape; char32_t ch, end_string_with, prev_ch;
char_type ch, next_ch, end_string_with;
std::stack<BlockTypeFlags> block_types; std::stack<BlockTypeFlags> block_types;
std::stack<ParseState> states;
char escape_buf[16]; char escape_buf[16];
const char_type *src; size_t escape_buf_pos, declaration_pos;
size_t escape_buf_pos, src_sz, src_pos, declaration_pos; std::u32string out;
std::vector<char_type> out, current_key; TokenQueue token_queue;
InputStream input;
bool declarations_allowed() const { return block_types.top()[DECLARATIONS_ALLOWED]; } bool declarations_allowed() const { return block_types.top()[DECLARATIONS_ALLOWED]; }
bool at_rules_allowed() const { return block_types.top()[AT_RULES_ALLOWED]; } bool at_rules_allowed() const { return block_types.top()[AT_RULES_ALLOWED]; }
bool qualified_rules_allowed() const { return block_types.top()[QUALIFIED_RULES_ALLOWED]; } bool qualified_rules_allowed() const { return block_types.top()[QUALIFIED_RULES_ALLOWED]; }
// testing stream contents {{{
// Drop the last character written to the output buffer.
void rewind_output() {
    out.pop_back();
}
// Append one character to the output buffer.
void write_to_output(const char32_t what) {
    out.push_back(what);
}
// Undo the current character: step the input back by one character and
// remove the last character from the output.
void reconsume() {
    input.rewind();
    rewind_output();
}
// A negative `which` yields the current character; otherwise the character
// `which` positions ahead in the input (0 is the next unread one).
char32_t peek(int which = 0) const {
    if (which < 0) return ch;
    return input.peek(which);
}
// True when the current character and the next one form "/*".
bool starting_comment() const {
    if (ch != '/') return false;
    return peek() == '*';
}
// True when the current character is a double or single quote.
bool starting_string() const {
    switch (ch) {
        case '"':
        case '\'':
            return true;
        default:
            return false;
    }
}
// True when the character at `offset` is a backslash and the one after it is
// neither end of input nor a newline.
bool has_valid_escape_next(int offset=0) const {
    if (peek(offset) == '\\') {
        const char32_t following = peek(offset + 1);
        return following != 0 && following != '\n';
    }
    return false;
}
// Same test as has_valid_escape_next(), starting at the current character.
bool has_valid_escape() const {
    return has_valid_escape_next(-1);
}
// True when the characters starting at `offset` can begin an identifier:
// a name-start character, a valid escape, or a '-' followed by a name-start
// character, another '-', or a backslash that is not followed by a newline
// or end of input.
bool has_identifier_next(int offset = 0) const {
    const char32_t lead = peek(offset);
    if (lead == 0) return false;
    if (lead == '\\') return has_valid_escape_next(offset);
    if (lead != '-') return is_name_start(lead);

    const char32_t after_dash = peek(offset + 1);
    if (after_dash == '-' || is_name_start(after_dash)) return true;
    if (after_dash != '\\') return false;

    const char32_t escaped = peek(offset + 2);
    return escaped != 0 && escaped != '\n';
}
// Same test as has_identifier_next(), starting at the current character.
bool has_identifier() const {
    return has_identifier_next(-1);
}
// }}}
// escape {{{
void enter_escape_mode() { void enter_escape_mode() {
state_before_escape = state; escape_buf_pos = 0; state = ESCAPE; states.push(ParseState::escape);
} escape_buf_pos = 0;
void enter_string_mode() {
state_before_string = state; state = STRING; end_string_with = ch;
}
void handle_comment() {
if (ch == '*' && next_ch == '/') {
out.push_back(next_ch);
state = state_before_comment;
src_pos++;
}
} }
void handle_escape() { void handle_escape() {
if (!escape_buf_pos) { if (!escape_buf_pos) {
if (ch == '\n') { state = state_before_escape; return; } if (ch == '\n') { reconsume(); states.pop(); return; }
if (!is_hex_char(ch)) { if (!is_hex_char(ch)) {
state = state_before_escape; states.pop();
if (state == KEY) { current_key.push_back(ch); } token_queue.add_char(ch);
return; return;
} }
escape_buf[escape_buf_pos++] = ch; escape_buf[escape_buf_pos++] = ch;
@ -100,69 +336,275 @@ private:
} }
if (is_hex_char(ch) && escape_buf_pos < 6) { escape_buf[escape_buf_pos++] = ch; return; } if (is_hex_char(ch) && escape_buf_pos < 6) { escape_buf[escape_buf_pos++] = ch; return; }
if (is_whitespace(ch)) return; // a single whitespace character is absorbed into escape if (is_whitespace(ch)) return; // a single whitespace character is absorbed into escape
src_pos--; reconsume();
state = state_before_escape; states.pop();
if (state == KEY) {
escape_buf[escape_buf_pos] = 0; escape_buf[escape_buf_pos] = 0;
long kch = strtol(escape_buf, NULL, 16); long kch = strtol(escape_buf, NULL, 16);
if (kch > 0 && !is_surrogate(kch)) { current_key.push_back(kch); } if (kch > 0 && !is_surrogate(kch)) token_queue.add_char(kch);
}
escape_buf_pos = 0; escape_buf_pos = 0;
} }
// }}}
// string {{{
// Begin a string: the current character (the opening quote) is remembered as
// the terminator, the string state is pushed and a string token is started.
void enter_string_mode() {
    end_string_with = ch;
    states.push(ParseState::string);
    token_queue.start_string();
}
void handle_string() { void handle_string() {
if (ch == '\\') { enter_escape_mode(); } if (ch == '\\') {
else if (ch == end_string_with) state = state_before_string; if (peek() == '\n') input.next();
else enter_escape_mode();
} }
else if (ch == end_string_with) states.pop();
else token_queue.add_char(ch);
} // }}}
// comment {{{
void enter_comment_mode() { void enter_comment_mode() {
src_pos++; states.push(ParseState::comment);
state_before_comment = state; token_queue.add_comment(ch);
state = COMMENT;
out.push_back(next_ch);
} }
void handle_block() { void handle_comment() {
if (ch == '/' && next_ch == '*') { enter_comment_mode(); return; } token_queue.add_char(ch);
if (ch == '@' && at_rules_allowed()) { if (ch == '/' && prev_ch == '*') states.pop();
state = AT_RULE; } // }}}
// hash {{{
// Push the hash state and start a hash token.
void enter_hash_mode() {
    states.push(ParseState::hash);
    token_queue.add_hash();
}
// Consume one character of a name: name characters extend the current token,
// a valid escape switches to escape mode, and anything else is handed back
// to the enclosing state.
void handle_name() {
    if (is_name(ch)) { token_queue.add_char(ch); return; }
    if (has_valid_escape()) { enter_escape_mode(); return; }
    reconsume();
    states.pop();
}
// The body of a hash token is read with the shared name handler.
void handle_hash() {
    handle_name();
} // }}}
// number {{{
// Push the number state and start a number token with the current character.
void enter_number_mode() {
    states.push(ParseState::number);
    token_queue.add_number(ch);
}
void handle_number() {
if (is_digit(ch)) { token_queue.add_char(ch); return; }
if (ch == '.' && is_digit(peek())) { states.pop(); enter_digits_mode(); return; }
if ((ch == 'e' || ch == 'E')) {
char32_t next = peek();
if (is_digit(next) || ((next == '+' || next == '-') && is_digit(peek(1)))) {
token_queue.add_char(input.next()); token_queue.add_char(input.next());
states.pop();
enter_digits_mode();
return; return;
} }
if (ch == ';' || ch == '{' || ch == '}' || is_whitespace(ch)) return;
if (declarations_allowed()) {
state = KEY;
current_key.clear();
declaration_pos = out.size() > 1 ? out.size() - 2 : 0;
} else {
state = QUALIFIED_RULE;
} }
if (ch == '"' || ch == '\'') { enter_string_mode(); } reconsume();
else if (ch == '\\') { enter_escape_mode(); } states.pop();
else if (state == KEY) current_key.push_back(ch); if (has_identifier_next()) { enter_dimension_mode(); }
} // }}}
// digits {{{
// Push the digits state; the current token is left untouched.
void enter_digits_mode() {
    states.push(ParseState::digits);
}
// Consume a run of digits. The first non-digit is handed back, the state is
// popped, and if an identifier follows the token continues as a dimension.
void handle_digits() {
    if (is_digit(ch)) { token_queue.add_char(ch); return; }
    reconsume();
    states.pop();
    if (has_identifier_next()) enter_dimension_mode();
} // }}}
// dimension {{{
// Mark where the unit starts in the current token (which also retypes it as
// a dimension), then push the dimension state.
void enter_dimension_mode() {
    token_queue.mark_unit();
    states.push(ParseState::dimension);
}
// Consume one character of a dimension's unit.
void handle_dimension() {
    if (is_name(ch)) {
        token_queue.add_char(ch);
    } else if (has_valid_escape()) {
        enter_escape_mode();
    } else {
        // Not part of the unit: hand the character back and leave the state.
        reconsume();
        states.pop();
    }
} // }}}
// ident {{{
// Start an ident token with starting_ch and push the ident state.
void enter_ident_mode(const char32_t starting_ch = 0) {
    token_queue.add_ident(starting_ch);
    states.push(ParseState::ident);
}
// Consume one character of an identifier. Name characters and escapes extend
// the token. A '(' ends the identifier and turns the token into a url or
// function-start token. Any other character is handed back to the enclosing
// state.
void handle_ident() {
    if (is_name(ch)) { token_queue.add_char(ch); return; }
    if (has_valid_escape()) { enter_escape_mode(); return; }
    if (ch == '(') {
        // The identifier is complete. Leave the ident state first; otherwise
        // the name characters that follow the '(' (function arguments, or
        // whatever comes after the url) would be appended to this token.
        states.pop();
        if (token_queue.current_token_text_equals_case_insensitive("url")) {
            enter_url_start_mode();
        }
        else token_queue.make_function_start();
        return;
    }
    reconsume();
    states.pop();
} // }}}
// url {{{
// Retype the current token as a url (discarding its text) and push the
// url_start state.
void enter_url_start_mode() {
    token_queue.make_function_start(true);
    states.push(ParseState::url_start);
}
void handle_url_start() {
if (is_whitespace(ch)) return;
if (starting_string()) { states.pop(); end_string_with = ch; states.push(ParseState::url_string); return; }
if (ch == ')') { states.pop(); return; }
states.pop(); states.push(ParseState::url);
}
void handle_url_string() {
handle_string();
if (states.top() != ParseState::url_string && states.top() != ParseState::escape) states.push(ParseState::url_after_string);
}
// After the closing quote of a url string: whitespace is skipped and the
// first other character leaves url mode.
void handle_url_after_string() {
    if (is_whitespace(ch)) return;
    exit_url_mode();
}
void handle_url() {
if (has_valid_escape()) enter_escape_mode();
else if (ch == ')') exit_url_mode(true);
}
// Pop the current url state. With trim set, trailing whitespace is also
// stripped from the current token.
void exit_url_mode(bool trim=false) {
    states.pop();
    if (!trim) return;
    token_queue.trim_trailing_whitespace();
}
// }}}
// at_keyword {{{
// Push the at_keyword state and start an at-keyword token.
void enter_at_keyword() {
    // Use the dedicated at_keyword state rather than the hash state, so the
    // state stack reflects what is being parsed and the dispatcher routes to
    // handle_at_keyword().
    states.push(ParseState::at_keyword);
    token_queue.add_at_keyword();
}
// The body of an at-keyword is read with the shared name handler.
void handle_at_keyword() {
    handle_name();
} // }}}
// Handle the current character when no token is in progress: decide which
// token it starts and either enter the matching state or emit a
// single-character delimiter token.
void handle_normal() {
// Comments, whitespace, digits and name-start characters are checked before
// the per-character switch.
if (starting_comment()) { enter_comment_mode(); return; }
if (is_whitespace(ch)) { token_queue.add_whitespace(ch); return; }
if (is_digit(ch)) { enter_number_mode(); return; }
if (is_name_start(ch)) { enter_ident_mode(ch); return; }
switch (ch) {
case '"':
case '\'':
enter_string_mode();
break;
case '#':
// '#' starts a hash token only when a name character or a valid escape follows.
if (is_name(peek()) || has_valid_escape_next()) {
enter_hash_mode();
} else token_queue.add_delimiter(ch);
break;
case '(':
case ')':
case '[':
case ']':
case '{':
case '}':
case ',':
case ':':
case ';':
token_queue.add_delimiter(ch);
break;
case '+':
// A sign starts a number when followed by a digit, or by '.' and a digit.
if (is_digit(peek()) || (peek() == '.' && is_digit(peek(1)))) { enter_number_mode(); }
else token_queue.add_delimiter(ch);
break;
case '-':
if (is_digit(peek()) || (peek() == '.' && is_digit(peek(1)))) { enter_number_mode(); }
// "-->": emit a cdc token and copy the two remaining characters to the output.
else if (peek() == '-' && peek(1) == '>') { token_queue.add_cdc(); write_to_output(input.next()); write_to_output(input.next()); }
else if (has_identifier()) { enter_ident_mode(ch); }
else token_queue.add_delimiter(ch);
break;
case '.':
if (is_digit(peek())) { enter_number_mode(); }
else token_queue.add_delimiter(ch);
break;
case '<':
// NOTE(review): this matches "<--"; the CSS CDO token is "<!--" - confirm
// whether the '!' check (and a third consumed character) is missing.
if (peek() == '-' && peek(1) == '-') { token_queue.add_cdo(); write_to_output(input.next()); write_to_output(input.next()); }
else token_queue.add_delimiter(ch);
break;
case '@':
if (has_identifier_next()) enter_at_keyword();
else token_queue.add_delimiter(ch);
break;
case '\\':
// A valid escape starts an identifier with no initial character; the escape
// handler then supplies the first one.
if (has_valid_escape()) { enter_ident_mode(); enter_escape_mode(); }
else token_queue.add_delimiter(ch);
break;
default:
token_queue.add_delimiter(ch);
break;
}
} }
void dispatch_current_char() { void dispatch_current_char() {
out.push_back(ch); write_to_output(ch);
switch (state) { switch (states.top()) {
case COMMENT: case ParseState::comment:
handle_comment(); break; handle_comment(); break;
case ESCAPE: case ParseState::escape:
handle_escape(); break; handle_escape(); break;
case STRING: case ParseState::string:
handle_string(); break; handle_string(); break;
case BLOCK: case ParseState::hash:
handle_block(); break; handle_hash(); break;
case ParseState::number:
handle_number(); break;
case ParseState::digits:
handle_digits(); break;
case ParseState::dimension:
handle_dimension(); break;
case ParseState::ident:
handle_ident(); break;
case ParseState::url_start:
handle_url_start(); break;
case ParseState::url_string:
handle_url_string(); break;
case ParseState::url:
handle_url(); break;
case ParseState::url_after_string:
handle_url_after_string(); break;
case ParseState::at_keyword:
handle_at_keyword(); break;
case ParseState::normal:
handle_normal(); break;
} }
prev_ch = ch;
} }
public:
Parser(const char_type *src, size_t src_sz, bool is_declaration) : public:
state(BLOCK), state_before_comment(BLOCK), state_before_string(BLOCK), state_before_escape(BLOCK), Parser(const char32_t *src, const size_t src_sz, const bool is_declaration) :
ch(0), next_ch(0), end_string_with('"'), block_types(), escape_buf(), ch(0), end_string_with('"'), prev_ch(0), block_types(), states(), escape_buf(),
src(src), escape_buf_pos(0), src_sz(src_sz), src_pos(0), declaration_pos(0), escape_buf_pos(0), declaration_pos(0),
out(src_sz * 2), current_key(256) out(), token_queue(), input(src, src_sz)
{ {
out.reserve(src_sz * 2);
BlockTypeFlags initial_block_type; BlockTypeFlags initial_block_type;
initial_block_type.set(DECLARATIONS_ALLOWED); initial_block_type.set(DECLARATIONS_ALLOWED);
if (!is_declaration) { if (!is_declaration) {
@ -172,29 +614,21 @@ public:
block_types.push(initial_block_type); block_types.push(initial_block_type);
} }
void parse(std::vector<char_type> &result) { void parse(std::u32string &result) {
while (src_pos < src_sz) { while (true) {
ch = src[src_pos++]; ch = input.next();
next_ch = src_pos < src_sz ? src[src_pos] : 0; if (!ch) break;
if (ch == 0xc) ch = '\n';
if (ch == '\r') {
if (next_ch == '\n') { ch = '\n'; src_pos++; }
else ch = '\n';
}
if (ch == 0 || is_surrogate(ch)) ch = 0xfffd;
dispatch_current_char(); dispatch_current_char();
} }
out.swap(result); out.swap(result);
} }
}; };
#undef write_key
#undef write
static PyObject* static PyObject*
transform_properties(const char_type *src, size_t src_sz, bool is_declaration) { transform_properties(const char32_t *src, size_t src_sz, bool is_declaration) {
try { try {
std::vector<char_type> result(0); std::u32string result;
Parser parser(src, src_sz, is_declaration); Parser parser(src, src_sz, is_declaration);
parser.parse(result); parser.parse(result);
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, result.data(), result.size()); return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, result.data(), result.size());