| #ifdef NDEBUG |
| #undef NDEBUG |
| #endif |
|
|
| #include "llama.h" |
| #include "llama-grammar.h" |
|
|
| #include <cassert> |
|
|
| static const char * type_str(llama_gretype type) { |
| switch (type) { |
| case LLAMA_GRETYPE_CHAR: return "LLAMA_GRETYPE_CHAR"; |
| case LLAMA_GRETYPE_CHAR_NOT: return "LLAMA_GRETYPE_CHAR_NOT"; |
| case LLAMA_GRETYPE_CHAR_ALT: return "LLAMA_GRETYPE_CHAR_ALT"; |
| case LLAMA_GRETYPE_CHAR_RNG_UPPER: return "LLAMA_GRETYPE_CHAR_RNG_UPPER"; |
| case LLAMA_GRETYPE_RULE_REF: return "LLAMA_GRETYPE_RULE_REF"; |
| case LLAMA_GRETYPE_ALT: return "LLAMA_GRETYPE_ALT"; |
| case LLAMA_GRETYPE_END: return "LLAMA_GRETYPE_END"; |
| default: return "?"; |
| } |
| } |
|
|
| static void verify_parsing(const char *grammar_bytes, const std::vector<std::pair<std::string, uint32_t>> expected, const std::vector<llama_grammar_element> &expected_rules) { |
| uint32_t index = 0; |
| llama_grammar_parser parsed_grammar; |
| parsed_grammar.parse(grammar_bytes); |
|
|
| std::map<uint32_t, std::string> symbol_names; |
| for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) { |
| symbol_names[it->second] = it->first; |
| } |
|
|
| auto print_all = [&]() { |
| fprintf(stderr, " verify_parsing(R\"\"\"(%s)\"\"\", {\n", grammar_bytes); |
| for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) { |
| fprintf(stderr, " {\"%s\", %u},\n", it->first.c_str(), it->second); |
| } |
| fprintf(stderr, " }, {\n"); |
| for (size_t i_rule = 0; i_rule < parsed_grammar.rules.size(); i_rule++) { |
| fprintf(stderr, " // %s (index %zu)\n", symbol_names[i_rule].c_str(), i_rule); |
| auto & rule = parsed_grammar.rules[i_rule]; |
| for (uint32_t i = 0; i < rule.size(); i++) { |
| std::string rule_str; |
| fprintf(stderr, " {%s, ", type_str(rule[i].type)); |
| if (rule[i].type == LLAMA_GRETYPE_CHAR || rule[i].type == LLAMA_GRETYPE_CHAR_ALT || |
| rule[i].type == LLAMA_GRETYPE_CHAR_NOT || rule[i].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) { |
| char c = rule[i].value; |
| if (c == '\n') { |
| fprintf(stderr, "'\\n'"); |
| } else if (c == '\t') { |
| fprintf(stderr, "'\\t'"); |
| } else if (c == '\r') { |
| fprintf(stderr, "'\\r'"); |
| } else if (c == '\0') { |
| fprintf(stderr, "'\\0'"); |
| } else { |
| fprintf(stderr, "'%c'", c); |
| } |
| } else if (rule[i].type == LLAMA_GRETYPE_RULE_REF) { |
| fprintf(stderr, "/* %s */ %u", symbol_names[rule[i].value].c_str(), rule[i].value); |
| } else { |
| fprintf(stderr, "%u", rule[i].value); |
| } |
| fprintf(stderr, "},\n"); |
| } |
| } |
| fprintf(stderr, " });\n"); |
| }; |
|
|
| if (getenv("TEST_GRAMMAR_PARSER_PRINT_ALL")) { |
| print_all(); |
| fprintf(stderr, "\n"); |
| return; |
| } |
|
|
| fprintf(stderr, "Testing grammar:%s\n", grammar_bytes); |
|
|
| if (parsed_grammar.symbol_ids.size() != expected.size()) { |
| fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); |
| print_all(); |
| assert(parsed_grammar.symbol_ids.size() == expected.size()); |
| } |
|
|
| for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) |
| { |
| std::string key = it->first; |
| uint32_t value = it->second; |
| std::pair<std::string, uint32_t> expected_pair = expected[index]; |
|
|
| |
| if (expected_pair.first != key || expected_pair.second != value) |
| { |
| fprintf(stderr, "index: %u\n", index); |
| fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second); |
| fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value); |
| fprintf(stderr, "expected_pair != actual_pair\n"); |
| fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); |
| print_all(); |
| } |
|
|
| assert(expected_pair.first == key && expected_pair.second == value); |
|
|
| index++; |
| } |
|
|
| index = 0; |
| for (auto rule : parsed_grammar.rules) |
| { |
| |
| for (uint32_t i = 0; i < rule.size(); i++) |
| { |
| llama_grammar_element element = rule[i]; |
| llama_grammar_element expected_element = expected_rules[index]; |
|
|
| |
| if (expected_element.type != element.type || expected_element.value != element.value) |
| { |
| fprintf(stderr, "index: %u\n", index); |
| fprintf(stderr, "expected_element: %s, %u\n", type_str(expected_element.type), expected_element.value); |
| fprintf(stderr, "actual_element: %s, %u\n", type_str(element.type), element.value); |
| fprintf(stderr, "expected_element != actual_element\n"); |
| fprintf(stderr, "all elements:\n"); |
| fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); |
| print_all(); |
| } |
|
|
| assert(expected_element.type == element.type && expected_element.value == element.value); |
| index++; |
| } |
| } |
| } |
|
|
| static void verify_failure(const char * grammar_bytes) { |
| fprintf(stderr, "Testing expected failure:%s\n", grammar_bytes); |
| llama_grammar_parser result; |
| result.parse(grammar_bytes); |
| assert(result.rules.empty() && "should have failed"); |
| } |
|
|
// Test entry point. Runs every grammar case below in order; each helper asserts on
// failure, so reaching the end returns 0.
//
// For verify_parsing cases, the first list holds (symbol name, id) pairs and the
// second list holds the elements of each rule in rule-index order. The
// "// name (index N)" markers show where each rule's elements start.
| int main() |
| { |
// Cases that verify_failure expects to produce no rules.
// NOTE(review): both grammars also end with an unpaired `"`; confirm the rejection
// is caused by the `{,}` / `{,10}` repetition and not by that quote.
| verify_failure(R"""( |
| root ::= "a"{,}" |
| )"""); |
|
|
| verify_failure(R"""( |
| root ::= "a"{,10}" |
| )"""); |
|
|
// Single string literal.
| verify_parsing(R"""( |
| root ::= "a" |
| )""", { |
| {"root", 0}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
// Alternatives: a literal, a character set containing a range, and a negated range.
| verify_parsing(R"""( |
| root ::= "a" | [bdx-z] | [^1-3] |
| )""", { |
| {"root", 0}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_CHAR, 'b'}, |
| {LLAMA_GRETYPE_CHAR_ALT, 'd'}, |
| {LLAMA_GRETYPE_CHAR_ALT, 'x'}, |
| {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_CHAR_NOT, '1'}, |
| {LLAMA_GRETYPE_CHAR_RNG_UPPER, '3'}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
// `+` applied to a rule reference; an extra rule root_2 is expected.
| verify_parsing(R"""( |
| root ::= a+ |
| a ::= "a" |
| )""", { |
| {"a", 1}, |
| {"root", 0}, |
| {"root_2", 2}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_RULE_REF, 2}, |
| {LLAMA_GRETYPE_END, 0}, |
| // a (index 1) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_END, 0}, |
| // root_2 (index 2) |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_RULE_REF, 2}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
// `+` applied to a string literal; an extra rule root_1 is expected.
| verify_parsing(R"""( |
| root ::= "a"+ |
| )""", { |
| {"root", 0}, |
| {"root_1", 1}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_END, 0}, |
| // root_1 (index 1) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
// `?` applied to a rule reference.
| verify_parsing(R"""( |
| root ::= a? |
| a ::= "a" |
| )""", { |
| {"a", 1}, |
| {"root", 0}, |
| {"root_2", 2}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_RULE_REF, 2}, |
| {LLAMA_GRETYPE_END, 0}, |
| // a (index 1) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_END, 0}, |
| // root_2 (index 2) |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
// `?` applied to a string literal.
| verify_parsing(R"""( |
| root ::= "a"? |
| )""", { |
| {"root", 0}, |
| {"root_1", 1}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_END, 0}, |
| // root_1 (index 1) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
// `*` applied to a rule reference.
| verify_parsing(R"""( |
| root ::= a* |
| a ::= "a" |
| )""", { |
| {"a", 1}, |
| {"root", 0}, |
| {"root_2", 2}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_RULE_REF, 2}, |
| {LLAMA_GRETYPE_END, 0}, |
| // a (index 1) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_END, 0}, |
| // root_2 (index 2) |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_RULE_REF, 2}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
// `*` applied to a string literal.
| verify_parsing(R"""( |
| root ::= "a"* |
| )""", { |
| {"root", 0}, |
| {"root_1", 1}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_END, 0}, |
| // root_1 (index 1) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
// `{2}`: the literal is expected twice in root, with no extra rule.
| verify_parsing(R"""( |
| root ::= "a"{2} |
| )""", { |
| {"root", 0}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
// `{2,}`: two literals followed by a reference to the extra rule root_1.
| verify_parsing(R"""( |
| root ::= "a"{2,} |
| )""", { |
| {"root", 0}, |
| {"root_1", 1}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_END, 0}, |
| // root_1 (index 1) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
// `{ 4}`: a space inside the braces; four literals are expected.
| verify_parsing(R"""( |
| root ::= "a"{ 4} |
| )""", { |
| {"root", 0}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
// `{2,4}`: two literals followed by a reference to root_2, which refers to root_1.
| verify_parsing(R"""( |
| root ::= "a"{2,4} |
| )""", { |
| {"root", 0}, |
| {"root_1", 1}, |
| {"root_2", 2}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_RULE_REF, 2}, |
| {LLAMA_GRETYPE_END, 0}, |
| // root_1 (index 1) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| // root_2 (index 2) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
// Multi-rule grammar with parenthesized groups, repetition, and character sets.
| verify_parsing(R"""( |
| root ::= (expr "=" term "\n")+ |
| expr ::= term ([-+*/] term)* |
| term ::= [0-9]+ |
| )""", { |
| {"expr", 2}, |
| {"expr_5", 5}, |
| {"expr_6", 6}, |
| {"root", 0}, |
| {"root_1", 1}, |
| {"root_4", 4}, |
| {"term", 3}, |
| {"term_7", 7}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_RULE_REF, 4}, |
| {LLAMA_GRETYPE_END, 0}, |
| // root_1 (index 1) |
| {LLAMA_GRETYPE_RULE_REF, 2}, |
| {LLAMA_GRETYPE_CHAR, '='}, |
| {LLAMA_GRETYPE_RULE_REF, 3}, |
| {LLAMA_GRETYPE_CHAR, '\n'}, |
| {LLAMA_GRETYPE_END, 0}, |
| // expr (index 2) |
| {LLAMA_GRETYPE_RULE_REF, 3}, |
| {LLAMA_GRETYPE_RULE_REF, 6}, |
| {LLAMA_GRETYPE_END, 0}, |
| // term (index 3) |
| {LLAMA_GRETYPE_CHAR, '0'}, |
| {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, |
| {LLAMA_GRETYPE_RULE_REF, 7}, |
| {LLAMA_GRETYPE_END, 0}, |
| // root_4 (index 4) |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_RULE_REF, 4}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| // expr_5 (index 5) |
| {LLAMA_GRETYPE_CHAR, '-'}, |
| {LLAMA_GRETYPE_CHAR_ALT, '+'}, |
| {LLAMA_GRETYPE_CHAR_ALT, '*'}, |
| {LLAMA_GRETYPE_CHAR_ALT, '/'}, |
| {LLAMA_GRETYPE_RULE_REF, 3}, |
| {LLAMA_GRETYPE_END, 0}, |
| // expr_6 (index 6) |
| {LLAMA_GRETYPE_RULE_REF, 5}, |
| {LLAMA_GRETYPE_RULE_REF, 6}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| // term_7 (index 7) |
| {LLAMA_GRETYPE_CHAR, '0'}, |
| {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, |
| {LLAMA_GRETYPE_RULE_REF, 7}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
// Larger grammar adding ident, num and ws rules, with three alternatives in term.
| verify_parsing(R"""( |
| root ::= (expr "=" ws term "\n")+ |
| expr ::= term ([-+*/] term)* |
| term ::= ident | num | "(" ws expr ")" ws |
| ident ::= [a-z] [a-z0-9_]* ws |
| num ::= [0-9]+ ws |
| ws ::= [ \t\n]* |
| )""", { |
| {"expr", 2}, |
| {"expr_6", 6}, |
| {"expr_7", 7}, |
| {"ident", 8}, |
| {"ident_10", 10}, |
| {"num", 9}, |
| {"num_11", 11}, |
| {"root", 0}, |
| {"root_1", 1}, |
| {"root_5", 5}, |
| {"term", 4}, |
| {"ws", 3}, |
| {"ws_12", 12}, |
| }, { |
| // root (index 0) |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_RULE_REF, 5}, |
| {LLAMA_GRETYPE_END, 0}, |
| // root_1 (index 1) |
| {LLAMA_GRETYPE_RULE_REF, 2}, |
| {LLAMA_GRETYPE_CHAR, '='}, |
| {LLAMA_GRETYPE_RULE_REF, 3}, |
| {LLAMA_GRETYPE_RULE_REF, 4}, |
| {LLAMA_GRETYPE_CHAR, '\n'}, |
| {LLAMA_GRETYPE_END, 0}, |
| // expr (index 2) |
| {LLAMA_GRETYPE_RULE_REF, 4}, |
| {LLAMA_GRETYPE_RULE_REF, 7}, |
| {LLAMA_GRETYPE_END, 0}, |
| // ws (index 3) |
| {LLAMA_GRETYPE_RULE_REF, 12}, |
| {LLAMA_GRETYPE_END, 0}, |
| // term (index 4) |
| {LLAMA_GRETYPE_RULE_REF, 8}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_RULE_REF, 9}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_CHAR, '('}, |
| {LLAMA_GRETYPE_RULE_REF, 3}, |
| {LLAMA_GRETYPE_RULE_REF, 2}, |
| {LLAMA_GRETYPE_CHAR, ')'}, |
| {LLAMA_GRETYPE_RULE_REF, 3}, |
| {LLAMA_GRETYPE_END, 0}, |
| // root_5 (index 5) |
| {LLAMA_GRETYPE_RULE_REF, 1}, |
| {LLAMA_GRETYPE_RULE_REF, 5}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| // expr_6 (index 6) |
| {LLAMA_GRETYPE_CHAR, '-'}, |
| {LLAMA_GRETYPE_CHAR_ALT, '+'}, |
| {LLAMA_GRETYPE_CHAR_ALT, '*'}, |
| {LLAMA_GRETYPE_CHAR_ALT, '/'}, |
| {LLAMA_GRETYPE_RULE_REF, 4}, |
| {LLAMA_GRETYPE_END, 0}, |
| // expr_7 (index 7) |
| {LLAMA_GRETYPE_RULE_REF, 6}, |
| {LLAMA_GRETYPE_RULE_REF, 7}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| // ident (index 8) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, |
| {LLAMA_GRETYPE_RULE_REF, 10}, |
| {LLAMA_GRETYPE_RULE_REF, 3}, |
| {LLAMA_GRETYPE_END, 0}, |
| // num (index 9) |
| {LLAMA_GRETYPE_CHAR, '0'}, |
| {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, |
| {LLAMA_GRETYPE_RULE_REF, 11}, |
| {LLAMA_GRETYPE_RULE_REF, 3}, |
| {LLAMA_GRETYPE_END, 0}, |
| // ident_10 (index 10) |
| {LLAMA_GRETYPE_CHAR, 'a'}, |
| {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, |
| {LLAMA_GRETYPE_CHAR_ALT, '0'}, |
| {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, |
| {LLAMA_GRETYPE_CHAR_ALT, '_'}, |
| {LLAMA_GRETYPE_RULE_REF, 10}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| // num_11 (index 11) |
| {LLAMA_GRETYPE_CHAR, '0'}, |
| {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, |
| {LLAMA_GRETYPE_RULE_REF, 11}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| // ws_12 (index 12) |
| {LLAMA_GRETYPE_CHAR, ' '}, |
| {LLAMA_GRETYPE_CHAR_ALT, '\t'}, |
| {LLAMA_GRETYPE_CHAR_ALT, '\n'}, |
| {LLAMA_GRETYPE_RULE_REF, 12}, |
| {LLAMA_GRETYPE_ALT, 0}, |
| {LLAMA_GRETYPE_END, 0}, |
| }); |
|
|
| return 0; |
| } |
|
|