Fast JSON parser

I have implemented a fast JSON parser. Timings on the big.json file from the previous post, on an Apple M1:

mine: 17ms
Python json: 33ms
Python ujson: 33ms
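
For context, a minimal harness along these lines would measure the C++ side end to end (this is my own sketch, not the author's benchmark script; the include path and the use of big.json as the file name are assumptions):

#include <chrono>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

#include <lfortran/parser/json.h>  // the parser header shown below; exact path assumed

int main() {
    // Read the whole file into memory first, so only the parse is timed.
    std::ifstream f("big.json", std::ios::binary);
    std::stringstream ss;
    ss << f.rdbuf();
    std::string input = ss.str();

    auto t0 = std::chrono::steady_clock::now();
    LFortran::json::JSONParser p(input);
    p.parse_json();  // tokenizes and validates; throws LFortranException on bad input
    auto t1 = std::chrono::steady_clock::now();

    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count()
              << " ms, " << p.count << " elements" << std::endl;
}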

The full tokenizer (written with re2c) is:

#include <iostream>
#include <string>

#include <lfortran/utils.h>
#include <lfortran/string_utils.h>
#include <lfortran/parser/json.h>  // the header below; exact path assumed

namespace LFortran::json {


TokenType JSONTokenizer::next_token() {
    tok = cur;
    unsigned char *mar;
    /*!re2c
        re2c:define:YYCURSOR = cur;
        re2c:define:YYMARKER = mar;
        re2c:yyfill:enable = 0;
        re2c:define:YYCTYPE = "unsigned char";

        end = "\x00";
        newline = "\n";
        whitespace = [ \n\r\t]+;
        digit = [0-9];
        integer = ('-')? digit+;
        fraction = '.' digit+;
        exponent = [eE] ('+' | '-')? digit+;
        number = integer fraction? exponent?;

        * {
            std::cout << "Unknown char: " << *tok << " NUM: " << std::to_string((int8_t)(*tok)) << std::endl;
            std::cout << "Token: " << std::string((char*)tok, cur-tok) << std::endl;
            throw LFortranException("Unknown token");
        }
        end { return TokenType::TK_EOF; }
        whitespace { return TokenType::TK_WS; }
        ',' { return TokenType::TK_COMMA; }
        ':' { return TokenType::TK_COLLON; }
        '[' { return TokenType::TK_LEFT_SQUARE_BRACKET; }
        ']' { return TokenType::TK_RIGHT_SQUARE_BRACKET; }
        '{' { return TokenType::TK_LEFT_CURLY_BRACKET; }
        '}' { return TokenType::TK_RIGHT_CURLY_BRACKET; }
        'true' { return TokenType::TK_TRUE; }
        'false' { return TokenType::TK_FALSE; }
        'null' { return TokenType::TK_NULL; }
        number { return TokenType::TK_NUMBER; }
        '"' ('\\"'|[^"\x00])* '"' { return TokenType::TK_STRING; }
    */
}


} // namespace LFortran::json

and the full recursive-descent parser is:

#ifndef LFORTRAN_SRC_PARSER_JSON_H
#define LFORTRAN_SRC_PARSER_JSON_H

#include <iostream>
#include <string>

#include <lfortran/exception.h>
#include <lfortran/utils.h>
#include <lfortran/parser/parser.h>

namespace LFortran
{

namespace json {

enum TokenType {
    TK_LEFT_CURLY_BRACKET,
    TK_RIGHT_CURLY_BRACKET,
    TK_LEFT_SQUARE_BRACKET,
    TK_RIGHT_SQUARE_BRACKET,
    TK_WS,
    TK_COMMA,
    TK_COLLON,
    TK_STRING,
    TK_NUMBER,
    TK_TRUE,
    TK_FALSE,
    TK_NULL,
    TK_EOF
};

struct JSONTokenizer
{
    unsigned char *string_start;
    unsigned char *cur;
    unsigned char *tok;
    const std::string &input;

    JSONTokenizer(const std::string &input) : input{input} {
        string_start = (unsigned char*)(&input[0]);
        cur = string_start;
    }

    TokenType next_token();

    int run() {
        TokenType tt;
        int num_tokens = 0;
        while ((tt = next_token()) != TokenType::TK_EOF) {
            num_tokens++;
        }
        return num_tokens;
    }
};

struct JSONParser
{
    JSONTokenizer t;
    bool next_token_read=false;
    TokenType next_token_type;
    int count = 0;

    TokenType peek() {
        if (!next_token_read) {
            next_token_type = t.next_token();
            next_token_read = true;
        }
        return next_token_type;
    }

    TokenType read_token() {
        if (next_token_read) {
            next_token_read = false;
            return next_token_type;
        } else {
            return t.next_token();
        }
    }

    void accept_token(const TokenType tt_expected) {
        TokenType tt = read_token();
        if (tt != tt_expected) {
            throw LFortranException("Unexpected token");
        }
    }

    JSONParser(const std::string &input) : t{input} {}

    void parse_json() {
        parse_element();
    }

    void parse_element() {
        if (peek() == TokenType::TK_WS) read_token();
        parse_value();
        if (peek() == TokenType::TK_WS) read_token();
        count++;
    }

    void parse_elements() {
        parse_element();
        while (peek() == TokenType::TK_COMMA) {
            read_token(); // COMMA
            parse_element();
        }
    }

    void parse_member() {
        if (peek() == TokenType::TK_WS) read_token();
        accept_token(TokenType::TK_STRING);
        if (peek() == TokenType::TK_WS) read_token();
        accept_token(TokenType::TK_COLLON);
        parse_element();
    }

    void parse_members() {
        parse_member();
        while (peek() == TokenType::TK_COMMA) {
            read_token(); // COMMA
            parse_member();
        }
    }

    void parse_value() {
        TokenType next = peek();
        if (next == TokenType::TK_NULL) {
            read_token();
        } else if (next == TokenType::TK_FALSE) {
            read_token();
        } else if (next == TokenType::TK_TRUE) {
            read_token();
        } else if (next == TokenType::TK_NUMBER) {
            read_token();
        } else if (next == TokenType::TK_STRING) {
            read_token();
        } else if (next == TokenType::TK_LEFT_CURLY_BRACKET) {
            parse_object();
        } else if (next == TokenType::TK_LEFT_SQUARE_BRACKET) {
            parse_array();
        } else {
            std::cout << next << std::endl;
            throw LFortranException("Invalid token in parse_value()");
        }
    }

    void parse_array() {
        accept_token(TokenType::TK_LEFT_SQUARE_BRACKET);
        if (peek() == TokenType::TK_WS) read_token();
        if (peek() == TokenType::TK_RIGHT_SQUARE_BRACKET) {
            // empty array
        } else {
            parse_elements();
        }
        accept_token(TokenType::TK_RIGHT_SQUARE_BRACKET);
    }

    void parse_object() {
        accept_token(TokenType::TK_LEFT_CURLY_BRACKET);
        if (peek() == TokenType::TK_WS) read_token();
        if (peek() == TokenType::TK_RIGHT_CURLY_BRACKET) {
            // empty object
        } else {
            parse_members();
        }
        accept_token(TokenType::TK_RIGHT_CURLY_BRACKET);
    }

};

} // namespace json

} // namespace LFortran

#endif // LFORTRAN_SRC_PARSER_JSON_H
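
The parser is purely a validator: parse_json() walks the token stream and throws LFortranException on malformed input, while count counts the parsed elements. A small driver exercising both paths could look like this (my sketch, not from the post; the include path is assumed):

#include <iostream>
#include <string>

#include <lfortran/exception.h>
#include <lfortran/parser/json.h>  // the header above; exact path assumed

int main() {
    std::string ok  = R"({"name": "big.json", "sizes": [1, 2.5, 3], "valid": true})";
    std::string bad = R"({"a" 1})";  // missing colon

    LFortran::json::JSONParser p(ok);
    p.parse_json();  // succeeds silently
    std::cout << "elements parsed: " << p.count << std::endl;

    try {
        LFortran::json::JSONParser q(bad);
        q.parse_json();
    } catch (const LFortran::LFortranException &) {
        std::cout << "rejected invalid JSON" << std::endl;
    }
}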

As you can see, it parses everything, but it does not build an AST that you could query afterwards. Given that the above runs in roughly half the time of Python's json module, building such an AST while staying faster looks doable.
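
To make that concrete, here is one possible shape for such an AST, purely as my own sketch (none of these names come from the post): a tagged value type that the parse_* methods would return instead of void.

#include <cstddef>
#include <memory>
#include <string>
#include <utility>
#include <variant>
#include <vector>

// Hypothetical AST node; children are held behind unique_ptr so the type
// can refer to itself.
struct JSONValue {
    using Array  = std::vector<std::unique_ptr<JSONValue>>;
    using Object = std::vector<std::pair<std::string, std::unique_ptr<JSONValue>>>;

    std::variant<
        std::nullptr_t,   // null
        bool,             // true / false
        double,           // number
        std::string,      // string
        Array,            // [ ... ]
        Object            // { "key": value, ... }
    > data;
};

parse_value() would then return a JSONValue (the text of strings and numbers is available as the [tok, cur) range in the tokenizer), and parse_object()/parse_array() would collect their children into Object and Array.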

I don’t have time to actually do it, so I am posting what I have here.
