# src/Parser.jl

module Parser  # JSON

using Mmap
using ..Common
import Parsers

"""
Like `isspace`, but works on bytes and includes only the four whitespace
characters defined by the JSON standard: space, tab, line feed, and carriage
return.
"""
function isjsonspace(b::UInt8)
    # Membership test against the four JSON whitespace bytes.
    return b in (SPACE, TAB, NEWLINE, RETURN)
end

"""
Like `isdigit`, but for bytes.
"""
function isjsondigit(b::UInt8)
    # Inclusive range check between the bytes for '0' and '9'.
    return b ≥ DIGIT_ZERO && b ≤ DIGIT_NINE
end

# Common supertype of the parser states defined in this module
# (`MemoryParserState` and `StreamingParserState`).
abstract type ParserState end

# Parser state for input that is held entirely in memory as a `String`.
mutable struct MemoryParserState <: ParserState
    utf8::String  # the input text; accessed by code unit (byte)
    s::Int        # byte index of the current position (`parse` starts it at 1)
end

# Indexing a MemoryParserState yields the i-th code unit (byte) of its string,
# so the state can be read like a Vector{UInt8} without copying the data.
Base.@propagate_inbounds function Base.getindex(state::MemoryParserState, i::Int)
    return codeunit(state.utf8, i)
end

# The length of the state is the size in bytes of the underlying string.
function Base.length(state::MemoryParserState)
    return sizeof(state.utf8)
end

# Parser state that pulls its input one byte at a time from an `IO` stream.
mutable struct StreamingParserState{T <: IO} <: ParserState
    io::T                     # the stream bytes are read from
    cur::UInt8                # the most recently read byte
    used::Bool                # `true` once `cur` has been consumed; the next `byteat` then reads a new byte
    utf8array::Vector{UInt8}  # scratch buffer in which `parse_number` collects the bytes of a number
end
# Initial state: `used = true` forces the first `byteat` to read from `io`;
# the scratch buffer starts out empty.
StreamingParserState(io::IO) = StreamingParserState(io, 0x00, true, UInt8[])

# Field-less type whose parameters carry the parse options: the dictionary type
# used for objects, the type used for integers, whether NaN/Infinity literals
# are accepted, and the value returned for JSON `null`.
struct ParserContext{DictType, IntType, AllowNanInf, NullValue} end

"""
Return the byte at the current position of the `ParserState`. If there is no
byte (that is, the `ParserState` is done), then an error is thrown that the
input ended unexpectedly.
"""
@inline function byteat(ps::MemoryParserState)
    # Guard clause: running off the end of the input is reported as an error.
    hasmore(ps) || _error(E_UNEXPECTED_EOF, ps)
    # `hasmore` has just established that `ps.s` does not exceed the length.
    return @inbounds ps[ps.s]
end

@inline function byteat(ps::StreamingParserState)
    # The buffered byte has not been consumed yet: hand it out again.
    ps.used || return ps.cur

    # Mark the slot as fresh *before* checking for end-of-stream: the streaming
    # `_error` method calls `current`, which calls back into `byteat`, and must
    # then get the buffered byte instead of re-entering this branch.
    ps.used = false
    eof(ps.io) && _error(E_UNEXPECTED_EOF, ps)
    ps.cur = read(ps.io, UInt8)
    return ps.cur
end

"""
Like `byteat`, but with no special bounds check and error message. Useful when
a current byte is known to exist.
"""
@inline function current(ps::MemoryParserState)
    # Plain indexed read, without the end-of-input error of `byteat`.
    return ps[ps.s]
end
@inline function current(ps::StreamingParserState)
    # A stream has no cheaper access path, so this defers to `byteat`.
    return byteat(ps)
end

"""
Require the current byte of the `ParserState` to be the given byte, and then
skip past that byte. Otherwise, an error is thrown.
"""
@inline function skip!(ps::ParserState, c::UInt8)
    # Fail unless the current byte is the expected one ...
    byteat(ps) == c || _error_expected_char(c, ps)
    # ... and otherwise step past it.
    return incr!(ps)
end
# Kept out of line so that building the message stays off the hot path.
@noinline function _error_expected_char(c, ps)
    return _error("Expected '$(Char(c))' here", ps)
end

# Require each of the given bytes in turn, skipping past every one of them.
function skip!(ps::ParserState, cs::UInt8...)
    foreach(expected -> skip!(ps, expected), cs)
end

"""
Move the `ParserState` to the next byte.
"""
@inline function incr!(ps::MemoryParserState)
    # Advance the byte index by one and return the new index.
    ps.s += 1
    return ps.s
end
@inline function incr!(ps::StreamingParserState)
    # Flag the buffered byte as consumed; the next `byteat` reads a new one.
    ps.used = true
    return true
end

"""
Move the `ParserState` to the next byte, and return the value at the byte before
the advancement. If the `ParserState` is already done, then throw an error.
"""
@inline function advance!(ps::ParserState)
    byte = byteat(ps)  # throws if the input is exhausted
    incr!(ps)
    return byte
end

"""
Return `true` if there is a current byte, and `false` if all bytes have been
exhausted.
"""
@inline function hasmore(ps::MemoryParserState)
    return ps.s ≤ length(ps)
end
@inline function hasmore(ps::StreamingParserState)
    # no more now ≠ no more ever
    return true
end

"""
Remove as many whitespace bytes as possible from the `ParserState` starting from
the current byte.
"""
@inline function chomp_space!(ps::ParserState)
    @inbounds while true
        # Stop at the end of the input or at the first non-whitespace byte.
        (hasmore(ps) && isjsonspace(current(ps))) || break
        incr!(ps)
    end
end


# Used for line counts.
#
# Count the occurrences of `needle` in `haystack` that start strictly before
# byte index `_end`. `_end` is a byte (code unit) index, so the string is
# walked with `pairs`, which yields each character together with its byte
# index; counting characters by ordinal instead would drift after the first
# multi-byte character.
function _count_before(haystack::AbstractString, needle::Char, _end::Int)
    count = 0
    for (i, c) in pairs(haystack)
        i >= _end && break
        count += (c == needle)
    end
    return count
end


# Throws an error message with an indicator to the source.
#
# The message is followed by a line count (number of newlines before the
# current position), an excerpt of the input around the current position, and
# a caret pointing at the current position within that excerpt.
@noinline function _error(message::AbstractString, ps::MemoryParserState)
    orig = ps.utf8
    lines = _count_before(orig, '\n', ps.s)
    # `ps.s` is a byte offset and may sit one past the end of the input.
    pos = clamp(ps.s, 1, ncodeunits(orig) + 1)
    # A byte offset plus or minus a constant can fall inside a multi-byte
    # character; snap both ends of the window to character boundaries so that
    # slicing cannot throw a StringIndexError and mask the real parse error.
    li = thisind(orig, (pos > 20) ? pos - 9 : 1)        # Left index
    ri = thisind(orig, min(lastindex(orig), pos + 20))  # Right index
    # Replace all special multi-line/multi-space characters with a space. This
    # is done after slicing so the byte indices above refer to the original
    # text.
    excerpt = replace(orig[li:ri], r"[\b\f\n\r\t\s]" => " ")
    # Pad the caret by the number of characters (not bytes) preceding `pos`.
    error(message *
      "\nLine: " * string(lines) *
      "\nAround: ..." * excerpt * "..." *
      "\n           " * (" " ^ length(orig, li, pos - 1)) * "^\n"
    )
end

# Streaming variant: no surrounding text is available, so only the numeric
# value of the current byte is reported alongside the message.
@noinline function _error(message::AbstractString, ps::StreamingParserState)
    byte = current(ps)
    error(string(message, "\n ...when parsing byte with value '", byte, "'"))
end

# PARSING

"""
Given a `ParserState`, after possibly any amount of whitespace, return the next
parseable value.
"""
function parse_value(pc::ParserContext, ps::ParserState)
    chomp_space!(ps)

    # The first significant byte decides which kind of value follows.
    @inbounds lead = byteat(ps)
    lead == STRING_DELIM && return parse_string(ps)
    (isjsondigit(lead) || lead == MINUS_SIGN) && return parse_number(pc, ps)
    lead == OBJECT_BEGIN && return parse_object(pc, ps)
    lead == ARRAY_BEGIN && return parse_array(pc, ps)
    # Anything else must be one of the bare-word constants.
    return parse_jsconstant(pc, ps)
end

"""
    parse_jsconstant(pc::ParserContext, ps::ParserState)

Parse one of the bare-word constants `true`, `false` or `null`, and, when the
context's `AllowNanInf` parameter is `true`, also `NaN` and `Infinity`. `null`
yields the context's `NullValue` parameter. Any other input raises an error.
"""
function parse_jsconstant(::ParserContext{<:Any,<:Any,AllowNanInf,NullValue},
                          ps::ParserState) where {AllowNanInf,NullValue}
    lead = advance!(ps)
    if lead == LATIN_T      # true
        skip!(ps, LATIN_R, LATIN_U, LATIN_E)
        return true
    end
    if lead == LATIN_F      # false
        skip!(ps, LATIN_A, LATIN_L, LATIN_S, LATIN_E)
        return false
    end
    if lead == LATIN_N      # null
        skip!(ps, LATIN_U, LATIN_L, LATIN_L)
        return NullValue
    end
    if AllowNanInf
        if lead == LATIN_UPPER_N      # NaN
            skip!(ps, LATIN_A, LATIN_UPPER_N)
            return NaN
        elseif lead == LATIN_UPPER_I  # Infinity
            skip!(ps, LATIN_N, LATIN_F, LATIN_I, LATIN_N, LATIN_I, LATIN_T, LATIN_Y)
            return Inf
        end
    end
    _error(E_UNEXPECTED_CHAR, ps)
end

"""
    parse_array(pc::ParserContext, ps::ParserState)

Parse a JSON array whose opening bracket is the current byte and return its
elements as a `Vector{Any}`. The parser is left just past the closing bracket.
"""
function parse_array(pc::ParserContext, ps::ParserState)
    items = Any[]
    @inbounds incr!(ps)  # step over the opening '['
    chomp_space!(ps)

    # An empty array needs no element loop.
    if byteat(ps) == ARRAY_END
        @inbounds incr!(ps)
        return items
    end

    @inbounds while true
        push!(items, parse_value(pc, ps))
        chomp_space!(ps)
        if byteat(ps) == ARRAY_END
            break
        end
        skip!(ps, DELIMITER)
    end

    @inbounds incr!(ps)  # step over the closing ']'
    return items
end


"""
    parse_object(pc::ParserContext, ps::ParserState)

Parse a JSON object whose opening brace is the current byte into a new
instance of the context's `DictType`. Keys are converted to the dictionary's
key type (through `Symbol` when that type is `Symbol`). The parser is left just
past the closing brace.
"""
function parse_object(pc::ParserContext{DictType,<:Real,<:Any}, ps::ParserState) where DictType
    obj = DictType()
    keyT = keytype(typeof(obj))

    incr!(ps)  # step over the opening '{'
    chomp_space!(ps)

    # An empty object needs no member loop.
    if byteat(ps) == OBJECT_END
        incr!(ps)
        return obj
    end

    @inbounds while true
        # Key: must be a string.
        chomp_space!(ps)
        if byteat(ps) != STRING_DELIM
            _error(E_BAD_KEY, ps)
        end
        rawkey = parse_string(ps)
        chomp_space!(ps)
        skip!(ps, SEPARATOR)

        # Value.
        value = parse_value(pc, ps)
        chomp_space!(ps)

        typedkey = keyT === Symbol ? Symbol(rawkey) : convert(keyT, rawkey)
        obj[typedkey] = value

        byteat(ps) == OBJECT_END && break
        skip!(ps, DELIMITER)
    end

    incr!(ps)  # step over the closing '}'
    return obj
end


# True when `c` lies in the UTF-16 surrogate range 0xd800 through 0xdfff.
function utf16_is_surrogate(c::UInt16)
    return 0xd800 ≤ c ≤ 0xdfff
end

# Combine a lead and a trail surrogate into the supplementary-plane character
# they encode. Subtracting 0xd7f7 from the lead before shifting folds the
# offset corrections for both halves into a single constant.
function utf16_get_supplementary(lead::UInt16, trail::UInt16)
    high = UInt32(lead - 0xd7f7) << 10
    return Char(high + trail)
end

"""
    read_four_hex_digits!(ps::ParserState) -> UInt16

Consume four bytes that must each be a hexadecimal digit (either case) and
return the 16-bit value they spell. A non-hex byte raises an error.
"""
function read_four_hex_digits!(ps::ParserState)
    local acc::UInt16 = 0

    for _ in 1:4
        b = advance!(ps)
        nibble = if isjsondigit(b)
            b - DIGIT_ZERO
        elseif LATIN_A ≤ b ≤ LATIN_F
            b - (LATIN_A - UInt8(10))
        elseif LATIN_UPPER_A ≤ b ≤ LATIN_UPPER_F
            b - (LATIN_UPPER_A - UInt8(10))
        else
            _error(E_BAD_ESCAPE, ps)
        end
        # Shift the digits read so far up by one hex place and append this one.
        acc = (acc << 4) + nibble
    end

    acc
end

"""
    read_unicode_escape!(ps) -> Char

Read the four hex digits of a unicode escape (the backslash and the letter u
have already been consumed) and return the character it denotes. If the value
is a UTF-16 lead surrogate, a second unicode escape holding the trail surrogate
must follow immediately, and the pair is combined into one supplementary-plane
character. A trail surrogate in lead position, or a second escape that is not
a trail surrogate, raises the bad-escape error.
"""
function read_unicode_escape!(ps)
    u1 = read_four_hex_digits!(ps)
    utf16_is_surrogate(u1) || return Char(u1)

    # Only 0xd800-0xdbff may start a pair; 0xdc00-0xdfff are trail surrogates.
    u1 < 0xdc00 || _error(E_BAD_ESCAPE, ps)

    skip!(ps, BACKSLASH)
    skip!(ps, LATIN_U)
    u2 = read_four_hex_digits!(ps)
    # Without this check an arbitrary second unit would be folded into a wrong
    # (or unrepresentable) code point.
    0xdc00 ≤ u2 ≤ 0xdfff || _error(E_BAD_ESCAPE, ps)

    return utf16_get_supplementary(u1, u2)
end

"""
    parse_string(ps::ParserState) -> String

Parse a JSON string whose opening quote is the current byte and return its
decoded contents. Escape sequences are translated; raw control bytes (below the
space byte) and unknown escapes raise an error. The parser is left just past
the closing quote.
"""
function parse_string(ps::ParserState)
    buf = IOBuffer()
    incr!(ps)  # step over the opening quote
    while true
        byte = advance!(ps)

        if byte == BACKSLASH
            code = advance!(ps)
            if code == LATIN_U  # Unicode escape
                write(buf, read_unicode_escape!(ps))
            else
                # Single-character escape; 0x00 marks "not in the table".
                mapped = get(ESCAPES, code, 0x00)
                mapped == 0x00 && _error(E_BAD_ESCAPE, ps)
                write(buf, mapped)
            end
        elseif byte < SPACE
            _error(E_BAD_CONTROL, ps)
        elseif byte == STRING_DELIM
            return String(take!(buf))
        else
            write(buf, byte)
        end
    end
end

"""
Return `true` if the given bytes vector, starting at `from` and ending at `to`,
has a leading zero.
"""
function hasleadingzero(bytes, from::Int, to::Int)
    # `to` is inclusive. An optional minus sign is skipped first; a leading
    # zero is then a '0' that is directly followed by another digit. This also
    # covers two-byte inputs such as "01", where the digit after the zero is
    # the last byte of the span.
    start = bytes[from] == UInt8('-') ? from + 1 : from
    return start < to && bytes[start] == DIGIT_ZERO && isjsondigit(bytes[start + 1])
end

"""
Parse a float from the given bytes, starting at `from` and ending at `to`
(inclusive). Bytes enclosed should all be ASCII characters. Return `nothing` if
the enclosed bytes are not, in their entirety, a valid float.
"""
# In-memory parser states delegate to their underlying string.
function float_from_bytes(bytes::MemoryParserState, from::Int, to::Int)
    return float_from_bytes(bytes.utf8, from, to)
end

function float_from_bytes(bytes::Union{String, Vector{UInt8}}, from::Int, to::Int)::Union{Float64,Nothing}
    # `tryparse` would be the natural choice, but the whole span must be
    # consumed and the version in Parsers does not enforce that, so the
    # lower-level `xparse` is used and the consumed length checked by hand.
    expected = to - from + 1
    x, code, vpos, vlen, tlen = Parsers.xparse(Float64, bytes, from, to, Parsers.OPTIONS)
    (Parsers.ok(code) && vlen >= expected) || return nothing
    return x::Float64
end

"""
Parse an integer from the given bytes vector, starting at `from` and ending at
`to` (inclusive). Bytes enclosed should all be ASCII characters.
"""
function int_from_bytes(pc::ParserContext{<:Any,IntType,<:Any},
                        ps::ParserState,
                        bytes,
                        from::Int,
                        to::Int) where IntType <: Real
    # A leading minus sign is consumed here and applied to the result at the end.
    @inbounds isnegative = bytes[from] == MINUS_SIGN ? (from += 1; true) : false
    num = IntType(0)
    @inbounds for i in from:to
        c = bytes[i]
        # Unsigned subtraction: bytes below '0' wrap around to large values, so
        # a single comparison against ten rejects everything that is not 0-9.
        dig = c - DIGIT_ZERO
        if dig < 0x0a
            num = IntType(10) * num + IntType(dig)
        else
            _error(E_BAD_NUMBER, ps)
        end
    end
    ifelse(isnegative, -num, num)
end

"""
    number_from_bytes(pc, ps, isint, bytes, from, to)

Convert the bytes `from:to` of `bytes` into a number: an integer of the
context's integer type when `isint` is `true`, otherwise a float. Leading
zeros, a lone minus sign and malformed floats raise an error.
"""
function number_from_bytes(pc::ParserContext,
                           ps::ParserState,
                           isint::Bool,
                           bytes,
                           from::Int,
                           to::Int)
    @inbounds if hasleadingzero(bytes, from, to)
        _error(E_LEADING_ZERO, ps)
    end

    if !isint
        parsed = float_from_bytes(bytes, from, to)
        return parsed === nothing ? _error(E_BAD_NUMBER, ps) : parsed
    end

    # A minus sign on its own is not a number.
    @inbounds if to == from && bytes[from] == MINUS_SIGN
        _error(E_BAD_NUMBER, ps)
    end
    return int_from_bytes(pc, ps, bytes, from, to)
end


"""
    parse_number(pc::ParserContext, ps::ParserState)

Parse a JSON number starting at the current byte. The bytes that can belong to
a number (digits, signs, the exponent letters and the decimal point) are
collected into the state's scratch buffer `ps.utf8array` and then converted; the
buffer is emptied again before returning. When the context's `AllowNanInf`
parameter is `true`, an optionally negated `Infinity` is accepted as well.
"""
function parse_number(pc::ParserContext{<:Any,<:Any,AllowNanInf}, ps::ParserState) where AllowNanInf
    # Determine the end of the floating point by skipping past ASCII values
    # 0-9, +, -, e, E, and .
    number = ps.utf8array
    isint = true
    negative = false

    c = current(ps)

    # Parse and keep track of initial minus sign (for parsing -Infinity)
    if AllowNanInf && c == MINUS_SIGN
        push!(number, UInt8(c)) # save in case the next character is a number
        negative = true
        incr!(ps)
    end

    @inbounds while hasmore(ps)
        c = current(ps)

        if isjsondigit(c) || c == MINUS_SIGN
            push!(number, UInt8(c))
        elseif c in (PLUS_SIGN, LATIN_E, LATIN_UPPER_E, DECIMAL_POINT)
            push!(number, UInt8(c))
            isint = false
        elseif AllowNanInf && c == LATIN_UPPER_I && length(number) == (negative ? 1 : 0)
            # `Infinity` is only valid as the whole value, optionally preceded
            # by the single minus sign recorded above. If digits were already
            # collected, the 'I' is left in the input and the digits are parsed
            # as a number instead of being silently discarded.
            infinity = parse_jsconstant(pc, ps)
            resize!(number, 0)
            return (negative ? -infinity : infinity)
        else
            break
        end

        incr!(ps)
    end

    v = number_from_bytes(pc, ps, isint, number, 1, length(number))
    resize!(number, 0)
    return v
end


# Fallback for nontypes -- functions etc. are passed through untouched.
unparameterize_type(x) = x

# For a type, narrow it to its intersection with AbstractDict{String, Any}
# (which, for example, fills in the parameters of a bare `Dict`). When the
# intersection is empty the type is returned unchanged.
function unparameterize_type(T::Type)
    narrowed = typeintersect(T, AbstractDict{String, Any})
    if narrowed <: Union{}
        return T
    end
    return narrowed
end

# Workaround for slow dynamic dispatch for creating objects: one context is
# built once and reused whenever the requested options match it.
const DEFAULT_PARSERCONTEXT = ParserContext{Dict{String, Any}, Int64, false, nothing}()

# Return the `ParserContext` instance that encodes the given options.
function _get_parsercontext(dicttype, inttype, allownan, null)
    # Every option, including `null`, has to match the shared context;
    # otherwise a custom `null` value would be silently replaced by `nothing`.
    if dicttype == Dict{String, Any} && inttype == Int64 && !allownan && null === nothing
        DEFAULT_PARSERCONTEXT
    else
        ParserContext{unparameterize_type(dicttype), inttype, allownan, null}.instance
    end
end

"""
    parse(str::AbstractString;
          dicttype=Dict{String,Any},
          inttype::Type{<:Real}=Int64,
          allownan::Bool=true,
          null=nothing)

Parses the given JSON string into corresponding Julia types.

Keyword arguments:
  • dicttype: Associative type to use when parsing JSON objects (default: Dict{String, Any})
  • inttype: Real number type to use when parsing JSON numbers that can be parsed
             as integers (default: Int64)
  • allownan: allow parsing of NaN, Infinity, and -Infinity (default: true)
  • null: value to use for parsed JSON `null` values (default: `nothing`)
"""
function parse(str::AbstractString;
               dicttype=Dict{String,Any},
               inttype::Type{<:Real}=Int64,
               allownan::Bool=true,
               null=nothing)
    ctx = _get_parsercontext(dicttype, inttype, allownan, null)
    state = MemoryParserState(str, 1)
    result = parse_value(ctx, state)
    # Only whitespace may follow the top-level value.
    chomp_space!(state)
    hasmore(state) && _error(E_EXPECTED_EOF, state)
    return result
end

"""
    parse(io::IO;
          dicttype=Dict{String,Any},
          inttype::Type{<:Real}=Int64,
          allownan::Bool=true,
          null=nothing)

Parses JSON from the given IO stream into corresponding Julia types.

Keyword arguments:
  • dicttype: Associative type to use when parsing JSON objects (default: Dict{String, Any})
  • inttype: Real number type to use when parsing JSON numbers that can be parsed
             as integers (default: Int64)
  • allownan: allow parsing of NaN, Infinity, and -Infinity (default: true)
  • null: value to use for parsed JSON `null` values (default: `nothing`)
"""
function parse(io::IO;
               dicttype=Dict{String,Any},
               inttype::Type{<:Real}=Int64,
               allownan::Bool=true,
               null=nothing)
    ctx = _get_parsercontext(dicttype, inttype, allownan, null)
    # Exactly one value is read; whatever follows it stays in the stream.
    return parse_value(ctx, StreamingParserState(io))
end

"""
    parsefile(filename::AbstractString;
              dicttype=Dict{String, Any},
              inttype::Type{<:Real}=Int64,
              allownan::Bool=true,
              null=nothing,
              use_mmap::Bool=true)

Convenience function to parse JSON from the given file into corresponding Julia types.

Keyword arguments:
  • dicttype: Associative type to use when parsing JSON objects (default: Dict{String, Any})
  • inttype: Real number type to use when parsing JSON numbers that can be parsed
             as integers (default: Int64)
  • allownan: allow parsing of NaN, Infinity, and -Infinity (default: true)
  • null: value to use for parsed JSON `null` values (default: `nothing`)
  • use_mmap: use mmap when opening the file (default: true)
"""
function parsefile(filename::AbstractString;
                   dicttype=Dict{String, Any},
                   inttype::Type{<:Real}=Int64,
                   null=nothing,
                   allownan::Bool=true,
                   use_mmap::Bool=true)
    sz = filesize(filename)
    open(filename) do io
        # Either map the file into memory or read it in one go, then hand the
        # text to the string parser.
        text = if use_mmap
            String(Mmap.mmap(io, Vector{UInt8}, sz))
        else
            read(io, String)
        end
        parse(text; dicttype=dicttype, inttype=inttype, allownan=allownan, null=null)
    end
end

# Efficient implementations of some of the above for in-memory parsing
include("specialized.jl")

end  # module Parser