package json

import (
    "fmt"

    hcl "Havoc/pkg/profile/yaotl" // aliased so the hcl.Range / hcl.Pos references below resolve
    "github.com/apparentlymart/go-textseg/v13/textseg"
)

//go:generate stringer -type tokenType scanner.go
type tokenType rune

const (
    tokenBraceO  tokenType = '{'
    tokenBraceC  tokenType = '}'
    tokenBrackO  tokenType = '['
    tokenBrackC  tokenType = ']'
    tokenComma   tokenType = ','
    tokenColon   tokenType = ':'
    tokenKeyword tokenType = 'K'
    tokenString  tokenType = 'S'
    tokenNumber  tokenType = 'N'
    tokenEOF     tokenType = '␄'
    tokenInvalid tokenType = 0
    tokenEquals  tokenType = '=' // used only for reminding the user of JSON syntax
)

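// token is a single lexical unit produced by scan. As a small illustration
// (not an exhaustive description), scanning the four-byte input `true`
// yields one token whose Type is tokenKeyword and whose Bytes hold the
// literal text, followed by a zero-length tokenEOF marker.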
type token struct {
    Type  tokenType
    Bytes []byte
    Range hcl.Range
}

// scan returns the primary tokens for the given JSON buffer in sequence.
//
// The responsibility of this pass is to just mark the slices of the buffer
// as being of various types. It is lax in how it interprets the multi-byte
// token types keyword, string and number, preferring to capture erroneous
// extra bytes that we presume the user intended to be part of the token
// so that we can generate more helpful diagnostics in the parser.
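//
// As a rough sketch of the expected output (the example input here is
// illustrative, not taken from any test in this package), scanning
// `{"a": 1}` yields, in order: tokenBraceO, tokenString (`"a"`, quotes
// included), tokenColon, tokenNumber (`1`), tokenBraceC and finally a
// zero-length tokenEOF marker.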
func scan(buf []byte, start pos) []token {
    var tokens []token
    p := start
    for {
        if len(buf) == 0 {
            tokens = append(tokens, token{
                Type:  tokenEOF,
                Bytes: nil,
                Range: posRange(p, p),
            })
            return tokens
        }

        buf, p = skipWhitespace(buf, p)

        if len(buf) == 0 {
            tokens = append(tokens, token{
                Type:  tokenEOF,
                Bytes: nil,
                Range: posRange(p, p),
            })
            return tokens
        }

        start = p

        first := buf[0]
        switch {
        case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':' || first == '=':
            p.Pos.Column++
            p.Pos.Byte++
            tokens = append(tokens, token{
                Type:  tokenType(first),
                Bytes: buf[0:1],
                Range: posRange(start, p),
            })
            buf = buf[1:]
        case first == '"':
            var tokBuf []byte
            tokBuf, buf, p = scanString(buf, p)
            tokens = append(tokens, token{
                Type:  tokenString,
                Bytes: tokBuf,
                Range: posRange(start, p),
            })
        case byteCanStartNumber(first):
            var tokBuf []byte
            tokBuf, buf, p = scanNumber(buf, p)
            tokens = append(tokens, token{
                Type:  tokenNumber,
                Bytes: tokBuf,
                Range: posRange(start, p),
            })
        case byteCanStartKeyword(first):
            var tokBuf []byte
            tokBuf, buf, p = scanKeyword(buf, p)
            tokens = append(tokens, token{
                Type:  tokenKeyword,
                Bytes: tokBuf,
                Range: posRange(start, p),
            })
        default:
            tokens = append(tokens, token{
                Type:  tokenInvalid,
                Bytes: buf[:1],
                Range: start.Range(1, 1),
            })
            // If we've encountered an invalid byte then we might as well
            // stop scanning, since the parser won't proceed beyond this point.
            // We insert a synthetic EOF marker here to match the expectations
            // of consumers of this data structure.
            p.Pos.Column++
            p.Pos.Byte++
            tokens = append(tokens, token{
                Type:  tokenEOF,
                Bytes: nil,
                Range: posRange(p, p),
            })
            return tokens
        }
    }
}

func byteCanStartNumber(b byte) bool {
    switch b {
    // We are slightly more tolerant than JSON requires here since we
    // expect the parser will make a stricter interpretation of the
    // number bytes, but we specifically don't allow 'e' or 'E' here
    // since we want the scanner to treat that as the start of an
    // invalid keyword instead, to produce more intelligible error messages.
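    // For example, given the (invalid) input `e5`, the leading 'e' becomes
    // a keyword token rather than part of a number token, and the parser
    // can then complain about an unrecognized keyword.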
    case '-', '+', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
        return true
    default:
        return false
    }
}

func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
    // The scanner doesn't check that the sequence of digit-ish bytes is
    // in a valid order. The parser must do this when decoding a number
    // token.
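    // As an illustrative consequence, a malformed sequence such as `1..2e`
    // is still captured here as a single number token; rejecting it is the
    // parser's job.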
    var i int
    p := start
Byte:
    for i = 0; i < len(buf); i++ {
        switch buf[i] {
        case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
            p.Pos.Byte++
            p.Pos.Column++
        default:
            break Byte
        }
    }
    return buf[:i], buf[i:], p
}

func byteCanStartKeyword(b byte) bool {
    switch {
    // We allow any sequence of alphabetical characters here, even though
    // JSON is more constrained, so that we can collect what we presume
    // the user intended to be a single keyword and then check its validity
    // in the parser, where we can generate better diagnostics.
    // So e.g. we want to be able to say:
    //   unrecognized keyword "True". Did you mean "true"?
    case isAlphabetical(b):
        return true
    default:
        return false
    }
}

func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
    var i int
    p := start
Byte:
    for i = 0; i < len(buf); i++ {
        b := buf[i]
        switch {
        case isAlphabetical(b) || b == '_':
            p.Pos.Byte++
            p.Pos.Column++
        default:
            break Byte
        }
    }
    return buf[:i], buf[i:], p
}

func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
    // The scanner doesn't validate correct use of escapes, etc. It pays
    // attention to escapes only for the purpose of identifying the closing
    // quote character. It's the parser's responsibility to do proper
    // validation.
    //
    // The scanner also doesn't specifically detect unterminated string
    // literals, though they can be identified in the parser by checking if
    // the final byte in a string token is the double-quote character.
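    //
    // For example, scanning the input `"a\"b"` should return the whole
    // six-byte sequence, escaped quote included, as a single string token,
    // leaving the escape sequence itself for the parser to decode.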

    // Skip the opening quote symbol
    i := 1
    p := start
    p.Pos.Byte++
    p.Pos.Column++
    escaping := false
Byte:
    for i < len(buf) {
        b := buf[i]

        switch {
        case b == '\\':
            escaping = !escaping
            p.Pos.Byte++
            p.Pos.Column++
            i++
        case b == '"':
            p.Pos.Byte++
            p.Pos.Column++
            i++
            if !escaping {
                break Byte
            }
            escaping = false
        case b < 32:
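            // Control characters can never appear literally in a valid
            // JSON string, so stop the token here and let the parser
            // report the problem.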
            break Byte
        default:
            // Advance by one grapheme cluster, so that we consider each
            // grapheme to be a "column".
            // Ignoring error because this scanner cannot produce errors.
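            // For example, a letter followed by a combining accent is
            // presumably a single grapheme cluster here, so Column grows
            // by one while Byte grows by the UTF-8 length of both runes.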
            advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)

            p.Pos.Byte += advance
            p.Pos.Column++
            i += advance

            escaping = false
        }
    }
    return buf[:i], buf[i:], p
}

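// skipWhitespace consumes any leading space, newline, carriage return and
// tab bytes from buf, returning the remaining buffer and the position
// advanced past whatever was skipped.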
func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
    var i int
    p := start
Byte:
    for i = 0; i < len(buf); i++ {
        switch buf[i] {
        case ' ':
            p.Pos.Byte++
            p.Pos.Column++
        case '\n':
            p.Pos.Byte++
            p.Pos.Column = 1
            p.Pos.Line++
        case '\r':
            // For the purpose of line/column counting we consider a
            // carriage return to take up no space, assuming that it will
            // be paired up with a newline (on Windows, for example) that
            // will account for both of them.
            p.Pos.Byte++
        case '\t':
            // We arbitrarily count a tab as if it were two spaces, because
            // we need to choose _some_ number here. This means any system
            // that renders code on-screen with markers must itself treat
            // tabs as a pair of spaces for rendering purposes, or else use
            // the byte offset to derive its own column position.
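            // As a small worked example under that convention, skipping
            // over two leading tabs from column 1 leaves the position at
            // column 5 while the byte offset advances by only 2.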
            p.Pos.Byte++
            p.Pos.Column += 2
        default:
            break Byte
        }
    }
    return buf[i:], p
}

type pos struct {
    Filename string
    Pos      hcl.Pos
}

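// Range constructs an hcl.Range that starts at the receiver's position and
// extends byteLen bytes and charLen columns beyond it. For example, calling
// p.Range(1, 1) on a position at byte 10, column 4 yields a one-byte range
// ending at byte 11, column 5 on the same line.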
func (p *pos) Range(byteLen, charLen int) hcl.Range {
    start := p.Pos
    end := p.Pos
    end.Byte += byteLen
    end.Column += charLen
    return hcl.Range{
        Filename: p.Filename,
        Start:    start,
        End:      end,
    }
}

func posRange(start, end pos) hcl.Range {
    return hcl.Range{
        Filename: start.Filename,
        Start:    start.Pos,
        End:      end.Pos,
    }
}

func (t token) GoString() string {
    return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)
}

func isAlphabetical(b byte) bool {
    return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
}
