From 8f8d7306503489cb9a72fe57e16540be9d386e9b Mon Sep 17 00:00:00 2001 From: Masaaki Goshima Date: Sun, 3 Nov 2024 02:11:50 +0900 Subject: [PATCH] Fix parsing of document (#501) * fix parsing of document --- lexer/lexer_test.go | 303 ++++++++++++++++++++++++++++++++++++++++++ parser/parser_test.go | 9 ++ scanner/context.go | 132 +++++++++++++++--- scanner/scanner.go | 157 +++++++++++++--------- 4 files changed, 518 insertions(+), 83 deletions(-) diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go index 309403f1..d4eec756 100644 --- a/lexer/lexer_test.go +++ b/lexer/lexer_test.go @@ -1850,6 +1850,305 @@ a: > }, }, }, + { + YAML: ` +s: > + 1s +`, + Tokens: token.Tokens{ + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "s", + Origin: "\ns", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.FoldedType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockScalarIndicator, + Value: ">", + Origin: " >\n", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "1s\n", + Origin: " 1s\n", + }, + }, + }, + { + YAML: ` +s: >1 + 1s +`, + Tokens: token.Tokens{ + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "s", + Origin: "\ns", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.FoldedType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockScalarIndicator, + Value: ">1", + Origin: " >1\n", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: " 1s\n", + Origin: " 1s\n", + }, + }, + }, + { + YAML: ` +s: >+2 + 1s +`, + Tokens: token.Tokens{ + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "s", + Origin: "\ns", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.FoldedType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockScalarIndicator, + Value: ">+2", + Origin: " >+2\n", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: " 1s\n", + Origin: " 1s\n", + }, + }, + }, + { + YAML: ` +s: >-3 + 1s +`, + Tokens: token.Tokens{ + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "s", + Origin: "\ns", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.FoldedType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockScalarIndicator, + Value: ">-3", + Origin: " >-3\n", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: " 1s", + Origin: " 1s\n", + }, + }, + }, + { + YAML: ` +s: > + 1s + 2s +`, + Tokens: token.Tokens{ + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + 
Value: "s", + Origin: "\ns", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.FoldedType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockScalarIndicator, + Value: ">", + Origin: " >\n", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "1s 2s\n", + Origin: " 1s\n 2s\n", + }, + }, + }, + { + YAML: ` +s: > + 1s + 2s + 3s +`, + Tokens: token.Tokens{ + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "s", + Origin: "\ns", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.FoldedType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockScalarIndicator, + Value: ">", + Origin: " >\n", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "1s\n 2s\n3s\n", + Origin: " 1s\n 2s\n 3s\n", + }, + }, + }, + { + YAML: ` +s: > + 1s + 2s + 3s + 4s + 5s +`, + Tokens: token.Tokens{ + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "s", + Origin: "\ns", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.FoldedType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockScalarIndicator, + Value: ">", + Origin: " >\n", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "1s\n 2s\n 3s\n4s 5s\n", + Origin: " 1s\n 2s\n 3s\n 4s\n 5s\n", + }, + }, + }, + { + YAML: ` +s: >-3 + 1s + 2s + 3s + 4s + 5s +`, + Tokens: token.Tokens{ + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: "s", + Origin: "\ns", + }, + { + Type: token.MappingValueType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockStructureIndicator, + Value: ":", + Origin: ":", + }, + { + Type: token.FoldedType, + CharacterType: token.CharacterTypeIndicator, + Indicator: token.BlockScalarIndicator, + Value: ">-3", + Origin: " >-3\n", + }, + { + Type: token.StringType, + CharacterType: token.CharacterTypeMiscellaneous, + Indicator: token.NotIndicator, + Value: " 1s\n 2s\n 3s\n 4s\n 5s", + Origin: " 1s\n 2s\n 3s\n 4s\n 5s\n", + }, + }, + }, } for _, test := range tests { t.Run(test.YAML, func(t *testing.T) { @@ -2464,6 +2763,10 @@ a: |invalid`, name: "invalid document number", src: ">\n1", }, + { + name: "invalid document header option number", + src: "a: >3\n 1", + }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { diff --git a/parser/parser_test.go b/parser/parser_test.go index cf77aa2f..6c824a00 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -1126,6 +1126,15 @@ b: - 2 1 | | > 2 | 1 ^ +`, + }, + { + "a: >3\n 1", + ` +[2:3] found invalid token + 1 | a: >3 +> 2 | 1 + ^ `, }, } diff --git a/scanner/context.go b/scanner/context.go index 54dc0b79..24b96c4d 100644 --- a/scanner/context.go +++ b/scanner/context.go @@ -1,6 +1,9 @@ package scanner import ( + "fmt" + "strconv" + "strings" "sync" 
"github.com/goccy/go-yaml/token" @@ -8,18 +11,22 @@ import ( // Context context at scanning type Context struct { - idx int - size int - notSpaceCharPos int - notSpaceOrgCharPos int - src []rune - buf []rune - obuf []rune - tokens token.Tokens - isRawFolded bool - isLiteral bool - isFolded bool - literalOpt string + idx int + size int + notSpaceCharPos int + notSpaceOrgCharPos int + src []rune + buf []rune + obuf []rune + tokens token.Tokens + isRawFolded bool + isLiteral bool + isFolded bool + docOpt string + docFirstLineIndentColumn int + docPrevLineIndentColumn int + docLineIndentColumn int + docFoldedNewLine bool } var ( @@ -52,7 +59,11 @@ func (c *Context) clear() { c.isRawFolded = false c.isLiteral = false c.isFolded = false - c.literalOpt = "" + c.docOpt = "" + c.docFirstLineIndentColumn = 0 + c.docLineIndentColumn = 0 + c.docPrevLineIndentColumn = 0 + c.docFoldedNewLine = false } func (c *Context) reset(src []rune) { @@ -64,7 +75,7 @@ func (c *Context) reset(src []rune) { c.isRawFolded = false c.isLiteral = false c.isFolded = false - c.literalOpt = "" + c.docOpt = "" } func (c *Context) resetBuffer() { @@ -74,11 +85,91 @@ func (c *Context) resetBuffer() { c.notSpaceOrgCharPos = 0 } -func (c *Context) breakLiteral() { +func (c *Context) breakDocument() { c.isLiteral = false c.isRawFolded = false c.isFolded = false - c.literalOpt = "" + c.docOpt = "" + c.docFirstLineIndentColumn = 0 + c.docLineIndentColumn = 0 + c.docPrevLineIndentColumn = 0 + c.docFoldedNewLine = false +} + +func (c *Context) updateDocumentIndentColumn() { + indent := c.docFirstLineIndentColumnByDocOpt() + if indent > 0 { + c.docFirstLineIndentColumn = indent + 1 + } +} + +func (c *Context) docFirstLineIndentColumnByDocOpt() int { + trimmed := strings.TrimPrefix(c.docOpt, "-") + trimmed = strings.TrimPrefix(trimmed, "+") + i, _ := strconv.ParseInt(trimmed, 10, 64) + return int(i) +} + +func (c *Context) updateDocumentLineIndentColumn(column int) { + if c.docFirstLineIndentColumn == 0 { + c.docFirstLineIndentColumn = column + } + if c.docLineIndentColumn == 0 { + c.docLineIndentColumn = column + } +} + +func (c *Context) validateDocumentLineIndentColumn() error { + if c.docFirstLineIndentColumnByDocOpt() == 0 { + return nil + } + if c.docFirstLineIndentColumn > c.docLineIndentColumn { + return fmt.Errorf("invalid number of indent is specified in the document header") + } + return nil +} + +func (c *Context) updateDocumentNewLineState() { + c.docPrevLineIndentColumn = c.docLineIndentColumn + c.docFoldedNewLine = true + c.docLineIndentColumn = 0 +} + +func (c *Context) addDocumentIndent(column int) { + if c.docFirstLineIndentColumn == 0 { + return + } + + // If the first line of the document has already been evaluated, the number is treated as the threshold, since the `docFirstLineIndentColumn` is a positive number. + if c.docFirstLineIndentColumn <= column { + // In the folded state, new-line-char is normally treated as space, + // but if the number of indents is different from the number of indents in the first line, + // new-line-char is used as is instead of space. + // Therefore, it is necessary to replace the space already added to buf. + // `c.docFoldedNewLine` is a variable that is set to true for every newline. + if c.isFolded && c.docFoldedNewLine { + c.buf[len(c.buf)-1] = '\n' + c.docFoldedNewLine = false + } + // Since addBuf ignore space character, add to the buffer directly. 
+		c.buf = append(c.buf, ' ')
+	}
+}
+
+func (c *Context) addDocumentNewLineInFolded(column int) {
+	if !c.isFolded {
+		return
+	}
+	if !c.docFoldedNewLine {
+		return
+	}
+	if c.docFirstLineIndentColumn == c.docLineIndentColumn &&
+		c.docLineIndentColumn == c.docPrevLineIndentColumn {
+		// use space as a new line delimiter.
+		return
+	}
+	c.buf[len(c.buf)-1] = '\n'
+	c.docFoldedNewLine = false
 }
 
 func (c *Context) addToken(tk *token.Token) {
@@ -179,7 +270,7 @@ func (c *Context) existsBuffer() bool {
 
 func (c *Context) bufferedSrc() []rune {
 	src := c.buf[:c.notSpaceCharPos]
-	if c.isDocument() && c.literalOpt == "-" {
+	if c.isDocument() && strings.HasPrefix(c.docOpt, "-") {
 		// remove end '\n' character and trailing empty lines
 		// https://yaml.org/spec/1.2.2/#8112-block-chomping-indicator
 		for {
 			if len(src) > 0 && src[len(src)-1] == '\n' {
 				src = src[:len(src)-1]
 				continue
 			}
 			break
 		}
+		for {
+			if len(src) > 0 && src[len(src)-1] == ' ' {
+				src = src[:len(src)-1]
+				continue
+			}
+			break
+		}
 	}
 	return src
 }
diff --git a/scanner/scanner.go b/scanner/scanner.go
index 1b21a462..c6b5d4cc 100644
--- a/scanner/scanner.go
+++ b/scanner/scanner.go
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"strconv"
 	"strings"
 
 	"github.com/goccy/go-yaml/token"
@@ -43,7 +44,6 @@ type Scanner struct {
 	prevLineIndentNum int
 	// indentLevel indicates the level of indent depth. This value does not match the column value.
 	indentLevel int
-	docStartColumn int
 	isFirstCharAtLine bool
 	isAnchor bool
 	startedFlowSequenceNum int
@@ -204,9 +204,8 @@ func (s *Scanner) addBufferedTokenIfExists(ctx *Context) {
 	ctx.addToken(s.bufferedToken(ctx))
 }
 
-func (s *Scanner) breakLiteral(ctx *Context) {
-	s.docStartColumn = 0
-	ctx.breakLiteral()
+func (s *Scanner) breakDocument(ctx *Context) {
+	ctx.breakDocument()
 }
 
 func (s *Scanner) scanSingleQuote(ctx *Context) (*token.Token, error) {
@@ -535,26 +534,32 @@ func (s *Scanner) scanComment(ctx *Context) bool {
 	return true
 }
 
-func (s *Scanner) trimCommentFromLiteralOpt(text string, header rune) (string, error) {
+func (s *Scanner) trimCommentFromDocumentOpt(text string, header rune) (string, error) {
 	idx := strings.Index(text, "#")
 	if idx < 0 {
 		return text, nil
 	}
 	if idx == 0 {
 		return "", ErrInvalidToken(
-			fmt.Sprintf("invalid literal header %s", text),
+			fmt.Sprintf("invalid document header %s", text),
 			token.Invalid(string(header)+text, s.pos()),
 		)
 	}
 	return text[:idx-1], nil
}
 
-func (s *Scanner) scanLiteral(ctx *Context, c rune) {
+func (s *Scanner) scanDocument(ctx *Context, c rune) error {
 	ctx.addOriginBuf(c)
 	if ctx.isEOS() {
+		ctx.updateDocumentLineIndentColumn(s.column)
+		if err := ctx.validateDocumentLineIndentColumn(); err != nil {
+			invalidTk := token.Invalid(string(ctx.obuf), s.pos())
+			s.progressColumn(ctx, 1)
+			return ErrInvalidToken(err.Error(), invalidTk)
+		}
 		if ctx.isLiteral {
 			ctx.addBuf(c)
-		} else if ctx.isFolded && !s.isNewLineChar(c) {
+		} else if ctx.isFolded {
 			ctx.addBuf(c)
 		}
 		value := ctx.bufferedSrc()
@@ -567,19 +572,23 @@
 		} else {
 			ctx.addBuf(' ')
 		}
+		ctx.updateDocumentNewLineState()
 		s.progressLine(ctx)
 	} else if s.isFirstCharAtLine && c == ' ' {
-		if 0 < s.docStartColumn && s.docStartColumn <= s.column {
-			ctx.addBuf(c)
-		}
+		ctx.addDocumentIndent(s.column)
 		s.progressColumn(ctx, 1)
 	} else {
-		if s.docStartColumn == 0 {
-			s.docStartColumn = s.column
+		ctx.updateDocumentLineIndentColumn(s.column)
+		if err := ctx.validateDocumentLineIndentColumn(); err != nil {
+			invalidTk := token.Invalid(string(ctx.obuf), s.pos())
+			s.progressColumn(ctx, 1)
+ return ErrInvalidToken(err.Error(), invalidTk) } + ctx.addDocumentNewLineInFolded(s.column) ctx.addBuf(c) s.progressColumn(ctx, 1) } + return nil } func (s *Scanner) scanNewLine(ctx *Context, c rune) { @@ -807,19 +816,36 @@ func (s *Scanner) scanSequence(ctx *Context) bool { return true } -func (s *Scanner) scanLiteralHeader(ctx *Context) (bool, error) { +func (s *Scanner) scanDocumentHeader(ctx *Context) (bool, error) { if ctx.existsBuffer() { return false, nil } - if err := s.scanLiteralHeaderOption(ctx); err != nil { + if err := s.scanDocumentHeaderOption(ctx); err != nil { return false, err } + ctx.updateDocumentIndentColumn() s.progressLine(ctx) return true, nil } -func (s *Scanner) scanLiteralHeaderOption(ctx *Context) error { +func (s *Scanner) validateDocumentHeaderOption(opt string) error { + if len(opt) == 0 { + return nil + } + if opt[0] == '+' || opt[0] == '-' { + opt = opt[1:] + } + if len(opt) == 0 { + return nil + } + if _, err := strconv.ParseInt(opt, 10, 64); err != nil { + return fmt.Errorf("invalid header option: %q", opt) + } + return nil +} + +func (s *Scanner) scanDocumentHeaderOption(ctx *Context) error { header := ctx.currentChar() ctx.addOriginBuf(header) s.progress(ctx, 1) // skip '|' or '>' character @@ -831,64 +857,61 @@ func (s *Scanner) scanLiteralHeaderOption(ctx *Context) error { value := ctx.source(ctx.idx, ctx.idx+idx) opt := strings.TrimRight(value, " ") orgOptLen := len(opt) - opt, err := s.trimCommentFromLiteralOpt(opt, header) + opt, err := s.trimCommentFromDocumentOpt(opt, header) if err != nil { return err } - switch opt { - case "", "+", "-", - "0", "1", "2", "3", "4", "5", "6", "7", "8", "9": - hasComment := len(opt) < orgOptLen - if s.column == 1 { - s.lastDelimColumn = 1 - } - if header == '|' { - if hasComment { - commentLen := orgOptLen - len(opt) - headerPos := strings.Index(string(ctx.obuf), "|") - litBuf := ctx.obuf[:len(ctx.obuf)-commentLen-headerPos] - commentBuf := ctx.obuf[len(litBuf):] - ctx.addToken(token.Literal("|"+opt, string(litBuf), s.pos())) - s.column += len(litBuf) - s.offset += len(litBuf) - commentHeader := strings.Index(value, "#") - ctx.addToken(token.Comment(string(value[commentHeader+1:]), string(commentBuf), s.pos())) - } else { - ctx.addToken(token.Literal("|"+opt, string(ctx.obuf), s.pos())) - } - ctx.isLiteral = true - } else if header == '>' { - if hasComment { - commentLen := orgOptLen - len(opt) - headerPos := strings.Index(string(ctx.obuf), ">") - foldedBuf := ctx.obuf[:len(ctx.obuf)-commentLen-headerPos] - commentBuf := ctx.obuf[len(foldedBuf):] - ctx.addToken(token.Folded(">"+opt, string(foldedBuf), s.pos())) - s.column += len(foldedBuf) - s.offset += len(foldedBuf) - commentHeader := strings.Index(value, "#") - ctx.addToken(token.Comment(string(value[commentHeader+1:]), string(commentBuf), s.pos())) - } else { - ctx.addToken(token.Folded(">"+opt, string(ctx.obuf), s.pos())) - } - ctx.isFolded = true - } - s.indentState = IndentStateKeep - ctx.resetBuffer() - ctx.literalOpt = opt - s.progressColumn(ctx, progress) - return nil - default: + if err := s.validateDocumentHeaderOption(opt); err != nil { invalidTk := token.Invalid(string(ctx.obuf), s.pos()) s.progressColumn(ctx, progress) - return ErrInvalidToken(fmt.Sprintf("invalid literal header: %q", opt), invalidTk) + return ErrInvalidToken(err.Error(), invalidTk) + } + hasComment := len(opt) < orgOptLen + if s.column == 1 { + s.lastDelimColumn = 1 } + if header == '|' { + if hasComment { + commentLen := orgOptLen - len(opt) + headerPos := 
strings.Index(string(ctx.obuf), "|") + litBuf := ctx.obuf[:len(ctx.obuf)-commentLen-headerPos] + commentBuf := ctx.obuf[len(litBuf):] + ctx.addToken(token.Literal("|"+opt, string(litBuf), s.pos())) + s.column += len(litBuf) + s.offset += len(litBuf) + commentHeader := strings.Index(value, "#") + ctx.addToken(token.Comment(string(value[commentHeader+1:]), string(commentBuf), s.pos())) + } else { + ctx.addToken(token.Literal("|"+opt, string(ctx.obuf), s.pos())) + } + ctx.isLiteral = true + } else if header == '>' { + if hasComment { + commentLen := orgOptLen - len(opt) + headerPos := strings.Index(string(ctx.obuf), ">") + foldedBuf := ctx.obuf[:len(ctx.obuf)-commentLen-headerPos] + commentBuf := ctx.obuf[len(foldedBuf):] + ctx.addToken(token.Folded(">"+opt, string(foldedBuf), s.pos())) + s.column += len(foldedBuf) + s.offset += len(foldedBuf) + commentHeader := strings.Index(value, "#") + ctx.addToken(token.Comment(string(value[commentHeader+1:]), string(commentBuf), s.pos())) + } else { + ctx.addToken(token.Folded(">"+opt, string(ctx.obuf), s.pos())) + } + ctx.isFolded = true + } + s.indentState = IndentStateKeep + ctx.resetBuffer() + ctx.docOpt = opt + s.progressColumn(ctx, progress) + return nil } } text := string(ctx.src[ctx.idx:]) invalidTk := token.Invalid(string(ctx.obuf), s.pos()) s.progressColumn(ctx, len(text)) - return ErrInvalidToken(fmt.Sprintf("invalid literal header: %q", text), invalidTk) + return ErrInvalidToken(fmt.Sprintf("invalid document header: %q", text), invalidTk) } func (s *Scanner) scanMapKey(ctx *Context) bool { @@ -977,9 +1000,11 @@ func (s *Scanner) scan(ctx *Context) error { ctx.addToken(token.String("", "", s.pos())) } } - s.breakLiteral(ctx) + s.breakDocument(ctx) } else { - s.scanLiteral(ctx, c) + if err := s.scanDocument(ctx, c); err != nil { + return err + } continue } } @@ -1027,7 +1052,7 @@ func (s *Scanner) scan(ctx *Context) error { continue } case '|', '>': - scanned, err := s.scanLiteralHeader(ctx) + scanned, err := s.scanDocumentHeader(ctx) if err != nil { return err }
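
Note: the folded-scalar behavior exercised by the new lexer and parser tests can be observed through the package's public entry points. The sketch below is for illustration only and is not part of the patch; it assumes the exported lexer.Tokenize and parser.ParseBytes APIs, and the expected values in the comments mirror the test expectations added above.

package main

import (
	"fmt"

	"github.com/goccy/go-yaml/lexer"
	"github.com/goccy/go-yaml/parser"
)

func main() {
	// Folded scalar: lines at the scalar's base indentation are folded together
	// with spaces ("4s 5s"), while more-indented lines keep their line breaks.
	// The final string token carries the value "1s\n 2s\n  3s\n4s 5s\n".
	for _, tk := range lexer.Tokenize("s: >\n  1s\n   2s\n    3s\n  4s\n  5s\n") {
		fmt.Printf("%q\n", tk.Value)
	}

	// Explicit indentation indicator: ">1" fixes the content indentation at one
	// column, so the second leading space becomes part of the value (" 1s\n").
	for _, tk := range lexer.Tokenize("s: >1\n  1s\n") {
		fmt.Printf("%q\n", tk.Value)
	}

	// An indicator larger than the actual content indentation is rejected,
	// matching the new "invalid document header option number" test case;
	// the reported error is "[2:3] found invalid token".
	if _, err := parser.ParseBytes([]byte("a: >3\n 1"), 0); err != nil {
		fmt.Println(err)
	}
}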