Skip to content

Commit

Permalink
Fix isvalid for 3-byte overlong encoded UTF-8 sequences (#29908)
Browse files Browse the repository at this point in the history
  • Loading branch information
simonbyrne authored and KristofferC committed Nov 3, 2018
1 parent 447095d commit e9d32a6
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 2 deletions.
2 changes: 2 additions & 0 deletions src/support/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,8 @@ int u8_isvalid(const char *str, size_t len)
return 0;
// Check for surrogate chars
if (byt == 0xed && *pnt > 0x9f) return 0;
// Check for overlong encoding
if (byt == 0xe0 && *pnt < 0xa0) return 0;
pnt += 2;
} else { // 4-byte sequence
// Must have 3 valid continuation characters
Expand Down
12 changes: 10 additions & 2 deletions test/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -467,9 +467,17 @@ end
end
end
end
# Check for short three-byte sequences
# The lead byte 0xe0 is tested separately from the other three-byte lead bytes
# because its second byte is restricted to 0xa0:0xbf; a second byte in 0x80:0x9f
# would be an overlong encoding and must be rejected.
# A lone 0xe0 lead byte is a truncated sequence and is never valid.
@test isvalid(String, UInt8[0xe0]) == false
# Each pair is (range of second-byte values, expected validity of the complete
# three-byte sequence 0xe0, cont, 0x80):
#   0x00:0x9f -> invalid (not a continuation byte, or overlong)
#   0xa0:0xbf -> valid
#   0xc0:0xff -> invalid (not a continuation byte)
for (rng, flg) in ((0x00:0x9f, false), (0xa0:0xbf, true), (0xc0:0xff, false))
for cont in rng
# Two bytes only: still truncated, so invalid whatever the second byte is.
@test isvalid(String, UInt8[0xe0, cont]) == false
# Complete sequence with a well-formed third byte: validity depends solely on
# the range the second byte falls in.
@test isvalid(String, UInt8[0xe0, cont, 0x80]) == flg
end
end
# Check three-byte sequences
for r1 in (0xe0:0xec, 0xee:0xef)
for byt = r1
for r1 in (0xe1:0xec, 0xee:0xef)
for byt in r1
# Check for short sequence
@test isvalid(String, UInt8[byt]) == false
for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
Expand Down

1 comment on commit e9d32a6

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Executing the daily benchmark build, I will reply here when finished:

@nanosoldier runbenchmarks(ALL, isdaily = true)

Please sign in to comment.