forked from dimchansky/utfbom
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bom_reader.go
204 lines (168 loc) · 4.75 KB
/
bom_reader.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
// Package utfbom implements the detection of the BOM (Unicode Byte Order Mark) and removing as necessary.
// It wraps an io.Reader object, creating another object (Reader) that also implements the io.Reader
// interface but provides automatic BOM checking and removing as necessary.
package bomreader
import (
"errors"
"io"
)
// Encoding is type alias for detected UTF encoding.
type Encoding int
// Constants to identify detected UTF encodings.
const (
// Unknown encoding, returned when no BOM was detected
Unknown Encoding = iota
// UTF8, BOM bytes: EF BB BF
UTF8
// UTF-16, big-endian, BOM bytes: FE FF
UTF16BigEndian
// UTF-16, little-endian, BOM bytes: FF FE
UTF16LittleEndian
// UTF-32, big-endian, BOM bytes: 00 00 FE FF
UTF32BigEndian
// UTF-32, little-endian, BOM bytes: FF FE 00 00
UTF32LittleEndian
// GB18030, BOM bytes: 84 31 95 33
GB18030
)
// String returns a user-friendly string representation of the encoding. Satisfies fmt.Stringer interface.
func (e Encoding) String() string {
switch e {
case UTF8:
return "UTF8"
case UTF16BigEndian:
return "UTF16BigEndian"
case UTF16LittleEndian:
return "UTF16LittleEndian"
case UTF32BigEndian:
return "UTF32BigEndian"
case UTF32LittleEndian:
return "UTF32LittleEndian"
case GB18030:
return "GB18030"
default:
return "Unknown"
}
}
const maxConsecutiveEmptyReads = 100
// Skip creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
// It also returns the encoding detected by the BOM.
// If the detected encoding is not needed, you can call the SkipOnly function.
func Skip(rd io.Reader) (*Reader, Encoding) {
// Is it already a Reader?
b, ok := rd.(*Reader)
if ok {
return b, Unknown
}
enc, left, err := detectUtf(rd)
return &Reader{
rd: rd,
buf: left,
err: err,
}, enc
}
// SkipOnly creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
func SkipOnly(rd io.Reader) *Reader {
r, _ := Skip(rd)
return r
}
// Reader implements automatic BOM (Unicode Byte Order Mark) checking and
// removing as necessary for an io.Reader object.
type Reader struct {
rd io.Reader // reader provided by the client
buf []byte // buffered data
err error // last error
}
// Read is an implementation of io.Reader interface.
// The bytes are taken from the underlying Reader, but it checks for BOMs, removing them as necessary.
func (r *Reader) Read(p []byte) (n int, err error) {
if len(p) == 0 {
return 0, nil
}
if r.buf == nil {
if r.err != nil {
return 0, r.readErr()
}
return r.rd.Read(p)
}
// copy as much as we can
n = copy(p, r.buf)
r.buf = nilIfEmpty(r.buf[n:])
return n, nil
}
func (r *Reader) readErr() error {
err := r.err
r.err = nil
return err
}
var errNegativeRead = errors.New("bom-reader: reader returned negative count from Read")
func detectUtf(rd io.Reader) (enc Encoding, buf []byte, err error) {
buf, err = readBOM(rd)
if len(buf) >= 4 {
if isUTF32BigEndianBOM4(buf) {
return UTF32BigEndian, nilIfEmpty(buf[4:]), err
}
if isUTF32LittleEndianBOM4(buf) {
return UTF32LittleEndian, nilIfEmpty(buf[4:]), err
}
if isGB18030BOM4(buf) {
return GB18030, nilIfEmpty(buf[4:]), err
}
}
if len(buf) > 2 && isUTF8BOM3(buf) {
return UTF8, nilIfEmpty(buf[3:]), err
}
if (err != nil && err != io.EOF) || (len(buf) < 2) {
return Unknown, nilIfEmpty(buf), err
}
if isUTF16BigEndianBOM2(buf) {
return UTF16BigEndian, nilIfEmpty(buf[2:]), err
}
if isUTF16LittleEndianBOM2(buf) {
return UTF16LittleEndian, nilIfEmpty(buf[2:]), err
}
return Unknown, nilIfEmpty(buf), err
}
func readBOM(rd io.Reader) (buf []byte, err error) {
const maxBOMSize = 4
var bom [maxBOMSize]byte // used to read BOM
// read as many bytes as possible
for nEmpty, n := 0, 0; err == nil && len(buf) < maxBOMSize; buf = bom[:len(buf)+n] {
if n, err = rd.Read(bom[len(buf):]); n < 0 {
panic(errNegativeRead)
}
if n > 0 {
nEmpty = 0
} else {
nEmpty++
if nEmpty >= maxConsecutiveEmptyReads {
err = io.ErrNoProgress
}
}
}
return
}
func isUTF32BigEndianBOM4(buf []byte) bool {
return buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF
}
func isUTF32LittleEndianBOM4(buf []byte) bool {
return buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00
}
func isGB18030BOM4(buf []byte) bool {
return buf[0] == 0x84 && buf[1] == 0x31 && buf[2] == 0x95 && buf[3] == 0x33
}
func isUTF8BOM3(buf []byte) bool {
return buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF
}
func isUTF16BigEndianBOM2(buf []byte) bool {
return buf[0] == 0xFE && buf[1] == 0xFF
}
func isUTF16LittleEndianBOM2(buf []byte) bool {
return buf[0] == 0xFF && buf[1] == 0xFE
}
func nilIfEmpty(buf []byte) (res []byte) {
if len(buf) > 0 {
res = buf
}
return
}