Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

string_decoder: optimize write() #1209

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions benchmark/misc/string-decoder.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
var common = require('../common.js');
var StringDecoder = require('string_decoder').StringDecoder;

// Register main() with the shared benchmark harness from ../common.js.
// Each key below is read back by main() from its `conf` argument:
//   encoding - 'ascii' / 'utf8' decode chunks in that encoding directly;
//              'base64-ascii' / 'base64-utf8' pick the alphabet the same way
//              but feed the decoder the base64 form of the generated text.
//   inlen    - number of characters of sample text to generate.
//   chunk    - characters per chunk (base64 characters in the base64 modes).
//   n        - how many times the whole chunk list is written to the decoder.
// NOTE(review): how createBenchmark schedules runs over these value lists is
// defined in ../common.js, which is not visible here - confirm there.
var bench = common.createBenchmark(main, {
encoding: ['ascii', 'utf8', 'base64-utf8', 'base64-ascii'],
inlen: [32, 128, 1024],
chunk: [16, 64, 256, 1024],
n: [25e4]
});

// Sample alphabets that main() cycles through to build the benchmark input.
// UTF_ALPHA is Norwegian for "blueberry jam"; its three non-ASCII letters
// (å, æ, ø) each encode to two bytes in UTF-8, so chunk boundaries can fall in
// the middle of a multi-byte character. The literal had been corrupted to
// U+FFFD replacement characters, which encode to three bytes each.
var UTF_ALPHA = 'Blåbærsyltetøy';
// Pure-ASCII counterpart: every character is a single byte.
var ASC_ALPHA = 'Blueberry jam';

/**
 * Benchmark body: builds a list of Buffer chunks from a repeating alphabet and
 * times n passes of StringDecoder#write() over that list.
 *
 * @param {Object} conf - Benchmark parameters.
 * @param {string} conf.encoding - 'ascii', 'utf8', 'base64-ascii' or
 *     'base64-utf8'; anything else throws.
 * @param {number|string} conf.inlen - Characters of sample text to generate.
 * @param {number|string} conf.chunk - Characters per chunk (base64 characters
 *     in the base64 modes).
 * @param {number|string} conf.n - Number of passes over the chunk list.
 * @throws {Error} 'Bad encoding' when conf.encoding is not one of the four
 *     supported values.
 */
function main(conf) {
  var encoding = conf.encoding;
  // `| 0` coerces each numeric parameter to a 32-bit integer.
  var inLen = conf.inlen | 0;
  var chunkLen = conf.chunk | 0;
  var n = conf.n | 0;

  var isBase64;
  var alpha;
  switch (encoding) {
    case 'ascii':
      isBase64 = false;
      alpha = ASC_ALPHA;
      break;
    case 'base64-ascii':
      isBase64 = true;
      alpha = ASC_ALPHA;
      break;
    case 'utf8':
      isBase64 = false;
      alpha = UTF_ALPHA;
      break;
    case 'base64-utf8':
      isBase64 = true;
      alpha = UTF_ALPHA;
      break;
    default:
      throw new Error('Bad encoding');
  }

  var sd = new StringDecoder(isBase64 ? 'base64' : encoding);

  var chunks = [];
  var pending = '';
  var pos;

  if (isBase64) {
    // Generate the whole text first, base64-encode it, then cut the base64
    // string into pieces of at most chunkLen characters.
    for (pos = 0; pos < inLen; ++pos) {
      pending += alpha[pos % alpha.length];
    }
    pending = new Buffer(pending, 'utf8').toString('base64');
    while (pending.length > 0) {
      var take = Math.min(chunkLen, pending.length);
      chunks.push(new Buffer(pending.substring(0, take), 'utf8'));
      pending = pending.substring(take);
    }
  } else {
    // Cut the text every chunkLen characters and encode each piece on its own.
    for (pos = 0; pos < inLen; ++pos) {
      if (pos > 0 && (pos % chunkLen) === 0) {
        chunks.push(new Buffer(pending, encoding));
        pending = '';
      }
      pending += alpha[pos % alpha.length];
    }
    // Flush whatever is left after the last boundary.
    if (pending.length > 0) {
      chunks.push(new Buffer(pending, encoding));
    }
  }

  var nChunks = chunks.length;

  // Only the decoder writes are timed; chunk construction happens above.
  bench.start();
  for (var pass = 0; pass < n; ++pass) {
    for (var c = 0; c < nChunks; ++c) {
      sd.write(chunks[c]);
    }
  }
  bench.end(n);
}
105 changes: 68 additions & 37 deletions lib/string_decoder.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
'use strict';

const isEncoding = Buffer.isEncoding;

function assertEncoding(encoding) {
if (encoding && !Buffer.isEncoding(encoding)) {
if (encoding && !isEncoding(encoding)) {
throw new Error('Unknown encoding: ' + encoding);
}
}
@@ -59,65 +61,83 @@ const StringDecoder = exports.StringDecoder = function(encoding) {
// replacement character. See https://codereview.chromium.org/121173009/ .
StringDecoder.prototype.write = function(buffer) {
var charStr = '';
var buflen = buffer.length;
var charBuffer = this.charBuffer;
var charLength = this.charLength;
var charReceived = this.charReceived;
var surrogateSize = this.surrogateSize;
var encoding = this.encoding;
// if our last write ended with an incomplete multibyte character
while (this.charLength) {
while (charLength) {
// determine how many remaining bytes this buffer has to offer for this char
var available = (buffer.length >= this.charLength - this.charReceived) ?
this.charLength - this.charReceived :
buffer.length;
var diff = charLength - charReceived;
var available = (buflen >= diff) ? diff : buflen;

// add the new bytes to the char buffer
buffer.copy(this.charBuffer, this.charReceived, 0, available);
this.charReceived += available;
buffer.copy(charBuffer, charReceived, 0, available);
charReceived += available;

if (this.charReceived < this.charLength) {
if (charReceived < charLength) {
// still not enough chars in this buffer? wait for more ...

this.charLength = charLength;
this.charReceived = charReceived;

return '';
}

// remove bytes belonging to the current character from the buffer
buffer = buffer.slice(available, buffer.length);
buffer = buffer.slice(available, buflen);
buflen = buffer.length;

// get the character that was split
charStr = this.charBuffer.slice(0, this.charLength).toString(this.encoding);
charStr = charBuffer.toString(encoding, 0, charLength);

// CESU-8: lead surrogate (D800-DBFF) is also the incomplete character
var charCode = charStr.charCodeAt(charStr.length - 1);
if (charCode >= 0xD800 && charCode <= 0xDBFF) {
this.charLength += this.surrogateSize;
charLength += surrogateSize;
charStr = '';
continue;
}
this.charReceived = this.charLength = 0;
charReceived = charLength = 0;

// if there are no more bytes in this buffer, just emit our char
if (buffer.length === 0) {
if (buflen === 0) {
this.charLength = charLength;
this.charReceived = charReceived;

return charStr;
}
break;
}

// determine and set charLength / charReceived
this.detectIncompleteChar(buffer);
if (this.detectIncompleteChar(buffer))
charLength = this.charLength;
charReceived = this.charReceived;

var end = buffer.length;
if (this.charLength) {
var end = buflen;
if (charLength) {
// buffer the incomplete character bytes we got
buffer.copy(this.charBuffer, 0, buffer.length - this.charReceived, end);
end -= this.charReceived;
buffer.copy(charBuffer, 0, buflen - charReceived, end);
end -= charReceived;
}

charStr += buffer.toString(this.encoding, 0, end);
this.charLength = charLength;
charStr += buffer.toString(encoding, 0, end);

var end = charStr.length - 1;
var charCode = charStr.charCodeAt(end);
// CESU-8: lead surrogate (D800-DBFF) is also the incomplete character
if (charCode >= 0xD800 && charCode <= 0xDBFF) {
var size = this.surrogateSize;
this.charLength += size;
this.charReceived += size;
this.charBuffer.copy(this.charBuffer, size, 0, size);
buffer.copy(this.charBuffer, 0, 0, size);
charLength += surrogateSize;
charReceived += surrogateSize;
charBuffer.copy(charBuffer, surrogateSize, 0, surrogateSize);
buffer.copy(charBuffer, 0, 0, surrogateSize);

this.charLength = charLength;
this.charReceived = charReceived;

return charStr.substring(0, end);
}

@@ -130,47 +150,56 @@ StringDecoder.prototype.write = function(buffer) {
// length that character, and sets this.charReceived to the number of bytes
// that are available for this character.
StringDecoder.prototype.detectIncompleteChar = function(buffer) {
var buflen = buffer.length;
// determine how many bytes we have to check at the end of this buffer
var i = (buffer.length >= 3) ? 3 : buffer.length;
var i = (buflen >= 3) ? 3 : buflen;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if V8 will unroll the subsequent loop (or if there's value in doing so ourselves.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean for the i = 3 case? I didn't try that since I figured people would frown on code duplication between the i = 3 and i = buflen cases. I also did not see what kind of performance difference it would make, though.

var newlen = false;

// Figure out if one of the last i bytes of our buffer announces an
// incomplete char.
for (; i > 0; i--) {
var c = buffer[buffer.length - i];
var c = buffer[buflen - i];

// See http://en.wikipedia.org/wiki/UTF-8#Description

// 110XXXXX
if (i == 1 && c >> 5 == 0x06) {
if (i === 1 && c >> 5 === 0x06) {
this.charLength = 2;
newlen = true;
break;
}

// 1110XXXX
if (i <= 2 && c >> 4 == 0x0E) {
if (i <= 2 && c >> 4 === 0x0E) {
this.charLength = 3;
newlen = true;
break;
}

// 11110XXX
if (i <= 3 && c >> 3 == 0x1E) {
if (i <= 3 && c >> 3 === 0x1E) {
this.charLength = 4;
newlen = true;
break;
}
}

this.charReceived = i;

return newlen;
};

StringDecoder.prototype.end = function(buffer) {
var res = '';
if (buffer && buffer.length)
res = this.write(buffer);

if (this.charReceived) {
var cr = this.charReceived;
var charReceived = this.charReceived;
if (charReceived) {
var cr = charReceived;
var buf = this.charBuffer;
var enc = this.encoding;
res += buf.slice(0, cr).toString(enc);
res += buf.toString(enc, 0, cr);
}

return res;
@@ -181,11 +210,13 @@ function passThroughWrite(buffer) {
}

function utf16DetectIncompleteChar(buffer) {
this.charReceived = buffer.length % 2;
this.charLength = this.charReceived ? 2 : 0;
var charReceived = this.charReceived = buffer.length % 2;
this.charLength = charReceived ? 2 : 0;
return true;
}

function base64DetectIncompleteChar(buffer) {
this.charReceived = buffer.length % 3;
this.charLength = this.charReceived ? 3 : 0;
var charReceived = this.charReceived = buffer.length % 3;
this.charLength = charReceived ? 3 : 0;
return true;
}