Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up C encoder up to 100x #256

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
16 changes: 11 additions & 5 deletions C/Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
PROGRAM=blurhash_encoder
DECODER=blurhash_decoder
$(PROGRAM): encode_stb.c encode.c encode.h stb_image.h common.h
$(CC) -o $@ encode_stb.c encode.c -lm -Ofast

$(DECODER): decode_stb.c decode.c decode.h stb_writer.h common.h
$(CC) -o $(DECODER) decode_stb.c decode.c -lm -Ofast
encod%.o: encod%.c encode.h stb_image.h common.h
$(CC) -c $< -o $@ -Ofast -Wall
$(PROGRAM): encode_stb.o encode.o
$(CC) -o $@ encode_stb.o encode.o -lm

decod%.o: decod%.c decode.h stb_writer.h common.h
$(CC) -c $< -o $@ -Ofast -Wall
$(DECODER): decode_stb.o decode.o
$(CC) -o $@ decode_stb.o decode.o -lm

.PHONY: clean
clean:
rm -f $(PROGRAM)
rm -f $(DECODER)
rm -f $(DECODER)
rm -f *.o
7 changes: 2 additions & 5 deletions C/common.h
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
#ifndef __BLURHASH_COMMON_H__
#define __BLURHASH_COMMON_H__

#include<math.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#define _USE_MATH_DEFINES
#include <math.h>

static inline int linearTosRGB(float value) {
float v = fmaxf(0, fminf(1, value));
Expand Down
72 changes: 52 additions & 20 deletions C/decode.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,26 @@

static char chars[83] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz#$%*+,-.:;=?@[]^_{|}~";

static inline uint8_t clampToUByte(int * src) {
if( *src >= 0 && *src <= 255 )
return *src;
return (*src < 0) ? 0 : 255;
/* Highest valid index of the linear -> sRGB lookup table. */
#define CACHE_ACCURACY 8191

/*
 * Quantise a linear colour value to a lookup-table index.
 *
 * src is scaled by CACHE_ACCURACY and rounded to the nearest integer; the
 * result is always within [0, CACHE_ACCURACY].
 *
 * The range check is done on the float *before* the conversion to int:
 * converting a NaN or a value whose integral part does not fit in an int
 * is undefined behaviour, so clamping afterwards is too late.
 *   - src <= 0 or NaN  -> 0
 *   - src >= 1         -> CACHE_ACCURACY
 */
static inline int convertToCacheIdx(float src) {
	/* Written as a negated comparison so that NaN also takes this branch. */
	if (!(src > 0.0f)) {
		return 0;
	}
	if (src >= 1.0f) {
		return CACHE_ACCURACY;
	}
	/* 0 < src < 1 here, so the rounded value fits in [0, CACHE_ACCURACY]. */
	return (int)(src * CACHE_ACCURACY + 0.5);
}

/*
 * Lookup table with CACHE_ACCURACY + 1 entries; entry x holds
 * linearTosRGB(x / CACHE_ACCURACY). NULL until init_linearTosRGB_cache()
 * has been called.
 */
uint8_t *linearTosRGB_cache = NULL;

/*
 * Build the linear -> sRGB lookup table on first use.
 *
 * Calls after the table has been published return immediately. The table
 * lives in static storage rather than on the heap, so initialisation cannot
 * fail with an out-of-memory NULL dereference and nothing is leaked if two
 * callers initialise at the same time.
 */
static void init_linearTosRGB_cache(void) {
	static uint8_t table[CACHE_ACCURACY + 1];

	if (linearTosRGB_cache != NULL) {
		return;
	}
	for (int x = 0; x <= CACHE_ACCURACY; x++) {
		table[x] = linearTosRGB((float)x / CACHE_ACCURACY);
	}
	/*
	 * Publish the pointer only after the table is fully populated.
	 * NOTE(review): this is a plain store, not an atomic one; callers that
	 * need strict thread-safety should initialise before spawning threads.
	 */
	linearTosRGB_cache = table;
}

static inline uint8_t * createByteArray(int size) {
Expand Down Expand Up @@ -98,37 +114,53 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n

int bytesPerRow = width * nChannels;
int x = 0, y = 0, i = 0, j = 0;
int intR = 0, intG = 0, intB = 0;

float *cosX = malloc(width * colors_size * sizeof(float));
float *cosY = malloc(height * colors_size * sizeof(float));
for(x = 0; x < width; x ++) {
for(i = 0; i < numX; i ++) {
float weight = cosf(M_PI * x * i / width);
for(j = 0; j < numY; j ++) {
cosX[x * colors_size + j * numX + i] = weight;
}
}
}
for(y = 0; y < height; y ++) {
for(j = 0; j < numY; j ++) {
float weight = cosf((M_PI * y * j) / height);
for(i = 0; i < numX; i ++) {
cosY[y * colors_size + j * numX + i] = weight;
}
}
}

init_linearTosRGB_cache();

for(y = 0; y < height; y ++) {
for(x = 0; x < width; x ++) {

float r = 0, g = 0, b = 0;

for(j = 0; j < numY; j ++) {
for(i = 0; i < numX; i ++) {
float basics = cos((M_PI * x * i) / width) * cos((M_PI * y * j) / height);
int idx = i + j * numX;
r += colors[idx][0] * basics;
g += colors[idx][1] * basics;
b += colors[idx][2] * basics;
}
for (int idx = 0; idx < colors_size; idx ++) {
float basics = cosX[x * colors_size + idx] * cosY[y * colors_size + idx];
r += colors[idx][0] * basics;
g += colors[idx][1] * basics;
b += colors[idx][2] * basics;
}

intR = linearTosRGB(r);
intG = linearTosRGB(g);
intB = linearTosRGB(b);

pixelArray[nChannels * x + 0 + y * bytesPerRow] = clampToUByte(&intR);
pixelArray[nChannels * x + 1 + y * bytesPerRow] = clampToUByte(&intG);
pixelArray[nChannels * x + 2 + y * bytesPerRow] = clampToUByte(&intB);
pixelArray[nChannels * x + 0 + y * bytesPerRow] = linearTosRGB_cache[convertToCacheIdx(r)];
pixelArray[nChannels * x + 1 + y * bytesPerRow] = linearTosRGB_cache[convertToCacheIdx(g)];
pixelArray[nChannels * x + 2 + y * bytesPerRow] = linearTosRGB_cache[convertToCacheIdx(b)];

if (nChannels == 4)
pixelArray[nChannels * x + 3 + y * bytesPerRow] = 255; // If nChannels=4, treat each pixel as RGBA instead of RGB

}
}

free(cosX);
free(cosY);

return 0;
}

Expand Down
119 changes: 91 additions & 28 deletions C/encode.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,69 @@

#include <string.h>

static float *multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow);
static void multiplyBasisFunction(
float factors[][4], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
float *cosX, float *cosY);
static char *encode_int(int value, int length, char *destination);

static int encodeDC(float r, float g, float b);
static int encodeAC(float r, float g, float b, float maximumValue);

/*
 * Lookup table with 256 entries; entry x holds sRGBToLinear(x) for the
 * 8-bit channel value x. NULL until init_sRGBToLinear_cache() has been
 * called.
 */
float *sRGBToLinear_cache = NULL;

/*
 * Build the sRGB -> linear lookup table on first use.
 *
 * Calls after the table has been published return immediately. The table
 * lives in static storage rather than on the heap, so initialisation cannot
 * fail with an out-of-memory NULL dereference and nothing is leaked if two
 * callers initialise at the same time.
 */
static void init_sRGBToLinear_cache(void) {
	static float table[256];

	if (sRGBToLinear_cache != NULL) {
		return;
	}
	for (int x = 0; x < 256; x++) {
		table[x] = sRGBToLinear(x);
	}
	/*
	 * Publish the pointer only after the table is fully populated.
	 * NOTE(review): this is a plain store, not an atomic one; callers that
	 * need strict thread-safety should initialise before spawning threads.
	 */
	sRGBToLinear_cache = table;
}

const char *blurHashForPixels(int xComponents, int yComponents, int width, int height, uint8_t *rgb, size_t bytesPerRow) {
static char buffer[2 + 4 + (9 * 9 - 1) * 2 + 1];

if(xComponents < 1 || xComponents > 9) return NULL;
if(yComponents < 1 || yComponents > 9) return NULL;

float factors[yComponents][xComponents][3];
float factors[yComponents * xComponents][4];
int factorsCount = xComponents * yComponents;
memset(factors, 0, sizeof(factors));

for(int y = 0; y < yComponents; y++) {
float *cosX = (float *)malloc(sizeof(float) * width * factorsCount);
if (! cosX) return NULL;
float *cosY = (float *)malloc(sizeof(float) * height * factorsCount);
if (! cosY) {
free(cosX);
return NULL;
}
for(int i = 0; i < width; i++) {
for(int x = 0; x < xComponents; x++) {
float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow);
factors[y][x][0] = factor[0];
factors[y][x][1] = factor[1];
factors[y][x][2] = factor[2];
float weight = cosf(M_PI * x * i / width);
for(int y = 0; y < yComponents; y++) {
cosX[i * factorsCount + y * xComponents + x] = weight;
}
}
}
for(int i = 0; i < height; i++) {
for(int y = 0; y < yComponents; y++) {
float weight = cosf(M_PI * y * i / height);
for(int x = 0; x < xComponents; x++) {
cosY[i * factorsCount + y * xComponents + x] = weight;
}
}
}
multiplyBasisFunction(factors, factorsCount, width, height, rgb, bytesPerRow, cosX, cosY);
free(cosX);
free(cosY);

float *dc = factors[0][0];
float *ac = dc + 3;
int acCount = xComponents * yComponents - 1;
float *dc = factors[0];
float *ac = dc + 4;
int acCount = factorsCount - 1;
char *ptr = buffer;

int sizeFlag = (xComponents - 1) + (yComponents - 1) * 9;
Expand All @@ -38,7 +74,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
float maximumValue;
if(acCount > 0) {
float actualMaximumValue = 0;
for(int i = 0; i < acCount * 3; i++) {
for(int i = 0; i < acCount * 4; i++) {
actualMaximumValue = fmaxf(fabsf(ac[i]), actualMaximumValue);
}

Expand All @@ -53,35 +89,62 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
ptr = encode_int(encodeDC(dc[0], dc[1], dc[2]), 4, ptr);

for(int i = 0; i < acCount; i++) {
ptr = encode_int(encodeAC(ac[i * 3 + 0], ac[i * 3 + 1], ac[i * 3 + 2], maximumValue), 2, ptr);
ptr = encode_int(encodeAC(ac[i * 4 + 0], ac[i * 4 + 1], ac[i * 4 + 2], maximumValue), 2, ptr);
}

*ptr = 0;

return buffer;
}

static float *multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow) {
float r = 0, g = 0, b = 0;
float normalisation = (xComponent == 0 && yComponent == 0) ? 1 : 2;
static void multiplyBasisFunction(
float factors[][4], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
float *cosX, float *cosY
) {
init_sRGBToLinear_cache();

for(int y = 0; y < height; y++) {
for(int x = 0; x < width; x++) {
float basis = cosf(M_PI * xComponent * x / width) * cosf(M_PI * yComponent * y / height);
r += basis * sRGBToLinear(rgb[3 * x + 0 + y * bytesPerRow]);
g += basis * sRGBToLinear(rgb[3 * x + 1 + y * bytesPerRow]);
b += basis * sRGBToLinear(rgb[3 * x + 2 + y * bytesPerRow]);
uint8_t *src = rgb + y * bytesPerRow;
float *cosYLocal = cosY + y * factorsCount;
int x = 0;
for(; x < width - 3; x += 4) {
float *cosXLocal = cosX + x * factorsCount;
float pixel0[4] = {sRGBToLinear_cache[src[3 * (x+0) + 0]], sRGBToLinear_cache[src[3 * (x+0) + 1]], sRGBToLinear_cache[src[3 * (x+0) + 2]]};
float pixel1[4] = {sRGBToLinear_cache[src[3 * (x+1) + 0]], sRGBToLinear_cache[src[3 * (x+1) + 1]], sRGBToLinear_cache[src[3 * (x+1) + 2]]};
float pixel2[4] = {sRGBToLinear_cache[src[3 * (x+2) + 0]], sRGBToLinear_cache[src[3 * (x+2) + 1]], sRGBToLinear_cache[src[3 * (x+2) + 2]]};
float pixel3[4] = {sRGBToLinear_cache[src[3 * (x+3) + 0]], sRGBToLinear_cache[src[3 * (x+3) + 1]], sRGBToLinear_cache[src[3 * (x+3) + 2]]};
for (int i = 0; i < factorsCount; i++) {
float basis0 = cosYLocal[i] * cosXLocal[i + 0 * factorsCount];
float basis1 = cosYLocal[i] * cosXLocal[i + 1 * factorsCount];
float basis2 = cosYLocal[i] * cosXLocal[i + 2 * factorsCount];
float basis3 = cosYLocal[i] * cosXLocal[i + 3 * factorsCount];
factors[i][0] += basis0 * pixel0[0] + basis1 * pixel1[0] + basis2 * pixel2[0] + basis3 * pixel3[0];
factors[i][1] += basis0 * pixel0[1] + basis1 * pixel1[1] + basis2 * pixel2[1] + basis3 * pixel3[1];
factors[i][2] += basis0 * pixel0[2] + basis1 * pixel1[2] + basis2 * pixel2[2] + basis3 * pixel3[2];
}
}
for(; x < width; x++) {
float pixel[4];
float *cosXLocal = cosX + x * factorsCount;
pixel[0] = sRGBToLinear_cache[src[3 * x + 0]];
pixel[1] = sRGBToLinear_cache[src[3 * x + 1]];
pixel[2] = sRGBToLinear_cache[src[3 * x + 2]];
for (int i = 0; i < factorsCount; i++) {
float basis = cosYLocal[i] * cosXLocal[i];
factors[i][0] += basis * pixel[0];
factors[i][1] += basis * pixel[1];
factors[i][2] += basis * pixel[2];
}
}
}

float scale = normalisation / (width * height);

static float result[3];
result[0] = r * scale;
result[1] = g * scale;
result[2] = b * scale;

return result;
for (int i = 0; i < factorsCount; i++) {
float normalisation = (i == 0) ? 1 : 2;
float scale = normalisation / (width * height);
factors[i][0] *= scale;
factors[i][1] *= scale;
factors[i][2] *= scale;
}
}


Expand Down
4 changes: 2 additions & 2 deletions C/encode_stb.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ int main(int argc, const char **argv) {

int xComponents = atoi(argv[1]);
int yComponents = atoi(argv[2]);
if(xComponents < 1 || xComponents > 8 || yComponents < 1 || yComponents > 8) {
fprintf(stderr, "Component counts must be between 1 and 8.\n");
if(xComponents < 1 || xComponents > 9 || yComponents < 1 || yComponents > 9) {
fprintf(stderr, "Component counts must be between 1 and 9.\n");
return 1;
}

Expand Down