Skip to content

Commit

Permalink
[SYSTEMDS-3150] Outlier Detection via DBSCAN
Browse files Browse the repository at this point in the history
  - This commit introduces dbscanApply() method to find the cluster membership
    of unseen (test) data.
Closes apache#1497.
  • Loading branch information
ratschillerp authored and Shafaq-Siddiqi committed Jan 17, 2022
1 parent ac807f4 commit 47533e2
Show file tree
Hide file tree
Showing 8 changed files with 313 additions and 30 deletions.
38 changes: 37 additions & 1 deletion docs/site/builtins-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ limitations under the License.
* [`naiveBayesPredict`-Function](#naiveBayesPredict-function)
* [`normalize`-Function](#normalize-function)
* [`outlier`-Function](#outlier-function)
* [`outlierByDB`-Function](#outlierByDB-function)
* [`pnmf`-Function](#pnmf-function)
* [`scale`-Function](#scale-function)
* [`setdiff`-Function](#setdiff-function)
Expand Down Expand Up @@ -422,12 +423,13 @@ Y = dbscan(X = X, eps = 2.5, minPts = 5)
| Type | Description |
| :-----------| :---------- |
| Matrix[Integer] | The mapping of records to clusters |
| Matrix[Double] | The coordinates of all points considered part of a cluster |

### Example

```r
X = rand(rows=1780, cols=180, min=1, max=20)
dbscan(X = X, eps = 2.5, minPts = 360)
[indices, model] = dbscan(X = X, eps = 2.5, minPts = 360)
```


Expand Down Expand Up @@ -1756,6 +1758,40 @@ X = rand (rows = 50, cols = 10)
outlier(X=X, opposite=1)
```

## `outlierByDB`-Function

The `outlierByDB`-function implements an outlier prediction for a trained dbscan model. The points in the `Xtest` matrix are checked against the model and are considered part of the cluster if at least one member is within `eps` distance.

### Usage

```r
outlierByDB(X, model, eps)
```

### Arguments

| Name | Type | Default | Description |
| :------- | :------------- | -------- | :---------- |
| Xtest | Matrix[Double] | required | Matrix of points for outlier testing |
| model | Matrix[Double] | required | Matrix model of the clusters, containing all points that are considered members, returned by the [`dbscan builtin`](#DBSCAN-function) |
| eps | Double | 0.5 | Epsilon distance between points to be considered in their neighborhood |

### Returns

| Type | Description |
| :------------- | :---------- |
| Matrix[Double] | Matrix indicating outlier values of the points in Xtest, 0 suggests it being an outlier |

### Example

```r
eps = 1
minPts = 5
X = rand(rows=1780, cols=180, min=1, max=20)
[indices, model] = dbscan(X=X, eps=eps, minPts=minPts)
Y = rand(rows=500, cols=180, min=1, max=20)
Z = outlierByDB(Xtest=Y, clusterModel = model, eps = eps)
```

## `pnmf`-Function

Expand Down
57 changes: 29 additions & 28 deletions scripts/builtin/dbscan.dml
Original file line number Diff line number Diff line change
Expand Up @@ -39,42 +39,43 @@
# ----------------------------------------------------------------------------------------------------------------------

m_dbscan = function (Matrix[Double] X, Double eps = 0.5, Integer minPts = 5)
return (Matrix[Double] clusterMembers)
return (Matrix[Double] clusterMembers, Matrix[Double] clusterModel)
{
#check input parameter assertions
if(minPts < 0) { stop("DBSCAN: Stopping due to invalid inputs: minPts should be greater than 0"); }
if(eps < 0) { stop("DBSCAN: Stopping due to invalid inputs: Epsilon (eps) should be greater than 0"); }
#check input parameter assertions
if(minPts < 0) { stop("DBSCAN: Stopping due to invalid inputs: minPts should be greater than 0"); }
if(eps < 0) { stop("DBSCAN: Stopping due to invalid inputs: Epsilon (eps) should be greater than 0"); }

UNASSIGNED = 0;
UNASSIGNED = 0;

num_records = nrow(X);
num_features = ncol(X);
num_records = nrow(X);
num_features = ncol(X);

neighbors = dist(X);
neighbors = dist(X);

#find same pts and set their distance to the smallest double representation
neighbors = replace(target = neighbors, pattern = 0, replacement = 2.225e-307)
neighbors = neighbors - diag(diag(neighbors));
#find same pts and set their distance to the smallest double representation
neighbors = replace(target = neighbors, pattern = 0, replacement = 2.225e-307)
neighbors = neighbors - diag(diag(neighbors));

# neighbors within eps
withinEps = ((neighbors <= eps) * (0 < neighbors));
corePts = rowSums(withinEps) + 1 >= minPts;
# neighbors within eps
withinEps = ((neighbors <= eps) * (0 < neighbors));
corePts = rowSums(withinEps) + 1 >= minPts;

clusterMembers = matrix(UNASSIGNED, num_records, 1);
clusterMembers = matrix(UNASSIGNED, num_records, 1);
clusterModel = matrix(UNASSIGNED, 0, num_features);

if (sum(corePts) != 0) {
# leave only density reachable pts
neighbors = (neighbors * corePts * withinEps) > 0;
if (sum(corePts) != 0) {
# leave only density reachable pts
neighbors = (neighbors * corePts * withinEps) > 0;
# border pts of multiple clusters
border = neighbors * (t(corePts) == 0 & colSums(neighbors) > 1) * seq(num_records, 1);
border = (border - colMaxs(border)) == 0;
neighbors = neighbors * border;

# border pts of multiple clusters
border = neighbors * (t(corePts) == 0 & colSums(neighbors) > 1) * seq(num_records, 1);
border = (border - colMaxs(border)) == 0;
neighbors = neighbors * border;
adjacency = (neighbors + t(neighbors)) > 0;

adjacency = (neighbors + t(neighbors)) > 0;

clusterMembers = components(G=adjacency, verbose=FALSE);
# noise to 0
clusterMembers = clusterMembers * (rowSums(adjacency) > 0);
}
clusterMembers = components(G=adjacency, verbose=FALSE);
# noise to 0
clusterMembers = clusterMembers * (rowSums(adjacency) > 0);
clusterModel = removeEmpty(target=X, margin="rows", select = (clusterMembers > 0))
}
}
58 changes: 58 additions & 0 deletions scripts/builtin/dbscanApply.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# Implements the outlier detection/prediction algorithm using a DBScan model
#
# INPUT PARAMETERS:
# ----------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ----------------------------------------------------------------------------
# Xtest Matrix[Double] --- The input Matrix to do outlier detection on.
# clusterModel Matrix[Double] --- Model of clusters to predict outliers against.
# eps Double 0.5 Maximum distance between two points for one to be considered reachable for the other.

# OUTPUT PARAMETERS:
# ----------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ----------------------------------------------------------------------------
# outlierPoints Matrix[Double] --- Predicted outliers


m_dbscanApply = function (Matrix[Double] Xtest, Matrix[Double] clusterModel, Double eps = 0.5)
return (Matrix[double] outlierPoints)
{
num_features_Xtest = ncol(Xtest);
num_rows_Xtest = nrow(Xtest);
num_features_model = ncol(clusterModel);
num_rows_model = nrow(clusterModel);

if(num_features_Xtest != num_features_model) {stop("DBSCAN Outlier: Stopping due to invalid inputs: features need to match");}
if(eps < 0) { stop("DBSCAN Outlier: Stopping due to invalid inputs: Epsilon (eps) should be greater than 0"); }
if(num_rows_model <= 0) { stop("DBSCAN Outlier: Stopping due to invalid inputs: Model is empty"); }

X = rbind(clusterModel, Xtest);
neighbors = dist(X);
neighbors = replace(target = neighbors, pattern = 0, replacement = 2.225e-307);
neighbors = neighbors - diag(diag(neighbors));
Xtest_dists = neighbors[(num_rows_model+1):nrow(X), 1:num_rows_model];
withinEps = ((Xtest_dists <= eps) * (0 < Xtest_dists));
outlierPoints = rowSums(withinEps) >= 1;
}
1 change: 1 addition & 0 deletions src/main/java/org/apache/sysds/common/Builtins.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ public enum Builtins {
CUMSUM("cumsum", false),
CUMSUMPROD("cumsumprod", false),
DBSCAN("dbscan", true),
DBSCANAPPLY("dbscanApply", true),
DECISIONTREE("decisionTree", true),
DECOMPRESS("decompress", false),
DEEPWALK("deepWalk", true),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.sysds.test.functions.builtin.part1;
import org.apache.sysds.common.Types.ExecMode;
import org.apache.sysds.common.Types.ExecType;
import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
import org.apache.sysds.test.AutomatedTestBase;
import org.apache.sysds.test.TestConfiguration;
import org.apache.sysds.test.TestUtils;
import org.junit.Test;
import java.util.HashMap;

public class BuiltinDbscanApplyTest extends AutomatedTestBase
{
private final static String TEST_NAME = "dbscanApply";
private final static String TEST_DIR = "functions/builtin/";
private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinDbscanApplyTest.class.getSimpleName() + "/";

private final static double eps = 1e-9;
private final static int rows = 1700;
private final static int cols = 3;
private final static int min = -10;
private final static int max = 10;

private final static int minPts = 5;

@Override
public void setUp() {
addTestConfiguration(TEST_NAME,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME,new String[]{"B"}));
}

@Test
public void testDBSCANOutlierDefault0CP() {
runOutlierByDBSCAN(true, 6, 18, 1, ExecType.CP);
}

@Test
public void testDBSCANOutlierDefault0SP() {
runOutlierByDBSCAN(true, 6, 18, 1, ExecType.SPARK);
}

@Test
public void testDBSCANOutlierDefault1CP() {
runOutlierByDBSCAN(true, 5, 15, 1, ExecType.CP);
}

@Test
public void testDBSCANOutlierDefault1SP() {
runOutlierByDBSCAN(true, 5, 15, 1, ExecType.SPARK);
}

@Test
public void testDBSCANOutlierDefault2CP() {
runOutlierByDBSCAN(true, 12, 77, 1, ExecType.CP);
}

@Test
public void testDBSCANOutlierDefault2SP() {
runOutlierByDBSCAN(true, 12, 77, 1, ExecType.SPARK);
}

private void runOutlierByDBSCAN(boolean defaultProb, int seedA, int seedB, double epsDB, ExecType instType)
{
ExecMode platformOld = setExecMode(instType);

try
{
loadTestConfiguration(getTestConfiguration(TEST_NAME));
String HOME = SCRIPT_DIR + TEST_DIR;

fullDMLScriptName = HOME + TEST_NAME + ".dml";
programArgs = new String[]{"-explain","-nvargs",
"X=" + input("A"), "Y=" + input("B"),"Z=" + output("C"), "eps=" + epsDB, "minPts=" + minPts};
fullRScriptName = HOME + TEST_NAME + ".R";
rCmd = getRCmd(inputDir(), inputDir(), Double.toString(epsDB), Integer.toString(minPts), expectedDir());

//generate actual dataset
double[][] A = getNonZeroRandomMatrix(rows, cols, min, max, seedA);
writeInputMatrixWithMTD("A", A, true);
double[][] B = getNonZeroRandomMatrix(rows, cols, min, max, seedB);
writeInputMatrixWithMTD("B", B, true);

runTest(true, false, null, -1);
runRScript(true);

//compare matrices
HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("C");
HashMap<CellIndex, Double> rfile = readRMatrixFromExpectedDir("C");

TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
}
finally {
rtplatform = platformOld;
}
}
}
2 changes: 1 addition & 1 deletion src/test/scripts/functions/builtin/dbscan.dml
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@
X = read($X);
eps = as.double($eps);
minPts = as.integer($minPts);
Y = dbscan(X, eps, minPts);
[Y, model] = dbscan(X, eps, minPts);
write(Y, $Y);
45 changes: 45 additions & 0 deletions src/test/scripts/functions/builtin/dbscanApply.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

args<-commandArgs(TRUE)
library("Matrix")
library("dbscan")

X = as.matrix(readMM(paste(args[1], "A.mtx", sep="")));
Y = as.matrix(readMM(paste(args[2], "B.mtx", sep="")));
eps = as.double(args[3]);
minPts = as.integer(args[4]);
dbModel = dbscan(X, eps, minPts);

cleanMatr = matrix(, nrow = nrow(X), ncol = 3)
for(i in 1:nrow(X)) {
if(dbModel$cluster[i] > 0) {
cleanMatr[i,] = X[i,]
}
}

cleanMatr = cleanMatr[rowSums(is.na(cleanMatr)) != ncol(cleanMatr),]

dbModelClean = dbscan(cleanMatr, eps, minPts);

Z = predict(dbModelClean, Y, data = cleanMatr);
Z[Z > 0] = 1;
writeMM(as(Z, "CsparseMatrix"), paste(args[5], "C", sep=""));
Loading

0 comments on commit 47533e2

Please sign in to comment.