Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Easily compare the effects of cache and permutation with flags #205

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
2f37855
added experiment.py & added useCacheP, usePerm, cacheMultiplier as pa…
Oct 9, 2022
b2856db
added instructions to run the experiemnts to readme
Oct 9, 2022
8968292
fixed typos in readme
Oct 9, 2022
bf03f17
completed experiment.py
lukeleeai Oct 10, 2022
896161d
completed experiment.py
lukeleeai Oct 10, 2022
ffbf00c
updated README
lukeleeai Oct 10, 2022
3647577
added an instruction to install the package first in README
lukeleeai Oct 10, 2022
c5c5bc9
made the experiment results more readable
lukeleeai Oct 10, 2022
f60b0da
updated README with new experiment results
lukeleeai Oct 10, 2022
0e11458
updated README with new experiment results
lukeleeai Oct 10, 2022
c56dd2d
added scripts/reproduce_results.sh
lukeleeai Oct 10, 2022
fdae883
added one-line command to reproduce the results to README
lukeleeai Oct 10, 2022
9545bae
added flags for cpp code
lukeleeai Nov 2, 2022
d33c56c
added flags to BanditPAM to toggle cache and perm
lukeleeai Nov 3, 2022
3bfdb14
count number of samples/cache and experiment with array instead of ha…
lukeleeai Nov 3, 2022
b9de8c1
reset to original code
Nov 8, 2022
e76bc34
changed default seed
Nov 8, 2022
a3a44fb
changed to original code
Nov 8, 2022
01dfc7f
removed unnecessary files
lukeleeai Dec 23, 2022
7a3c968
added small datasets
lukeleeai Dec 23, 2022
f13bdc1
removed print statements
lukeleeai Dec 23, 2022
aed3a53
Updated readme and added comments to experiments.py
lukeleeai Dec 23, 2022
bd22dd1
Create README.md
lukeleeai Dec 23, 2022
6ebc6cc
Update README.md
lukeleeai Dec 23, 2022
d3c6ec5
Update README.md
lukeleeai Dec 23, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Updated readme and added comments to experiments.py
  • Loading branch information
lukeleeai committed Dec 23, 2022
commit aed3a539cabacb3b3671c28ba91c1b327845cea6
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -66,17 +66,16 @@ OR through the source code via
```



## Experiment

### Install the dataset before running the experiments
You can skip this step if you've already run `scripts/reproduce_results.sh`.
If you want to install the package and run the default experiments in one go, please run the following command.
```
/BanditPAM/: wget https://motiwari.com/banditpam_data/MNIST_70k.tar.gz -P data
/BanditPAM/: tar -xf data/MNIST_70k.tar.gz -C data
/BanditPAM/: bash scripts/reproduce_results.sh
```

### Run the experiments
If you want to manually experiment with different conditions, please run the following command.

```
/BanditPAM/: python scripts/experiment.py [options]
```
164 changes: 135 additions & 29 deletions scripts/experiment.py
Original file line number Diff line number Diff line change
@@ -2,70 +2,173 @@
import time
import numpy as np
import banditpam
from typing import List, Tuple
from utils.experiment_utils import get_dataset, print_summary

def _run_experiments(dataset, n_experiments=3, n_medoids=5, useCacheP=True, usePerm=True, cache_multiplier=4000):

def run_experiments(
dataset: np.ndarray,
n_experiments: int = 3,
n_medoids: int = 5,
useCacheP: bool = True,
usePerm: bool = True,
cache_multiplier: int = 4000,
) -> Tuple[float, float]:
"""
Run one type of experiments on a given condition

:param dataset: Numpy array dataset
:param n_experiments: Number of experiments torun
:param n_medoids: Number of medoids to find
:param useCacheP: Whether to use cache
:param usePerm: Whether to use permutation
:param cache_multiplier: A multipler value that determines the cache size

:return: Tuple of mean and standard deviation of run time
"""

print("Cache: %r Perm: %r" % (useCacheP, usePerm))
results = []
for i in range(n_experiments):
kmed = banditpam.KMedoids(n_medoids=n_medoids, algorithm="BanditPAM", useCacheP=useCacheP, usePerm=usePerm, cacheMultiplier=cache_multiplier)
for seed in range(n_experiments):
kmed = banditpam.KMedoids(
n_medoids=n_medoids,
algorithm="BanditPAM",
useCacheP=useCacheP,
usePerm=usePerm,
cacheMultiplier=cache_multiplier,
seed=seed,
)
start = time.time()
kmed.fit(dataset, "L2")
time_elapsed = time.time() - start
results += time_elapsed,
print((i+1), '/', n_experiments, ' : ', time_elapsed, "seconds")
results += (time_elapsed,)
print((seed + 1), "/", n_experiments, " : ", time_elapsed, "seconds")

mean = np.mean(results)
std = np.std(results)

return mean, std

def run_experiments(dataset_name, n_experiments, n_data, n_medoids, cache_multiplier):

def compare_cache_perm(
dataset_name: str,
n_experiments: int,
n_data: int,
n_medoids: int,
cache_multiplier: int,
) -> List[int]:
"""
Run three types of experiments on a given condition
1. Without cache and permutation
2. With cache and without permutation
3. With cache and permutation

:param dataset_name: Name of a dataset (currently only supports MNIST)
:param n_experiments: Number of experiments to run
:param n_data: Size of the dataset
:param n_medoids: Number of medoids to find
:param cache_multiplier: A multipler value that determines the cache size

:return: List of three tuple results (mean, std) from each experiement
"""

print(f"\n[{dataset_name}={n_data}, K={n_medoids}]")

dataset = get_dataset(dataset_name=dataset_name, n_data=n_data)
stats1 = _run_experiments(dataset, n_experiments=1, n_medoids=n_medoids, useCacheP=False, usePerm=False, cache_multiplier=cache_multiplier)
stats2 = _run_experiments(dataset, n_experiments=1, n_medoids=n_medoids, useCacheP=True, usePerm=False, cache_multiplier=cache_multiplier)
# stats3 = _run_experiments(dataset, n_experiments=1, n_medoids=n_medoids, useCacheP=True, usePerm=True, cache_multiplier=cache_multiplier)
stats = [stats1, stats2]

stats1 = run_experiments(
dataset,
n_experiments=n_experiments,
n_medoids=n_medoids,
useCacheP=False,
usePerm=False,
cache_multiplier=cache_multiplier,
)

stats2 = run_experiments(
dataset,
n_experiments=n_experiments,
n_medoids=n_medoids,
useCacheP=True,
usePerm=False,
cache_multiplier=cache_multiplier,
)

stats3 = run_experiments(
dataset,
n_experiments=n_experiments,
n_medoids=n_medoids,
useCacheP=True,
usePerm=True,
cache_multiplier=cache_multiplier,
)

stats = [stats1, stats2, stats3]
return stats

def run_multiple_experiments(dataset_list=[], n_data_list=[], n_medoids_list=[], cache_multiplier=5000):

def run_multiple_experiments(
dataset_list: List[str],
n_data_list: List[str],
n_medoids_list=[],
cache_multiplier=5000,
) -> None:
"""
Run experiments on multiple conditions (datasets, number of data, number of medoids)

:param dataset_list: List of names of datasets (currently only supports MNIST)
:param n_data_list: List of numbers of data
:param n_medoids_list: List of numbers of medoids to find
:param cache_multiplier: A multipler value that determines the cache size
"""
stats_list = []

# collect experiment results
print("\n" + "-"*40)
print("\n" + "-" * 40)
print("\nRUNNING EXPERIMENTS...")

for dataset_name in dataset_list:
for n_data in n_data_list:
for n_medoids in n_medoids_list:
stats = run_experiments(
dataset_name=dataset_name,
n_experiments=3,
n_data=n_data,
n_medoids=n_medoids,
cache_multiplier=cache_multiplier)
stats_list += stats,
stats = compare_cache_perm(
dataset_name=dataset_name,
n_experiments=3,
n_data=n_data,
n_medoids=n_medoids,
cache_multiplier=cache_multiplier,
)
stats_list += (stats,)

# print results
print("\n" + "-"*40)
print("\n" + "-" * 40)
print("\nPRINTING SUMMARY\n")

i = 0
for dataset_name in dataset_list:
print("{:30}{:30}{:30}".format('Cache (X) Perm (X)', 'Cache (O) Perm (X)', 'Cache (O) Perm (O)'))
print(
"{:30}{:30}{:30}".format(
"Cache (X) Perm (X)",
"Cache (O) Perm (X)",
"Cache (O) Perm (O)",
)
)
for n_data in n_data_list:
for n_medoids in n_medoids_list:
stats = stats_list[i]
i += 1
print_summary(stats, dataset_name, n_data, n_medoids)

print("\n" + "-"*40)


print("\n" + "-" * 40)


def main(argv):
try:
opts, _ = getopt.getopt(argv, "k:n:d:c:", ["n_medoids=", "n_data=", "dataset=", "cache_multiplier="])

opts, _ = getopt.getopt(
argv,
"k:n:d:c:",
["n_medoids=", "n_data=", "dataset=", "cache_multiplier="],
)

dataset_list = ["mnist"]
n_medoids_list = [5, 10]
n_data_list = [10000, 30000]
@@ -85,13 +188,16 @@ def main(argv):
# arg = ast.literal_eval(arg)
# dataset_list = [arg] if type(arg) == str else arg
else:
assert(False, "Unhandled option")
assert (False, "Unhandled option")

run_multiple_experiments(dataset_list, n_data_list, n_medoids_list, cache_multiplier)
run_multiple_experiments(
dataset_list, n_data_list, n_medoids_list, cache_multiplier
)

except getopt.GetoptError as error:
print(error)
sys.exit(1)


if __name__ == "__main__":
main(sys.argv[1:])
main(sys.argv[1:])
2 changes: 1 addition & 1 deletion scripts/reproduce_results.sh
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
git submodule update --init --recursive
cd headers/carma
mkdir build && cd build && cmake -DCARMA_INSTALL_LIB=ON .. && sudo cmake --build . --config Release --target install
cd ../../..
cd ../../
pip install -r requirements.txt
sudo pip install .

5 changes: 3 additions & 2 deletions src/python_bindings/kmedoids_pywrapper.cpp
Original file line number Diff line number Diff line change
@@ -18,15 +18,16 @@ namespace km {
PYBIND11_MODULE(banditpam, m) {
m.doc() = "BanditPAM Python library, implemented in C++";
pybind11::class_<KMedoidsWrapper> cls(m, "KMedoids");
cls.def(pybind11::init<int, std::string, bool, bool, int, int, int, int>(),
cls.def(pybind11::init<int, std::string, bool, bool, int, int, int, int, int>(),
pybind11::arg("n_medoids") = NULL,
pybind11::arg("algorithm") = "BanditPAM",
pybind11::arg("useCacheP") = true,
pybind11::arg("usePerm") = true,
pybind11::arg("cacheMultiplier") = 1000,
pybind11::arg("max_iter") = 1000,
pybind11::arg("build_confidence") = 1000,
pybind11::arg("swap_confidence") = 10000);
pybind11::arg("swap_confidence") = 10000,
pybind11::arg("seed") = 0);
cls.def_property("n_medoids",
&KMedoidsWrapper::getNMedoids, &KMedoidsWrapper::setNMedoids);
cls.def_property("algorithm",