Updated readme and added comments to experiments.py

motiwari · lukeleeai · Oct 9, 2022 · Oct 9, 2022 · Oct 9, 2022 · Oct 10, 2022
commit aed3a539cabacb3b3671c28ba91c1b327845cea6
diff --git a/README.md b/README.md
@@ -66,17 +66,16 @@ OR through the source code via
 ```
 
 
-
 ## Experiment
 
-### Install the dataset before running the experiments
-You can skip this step if you've already run `scripts/reproduce_results.sh`.
+To install the package and run the default experiments in one go, run the following command.
 ```
-/BanditPAM/: wget https://motiwari.com/banditpam_data/MNIST_70k.tar.gz -P data
-/BanditPAM/: tar -xf data/MNIST_70k.tar.gz -C data
+/BanditPAM/: bash scripts/reproduce_results.sh
 ```
 
 ### Run the experiments
+If you want to manually experiment with different conditions, run the following command.
+
 ```
 /BanditPAM/: python scripts/experiment.py [options]
 ```

diff --git a/scripts/experiment.py b/scripts/experiment.py
@@ -2,70 +2,173 @@
 import time
 import numpy as np
 import banditpam
+from typing import List, Tuple
 from utils.experiment_utils import get_dataset, print_summary
 
-def _run_experiments(dataset, n_experiments=3, n_medoids=5, useCacheP=True, usePerm=True, cache_multiplier=4000):
+
+def run_experiments(
+    dataset: np.ndarray,
+    n_experiments: int = 3,
+    n_medoids: int = 5,
+    useCacheP: bool = True,
+    usePerm: bool = True,
+    cache_multiplier: int = 4000,
+) -> Tuple[float, float]:
+    """
+    Run one type of experiments on a given condition
+
+    :param dataset: Numpy array dataset
+    :param n_experiments: Number of experiments to run
+    :param n_medoids: Number of medoids to find
+    :param useCacheP: Whether to use cache
+    :param usePerm: Whether to use permutation
+    :param cache_multiplier: A multiplier value that determines the cache size
+
+    :return: Tuple of mean and standard deviation of run time
+    """
+
     print("Cache: %r   Perm: %r" % (useCacheP, usePerm))
     results = []
-    for i in range(n_experiments):    
-        kmed = banditpam.KMedoids(n_medoids=n_medoids, algorithm="BanditPAM", useCacheP=useCacheP, usePerm=usePerm, cacheMultiplier=cache_multiplier)
+    for seed in range(n_experiments):
+        kmed = banditpam.KMedoids(
+            n_medoids=n_medoids,
+            algorithm="BanditPAM",
+            useCacheP=useCacheP,
+            usePerm=usePerm,
+            cacheMultiplier=cache_multiplier,
+            seed=seed,
+        )
         start = time.time()
         kmed.fit(dataset, "L2")
         time_elapsed = time.time() - start
-        results += time_elapsed,
-        print((i+1), '/', n_experiments, ' : ', time_elapsed, "seconds")
+        results += (time_elapsed,)
+        print((seed + 1), "/", n_experiments, " : ", time_elapsed, "seconds")
 
     mean = np.mean(results)
     std = np.std(results)
 
     return mean, std
 
-def run_experiments(dataset_name, n_experiments, n_data, n_medoids, cache_multiplier):
+
+def compare_cache_perm(
+    dataset_name: str,
+    n_experiments: int,
+    n_data: int,
+    n_medoids: int,
+    cache_multiplier: int,
+) -> List[int]:
+    """
+    Run three types of experiments on a given condition
+    1. Without cache and permutation
+    2. With cache and without permutation
+    3. With cache and permutation
+
+    :param dataset_name: Name of a dataset (currently only supports MNIST)
+    :param n_experiments: Number of experiments to run
+    :param n_data: Size of the dataset
+    :param n_medoids: Number of medoids to find
+    :param cache_multiplier: A multiplier value that determines the cache size
+
+    :return: List of three (mean, std) result tuples, one from each experiment
+    """
+
     print(f"\n[{dataset_name}={n_data}, K={n_medoids}]")
+
     dataset = get_dataset(dataset_name=dataset_name, n_data=n_data)
-    stats1 = _run_experiments(dataset, n_experiments=1, n_medoids=n_medoids, useCacheP=False, usePerm=False, cache_multiplier=cache_multiplier)
-    stats2 = _run_experiments(dataset, n_experiments=1, n_medoids=n_medoids, useCacheP=True, usePerm=False, cache_multiplier=cache_multiplier)
-    # stats3 = _run_experiments(dataset, n_experiments=1, n_medoids=n_medoids, useCacheP=True, usePerm=True, cache_multiplier=cache_multiplier)
-    stats = [stats1, stats2]
+
+    stats1 = run_experiments(
+        dataset,
+        n_experiments=n_experiments,
+        n_medoids=n_medoids,
+        useCacheP=False,
+        usePerm=False,
+        cache_multiplier=cache_multiplier,
+    )
+
+    stats2 = run_experiments(
+        dataset,
+        n_experiments=n_experiments,
+        n_medoids=n_medoids,
+        useCacheP=True,
+        usePerm=False,
+        cache_multiplier=cache_multiplier,
+    )
+
+    stats3 = run_experiments(
+        dataset,
+        n_experiments=n_experiments,
+        n_medoids=n_medoids,
+        useCacheP=True,
+        usePerm=True,
+        cache_multiplier=cache_multiplier,
+    )
+
+    stats = [stats1, stats2, stats3]
     return stats
 
-def run_multiple_experiments(dataset_list=[], n_data_list=[], n_medoids_list=[], cache_multiplier=5000):
+
+def run_multiple_experiments(
+    dataset_list: List[str],
+    n_data_list: List[str],
+    n_medoids_list=[],
+    cache_multiplier=5000,
+) -> None:
+    """
+    Run experiments on multiple conditions (datasets, number of data, number of medoids)
+
+    :param dataset_list: List of names of datasets (currently only supports MNIST)
+    :param n_data_list: List of numbers of data
+    :param n_medoids_list: List of numbers of medoids to find
+    :param cache_multiplier: A multiplier value that determines the cache size
+    """
     stats_list = []
 
     # collect experiment results
-    print("\n" + "-"*40)
+    print("\n" + "-" * 40)
     print("\nRUNNING EXPERIMENTS...")
 
     for dataset_name in dataset_list:
         for n_data in n_data_list:
             for n_medoids in n_medoids_list:
-                stats = run_experiments(
-                    dataset_name=dataset_name, 
-                    n_experiments=3, 
-                    n_data=n_data, 
-                    n_medoids=n_medoids, 
-                    cache_multiplier=cache_multiplier)
-                stats_list += stats,
+                stats = compare_cache_perm(
+                    dataset_name=dataset_name,
+                    n_experiments=3,
+                    n_data=n_data,
+                    n_medoids=n_medoids,
+                    cache_multiplier=cache_multiplier,
+                )
+                stats_list += (stats,)
 
     # print results
-    print("\n" + "-"*40)
+    print("\n" + "-" * 40)
     print("\nPRINTING SUMMARY\n")
 
     i = 0
     for dataset_name in dataset_list:
-        print("{:30}{:30}{:30}".format('Cache (X) Perm (X)', 'Cache (O) Perm (X)', 'Cache (O) Perm (O)'))
+        print(
+            "{:30}{:30}{:30}".format(
+                "Cache (X) Perm (X)",
+                "Cache (O) Perm (X)",
+                "Cache (O) Perm (O)",
+            )
+        )
         for n_data in n_data_list:
             for n_medoids in n_medoids_list:
                 stats = stats_list[i]
                 i += 1
                 print_summary(stats, dataset_name, n_data, n_medoids)
-
-    print("\n" + "-"*40)
-
+
+    print("\n" + "-" * 40)
+
+
 def main(argv):
     try:
-        opts, _ = getopt.getopt(argv, "k:n:d:c:", ["n_medoids=", "n_data=", "dataset=", "cache_multiplier="])
-
+        opts, _ = getopt.getopt(
+            argv,
+            "k:n:d:c:",
+            ["n_medoids=", "n_data=", "dataset=", "cache_multiplier="],
+        )
+
         dataset_list = ["mnist"]
         n_medoids_list = [5, 10]
         n_data_list = [10000, 30000]
@@ -85,13 +188,16 @@ def main(argv):
             #     arg = ast.literal_eval(arg)
             #     dataset_list = [arg] if type(arg) == str else arg
             else:
-                assert(False, "Unhandled option")
+                assert (False, "Unhandled option")
 
-        run_multiple_experiments(dataset_list, n_data_list, n_medoids_list, cache_multiplier)
+        run_multiple_experiments(
+            dataset_list, n_data_list, n_medoids_list, cache_multiplier
+        )
 
     except getopt.GetoptError as error:
         print(error)
         sys.exit(1)
 
+
 if __name__ == "__main__":
-    main(sys.argv[1:])
+    main(sys.argv[1:])
diff --git a/scripts/reproduce_results.sh b/scripts/reproduce_results.sh
@@ -2,7 +2,7 @@
 git submodule update --init --recursive
 cd headers/carma
 mkdir build && cd build && cmake -DCARMA_INSTALL_LIB=ON .. && sudo cmake --build . --config Release --target install
-cd ../../..
+cd ../../
 pip install -r requirements.txt
 sudo pip install .
 

diff --git a/src/python_bindings/kmedoids_pywrapper.cpp b/src/python_bindings/kmedoids_pywrapper.cpp
@@ -18,15 +18,16 @@ namespace km {
 PYBIND11_MODULE(banditpam, m) {
   m.doc() = "BanditPAM Python library, implemented in C++";
   pybind11::class_<KMedoidsWrapper> cls(m, "KMedoids");
-  cls.def(pybind11::init<int, std::string, bool, bool, int, int, int, int>(),
+  cls.def(pybind11::init<int, std::string, bool, bool, int, int, int, int, int>(),
           pybind11::arg("n_medoids") = NULL,
           pybind11::arg("algorithm") = "BanditPAM",
           pybind11::arg("useCacheP") = true,
           pybind11::arg("usePerm") = true,
           pybind11::arg("cacheMultiplier") = 1000,
           pybind11::arg("max_iter") = 1000,
           pybind11::arg("build_confidence") = 1000,
-          pybind11::arg("swap_confidence") = 10000);
+          pybind11::arg("swap_confidence") = 10000,
+          pybind11::arg("seed") = 0);
   cls.def_property("n_medoids",
     &KMedoidsWrapper::getNMedoids, &KMedoidsWrapper::setNMedoids);
   cls.def_property("algorithm",