Commit e6a3bfd

Add benchmark results of LightRAG

liukidar committed Dec 2, 2024
1 parent 79322c6 commit e6a3bfd

Showing 16 changed files with 4,620 additions and 86 deletions.
8 changes: 7 additions & 1 deletion .vscode/settings.json
@@ -8,5 +8,11 @@
"*_test.py"
],
"python.testing.pytestEnabled": false,
"python.testing.unittestEnabled": true
"python.testing.unittestEnabled": true,
"python.autoComplete.extraPaths": [
"${workspaceFolder}"
],
"python.analysis.extraPaths": [
"${workspaceFolder}"
]
}
32 changes: 27 additions & 5 deletions benchmarks/README.md
@@ -1,32 +1,54 @@
## Benchmarks
We validate the benchmark results provided in [HippoRAG](https://arxiv.org/abs/2405.14831).
We validate the benchmark results provided in [HippoRAG](https://arxiv.org/abs/2405.14831) and compare them against other methods:
- NaiveRAG using the embedder `text-embedding-3-small`
- [LightRAG](https://github.com/HKUDS/LightRAG)

The scripts in this directory will generate and evaluate the 2wikimultihopqa dataset on subsets of 51 and 101 queries with the same methodology as in the paper above. We preloaded the results, so it is enough to run `evaluate.xx` to get the numbers. You can also run `create_dbs.xx` to regenerate the vector and graph databases.
The scripts in this directory will generate and evaluate the 2wikimultihopqa dataset on subsets of 51 and 101 queries with the same methodology as in the HippoRAG paper. In particular, we evaluate the retrieval capabilities of each method, measuring the percentage of queries for which all the required evidence was retrieved. We preloaded the results, so it is enough to run `evaluate.xx` to get the numbers. You can also run `create_dbs.xx` to regenerate the databases for the different methods (you will need to set OPENAI_API_KEY; LightRAG can take a while to process).

The output should look similar to the following (the exact numbers may vary based on your graph configuration):
```
Evaluation of the performance of the VectorDB and Circlemind on the same data (51 queries)
Evaluation of the performance of different RAG methods on 2wikimultihopqa (51 queries)
VectorDB
Loading dataset...
[all questions] Percentage of queries with perfect retrieval: 0.49019607843137253
[multihop only] Percentage of queries with perfect retrieval: 0.32432432432432434
LightRAG
Loading dataset...
Percentage of queries with perfect retrieval: 0.47058823529411764
[multihop] Percentage of queries with perfect retrieval: 0.32432432432432434
Circlemind
Loading dataset...
[all questions] Percentage of queries with perfect retrieval: 0.9607843137254902
[multihop only] Percentage of queries with perfect retrieval: 0.9459459459459459
Evaluation of the performance of the VectorDB and Circlemind on the same data (101 queries)
Evaluation of the performance of different RAG methods on 2wikimultihopqa (101 queries)
VectorDB
Loading dataset...
[all questions] Percentage of queries with perfect retrieval: 0.4158415841584158
[multihop only] Percentage of queries with perfect retrieval: 0.2318840579710145
LightRAG [local]
Loading dataset...
Percentage of queries with perfect retrieval: 0.44554455445544555
[multihop] Percentage of queries with perfect retrieval: 0.2753623188405797
Circlemind
Loading dataset...
[all questions] Percentage of queries with perfect retrieval: 0.9306930693069307
[multihop only] Percentage of queries with perfect retrieval: 0.8985507246376812
```

We also quickly benchmarked on the HotpotQA dataset (we will soon release the code for that as well). Here's a preview of the results (101 queries):

```
VectorDB: 0.78
LightRAG [local mode]: 0.55
Circlemind: 0.84
```
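
For reference, this is how a perfect-retrieval percentage like the ones above can be computed from a list of per-query reports (a minimal sketch; the `reports` structure and the function name are illustrative, not the repository's API):

```
from typing import Dict, List


def perfect_retrieval_rate(reports: List[Dict[str, List[str]]]) -> float:
    """Fraction of queries whose full ground-truth evidence set was retrieved."""
    perfect = 0
    for report in reports:
        # A query counts as perfect only if every required passage was retrieved.
        if set(report["ground_truth"]) <= set(report["evidence"]):
            perfect += 1
    return perfect / len(reports)


# Example: only the first of the two queries retrieves all of its evidence.
reports = [
    {"ground_truth": ["A", "B"], "evidence": ["A", "B", "C"]},
    {"ground_truth": ["A", "D"], "evidence": ["A"]},
]
print(perfect_retrieval_rate(reports))  # 0.5
```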
46 changes: 46 additions & 0 deletions benchmarks/_domain.py
@@ -0,0 +1,46 @@
from typing import Dict, List

DOMAIN: Dict[str, str] = {
"2wikimultihopqa": """Analyse the following passage and identify the people, creative works, and places mentioned in it. Your goal is to create an RDF (Resource Description Framework) graph from the given text.
IMPORTANT: among other entities and relationships you find, make sure to extract as separate entities (to be connected with the main one) a person's
role as a family member (such as 'son', 'uncle', 'wife', ...), their profession (such as 'director'), and the location
where they live or work. Pay attention to the spelling of the names.""", # noqa: E501
"hotpotqa": """Analyse the following passage and identify all the entities mentioned in it and their relationships. Your goal is to create an RDF (Resource Description Framework) graph from the given text.
Pay attention to the spelling of the entity names."""
}

QUERIES: Dict[str, List[str]] = {
"2wikimultihopqa": [
"When did Prince Arthur's mother die?",
"What is the place of birth of Elizabeth II's husband?",
"Which film has the director died later, Interstellar or Harry Potter I?",
"Where does the singer who wrote the song Blank Space work at?",
],
"hotpotqa": [
"Are Christopher Nolan and Sathish Kalathil both film directors?",
"What language were books being translated into during the era of Haymo of Faversham?",
"Who directed the film that was shot in or around Leland, North Carolina in 1986?",
"Who wrote a song after attending a luau in the Koolauloa District on the island of Oahu in Honolulu County?"
]
}

ENTITY_TYPES: Dict[str, List[str]] = {
"2wikimultihopqa": [
"person",
"familiy_role",
"location",
"organization",
"creative_work",
"profession",
],
"hotpotqa": [
"person",
"familiy_role",
"location",
"organization",
"creative_work",
"profession",
"event",
"year"
],
}
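
All three dictionaries are keyed by dataset name, so a benchmark script can look up its per-dataset configuration directly. A minimal usage sketch (the `dataset` variable is illustrative):

```
from _domain import DOMAIN, ENTITY_TYPES, QUERIES

dataset = "2wikimultihopqa"
domain = DOMAIN[dataset]                       # extraction instructions for this dataset
example_queries = "\n".join(QUERIES[dataset])  # example queries, one per line
entity_types = ENTITY_TYPES[dataset]           # entity labels for graph construction
```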
24 changes: 20 additions & 4 deletions benchmarks/create_dbs.bat
@@ -1,4 +1,20 @@
python vdb_benchmark.py -n 51 -c -b
python vdb_benchmark.py -n 101 -c -b
python graph_benchmark.py -n 51 -c -b
python graph_benchmark.py -n 101 -c -b
:: 2wikimultihopqa benchmark
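:: Flag meanings (inferred from how the scripts use them): -n selects the query
:: subset size, -c creates the databases, -b generates the retrieval reports,
:: and -s (used in evaluate_dbs) prints the evaluation scores.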
:: Creating databases
python vdb_benchmark.py -n 51 -c
python vdb_benchmark.py -n 101 -c
python lightrag_benchmark.py -n 51 -c
python lightrag_benchmark.py -n 101 -c
python graph_benchmark.py -n 51 -c
python graph_benchmark.py -n 101 -c

:: Evaluation (create reports)
python vdb_benchmark.py -n 51 -b
python vdb_benchmark.py -n 101 -b
python lightrag_benchmark.py -n 51 -b --mode=local
python lightrag_benchmark.py -n 101 -b --mode=local
:: feel free to try with 'global' as well
python lightrag_benchmark.py -n 51 -b --mode=hybrid
:: feel free to try with 'global' as well
python lightrag_benchmark.py -n 101 -b --mode=hybrid
python graph_benchmark.py -n 51 -b
python graph_benchmark.py -n 101 -b
14 changes: 14 additions & 0 deletions benchmarks/create_dbs.sh
@@ -1,4 +1,18 @@
# 2wikimultihopqa benchmark
# Creating databases
python vdb_benchmark.py -n 51 -c -b
python vdb_benchmark.py -n 101 -c -b
python lightrag_benchmark.py -n 51 -c -b
python lightrag_benchmark.py -n 101 -c -b
python graph_benchmark.py -n 51 -c -b
python graph_benchmark.py -n 101 -c -b

# Evaluation (create reports)
python vdb_benchmark.py -n 51 -b
python vdb_benchmark.py -n 101 -b
python lightrag_benchmark.py -n 51 -b --mode=local
python lightrag_benchmark.py -n 101 -b --mode=local
python lightrag_benchmark.py -n 51 -b --mode=hybrid # feel free to try with 'global' as well
python lightrag_benchmark.py -n 101 -b --mode=hybrid # feel free to try with 'global' as well
python graph_benchmark.py -n 51 -b
python graph_benchmark.py -n 101 -b
File renamed without changes.
2 changes: 0 additions & 2 deletions benchmarks/db/vdb/.gitignore

This file was deleted.

12 changes: 10 additions & 2 deletions benchmarks/evaluate_dbs.bat
@@ -1,18 +1,26 @@
@echo off
echo Evaluation of the performance of the VectorDB and Circlemind on the same data (51 queries)
echo Evaluation of the performance of different RAG methods on 2wikimultihopqa (51 queries)
echo.
echo VectorDB
python vdb_benchmark.py -n 51 -s
echo.
echo LightRAG
python lightrag_benchmark.py -n 51 -s --mode=local
python lightrag_benchmark.py -n 51 -s --mode=hybrid
echo.
echo Circlemind
python graph_benchmark.py -n 51 -s

echo.
echo.
echo Evaluation of the performance of the VectorDB and Circlemind on the same data (101 queries)
echo Evaluation of the performance of different RAG methods on 2wikimultihopqa (101 queries)
echo.
echo VectorDB
python vdb_benchmark.py -n 101 -s
echo.
echo LightRAG
python lightrag_benchmark.py -n 101 -s --mode=local
python lightrag_benchmark.py -n 101 -s --mode=hybrid
echo.
echo Circlemind
python graph_benchmark.py -n 101 -s
16 changes: 14 additions & 2 deletions benchmarks/evaluate_dbs.sh
@@ -1,15 +1,27 @@
echo "Evaluation of the performance of the VectorDB and Circlemind on the same data (51 queries)";
echo "Evaluation of the performance of different RAG methods on the 2wikimultihopqa (51 queries)";
echo;
echo "VectorDB";
python vdb_benchmark.py -n 51 -s
echo;
echo "LightRAG [local mode]";
python lightrag_benchmark.py -n 51 -s --mode=local
# feel free to try with global as well
echo "[hybrid mode]";
python lightrag_benchmark.py -n 51 -s --mode=hybrid
echo;
echo "Circlemind"
python graph_benchmark.py -n 51 -s

echo "Evaluation of the performance of the VectorDB and Circlemind on the same data (101 queries)";
echo "Evaluation of the performance of different RAG methods on the 2wikimultihopqa (101 queries)";
echo;
echo "VectorDB";
python vdb_benchmark.py -n 101 -s
echo;
echo "LightRAG [local mode]";
python lightrag_benchmark.py -n 101 -s --mode=local
# feel free to try with global as well
echo "[hybrid mode]";
python lightrag_benchmark.py -n 101 -s --mode=hybrid
echo;
echo "Circlemind";
python graph_benchmark.py -n 101 -s
80 changes: 30 additions & 50 deletions benchmarks/graph_benchmark.py
@@ -3,12 +3,12 @@
import argparse
import asyncio
import json
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Any, Dict, List, Tuple

import numpy as np
import xxhash
from _domain import DOMAIN, ENTITY_TYPES, QUERIES
from dotenv import load_dotenv
from tqdm import tqdm

@@ -19,6 +19,7 @@
@dataclass
class Query:
"""Dataclass for a query."""

question: str = field()
answer: str = field()
evidence: List[Tuple[str, int]] = field()
@@ -35,29 +36,25 @@ def load_dataset(dataset_name: str, subset: int = 0) -> Any:
    return dataset


def get_corpus(dataset: Any) -> Dict[str, str]:
def get_corpus(dataset: Any, dataset_name: str) -> Dict[int, Tuple[int | str, str]]:
    """Get the corpus from the dataset."""
    passages: Dict[str, List[List[str]]] = defaultdict(list)

    for datapoint in dataset:
        context = datapoint["context"]

        for passage in context:
            title, text = passage
            passages[title].append(text)
    if dataset_name == "2wikimultihopqa" or dataset_name == "hotpotqa":
        passages: Dict[int, Tuple[int | str, str]] = {}

    for title, passage in passages.items():
        ids = np.array([xxhash.xxh64(" ".join(p)).intdigest() for p in passage], dtype=np.uint64)
        for datapoint in dataset:
            context = datapoint["context"]

        # Check that all ids are the same
        assert np.all(ids == ids[0]), f"Passages with the same title do not have the same hash: {title}"
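            # Deduplicate by the 64-bit xxh3 hash of the passage text so that
            # identical passages shared across datapoints are stored only once.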
            for passage in context:
                title, text = passage
                title = title.encode("utf-8").decode()
                text = "\n".join(text).encode("utf-8").decode()
                hash_t = xxhash.xxh3_64_intdigest(text)
                if hash_t not in passages:
                    passages[hash_t] = (title, text)

        passages[title] = [passage[0]]

    return {
        title.encode("utf-8").decode(): " ".join(passage[0]).encode("utf-8").decode()
        for title, passage in passages.items()
    }
        return passages
    else:
        raise NotImplementedError(f"Dataset {dataset_name} not supported.")


def get_queries(dataset: Any):
@@ -79,28 +76,6 @@ def get_queries(dataset: Any):
if __name__ == "__main__":
    load_dotenv()

    DOMAIN = """Analyse the following passage and identify the people, creative works, and places mentioned in it.
IMPORTANT: be careful to make sure to extract as separate entities (to be connected with the main one) a person's
role as a family member (such as 'son', 'uncle', 'wife', ...), their profession (such as 'director'), and the location
where they live or work. Each entity description should be a short summary containing only essential information
to characterize the entity. Pay attention to the spelling of the names.
"""
    QUERIES = [
        "When did Prince Arthur's mother die?",
        "What is the place of birth of Elizabeth II's husband?",
        "Which film has the director died later, Interstellar or Harry Potter I?",
        "Where does the singer who wrote the song Blank Space work at?",
    ]

    ENTITY_TYPES = [
        "person",
        "familiy_role",
        "location",
        "organization",
        "creative_work",
        "profession",
    ]

    parser = argparse.ArgumentParser(description="GraphRAG CLI")
    parser.add_argument("-d", "--dataset", default="2wikimultihopqa", help="Dataset to use.")
    parser.add_argument("-n", type=int, default=0, help="Subset of corpus to use.")
@@ -112,36 +87,41 @@ def get_queries(dataset: Any):
print("Loading dataset...")
dataset = load_dataset(args.dataset, subset=args.n)
working_dir = f"./db/graph/{args.dataset}_{args.n}"
corpus = get_corpus(dataset, args.dataset)

if args.create:
corpus = get_corpus(dataset)
print("Dataset loaded. Corpus:", len(corpus))
grag = GraphRAG(
working_dir=working_dir,
domain=DOMAIN,
domain=DOMAIN[args.dataset],
example_queries="\n".join(QUERIES),
entity_types=ENTITY_TYPES,
entity_types=ENTITY_TYPES[args.dataset],
)
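        # corpus maps passage-hash -> (title, text); the hash becomes the chunk's
        # metadata "id" and is mapped back to a title when scoring the reports.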
        grag.insert(
            [f"{title}: {corpus}" for title, corpus in tuple(corpus.items())],
            [f"{title}: {corpus}" for _, (title, corpus) in tuple(corpus.items())],
            metadata=[{"id": title} for title in tuple(corpus.keys())],
        )
    if args.benchmark:
        queries = get_queries(dataset)
        print("Dataset loaded. Queries:", len(queries))
        grag = GraphRAG(
            working_dir=working_dir,
            domain=DOMAIN,
            domain=DOMAIN[args.dataset],
            example_queries="\n".join(QUERIES[args.dataset]),
            entity_types=ENTITY_TYPES,
            entity_types=ENTITY_TYPES[args.dataset],
        )

        async def _query_task(query: Query) -> Dict[str, Any]:
            answer = await grag.async_query(query.question, QueryParam(only_context=True))
            return {
                "question": query.question,
                "answer": answer.response,
                "evidence": [chunk.metadata["id"] for chunk, _ in answer.context.chunks],
                "evidence": [
                    corpus[chunk.metadata["id"]][0]
                    if isinstance(chunk.metadata["id"], int)
                    else chunk.metadata["id"]
                    for chunk, _ in answer.context.chunks
                ],
                "ground_truth": [e[0] for e in query.evidence],
            }

@@ -177,7 +157,7 @@ async def _run():
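        # Note: the new denominator below deduplicates the ground truth, so duplicate
        # evidence ids no longer deflate the per-query retrieval score.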
        ground_truth = answer["ground_truth"]
        predicted_evidence = answer["evidence"]

        p_retrieved: float = len(set(ground_truth).intersection(set(predicted_evidence))) / len(ground_truth)
        p_retrieved: float = len(set(ground_truth).intersection(set(predicted_evidence))) / len(set(ground_truth))
        retrieval_scores.append(p_retrieved)

        if answer["question"] in questions_multihop:
