Merge pull request #85 from codefromthecrypt/fuzz
Fix grammar and inconsistent receiver names
philippgille authored Jun 13, 2024
2 parents e2f58ad + 87ad9d7 commit e147c74
Showing 12 changed files with 33 additions and 33 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -41,11 +41,11 @@ Let's look at the RAG use case in more detail:

### RAG

-The knowledge of large language models (LLMs) - even the ones with with 30 billion, 70 billion parameters and more - is limited. They don't know anything about what happened after their training ended, they don't know anything about data they were not trained with (like your company's intranet, Jira / bug tracker, wiki or other kinds of knowledge bases), and even the data they *do* know they often can't reproduce it *exactly*, but start to *hallucinate* instead.
+The knowledge of large language models (LLMs) - even the ones with 30 billion, 70 billion parameters and more - is limited. They don't know anything about what happened after their training ended, they don't know anything about data they were not trained with (like your company's intranet, Jira / bug tracker, wiki or other kinds of knowledge bases), and even the data they *do* know they often can't reproduce it *exactly*, but start to *hallucinate* instead.

Fine-tuning an LLM can help a bit, but it's more meant to improve the LLMs reasoning about specific topics, or reproduce the style of written text or code. Fine-tuning does *not* add knowledge *1:1* into the model. Details are lost or mixed up. And knowledge cutoff (about anything that happened after the fine-tuning) isn't solved either.

-=> A vector database can act as the the up-to-date, precise knowledge for LLMs:
+=> A vector database can act as the up-to-date, precise knowledge for LLMs:

1. You store relevant documents that you want the LLM to know in the database.
2. The database stores the *embeddings* alongside the documents, which you can either provide or can be created by specific "embedding models" like OpenAI's `text-embedding-3-small`.
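A minimal sketch of the flow described above, using chromem-go. This is a hedged example, not part of this diff: the collection name and documents are illustrative, the `CreateCollection`/`AddDocuments` call shapes are assumed from the package's Godoc, and passing a nil embedding function is assumed to fall back to the package default (OpenAI), while the `Query` signature appears later in this diff.

```go
package main

import (
	"context"
	"fmt"
	"runtime"

	"github.com/philippgille/chromem-go"
)

func main() {
	ctx := context.Background()
	db := chromem.NewDB() // in-memory; see the persistence notes in db.go below

	// A nil embedding function is assumed to fall back to the package default
	// (OpenAI), which reads the API key from the environment.
	c, err := db.CreateCollection("knowledge-base", nil, nil)
	if err != nil {
		panic(err)
	}

	// 1. Store the documents the LLM should know about; 2. their embeddings
	// are created by the embedding model and stored alongside them.
	err = c.AddDocuments(ctx, []chromem.Document{
		{ID: "1", Content: "Go 1.0 was released in March 2012."},
		{ID: "2", Content: "The capital of France is Paris."},
	}, runtime.NumCPU())
	if err != nil {
		panic(err)
	}

	// 3. At question time, retrieve the most similar documents and put them
	// into the LLM's prompt.
	res, err := c.Query(ctx, "When was Go released?", 1, nil, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(res[0].Content, res[0].Similarity)
}
```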
@@ -136,7 +136,7 @@ For the full interface see the Godoc: <https://pkg.go.dev/github.com/philippgill

- [X] Zero dependencies on third party libraries
- [X] Embeddable (like SQLite, i.e. no client-server model, no separate DB to maintain)
-- [X] Multi-threaded processing (when adding and querying documents), making use of Go's native concurrency features
+- [X] Multithreaded processing (when adding and querying documents), making use of Go's native concurrency features
- [X] Experimental WebAssembly binding
- Embedding creators:
  - Hosted:
@@ -178,7 +178,7 @@ For the full interface see the Godoc: <https://pkg.go.dev/github.com/philippgill
  - Operators (`$and`, `$or` etc.)
- Storage:
  - JSON as second encoding format
-  - Write-ahead log (WAL) as second file format)
+  - Write-ahead log (WAL) as second file format
  - Optional remote storage (S3, PostgreSQL, ...)
- Data types:
  - Images
12 changes: 6 additions & 6 deletions collection.go
@@ -87,7 +87,7 @@ func (c *Collection) Add(ctx context.Context, ids []string, embeddings [][]float
}

// AddConcurrently is like Add, but adds embeddings concurrently.
-// This is mostly useful when you don't pass any embeddings so they have to be created.
+// This is mostly useful when you don't pass any embeddings, so they have to be created.
// Upon error, concurrently running operations are canceled and the error is returned.
//
// This is a Chroma-like method. For a more Go-idiomatic one, see [AddDocuments].
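A hedged usage sketch of `AddConcurrently`, reusing the collection `c` and `ctx` from the earlier sketch. The full parameter list (ids, embeddings, metadatas, contents, concurrency) is inferred from the checks visible in this hunk, not copied from the package; passing nil embeddings makes the collection create them concurrently.

```go
ids := []string{"1", "2", "3"}
contents := []string{
	"Go 1.0 was released in March 2012.",
	"The capital of France is Paris.",
	"SQLite is an embedded SQL database.",
}
// nil embeddings and metadatas are allowed to be empty (see the checks
// below); the embeddings are then created concurrently, here with
// concurrency 3.
if err := c.AddConcurrently(ctx, ids, nil, nil, contents, 3); err != nil {
	panic(err)
}
```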
@@ -103,23 +103,23 @@ func (c *Collection) AddConcurrently(ctx context.Context, ids []string, embeddin
return errors.New("ids and embeddings must have the same length")
}
} else {
// Assign empty slice so we can simply access via index later
// Assign empty slice, so we can simply access via index later
embeddings = make([][]float32, len(ids))
}
if len(metadatas) != 0 {
if len(ids) != len(metadatas) {
return errors.New("when metadatas is not empty it must have the same length as ids")
}
} else {
// Assign empty slice so we can simply access via index later
// Assign empty slice, so we can simply access via index later
metadatas = make([]map[string]string, len(ids))
}
if len(contents) != 0 {
if len(contents) != len(ids) {
return errors.New("ids and contents must have the same length")
}
} else {
// Assign empty slice so we can simply access via index later
// Assign empty slice, so we can simply access via index later
contents = make([]string, len(ids))
}
if concurrency < 1 {
@@ -323,7 +323,7 @@ type Result struct {
	Similarity float32
}

-// Performs an exhaustive nearest neighbor search on the collection.
+// Query performs an exhaustive nearest neighbor search on the collection.
//
// - queryText: The text to search for. Its embedding will be created using the
// collection's embedding function.
@@ -344,7 +344,7 @@ func (c *Collection) Query(ctx context.Context, queryText string, nResults int,
	return c.QueryEmbedding(ctx, queryVectors, nResults, where, whereDocument)
}

-// Performs an exhaustive nearest neighbor search on the collection.
+// QueryEmbedding performs an exhaustive nearest neighbor search on the collection.
//
// - queryEmbedding: The embedding of the query to search for. It must be created
// with the same embedding model as the document embeddings in the collection.
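A hedged sketch of `QueryEmbedding` for when you already hold a vector, again reusing `c` and `ctx` from above. The signature matches the call in `Query` shown in this hunk; the `Result` fields other than `Similarity` are assumptions.

```go
// queryVec must be created with the same embedding model as the documents,
// e.g. by calling the collection's own embedding function.
queryVec := []float32{ /* ... e.g. 1536 values for text-embedding-3-small ... */ }
res, err := c.QueryEmbedding(ctx, queryVec, 5, nil, nil)
if err != nil {
	panic(err)
}
for _, r := range res {
	fmt.Printf("%s: %.3f\n", r.ID, r.Similarity) // ID field assumed
}
```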
8 changes: 4 additions & 4 deletions db.go
@@ -38,7 +38,7 @@ type DB struct {

// NewDB creates a new in-memory chromem-go DB.
// While it doesn't write files when you add collections and documents, you can
-// still use [DB.Export] and [DB.Import] to export and import the the entire DB
+// still use [DB.Export] and [DB.Import] to export and import the entire DB
// from a file.
func NewDB() *DB {
	return &DB{
@@ -51,12 +51,12 @@ func NewDB() *DB {
// If compress is true, the files are compressed with gzip.
//
// The persistence covers the collections (including their documents) and the metadata.
-// However it doesn't cover the EmbeddingFunc, as functions can't be serialized.
-// When some data is persisted and you create a new persistent DB with the same
+// However, it doesn't cover the EmbeddingFunc, as functions can't be serialized.
+// When some data is persisted, and you create a new persistent DB with the same
// path, you'll have to provide the same EmbeddingFunc as before when getting an
// existing collection and adding more documents to it.
//
-// Currently the persistence is done synchronously on each write operation, and
+// Currently, the persistence is done synchronously on each write operation, and
// each document addition leads to a new file, encoded as gob. In the future we
// will make this configurable (encoding, async writes, WAL-based writes, etc.).
//
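A hedged sketch of the round trip this comment describes, assuming a `NewPersistentDB(path, compress)` constructor (its signature is not shown in this diff) and reusing the Cohere embedding function from embed_cohere.go below. The key point from the comment: the `EmbeddingFunc` is not serialized, so the same one must be passed again after reopening.

```go
// cohereApiKey: your Cohere API key.
embed := chromem.NewEmbeddingFuncCohere(cohereApiKey, chromem.EmbeddingModelCohereEnglishV3)

// First run: every write is persisted synchronously as a gob-encoded file.
db, err := chromem.NewPersistentDB("./db", false) // signature assumed
if err != nil {
	panic(err)
}
c, err := db.GetOrCreateCollection("kb", nil, embed)
// ... add documents via c ...

// Later run: collections and documents are restored from "./db", but the
// EmbeddingFunc is not, so the same one has to be provided again.
db2, err := chromem.NewPersistentDB("./db", false)
if err != nil {
	panic(err)
}
c2, err := db2.GetOrCreateCollection("kb", nil, embed) // same embed as before
```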
2 changes: 1 addition & 1 deletion db_test.go
@@ -320,7 +320,7 @@ func TestDB_GetOrCreateCollection(t *testing.T) {
	}

	// Call GetOrCreateCollection() with the same name to only get it. We pass
-	// nil for the metadata and embeddingFunc so we can check that the returned
+	// nil for the metadata and embeddingFunc, so we can check that the returned
	// collection is the original one, and not a new one.
	c, err := db.GetOrCreateCollection(name, nil, nil)
	if err != nil {
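The semantics this test checks, as a short hedged sketch (`db` and `embeddingFunc` are illustrative stand-ins): once a collection exists, `GetOrCreateCollection` returns the original one, and nil metadata/embeddingFunc mean "keep what the collection already has".

```go
// Create the collection on the first call...
c1, err := db.GetOrCreateCollection("kb", map[string]string{"lang": "en"}, embeddingFunc)
if err != nil {
	panic(err)
}
// ...and get the very same one back on the second call; passing nil for
// metadata and embeddingFunc doesn't overwrite the existing values.
c2, err := db.GetOrCreateCollection("kb", nil, nil)
if err != nil {
	panic(err)
}
fmt.Println(c1 == c2) // true: a pointer to the original collection
```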
2 changes: 1 addition & 1 deletion embed_cohere.go
@@ -63,7 +63,7 @@ type cohereResponse struct {
//
// When you set up a chromem-go collection with this embedding function, you might
// want to create the document separately with [NewDocument] and then cut off the
-// prefix before adding the document to the collection. Otherwise when you query
+// prefix before adding the document to the collection. Otherwise, when you query
// the collection, the returned documents will still have the prefix in their content.
//
// cohereFunc := chromem.NewEmbeddingFuncCohere(cohereApiKey, chromem.EmbeddingModelCohereEnglishV3)
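A hedged sketch of the prefix handling this comment recommends. Both the prefix value and the `NewDocument` call shape are assumptions for illustration; chromem-go exports the real Cohere input-type prefixes, so check the package for the actual constants.

```go
// Prefix value and NewDocument signature assumed for illustration.
const searchDocPrefix = "search_document: "

content := searchDocPrefix + "The sky is blue because of Rayleigh scattering."
doc, err := chromem.NewDocument(ctx, "1", nil, nil, content, cohereFunc)
if err != nil {
	panic(err)
}
// Cut off the prefix so queries later return the clean content.
doc.Content = strings.TrimPrefix(doc.Content, searchDocPrefix)
// ... then add doc to the collection ...
```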
2 changes: 1 addition & 1 deletion examples/rag-wikipedia-ollama/README.md
@@ -4,7 +4,7 @@ This example shows a retrieval augmented generation (RAG) application, using `ch

We run the embeddings model and LLM in [Ollama](https://github.com/ollama/ollama), to showcase how a RAG application can run entirely offline, without relying on OpenAI or other third party APIs. It doesn't require a GPU, and a CPU like an 11th Gen Intel i5-1135G7 (like in the first generation Framework Laptop 13) is fast enough.

-As LLM we use Google's [Gemma (2B)](https://huggingface.co/google/gemma-2b), a very small model that doesn't need much resources and is fast, but doesn't have much knowledge, so it's a prime example for the combination of LLMs and vector databases. We found Gemma 2B to be superior to [TinyLlama (1.1B)](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0), [Stable LM 2 (1.6B)](https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b) and [Phi-2 (2.7B)](https://huggingface.co/microsoft/phi-2) for the RAG use case.
+As LLM we use Google's [Gemma (2B)](https://huggingface.co/google/gemma-2b), a very small model that doesn't need many resources and is fast, but doesn't have much knowledge, so it's a prime example for the combination of LLMs and vector databases. We found Gemma 2B to be superior to [TinyLlama (1.1B)](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0), [Stable LM 2 (1.6B)](https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b) and [Phi-2 (2.7B)](https://huggingface.co/microsoft/phi-2) for the RAG use case.

As embeddings model we use Nomic's [nomic-embed-text v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5).

4 changes: 2 additions & 2 deletions examples/rag-wikipedia-ollama/llm.go
@@ -10,9 +10,9 @@ import (
)

const (
-	// We use a local LLM running in Ollama for asking the question: https://github.com/ollama/ollama
+	// We use a local LLM running in Ollama to ask a question: https://github.com/ollama/ollama
	ollamaBaseURL = "http://localhost:11434/v1"
-	// We use Google's Gemma (2B), a very small model that doesn't need much resources
+	// We use Google's Gemma (2B), a very small model that doesn't need many resources
	// and is fast, but doesn't have much knowledge: https://huggingface.co/google/gemma-2b
	// We found Gemma 2B to be superior to TinyLlama (1.1B), Stable LM 2 (1.6B)
	// and Phi-2 (2.7B) for the retrieval augmented generation (RAG) use case.
2 changes: 1 addition & 1 deletion examples/rag-wikipedia-ollama/main.go
@@ -56,7 +56,7 @@ func main() {
	}
	// Add docs to the collection, if the collection was just created (and not
	// loaded from persistent storage).
-	docs := []chromem.Document{}
+	var docs []chromem.Document
	if collection.Count() == 0 {
		// Here we use a DBpedia sample, where each line contains the lead section/introduction
		// to some Wikipedia article and its category.
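The change in this hunk swaps an allocated empty slice for a nil slice: both behave identically with `append`, `len`, and `range`, and the `var` form is the idiomatic Go spelling for a slice that may stay empty. A small stand-alone illustration:

```go
var docs []chromem.Document   // nil slice: len 0, nothing allocated
docs2 := []chromem.Document{} // non-nil empty slice: len 0, allocated header

docs = append(docs, chromem.Document{ID: "1"})   // append works on nil slices
fmt.Println(len(docs), len(docs2), docs2 == nil) // 1 0 false
```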
2 changes: 1 addition & 1 deletion examples/semantic-search-arxiv-openai/README.md
@@ -2,7 +2,7 @@

This example shows a semantic search application, using `chromem-go` as vector database for finding semantically relevant search results. We load and search across ~5,000 arXiv papers in the "Computer Science - Computation and Language" category, which is the relevant one for Natural Language Processing (NLP) related papers.

-This is not a retrieval augmented generation (RAG) app, because after *retrieving* the semantically relevant results, we don't *augment* any prompt to an LLM. No LLM is generates the final output.
+This is not a retrieval augmented generation (RAG) app, because after *retrieving* the semantically relevant results, we don't *augment* any prompt to an LLM. No LLM generates the final output.

## How to run

2 changes: 1 addition & 1 deletion examples/semantic-search-arxiv-openai/main.go
@@ -36,7 +36,7 @@ func main() {
	}
	// Add docs to the collection, if the collection was just created (and not
	// loaded from persistent storage).
-	docs := []chromem.Document{}
+	var docs []chromem.Document
	if collection.Count() == 0 {
		// Here we use an arXiv metadata sample, where each line contains the metadata
		// of a paper, including its submitter, title and abstract.
6 changes: 3 additions & 3 deletions persistence.go
@@ -43,7 +43,7 @@ func persistToFile(filePath string, obj any, compress bool, encryptionKey string
}

	// If path doesn't exist, create the parent path.
-	// If path exists and it's a directory, return an error.
+	// If path exists, and it's a directory, return an error.
	fi, err := os.Stat(filePath)
	if err != nil {
		if !errors.Is(err, fs.ErrNotExist) {
@@ -108,7 +108,7 @@ func persistToWriter(w io.Writer, obj any, compress bool, encryptionKey string)
		return fmt.Errorf("couldn't encode or write object: %w", err)
	}

-	// If compressing, close the gzip writer. Otherwise the gzip footer won't be
+	// If compressing, close the gzip writer. Otherwise, the gzip footer won't be
	// written yet. When using encryption (and chainedWriter is a buffer) then
	// we'll encrypt an incomplete stream. Without encryption when we return here and having
	// a deferred Close(), there might be a silenced error.
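The standard-library behavior this comment relies on, as a small self-contained sketch: `gzip.Writer` buffers compressed data and only writes the trailing footer (CRC-32 and size) on `Close`, so an in-memory buffer holds an incomplete stream until then; encrypting before `Close` would encrypt a truncated stream (error handling omitted for brevity).

```go
package main

import (
	"bytes"
	"compress/gzip"
	"fmt"
)

func main() {
	var buf bytes.Buffer
	gzw := gzip.NewWriter(&buf)
	gzw.Write([]byte("hello, world")) // compressed data is still buffered

	lenBeforeClose := buf.Len()
	gzw.Close() // flushes the compressor and writes the gzip footer
	fmt.Println(lenBeforeClose, buf.Len()) // the stream is only complete after Close
}
```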
@@ -191,7 +191,7 @@ func readFromReader(r io.ReadSeeker, obj any, encryptionKey string) error {
	// To reduce memory usage we chain the readers instead of buffering, so we start
	// from the end. For the decryption there's no reader though.

-	// For the chainedReader we don't declare it as ReadSeeker so we can reassign
+	// For the chainedReader we don't declare it as ReadSeeker, so we can reassign
	// the gzip reader to it.
	var chainedReader io.Reader

16 changes: 8 additions & 8 deletions query.go
@@ -58,15 +58,15 @@ func newMaxDocSims(size int) *maxDocSims {
}

// add inserts a new docSim into the heap, keeping only the top n similarities.
-func (mds *maxDocSims) add(doc docSim) {
-	mds.lock.Lock()
-	defer mds.lock.Unlock()
-	if mds.h.Len() < mds.size {
-		heap.Push(&mds.h, doc)
-	} else if mds.h.Len() > 0 && mds.h[0].similarity < doc.similarity {
+func (d *maxDocSims) add(doc docSim) {
+	d.lock.Lock()
+	defer d.lock.Unlock()
+	if d.h.Len() < d.size {
+		heap.Push(&d.h, doc)
+	} else if d.h.Len() > 0 && d.h[0].similarity < doc.similarity {
		// Replace the smallest similarity if the new doc's similarity is higher
-		heap.Pop(&mds.h)
-		heap.Push(&mds.h, doc)
+		heap.Pop(&d.h)
+		heap.Push(&d.h, doc)
	}
}

Expand Down
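The pattern above, a bounded min-heap keeping the top n similarities, works because the heap root is always the smallest kept value: any candidate that beats the root belongs in the result set. A self-contained sketch of the same idea (names are illustrative, not from the package):

```go
package main

import (
	"container/heap"
	"fmt"
)

// minHeap keeps the smallest value at the root.
type minHeap []float32

func (h minHeap) Len() int           { return len(h) }
func (h minHeap) Less(i, j int) bool { return h[i] < h[j] }
func (h minHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
func (h *minHeap) Push(x any)        { *h = append(*h, x.(float32)) }
func (h *minHeap) Pop() any {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[:n-1]
	return x
}

// topN keeps only the n largest similarities: the root is the smallest of
// the kept values, so any candidate that beats it replaces it.
func topN(sims []float32, n int) []float32 {
	h := make(minHeap, 0, n)
	for _, s := range sims {
		if h.Len() < n {
			heap.Push(&h, s)
		} else if h[0] < s {
			heap.Pop(&h)
			heap.Push(&h, s)
		}
	}
	return h
}

func main() {
	// Prints the three largest values, in heap order.
	fmt.Println(topN([]float32{0.1, 0.9, 0.4, 0.7, 0.2}, 3))
}
```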
