From 411f22533039f9e0802b6ca53591c4bcb4c3ee2f Mon Sep 17 00:00:00 2001 From: rjmacarthy Date: Tue, 12 Nov 2024 20:18:15 +0000 Subject: [PATCH 1/4] update lancedb fix embeddings database --- package-lock.json | 89 ++++++++++--------------------- package.json | 2 +- src/common/constants.ts | 4 -- src/common/types.ts | 2 +- src/extension/chat-service.ts | 15 ------ src/extension/embeddings.ts | 40 ++++++++------ src/webview/embedding-options.tsx | 27 ---------- 7 files changed, 56 insertions(+), 123 deletions(-) diff --git a/package-lock.json b/package-lock.json index 2dd0bcc3..f379068e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -18,7 +18,7 @@ "win32" ], "dependencies": { - "@lancedb/lancedb": "^0.9.0", + "@lancedb/lancedb": "^0.12.0", "@tiptap/extension-mention": "^2.5.9", "@tiptap/extension-placeholder": "^2.5.9", "@tiptap/pm": "^2.5.9", @@ -2247,9 +2247,9 @@ } }, "node_modules/@lancedb/lancedb": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/@lancedb/lancedb/-/lancedb-0.9.0.tgz", - "integrity": "sha512-roIBK6JJq4AkvVmLG4orZ/SXdpzPLnArhKTK+4cG/Nso9k2y2Wk5DsFStqzNmO7ko1phiCJJlmtwPLr787nV/Q==", + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/@lancedb/lancedb/-/lancedb-0.12.0.tgz", + "integrity": "sha512-YkfJ0pL0GT6A+goDqVsd/8jYeOJmPvT0A0aWpUbP/hgNn98FKJZicQKUHW6gwWyoZBVKTF9yBNKQS59aZOT+cg==", "cpu": [ "x64", "arm64" @@ -2260,27 +2260,26 @@ "win32" ], "dependencies": { - "axios": "^1.7.2", "reflect-metadata": "^0.2.2" }, "engines": { "node": ">= 18" }, "optionalDependencies": { - "@lancedb/lancedb-darwin-arm64": "0.9.0", - "@lancedb/lancedb-darwin-x64": "0.9.0", - "@lancedb/lancedb-linux-arm64-gnu": "0.9.0", - "@lancedb/lancedb-linux-x64-gnu": "0.9.0", - "@lancedb/lancedb-win32-x64-msvc": "0.9.0" + "@lancedb/lancedb-darwin-arm64": "0.12.0", + "@lancedb/lancedb-darwin-x64": "0.12.0", + "@lancedb/lancedb-linux-arm64-gnu": "0.12.0", + "@lancedb/lancedb-linux-x64-gnu": "0.12.0", + "@lancedb/lancedb-win32-x64-msvc": "0.12.0" }, "peerDependencies": { "apache-arrow": ">=13.0.0 <=17.0.0" } }, "node_modules/@lancedb/lancedb-darwin-arm64": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/@lancedb/lancedb-darwin-arm64/-/lancedb-darwin-arm64-0.9.0.tgz", - "integrity": "sha512-w/lbpjCNNfzMWL0rgOdxjc6dQWrgxt7YtBcU/a5CYUkuj6I+EcBhp7VtX7t+8smF1pIJqeMPakhHe3SrkA5ZRQ==", + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/@lancedb/lancedb-darwin-arm64/-/lancedb-darwin-arm64-0.12.0.tgz", + "integrity": "sha512-yuQkxgdR7q8eXeQ+8wOupB2789f0gS5+uvzZRiKz3ilf1ZgNTV68Zd3vgGjTTepcYGZvFvVOVlszlhZhVQjlfw==", "cpu": [ "arm64" ], @@ -2293,9 +2292,9 @@ } }, "node_modules/@lancedb/lancedb-darwin-x64": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/@lancedb/lancedb-darwin-x64/-/lancedb-darwin-x64-0.9.0.tgz", - "integrity": "sha512-ENnWvwEEUMozlfGt3Q+CLyHrk5PTjYSq9adnKKwfhvdNKXtLLibSpIj9VyTDa+9rfJWpdLKFD28Tsh/6Q+AHzA==", + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/@lancedb/lancedb-darwin-x64/-/lancedb-darwin-x64-0.12.0.tgz", + "integrity": "sha512-ZOVVDJRaEch/54zbDSVRbFXZRCgOEYRaqrcIUSZMKrMgFhinq5xgrau4zLGRsF7rSrxeCoF6eMx9+qkQHotyig==", "cpu": [ "x64" ], @@ -2308,9 +2307,9 @@ } }, "node_modules/@lancedb/lancedb-linux-arm64-gnu": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-arm64-gnu/-/lancedb-linux-arm64-gnu-0.9.0.tgz", - "integrity": "sha512-GqduIR6yTBxTu8kCMPofeV4vUSDdzSu43AbxwtuErPIGX6a+O8100OQ3kNFNOICwjpdcRjS5umd5POSxz2H59w==", + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-arm64-gnu/-/lancedb-linux-arm64-gnu-0.12.0.tgz", + "integrity": "sha512-mAsVwaiaLoNRLB3stocJyAEoDpwsPu++YISd5ZCaYf66CXeYU8MI50z6NV0ZYUbAFHhUzqG85CfwW+Ns2ToJ8g==", "cpu": [ "arm64" ], @@ -2323,9 +2322,9 @@ } }, "node_modules/@lancedb/lancedb-linux-x64-gnu": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-x64-gnu/-/lancedb-linux-x64-gnu-0.9.0.tgz", - "integrity": "sha512-KfEUoewxGkvAuX3ctJQ0cH4f5AK1QwxHq7ZGj5FfFls68JjewkUgsoeaEDvmhtC5WKq44Bw6N09eWJ1eWSZn3w==", + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-x64-gnu/-/lancedb-linux-x64-gnu-0.12.0.tgz", + "integrity": "sha512-tc/A8NQQjbuEFWmq2qEWJ9s+JZFdH2diYPmMO4FNQpcbikjfk8kbJR5AFFtZel/cl2LqE7BnCvWtkI7v/hn5PA==", "cpu": [ "x64" ], @@ -2338,9 +2337,9 @@ } }, "node_modules/@lancedb/lancedb-win32-x64-msvc": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/@lancedb/lancedb-win32-x64-msvc/-/lancedb-win32-x64-msvc-0.9.0.tgz", - "integrity": "sha512-lEFoGudbzZ4RhbhdMoLCDXcDhiVTInK1nSz5/GRIJ8O4j91OOlYu6jDT/mw9II2ghtW7YmpoEGOuon1fqmHn1A==", + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/@lancedb/lancedb-win32-x64-msvc/-/lancedb-win32-x64-msvc-0.12.0.tgz", + "integrity": "sha512-aLpGwksA4FWvWRxsoDzo1m8ryvmcIHOLjln4BUEPtJcKdX6zWBC5VL9QlBkKribnsyj87V/5UbJpM9KyrCnk5A==", "cpu": [ "x64" ], @@ -4498,7 +4497,8 @@ "node_modules/asynckit": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", - "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "dev": true }, "node_modules/available-typed-arrays": { "version": "1.0.7", @@ -4514,16 +4514,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/axios": { - "version": "1.7.4", - "resolved": "https://registry.npmjs.org/axios/-/axios-1.7.4.tgz", - "integrity": "sha512-DukmaFRnY6AzAALSH4J2M3k6PkaC+MfaAGdEERRWcC9q3/TWQwLpHR8ZRLKTdQ3aBDL64EdluRDjJqKw+BPZEw==", - "dependencies": { - "follow-redirects": "^1.15.6", - "form-data": "^4.0.0", - "proxy-from-env": "^1.1.0" - } - }, "node_modules/azure-devops-node-api": { "version": "12.5.0", "resolved": "https://registry.npmjs.org/azure-devops-node-api/-/azure-devops-node-api-12.5.0.tgz", @@ -5688,6 +5678,7 @@ "version": "1.0.8", "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "dev": true, "dependencies": { "delayed-stream": "~1.0.0" }, @@ -6349,6 +6340,7 @@ "version": "1.0.0", "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "dev": true, "engines": { "node": ">=0.4.0" } @@ -7763,25 +7755,6 @@ "dev": true, "peer": true }, - "node_modules/follow-redirects": { - "version": "1.15.6", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", - "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", - "funding": [ - { - "type": "individual", - "url": "https://github.com/sponsors/RubenVerborgh" - } - ], - "engines": { - "node": ">=4.0" - }, - "peerDependenciesMeta": { - "debug": { - "optional": true - } - } - }, "node_modules/for-each": { "version": "0.3.3", "resolved": "https://registry.npmjs.org/for-each/-/for-each-0.3.3.tgz", @@ -7810,6 +7783,7 @@ "version": "4.0.0", "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", + "dev": true, "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", @@ -14382,11 +14356,6 @@ "unslab": "^1.3.0" } }, - "node_modules/proxy-from-env": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", - "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" - }, "node_modules/public-encrypt": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/public-encrypt/-/public-encrypt-4.0.3.tgz", diff --git a/package.json b/package.json index c837ef4d..569900de 100644 --- a/package.json +++ b/package.json @@ -464,7 +464,7 @@ "web-tree-sitter": "^0.22.1" }, "dependencies": { - "@lancedb/lancedb": "^0.9.0", + "@lancedb/lancedb": "^0.12.0", "@tiptap/extension-mention": "^2.5.9", "@tiptap/extension-placeholder": "^2.5.9", "@tiptap/pm": "^2.5.9", diff --git a/src/common/constants.ts b/src/common/constants.ts index d6ceda77..f72b616d 100644 --- a/src/common/constants.ts +++ b/src/common/constants.ts @@ -179,7 +179,6 @@ export const EXTENSION_CONTEXT_NAME = { twinnyOverlapSize: "twinnyOverlapSize", twinnyRelevantFilePaths: "twinnyRelevantFilePaths", twinnyRelevantCodeSnippets: "twinnyRelevantCodeSnippets", - twinnyVectorSearchMetric: "twinnyVectorSearchMetric", twinnySymmetryTab: "twinnySymmetryTab", twinnyEnableRag: "twinnyEnableRag", } @@ -338,9 +337,6 @@ export const WASM_LANGUAGES: { [key: string]: string } = { export const DEFAULT_RELEVANT_FILE_COUNT = 10 export const DEFAULT_RELEVANT_CODE_COUNT = 5 -export const DEFAULT_VECTOR_SEARCH_METRIC = "l2" - -export const EMBEDDING_METRICS = ["cosine", "l2", "dot"] export const MULTILINE_OUTSIDE = [ "class_body", diff --git a/src/common/types.ts b/src/common/types.ts index d1ece7c8..2e39d039 100644 --- a/src/common/types.ts +++ b/src/common/types.ts @@ -290,7 +290,7 @@ export interface ChunkOptions { } export type Embedding = { - embeddings: number[] + embeddings: [number[]] } export type EmbeddedDocument = { diff --git a/src/extension/chat-service.ts b/src/extension/chat-service.ts index 90e8a9ae..106dab56 100644 --- a/src/extension/chat-service.ts +++ b/src/extension/chat-service.ts @@ -16,7 +16,6 @@ import { DEFAULT_RELEVANT_CODE_COUNT, DEFAULT_RELEVANT_FILE_COUNT, DEFAULT_RERANK_THRESHOLD, - DEFAULT_VECTOR_SEARCH_METRIC, EVENT_NAME, EXTENSION_CONTEXT_NAME, EXTENSION_SESSION_NAME, @@ -118,18 +117,11 @@ export class ChatService extends Base { ) as number const relevantFileCount = Number(stored) || DEFAULT_RELEVANT_FILE_COUNT - const storedMetric = this._context?.globalState.get( - `${EVENT_NAME.twinnyGlobalContext}-${EXTENSION_CONTEXT_NAME.twinnyVectorSearchMetric}` - ) as number - - const metric = storedMetric || DEFAULT_VECTOR_SEARCH_METRIC - const filePaths = (await this._db.getDocuments( embedding, relevantFileCount, table, - metric as "cosine" | "l2" | "dot" )) || [] if (!filePaths.length) return [] @@ -226,11 +218,6 @@ export class ChatService extends Base { if (!embedding) return "" - const storedMetric = this._context?.globalState.get( - `${EVENT_NAME.twinnyGlobalContext}-${EXTENSION_CONTEXT_NAME.twinnyVectorSearchMetric}` - ) as number - const metric = storedMetric || DEFAULT_VECTOR_SEARCH_METRIC - const query = relevantFiles?.length ? `file IN ("${relevantFiles.map((file) => file[0]).join("\",\"")}")` : "" @@ -240,7 +227,6 @@ export class ChatService extends Base { embedding, Math.round(relevantCodeCount / 2), table, - metric as "cosine" | "l2" | "dot", query )) || [] @@ -249,7 +235,6 @@ export class ChatService extends Base { embedding, Math.round(relevantCodeCount / 2), table, - metric as "cosine" | "l2" | "dot" )) || [] const documents = [...embeddedDocuments, ...queryEmbeddedDocuments] diff --git a/src/extension/embeddings.ts b/src/extension/embeddings.ts index a7b8176f..ab35c108 100644 --- a/src/extension/embeddings.ts +++ b/src/extension/embeddings.ts @@ -19,10 +19,7 @@ import { import { fetchEmbedding } from "./api" import { Base } from "./base" import { TwinnyProvider } from "./provider-manager" -import { - getDocumentSplitChunks, - readGitSubmodulesFile -} from "./utils" +import { getDocumentSplitChunks, readGitSubmodulesFile } from "./utils" export class EmbeddingDatabase extends Base { private _documents: EmbeddedDocument[] = [] @@ -110,7 +107,7 @@ export class EmbeddingDatabase extends Base { ig.add(embeddingIgnoredGlobs) ig.add([".git", ".gitignore"]) - + for (const dirent of dirents) { const fullPath = path.join(dirPath, dirent.name) const relativePath = path.relative(rootPath, fullPath) @@ -150,26 +147,28 @@ export class EmbeddingDatabase extends Base { if (!this._extensionContext) return const promises = filePaths.map(async (filePath) => { const content = await fs.promises.readFile(filePath, "utf-8") + const chunks = await getDocumentSplitChunks( content, filePath, this._extensionContext ) - const filePathEmbedding = await this.fetchModelEmbedding(filePath) + + const fileNameEmbedding = await this.fetchModelEmbedding(filePath) this._filePaths.push({ content: filePath, - vector: filePathEmbedding, + vector: fileNameEmbedding, file: filePath }) for (const chunk of chunks) { - const vector = await this.fetchModelEmbedding(filePath) + const chunkEmbedding = await this.fetchModelEmbedding(chunk) if (this.getIsDuplicateItem(chunk, chunks)) return this._documents.push({ content: chunk, - file: filePath, - vector: vector + vector: chunkEmbedding, + file: filePath }) } @@ -196,11 +195,23 @@ export class EmbeddingDatabase extends Base { try { const tableNames = await this._db?.tableNames() if (!tableNames?.includes(`${this._workspaceName}-documents`)) { - await this._db?.createTable(this._documentTableName, this._documents) + await this._db?.createTable( + this._documentTableName, + this._documents, + { + mode: "overwrite" + } + ) } if (!tableNames?.includes(`${this._workspaceName}-file-paths`)) { - await this._db?.createTable(this._filePathTableName, this._filePaths) + await this._db?.createTable( + this._filePathTableName, + this._filePaths, + { + mode: "overwrite" + } + ) return } @@ -224,12 +235,11 @@ export class EmbeddingDatabase extends Base { vector: IntoVector, limit: number, tableName: string, - metric: "cosine" | "l2" | "dot" = "cosine", where?: string ): Promise { try { const table = await this._db?.openTable(tableName) - const query = table?.search(vector).limit(limit).distanceType(metric) // add type assertion + const query = table?.vectorSearch(vector).select("content").limit(limit) if (where) query?.where(where) return query?.toArray() } catch (e) { @@ -255,6 +265,6 @@ export class EmbeddingDatabase extends Base { return (response as LMStudioEmbedding).data?.[0].embedding } - return (response as Embedding).embeddings + return (response as Embedding).embeddings[0] } } diff --git a/src/webview/embedding-options.tsx b/src/webview/embedding-options.tsx index 08744189..e41322ba 100644 --- a/src/webview/embedding-options.tsx +++ b/src/webview/embedding-options.tsx @@ -9,7 +9,6 @@ import { } from "@vscode/webview-ui-toolkit/react" import { - EMBEDDING_METRICS, EVENT_NAME, EXTENSION_CONTEXT_NAME, } from "../common/constants" @@ -47,9 +46,6 @@ export const EmbeddingOptions = () => { const { context: filePaths = "10", setContext: setRelevantFilePaths } = useGlobalContext(EXTENSION_CONTEXT_NAME.twinnyRelevantFilePaths) - const { context: metric = "l2", setContext: setMetric } = - useGlobalContext(EXTENSION_CONTEXT_NAME.twinnyVectorSearchMetric) - const embeddingProviders = Object.values(getProvidersByType("embedding")) const handleEmbedDocuments = () => { @@ -102,12 +98,6 @@ export const EmbeddingOptions = () => { setActiveEmbeddingsProvider(provider) } - const handleChangeMetric = (e: unknown): void => { - const event = e as React.ChangeEvent - const value = event.target.value - setMetric(value) - } - if (!embeddingProviders) { return (
@@ -193,23 +183,6 @@ export const EmbeddingOptions = () => { The number of filepaths to be used as context.
-
-
Search Metric
- - {EMBEDDING_METRICS.map((metric: string) => ( - - {metric} - - ))} - - - The metric to be used for the vector search. - -