feat(blocks): Add pinecone and jina blocks (#8401)

* add pinecone and jina blocks * update based on comments * backend updates * frontend updates * type hint * more type hints * another type hint * update run signature * shared jina provider * fix linting * lockfile * remove noqa * remove noqa * remove vector db folder * line * update pinecone credentials provider * fix imports * formatting * update frontend * Test (#8425) * h * Discard changes to autogpt_platform/backend/poetry.lock * fix: broken dep --------- Co-authored-by: Nicholas Tindle <[email protected]>
Significant-Gravitas · Oct 24, 2024 · e2df601 · e2df601
1 parent 6d812ac
commit e2df601
Show file tree

Hide file tree

Showing 9 changed files with 355 additions and 10 deletions.
diff --git a/autogpt_platform/backend/backend/blocks/jina/_auth.py b/autogpt_platform/backend/backend/blocks/jina/_auth.py
@@ -0,0 +1,39 @@
+from typing import Literal
+
+from autogpt_libs.supabase_integration_credentials_store.types import APIKeyCredentials
+from pydantic import SecretStr
+
+from backend.data.model import CredentialsField, CredentialsMetaInput
+
+# Jina credentials are the generic API-key credentials type under a
+# provider-specific alias.
+JinaCredentials = APIKeyCredentials
+# Credentials meta input parameterised with provider "jina" and credential
+# type "api_key".
+JinaCredentialsInput = CredentialsMetaInput[
+    Literal["jina"],
+    Literal["api_key"],
+]
+
+
+def JinaCredentialsField() -> JinaCredentialsInput:
+    """
+    Build the credentials input that a block declares to use Jina.
+
+    Returns the result of ``CredentialsField`` configured for the "jina"
+    provider, with "api_key" as the only supported credential type.
+    """
+    return CredentialsField(
+        description="The Jina integration can be used with an API Key.",
+        supported_credential_types={"api_key"},
+        provider="jina",
+    )
+
+
+# Mock Jina API-key credentials; the key is a placeholder value, not a real
+# secret.
+TEST_CREDENTIALS = APIKeyCredentials(
+    id="01234567-89ab-cdef-0123-456789abcdef",
+    provider="jina",
+    api_key=SecretStr("mock-jina-api-key"),
+    title="Mock Jina API key",
+    expires_at=None,
+)
+# Credentials reference (provider / id / type / title) derived from
+# TEST_CREDENTIALS, in the shape of a block's `credentials` input.
+TEST_CREDENTIALS_INPUT = {
+    "provider": TEST_CREDENTIALS.provider,
+    "id": TEST_CREDENTIALS.id,
+    "type": TEST_CREDENTIALS.type,
+    # The title must come from the credentials' title, not their type.
+    "title": TEST_CREDENTIALS.title,
+}
diff --git a/autogpt_platform/backend/backend/blocks/jina/chunking.py b/autogpt_platform/backend/backend/blocks/jina/chunking.py
@@ -0,0 +1,69 @@
+import requests
+
+from backend.blocks.jina._auth import (
+    JinaCredentials,
+    JinaCredentialsField,
+    JinaCredentialsInput,
+)
+from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
+from backend.data.model import SchemaField
+
+
+# Block that splits texts into chunks by POSTing each text to Jina AI's
+# segmentation endpoint and concatenating the returned pieces.
+class JinaChunkingBlock(Block):
+    # Inputs: the texts to split, the Jina credentials, and segmentation
+    # options forwarded to the service.
+    class Input(BlockSchema):
+        texts: list = SchemaField(description="List of texts to chunk")
+
+        credentials: JinaCredentialsInput = JinaCredentialsField()
+        max_chunk_length: int = SchemaField(
+            default=1000,
+            description="Maximum length of each chunk",
+        )
+        return_tokens: bool = SchemaField(
+            default=False,
+            description="Whether to return token information",
+        )
+
+    # Outputs: "chunks" is always produced; "tokens" only when requested.
+    class Output(BlockSchema):
+        chunks: list = SchemaField(description="List of chunked texts")
+        tokens: list = SchemaField(
+            optional=True,
+            description="List of token information for each chunk",
+        )
+
+    def __init__(self):
+        """Register the block's id, description, categories and schemas."""
+        super().__init__(
+            id="806fb15e-830f-4796-8692-557d300ff43c",
+            input_schema=JinaChunkingBlock.Input,
+            output_schema=JinaChunkingBlock.Output,
+            categories={BlockCategory.AI, BlockCategory.TEXT},
+            description="Chunks texts using Jina AI's segmentation service",
+        )
+
+    def run(
+        self, input_data: Input, *, credentials: JinaCredentials, **kwargs
+    ) -> BlockOutput:
+        """
+        Segment every input text and yield the combined results.
+
+        One POST request is sent per text. The "chunks" (and, when
+        ``return_tokens`` is set, the "tokens") entries of each JSON reply are
+        appended to running lists, which are yielded once all texts are done.
+
+        Yields:
+            ("chunks", list): chunks collected across all texts.
+            ("tokens", list): only when ``input_data.return_tokens`` is true.
+
+        Raises:
+            requests.HTTPError: via ``raise_for_status`` on an error response.
+        """
+        endpoint = "https://segment.jina.ai/"
+        api_key = credentials.api_key.get_secret_value()
+        request_headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {api_key}",
+        }
+
+        want_tokens = input_data.return_tokens
+        # Options are sent in string form and are the same for every text.
+        tokens_flag = str(want_tokens).lower()
+        chunk_limit = str(input_data.max_chunk_length)
+
+        collected_chunks = []
+        collected_tokens = []
+
+        for text in input_data.texts:
+            payload = {
+                "content": text,
+                "return_tokens": tokens_flag,
+                "return_chunks": "true",
+                "max_chunk_length": chunk_limit,
+            }
+
+            reply = requests.post(endpoint, headers=request_headers, json=payload)
+            reply.raise_for_status()
+            body = reply.json()
+
+            collected_chunks += body.get("chunks", [])
+            if want_tokens:
+                collected_tokens += body.get("tokens", [])
+
+        yield "chunks", collected_chunks
+        if want_tokens:
+            yield "tokens", collected_tokens
diff --git a/autogpt_platform/backend/backend/blocks/jina/embeddings.py b/autogpt_platform/backend/backend/blocks/jina/embeddings.py
@@ -0,0 +1,44 @@
+import requests
+
+from backend.blocks.jina._auth import (
+    JinaCredentials,
+    JinaCredentialsField,
+    JinaCredentialsInput,
+)
+from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
+from backend.data.model import SchemaField
+
+
+# Block that turns a list of texts into embeddings via Jina AI's embeddings
+# endpoint.
+class JinaEmbeddingBlock(Block):
+    # Inputs: the texts to embed, the Jina credentials, and the model name.
+    class Input(BlockSchema):
+        texts: list = SchemaField(description="List of texts to embed")
+        credentials: JinaCredentialsInput = JinaCredentialsField()
+        model: str = SchemaField(
+            description="Jina embedding model to use",
+            default="jina-embeddings-v2-base-en",
+        )
+
+    # Output: the "embedding" value of every item in the response's "data".
+    class Output(BlockSchema):
+        embeddings: list = SchemaField(description="List of embeddings")
+
+    def __init__(self):
+        """Register the block's id, description, categories and schemas."""
+        super().__init__(
+            id="7c56b3ab-62e7-43a2-a2dc-4ec4245660b6",
+            description="Generates embeddings using Jina AI",
+            categories={BlockCategory.AI},
+            input_schema=JinaEmbeddingBlock.Input,
+            output_schema=JinaEmbeddingBlock.Output,
+        )
+
+    def run(
+        self, input_data: Input, *, credentials: JinaCredentials, **kwargs
+    ) -> BlockOutput:
+        """
+        Request embeddings for all input texts in a single POST.
+
+        Yields:
+            ("embeddings", list): the "embedding" field of each entry in the
+            response's "data" list, in response order.
+
+        Raises:
+            requests.HTTPError: when the service answers with an error status.
+        """
+        url = "https://api.jina.ai/v1/embeddings"
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {credentials.api_key.get_secret_value()}",
+        }
+        data = {"input": input_data.texts, "model": input_data.model}
+        response = requests.post(url, headers=headers, json=data)
+        # Fail with the HTTP error itself (as JinaChunkingBlock does) rather
+        # than an opaque KeyError/JSON error when parsing an error payload.
+        response.raise_for_status()
+        embeddings = [e["embedding"] for e in response.json()["data"]]
+        yield "embeddings", embeddings
diff --git a/autogpt_platform/backend/backend/blocks/pinecone.py b/autogpt_platform/backend/backend/blocks/pinecone.py
@@ -0,0 +1,131 @@
+from typing import Literal
+
+from autogpt_libs.supabase_integration_credentials_store import APIKeyCredentials
+from pinecone import Pinecone, ServerlessSpec
+
+from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
+from backend.data.model import CredentialsField, CredentialsMetaInput, SchemaField
+
+# Pinecone credentials are the generic API-key credentials type under a
+# provider-specific alias.
+PineconeCredentials = APIKeyCredentials
+# Credentials meta input parameterised with provider "pinecone" and credential
+# type "api_key".
+PineconeCredentialsInput = CredentialsMetaInput[
+    Literal["pinecone"],
+    Literal["api_key"],
+]
+
+
+def PineconeCredentialsField() -> PineconeCredentialsInput:
+    """
+    Build the credentials input that a block declares to use Pinecone.
+
+    Returns the result of ``CredentialsField`` configured for the "pinecone"
+    provider, with "api_key" as the only supported credential type.
+    """
+    return CredentialsField(
+        description="The Pinecone integration can be used with an API Key.",
+        supported_credential_types={"api_key"},
+        provider="pinecone",
+    )
+
+
+# Block that makes sure a named Pinecone index exists, creating a serverless
+# index when the name is not among the indexes listed by the client.
+class PineconeInitBlock(Block):
+    # Inputs: credentials, the index name, and the settings used only when
+    # the index has to be created.
+    class Input(BlockSchema):
+        credentials: PineconeCredentialsInput = PineconeCredentialsField()
+        index_name: str = SchemaField(description="Name of the Pinecone index")
+        dimension: int = SchemaField(
+            default=768,
+            description="Dimension of the vectors",
+        )
+        metric: str = SchemaField(
+            default="cosine",
+            description="Distance metric for the index",
+        )
+        cloud: str = SchemaField(
+            default="aws",
+            description="Cloud provider for serverless",
+        )
+        region: str = SchemaField(
+            default="us-east-1",
+            description="Region for serverless",
+        )
+
+    # Outputs: the index name on success, plus a status (or error) message.
+    class Output(BlockSchema):
+        index: str = SchemaField(description="Name of the initialized Pinecone index")
+        message: str = SchemaField(description="Status message")
+
+    def __init__(self):
+        """Register the block's id, description, categories and schemas."""
+        super().__init__(
+            id="48d8fdab-8f03-41f3-8407-8107ba11ec9b",
+            input_schema=PineconeInitBlock.Input,
+            output_schema=PineconeInitBlock.Output,
+            categories={BlockCategory.LOGIC},
+            description="Initializes a Pinecone index",
+        )
+
+    def run(
+        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
+    ) -> BlockOutput:
+        """
+        Create the index if it is missing, then report what happened.
+
+        Yields:
+            ("index", str): the requested index name.
+            ("message", str): whether the index was created or already
+            existed. If anything inside the ``try`` raises, an error
+            description is yielded on "message" instead of propagating.
+        """
+        client = Pinecone(api_key=credentials.api_key.get_secret_value())
+        wanted = input_data.index_name
+
+        try:
+            known_names = [entry.name for entry in client.list_indexes()]
+            if wanted in known_names:
+                message = f"Using existing index: {input_data.index_name}"
+            else:
+                serverless = ServerlessSpec(
+                    cloud=input_data.cloud, region=input_data.region
+                )
+                client.create_index(
+                    name=wanted,
+                    dimension=input_data.dimension,
+                    metric=input_data.metric,
+                    spec=serverless,
+                )
+                message = f"Created new index: {input_data.index_name}"
+
+            yield "index", wanted
+            yield "message", message
+        except Exception as e:
+            yield "message", f"Error initializing Pinecone index: {str(e)}"
+
+
+# Block that runs a vector query against the Pinecone index reachable at a
+# given host.
+class PineconeQueryBlock(Block):
+    # Inputs: credentials, the query vector, query options, and the index host.
+    class Input(BlockSchema):
+        credentials: PineconeCredentialsInput = PineconeCredentialsField()
+        query_vector: list = SchemaField(description="Query vector")
+        namespace: str = SchemaField(
+            default="",
+            description="Namespace to query in Pinecone",
+        )
+        top_k: int = SchemaField(
+            default=3,
+            description="Number of top results to return",
+        )
+        include_values: bool = SchemaField(
+            default=False,
+            description="Whether to include vector values in the response",
+        )
+        include_metadata: bool = SchemaField(
+            default=True,
+            description="Whether to include metadata in the response",
+        )
+        host: str = SchemaField(description="Host for pinecone")
+
+    # Output: whatever the index's query call returns.
+    class Output(BlockSchema):
+        results: dict = SchemaField(description="Query results from Pinecone")
+
+    def __init__(self):
+        """Register the block's id, description, categories and schemas."""
+        super().__init__(
+            id="9ad93d0f-91b4-4c9c-8eb1-82e26b4a01c5",
+            input_schema=PineconeQueryBlock.Input,
+            output_schema=PineconeQueryBlock.Output,
+            categories={BlockCategory.LOGIC},
+            description="Queries a Pinecone index",
+        )
+
+    def run(
+        self,
+        input_data: Input,
+        *,
+        credentials: APIKeyCredentials,
+        **kwargs,
+    ) -> BlockOutput:
+        """
+        Query the index at ``input_data.host`` and yield the response.
+
+        Yields:
+            ("results", ...): the value returned by the index's ``query``
+            call, passed through unchanged.
+        """
+        client = Pinecone(api_key=credentials.api_key.get_secret_value())
+        index = client.Index(host=input_data.host)
+        query_options = {
+            "namespace": input_data.namespace,
+            "vector": input_data.query_vector,
+            "top_k": input_data.top_k,
+            "include_values": input_data.include_values,
+            "include_metadata": input_data.include_metadata,
+        }
+        yield "results", index.query(**query_options)
diff --git a/autogpt_platform/backend/poetry.lock b/autogpt_platform/backend/poetry.lock