Skip to content

Commit

Permalink
feat(blocks): Add pinecone and jina blocks (#8401)
Browse files Browse the repository at this point in the history
* add pinecone and jina blocks

* update based on comments

* backend updates

* frontend updates

* type hint

* more type hints

* another type hint

* update run signature

* shared jina provider

* fix linting

* lockfile

* remove noqa

* remove noqa

* remove vector db folder

* line

* update pinecone credentials provider

* fix imports

* formatting

* update frontend

* Test (#8425)

* h

* Discard changes to autogpt_platform/backend/poetry.lock

* fix: broken dep

---------

Co-authored-by: Nicholas Tindle <[email protected]>
  • Loading branch information
aarushik93 and ntindle authored Oct 24, 2024
1 parent 6d812ac commit e2df601
Show file tree
Hide file tree
Showing 9 changed files with 355 additions and 10 deletions.
39 changes: 39 additions & 0 deletions autogpt_platform/backend/backend/blocks/jina/_auth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import Literal

from autogpt_libs.supabase_integration_credentials_store.types import APIKeyCredentials
from pydantic import SecretStr

from backend.data.model import CredentialsField, CredentialsMetaInput

# Jina credentials are plain API-key credentials; this alias gives them a
# provider-specific name.
JinaCredentials = APIKeyCredentials
# Credentials-input type parameterized with the literal provider name "jina"
# and the literal credential type "api_key".
JinaCredentialsInput = CredentialsMetaInput[
Literal["jina"],
Literal["api_key"],
]


def JinaCredentialsField() -> JinaCredentialsInput:
    """
    Build the credentials input field for a block that talks to Jina.

    The field is tied to the "jina" provider and lists "api_key" as its only
    supported credential type.
    """
    jina_field = CredentialsField(
        description="The Jina integration can be used with an API Key.",
        supported_credential_types={"api_key"},
        provider="jina",
    )
    return jina_field


# Mock API-key credentials for the "jina" provider.
TEST_CREDENTIALS = APIKeyCredentials(
    id="01234567-89ab-cdef-0123-456789abcdef",
    provider="jina",
    api_key=SecretStr("mock-jina-api-key"),
    title="Mock Jina API key",
    expires_at=None,
)
# Credentials-input payload that mirrors TEST_CREDENTIALS field by field.
TEST_CREDENTIALS_INPUT = {
    "provider": TEST_CREDENTIALS.provider,
    "id": TEST_CREDENTIALS.id,
    "type": TEST_CREDENTIALS.type,
    # The title must come from `.title`; it was previously populated from
    # `.type`, which made the title read "api_key".
    "title": TEST_CREDENTIALS.title,
}
69 changes: 69 additions & 0 deletions autogpt_platform/backend/backend/blocks/jina/chunking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import requests

from backend.blocks.jina._auth import (
JinaCredentials,
JinaCredentialsField,
JinaCredentialsInput,
)
from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
from backend.data.model import SchemaField


class JinaChunkingBlock(Block):
    """Block that splits texts into chunks using Jina AI's segmentation service."""

    # Input schema: the texts to chunk, the Jina credentials, and chunking options.
    class Input(BlockSchema):
        texts: list = SchemaField(description="List of texts to chunk")

        credentials: JinaCredentialsInput = JinaCredentialsField()
        max_chunk_length: int = SchemaField(
            description="Maximum length of each chunk", default=1000
        )
        return_tokens: bool = SchemaField(
            description="Whether to return token information", default=False
        )

    # Output schema: the chunks, plus token information when it was requested.
    class Output(BlockSchema):
        chunks: list = SchemaField(description="List of chunked texts")
        tokens: list = SchemaField(
            description="List of token information for each chunk", optional=True
        )

    def __init__(self):
        super().__init__(
            id="806fb15e-830f-4796-8692-557d300ff43c",
            description="Chunks texts using Jina AI's segmentation service",
            categories={BlockCategory.AI, BlockCategory.TEXT},
            input_schema=JinaChunkingBlock.Input,
            output_schema=JinaChunkingBlock.Output,
        )

    def run(
        self, input_data: Input, *, credentials: JinaCredentials, **kwargs
    ) -> BlockOutput:
        """
        Send each input text to the segmentation endpoint and collect the results.

        Yields:
            ("chunks", list): chunks from all texts, in input order.
            ("tokens", list): token information from all texts; only yielded
                when `input_data.return_tokens` is true.

        Raises:
            requests.HTTPError: if the service answers with an error status.
            requests.Timeout: if the service does not respond in time.
        """
        url = "https://segment.jina.ai/"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {credentials.api_key.get_secret_value()}",
        }
        # Without a timeout `requests` waits forever on a stalled connection,
        # which would hang the whole block run.
        timeout_seconds = 30

        # These options are identical for every text, so build them once.
        # NOTE(review): the values are sent as strings rather than JSON
        # booleans/integers, exactly as before - confirm the endpoint expects this.
        options = {
            "return_tokens": str(input_data.return_tokens).lower(),
            "return_chunks": "true",
            "max_chunk_length": str(input_data.max_chunk_length),
        }

        all_chunks = []
        all_tokens = []

        for text in input_data.texts:
            response = requests.post(
                url,
                headers=headers,
                json={"content": text, **options},
                timeout=timeout_seconds,
            )
            response.raise_for_status()
            result = response.json()

            all_chunks.extend(result.get("chunks", []))
            if input_data.return_tokens:
                all_tokens.extend(result.get("tokens", []))

        yield "chunks", all_chunks
        if input_data.return_tokens:
            yield "tokens", all_tokens
44 changes: 44 additions & 0 deletions autogpt_platform/backend/backend/blocks/jina/embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import requests

from backend.blocks.jina._auth import (
JinaCredentials,
JinaCredentialsField,
JinaCredentialsInput,
)
from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
from backend.data.model import SchemaField


class JinaEmbeddingBlock(Block):
    """Block that generates embeddings for a list of texts using Jina AI."""

    # Input schema: the texts to embed, the Jina credentials, and the model name.
    class Input(BlockSchema):
        texts: list = SchemaField(description="List of texts to embed")
        credentials: JinaCredentialsInput = JinaCredentialsField()
        model: str = SchemaField(
            description="Jina embedding model to use",
            default="jina-embeddings-v2-base-en",
        )

    # Output schema: one embedding per entry in the response's "data" list.
    class Output(BlockSchema):
        embeddings: list = SchemaField(description="List of embeddings")

    def __init__(self):
        super().__init__(
            id="7c56b3ab-62e7-43a2-a2dc-4ec4245660b6",
            description="Generates embeddings using Jina AI",
            categories={BlockCategory.AI},
            input_schema=JinaEmbeddingBlock.Input,
            output_schema=JinaEmbeddingBlock.Output,
        )

    def run(
        self, input_data: Input, *, credentials: JinaCredentials, **kwargs
    ) -> BlockOutput:
        """
        Request embeddings for all input texts in a single API call.

        Yields:
            ("embeddings", list): the "embedding" value of every item in the
                response's "data" list.

        Raises:
            requests.HTTPError: if the service answers with an error status.
            requests.Timeout: if the service does not respond in time.
        """
        url = "https://api.jina.ai/v1/embeddings"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {credentials.api_key.get_secret_value()}",
        }
        data = {"input": input_data.texts, "model": input_data.model}
        # Without a timeout `requests` waits forever on a stalled connection.
        response = requests.post(url, headers=headers, json=data, timeout=30)
        # Fail with the real HTTP error (bad key, unknown model, ...) instead of
        # an opaque KeyError on "data" when the body is an error payload.
        response.raise_for_status()
        embeddings = [e["embedding"] for e in response.json()["data"]]
        yield "embeddings", embeddings
131 changes: 131 additions & 0 deletions autogpt_platform/backend/backend/blocks/pinecone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
from typing import Literal

from autogpt_libs.supabase_integration_credentials_store import APIKeyCredentials
from pinecone import Pinecone, ServerlessSpec

from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
from backend.data.model import CredentialsField, CredentialsMetaInput, SchemaField

# Pinecone credentials are plain API-key credentials; this alias gives them a
# provider-specific name.
PineconeCredentials = APIKeyCredentials
# Credentials-input type parameterized with the literal provider name
# "pinecone" and the literal credential type "api_key".
PineconeCredentialsInput = CredentialsMetaInput[
Literal["pinecone"],
Literal["api_key"],
]


def PineconeCredentialsField() -> PineconeCredentialsInput:
    """
    Build the credentials input field for a block that talks to Pinecone.

    The field is tied to the "pinecone" provider and lists "api_key" as its
    only supported credential type.
    """
    pinecone_field = CredentialsField(
        description="The Pinecone integration can be used with an API Key.",
        supported_credential_types={"api_key"},
        provider="pinecone",
    )
    return pinecone_field


class PineconeInitBlock(Block):
    """Block that creates a serverless Pinecone index unless it already exists."""

    # Input schema: credentials, the index name, and the settings used when the
    # index has to be created.
    class Input(BlockSchema):
        credentials: PineconeCredentialsInput = PineconeCredentialsField()
        index_name: str = SchemaField(description="Name of the Pinecone index")
        dimension: int = SchemaField(
            default=768,
            description="Dimension of the vectors",
        )
        metric: str = SchemaField(
            default="cosine",
            description="Distance metric for the index",
        )
        cloud: str = SchemaField(
            default="aws",
            description="Cloud provider for serverless",
        )
        region: str = SchemaField(
            default="us-east-1",
            description="Region for serverless",
        )

    # Output schema: the index name and a human-readable status message.
    class Output(BlockSchema):
        index: str = SchemaField(description="Name of the initialized Pinecone index")
        message: str = SchemaField(description="Status message")

    def __init__(self):
        super().__init__(
            id="48d8fdab-8f03-41f3-8407-8107ba11ec9b",
            description="Initializes a Pinecone index",
            categories={BlockCategory.LOGIC},
            input_schema=PineconeInitBlock.Input,
            output_schema=PineconeInitBlock.Output,
        )

    def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        """
        Look the index up by name and create it when it is missing.

        Yields:
            ("index", str): the index name, on success.
            ("message", str): a status message on success, or an error
                description when any exception is raised along the way.
        """
        client = Pinecone(api_key=credentials.api_key.get_secret_value())
        wanted = input_data.index_name

        try:
            already_there = any(
                existing.name == wanted for existing in client.list_indexes()
            )
            if already_there:
                message = f"Using existing index: {wanted}"
            else:
                serverless = ServerlessSpec(
                    cloud=input_data.cloud, region=input_data.region
                )
                client.create_index(
                    name=wanted,
                    dimension=input_data.dimension,
                    metric=input_data.metric,
                    spec=serverless,
                )
                message = f"Created new index: {wanted}"

            yield "index", wanted
            yield "message", message
        except Exception as e:
            # Failures are reported through the "message" output, not raised.
            yield "message", f"Error initializing Pinecone index: {str(e)}"


class PineconeQueryBlock(Block):
    """Block that runs a vector query against a Pinecone index."""

    # Input schema: credentials, the query vector, query options, and the host
    # of the index to query.
    class Input(BlockSchema):
        credentials: PineconeCredentialsInput = PineconeCredentialsField()
        query_vector: list = SchemaField(description="Query vector")
        namespace: str = SchemaField(
            description="Namespace to query in Pinecone", default=""
        )
        top_k: int = SchemaField(
            description="Number of top results to return", default=3
        )
        include_values: bool = SchemaField(
            description="Whether to include vector values in the response",
            default=False,
        )
        include_metadata: bool = SchemaField(
            description="Whether to include metadata in the response", default=True
        )
        host: str = SchemaField(description="Host for pinecone")

    # Output schema: the query response as a dictionary.
    class Output(BlockSchema):
        results: dict = SchemaField(description="Query results from Pinecone")

    def __init__(self):
        super().__init__(
            id="9ad93d0f-91b4-4c9c-8eb1-82e26b4a01c5",
            description="Queries a Pinecone index",
            categories={BlockCategory.LOGIC},
            input_schema=PineconeQueryBlock.Input,
            output_schema=PineconeQueryBlock.Output,
        )

    def run(
        self,
        input_data: Input,
        *,
        credentials: APIKeyCredentials,
        **kwargs,
    ) -> BlockOutput:
        """
        Query the index served at `input_data.host` with the given vector.

        Yields:
            ("results", dict): the query response converted to a dictionary.
        """
        pc = Pinecone(api_key=credentials.api_key.get_secret_value())
        idx = pc.Index(host=input_data.host)
        response = idx.query(
            namespace=input_data.namespace,
            vector=input_data.query_vector,
            top_k=input_data.top_k,
            include_values=input_data.include_values,
            include_metadata=input_data.include_metadata,
        )
        # The output is declared as `dict`, but the SDK returns a response
        # object; convert it so the yielded value matches the schema.
        yield "results", response.to_dict()
61 changes: 56 additions & 5 deletions autogpt_platform/backend/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit e2df601

Please sign in to comment.