Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(blocks): Add pinecone and jina blocks #8401

Merged
merged 28 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
84b13ee
add pinecone and jina blocks
aarushik93 Oct 22, 2024
a96f9b9
Merge branch 'dev' into aarushikansal/set-up-rag-blocks
aarushik93 Oct 23, 2024
aa58610
Merge branch 'dev' into aarushikansal/set-up-rag-blocks
aarushik93 Oct 23, 2024
d025f00
Merge branch 'dev' into aarushikansal/set-up-rag-blocks
aarushik93 Oct 23, 2024
be65f2f
update based on comments
aarushik93 Oct 23, 2024
3540c5f
backend updates
aarushik93 Oct 23, 2024
7a343f4
frontend updates
aarushik93 Oct 23, 2024
44d0e53
type hint
aarushik93 Oct 23, 2024
c2d4c20
more type hints
aarushik93 Oct 23, 2024
5bdfd9c
another type hint
aarushik93 Oct 23, 2024
8de2177
update run signature
aarushik93 Oct 23, 2024
035fb19
shared jina provider
aarushik93 Oct 23, 2024
6956916
fix linting
aarushik93 Oct 23, 2024
69e4c65
Merge branch 'dev' into aarushikansal/set-up-rag-blocks
aarushik93 Oct 23, 2024
a3e6e7e
lockfile
aarushik93 Oct 23, 2024
b4f248a
remove noqa
aarushik93 Oct 23, 2024
6337b4c
remove noqa
aarushik93 Oct 23, 2024
a673ed6
remove vector db folder
aarushik93 Oct 23, 2024
477a730
line
aarushik93 Oct 23, 2024
9470a9e
update pinecone credentials provider
aarushik93 Oct 23, 2024
e55f603
fix imports
aarushik93 Oct 23, 2024
1aa6968
formatting
aarushik93 Oct 23, 2024
12ff837
update frontend
aarushik93 Oct 23, 2024
6db8883
Merge branch 'dev' into aarushikansal/set-up-rag-blocks
aarushik93 Oct 24, 2024
a294c21
Test (#8425)
ntindle Oct 24, 2024
5bb4bfe
Merge branch 'dev' into aarushikansal/set-up-rag-blocks
aarushik93 Oct 24, 2024
4404eb7
Merge branch 'dev' into aarushikansal/set-up-rag-blocks
ntindle Oct 24, 2024
d628638
Merge branch 'dev' into aarushikansal/set-up-rag-blocks
aarushik93 Oct 24, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions autogpt_platform/backend/backend/blocks/jina/_auth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import Literal

from autogpt_libs.supabase_integration_credentials_store.types import APIKeyCredentials
from pydantic import SecretStr

from backend.data.model import CredentialsField, CredentialsMetaInput

# Jina credentials are ordinary API-key credentials; this alias gives them a
# provider-specific name for use in block signatures.
JinaCredentials = APIKeyCredentials

# Credentials-input type for block schemas, parameterized with the "jina"
# provider and the "api_key" credential type.
JinaCredentialsInput = CredentialsMetaInput[Literal["jina"], Literal["api_key"]]


def JinaCredentialsField() -> JinaCredentialsInput:
    """Create the credentials input field used by Jina blocks.

    Returns:
        The value produced by ``CredentialsField`` for the ``"jina"``
        provider, with ``"api_key"`` as the only supported credential type.
    """
    field_description = "The Jina integration can be used with an API Key."
    return CredentialsField(
        provider="jina",
        supported_credential_types={"api_key"},  # noqa
        description=field_description,
    )


# Mock Jina API-key credentials; the key is a placeholder value, not a real
# secret.
TEST_CREDENTIALS = APIKeyCredentials(
    id="01234567-89ab-cdef-0123-456789abcdef",
    provider="jina",
    api_key=SecretStr("mock-jina-api-key"),
    title="Mock Jina API key",
    expires_at=None,
)

# Credentials-input payload that points at TEST_CREDENTIALS.
TEST_CREDENTIALS_INPUT = {
    "provider": TEST_CREDENTIALS.provider,
    "id": TEST_CREDENTIALS.id,
    "type": TEST_CREDENTIALS.type,
    # The title must come from ``.title``; it previously repeated ``.type``,
    # which put the credential type string where the title belongs.
    "title": TEST_CREDENTIALS.title,
}
69 changes: 69 additions & 0 deletions autogpt_platform/backend/backend/blocks/jina/chunking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import requests

from backend.blocks.jina._auth import (
JinaCredentials,
JinaCredentialsField,
JinaCredentialsInput,
)
from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
from backend.data.model import SchemaField


class JinaChunkingBlock(Block):
    """Block that chunks texts by posting each one to Jina AI's segmenter."""

    class Input(BlockSchema):
        # Texts to split; each entry is sent to the service in its own request.
        texts: list = SchemaField(description="List of texts to chunk")

        credentials: JinaCredentialsInput = JinaCredentialsField()
        max_chunk_length: int = SchemaField(
            description="Maximum length of each chunk", default=1000
        )
        return_tokens: bool = SchemaField(
            description="Whether to return token information", default=False
        )

    class Output(BlockSchema):
        chunks: list = SchemaField(description="List of chunked texts")
        # Only yielded when ``return_tokens`` is set on the input.
        tokens: list = SchemaField(
            description="List of token information for each chunk", optional=True
        )

    def __init__(self):
        super().__init__(
            id="806fb15e-830f-4796-8692-557d300ff43c",
            description="Chunks texts using Jina AI's segmentation service",
            categories={BlockCategory.AI, BlockCategory.TEXT},
            input_schema=JinaChunkingBlock.Input,
            output_schema=JinaChunkingBlock.Output,
        )

    def run(
        self, input_data: Input, *, credentials: JinaCredentials, **kwargs
    ) -> BlockOutput:
        """Chunk every input text and yield the combined results.

        One POST request is made per text. The chunks (and, when requested,
        the tokens) of all responses are concatenated in input order.

        Args:
            input_data: Texts to chunk plus the chunking options.
            credentials: Jina API-key credentials used for the bearer token.
            **kwargs: Accepted and ignored.

        Yields:
            ``("chunks", list)`` always, followed by ``("tokens", list)`` when
            ``input_data.return_tokens`` is true.

        Raises:
            requests.HTTPError: If any request returns an error status.
        """
        endpoint = "https://segment.jina.ai/"
        api_key = credentials.api_key.get_secret_value()
        request_headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        }

        want_tokens = input_data.return_tokens
        # These options are identical for every text, so build them once.
        # The values are sent as strings, exactly as the request body expects
        # them to be serialized here.
        shared_options = {
            "return_tokens": str(want_tokens).lower(),
            "return_chunks": "true",
            "max_chunk_length": str(input_data.max_chunk_length),
        }

        collected_chunks = []
        collected_tokens = []
        for text in input_data.texts:
            payload = {"content": text, **shared_options}

            response = requests.post(endpoint, headers=request_headers, json=payload)
            response.raise_for_status()
            body = response.json()

            collected_chunks += body.get("chunks", [])
            if want_tokens:
                collected_tokens += body.get("tokens", [])

        yield "chunks", collected_chunks
        if want_tokens:
            yield "tokens", collected_tokens
44 changes: 44 additions & 0 deletions autogpt_platform/backend/backend/blocks/jina/embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import requests

from backend.blocks.jina._auth import (
JinaCredentials,
JinaCredentialsField,
JinaCredentialsInput,
)
from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
from backend.data.model import SchemaField


class JinaEmbeddingBlock(Block):
    """Block that generates embeddings for a list of texts using Jina AI."""

    class Input(BlockSchema):
        texts: list = SchemaField(description="List of texts to embed")
        credentials: JinaCredentialsInput = JinaCredentialsField()
        model: str = SchemaField(
            description="Jina embedding model to use",
            default="jina-embeddings-v2-base-en",
        )

    class Output(BlockSchema):
        embeddings: list = SchemaField(description="List of embeddings")

    def __init__(self):
        super().__init__(
            id="7c56b3ab-62e7-43a2-a2dc-4ec4245660b6",
            description="Generates embeddings using Jina AI",
            categories={BlockCategory.AI},
            input_schema=JinaEmbeddingBlock.Input,
            output_schema=JinaEmbeddingBlock.Output,
        )

    def run(
        self, input_data: Input, *, credentials: JinaCredentials, **kwargs
    ) -> BlockOutput:
        """Embed all input texts with a single request to the Jina API.

        Args:
            input_data: Texts to embed and the model name to use.
            credentials: Jina API-key credentials used for the bearer token.
            **kwargs: Accepted and ignored.

        Yields:
            ``("embeddings", list)`` holding the ``"embedding"`` value of each
            entry in the response's ``"data"`` list.

        Raises:
            requests.HTTPError: If the API responds with an error status.
        """
        url = "https://api.jina.ai/v1/embeddings"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {credentials.api_key.get_secret_value()}",
        }
        data = {"input": input_data.texts, "model": input_data.model}
        response = requests.post(url, headers=headers, json=data)
        # Surface HTTP failures (bad key, rate limit, unknown model) as such,
        # instead of letting them show up as a KeyError on "data" below.
        response.raise_for_status()
        embeddings = [item["embedding"] for item in response.json()["data"]]
        yield "embeddings", embeddings
127 changes: 127 additions & 0 deletions autogpt_platform/backend/backend/blocks/vector_dbs/pinecone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
from typing import Literal

from autogpt_libs.supabase_integration_credentials_store import APIKeyCredentials
from pinecone import Pinecone, ServerlessSpec

from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
from backend.data.model import CredentialsField, CredentialsMetaInput, SchemaField


class PineconeInitBlock(Block):
    """Block that makes sure a Pinecone index exists, creating it if needed."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput[Literal["pinecone"], Literal["api_key"]] = (
            CredentialsField(
                provider="pinecone",
                supported_credential_types={"api_key"},  # noqa
                description="The Pinecone integration can be used with "
                "any API key with sufficient permissions for the blocks it is used on.",
            )
        )
        index_name: str = SchemaField(description="Name of the Pinecone index")
        # The settings below are only passed along when a new index is created.
        dimension: int = SchemaField(
            description="Dimension of the vectors", default=768
        )
        metric: str = SchemaField(
            description="Distance metric for the index", default="cosine"
        )
        cloud: str = SchemaField(
            description="Cloud provider for serverless", default="aws"
        )
        region: str = SchemaField(
            description="Region for serverless", default="us-east-1"
        )

    class Output(BlockSchema):
        index: str = SchemaField(description="Name of the initialized Pinecone index")
        message: str = SchemaField(description="Status message")

    def __init__(self):
        super().__init__(
            id="48d8fdab-8f03-41f3-8407-8107ba11ec9b",
            description="Initializes a Pinecone index",
            categories={BlockCategory.LOGIC},
            input_schema=PineconeInitBlock.Input,
            output_schema=PineconeInitBlock.Output,
        )

    def run(
        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
    ) -> BlockOutput:
        """Create the requested index unless one with that name already exists.

        Args:
            input_data: Index name and the serverless settings for creation.
            credentials: Pinecone API-key credentials.
            **kwargs: Accepted and ignored.

        Yields:
            On success, ``("index", name)`` followed by ``("message", status)``.
            If listing or creating the index raises, only a ``"message"``
            output describing the error is yielded; the exception is not
            propagated.
        """
        client = Pinecone(api_key=credentials.api_key.get_secret_value())

        try:
            target = input_data.index_name
            known_names = [existing.name for existing in client.list_indexes()]

            if target in known_names:
                status = f"Using existing index: {target}"
            else:
                serverless = ServerlessSpec(
                    cloud=input_data.cloud, region=input_data.region
                )
                client.create_index(
                    name=target,
                    dimension=input_data.dimension,
                    metric=input_data.metric,
                    spec=serverless,
                )
                status = f"Created new index: {target}"

            yield "index", target
            yield "message", status
        except Exception as exc:
            # Failures are reported through the status output rather than
            # raised.
            yield "message", f"Error initializing Pinecone index: {exc!s}"


class PineconeQueryBlock(Block):
    """Block that runs a vector similarity query against a Pinecone index."""

    class Input(BlockSchema):
        credentials: CredentialsMetaInput[Literal["pinecone"], Literal["api_key"]] = (
            CredentialsField(
                provider="pinecone",
                supported_credential_types={"api_key"},  # noqa
                description="The Pinecone integration can be used with "
                "any API key with sufficient permissions for the blocks it is used on.",
            )
        )
        query_vector: list = SchemaField(description="Query vector")
        namespace: str = SchemaField(
            description="Namespace to query in Pinecone", default=""
        )
        top_k: int = SchemaField(
            description="Number of top results to return", default=3
        )
        include_values: bool = SchemaField(
            description="Whether to include vector values in the response",
            default=False,
        )
        include_metadata: bool = SchemaField(
            description="Whether to include metadata in the response", default=True
        )
        # Passed to ``Pinecone.Index(host=...)`` to select the index to query.
        host: str = SchemaField(description="Host for pinecone")

    class Output(BlockSchema):
        results: dict = SchemaField(description="Query results from Pinecone")

    def __init__(self):
        super().__init__(
            id="9ad93d0f-91b4-4c9c-8eb1-82e26b4a01c5",
            description="Queries a Pinecone index",
            categories={BlockCategory.LOGIC},
            input_schema=PineconeQueryBlock.Input,
            output_schema=PineconeQueryBlock.Output,
        )

    def run(
        self,
        input_data: Input,
        *,
        credentials: APIKeyCredentials,
        **kwargs,
    ) -> BlockOutput:
        """Query the index at ``input_data.host`` and yield the matches.

        Args:
            input_data: Query vector, namespace, result options and index host.
            credentials: Pinecone API-key credentials.
            **kwargs: Accepted and ignored.

        Yields:
            ``("results", dict)`` with the query response converted to a
            plain dictionary.
        """
        pc = Pinecone(api_key=credentials.api_key.get_secret_value())
        idx = pc.Index(host=input_data.host)
        response = idx.query(
            namespace=input_data.namespace,
            vector=input_data.query_vector,
            top_k=input_data.top_k,
            include_values=input_data.include_values,
            include_metadata=input_data.include_metadata,
        )
        # The ``results`` output is declared as ``dict``, but the SDK returns
        # a response object; convert it so the yielded value matches the
        # declared schema.
        yield "results", response.to_dict()
Loading
Loading