diff --git a/lib/sycamore/sycamore/connectors/opensearch/opensearch_reader.py b/lib/sycamore/sycamore/connectors/opensearch/opensearch_reader.py index 3a2956c16..1e3cc5a56 100644 --- a/lib/sycamore/sycamore/connectors/opensearch/opensearch_reader.py +++ b/lib/sycamore/sycamore/connectors/opensearch/opensearch_reader.py @@ -1,3 +1,4 @@ +import copy import logging from sycamore.data import Document, Element @@ -47,6 +48,9 @@ def read_records(self, query_params: BaseDBReader.QueryParams) -> "OpenSearchRea if "size" not in query_params.query and "size" not in query_params.kwargs: query_params.kwargs["size"] = 200 result = [] + # We only fetch the minimum required fields for full document retrieval/reconstruction + if query_params.reconstruct_document: + query_params.kwargs["_source_includes"] = "doc_id,parent_id,properties" # No pagination needed for knn queries if "query" in query_params.query and "knn" in query_params.query["query"]: response = self._client.search( @@ -148,6 +152,8 @@ def to_docs(self, query_params: "BaseDBReader.QueryParams") -> list[Document]: # Batched retrieval of all elements belong to unique docs doc_ids = list(unique_docs.keys()) + # We can't safely exclude embeddings since we might need them, e.g. for 'rerank'. + # We will need the Planner to determine that and pass that info to the reader. 
all_elements_for_docs = self._get_all_elements_for_doc_ids(doc_ids, query_params.index_name) """ @@ -183,7 +189,6 @@ def _get_all_elements_for_doc_ids(self, doc_ids: list[str], index: str) -> list[ """ batch_size = 100 page_size = 500 - all_elements = [] for i in range(0, len(doc_ids), batch_size): doc_ids_batch = doc_ids[i : i + batch_size] diff --git a/lib/sycamore/sycamore/reader.py b/lib/sycamore/sycamore/reader.py index e46d4a090..ff7301844 100644 --- a/lib/sycamore/sycamore/reader.py +++ b/lib/sycamore/sycamore/reader.py @@ -224,7 +224,7 @@ def opensearch( index_name: str, query: Optional[Dict] = None, reconstruct_document: bool = False, - query_kwargs: dict[str, Any] = {}, + query_kwargs=None, **kwargs, ) -> DocSet: """ @@ -287,6 +287,8 @@ def opensearch( OpenSearchReaderQueryParams, ) + if query_kwargs is None: + query_kwargs = {} client_params = OpenSearchReaderClientParams(os_client_args=os_client_args) query_params = ( OpenSearchReaderQueryParams(