Added support for dataframes with datetime #2871

Merged on Jun 11, 2024 (1 commit)
35 changes: 25 additions & 10 deletions deeplake/auto/structured/dataframe.py
@@ -10,8 +10,6 @@
 from typing import DefaultDict, List, Union, Optional, Dict
 from deeplake.core.sample import Sample
 from deeplake.core.linked_sample import LinkedSample
-import pathlib
-

 from deeplake.client.log import logger

@@ -30,14 +28,13 @@ def __init__(self, source, column_params=None, creds=None, creds_key=None):
         Raises:
             Exception: If source is not a pandas dataframe object.
         """
+
         import pandas as pd  # type: ignore

         super().__init__(source)
         if not isinstance(self.source, pd.DataFrame):
             raise Exception("Source is not a pandas dataframe object.")

-        self.source = self.source.replace({np.nan: None})
-
         self.creds = creds
         self.creds_key = creds_key
         self._initialize_params(column_params)
@@ -79,29 +76,35 @@ def _get_most_frequent_image_extension(self, fn_iterator: List[str]):
             )
         return most_frequent_image_extension

+    def _is_datetime(self, type: Union[type, np.dtype]):
+        return "datetime64[ns]" in str(type) or type == np.dtype(np.datetime64)
+
     def _parse_tensor_params(self, key: str, inspect_limit: int = 1000):
         """Parse the tensor parameters for a column. Required parameters that are not specified will be inferred by inspecting up to 'inspect_limit' rows in the data."""
+
+        import pandas as pd  # type: ignore

         tensor_params: Dict = self.column_params[key]

         dtype = self.source[key].dtype

         if (
             "htype" not in tensor_params
         ):  # Auto-set some typing parameters if htype is not specified
-            if dtype == np.dtype("object"):
+            if dtype == np.dtype("object") or self._is_datetime(dtype):
+
+                column_data = self.source[key][0:inspect_limit]
+                column_data = column_data.where(pd.notnull(column_data), None).values
                 types = [
-                    type(v)
-                    for v in self.source[key][0:inspect_limit].values
-                    if v is not None
+                    type(v) for v in column_data if v is not None
                 ]  # Can be length 0 if all data is None

                 if len(set(types)) > 1:
                     raise IngestionError(
                         f"Dataframe has different data types inside '{key}' column. Please make sure all data in a given column is compatible with a single Deep Lake htype, or try specifying the htype manually."
                     )

-                if len(types) > 0 and types[0] == str:
+                if len(types) > 0 and (types[0] == str or self._is_datetime(types[0])):
                     tensor_params.update(
                         htype="text"
                     )  # Use "text" htype for text data when the htype is not specified
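For reviewers, a minimal sketch of how this dtype check behaves, using a standalone copy of the `_is_datetime` helper above (the `ts` column is illustrative):

```python
import numpy as np
import pandas as pd

# Mirrors the _is_datetime helper added in this PR.
def is_datetime(t) -> bool:
    return "datetime64[ns]" in str(t) or t == np.dtype(np.datetime64)

df = pd.DataFrame({"ts": pd.to_datetime(["2024-01-01", None])})
print(is_datetime(df["ts"].dtype))       # True  -> column is inferred as htype "text"
print(is_datetime(np.dtype("float64")))  # False -> numeric columns are unaffected
```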
@@ -127,10 +130,22 @@ def _parse_tensor_params(self, key: str, inspect_limit: int = 1000):

     def _get_extend_values(self, tensor_params: dict, key: str):  # type: ignore
         """Method creates a list of values to be extended to the tensor, based on the tensor parameters and the data in the dataframe column"""

+        import pandas as pd  # type: ignore
+
         column_data = self.source[key]
-        column_data = column_data.where(pd.notnull(column_data), None).values.tolist()

+        # Convert datetime arrays to strings. Other data can be uploaded as is.
+        if self._is_datetime(column_data.dtype):
+            column_data = (
+                column_data.where(pd.notnull(column_data), None)
+                .astype(str)
+                .values.tolist()
+            )
+        else:
+            column_data = column_data.where(
+                pd.notnull(column_data), None
+            ).values.tolist()

         extend_values: List[Optional[Union[Sample, LinkedSample, np.ndarray]]]

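The practical effect of the new branch: missing datetimes (`NaT`) are uploaded as the literal string `"NaT"` rather than `None`, because `.astype(str)` runs before `.tolist()`. A standalone sketch of the same pandas chain:

```python
import pandas as pd

column_data = pd.Series(pd.to_datetime(["2024-01-01", None, ""]))
# Both None and "" parse to NaT; .astype(str) then stringifies every entry,
# so the missing values come out as the string "NaT".
as_text = (
    column_data.where(pd.notnull(column_data), None)
    .astype(str)
    .values.tolist()
)
print(as_text)  # ['2024-01-01 00:00:00', 'NaT', 'NaT']
```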
9 changes: 8 additions & 1 deletion deeplake/auto/tests/test_ingestion.py
@@ -304,7 +304,7 @@ def test_dataframe_files(memory_ds: Dataset, dataframe_ingestion_data):
     assert ds[df_keys[2]][2].data()["text"][0] == df[df_keys[2]][2]


-def test_dataframe_array(memory_ds: Dataset):
+def test_dataframe_mixed(memory_ds: Dataset):
     data = {
         "AA": ["Alice", "Bob", np.nan, None],
         "BB": [
@@ -322,8 +322,11 @@
             np.array([0, 56, 34]),
         ],
         "FF": [None, "Bob", "Charlie", "Dave"],
+        "GG": ["2024-01-01", None, "", "2024-04-01"],
     }

+    data["GG"] = pd.to_datetime(data["GG"])
+
     df = pd.DataFrame(data)
     df_keys = df.keys()

@@ -355,6 +358,10 @@
     assert ds[df_keys[4]][0].numpy().shape[0] == 0
     assert ds[df_keys[4]][1].numpy().shape[0] == 4

+    assert ds[df_keys[6]][0].text() in str(data["GG"][0])  # type: ignore
+    assert ds[df_keys[6]][2].text() == "NaT"  # type: ignore
+    assert ds[df_keys[6]][1].text() == "NaT"  # type: ignore
+

 def test_dataframe_array_bad(memory_ds: Dataset):
     data = {
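End to end, the new `GG` column exercises a flow like the following (a sketch assuming the `deeplake.ingest_dataframe` entry point; the `mem://` path and dataset name are illustrative):

```python
import pandas as pd
import deeplake

df = pd.DataFrame({"GG": pd.to_datetime(["2024-01-01", None, "", "2024-04-01"])})
# The datetime column is ingested as a "text" tensor; NaT rows become "NaT".
ds = deeplake.ingest_dataframe(df, "mem://datetime_demo")
print(ds["GG"][0].text())  # "2024-01-01 00:00:00"
print(ds["GG"][1].text())  # "NaT"
```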
12 changes: 8 additions & 4 deletions deeplake/core/dataset/dataset.py
@@ -3210,7 +3210,7 @@ def extend(
         ignore_errors: bool = False,
         progressbar: bool = False,
     ):
-        """Appends multiple samples (rows) to mutliple tensors at once. This method expects all tensors being updated to be of the same length.
+        """Appends multiple samples (rows) to multiple tensors at once. This method expects all tensors being updated to be of the same length.

         Args:
             samples (Dict[str, Any]): Dictionary with tensor names as keys and data as values. The values can be a sequence (i.e. a list) or a single numpy array (the first axis in the array is treated as the row axis).
@@ -3232,11 +3232,15 @@ def extend(
             >>> ds = deeplake.empty("../test/test_ds")

             >>> with ds:
-            >>>     ds.create_tensor('data')
-            >>>     ds.create_tensor('labels')
+            >>>     ds.create_tensor("data")
+            >>>     ds.create_tensor("labels", htype = "class_label")
+            >>>     ds.create_tensor("images", htype = "image", sample_compression = "jpeg")

             >>> # This operation will append 4 samples (rows) to the Deep Lake dataset
-            >>> ds.extend({"data": [1, 2, 3, 4], "labels": ["table", "chair", "desk", "table"]})
+            >>> ds.extend({"data": [1, 2, 3, 4],
+                           "labels": ["table", "chair", "desk", "table"],
+                           "images": [deeplake.read("image1.jpg"), deeplake.read("image2.jpg"), deeplake.read("image3.jpg"), deeplake.read("image4.jpg")]
+                          })

         """
         extend = False
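For completeness, a self-contained variant of the updated docstring example that folds in the datetime-to-text behavior from this PR (a sketch; the `mem://` path and tensor names are illustrative, and the images are omitted so it runs without local files):

```python
import pandas as pd
import deeplake

ds = deeplake.empty("mem://extend_demo")
with ds:
    ds.create_tensor("data")
    ds.create_tensor("labels", htype="class_label")
    ds.create_tensor("timestamps", htype="text")  # datetimes land here as strings

# Stringify a datetime series the same way _get_extend_values now does.
stamps = pd.Series(pd.to_datetime(["2024-01-01", None])).astype(str).tolist()

# Appends 2 samples (rows) to all three tensors at once.
ds.extend({
    "data": [1, 2],
    "labels": ["table", "chair"],
    "timestamps": stamps,  # ['2024-01-01 00:00:00', 'NaT']
})
```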