Added support for dataframes with datetime #2871

Merged on Jun 11, 2024 (1 commit)
35 changes: 25 additions & 10 deletions deeplake/auto/structured/dataframe.py
@@ -10,8 +10,6 @@
 from typing import DefaultDict, List, Union, Optional, Dict
 from deeplake.core.sample import Sample
 from deeplake.core.linked_sample import LinkedSample
-import pathlib
-

 from deeplake.client.log import logger

@@ -30,14 +28,13 @@ def __init__(self, source, column_params=None, creds=None, creds_key=None):
         Raises:
             Exception: If source is not a pandas dataframe object.
         """
+
         import pandas as pd  # type: ignore

         super().__init__(source)
         if not isinstance(self.source, pd.DataFrame):
             raise Exception("Source is not a pandas dataframe object.")

-        self.source = self.source.replace({np.nan: None})
-
         self.creds = creds
         self.creds_key = creds_key
         self._initialize_params(column_params)
@@ -79,29 +76,35 @@ def _get_most_frequent_image_extension(self, fn_iterator: List[str]):
             )
         return most_frequent_image_extension

+    def _is_datetime(self, type: Union[type, np.dtype]):
+        return "datetime64[ns]" in str(type) or type == np.dtype(np.datetime64)
+
     def _parse_tensor_params(self, key: str, inspect_limit: int = 1000):
         """Parse the tensor parameters for a column. Required parameters that are not specified will be inferred by inspecting up to 'inspect_limit' rows in the data."""
+
+        import pandas as pd  # type: ignore

         tensor_params: Dict = self.column_params[key]

         dtype = self.source[key].dtype

         if (
             "htype" not in tensor_params
         ):  # Auto-set some typing parameters if htype is not specified
-            if dtype == np.dtype("object"):
+            if dtype == np.dtype("object") or self._is_datetime(dtype):
+
+                column_data = self.source[key][0:inspect_limit]
+                column_data = column_data.where(pd.notnull(column_data), None).values
                 types = [
-                    type(v)
-                    for v in self.source[key][0:inspect_limit].values
-                    if v is not None
+                    type(v) for v in column_data if v is not None
                 ]  # Can be length 0 if all data is None

                 if len(set(types)) > 1:
                     raise IngestionError(
                         f"Dataframe has different data types inside '{key}' column. Please make sure all data in a given column is compatible with a single Deep Lake htype, or try specifying the htype manually."
                     )

-                if len(types) > 0 and types[0] == str:
+                if len(types) > 0 and (types[0] == str or self._is_datetime(types[0])):
                     tensor_params.update(
                         htype="text"
                     )  # Use "text" htype for text data when the htype is not specified
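For reviewers, a minimal sketch of how this dtype check behaves, using a standalone copy of the `_is_datetime` helper above (the `ts` column is illustrative):

```python
import numpy as np
import pandas as pd

# Mirrors the _is_datetime helper added in this PR.
def is_datetime(t) -> bool:
    return "datetime64[ns]" in str(t) or t == np.dtype(np.datetime64)

df = pd.DataFrame({"ts": pd.to_datetime(["2024-01-01", None])})
print(is_datetime(df["ts"].dtype))       # True  -> column is inferred as htype "text"
print(is_datetime(np.dtype("float64")))  # False -> numeric columns are unaffected
```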
@@ -127,10 +130,22 @@ def _parse_tensor_params(self, key: str, inspect_limit: int = 1000):

     def _get_extend_values(self, tensor_params: dict, key: str):  # type: ignore
         """Method creates a list of values to be extended to the tensor, based on the tensor parameters and the data in the dataframe column"""

+        import pandas as pd  # type: ignore
+
         column_data = self.source[key]
-        column_data = column_data.where(pd.notnull(column_data), None).values.tolist()

+        # Convert datetime arrays to strings. Other data can be uploaded as is.
+        if self._is_datetime(column_data.dtype):
+            column_data = (
+                column_data.where(pd.notnull(column_data), None)
+                .astype(str)
+                .values.tolist()
+            )
+        else:
+            column_data = column_data.where(
+                pd.notnull(column_data), None
+            ).values.tolist()

         extend_values: List[Optional[Union[Sample, LinkedSample, np.ndarray]]]

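The practical effect of the new branch: missing datetimes (`NaT`) are uploaded as the literal string `"NaT"` rather than `None`, because `.astype(str)` runs before `.tolist()`. A standalone sketch of the same pandas chain:

```python
import pandas as pd

column_data = pd.Series(pd.to_datetime(["2024-01-01", None, ""]))
# Both None and "" parse to NaT; .astype(str) then stringifies every entry,
# so the missing values come out as the string "NaT".
as_text = (
    column_data.where(pd.notnull(column_data), None)
    .astype(str)
    .values.tolist()
)
print(as_text)  # ['2024-01-01 00:00:00', 'NaT', 'NaT']
```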
9 changes: 8 additions & 1 deletion deeplake/auto/tests/test_ingestion.py
@@ -304,7 +304,7 @@ def test_dataframe_files(memory_ds: Dataset, dataframe_ingestion_data):
     assert ds[df_keys[2]][2].data()["text"][0] == df[df_keys[2]][2]


-def test_dataframe_array(memory_ds: Dataset):
+def test_dataframe_mixed(memory_ds: Dataset):
     data = {
         "AA": ["Alice", "Bob", np.nan, None],
         "BB": [
@@ -322,8 +322,11 @@
             np.array([0, 56, 34]),
         ],
         "FF": [None, "Bob", "Charlie", "Dave"],
+        "GG": ["2024-01-01", None, "", "2024-04-01"],
     }

+    data["GG"] = pd.to_datetime(data["GG"])
+
     df = pd.DataFrame(data)
     df_keys = df.keys()

@@ -355,6 +358,10 @@
     assert ds[df_keys[4]][0].numpy().shape[0] == 0
     assert ds[df_keys[4]][1].numpy().shape[0] == 4

+    assert ds[df_keys[6]][0].text() in str(data["GG"][0])  # type: ignore
+    assert ds[df_keys[6]][2].text() == "NaT"  # type: ignore
+    assert ds[df_keys[6]][1].text() == "NaT"  # type: ignore
+

 def test_dataframe_array_bad(memory_ds: Dataset):
     data = {
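End to end, the new `GG` column exercises a flow like the following (a sketch assuming the `deeplake.ingest_dataframe` entry point; the `mem://` path and dataset name are illustrative):

```python
import pandas as pd
import deeplake

df = pd.DataFrame({"GG": pd.to_datetime(["2024-01-01", None, "", "2024-04-01"])})
# The datetime column is ingested as a "text" tensor; NaT rows become "NaT".
ds = deeplake.ingest_dataframe(df, "mem://datetime_demo")
print(ds["GG"][0].text())  # "2024-01-01 00:00:00"
print(ds["GG"][1].text())  # "NaT"
```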
12 changes: 8 additions & 4 deletions deeplake/core/dataset/dataset.py
@@ -3210,7 +3210,7 @@ def extend(
         ignore_errors: bool = False,
         progressbar: bool = False,
     ):
-        """Appends multiple samples (rows) to mutliple tensors at once. This method expects all tensors being updated to be of the same length.
+        """Appends multiple samples (rows) to multiple tensors at once. This method expects all tensors being updated to be of the same length.

         Args:
             samples (Dict[str, Any]): Dictionary with tensor names as keys and data as values. The values can be a sequence (i.e. a list) or a single numpy array (the first axis in the array is treated as the row axis).
@@ -3232,11 +3232,15 @@ def extend(
             >>> ds = deeplake.empty("../test/test_ds")

             >>> with ds:
-            >>>     ds.create_tensor('data')
-            >>>     ds.create_tensor('labels')
+            >>>     ds.create_tensor("data")
+            >>>     ds.create_tensor("labels", htype = "class_label")
+            >>>     ds.create_tensor("images", htype = "image", sample_compression = "jpeg")

             >>> # This operation will append 4 samples (rows) to the Deep Lake dataset
-            >>> ds.extend({"data": [1, 2, 3, 4], "labels": ["table", "chair", "desk", "table"]})
+            >>> ds.extend({"data": [1, 2, 3, 4],
+                           "labels": ["table", "chair", "desk", "table"],
+                           "images": [deeplake.read("image1.jpg"), deeplake.read("image2.jpg"), deeplake.read("image3.jpg"), deeplake.read("image4.jpg")]
+                          })

         """
         extend = False
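For completeness, a self-contained variant of the updated docstring example that folds in the datetime-to-text behavior from this PR (a sketch; the `mem://` path and tensor names are illustrative, and the images are omitted so it runs without local files):

```python
import pandas as pd
import deeplake

ds = deeplake.empty("mem://extend_demo")
with ds:
    ds.create_tensor("data")
    ds.create_tensor("labels", htype="class_label")
    ds.create_tensor("timestamps", htype="text")  # datetimes land here as strings

# Stringify a datetime series the same way _get_extend_values now does.
stamps = pd.Series(pd.to_datetime(["2024-01-01", None])).astype(str).tolist()

# Appends 2 samples (rows) to all three tensors at once.
ds.extend({
    "data": [1, 2],
    "labels": ["table", "chair"],
    "timestamps": stamps,  # ['2024-01-01 00:00:00', 'NaT']
})
```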