Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: CSV parsing for hub.auto #711

Merged
merged 14 commits into from
Mar 25, 2021
Merged
13 changes: 13 additions & 0 deletions docs/source/concepts/dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,19 @@ Supports `[.png, .jpg, .jpeg]` file extensions.
- ...
- ...

#### Tabular
| Dataset Link | Example Notebook |
|:--- |---: |
| [IMDb Movie Reviews (Kaggle)](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1a5HBelRGmKAxMqm6xK17qKCHvuJxCpHe?usp=sharing) |

Supports the `.csv` file extension.

verbose-void marked this conversation as resolved.
Show resolved Hide resolved
Expects the folder path to point to a directory with the following structure:
- root
    - file1.csv
    - file2.csv
    - ...

## Auto Usage
If your dataset is supported (see [above](#supported-dataset-formats)), you can convert it into hub format with a single line of code:

Expand Down
1 change: 1 addition & 0 deletions hub/auto/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .infer import *
from .computer_vision import *
from .tabular import *
6 changes: 3 additions & 3 deletions hub/auto/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ def _find_root(path):
subs = [
sub for sub in subs if os.path.isdir(sub)
] # only keep directories (ignore files)
if len(subs) > 1:
return path
return _find_root(subs[0])
if len(subs) == 1:
return _find_root(subs[0])
return path


def infer_dataset(path, scheduler="single", workers=1):
Expand Down
1 change: 1 addition & 0 deletions hub/auto/tabular/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .csv import *
55 changes: 55 additions & 0 deletions hub/auto/tabular/csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import os

import hub
import numpy as np
from hub.auto import util
from hub.auto.infer import state
from hub.exceptions import ModuleNotInstalledException
from tqdm import tqdm


@state.directory_parser(priority=1)
def data_from_csv(path, scheduler, workers):
    """Convert a directory of CSV files into a hub dataset.

    Each child of ``path`` is read with ``pandas.read_csv`` and given an extra
    ``Filename`` column holding the file's base name. The rows of all files
    are then stacked into a single table. Object-dtype columns are stored as
    ``hub.schema.Text`` whose ``max_shape`` is the length of the longest
    string in the column; every other column is stored as a
    ``hub.schema.Primitive`` of the column's pandas dtype.

    Args:
        path: Directory to parse.
        scheduler: Passed through to ``hub.transform``.
        workers: Passed through to ``hub.transform``.

    Returns:
        ``None`` when ``util.files_are_of_extension`` rejects the directory,
        otherwise the result of applying the ``hub.transform``-decorated
        uploader to every row index of the combined table.

    Raises:
        ModuleNotInstalledException: If the directory is accepted but pandas
            cannot be imported.
    """
    # Decide whether this parser applies before touching pandas, so that
    # directories it does not handle never fail on the optional dependency.
    if not util.files_are_of_extension(path, util.CSV_EXTS):
        return None

    try:
        import pandas as pd
    except ModuleNotFoundError as err:
        raise ModuleNotInstalledException("pandas") from err

    frames = []
    for csv_path in util.get_children(path):
        frame = pd.read_csv(csv_path)
        frame["Filename"] = os.path.basename(csv_path)
        frames.append(frame)

    # Concatenate once instead of once per file: repeated concatenation copies
    # all previously accumulated rows again on every iteration.
    # pd.concat rejects an empty list, hence the explicit fallback.
    df = pd.concat(frames) if frames else pd.DataFrame()

    schema = {}
    for name in df.columns:
        dtype = df[name].dtype
        if dtype == np.dtype("O"):
            # Assigning max_shape as the length of the longest string in the column.
            longest = int(df[name].str.len().max())
            schema[str(name)] = hub.schema.Text(shape=(None,), max_shape=(longest,))
        else:
            # NOTE: integer columns are always stored as primitives; no
            # ClassLabel inference is attempted for low-cardinality columns.
            schema[str(name)] = hub.schema.Primitive(dtype=dtype)

    @hub.transform(schema=schema, scheduler=scheduler, workers=workers)
    def upload_data(index, df):
        # Positional lookup: the combined frame keeps each file's own row
        # labels, so labels repeat across files and cannot identify a row.
        return {column: df[column].iloc[index] for column in df.columns}

    return upload_data(range(len(df)), df=df)
21 changes: 21 additions & 0 deletions hub/auto/tests/dummy_data/tabular/multiple_csv/MOCK_DATA(1).csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
numbers-int,first_name,last_name,email,gender,ip_address,dates,bools,floats
82,Clerkclaude,Hanhard,[email protected],Bigender,31.63.79.108,07/27/2020,true,99.16
37,Benoit,O'Howbane,[email protected],Female,215.244.49.130,08/27/2020,false,62.46
62,Xerxes,Melvin,[email protected],Polygender,89.187.117.126,06/18/2020,false,26.45
18,Cass,Advani,[email protected],Genderfluid,121.208.91.45,12/24/2020,false,67.27
47,Devondra,McKeown,[email protected],Male,78.251.197.82,12/02/2020,true,74.5
65,Giulio,Fyndon,[email protected],Non-binary,177.239.11.250,09/17/2020,true,5.34
48,Rickey,Aartsen,[email protected],Female,109.199.145.238,11/03/2020,true,21.19
55,Brittani,Lanegran,[email protected],Genderqueer,130.77.15.150,12/13/2020,false,17.5
39,Ignaz,Guthrum,[email protected],Bigender,170.171.105.200,12/21/2020,true,76.03
44,Edy,Joselevitz,[email protected],Male,48.20.190.15,02/26/2021,true,65.06
54,Norean,Bellows,[email protected],Male,61.230.54.69,03/20/2021,true,3.02
33,Gayelord,Slemming,[email protected],Bigender,145.36.49.39,11/22/2020,true,25.83
25,Mickie,Charnick,[email protected],Female,150.114.42.154,02/13/2021,false,69.47
3,Kandace,Laimable,[email protected],Female,68.176.159.187,04/06/2020,false,17.24
33,Pennie,Oherlihy,[email protected],Genderfluid,238.10.187.109,12/22/2020,false,18.97
90,Mavis,Dansey,[email protected],Genderqueer,56.43.255.218,07/08/2020,true,58.1
12,Graham,Lynde,[email protected],Male,95.76.178.78,04/23/2020,false,44.93
42,Margi,Mogey,[email protected],Male,166.143.90.60,06/21/2020,false,75.15
80,Pryce,McIlriach,[email protected],Agender,103.112.118.155,04/09/2020,false,52.84
15,Noelyn,Daintier,[email protected],Female,20.90.60.168,10/27/2020,false,54.33
21 changes: 21 additions & 0 deletions hub/auto/tests/dummy_data/tabular/multiple_csv/MOCK_DATA(2).csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
numbers-int,first_name,last_name,email,gender,ip_address,dates,bools,floats
74,Caril,Ivachyov,[email protected],Female,142.158.161.150,11/09/2020,false,29.9
10,Mariejeanne,Melliard,[email protected],Polygender,80.101.140.167,12/20/2020,false,65.95
4,Yorgos,Merwede,[email protected],Agender,182.72.245.157,08/23/2020,true,64.06
97,Padraic,Villa,[email protected],Polygender,161.157.81.174,05/20/2020,false,8.26
92,Mala,Riglar,[email protected],Non-binary,248.189.223.92,03/02/2021,true,91.44
60,Adeline,Harman,[email protected],Female,198.188.118.170,06/28/2020,true,6.27
92,Freemon,Kristoffersen,[email protected],Agender,209.190.54.219,10/25/2020,true,3.57
65,Ailyn,Stollberger,[email protected],Male,195.30.244.185,12/22/2020,false,70.19
77,Maurita,Galliford,[email protected],Female,219.217.109.60,06/29/2020,false,61.62
3,Maryjane,Pim,[email protected],Male,85.122.217.51,03/09/2021,false,86.56
28,Lindi,Sammut,[email protected],Agender,40.201.53.8,04/10/2020,false,97.34
59,Charmain,O'Hanley,[email protected],Genderqueer,161.182.192.170,11/05/2020,false,32.2
82,Hi,Dennerley,[email protected],Genderqueer,147.117.137.22,07/11/2020,true,20.47
75,Paige,Greenfield,[email protected],Bigender,204.156.98.221,07/05/2020,true,95.8
97,Nevins,Cauderlie,[email protected],Male,162.67.234.162,12/28/2020,false,33.47
29,Weidar,Tripon,[email protected],Bigender,188.185.111.88,04/26/2020,false,24.92
57,Aeriell,Serginson,[email protected],Polygender,153.17.192.218,05/13/2020,false,67.06
30,Rodney,Margaritelli,[email protected],Agender,130.42.155.142,09/08/2020,true,81.88
50,Jaynell,Penhall,[email protected],Female,95.33.7.215,11/21/2020,false,66.98
83,Ludovika,Strase,[email protected],Agender,210.124.168.67,02/18/2021,false,44.37
21 changes: 21 additions & 0 deletions hub/auto/tests/dummy_data/tabular/multiple_csv/MOCK_DATA(3).csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
numbers-int,first_name,last_name,email,gender,ip_address,dates,bools,floats
48,Fredericka,Bunnell,[email protected],Polygender,43.70.45.211,10/10/2020,false,95.83
24,Westbrook,Mottershaw,[email protected],Genderqueer,34.48.62.109,03/26/2020,true,25.66
66,Eziechiele,Anelay,[email protected],Genderfluid,33.156.31.251,01/07/2021,false,75.63
98,Eugine,Dimitresco,[email protected],Polygender,248.35.201.165,12/18/2020,false,7.84
10,Karl,Ashburner,[email protected],Polygender,128.65.41.105,02/01/2021,true,80.57
5,Coop,Hovenden,[email protected],Bigender,88.34.15.29,05/28/2020,true,9.37
63,Carolina,Gasken,[email protected],Bigender,78.91.95.197,01/05/2021,true,5.36
65,Timotheus,Cawt,[email protected],Male,182.161.61.241,07/08/2020,false,57.32
46,Donavon,Marks,[email protected],Polygender,251.36.162.60,01/07/2021,true,55.61
82,Ashien,Murrthum,[email protected],Female,28.170.24.185,07/04/2020,true,18.23
26,Kennie,Tupper,[email protected],Agender,124.125.127.40,02/25/2021,false,32.56
96,Alfie,Fake,[email protected],Genderfluid,3.176.48.231,03/10/2021,false,48.3
100,Nero,Laetham,[email protected],Agender,172.243.122.67,08/02/2020,true,99.43
68,Winnifred,Morin,[email protected],Female,230.164.39.124,11/30/2020,false,96.82
83,Toddy,Leppo,[email protected],Non-binary,64.51.228.117,05/23/2020,true,72.7
85,Nicky,Bickerton,[email protected],Female,200.122.113.159,05/14/2020,false,20.1
64,Livvyy,Sawbridge,[email protected],Female,210.235.210.46,09/30/2020,true,45.77
40,Feliks,Witherspoon,[email protected],Genderqueer,196.108.139.132,12/14/2020,false,15.35
91,Jennilee,Langran,[email protected],Genderqueer,119.163.64.88,07/03/2020,true,38.99
42,Sergei,Melling,[email protected],Genderqueer,86.43.228.247,09/24/2020,true,65.13
21 changes: 21 additions & 0 deletions hub/auto/tests/dummy_data/tabular/multiple_csv/MOCK_DATA(4).csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
numbers-int,first_name,last_name,email,gender,ip_address,dates,bools,floats
9,Scottie,Grgic,[email protected],Agender,134.190.188.212,04/30/2020,true,87.49
9,Betti,Dicty,[email protected],Agender,153.12.94.86,10/28/2020,true,33.4
71,Philbert,Bowle,[email protected],Genderqueer,106.153.246.45,05/12/2020,false,56.71
87,Inglis,French,[email protected],Female,161.187.57.178,04/18/2020,true,76.37
91,Robbyn,Moysey,[email protected],Female,127.54.160.235,09/06/2020,false,35.41
88,Esma,Bole,[email protected],Female,140.56.91.229,07/20/2020,true,80.89
63,Tabbie,Dymidowicz,[email protected],Genderqueer,165.93.228.17,02/05/2021,true,1.67
1,Cristobal,Petters,[email protected],Bigender,65.87.2.32,06/04/2020,true,81.32
52,Sydney,Onge,[email protected],Male,232.76.198.91,10/02/2020,true,8.09
28,Elmira,Linnard,[email protected],Genderqueer,127.1.50.197,07/22/2020,false,40.74
14,Eloise,Persence,[email protected],Polygender,161.184.245.206,03/10/2021,false,19.47
19,Alain,Clail,[email protected],Polygender,138.109.138.189,05/23/2020,true,98.84
26,Talbert,McTrustam,[email protected],Polygender,172.191.111.45,01/09/2021,false,94.39
16,Stevy,Friberg,[email protected],Polygender,201.111.86.66,11/11/2020,false,61.9
52,Jeni,Wegenen,[email protected],Non-binary,17.201.159.241,07/28/2020,false,68.53
24,Rena,Chopy,[email protected],Bigender,137.48.106.109,10/01/2020,false,75.8
50,Ricca,Balstone,[email protected],Genderqueer,237.185.146.50,07/08/2020,true,86.83
34,Ranique,Forder,[email protected],Female,234.45.155.78,06/27/2020,false,23.65
50,Rianon,Castan,[email protected],Polygender,49.8.163.80,11/30/2020,false,96.19
67,Clifford,Leftwich,[email protected],Male,149.252.71.18,06/19/2020,true,78.85
21 changes: 21 additions & 0 deletions hub/auto/tests/dummy_data/tabular/multiple_csv/MOCK_DATA.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
numbers-int,first_name,last_name,email,gender,ip_address,dates,bools,floats
19,Olympia,Samms,[email protected],Agender,192.71.74.138,02/12/2021,true,67.9
52,Raddie,Hundley,[email protected],Genderfluid,222.219.70.116,09/26/2020,false,33.75
93,Germayne,Cowpland,[email protected],Non-binary,244.57.127.208,04/14/2020,false,91.99
26,Celestia,Bulleyn,[email protected],Non-binary,29.252.104.82,04/20/2020,false,47.28
46,Erv,Storah,[email protected],Female,33.58.43.3,04/15/2020,false,17.81
99,Caitlin,Loughman,[email protected],Genderfluid,200.46.171.173,08/06/2020,true,80.97
60,Ty,Sugg,[email protected],Male,232.68.50.12,10/15/2020,false,18.64
89,Dacia,Manders,[email protected],Polygender,41.47.109.243,01/21/2021,true,85.51
92,Joletta,Morton,[email protected],Non-binary,225.186.212.25,11/06/2020,true,94.43
40,Keslie,Kerley,[email protected],Agender,133.153.91.156,11/13/2020,true,87.91
78,Brad,Ghirigori,[email protected],Polygender,245.87.236.239,01/22/2021,false,54.81
6,Selby,Cranage,[email protected],Polygender,110.114.120.26,03/07/2021,true,37.47
54,Kaitlin,Vasyutichev,[email protected],Bigender,254.115.252.9,11/02/2020,false,8.95
65,Irving,Crystal,[email protected],Female,143.78.253.210,01/15/2021,false,33.12
98,Loralee,Burel,[email protected],Genderqueer,252.57.231.20,01/26/2021,true,30.28
18,Maxwell,Reubel,[email protected],Genderfluid,39.149.112.20,07/07/2020,true,8.33
44,Honoria,Borrow,[email protected],Genderfluid,157.97.71.174,09/01/2020,true,24.45
61,Luther,Marc,[email protected],Male,223.202.31.42,04/29/2020,false,65.54
29,Jacinta,Karchewski,[email protected],Male,9.247.138.145,06/02/2020,true,66.04
36,Ashly,Jarnell,[email protected],Female,93.157.113.181,12/26/2020,false,23.9
21 changes: 21 additions & 0 deletions hub/auto/tests/dummy_data/tabular/single_csv/MOCK_DATA.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
numbers-int,first_name,last_name,email,gender,ip_address,dates,bools,floats
19,Olympia,Samms,[email protected],Agender,192.71.74.138,02/12/2021,true,67.9
52,Raddie,Hundley,[email protected],Genderfluid,222.219.70.116,09/26/2020,false,33.75
93,Germayne,Cowpland,[email protected],Non-binary,244.57.127.208,04/14/2020,false,91.99
26,Celestia,Bulleyn,[email protected],Non-binary,29.252.104.82,04/20/2020,false,47.28
46,Erv,Storah,[email protected],Female,33.58.43.3,04/15/2020,false,17.81
99,Caitlin,Loughman,[email protected],Genderfluid,200.46.171.173,08/06/2020,true,80.97
60,Ty,Sugg,[email protected],Male,232.68.50.12,10/15/2020,false,18.64
89,Dacia,Manders,[email protected],Polygender,41.47.109.243,01/21/2021,true,85.51
92,Joletta,Morton,[email protected],Non-binary,225.186.212.25,11/06/2020,true,94.43
40,Keslie,Kerley,[email protected],Agender,133.153.91.156,11/13/2020,true,87.91
78,Brad,Ghirigori,[email protected],Polygender,245.87.236.239,01/22/2021,false,54.81
6,Selby,Cranage,[email protected],Polygender,110.114.120.26,03/07/2021,true,37.47
54,Kaitlin,Vasyutichev,[email protected],Bigender,254.115.252.9,11/02/2020,false,8.95
65,Irving,Crystal,[email protected],Female,143.78.253.210,01/15/2021,false,33.12
98,Loralee,Burel,[email protected],Genderqueer,252.57.231.20,01/26/2021,true,30.28
18,Maxwell,Reubel,[email protected],Genderfluid,39.149.112.20,07/07/2020,true,8.33
44,Honoria,Borrow,[email protected],Genderfluid,157.97.71.174,09/01/2020,true,24.45
61,Luther,Marc,[email protected],Male,223.202.31.42,04/29/2020,false,65.54
29,Jacinta,Karchewski,[email protected],Male,9.247.138.145,06/02/2020,true,66.04
36,Ashly,Jarnell,[email protected],Female,93.157.113.181,12/26/2020,false,23.9
87 changes: 87 additions & 0 deletions hub/auto/tests/test_tabular_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import json
import os
import shutil
import zipfile
from pathlib import PosixPath

import hub
import numpy as np
import pytest
from hub.auto.tests.util import get_dataset_store
import pandas as pd
from hub.utils import pandas_loaded
from hub.auto.util import get_children


def assert_conversion(tag):
    """Convert a dummy tabular dataset with hub.auto and check it against pandas.

    The expected table is built by reading every child of the dataset store
    with ``pandas.read_csv``, adding a ``Filename`` column with the file's
    base name, and stacking the results. The dataset produced by
    ``hub.Dataset.from_path`` must then match it in length, column names,
    values, dtypes and order of distinct filenames.

    Args:
        tag: Dataset identifier handed to ``get_dataset_store``
            (e.g. ``"tabular/single_csv"``).

    Raises:
        AssertionError: If any comparison fails.
        Exception: Whatever ``hub.Dataset.from_path`` raises is propagated
            unchanged so the test report shows the real cause.
    """
    dataset_store = get_dataset_store(tag)
    hub_dir = dataset_store / "hub"

    # delete hub dataset so conversion test can be done
    if hub_dir.is_dir():
        print("hub_dir was found (%s), deleting..." % hub_dir)
        shutil.rmtree(hub_dir)

    # Build the reference table before converting, while the store only
    # contains the source files.
    frames = []
    for csv_path in get_children(dataset_store):
        frame = pd.read_csv(csv_path)
        frame["Filename"] = os.path.basename(csv_path)
        frames.append(frame)
    # pd.concat rejects an empty list, hence the explicit fallback.
    df = pd.concat(frames) if frames else pd.DataFrame()

    # No try/except here: a conversion error should fail the test with its
    # own traceback instead of an uninformative ``assert False``.
    ds = hub.Dataset.from_path(str(dataset_store))

    print("dataset obj:", ds)
    assert ds is not None

    assert hub_dir.is_dir(), hub_dir

    # df = Pandas dataframe, ds = Dataset obtained from hub.auto
    assert ds.shape == (df.shape[0],)

    # Checking if the column names are the same.
    # Each dataset key carries one leading character (presumably "/") that
    # the DataFrame column names do not have.
    keys_csv_parser = [key[1:] for key in ds.keys]
    keys_df = list(df.columns)
    assert keys_csv_parser == keys_df

    # Checking if all elements are parsed correctly
    for name in keys_df:
        if df[name].dtype == np.dtype("O"):
            # Object (text) columns are read back one sample at a time.
            column = [ds[name, row].compute() for row in range(df.shape[0])]
        else:
            column = ds[name].compute()
        assert list(column) == list(df[name])

    # Checking if the datatypes of the columns match
    for name in keys_csv_parser:
        if df[name].dtype == np.dtype("O"):
            # Text columns are expected to report int64 as their dtype.
            assert ds[name].dtype == np.dtype("int64")
        else:
            assert ds[name].dtype == df[name].dtype

    # Checking if all the filenames are parsed correctly
    # (distinct names, in order of first appearance).
    list_names = []
    for row in range(len(ds)):
        filename = ds["Filename", row].compute()
        if filename not in list_names:
            list_names.append(filename)
    assert list(df["Filename"].unique()) == list_names


@pytest.mark.skipif(not pandas_loaded(), reason="requires pandas to be loaded")
def test_class_sample_single_csv():
    """Run the conversion checks on the fixture containing one CSV file."""
    assert_conversion("tabular/single_csv")


@pytest.mark.skipif(not pandas_loaded(), reason="requires pandas to be loaded")
def test_class_sample_multiple_csv():
    """Run the conversion checks on the fixture containing several CSV files."""
    assert_conversion("tabular/multiple_csv")
1 change: 1 addition & 0 deletions hub/auto/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

IGNORE_EXTS = [".DS_Store"]
IMAGE_EXTS = [".jpg", ".png", ".jpeg"]
CSV_EXTS = [".csv"]


class DirectoryParserState:
Expand Down
14 changes: 12 additions & 2 deletions hub/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
"""

from math import gcd
import time
from collections import abc
from math import gcd

from numpy.lib.arraysetops import isin

from hub.exceptions import ShapeLengthException
from hub import defaults
from hub.exceptions import ShapeLengthException


def _flatten(list_):
Expand Down Expand Up @@ -77,6 +77,16 @@ def minio_creds_exist():
return env1 is not None and env2 is not None


def pandas_loaded():
    """Return True if pandas can be imported, False if the import fails."""
    try:
        import pandas

        # Attribute access stays inside the try; only ImportError is handled,
        # anything else propagates to the caller.
        pandas.__version__
    except ImportError:
        available = False
    else:
        available = True
    return available


def pytorch_loaded():
try:
import torch
Expand Down
1 change: 1 addition & 0 deletions requirements-optional.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
tensorflow==2.4.1
torch>=1,<2
pandas>=1.0
ray>=1.0
transformers>=3.5.1
dask[complete]>=2.30
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ humbug>=0.1.6, <0.2
Pillow>=6
msgpack>=0.6
psutil>=5.8 # needed only for deprecated code
cachey>=0.2.1 # needed only for deprecated code
cachey>=0.2.1 # needed only for deprecated code