Skip to content

Commit

Permalink
Adding a Vanilla dataset class. This can be used for typical classification problems, using a set of (X, Y) samples.
Browse files Browse the repository at this point in the history
  • Loading branch information
gugarosa committed Feb 18, 2019
1 parent 0772942 commit 376bbd3
Show file tree
Hide file tree
Showing 3 changed files with 202 additions and 13 deletions.
28 changes: 28 additions & 0 deletions examples/datasets/create_vanilla.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import nalp.stream.loader as l
import nalp.stream.preprocess as p
import numpy as np
from nalp.datasets.vanilla import Vanilla

# Loads an input .csv into a dataframe
csv = l.load_csv('data/16k_twitter_en.csv')

# Creates a pre-processing pipeline (lower-casing, filtering invalid
# characters and tokenizing each text into a list of words)
pipe = p.pipeline(
    p.lower_case,
    p.valid_char,
    p.tokenize_to_word
)

# Transforming dataframe columns into samples and labels
X = csv['text']
Y = csv['sentiment'].values

# Applying pre-processing pipeline to X
# Series.apply accepts the callable directly; no lambda wrapper is needed
X = X.apply(pipe).values

# Creates the dataset with one-hot (categorical) encoded labels
d = Vanilla(X, Y, categorical=True)

# Accessing properties from the Vanilla class
# Note that every property can be accessed; please refer to the docs to know all of them
# Use the public `index_labels` property rather than the private `_index_labels`
print(f'Vanilla -> X[0]: {d.X[0]} | Y[0]: {d.Y[0]} | Label: {d.index_labels[np.argmax(d.Y[0])]}')
55 changes: 42 additions & 13 deletions nalp/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@ class Dataset:
Y (np.array): Target samples.
Methods:
_build_properties(tokens): Builds all properties if there are any tokens.
vocab_to_index(vocab): Maps a vocabulary to integer indexes.
index_to_vocab(vocab): Maps integer indexes to a vocabulary.
indexate_tokens(tokens, vocab_index): Indexates tokens based on a previous defined vocabulary.
create_batches(X, Y, batch_size): Creates an iterator for feeding (X, Y) batches to the network.
"""

def __init__(self, tokens=None):
    """Initialization method.

    Some basic shared variables and methods between Datasets's childs
    should be declared here.

    Args:
        tokens (list, optional): A list holding tokenized words or characters.
            If None, all properties are left as placeholders for later
            population via _build_properties().

    """

    # List of tokens
    self._tokens = None

    # The size of the vocabulary
    self._vocab_size = None

    # A dictionary mapping vocabulary to indexes
    self._vocab_index = None

    # A dictionary mapping indexes to vocabulary
    self._index_vocab = None

    # The indexated tokens
    self._tokens_idx = None

    # Defining inputs placeholder for further filling
    self._X = None

    # We also need to define the labels placeholder
    self._Y = None

    # Checking if there are any tokens
    if tokens:
        # If yes, build class properties
        self._build_properties(tokens)

@property
def tokens(self):
"""A list holding tokenized words or characters.
Expand Down Expand Up @@ -110,6 +115,30 @@ def Y(self):

return self._Y

def _build_properties(self, tokens):
    """Builds all properties if there are any tokens.

    Args:
        tokens (list): A list holding tokenized words or characters.

    """

    # Firstly, we need to define a tokens property
    self._tokens = tokens

    # Calculates the vocabulary and its size from tokens
    # sorted() makes the token -> index mapping reproducible across runs;
    # the iteration order of a plain set changes with hash randomization
    vocab = sorted(set(tokens))
    self._vocab_size = len(vocab)

    # Creates a dictionary mapping vocabulary to indexes
    self._vocab_index = self.vocab_to_index(vocab)

    # Creates a dictionary mapping indexes to vocabulary
    self._index_vocab = self.index_to_vocab(vocab)

    # Indexate tokens based on a vocabulary-index dictionary
    self._tokens_idx = self.indexate_tokens(tokens, self._vocab_index)

def vocab_to_index(self, vocab):
"""Maps a vocabulary to integer indexes.
Expand Down
132 changes: 132 additions & 0 deletions nalp/datasets/vanilla.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import nalp.utils.logging as l
import numpy as np
from nalp.core.dataset import Dataset

logger = l.get_logger(__name__)


class Vanilla(Dataset):
    """A Vanilla dataset can be seen as a regular dataset, composed by inputs and labels (X, Y).

    Note that the inputs have to be tokenized prior to instantiating this class.

    Properties:
        unique_labels (set): Set of unique labels.
        n_class (int): Number of classes, derived from list of labels.
        labels_index (dict): A dictionary mapping labels to indexes.
        index_labels (dict): A dictionary mapping indexes to labels.

    Methods:
        _labels_to_categorical(labels): Maps labels into a categorical encoding.

    """

    def __init__(self, tokens, labels, categorical=True):
        """Initialization method.

        Args:
            tokens (list): A list holding tokenized words or characters.
            labels (list): A list holding the labels for each sample.
            categorical (boolean): If yes, apply categorical encoding to labels.

        """

        logger.info('Overriding class: Dataset -> Vanilla.')

        # Overrides its parent class with any custom arguments if needed
        super(Vanilla, self).__init__()

        # List of unique labels
        self._unique_labels = None

        # Number of classes, derived from list of labels
        self._n_class = None

        # A dictionary mapping labels to indexes
        self._labels_index = None

        # A dictionary mapping indexes to labels
        self._index_labels = None

        # Populating X from list of tokens
        self._X = tokens

        # Check if categorical boolean is true
        if categorical:
            # If yes, calls method to convert string or integer labels into categorical
            self._Y = self._labels_to_categorical(labels)
        else:
            # If not, just apply to property
            self._Y = labels

        # Logging some important information
        # np.shape() also works when X or Y is a plain list; accessing a
        # .shape attribute directly would raise AttributeError in that case
        logger.debug(
            f'X: {np.shape(self._X)} | Y: {np.shape(self._Y)}.')

        logger.info('Class overrided.')

    @property
    def unique_labels(self):
        """set: Set of unique labels gathered from the input samples."""

        return self._unique_labels

    @property
    def n_class(self):
        """int: Number of classes, derived from the set of unique labels."""

        return self._n_class

    @property
    def labels_index(self):
        """dict: A dictionary mapping labels to indexes."""

        return self._labels_index

    @property
    def index_labels(self):
        """dict: A dictionary mapping indexes to labels."""

        return self._index_labels

    def _labels_to_categorical(self, labels):
        """Maps labels into a categorical (one-hot) encoding.

        Args:
            labels (list): A list holding the labels for each sample.

        Returns:
            Categorical encoding of list of labels, shaped (n_samples, n_class).

        """

        # Gathering unique labels
        self._unique_labels = set(labels)

        # We also need the number of classes
        self._n_class = len(self._unique_labels)

        # Sorting makes the label -> index mapping deterministic across runs;
        # the iteration order of a plain set changes with hash randomization
        ordered_labels = sorted(self._unique_labels)

        # Creating a dictionary to map labels to indexes
        self._labels_index = {c: i for i, c in enumerate(ordered_labels)}

        # Creating a dictionary to map indexes to labels
        self._index_labels = {i: c for i, c in enumerate(ordered_labels)}

        # Creating a numpy array to hold categorical labels
        categorical_labels = np.zeros(
            (len(labels), self._n_class), dtype=np.int32)

        # Iterating through all labels
        # `label` (not `l`) avoids shadowing the module's logging alias
        for i, label in enumerate(labels):
            # Setting the single hot position directly avoids building a
            # full identity matrix (np.eye) on every iteration
            categorical_labels[i, self._labels_index[label]] = 1

        return categorical_labels

0 comments on commit 376bbd3

Please sign in to comment.