Skip to content

Commit

Permalink
Adding a Vanilla dataset class. This can be used for typical classification problems, using a set of (X, Y) samples.
Browse files Browse the repository at this point in the history
  • Loading branch information
gugarosa committed Feb 18, 2019
1 parent 0772942 commit 376bbd3
Show file tree
Hide file tree
Showing 3 changed files with 202 additions and 13 deletions.
28 changes: 28 additions & 0 deletions examples/datasets/create_vanilla.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import nalp.stream.loader as l
import nalp.stream.preprocess as p
import numpy as np
from nalp.datasets.vanilla import Vanilla

# Loads an input .csv into a dataframe
csv = l.load_csv('data/16k_twitter_en.csv')

# Creates a pre-processing pipeline (lower-casing, filtering invalid
# characters and tokenizing each text into a list of words)
pipe = p.pipeline(
    p.lower_case,
    p.valid_char,
    p.tokenize_to_word
)

# Transforming dataframe columns into samples and labels
X = csv['text']
Y = csv['sentiment'].values

# Applying pre-processing pipeline to X
# Series.apply accepts the callable directly; no lambda wrapper is needed
X = X.apply(pipe).values

# Creates the dataset with one-hot (categorical) encoded labels
d = Vanilla(X, Y, categorical=True)

# Accessing properties from the Vanilla class
# Note that every property can be accessed; please refer to the docs to know all of them
# Use the public `index_labels` property rather than the private `_index_labels`
print(f'Vanilla -> X[0]: {d.X[0]} | Y[0]: {d.Y[0]} | Label: {d.index_labels[np.argmax(d.Y[0])]}')
55 changes: 42 additions & 13 deletions nalp/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@ class Dataset:
Y (np.array): Target samples.
Methods:
_build_properties(tokens): Builds all properties if there are any tokens.
vocab_to_index(vocab): Maps a vocabulary to integer indexes.
index_to_vocab(vocab): Maps integer indexes to a vocabulary.
indexate_tokens(tokens, vocab_index): Indexates tokens based on a previous defined vocabulary.
create_batches(X, Y, batch_size): Creates an iterator for feeding (X, Y) batches to the network.
"""

def __init__(self, tokens=None):
    """Initialization method.

    Some basic shared variables and methods between Datasets's childs
    should be declared here.

    Args:
        tokens (list, optional): A list holding tokenized words or characters.
            If None, all properties are left as placeholders for later
            population via _build_properties().

    """

    # List of tokens
    self._tokens = None

    # The size of the vocabulary
    self._vocab_size = None

    # A dictionary mapping vocabulary to indexes
    self._vocab_index = None

    # A dictionary mapping indexes to vocabulary
    self._index_vocab = None

    # The indexated tokens
    self._tokens_idx = None

    # Defining inputs placeholder for further filling
    self._X = None

    # We also need to define the labels placeholder
    self._Y = None

    # Checking if there are any tokens
    if tokens:
        # If yes, build class properties
        self._build_properties(tokens)

@property
def tokens(self):
"""A list holding tokenized words or characters.
Expand Down Expand Up @@ -110,6 +115,30 @@ def Y(self):

return self._Y

def _build_properties(self, tokens):
    """Builds all properties if there are any tokens.

    Args:
        tokens (list): A list holding tokenized words or characters.

    """

    # Firstly, we need to define a tokens property
    self._tokens = tokens

    # Calculates the vocabulary and its size from tokens
    # sorted() makes the token -> index mapping reproducible across runs;
    # the iteration order of a plain set changes with hash randomization
    vocab = sorted(set(tokens))
    self._vocab_size = len(vocab)

    # Creates a dictionary mapping vocabulary to indexes
    self._vocab_index = self.vocab_to_index(vocab)

    # Creates a dictionary mapping indexes to vocabulary
    self._index_vocab = self.index_to_vocab(vocab)

    # Indexate tokens based on a vocabulary-index dictionary
    self._tokens_idx = self.indexate_tokens(tokens, self._vocab_index)

def vocab_to_index(self, vocab):
"""Maps a vocabulary to integer indexes.
Expand Down
132 changes: 132 additions & 0 deletions nalp/datasets/vanilla.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import nalp.utils.logging as l
import numpy as np
from nalp.core.dataset import Dataset

logger = l.get_logger(__name__)


class Vanilla(Dataset):
    """A Vanilla dataset can be seen as a regular dataset, composed by inputs and labels (X, Y).

    Note that the inputs have to be tokenized prior to instantiating this class.

    Properties:
        unique_labels (set): Set of unique labels.
        n_class (int): Number of classes, derived from list of labels.
        labels_index (dict): A dictionary mapping labels to indexes.
        index_labels (dict): A dictionary mapping indexes to labels.

    Methods:
        _labels_to_categorical(labels): Maps labels into a categorical encoding.

    """

    def __init__(self, tokens, labels, categorical=True):
        """Initialization method.

        Args:
            tokens (list): A list holding tokenized words or characters.
            labels (list): A list holding the labels for each sample.
            categorical (boolean): If yes, apply categorical encoding to labels.

        """

        logger.info('Overriding class: Dataset -> Vanilla.')

        # Overrides its parent class with any custom arguments if needed
        super(Vanilla, self).__init__()

        # List of unique labels
        self._unique_labels = None

        # Number of classes, derived from list of labels
        self._n_class = None

        # A dictionary mapping labels to indexes
        self._labels_index = None

        # A dictionary mapping indexes to labels
        self._index_labels = None

        # Populating X from list of tokens
        self._X = tokens

        # Check if categorical boolean is true
        if categorical:
            # If yes, calls method to convert string or integer labels into categorical
            self._Y = self._labels_to_categorical(labels)
        else:
            # If not, just apply to property
            self._Y = labels

        # Logging some important information
        # np.shape() also works when X or Y is a plain list; accessing a
        # .shape attribute directly would raise AttributeError in that case
        logger.debug(
            f'X: {np.shape(self._X)} | Y: {np.shape(self._Y)}.')

        logger.info('Class overrided.')

    @property
    def unique_labels(self):
        """set: Set of unique labels gathered from the input samples."""

        return self._unique_labels

    @property
    def n_class(self):
        """int: Number of classes, derived from the set of unique labels."""

        return self._n_class

    @property
    def labels_index(self):
        """dict: A dictionary mapping labels to indexes."""

        return self._labels_index

    @property
    def index_labels(self):
        """dict: A dictionary mapping indexes to labels."""

        return self._index_labels

    def _labels_to_categorical(self, labels):
        """Maps labels into a categorical (one-hot) encoding.

        Args:
            labels (list): A list holding the labels for each sample.

        Returns:
            Categorical encoding of list of labels, shaped (n_samples, n_class).

        """

        # Gathering unique labels
        self._unique_labels = set(labels)

        # We also need the number of classes
        self._n_class = len(self._unique_labels)

        # Sorting makes the label -> index mapping deterministic across runs;
        # the iteration order of a plain set changes with hash randomization
        ordered_labels = sorted(self._unique_labels)

        # Creating a dictionary to map labels to indexes
        self._labels_index = {c: i for i, c in enumerate(ordered_labels)}

        # Creating a dictionary to map indexes to labels
        self._index_labels = {i: c for i, c in enumerate(ordered_labels)}

        # Creating a numpy array to hold categorical labels
        categorical_labels = np.zeros(
            (len(labels), self._n_class), dtype=np.int32)

        # Iterating through all labels
        # `label` (not `l`) avoids shadowing the module's logging alias
        for i, label in enumerate(labels):
            # Setting the single hot position directly avoids building a
            # full identity matrix (np.eye) on every iteration
            categorical_labels[i, self._labels_index[label]] = 1

        return categorical_labels

0 comments on commit 376bbd3

Please sign in to comment.