In TensorFlow, categorical values can be transformed into one-hot-encoded vectors by combining the tf.feature_column.categorical_column_with_vocabulary_list function with the tf.feature_column.indicator_column function.
import tensorflow as tf
import pandas as pd
# Download the sample CSV into a DataFrame and preview its first rows.
DATASET_URL = 'https://storage.googleapis.com/gcptutorials.com/dataset/sample.csv'
df = pd.read_csv(DATASET_URL)
print(df.head())
======Output======
survived sex age n_siblings_spouses parch fare class deck embark_town alone
0 0 male 22.0 1 0 7.2500 Third unknown Southampton n
1 1 female 38.0 1 0 71.2833 First C Cherbourg n
2 1 female 26.0 0 0 7.9250 Third unknown Southampton y
3 1 female 35.0 1 0 53.1000 First C Southampton n
4 0 male 28.0 0 0 8.4583 Third unknown Queenstown y
# Columns holding discrete values; each one is one-hot encoded below.
CAT_COLUMNS = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck',
               'embark_town', 'alone']
# Columns fed to the model as float32 numeric features.
NUM_COLUMNS = ['age', 'fare']

# NOTE(review): tf.feature_column appears to be deprecated in recent
# TensorFlow releases in favour of Keras preprocessing layers -- confirm
# against the TensorFlow version in use before reusing this snippet.
feature_cols = []

# Create an IndicatorColumn for each categorical feature. The vocabulary is
# the set of distinct values found in the DataFrame column, so the width of
# each one-hot vector equals the number of distinct values in that column.
for feature in CAT_COLUMNS:
    vocab = df[feature].unique()
    feature_cols.append(tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(feature, vocab)))

# Create a NumericColumn for each numerical feature.
for feature in NUM_COLUMNS:
    feature_cols.append(tf.feature_column.numeric_column(feature, dtype=tf.float32))

print(feature_cols)
=======Output======
[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]
[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]
# One-hot encode the first row's 'sex' value (feature_cols[0]).
row = dict(df.head(1))
sex_layer = tf.keras.layers.DenseFeatures(feature_cols[0])
sex_layer(row).numpy()
======Output======
array([[1., 0.]], dtype=float32)
# One-hot encode the first row's 'n_siblings_spouses' value (feature_cols[1]).
row = dict(df.head(1))
siblings_layer = tf.keras.layers.DenseFeatures(feature_cols[1])
siblings_layer(row).numpy()
======Output======
array([[1., 0., 0., 0., 0., 0., 0.]], dtype=float32)
# One-hot encode the first row's 'parch' value (feature_cols[2]).
row = dict(df.head(1))
parch_layer = tf.keras.layers.DenseFeatures(feature_cols[2])
parch_layer(row).numpy()
======Output======
array([[1., 0., 0., 0., 0., 0.]], dtype=float32)
# One-hot encode the first row's 'class' value (feature_cols[3]).
row = dict(df.head(1))
class_layer = tf.keras.layers.DenseFeatures(feature_cols[3])
class_layer(row).numpy()
======Output======
array([[1., 0., 0.]], dtype=float32)
# One-hot encode the first row's 'deck' value (feature_cols[4]).
row = dict(df.head(1))
deck_layer = tf.keras.layers.DenseFeatures(feature_cols[4])
deck_layer(row).numpy()
======Output======
array([[1., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)
# One-hot encode the first row's 'embark_town' value (feature_cols[5]).
row = dict(df.head(1))
town_layer = tf.keras.layers.DenseFeatures(feature_cols[5])
town_layer(row).numpy()
======Output======
array([[1., 0., 0., 0.]], dtype=float32)
# One-hot encode the first row's 'alone' value (feature_cols[6]).
row = dict(df.head(1))
alone_layer = tf.keras.layers.DenseFeatures(feature_cols[6])
alone_layer(row).numpy()
======Output======
array([[1., 0.]], dtype=float32)
# Encode the first row with every feature column at once, producing a single
# dense vector. The result is not in feature_cols list order: it starts with
# age (22.) and fare (7.25) sits mid-vector, which is consistent with the
# columns being laid out by key name -- verify against the DenseFeatures docs.
row = dict(df.head(1))
all_features_layer = tf.keras.layers.DenseFeatures(feature_cols)
all_features_layer(row).numpy()
======Output======
array([[22. , 1. , 0. , 1. , 0. , 0. , 1. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. , 1. , 0. , 0. , 0. ,
7.25, 1. , 0. , 0. , 0. , 0. , 0. , 0. , 1. ,
0. , 0. , 0. , 0. , 0. , 1. , 0. ]], dtype=float32)
Category: TensorFlow