How to use text_dataset_from_directory in TensorFlow

This tutorial explains how to use the text_dataset_from_directory utility in TensorFlow. The text_dataset_from_directory utility generates a `tf.data.Dataset` from text files in a directory. To use text_dataset_from_directory, the directory structure should be as follows:
   
main_directory/
...class_a/
......a_text_1.txt
......a_text_2.txt
...class_b/
......b_text_1.txt
......b_text_2.txt
 

Let's understand text_dataset_from_directory with the example below. In this example we will create a labelled tf.data.Dataset for the IMDB movie review dataset using text_dataset_from_directory.

Download the IMDB dataset
   
import tensorflow as tf
import os
import shutil

dataset_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file("aclImdb_v1", dataset_url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

 
View the files and folders in the downloaded dataset
   
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)
 

Prepare dataset for binary classification

To prepare a dataset for binary classification, we need two folders on disk, corresponding to class_a and class_b. These will be the positive and negative movie reviews, which can be found in aclImdb/train/pos and aclImdb/train/neg. The IMDB training directory also contains an additional unsup folder of unlabelled reviews, which we need to remove before using text_dataset_from_directory.

   
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

os.listdir(train_dir)
 
Create a labelled tf.data.Dataset using text_dataset_from_directory
   
batch_size = 64
seed = 32

training_dataset = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', 
    batch_size=batch_size, 
    validation_split=0.3, 
    subset='training', 
    seed=seed)
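
Here validation_split=0.3 holds out 30% of the files and subset='training' returns the remaining 70%. The matching validation dataset can be created with the same call by passing subset='validation' and the same seed, so the two subsets do not overlap. The snippet below is a minimal sketch of this and is not part of the original example.

validation_dataset = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.3,
    subset='validation',
    seed=seed)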
 
Check labels for corresponding classes
   
for i, label in enumerate(training_dataset.class_names):
  print("Label", i, "corresponds to", label)
 
View labels and reviews for a few examples
   
for text_batch, label_batch in training_dataset.take(1):
  for i in range(10):
    print("Review", text_batch.numpy()[i])
    print("Label", label_batch.numpy()[i])
 
Complete code snippet
   
# Import required libraries
import tensorflow as tf
import os
import shutil

# Download IMDB dataset
dataset_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file("aclImdb_v1", dataset_url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

# Explore dataset files and folders
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

# Remove extra folders
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

os.listdir(train_dir)

# Create labelled dataset
batch_size = 64
seed = 32

training_dataset = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', 
    batch_size=batch_size, 
    validation_split=0.3, 
    subset='training', 
    seed=seed)

# Explore examples
for i, label in enumerate(training_dataset.class_names):
  print("Label", i, "corresponds to", label)

for text_batch, label_batch in training_dataset.take(1):
  for i in range(10):
    print("Review", text_batch.numpy()[i])
    print("Label", label_batch.numpy()[i])

 
