Multimodal AI combines different types of data (like text, images, and sound) to make better decisions. This tutorial will guide you through creating a simple multimodal system that understands both pictures and words.
We'll use these libraries:
# Install essential packages
!pip install torch torchvision transformers Pillow librosa
# Verify installation
import torch
print(f"PyTorch version: {torch.__version__}") # Should output 2.0+
We'll start with visual question answering (VQA): the model takes an image plus a text question and returns an answer with a confidence score.
from transformers import pipeline
from PIL import Image
# Initialize the VQA pipeline
vqa_pipeline = pipeline("visual-question-answering",
                        model="dandelin/vilt-b32-finetuned-vqa")
# Load and analyze an image
image = Image.open("cat.jpg")  # Replace with your image
question = "What animal is in the picture?"
# The pipeline returns a ranked list of answers; take the top one
result = vqa_pipeline(image, question)[0]
print(f"Answer: {result['answer']} (Confidence: {result['score']:.2f})")
Next, we convert speech to text with a pretrained Wav2Vec2 model:
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
# Load speech recognition model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
# Process audio file
audio, sampling_rate = librosa.load("speech.wav", sr=16000)
inputs = processor(audio, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
# Convert speech to text (no gradients needed for inference)
with torch.no_grad():
    logits = model(**inputs).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)[0]
print(f"You said: {transcription}")  # Note: Wav2Vec2 outputs uppercase text
To fuse the modalities, we project each feature vector into a shared 256-dimensional space and average the results before classification:
import torch.nn as nn
class MultimodalModel(nn.Module):
    def __init__(self):
        super().__init__()
        # Project each modality's features into a shared 256-dim space
        self.text_layer = nn.Linear(768, 256)    # 768 = BERT/DistilBERT hidden size
        self.image_layer = nn.Linear(1024, 256)  # adjust to your image encoder's feature size
        self.audio_layer = nn.Linear(320, 256)   # adjust to your audio encoder's feature size
        # Combined decision layer
        self.combined = nn.Linear(256, 10)       # 10 output classes

    def forward(self, text, image, audio):
        # Project each input type into the shared space
        text_features = self.text_layer(text)
        image_features = self.image_layer(image)
        audio_features = self.audio_layer(audio)
        # Combine features by averaging
        merged = (text_features + image_features + audio_features) / 3
        # Final classification
        return self.combined(merged)
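A quick way to sanity-check the fusion model is to pass in random tensors shaped like the encoder outputs it expects; this is a smoke test with dummy data, not real features:
# Smoke test with random tensors standing in for real encoder outputs
fusion_model = MultimodalModel()
dummy_text = torch.randn(4, 768)    # batch of 4 "text" feature vectors
dummy_image = torch.randn(4, 1024)  # batch of 4 "image" feature vectors
dummy_audio = torch.randn(4, 320)   # batch of 4 "audio" feature vectors
outputs = fusion_model(dummy_text, dummy_image, dummy_audio)
print(outputs.shape)  # torch.Size([4, 10])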
Finally, we wrap the preprocessing needed to turn raw files into model-ready tensors inside an assistant class:
from torchvision import transforms
class AIAssistant:
    def __init__(self):
        # Image preprocessing pipeline
        self.image_transform = transforms.Compose([
            transforms.Resize(256),        # Resize the shorter side to 256 px
            transforms.CenterCrop(224),    # Crop to the 224x224 size most image models expect
            transforms.ToTensor(),         # Convert to a tensor in [0, 1]
            transforms.Normalize(          # Normalize with ImageNet statistics
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
        ])
        # Text feature extractor
        self.text_processor = pipeline(
            "feature-extraction",
            model="distilbert-base-uncased"
        )

    def process_inputs(self, image_path, text):
        try:
            # Process image (convert to RGB so there are always 3 channels)
            image = Image.open(image_path).convert("RGB")
            image_tensor = self.image_transform(image).unsqueeze(0)
            # Process text: the pipeline returns one embedding per token
            text_features = torch.tensor(self.text_processor(text)[0])
            return image_tensor, text_features
        except Exception as e:
            print(f"Error processing inputs: {e}")
            return None, None
Before feeding data to a model, validate the inputs:
def validate_inputs(image, text):
    # Check image properties
    assert image is not None, "No image provided"
    assert image.shape[1] == 3, "Invalid color channels"
    # Check text requirements
    assert isinstance(text, str), "Text must be a string"
    assert len(text) >= 3, "Text too short"
    return True
# Example usage: validate the raw text alongside the processed image tensor
try:
    assistant = AIAssistant()
    text = "A furry animal"
    image_tensor, text_features = assistant.process_inputs("cat.jpg", text)
    validate_inputs(image_tensor, text)
    print("All inputs valid!")
except AssertionError as e:
    print(f"Validation error: {e}")
This guide has walked you through creating a basic multimodal AI system. Remember to start with small experiments and gradually increase complexity as you become more comfortable with the components.