image-Captioning / src /data_loader.py
Anki2004's picture
Upload 117 files
c90c141 verified
# import numpy as np
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
# from tensorflow.keras.preprocessing.image import load_img, img_to_array
# from tensorflow.keras.models import Model
# def load_data(captions_file, image_dir):
# with open(captions_file, 'r') as f:
# captions = f.read().split('\n')
# img_to_captions = {}
# for caption in captions:
# if caption.strip(): # Skip empty lines
# parts = caption.split(',')
# if len(parts) >= 2:
# img = parts[0].strip()
# cap = ','.join(parts[1:]).strip() # Join all parts after the first comma
# if img not in img_to_captions:
# img_to_captions[img] = []
# img_to_captions[img].append(cap)
# else:
# print(f"Skipping invalid line: {caption}")
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts([cap for caps in img_to_captions.values() for cap in caps])
# inception = InceptionV3(weights = 'imagenet')
# inception_model = Model(inception.input, inception.layers[-2].output)
# img_features = {}
# for img in img_to_captions.keys():
# img_path = f'{image_dir}/{img}'
# img = load_img(img_path, target_size = (299, 299))
# img = img_to_array(img)
# img = np.expand_dims(img, axis = 0)
# img = preprocess_input(img)
# features = inception_model.predict(img)
# img_features[img] = features
# X1, X2, y = [], [], []
# for img, caps in img_to_captions.items():
# for cap in caps:
# seq = tokenizer.texts_to_sequences([cap])[0]
# for i in range(1, len(seq)):
# in_seq, out_seq= seq[:i], seq[i]
# in_seq = pad_sequences([in_seq], maxlen = 34)[0]
# out_seq = to_categorical([out_seq], num_classes = len(tokenizer.word_index)+1)[0]
# X1.append(img_features[img][0])
# X2.append(in_seq)
# y.append(out_seq)
# X1, X2, y = np.array(X1), np.array(X2), np.array(y)
# split = int(0.8 * len(X1))
# train_data = ([X1[:split], X2[:split]], y[:split])
# val_data = ([X1[split:], X2[split:]], y[split:])
# return train_data, val_data, tokenizer
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from PIL import Image
import os
def _parse_captions(captions_file):
    """Read ``image,caption`` lines and group the captions by image name.

    Everything before the first comma is the image name; the remainder of the
    line (which may itself contain commas) is the caption. Blank lines are
    skipped, and lines without any comma are reported and skipped.

    Returns:
        dict mapping image name -> list of caption strings, in file order.
    """
    img_to_captions = {}
    # Explicit encoding so captions decode the same way on every platform.
    with open(captions_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.rstrip('\n')
            if not line.strip():
                continue  # Skip empty lines
            img, sep, cap = line.partition(',')
            if not sep:
                print(f"Skipping invalid line: {line}")
                continue
            img_to_captions.setdefault(img.strip(), []).append(cap.strip())
    return img_to_captions


def _extract_image_features(image_names, images_dir):
    """Run every image found under ``images_dir`` through InceptionV3.

    The network is cut at its second-to-last layer, so the stored value for
    each image is that layer's output for a batch containing just that image.
    Names that do not resolve to a regular file are reported and left out of
    the result.

    Returns:
        dict mapping image name -> array returned by ``predict`` (batch of 1).
    """
    inception = InceptionV3(weights='imagenet')
    inception_model = Model(inception.input, inception.layers[-2].output)

    img_features = {}
    for img in image_names:
        img_path = os.path.join(images_dir, img)
        # isfile (not exists): an empty image name resolves to the directory
        # itself, which exists but cannot be opened as an image.
        if not os.path.isfile(img_path):
            print(f"Image not found: {img_path}")
            continue
        # The context manager closes the underlying file handle promptly;
        # convert() returns an independent, fully loaded copy.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB').resize((299, 299))
        batch = np.expand_dims(np.array(image), axis=0)
        img_features[img] = inception_model.predict(preprocess_input(batch))
    return img_features


def load_data(captions_file, images_dir):
    """Build next-word-prediction training data for an image-captioning model.

    Each caption is tokenized and expanded into one sample per prefix: the
    model inputs are the image feature vector and the left-padded prefix, and
    the target is the one-hot encoded next token.

    Args:
        captions_file: Path to a UTF-8 text file with one ``image,caption``
            pair per line.
        images_dir: Directory containing the image files named in the
            captions file.

    Returns:
        A tuple ``(train_data, val_data, tokenizer, max_length, vocab_size)``
        where ``train_data`` and ``val_data`` are
        ``([image_features, padded_prefixes], one_hot_targets)``,
        ``tokenizer`` is the fitted ``Tokenizer``, ``max_length`` is the
        padded prefix length and ``vocab_size`` is ``len(word_index) + 1``.

    Raises:
        ValueError: If the captions file contains no valid caption lines.
    """
    # Load and group captions
    img_to_captions = _parse_captions(captions_file)
    all_captions = [cap for caps in img_to_captions.values() for cap in caps]
    if not all_captions:
        # Fail before loading InceptionV3, with a message that names the cause.
        raise ValueError(f"No valid captions found in {captions_file}")

    # Tokenize captions
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_captions)
    vocab_size = len(tokenizer.word_index) + 1
    encoded = {
        img: tokenizer.texts_to_sequences(caps)
        for img, caps in img_to_captions.items()
    }

    # The whitespace word count is kept as a floor so the padded length stays
    # what it has always been for ordinary captions. The tokenizer can emit
    # more tokens than that (it also splits on punctuation such as '-'), and
    # pad_sequences would then silently drop leading tokens, so the longest
    # tokenized caption is taken into account as well.
    max_length = max(
        max(len(cap.split()) for cap in all_captions),
        max(len(seq) for seqs in encoded.values() for seq in seqs),
    )

    # Load images and extract features
    img_features = _extract_image_features(img_to_captions, images_dir)

    # Prepare training data: one sample per (prefix, next token) pair
    X1, X2, y = [], [], []
    for img, seqs in encoded.items():
        if img not in img_features:
            continue
        feature_vector = img_features[img][0]
        for seq in seqs:
            for i in range(1, len(seq)):
                in_seq = pad_sequences([seq[:i]], maxlen=max_length)[0]
                out_seq = to_categorical([seq[i]], num_classes=vocab_size)[0]
                X1.append(feature_vector)
                X2.append(in_seq)
                y.append(out_seq)
    X1, X2, y = np.array(X1), np.array(X2), np.array(y)

    # Split into train and validation sets.
    # NOTE: the split is positional over samples (first 80% / last 20%), with
    # no shuffling, so samples from one image can land on both sides.
    split = int(0.8 * len(X1))
    train_data = ([X1[:split], X2[:split]], y[:split])
    val_data = ([X1[split:], X2[split:]], y[split:])
    return train_data, val_data, tokenizer, max_length, vocab_size