only integer scalar arrays can be converted to a scalar index numpy

November 14, 2022

I found keras tutorial and when was following it got error.

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)


def crop_center(frame):
    cropped = center_crop_layer(frame[None, ...])
    cropped = cropped.numpy().squeeze()
    return cropped


# Following method is modified from this tutorial:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
def load_video(path, max_frames=0):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center(frame)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)


def build_feature_extractor():
    feature_extractor = keras.applications.DenseNet121(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.densenet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()


# Label preprocessing with StringLookup.
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"]), mask_token=None
)
print(label_processor.get_vocabulary())


def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_features` are what we will feed to our sequence model.
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # For each video.
    for idx, path in enumerate(video_paths):
        # Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir, path))

        # Pad shorter videos.
        if len(frames) < MAX_SEQ_LENGTH:
            diff = MAX_SEQ_LENGTH - len(frames)
            padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
            frames = np.concatenate(frames, padding)

        frames = frames[None, ...]

        # Initialize placeholder to store the features of the current video.
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                if np.mean(batch[j, :]) > 0.0:
                    temp_frame_features[i, j, :] = feature_extractor.predict(
                        batch[None, j, :]
                    )

                else:
                    temp_frame_features[i, j, :] = 0.0

        frame_features[idx,] = temp_frame_features.squeeze()

    return frame_features, labels

When i call prepare_all_videos and pass train_df or test_df to it, this error occurs:

    81             diff = MAX_SEQ_LENGTH - len(frames)
     82             padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
---> 83             frames = np.concatenate(frames, padding)
     84 
     85         frames = frames[None, ...]

<__array_function__ internals> in concatenate(*args, **kwargs)

TypeError: only integer scalar arrays can be converted to a scalar index

That’s how test_df looks like:

video_name tag
0 v_CricketShot_g01_c01.avi CricketShot
1 v_CricketShot_g01_c02.avi CricketShot
2 v_CricketShot_g01_c03.avi CricketShot
3 v_CricketShot_g01_c04.avi CricketShot
4 v_CricketShot_g01_c05.avi CricketShot
… … …
219 v_TennisSwing_g07_c03.avi TennisSwing
220 v_TennisSwing_g07_c04.avi TennisSwing
221 v_TennisSwing_g07_c05.avi TennisSwing
222 v_TennisSwing_g07_c06.avi TennisSwing
223 v_TennisSwing_g07_c07.avi TennisSwing

What’s wrong? How to fix it? This tutorial has colab and you can run it if you want.

>Solution :

Most probably it’s because frame is returning an empty array, so concat is failing. So add a condition to check the length of the frame,

frames = load_video(os.path.join(root_dir, path))

if len(frames) == 0:
   continue

# Pad shorter videos.
if len(frames) < MAX_SEQ_LENGTH:
   diff = MAX_SEQ_LENGTH - len(frames)
   padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))