# Model Evaluation

In this lesson, we'll evaluate the model experiments from our Hydra workflow.
We want to identify the model with the best relative performance.

Import required packages.

In [None]:
import os

import hydra
import torch
from omegaconf import OmegaConf

from ml_pipeline.datasets.datamodule import BurnScarsDataModule
from ml_pipeline.model.lightningmodule import BurnScarsSegmentationModel

Configure [Hydra](https://hydra.cc/docs/intro/).

In [None]:
# Load the Hydra configuration: initialize Hydra against the ../../config
# directory and compose the config named "config".
# The overrides set `seed`, `author` and `name` for this notebook session.
with hydra.initialize(config_path="../../config", version_base="1.3.0"):
    cfg = hydra.compose(
        config_name="config",
        overrides=["seed=0", "author=devseed", "name=test-exp-nb-1"],
        return_hydra_config=True,
    )

Check the parameters of the datamodule.

In [None]:
# Render the `datamodule` section of the composed config as YAML for inspection.
print(OmegaConf.to_yaml(cfg.datamodule))

Check the static parameters of the model.

In [None]:
# Render the `model` section of the composed config as YAML for inspection.
print(OmegaConf.to_yaml(cfg.model))

Only the weights of the best performing model experiment were written to disk.

In [None]:
# Show the current working directory and list the checkpoint files under
# logs/checkpoint/ so the checkpoint used for evaluation can be verified.
# (IPython shell escapes: these lines only run inside a notebook/IPython.)
!pwd
!ls logs/checkpoint/

In [None]:
# Build the checkpoint path from the checkpoint directory configured for the
# model_checkpoint callback.
# NOTE(review): this selects "last.ckpt"; confirm that it is the
# best-performing checkpoint rather than merely the most recent one.
CKPT = os.path.join(cfg.callbacks.model_checkpoint.dirpath, "last.ckpt")
# Echo the resolved path as the cell output.
CKPT

Load the data and model modules for prediction.

In [None]:
# Instantiate the datamodule from the `datamodule` section of the config and
# run its setup for the "predict" stage.
# NOTE(review): the validation dataloader is requested further down; confirm
# that setup(stage="predict") prepares the validation split as well.
datamodule = BurnScarsDataModule(**cfg.datamodule)
datamodule.setup(stage="predict")

In [None]:
# Restore the segmentation model from the checkpoint file at CKPT.
model = BurnScarsSegmentationModel.load_from_checkpoint(CKPT)
# Switch to evaluation mode; the result is bound to `_` so the notebook does
# not print the returned value as the cell output.
_ = model.eval()  # set the model to evaluation mode

Load the validation partition.

In [None]:
# Get the validation dataloader from the datamodule; it is iterated by
# run_prediction to collect predictions and ground-truth masks.
val_dataloader = datamodule.val_dataloader()

In [None]:
def run_prediction(model, dataloader):
    """
    Loop through the dataloader & get model predictions.

    Every batch is expected to unpack into an ``(image, mask)`` pair. The
    model is called on ``image`` and both the output and ``mask`` are moved
    to host memory before being converted to numpy, so the function also
    works when the tensors live on an accelerator (CUDA/MPS).

    Args:
        model: callable with trained weights, invoked as ``model(image)``
        dataloader: iterable of ``(image, mask)`` batches to run inference on

    Returns:
        preds: list of numpy values, one entry per sample (leading batch
            dimension is flattened across batches)
        masks: list of ground truth numpy values, one entry per sample
    """
    preds = []
    masks = []
    # inference_mode disables autograd tracking for the forward passes.
    with torch.inference_mode():
        for image, mask in dataloader:
            pred = model(image)
            # .cpu() is a no-op for CPU tensors; without it .numpy() raises
            # for tensors that live on an accelerator device.
            preds.extend(pred.detach().cpu().numpy())
            masks.extend(mask.detach().cpu().numpy())
    return preds, masks

Get model predictions and their respective ground truths from the validation partition.

In [None]:
# Run the model over the validation dataloader; `preds` holds the model
# outputs and `targets` the matching ground-truth masks, one entry per sample.
preds, targets = run_prediction(model=model, dataloader=val_dataloader)

In [None]:
# We have trained the model on a tiny subset of the burn scars dataset to
# show the capabilities of ml-pipeline.
# Hence, the model is overfitting on the NON-FLOODED class.
# To properly display the f1, precision and recall scores and the confusion
# matrix, we append a couple of synthetic predictions for the FLOODED class.
# NOTE(review): the FLOODED / NON-FLOODED names look carried over from a flood
# dataset, while this notebook evaluates a burn scars model — confirm the
# intended class names for labels 0 and 1.
# NOTE(review): run_prediction extends these lists with one numpy entry per
# sample, whereas plain integer labels are appended here — confirm the
# entries are compatible before they are passed to the metric functions.
preds.extend([1, 1])
targets.extend([1, 0])

Generate confusion matrices, precision, recall and f1-scores.

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# scikit-learn's signature is classification_report(y_true, y_pred): the
# ground truth comes first. Passing the predictions first would report
# precision and recall swapped, so the arguments are named explicitly.
print(classification_report(y_true=targets, y_pred=preds))

In [None]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true labels and columns the predicted labels. Passing the predictions first
# would transpose the matrix, so the arguments are named explicitly.
confusion_matrix(y_true=targets, y_pred=preds, labels=[0, 1])