Example: run MultiSolSegment and validate

This example shows how to download the MultiSolSegment weights, run the model on an image to segment crack, dark-area, and busbar masks, and then use PVImage to examine the crack count and crack shape parameters.

[1]:

from pathlib import Path
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader

from pvcracks.utils import viz_functions, train_functions

import requests

from pvimage import features

[2]:

import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.simplefilter(action="ignore", category=pd.errors.SettingWithCopyWarning)

Step 1: Download Model

DOI - https://doi.org/10.21948/2997859

Direct Link - https://datahub.duramat.org/dataset/24d7ed98-956e-4ebc-bf21-6584ecec2ff1/resource/e6d5dee2-0aef-439e-978c-d8c572e7039e/download/model.pt

[4]:

weight_path = "multisolsegment.pt"
weights_url = (
    "https://datahub.duramat.org/dataset/24d7ed98-956e-4ebc-bf21-6584ecec2ff1/"
    "resource/e6d5dee2-0aef-439e-978c-d8c572e7039e/download/model.pt"
)

if os.path.exists(weight_path):
    print("Weights already downloaded.")
else:
    # The timeout bounds connection/read stalls so the cell cannot hang forever.
    response = requests.get(weights_url, timeout=60)
    # Fail loudly on HTTP errors instead of saving an error page as the weights.
    response.raise_for_status()

    # Write to a temporary name first: an interrupted write must not leave a
    # truncated file that the next run would report as "already downloaded".
    partial_path = weight_path + ".part"
    with open(partial_path, "wb") as f:
        f.write(response.content)
    os.replace(partial_path, weight_path)

Weights already downloaded.

Step 2: Download and Load Images

DOI - https://doi.org/10.21948/2587738

Direct Link - https://datahub.duramat.org/dataset/5e7587ad-6ad1-4d6f-8432-70940a6d7ca1/resource/8f12be45-b929-4f5c-9400-84dde27b0e90/download/labeledelcells.zip

[ ]:

from zipfile import ZipFile
from io import BytesIO
import os

url = (
    "https://datahub.duramat.org/"
    "dataset/5e7587ad-6ad1-4d6f-8432-70940a6d7ca1/"
    "resource/8f12be45-b929-4f5c-9400-84dde27b0e90/"
    "download/labeledelcells.zip"
)

# Relative to the current working directory.
out_dir = "labeled_image_data"

# An existing but empty directory (e.g. left over from a failed run) does not
# count as a finished download.
if os.path.isdir(out_dir) and os.listdir(out_dir):
    print("Images already downloaded.")
else:
    # Download and validate before touching the file system, so a failed
    # request never leaves an empty output directory behind.
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()

    os.makedirs(out_dir, exist_ok=True)
    with ZipFile(BytesIO(resp.content)) as z:
        z.extractall(path=out_dir)

Images already downloaded.

[6]:

# Show the directory the image archive is extracted into.
out_dir

[6]:

'labeled_image_data'

Step 3: Set local paths

[ ]:

# Root folder of the extracted dataset, built from out_dir.
files_root = f"{out_dir}/LabeledELcells/"
# To use a dataset stored elsewhere, point files_root at that directory instead:
# files_root = "/path/to/your/dataset/"

# If you have already downloaded the model weights, replace weight_path with the local path:
# weight_path = "/path/to/your/model.pt"

Step 4: Load in everything

[8]:

# Class name for each channel index of the segmentation masks.
category_mapping = {0: "dark", 1: "busbar", 2: "crack", 3: "non-cell"}

root = Path(files_root)
# NOTE(review): img_root is not referenced again in the cells shown in this notebook.
img_root = root / "img" / "all"

# Only the validation split is used below; batch_size=1 with shuffle=False
# keeps the loader in dataset order.
train_dataset, val_dataset = train_functions.load_dataset(root)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
# Presumably builds the model and loads the weights at weight_path — confirm
# against train_functions.load_device_and_model.
device, model = train_functions.load_device_and_model(
    category_mapping, existing_weight_path=weight_path
)

Step 5: Pick image and threshold

Index is what image to run this model on.

Threshold is the minimum predicted probability (between 0 and 1) at which a pixel is counted as a detection rather than as background.

[9]:

# Same channel-index -> class-name mapping as in Step 4, repeated here.
category_mapping = {0: "dark", 1: "busbar", 2: "crack", 3: "non-cell"}

idx = 7  # position of the validation image to inspect
threshold = 0.5  # default probability cut-off applied to the sigmoid outputs

Step 6: Inference and Show

[ ]:

# Run the model on validation image `idx` and display the result using the
# chosen threshold.
viz_functions.channeled_inference_and_show(
    val_loader, device, model, category_mapping, idx, threshold
)
# Uncomment to also save the figure to disk:
# plt.savefig("mss_model_prediction.png", dpi=300)

../_images/Examples_example_download_run_multisolsegment_15_0.png

<Figure size 640x480 with 0 Axes>

Step 7: Investigate dark area %

[11]:

def channel_activation_percentages(mask, category_mapping):
    """
    Report, per class, how much of the image each mask channel covers.

    Every channel is summed and divided by the number of pixels in one
    channel (height * width), then expressed as a percentage.

    Args:
        mask: array-like convertible to a 3D array laid out as
            (n_channels, height, width).
        category_mapping (dict): channel index -> class name. Only the
            channels listed here are evaluated, in the mapping's order.

    Returns:
        dict: class name -> percentage of activated pixels in that channel.

    Raises:
        ValueError: if the mask is not 3D, has no pixels, or a mapped channel
            index is not smaller than the number of channels.
    """

    mask_np = np.asarray(mask)

    if mask_np.ndim != 3:
        raise ValueError(
            f"Expected a 3D mask of shape (channels, height, width); got {mask_np.shape}."
        )

    n_channels = mask_np.shape[0]
    pixels_per_channel = mask_np.shape[1] * mask_np.shape[2]

    if pixels_per_channel == 0:
        raise ValueError("Mask must contain at least one pixel.")

    def _percentage(channel_idx, class_name):
        # Validate lazily, one channel at a time, in mapping order.
        if channel_idx >= n_channels:
            raise ValueError(
                f"Channel index {channel_idx} for class '{class_name}' is out of bounds "
                f"for mask with {n_channels} channel(s)."
            )
        activated = mask_np[channel_idx].sum()
        return float(activated) / float(pixels_per_channel) * 100.0

    return {
        class_name: _percentage(channel_idx, class_name)
        for channel_idx, class_name in category_mapping.items()
    }


def print_channel_activation_percentages(percentages):
    print("Predicted channel activation (% of image):")
    sum = 0
    for _, class_name in category_mapping.items():
        print(f"\t{class_name}: {percentages[class_name]:.2f}%")
        sum += percentages[class_name]
    print("\tRemaining (solar cell): %.2f%%\n" % (100 - sum))

[ ]:

img, mask = val_dataset.__getitem__(idx)
img = img.to(device)
img_path = val_dataset.get_img_path(idx)

logits = model(img.unsqueeze(0)).detach().cpu()
probs = torch.sigmoid(logits)
pred_mask = (probs > threshold).float().squeeze(0).numpy()

gt_mask = mask.cpu().numpy()

gt_percentages = channel_activation_percentages(gt_mask, category_mapping)
pred_percentages = channel_activation_percentages(pred_mask, category_mapping)

print_channel_activation_percentages(gt_percentages)
print_channel_activation_percentages(pred_percentages)

Predicted channel activation (% of image):
        dark: 11.16%
        busbar: 7.75%
        crack: 3.05%
        non-cell: 3.70%
        Remaining (solar cell): 74.34%

Predicted channel activation (% of image):
        dark: 10.31%
        busbar: 7.81%
        crack: 3.08%
        non-cell: 3.70%
        Remaining (solar cell): 75.10%

Step 8: Crack mask features with PVImage

Extract predicted crack masks, compute crack statistics with PVImage, and visualize the distribution of key metrics.

[ ]:

img, mask = val_dataset[idx]
img = img.to(device)
img_path = val_dataset.get_img_path(idx)

# Inference only: no_grad avoids building an autograd graph.
with torch.no_grad():
    logits = model(img.unsqueeze(0)).cpu()
probs = torch.sigmoid(logits)
pred_mask = (probs > threshold).float().squeeze(0).numpy()

# Channel 2 is the "crack" class in category_mapping.
fig, ax = plt.subplots()
ax.imshow(pred_mask[2], cmap="viridis")
ax.set_title(f"Predicted crack mask (image index {idx})")
ax.set_xlabel("x (pixels)")
ax.set_ylabel("y (pixels)");  # trailing ';' suppresses the object repr

<matplotlib.image.AxesImage at 0x7f5ceb69b890>

../_images/Examples_example_download_run_multisolsegment_20_1.png

[14]:

# Scale the crack channel from {0, 1} to an 8-bit {0, 255} mask.
crack_mask = (pred_mask[2] * 255).astype(np.uint8)

# One-row frame with the image path in the "impath" column.
dfinfo = pd.DataFrame({"impath": [str(img_path)]})

# Extract per-crack features for this single mask and preview them.
dffeaturesreal = features.feature_extraction_crack_mask([crack_mask], dfinfo)
dffeaturesreal.head()

[14]:

cell_number	i	prop.perimeter	slope	prop.convex_area	prop.area	prop.orientation
mxy_0540	4	745.404112	-2.728464	13499.0	5157.0	-1.459944
mxy_0540	0	332.468037	-1.693958	3134.0	1898.0	-1.429506
mxy_0540	2	149.59798	-1.927778	949.0	789.0	-1.273929

[15]:

def violinplt(data, palette=None, figsize=(5, 4), y_axis_log=False):
    """Draw one violin per column of ``data`` and annotate each with its mean.

    Args:
        data: a DataFrame, a Series, or anything ``pd.DataFrame`` accepts.
        palette: forwarded to ``sns.violinplot``.
        figsize: forwarded to ``plt.subplots``.
        y_axis_log: when true, the y axis is switched to a log scale.

    Returns:
        The matplotlib figure containing the plot.
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Normalise the input to a DataFrame so column means and names are available.
    if isinstance(data, pd.Series):
        data = data.to_frame()
    elif not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)

    fig, ax = plt.subplots(figsize=figsize)

    # inner="quartile" draws quartile lines; cut=0 stops the density at the data range.
    sns.violinplot(
        data=data, ax=ax, palette=palette, inner="quartile", cut=0, linewidth=1
    )

    # Mark the column means on top of the violins.
    means = data.mean().to_numpy()
    x_pos = np.arange(len(means))
    ax.scatter(x_pos, means, zorder=3)

    # Lift the labels by 3% of the spread of the means (or of 1.0 when a mean
    # is not finite).
    if np.all(np.isfinite(means)):
        spread = np.nanmax(means) - np.nanmin(means)
    else:
        spread = 1.0
    y_offset = 0.03 * spread

    # More than three violins: smaller boxed labels and rotated tick labels.
    crowded = len(x_pos) > 3
    for position, mean_value in zip(x_pos, means):
        annotation = ax.text(
            position,
            mean_value + y_offset,
            f"{mean_value:.2f}",
            ha="center",
            va="bottom",
            fontsize=10 if crowded else 12,
            color="black",
            weight="semibold",
        )
        if crowded:
            # subtle white box for readability
            annotation.set_bbox(
                dict(
                    facecolor="white",
                    alpha=0.5,
                    edgecolor="none",
                    boxstyle="round,pad=0.15",
                )
            )
    if crowded:
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

    ax.set_ylabel("Value")
    ax.set_xlabel("")
    if y_axis_log:
        ax.set_yscale("log")
    ax.margins(x=0.05)
    fig.tight_layout()
    return fig

[16]:

# Attach to every feature row the number of rows that share its cell_number.
dffeaturesreal["count_features"] = dffeaturesreal.groupby("cell_number")[
    "cell_number"
].transform("count")
# Keep one row per cell; count_features is identical within a cell, so which
# duplicate survives does not affect the count.
dffeaturesrealcount = dffeaturesreal.sort_values(by="cell_number").drop_duplicates(
    "cell_number", keep="first"
)

[17]:

# Renumber the per-cell rows from zero.
dffeaturesrealcount = dffeaturesrealcount.reset_index(drop=True)

# Single-column frame of per-cell feature counts, labelled "Original".
featurescounted = dffeaturesrealcount[["count_features"]].rename(
    columns={"count_features": "Original"}
)
featurescounted.head()

[17]:

	Original
0	3

[18]:

from sklearn.preprocessing import MinMaxScaler


dffeaturesall = dffeaturesreal


# Number of extracted features per cell.
vp = violinplt(featurescounted[["Original"]], figsize=(5, 4))


scaler = MinMaxScaler()
# Column names carry a leading space, as produced by the feature extraction.
columnstonormalize = [
    " prop.perimeter",
    " slope",
    " prop.area",
    " prop.orientation",
]
# Build the normalised frame directly instead of assigning into a column
# slice, which is what triggers pandas' SettingWithCopyWarning.
dffeaturesall_norm = pd.DataFrame(
    scaler.fit_transform(dffeaturesall[columnstonormalize]),
    columns=columnstonormalize,
    index=dffeaturesall.index,
)
fig = violinplt(dffeaturesall_norm[columnstonormalize], figsize=None)
# fig.savefig("violins.png", bbox_inches="tight")
# fig.savefig("violins.png", bbox_inches="tight")

../_images/Examples_example_download_run_multisolsegment_25_0.png

../_images/Examples_example_download_run_multisolsegment_25_1.png

Step 9: PVImage stats for many images

[ ]:

m_idx = len(val_dataset)  # number of validation images to process
masks = []
impaths = []

# Inference only: no_grad avoids building an autograd graph for every image.
with torch.no_grad():
    for i in range(m_idx):
        img, _ = val_dataset[i]
        img = img.to(device)
        img_path = val_dataset.get_img_path(i)

        # model inference
        logits = model(img.unsqueeze(0)).cpu()
        probs = torch.sigmoid(logits)
        pred_mask = (probs > threshold).float().squeeze(0).numpy()  # (n_classes, H, W)

        # Extract crack channel (index 2) and convert to uint8 [0,255]
        crack_mask = (pred_mask[2] * 255).astype(np.uint8)

        masks.append(crack_mask)
        impaths.append(str(img_path))

# Build dataframe of image paths expected by feature_extraction_crack_mask
dfinfo_multi = pd.DataFrame({"impath": impaths})

# Run PVImage feature extraction across all predicted crack masks
dffeatures_multi = features.feature_extraction_crack_mask(masks, dfinfo_multi)
# These are model predictions, not ground-truth masks.
print(
    f"Processed {len(masks)} predicted crack masks, extracted {len(dffeatures_multi)} feature rows."
)
display(dffeatures_multi.head())

Processed 585 ground-truth images, extracted 601 feature rows.

cell_number	i	prop.perimeter	slope	prop.convex_area	prop.area	prop.orientation
mxy_sa19965_sub_EL_9	2	271.740115	-0.298467	2296.0	1497.0	-0.486834
mxy_sa19965_sub_EL_9	8	266.468037	-0.155785	1826.0	1403.0	-0.386939
mxy_sa19965_sub_EL_9	4	235.39697	-1.476617	2109.0	1289.0	-1.394845
mxy_sa19965_sub_EL_9	5	233.39697	0.500036	2353.0	1422.0	0.739917
SW2	1	157.556349	1.625017	887.0	789.0	1.500654

[20]:

# Attach to every feature row the number of rows that share its cell_number.
dffeatures_multi["count_features"] = dffeatures_multi.groupby("cell_number")[
    "cell_number"
].transform("count")
# Keep one row per cell; count_features is identical within a cell.
dffeatures_multi_count = dffeatures_multi.sort_values(by="cell_number").drop_duplicates(
    "cell_number", keep="first"
)
dffeatures_multi_count.reset_index(drop=True, inplace=True)

# Single-column frame of per-cell feature counts, labelled "Original".
featurescounted_multi = pd.concat(
    [dffeatures_multi_count["count_features"]], axis=1, ignore_index=True
)
featurescounted_multi.columns = ["Original"]

# Show basic outputs
print(f"Processed {len(masks)} images, extracted {len(dffeatures_multi)} feature rows.")
display(featurescounted_multi.head())

Processed 585 images, extracted 601 feature rows.

	Original
0	1
1	2
2	1
3	3
4	3

[21]:

# Save the predicted-crack features to disk, then read the same file straight
# back into dffeatures_multi.
# NOTE(review): read_pickle executes pickled code — only load files you trust.
dffeatures_multi.to_pickle("mms_dffeatures.pickle")
dffeatures_multi = pd.read_pickle("mms_dffeatures.pickle")

[22]:

# Distribution of the number of extracted features per cell (predicted masks).
vp = violinplt(featurescounted_multi[["Original"]], figsize=(5, 4))

../_images/Examples_example_download_run_multisolsegment_30_0.png

[23]:

scaler = MinMaxScaler()
# Column names carry a leading space, as produced by the feature extraction.
columnstonormalize = [
    " prop.perimeter",
    " slope",
    " prop.area",
    " prop.orientation",
]
# Build the normalised frame directly instead of assigning into a column
# slice, which is what triggers pandas' SettingWithCopyWarning.
dffeatures_multi_norm = pd.DataFrame(
    scaler.fit_transform(dffeatures_multi[columnstonormalize]),
    columns=columnstonormalize,
    index=dffeatures_multi.index,
)
fig = violinplt(dffeatures_multi_norm[columnstonormalize], figsize=None)

../_images/Examples_example_download_run_multisolsegment_31_0.png

Step 10: PVImage stats for ground truth crack masks

[ ]:

gt_masks, gt_impaths = [], []

for i in range(m_idx):
    # Only the label mask is needed here; the image itself is discarded.
    _, gt_mask = val_dataset[i]
    img_path = val_dataset.get_img_path(i)

    # Crack channel (index 2) scaled to an 8-bit mask.
    crack_channel_gt = gt_mask.cpu().numpy()[2]
    crack_mask_gt = (crack_channel_gt * 255).astype(np.uint8)

    gt_masks.append(crack_mask_gt)
    gt_impaths.append(str(img_path))

# Feature extraction over the labelled crack masks.
dfinfo_gt = pd.DataFrame({"impath": gt_impaths})
dffeatures_gt = features.feature_extraction_crack_mask(gt_masks, dfinfo_gt)


display(dffeatures_gt.head())

cell_number	i	prop.perimeter	slope	prop.convex_area	prop.area	prop.orientation
mxy_sa19965_sub_EL_9	7	214.083261	-1.170103	1893.0	1167.0	-1.370287
mxy_sa19965_sub_EL_9	14	206.325902	-0.380381	1568.0	1059.0	-1.052619
mxy_sa19965_sub_EL_9	4	162.426407	-0.890218	1256.0	887.0	-0.862461
SW2	3	119.313708	1.319687	616.0	546.0	1.455919
my_SW1	0	454.575685	-0.897546	5464.0	2804.0	-0.985487

[25]:

# Attach to every ground-truth feature row the number of rows sharing its cell_number.
dffeatures_gt["count_features"] = dffeatures_gt.groupby("cell_number")[
    "cell_number"
].transform("count")
# Keep one row per cell; count_features is identical within a cell.
dffeatures_gt_count = dffeatures_gt.sort_values(by="cell_number").drop_duplicates(
    "cell_number", keep="first"
)
dffeatures_gt_count.reset_index(drop=True, inplace=True)

# Single-column frame of per-cell feature counts, labelled "Ground Truth".
featurescounted_gt = pd.concat(
    [dffeatures_gt_count["count_features"]], axis=1, ignore_index=True
)
featurescounted_gt.columns = ["Ground Truth"]

display(featurescounted_gt.head())
print(
    f"Processed {len(gt_masks)} ground-truth images, extracted {len(dffeatures_gt)} feature rows."
)

	Ground Truth
0	1
1	2
2	1
3	4
4	4

Processed 585 ground-truth images, extracted 590 feature rows.

[26]:

# Distribution of the number of extracted features per cell (ground-truth masks).
vp = violinplt(featurescounted_gt[["Ground Truth"]], figsize=(5, 4))

../_images/Examples_example_download_run_multisolsegment_35_0.png

[27]:

scaler = MinMaxScaler()
# Column names carry a leading space, as produced by the feature extraction.
columnstonormalize = [
    " prop.perimeter",
    " slope",
    " prop.area",
    " prop.orientation",
]
# Build the normalised frame directly instead of assigning into a column
# slice, which is what triggers pandas' SettingWithCopyWarning.
dffeatures_gt_norm = pd.DataFrame(
    scaler.fit_transform(dffeatures_gt[columnstonormalize]),
    columns=columnstonormalize,
    index=dffeatures_gt.index,
)
fig = violinplt(dffeatures_gt_norm[columnstonormalize], figsize=None)

../_images/Examples_example_download_run_multisolsegment_36_0.png

Step 11: Compare PVImage stats (predicted vs. ground truth)

Join the PVImage outputs to see how the model’s crack counts and shape metrics differ from the labeled data.

[28]:

# Number of feature rows per cell in the predicted and the ground-truth frames.
pred_counts = dffeatures_multi.groupby("cell_number").size().rename("pred_crack_count")
gt_counts = dffeatures_gt.groupby("cell_number").size().rename("gt_crack_count")

# Align on cell_number; a cell present in only one frame gets a count of 0 in the other.
count_comparison = (
    pd.concat([pred_counts, gt_counts], axis=1).fillna(0).astype(int).reset_index()
)
count_comparison.columns = ["cell_number", "pred_crack_count", "gt_crack_count"]
# Positive values mean more predicted rows than ground-truth rows for that cell.
count_comparison["count_diff"] = (
    count_comparison["pred_crack_count"] - count_comparison["gt_crack_count"]
)

display(count_comparison.head())

	cell_number	pred_crack_count	gt_crack_count	count_diff
0	0563	1	1	0
1	0774	2	2	0
2	0809	1	1	0
3	0_EL_18.09.2023	3	4	-1
4	1090	3	4	-1

[29]:

# Side-by-side distributions of per-cell counts, with display-friendly column names.
vp = violinplt(
    count_comparison[["pred_crack_count", "gt_crack_count"]].rename(
        columns={
            "pred_crack_count": "Predicted",
            "gt_crack_count": "Ground Truth",
        }
    ),
    figsize=(6, 4),
)

../_images/Examples_example_download_run_multisolsegment_39_0.png

[30]:

feature_cols = [
    " prop.perimeter",
    " slope",
    " prop.area",
    " prop.orientation",
]

# Min-max scaled values are only comparable when both data sets use the same
# minimum and maximum. Scaling each set with its own range would hide uniform
# size differences and let a single outlier shift a whole set, so both are
# scaled with the ground-truth ranges. Predictions outside the ground-truth
# range therefore fall outside [0, 1].
gt_scaler = MinMaxScaler().fit(dffeatures_gt[feature_cols])
pred_on_gt_scale = pd.DataFrame(
    gt_scaler.transform(dffeatures_multi[feature_cols]),
    columns=feature_cols,
    index=dffeatures_multi.index,
)
gt_on_gt_scale = pd.DataFrame(
    gt_scaler.transform(dffeatures_gt[feature_cols]),
    columns=feature_cols,
    index=dffeatures_gt.index,
)

feature_comparison = pd.DataFrame(
    {
        "Predicted_mean": pred_on_gt_scale.mean(),
        "GroundTruth_mean": gt_on_gt_scale.mean(),
    }
)
feature_comparison["abs_diff"] = (
    feature_comparison["Predicted_mean"] - feature_comparison["GroundTruth_mean"]
).abs()
feature_comparison

[30]:

	Predicted_mean	GroundTruth_mean	abs_diff
prop.perimeter	0.151889	0.170860	0.018971
slope	0.485083	0.457917	0.027167
prop.area	0.163280	0.153094	0.010186
prop.orientation	0.345932	0.337139	0.008792

[31]:

feature_pairs = []

# Only compare features that exist in both raw feature frames.
usable_cols = []
for col in feature_cols:
    if col not in dffeatures_multi.columns or col not in dffeatures_gt.columns:
        print(f"Skipping {col}: not found in features dataframe.")
        continue
    usable_cols.append(col)

if usable_cols and (dffeatures_multi.empty or dffeatures_gt.empty):
    print("Skipping comparison: no data to plot.")
    usable_cols = []

if usable_cols:
    # Scale BOTH sets with the ground-truth min/max so the violins share one
    # scale; per-set scaling would make the distributions incomparable.
    gt_scaler = MinMaxScaler().fit(dffeatures_gt[usable_cols])
    pred_scaled = pd.DataFrame(
        gt_scaler.transform(dffeatures_multi[usable_cols]), columns=usable_cols
    )
    gt_scaled = pd.DataFrame(
        gt_scaler.transform(dffeatures_gt[usable_cols]), columns=usable_cols
    )

    # The two sets have different row counts; pad the shorter one with NaN.
    # Rows are not matched crack-to-crack, only the distributions are compared.
    max_len = max(len(pred_scaled), len(gt_scaled))
    for col in usable_cols:
        label = col.strip()
        feature_pairs.append(
            pd.DataFrame(
                {
                    f"Predicted {label}": pred_scaled[col].reindex(range(max_len)),
                    f"Ground Truth {label}": gt_scaled[col].reindex(range(max_len)),
                }
            )
        )

if not feature_pairs:
    print("No feature data available for violin plots.")
    feature_violin_df = pd.DataFrame()
else:
    feature_violin_df = pd.concat(feature_pairs, axis=1)
    feature_violin_df = feature_violin_df.apply(pd.to_numeric, errors="coerce")
    feature_violin_df = feature_violin_df.dropna(axis=1, how="all")

    if feature_violin_df.empty:
        print("No numeric feature data available for violin plots.")
    else:
        # Render comparison violins
        vp = violinplt(feature_violin_df, figsize=(10, 5))

feature_violin_df

[31]:

	Predicted prop.perimeter	Ground Truth prop.perimeter	Predicted slope	Ground Truth slope	Predicted prop.area	Ground Truth prop.area	Predicted prop.orientation	Ground Truth prop.orientation
0	0.078961	0.057378	0.517969	0.420981	0.077577	0.047678	0.344652	0.063179
1	0.076532	0.053472	0.528462	0.483016	0.070977	0.041025	0.376496	0.164383
2	0.062214	0.031365	0.431319	0.442967	0.062974	0.030430	0.055206	0.224965
3	0.061292	0.009654	0.576696	0.616562	0.072311	0.009425	0.735703	0.963567
4	0.026345	0.178485	0.659435	0.442391	0.027871	0.148515	0.978202	0.185770
...	...	...	...	...	...	...	...	...
596	0.000177	NaN	0.475078	NaN	0.009407	NaN	0.201521	NaN
597	0.120884	NaN	0.463893	NaN	0.148343	NaN	0.233929	NaN
598	0.086195	NaN	0.501708	NaN	0.098568	NaN	0.284622	NaN
599	0.096668	NaN	0.521463	NaN	0.103131	NaN	0.054135	NaN
600	0.003081	NaN	0.533490	NaN	0.011303	NaN	0.027597	NaN

601 rows × 8 columns

../_images/Examples_example_download_run_multisolsegment_41_1.png

Repeat the same analysis with the pv-vision model

[ ]:

# Load pv-vision model and weights
from tutorials.unet_model import construct_unet  # this tutorial is from pv-vision
from torch.nn import DataParallel

device = torch.device("cpu")

# build & wrap
unet = construct_unet(5)
unet = DataParallel(unet)

# load the dict right out of the file — no prefixing
state_dict = torch.load(
    "unet_oversample_low_final_model_for_paper/model.pt", map_location=device
)

unet.load_state_dict(state_dict)
model_pvvision = unet.module.to(device)

[ ]:

# Channel-index -> class-name mapping used for the pv-vision model outputs.
# This rebinds the notebook-wide category_mapping.
category_mapping = {
    0: "dark",
    1: "busbar",
    2: "crack",
    3: "cross",
}  # NOTE(review): channel 3 is assumed to be pv-vision's cross-crack ("x-cracks") class — confirm

idx = 7  # position of the validation image to inspect
threshold = 0.5  # default probability cut-off applied to the sigmoid outputs

[ ]:

# Use the notebook's `threshold` variable instead of a second hard-coded 0.5,
# so the plot follows the same cut-off as the feature extraction.
viz_functions.channeled_inference_and_show(
    val_loader,
    device=device,
    model=model_pvvision,
    category_mapping=category_mapping,
    idx=idx,
    threshold=threshold,
)
# NOTE(review): the recorded output shows an empty "0 Axes" figure after this
# call, so this may save a blank image — verify the PNG, or save from inside
# channeled_inference_and_show.
plt.savefig("pvvision_model_prediction.png", dpi=300)

../_images/Examples_example_download_run_multisolsegment_45_0.png

<Figure size 640x480 with 0 Axes>

[ ]:

# All images
m_idx = len(val_dataset)  # indices to process
masks = []
impaths = []

for i in range(m_idx):
    img, _ = val_dataset.__getitem__(i)
    img = img.to(device)
    img_path = val_dataset.get_img_path(i)

    # model inference
    logits = model_pvvision(img.unsqueeze(0)).detach().cpu()
    probs = torch.sigmoid(logits)
    pred_mask = (probs > threshold).float().squeeze(0).numpy()  # (n_classes, H, W)

    # Extract crack channel (index 2) and convert to uint8 [0,255]
    crack_mask = (pred_mask[2] * 255).astype(np.uint8)

    masks.append(crack_mask)
    impaths.append(str(img_path))

# Build dataframe of image paths expected by feature_extraction_crack_mask
dfinfo_multi = pd.DataFrame({"impath": impaths})

# Run PVImage feature extraction across all predicted crack masks
dffeatures_multi_pvvision = features.feature_extraction_crack_mask(masks, dfinfo_multi)
print(
    f"Processed {len(masks)} ground-truth images, extracted {len(dffeatures_multi)} feature rows."
)
display(dffeatures_multi_pvvision.head())

dffeatures_multi_pvvision["count_features"] = dffeatures_multi_pvvision.groupby(
    "cell_number"
)["cell_number"].transform("count")
dffeatures_multi_count = dffeatures_multi_pvvision.sort_values(
    by="cell_number"
).drop_duplicates("cell_number", keep="first")
dffeatures_multi_count.reset_index(drop=True, inplace=True)

featurescounted_multi = pd.concat(
    [dffeatures_multi_count["count_features"]], axis=1, ignore_index=True
)
featurescounted_multi.columns = ["Original"]

# Show basic outputs
print(f"Processed {len(masks)} images, extracted {len(dffeatures_multi)} feature rows.")
display(featurescounted_multi.head())


dffeatures_multi_pvvision.to_pickle("pvvision_dffeatures.pickle")
dffeatures_multi_pvvision = pd.read_pickle("pvvision_dffeatures.pickle")

Processed 585 ground-truth images, extracted 572 feature rows.

cell_number	i	prop.perimeter	slope	prop.convex_area	prop.area	prop.orientation
mxy_sa19965_sub_EL_9	1	552.83557	-0.524613	7743.0	3487.0	-0.699696
mxy_sa19965_sub_EL_9	4	198.325902	-0.461665	1710.0	1447.0	-0.683089
SW2	0	153.59798	-0.985689	1174.0	924.0	-1.113709
SW2	10	127.313708	0.557848	809.0	630.0	1.319049
SW2	2	115.213203	0.222527	666.0	580.0	0.33296

Processed 585 images, extracted 572 feature rows.

	Original
0	1
1	2
2	1
3	2
4	9

[ ]:

# Add ground truth
pred_counts = (
    dffeatures_multi_pvvision.groupby("cell_number").size().rename("pred_crack_count")
)
gt_counts = dffeatures_gt.groupby("cell_number").size().rename("gt_crack_count")

count_comparison = (
    pd.concat([pred_counts, gt_counts], axis=1).fillna(0).astype(int).reset_index()
)
count_comparison.columns = ["cell_number", "pred_crack_count", "gt_crack_count"]
count_comparison["count_diff"] = (
    count_comparison["pred_crack_count"] - count_comparison["gt_crack_count"]
)

scaler = MinMaxScaler()
columnstonormalize = [
    " prop.perimeter",
    " slope",
    " prop.area",
    " prop.orientation",
]
dffeatures_multi_norm = dffeatures_multi_pvvision[columnstonormalize]
dffeatures_multi_norm[columnstonormalize] = scaler.fit_transform(dffeatures_multi_norm)
fig = violinplt(
    dffeatures_multi_norm[
        [
            " prop.perimeter",
            " slope",
            " prop.area",
            " prop.orientation",
        ]
    ],
    figsize=None,
)

../_images/Examples_example_download_run_multisolsegment_47_0.png

[57]:

# Side-by-side distributions of per-cell counts for pv-vision vs. ground truth.
vp = violinplt(
    count_comparison[["pred_crack_count", "gt_crack_count"]].rename(
        columns={
            "pred_crack_count": "Predicted",
            "gt_crack_count": "Ground Truth",
        }
    ),
    figsize=(6, 4),
)
# plt.title targets the current figure, i.e. the one violinplt just created.
plt.title("pv-vision")

[57]:

Text(0.5, 1.0, 'pv-vision')

../_images/Examples_example_download_run_multisolsegment_48_1.png

[58]:

feature_cols = [
    " prop.perimeter",
    " slope",
    " prop.area",
    " prop.orientation",
]

# Min-max scaled values are only comparable when both data sets use the same
# minimum and maximum, so both sets are scaled with the ground-truth ranges
# rather than each with its own. Predictions outside the ground-truth range
# therefore fall outside [0, 1].
gt_scaler = MinMaxScaler().fit(dffeatures_gt[feature_cols])
pred_on_gt_scale = pd.DataFrame(
    gt_scaler.transform(dffeatures_multi_pvvision[feature_cols]),
    columns=feature_cols,
    index=dffeatures_multi_pvvision.index,
)
gt_on_gt_scale = pd.DataFrame(
    gt_scaler.transform(dffeatures_gt[feature_cols]),
    columns=feature_cols,
    index=dffeatures_gt.index,
)

feature_comparison = pd.DataFrame(
    {
        "Predicted_mean": pred_on_gt_scale.mean(),
        "GroundTruth_mean": gt_on_gt_scale.mean(),
    }
)
feature_comparison["abs_diff"] = (
    feature_comparison["Predicted_mean"] - feature_comparison["GroundTruth_mean"]
).abs()
feature_comparison

[58]:

	Predicted_mean	GroundTruth_mean	abs_diff
prop.perimeter	0.122926	0.170860	0.047934
slope	0.562446	0.457917	0.104529
prop.area	0.134435	0.153094	0.018659
prop.orientation	0.394483	0.337139	0.057343

[59]:

feature_pairs = []

# Only compare features that exist in both raw feature frames.
usable_cols = []
for col in feature_cols:
    if (
        col not in dffeatures_multi_pvvision.columns
        or col not in dffeatures_gt.columns
    ):
        print(f"Skipping {col}: not found in features dataframe.")
        continue
    usable_cols.append(col)

if usable_cols and (dffeatures_multi_pvvision.empty or dffeatures_gt.empty):
    print("Skipping comparison: no data to plot.")
    usable_cols = []

if usable_cols:
    # Scale BOTH sets with the ground-truth min/max so the violins share one
    # scale; per-set scaling would make the distributions incomparable.
    gt_scaler = MinMaxScaler().fit(dffeatures_gt[usable_cols])
    pred_scaled = pd.DataFrame(
        gt_scaler.transform(dffeatures_multi_pvvision[usable_cols]),
        columns=usable_cols,
    )
    gt_scaled = pd.DataFrame(
        gt_scaler.transform(dffeatures_gt[usable_cols]), columns=usable_cols
    )

    # The two sets have different row counts; pad the shorter one with NaN.
    # Rows are not matched crack-to-crack, only the distributions are compared.
    max_len = max(len(pred_scaled), len(gt_scaled))
    for col in usable_cols:
        label = col.strip()
        feature_pairs.append(
            pd.DataFrame(
                {
                    f"Predicted {label}": pred_scaled[col].reindex(range(max_len)),
                    f"Ground Truth {label}": gt_scaled[col].reindex(range(max_len)),
                }
            )
        )

if not feature_pairs:
    print("No feature data available for violin plots.")
    feature_violin_df = pd.DataFrame()
else:
    feature_violin_df = pd.concat(feature_pairs, axis=1)
    feature_violin_df = feature_violin_df.apply(pd.to_numeric, errors="coerce")
    feature_violin_df = feature_violin_df.dropna(axis=1, how="all")

    if feature_violin_df.empty:
        print("No numeric feature data available for violin plots.")
    else:
        # Render comparison violins
        vp = violinplt(feature_violin_df, figsize=(10, 5))
        plt.title("pv-vision")

feature_violin_df

[59]:

	Predicted prop.perimeter	Ground Truth prop.perimeter	Predicted slope	Ground Truth slope	Predicted prop.area	Ground Truth prop.area	Predicted prop.orientation	Ground Truth prop.orientation
0	0.224055	0.057378	0.551249	0.420981	0.216021	0.047678	0.277144	0.063179
1	0.048634	0.053472	0.556411	0.483016	0.075988	0.041025	0.282431	0.164383
2	0.026501	0.031365	0.513440	0.442967	0.040088	0.030430	0.145335	0.224965
3	0.013495	0.009654	0.640013	0.616562	0.019907	0.009425	0.919853	0.963567
4	0.007507	0.178485	0.612516	0.442391	0.016474	0.148515	0.605911	0.185770
...	...	...	...	...	...	...	...	...
1168	0.018054	NaN	0.626138	NaN	0.036793	NaN	0.738026	NaN
1169	0.012385	NaN	0.602619	NaN	0.018808	NaN	0.542269	NaN
1170	0.001909	NaN	0.631208	NaN	0.006796	NaN	0.664104	NaN
1171	0.131226	NaN	0.599652	NaN	0.177993	NaN	0.994440	NaN
1172	0.001499	NaN	0.592021	NaN	0.012287	NaN	0.003767	NaN

1173 rows × 8 columns

../_images/Examples_example_download_run_multisolsegment_50_1.png

[ ]: