Evaluate nuclei segmentation

In this document, we showcase how to use STimage-1K4M to evaluate nuclei segmentation and cell type assignment. We note here that we don’t have pixel level cell type information. With the help of gene expression data, we can use cell type deconvolution to infer the cell type proportion (see a summary for deconvolution methods here). Here we used RCTD (PMID 33603203) to infer the cell type proportion of Human_Breast_Andersson_10142021_ST_H1 with the help of an external scRNA-seq reference (PMID 34493872).

Here, the cell segmentation method we employed is CellViT. We compare the performance of pretrained CellViT-256 and CellViT-SAM to the RCTD inferred cell type proportion.

First, use this code to convert STimage-1K4M image to OpenSlide image.

convert_tiff.py

from PIL import Image
import sys
import pyvips
import numpy as np
from openslide import OpenSlide
import tifffile

Image.MAX_IMAGE_PIXELS = None
img_path = sys.argv[1]
img = np.array(Image.open(img_path))

if img.shape[0] == 3:
    img = np.transpose(img, axes=(1, 2, 0))
elif img.shape[2] == 4: # RGBA to RGB
    img = img[:,:,:3]

pyvips_img = pyvips.Image.new_from_array(img)
save_path=sys.argv[2]

pixel_size = float(sys.argv[3]) # 1 pixel = ? mu m

pyvips_img.tiffsave(
    save_path, 
    bigtiff=False, 
    pyramid=True, 
    tile=True, 
    tile_width=256, 
    tile_height=256, 
    compression='jpeg', 
    resunit=pyvips.enums.ForeignTiffResunit.CM,
    xres=1. / (pixel_size * 1e-4),
    yres=1. / (pixel_size * 1e-4))
python3 convert_tiff.py \
./ST/image/Human_Breast_Andersson_10142021_ST_H1.png \
Human_Breast_Andersson_10142021_ST_H1.tif \
0.688

tifftools set -y -s ImageDescription  "Aperio Fake |AppMag = 20|MPP = 0.68" Human_Breast_Andersson_10142021_ST_H1.tif

Then we can use CellViT to do cell segmentation.

cd CellViT
python3 ./preprocessing/patch_extraction/main_extraction.py --config preprocessing_example.yaml


python3 ./cell_segmentation/inference/cell_detection.py \
  --model ./models/pretrained/CellViT-SAM-H-x20.pth\
  --gpu 0 \
  --geojson \
  --magnification 20. \
  process_wsi \
  --wsi_path Human_Breast_Andersson_10142021_ST_H1.tif \
  --patched_slide_path ./preprocessing/Human_Breast_Andersson_10142021_ST_H1

cp ./preprocessing/Human_Breast_Andersson_10142021_ST_H1 ./preprocessing0/Human_Breast_Andersson_10142021_ST_H1

python3 ./cell_segmentation/inference/cell_detection.py \
  --model ./models/pretrained/CellViT-256-x20.pth\
  --gpu 0 \
  --geojson \
  --magnification 20. \
  process_wsi \
  --wsi_path Human_Breast_Andersson_10142021_ST_H1.tif \
  --patched_slide_path ./preprocessing0/Human_Breast_Andersson_10142021_ST_H1

preprocessing_example.yaml

# dataset paths
wsi_paths: Human_Breast_Andersson_10142021_ST_H1.tif
output_path: ./preprocessing
wsi_extension: tif
# basic setups
patch_size: 1024
patch_overlap: 6.25
target_mag: 20
processes: 8
overwrite: True

# macenko stain normalization
normalize_stains: True

# finding patches
min_intersection_ratio: 0.05

After cell segmentation, we compare the spot-level cell type proportion. In order to do that, we need to aggregate the cells segmented by CellViT in each spot.


import json
from pathlib import Path
from typing import List, Tuple, Union

import argparse
import inspect
import os
import sys

import numpy as np
import torch
import torch.nn.functional as F
import tqdm
import yaml
from matplotlib import pyplot as plt
from PIL import Image, ImageDraw
from skimage.color import rgba2rgb
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
from torchmetrics.functional import dice
from torchmetrics.functional.classification import binary_jaccard_index
from torchvision import transforms

import pandas as pd
import json


with open('./preprocessing/Human_Breast_Andersson_10142021_ST_H1/cell_detection/cell_detection.json') as json_data:
    data = json.load(json_data)

# load image
image_full = Image.open('./Human_Breast_Andersson_10142021_ST_H1.tif')
image_full = image_full.convert('RGB')

# get the cell types of the cells and plot the contour of the cells
with open('./preprocessing0/Human_Breast_Andersson_10142021_ST_H1/cell_detection/cells.geojson', encoding='utf-8') as f:
    gj = json.load(f)

gj = pd.DataFrame(gj) 

# plot the cells with cell type
fig, ax = plt.subplots(1, 1, figsize=(100, 100))
ax.imshow(image_full)

for i in range(5):
    contour = gj.iloc[i,]['geometry']['coordinates']
    color = gj.iloc[i,]['properties']['classification']['color']
    color = (color[0]/255, color[1]/255, color[2]/255)
    cell_type = gj.iloc[i,]['properties']['classification']['name']
    for j in range(len(contour)):
        contour_cell = np.array(contour[j])[0]
        ax.plot(contour_cell[:,0], contour_cell[:,1], color=color,label = cell_type)

fig.savefig('.segmentation/cell.png')


# get the cell types of the cells
with open('./preprocessing0/Human_Breast_Andersson_10142021_ST_H1/cell_detection/cell_detection.geojson', encoding='utf-8') as f:
    gj = json.load(f)

gj = pd.DataFrame(gj)
data_organized = pd.DataFrame()

for i in range(5):
    temp = pd.DataFrame(gj.iloc[i,]['geometry']['coordinates'])
    temp['cell_type'] = gj.iloc[i,]['properties']['classification']['name']
    data_organized = pd.concat([data_organized,temp])

# organize data
data_organized.columns = ['x','y','cell_type']

coord = pd.read_csv('./ST/coord/Human_Breast_Andersson_10142021_ST_H1_coord.csv', index_col=0)

# only keep the spot within the Cell-ViT inferred region
coord_keep = coord[coord['xaxis']-coord['r'] >= data_organized.x.min()]
coord_keep = coord_keep[coord_keep['xaxis']+coord_keep['r'] <= data_organized.x.max()]
coord_keep = coord_keep[coord_keep['yaxis']-coord_keep['r'] >= data_organized.y.min()]
coord_keep = coord_keep[coord_keep['yaxis']+coord_keep['r'] <= data_organized.y.max()]

index_celltype = ['Neoplastic', 'Inflammatory', 'Connective', 'Dead', 'Epithelial']
inferred = np.zeros((coord_keep.shape[0],5))

# aggregate cell number in each spot
for i in range(coord_keep.shape[0]):
    xaxis = coord_keep.iloc[i,]['xaxis']
    yaxis = coord_keep.iloc[i,]['yaxis']
    radius = coord_keep.iloc[i,]['r']
    cell_in = data_organized[(data_organized['x']-xaxis)**2 + (data_organized['y']-yaxis)**2 <= radius**2]
    if cell_in.shape[0] == 0:
        continue
    else:
        for j in range(5):
            inferred[i,j] = cell_in[cell_in['cell_type'] == index_celltype[j]].shape[0]
    print(i)


coord_keep['Neoplastic'] = inferred[:,0]
coord_keep['Inflammatory'] = inferred[:,1]
coord_keep['Connective'] = inferred[:,2]
coord_keep['Dead'] = inferred[:,3]
coord_keep['Epithelial'] = inferred[:,4]

coord_keep.to_csv('.segmentation/cell_type_256.csv')

# Then repeat the same code for SAM based model.