Gene expression processing

For integration with gene expression, we reduced the dimension of gene expression by selecting highly variable genes (HVGs). We explored two strategies: highly variable genes (HVG) selected separately from each slide, and HVGs selected from overlapping genes across slides (overlap-HVG)

1. HVG

(1) Find HVG of each slide
#python
import pandas as pd
import scanpy as sc
import sys


meta = pd.read_csv('meta_all_gene.csv',sep='\t')
path = '.'

# load data
i = int(sys.argv[1])
slide = meta['slide'][i]
gene_exp_slide = pd.read_csv(f'{path}/{meta.tech[i]}/gene_exp/{meta.slide[i]}_count.csv',sep=',',index_col=0)
adata = sc.AnnData(gene_exp_slide)
adata.var_names_make_unique()

sc.pp.filter_cells(adata, min_genes=1)
sc.experimental.pp.highly_variable_genes(adata, n_top_genes=128)

# sort genes by highly_variable
adata.var_names_make_unique()
hvg_list = adata.var['highly_variable_rank']
hvg_list = hvg_list.sort_values()
hvg_list = hvg_list.dropna()

adata_hvg = adata[:, hvg_list.index]
sc.pp.normalize_total(adata_hvg)
sc.pp.log1p(adata_hvg)
hvg = adata_hvg.X
hvg = pd.DataFrame(hvg)
hvg.index = adata_hvg.obs.index
hvg.to_csv(f'./HVG/{meta.slide[i]}_count_hvg.csv',sep=',')
(2) Combined HVGs

Suppose we want to combine all HVGs for human brain slides, we first extract the slide names for all human brain slides.

#python
data = meta.loc[(meta['species'] == 'human') & (meta['tissue'] == 'brain'),:]
data.slide.to_csv(f'human_brain_slide.csv',index=False,header=False)

Then we combine them together.

#bash
type='human_brain'
path='./'
rm ${type}_count_hvg.csv
while read slide;do
    if [ -f ${path}/HVG/${slide}_count_hvg.csv ]; then
        cat ${path}/HVG/${slide}_count_hvg.csv | tail -n +2 >> ./${type}_count_hvg.csv
    fi
    echo $slide
done < ${type}_slide.csv

2. Overlap-HVG

Here we showcase how to find Overlap-HVG in human brain.

(1) Find overlap gene
# python
# here we find the overlap genes in human brain
data = meta.loc[(meta['species'] == 'human') & (meta['tissue'] == 'brain'),:]
data.index = range(len(data.index))

for i in range(len(data.index)):
    gene_exp_slide = pd.read_csv(f'{path}/{data.tech[i]}/gene_exp/{data.slide[i]}_count.csv',sep=',',nrows=1,index_col=0)
    if i == 0:
        gene_name_overlap = gene_exp_slide.columns
    else:
        gene_name_overlap = gene_name_overlap.intersection(gene_exp_slide.columns)
    print(i)

pd.DataFrame(gene_name_overlap).to_csv('human_brain_gene.csv',index=False,header=False)
(2) Extract gene expression for each slide

Here we extract human brain overlap genes for all human slides.

meta = pd.read_csv('meta_all.csv',sep='\t')
path = '.'

species = 'human'
tissue = 'all'
gene_list = 'human_brain_gene'

if tissue == 'all':
    data = meta.loc[(meta['species'] == species),:]
    data.slide.to_csv(f'{species}_slide.csv',index=False,header=False)
else:
    data = meta.loc[(meta['species'] == species) & (meta['tissue'] == tissue),:]
    data.slide.to_csv(f'{species}_{tissue}_slide.csv',index=False,header=False)


data.index = range(len(data.index))

for index in range(data.index):
    slide = data['slide'][index]
    gene_exp_slide = pd.read_csv(f'{path}/{data.tech[index]}/gene_exp/{data.slide[index]}_count.csv',sep=',',index_col=0)
    overlap_gene = pd.read_csv(f'{gene_list}.csv',header=None)
    gene_exp_slide = gene_exp_slide.loc[:,overlap_gene[0]]
    gene_exp_slide.to_csv(f'./overlap-hvg/{data.slide[index]}_{gene_list}.csv',sep=',')
(3) Combine all gene expression
#bash
type='human'
path='./overlap-hvg/'
rm /proj/yunligrp/users/jiawen/spatial_omics_data/organized_data_gene_exp/all/${type}_count_overlap.csv
while read slide;do
    if [ -f ${path}/${slide}_${type}_gene.csv ]; then
        cat ${path}/${slide}_${type}_gene.csv | tail -n +2 >> ${type}_count_overlap.csv
    else
        echo $slide
    fi
done < ${type}_slide.csv

(4) Extract HVG from combined gene expression
# python
import sys
import pandas as pd
import numpy as np
import scanpy as sc
import os

path='.'

def overlap_hvg(tissue_type):
    # Here we take input of combined gene expression of one tissue type
    # see below for how to generate this file
    gene_exp_slide = pd.read_csv(f'{path}/{tissue_type}_count_overlap.csv',sep=',',index_col=0,header=None)
    gene_exp_slide.columns = ['gene'+str(i) for i in range(gene_exp_slide.shape[1])]
    adata = sc.AnnData(gene_exp_slide)
    sc.pp.filter_cells(adata, min_genes=1)
    sc.pp.filter_genes(adata, min_cells=1)
    # Normalizing to median total counts
    sc.pp.normalize_total(adata)
    # Logarithmize the data
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=100)
    # sort genes by highly_variable
    adata_hvg = adata[:, adata.var.highly_variable]
    hvg = adata_hvg.X
    hvg = pd.DataFrame(hvg)
    hvg.index = adata_hvg.obs.index
    hvg.to_csv(f'./all/{tissue_type}_count_overlap_hvg.csv',sep=',')

tissue_type = sys.argv[1]
overlap_hvg(tissue_type)