Ken Furudate
import scanpy as sc
import stlearn as st
import scvi
import scib
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt
%matplotlib inline
import os
from pathlib import Path
import seaborn as sns
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
import warnings
warnings.filterwarnings('ignore')
scvi.settings.seed = 42
sample_name1="A"
sample_name2="B"
sample_name3="C"
datadir="/data/spatial/"
in_f1 = "SampleA_cluster_patho.annot_region.txt"
in_f2 = "SampleB_cluster_patho.annot_region.txt"
in_f3 = "SampleC_cluster_patho.annot_region.txt"
adata = st.Read10X(datadir + f"{sample_name1}/")
adata.var_names_make_unique()
adata.var_names
adata
bdata = st.Read10X(datadir + f"{sample_name2}/")
bdata.var_names_make_unique()
bdata.var_names
bdata
cdata = st.Read10X(datadir + f"{sample_name3}/")
cdata.var_names_make_unique()
cdata.var_names
cdata
a_annot = pd.read_table(os.path.join(datadir, "patho_annot", in_f1), index_col="Barcode")
b_annot = pd.read_table(os.path.join(datadir, "patho_annot", in_f2), index_col="Barcode")
c_annot = pd.read_table(os.path.join(datadir, "patho_annot", in_f3), index_col="Barcode")
adata.obs['sample'] = "A"
bdata.obs['sample'] = "B"
cdata.obs['sample'] = "C"
adata_tumor = adata[adata.obs['category']=="tumor"]
bdata_tumor = bdata[bdata.obs['category']=="tumor"]
cdata_tumor = cdata[cdata.obs['category']=="tumor"]
adata_tumor
adata_uns_spatial = adata_tumor.uns["spatial"]
bdata_uns_spatial = bdata_tumor.uns["spatial"]
cdata_uns_spatial = cdata_tumor.uns["spatial"]
sc.pp.filter_cells(adata_tumor, min_counts=500)
sc.pp.filter_cells(bdata_tumor, min_counts=500)
sc.pp.filter_cells(cdata_tumor, min_counts=500)
sc.pp.filter_genes(adata_tumor, min_cells=3)
sc.pp.filter_genes(bdata_tumor, min_cells=3)
sc.pp.filter_genes(cdata_tumor, min_cells=3)
data = adata_tumor.concatenate(bdata_tumor)
data = data.concatenate(cdata_tumor)
data.layers["counts"] = data.X.copy()
sc.pp.normalize_total(data, target_sum=1e4)
sc.pp.log1p(data)
data.raw = data # keep full dimension safe
sc.pp.highly_variable_genes(
data,
flavor="seurat_v3",
n_top_genes=4000,
layer="counts",
batch_key="sample",
subset=True
)
scvi.data.setup_anndata(data,
layer="counts",
batch_key="sample"
)
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
device
model = scvi.model.SCVI(data,
n_hidden=128,
n_latent=30,
n_layers=2,
dropout_rate=0.2,
dispersion="gene",
gene_likelihood="nb"
)
model.to_device(device)
model.train(use_gpu=True)
scvi.data.view_anndata_setup(model.adata)
model.save(outdir+f"scvi.train.model.{Version}", overwrite=True, save_anndata=True)
data.obsm["X_scvi"] = model.get_latent_representation()
data.layers["scvi_expr"] = model.get_normalized_expression(data)
try:
sc.pp.neighbors(data, use_rep="X_scvi")
sc.tl.leiden(data, key_added='leiden', resolution=0.5)
sc.tl.umap(data)
data
sc.set_figure_params(facecolor="white", figsize=(12, 8), dpi_save=300)
sc.pl.umap(data,
color=["sample", "pathology", "leiden"],
frameon=False,
ncols=1,
save=f"{DAY}_{sample_name}.After_batch_effect_correction.spatialplot.{Version}.pdf"
)
sc.tl.embedding_density(data, groupby="sample", key_added="sample_density")
sc.set_figure_params(facecolor="white", figsize=(10, 8), dpi_save=300)
sc.pl.embedding_density(data,
key="sample_density",
save=f"{DAY}_{sample_name}.sampledensity.batch_effect_correction.{Version}.pdf")
The change
mode follows protocol described in Boyeau et al.
# one vs all, over all groups
full_de_res = model.differential_expression(groupby='leiden',
mode='change', # default
fdr_target=0.1)
full_de_res.head(20)
from copy import deepcopy
markers = {}
cats = data.obs.leiden.cat.categories
dup_lst = []
for i, c in enumerate(cats):
cnt = 0
gene_lst = []
cid = "{} vs Rest".format(c)
cell_type_df = full_de_res.loc[full_de_res.comparison == cid]
cell_type_df = cell_type_df[cell_type_df["is_de_fdr_0.1"] == True]
cell_type_df = cell_type_df.sort_values("lfc_mean", ascending=False)
# those genes with higher expression in group 1
cell_type_df = cell_type_df[cell_type_df.lfc_mean > 0]
# significance
cell_type_df = cell_type_df[cell_type_df["bayes_factor"] >= 2]
# genes with sufficient expression
cell_type_df = cell_type_df[cell_type_df["non_zeros_proportion1"] >= 0.1]
while len(gene_lst) < 5:
if cell_type_df.index[cnt] in dup_lst:
cnt += 1
else:
gene_lst.append(cell_type_df.index[cnt])
cnt += 1
markers[c] = deepcopy(gene_lst)
dup_lst += deepcopy(gene_lst)
markers
sc.tl.dendrogram(data,
groupby='leiden',
cor_method='pearson', # default
linkage_method='complete', # default
inplace=True
)
sc.set_figure_params(facecolor="white", figsize=(20, 8), dpi_save=300)
import matplotlib.font_manager
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman'] + plt.rcParams['font.serif']
plt.rcParams["font.size"] = 16
sc.pl.matrixplot(
data,
markers,
groupby='leiden',
standard_scale="var",
layer="scvi_expr",
dendrogram=True,
save=f"{DAY}_{sample_name}.scvi.DE.leiden.matrixplot.{Version}.pdf"
)
# First, we need input_df
select_col1 = "leiden"
select_col2 = "sample"
input_df = data.obs[[select_col1, select_col2 ]]
input_df
# One hot enchoding
one_hot_df = pd.get_dummies(input_df, columns=[select_col2])
one_hot_df.columns = ["cluster", "sample_A", "sample_B", "sample_C"]
one_hot_df
one_hot_df = one_hot_df.groupby(by="cluster").sum()
one_hot_df
sns.set(font_scale=1.5, style='white')
plt.figure(figsize=(20, 10), dpi=300)
one_hot_df.plot.bar(stacked=True)
plt.legend(fontsize=16, loc='upper right', bbox_to_anchor=(1, 1))
plt.xticks(rotation=0)
plt.xlabel('cluster')
sns.despine()
plt.show()
input_df = one_hot_df.copy()
clusters = list(input_df.index)
for cluster_ in clusters:
Sum_cluser = one_hot_df.sum(axis=1)[cluster_]
N_of_cluster = one_hot_df.iloc[cluster_]
input_df.iloc[cluster_] = N_of_cluster/Sum_cluser
input_df.iloc[cluster_] *= 100
input_df
input_df_sort = input_df.sort_values(by="sample_C")
input_df_sort
sns.set(font_scale=1.5, style='white')
plt.figure(figsize=(20, 10), dpi=300)
input_df_sort.plot.bar(stacked=True)
plt.legend(fontsize=16, loc='upper right', bbox_to_anchor=(1.2, 1))
plt.xticks(rotation=0)
plt.xlabel('cluster')
sns.despine()
plt.show()
# First, we need input_df
select_col1 = "leiden"
select_col2 = "pathology"
input_df = data.obs[[select_col1, select_col2 ]]
input_df
# One hot enchoding
one_hot_df = pd.get_dummies(input_df, columns=[select_col2])
one_hot_df.columns = ["cluster", "CA_MD_to_WD", "CA_PD", "CA_PD_&_fibrosis", "CA_PD_&_muscle", "CA_fibrosis_&_lymphocyte"]
one_hot_df["cluster"] = one_hot_df["cluster"].astype(int)
# check data type
print(one_hot_df.dtypes)
one_hot_df = one_hot_df.groupby(by="cluster").sum()
one_hot_df
sns.set(font_scale=1.5, style='white')
plt.figure(figsize=(20, 10), dpi=300)
one_hot_df.plot.bar(stacked=True)
plt.legend(fontsize=16, loc='upper right', bbox_to_anchor=(1, 1))
plt.xticks(rotation=0)
plt.xlabel('cluster')
sns.despine()
plt.show()
adata = data[data.obs['sample']=="A"]
bdata = data[data.obs['sample']=="B"]
cdata = data[data.obs['sample']=="C"]
adata.uns["spatial"] = adata_uns_spatial
bdata.uns["spatial"] = bdata_uns_spatial
cdata.uns["spatial"] = cdata_uns_spatial
sc.pl.spatial(adata=adata,
color=['cluster', 'leiden', 'pathology'],
save=f"{DAY}_{sample_name1}.cluster_pathlogy_category.spatialplot.{Version}.pdf"
)
sc.pl.spatial(adata=bdata,
color=['cluster', 'leiden', 'pathology'],
save=f"{DAY}_{sample_name2}.cluster_pathlogy_category.spatialplot.{Version}.pdf"
)
sc.pl.spatial(adata=cdata,
color=['cluster', 'leiden', 'pathology'],
save=f"{DAY}_{sample_name3}.cluster_pathlogy_category.spatialplot.{Version}.pdf"
)