Ken Furudate

import scanpy as sc
import stlearn as st

import scvi
import scib

import numpy as np
import pandas as pd

from tqdm import tqdm
from matplotlib import pyplot as plt
%matplotlib inline

import os
from pathlib import Path

import seaborn as sns

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import warnings
warnings.filterwarnings('ignore')

scvi.settings.seed = 42

Global seed set to 42

sample_name1="A"
sample_name2="B"
sample_name3="C"

datadir="/data/spatial/"

in_f1 = "SampleA_cluster_patho.annot_region.txt"
in_f2 = "SampleB_cluster_patho.annot_region.txt"
in_f3 = "SampleC_cluster_patho.annot_region.txt"

adata = st.Read10X(datadir + f"{sample_name1}/")
adata.var_names_make_unique()
adata.var_names
adata

bdata = st.Read10X(datadir + f"{sample_name2}/")
bdata.var_names_make_unique()
bdata.var_names
bdata

cdata = st.Read10X(datadir + f"{sample_name3}/")
cdata.var_names_make_unique()
cdata.var_names
cdata

a_annot = pd.read_table(os.path.join(datadir, "patho_annot", in_f1), index_col="Barcode")
b_annot = pd.read_table(os.path.join(datadir, "patho_annot", in_f2), index_col="Barcode")
c_annot = pd.read_table(os.path.join(datadir, "patho_annot", in_f3), index_col="Barcode")

adata.obs['sample'] = "A"
bdata.obs['sample'] = "B"
cdata.obs['sample'] = "C"

adata_tumor = adata[adata.obs['category']=="tumor"]
bdata_tumor = bdata[bdata.obs['category']=="tumor"]
cdata_tumor = cdata[cdata.obs['category']=="tumor"]

adata_tumor

View of AnnData object with n_obs × n_vars = 984 × 17943
    obs: 'in_tissue', 'array_row', 'array_col', 'imagecol', 'imagerow', 'pathology', 'category', 'cluster', 'sample'
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'spatial'
    obsm: 'spatial'

adata_uns_spatial = adata_tumor.uns["spatial"]
bdata_uns_spatial = bdata_tumor.uns["spatial"]
cdata_uns_spatial = cdata_tumor.uns["spatial"]

sc.pp.filter_cells(adata_tumor, min_counts=500)
sc.pp.filter_cells(bdata_tumor, min_counts=500)
sc.pp.filter_cells(cdata_tumor, min_counts=500)

sc.pp.filter_genes(adata_tumor, min_cells=3)
sc.pp.filter_genes(bdata_tumor, min_cells=3)
sc.pp.filter_genes(cdata_tumor, min_cells=3)

filtered out 1 cells that have less than 500 counts
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
filtered out 18 cells that have less than 500 counts
Trying to set attribute `.obs` of view, copying.
filtered out 213 genes that are detected in less than 3 cells
filtered out 738 genes that are detected in less than 3 cells
filtered out 136 genes that are detected in less than 3 cells

data = adata_tumor.concatenate(bdata_tumor)

data = data.concatenate(cdata_tumor)

data.layers["counts"] = data.X.copy()

sc.pp.normalize_total(data, target_sum=1e4)
sc.pp.log1p(data)
data.raw = data  # keep full dimension safe

normalizing counts per cell
    finished (0:00:00)

sc.pp.highly_variable_genes(
    data,
    flavor="seurat_v3",
    n_top_genes=4000,
    layer="counts",
    batch_key="sample",
    subset=True
    )

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)

scvi.data.setup_anndata(data, 
                        layer="counts", 
                        batch_key="sample"
                        )

INFO     Using batches from adata.obs["sample"]                                              
INFO     No label_key inputted, assuming all cells have same label                           
INFO     Using data from adata.layers["counts"]                                              
INFO     Successfully registered anndata object containing 3637 cells, 4000 vars, 3 batches, 
         1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra
         continuous covariates.                                                              
INFO     Please do not further modify adata until model is trained.

import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

model = scvi.model.SCVI(data,
                        n_hidden=128,
                        n_latent=30,
                        n_layers=2,
                        dropout_rate=0.2,
                        dispersion="gene",
                        gene_likelihood="nb"
                        )
model.to_device(device)
model.train(use_gpu=True)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

Epoch 400/400: 100%|██████████| 400/400 [02:45<00:00,  2.42it/s, loss=3.14e+03, v_num=1]

scvi.data.view_anndata_setup(model.adata)

Anndata setup with scvi-tools version 0.14.5.

              Data Summary              
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
┃             Data             ┃ Count ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩
│            Cells             │ 3637  │
│             Vars             │ 4000  │
│            Labels            │   1   │
│           Batches            │   3   │
│           Proteins           │   0   │
│ Extra Categorical Covariates │   0   │
│ Extra Continuous Covariates  │   0   │
└──────────────────────────────┴───────┘

             SCVI Data Registry              
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃     Data      ┃    scvi-tools Location    ┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│       X       │  adata.layers['counts']   │
│ batch_indices │ adata.obs['_scvi_batch']  │
│    labels     │ adata.obs['_scvi_labels'] │
└───────────────┴───────────────────────────┘

                        Label Categories                        
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃      Source Location      ┃ Categories ┃ scvi-tools Encoding ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ adata.obs['_scvi_labels'] │     0      │          0          │
└───────────────────────────┴────────────┴─────────────────────┘

                     Batch Categories                     
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃   Source Location   ┃ Categories ┃ scvi-tools Encoding ┃
┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ adata.obs['sample'] │     A      │          0          │
│                     │     B      │          1          │
│                     │     C      │          2          │
└─────────────────────┴────────────┴─────────────────────┘

model.save(outdir+f"scvi.train.model.{Version}", overwrite=True, save_anndata=True)

... storing 'pathology' as categorical
... storing 'category' as categorical
... storing 'cluster' as categorical
... storing 'sample' as categorical
... storing 'feature_types' as categorical
... storing 'genome' as categorical
... storing 'feature_types' as categorical
... storing 'genome' as categorical

data.obsm["X_scvi"] = model.get_latent_representation()

data.layers["scvi_expr"] = model.get_normalized_expression(data)

try:
  sc.pp.neighbors(data, use_rep="X_scvi")
  sc.tl.leiden(data, key_added='leiden', resolution=0.5)
  sc.tl.umap(data)

computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:11)
running Leiden clustering
    finished: found 12 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:00)

data

AnnData object with n_obs × n_vars = 3637 × 4000
    obs: 'in_tissue', 'array_row', 'array_col', 'imagecol', 'imagerow', 'pathology', 'category', 'cluster', 'sample', 'n_counts', 'batch', '_scvi_batch', '_scvi_labels', 'leiden'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells-0-0', 'n_cells-1-0', 'n_cells-1', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'log1p', 'hvg', '_scvi', 'neighbors', 'leiden', 'umap'
    obsm: 'spatial', 'X_scvi', 'X_umap'
    layers: 'counts', 'scvi_expr'
    obsp: 'distances', 'connectivities'

sc.set_figure_params(facecolor="white", figsize=(12, 8), dpi_save=300)
sc.pl.umap(data, 
           color=["sample", "pathology", "leiden"],
           frameon=False,
           ncols=1,
           save=f"{DAY}_{sample_name}.After_batch_effect_correction.spatialplot.{Version}.pdf"
           )

WARNING: saving figure to file figures/umap20220220_A2B2C2_tumor.After_batch_effect_correction.spatialplot.v20.pdf

sc.tl.embedding_density(data, groupby="sample", key_added="sample_density")

sc.set_figure_params(facecolor="white", figsize=(10, 8), dpi_save=300)
sc.pl.embedding_density(data, 
                        key="sample_density",
                        save=f"{DAY}_{sample_name}.sampledensity.batch_effect_correction.{Version}.pdf")

computing density on 'umap'
--> added
    'sample_density', densities (adata.obs)
    'sample_density_params', parameter (adata.uns)
WARNING: saving figure to file figures/sample_density_20220220_A2B2C2_tumor.sampledensity.batch_effect_correction.v20.pdf

scVI DE¶

The change mode follows protocol described in Boyeau et al.

# one vs all, over all groups
full_de_res = model.differential_expression(groupby='leiden', 
                                            mode='change', # default
                                            fdr_target=0.1)
full_de_res.head(20)

DE...: 100%|██████████| 12/12 [00:52<00:00,  4.41s/it]

from copy import deepcopy
markers = {}

cats = data.obs.leiden.cat.categories

dup_lst = []

for i, c in enumerate(cats):
    cnt = 0
    gene_lst = []
    cid = "{} vs Rest".format(c)
    cell_type_df = full_de_res.loc[full_de_res.comparison == cid]
    cell_type_df = cell_type_df[cell_type_df["is_de_fdr_0.1"] == True]
    cell_type_df = cell_type_df.sort_values("lfc_mean", ascending=False)

    # those genes with higher expression in group 1
    cell_type_df = cell_type_df[cell_type_df.lfc_mean > 0]

    # significance
    cell_type_df = cell_type_df[cell_type_df["bayes_factor"] >= 2]

    # genes with sufficient expression
    cell_type_df = cell_type_df[cell_type_df["non_zeros_proportion1"] >= 0.1]

    while len(gene_lst) < 5:
      if cell_type_df.index[cnt] in dup_lst: 
        cnt += 1
      else: 
        gene_lst.append(cell_type_df.index[cnt])
        cnt += 1

    markers[c] = deepcopy(gene_lst)
    dup_lst += deepcopy(gene_lst)
    


markers

{'0': ['DES', 'TNNI1', 'CKM', 'SOX11', 'COL11A1'],
 '1': ['IGHD', 'NCCRP1', 'SPIB', 'CD22', 'XIRP2'],
 '10': ['SAMD11', 'KRT4', 'CYP7A1', 'CCR5', 'OR4N2'],
 '11': ['TCAP', 'ACTC1', 'KLHL40', 'TRIM72', 'MYLPF'],
 '2': ['FLG', 'MFAP5', 'KRTAP2-3', 'CPA4', 'CNTNAP2'],
 '3': ['PAEP', 'CCL21', 'NLRP14', 'HMGA2', 'ACTN2'],
 '4': ['PI3', 'SERPINB3', 'S100A9', 'TMPRSS11D', 'MSLN'],
 '5': ['VPREB3', 'DUXA', 'EIF5AL1', 'XCL1', 'LIPJ'],
 '6': ['IGLV9-49', 'IGHG4', 'IGLV3-1', 'IGHA1', 'IGHV5-10-1'],
 '7': ['IGLV6-57', 'KRT13', 'IGKC', 'IGHG2', 'MPPED2'],
 '8': ['SPINK6', 'CXCL14', 'IGHV6-1', 'PTPRZ1', 'KRT1'],
 '9': ['STRIP2', 'FGFBP1', 'BLNK', 'CLCA2', 'LRRC15']}

sc.tl.dendrogram(data, 
                 groupby='leiden',
                 cor_method='pearson', # default
                 linkage_method='complete', # default
                 inplace=True
                 )

WARNING: You’re trying to run this on 4000 dimensions of `.X`, if you really want this, set `use_rep='X'`.
         Falling back to preprocessing with `sc.pp.pca` and default params.
computing PCA
    with n_comps=50
    finished (0:00:08)
Storing dendrogram info using `.uns['dendrogram_leiden']`

sc.set_figure_params(facecolor="white", figsize=(20, 8), dpi_save=300)

import matplotlib.font_manager
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman'] + plt.rcParams['font.serif']
plt.rcParams["font.size"] = 16


sc.pl.matrixplot(
    data, 
    markers,
    groupby='leiden',
    standard_scale="var",
    layer="scvi_expr",
    dendrogram=True,
    save=f"{DAY}_{sample_name}.scvi.DE.leiden.matrixplot.{Version}.pdf"
    )

WARNING: saving figure to file figures/matrixplot_20220220_A2B2C2_tumor.scvi.DE.leiden.matrixplot.v20.pdf

# First, we need input_df

select_col1 = "leiden"
select_col2 = "sample"
input_df = data.obs[[select_col1, select_col2 ]]
input_df

# One hot enchoding
one_hot_df = pd.get_dummies(input_df, columns=[select_col2])
one_hot_df.columns = ["cluster", "sample_A", "sample_B", "sample_C"]
one_hot_df

one_hot_df = one_hot_df.groupby(by="cluster").sum()
one_hot_df

sns.set(font_scale=1.5, style='white')
plt.figure(figsize=(20, 10), dpi=300)
one_hot_df.plot.bar(stacked=True)
plt.legend(fontsize=16, loc='upper right', bbox_to_anchor=(1, 1))
plt.xticks(rotation=0)
plt.xlabel('cluster')
sns.despine()
plt.show()

<Figure size 6000x3000 with 0 Axes>

input_df = one_hot_df.copy()
clusters = list(input_df.index)

for cluster_ in clusters:
  Sum_cluser = one_hot_df.sum(axis=1)[cluster_]
  N_of＿cluster = one_hot_df.iloc[cluster_]
  input_df.iloc[cluster_] = N_of＿cluster/Sum_cluser
  input_df.iloc[cluster_] *= 100
  
input_df

input_df_sort = input_df.sort_values(by="sample_C")
input_df_sort

sns.set(font_scale=1.5, style='white')
plt.figure(figsize=(20, 10), dpi=300)
input_df_sort.plot.bar(stacked=True)
plt.legend(fontsize=16, loc='upper right', bbox_to_anchor=(1.2, 1))
plt.xticks(rotation=0)
plt.xlabel('cluster')
sns.despine()
plt.show()

<Figure size 6000x3000 with 0 Axes>

# First, we need input_df

select_col1 = "leiden"
select_col2 = "pathology"
input_df = data.obs[[select_col1, select_col2 ]]
input_df

# One hot enchoding
one_hot_df = pd.get_dummies(input_df, columns=[select_col2])
one_hot_df.columns = ["cluster", "CA_MD_to_WD", "CA_PD", "CA_PD_&_fibrosis", "CA_PD_&_muscle", "CA_fibrosis_&_lymphocyte"]
one_hot_df["cluster"] = one_hot_df["cluster"].astype(int)
# check data type
print(one_hot_df.dtypes)
one_hot_df = one_hot_df.groupby(by="cluster").sum()
one_hot_df

cluster                     int64
CA_MD_to_WD                 uint8
CA_PD                       uint8
CA_PD_&_fibrosis            uint8
CA_PD_&_muscle              uint8
CA_fibrosis_&_lymphocyte    uint8
dtype: object

sns.set(font_scale=1.5, style='white')
plt.figure(figsize=(20, 10), dpi=300)
one_hot_df.plot.bar(stacked=True)
plt.legend(fontsize=16, loc='upper right', bbox_to_anchor=(1, 1))
plt.xticks(rotation=0)
plt.xlabel('cluster')
sns.despine()
plt.show()

<Figure size 6000x3000 with 0 Axes>

adata = data[data.obs['sample']=="A"]
bdata = data[data.obs['sample']=="B"]
cdata = data[data.obs['sample']=="C"]

adata.uns["spatial"] = adata_uns_spatial
bdata.uns["spatial"] = bdata_uns_spatial
cdata.uns["spatial"] = cdata_uns_spatial

Trying to set attribute `._uns` of view, copying.
Trying to set attribute `._uns` of view, copying.
Trying to set attribute `._uns` of view, copying.

sc.pl.spatial(adata=adata, 
              color=['cluster', 'leiden', 'pathology'],
              save=f"{DAY}_{sample_name1}.cluster_pathlogy_category.spatialplot.{Version}.pdf"
              )

WARNING: saving figure to file figures/show20220220_A2.cluster_pathlogy_category.spatialplot.v20.pdf

sc.pl.spatial(adata=bdata, 
              color=['cluster', 'leiden', 'pathology'],
              save=f"{DAY}_{sample_name2}.cluster_pathlogy_category.spatialplot.{Version}.pdf"
              )

WARNING: saving figure to file figures/show20220220_B2.cluster_pathlogy_category.spatialplot.v20.pdf

sc.pl.spatial(adata=cdata, 
              color=['cluster', 'leiden', 'pathology'],
              save=f"{DAY}_{sample_name3}.cluster_pathlogy_category.spatialplot.{Version}.pdf"
              )

WARNING: saving figure to file figures/show20220220_C2.cluster_pathlogy_category.spatialplot.v20.pdf

	proba_de	proba_not_de	bayes_factor	scale1	scale2	delta	lfc_mean	lfc_median	lfc_std	lfc_min	lfc_max	raw_mean1	raw_mean2	non_zeros_proportion1	non_zeros_proportion2	raw_normalized_mean1	raw_normalized_mean2	is_de_fdr_0.1	comparison	group2
IGHD	0.9614	0.0386	3.215138	5.014777e-05	0.000155	0.25	-1.961476	-2.218686	4.679548	-12.442281	13.102363	0.373913	0.302417	0.163478	0.214892	0.613425	1.429468	True	0 vs Rest	Rest
KRT1	0.9612	0.0388	3.209762	1.328588e-05	0.000056	0.25	-2.762277	-2.990346	3.740437	-11.404661	8.144420	0.071304	0.184194	0.062609	0.141737	0.125630	0.490895	True	0 vs Rest	Rest
IGLJ6	0.9574	0.0426	3.112367	2.107955e-06	0.000008	0.25	-1.984414	-2.068377	3.130071	-12.411051	8.842839	0.015652	0.016982	0.015652	0.016329	0.017529	0.075899	True	0 vs Rest	Rest
TNNI1	0.9572	0.0428	3.107474	3.670819e-04	0.000227	0.25	1.782276	1.949461	4.909887	-13.054656	12.058186	3.626084	1.035924	0.570435	0.291313	4.967675	2.516599	True	0 vs Rest	Rest
SCNN1G	0.9568	0.0432	3.097754	2.088196e-06	0.000013	0.25	-2.366047	-2.489151	2.380850	-9.531041	5.713491	0.019130	0.037231	0.019130	0.033638	0.028537	0.113567	True	0 vs Rest	Rest
NCCRP1	0.9564	0.0436	3.088119	3.245742e-05	0.000098	0.25	-2.199116	-2.482291	4.045037	-12.061623	10.157679	0.186087	0.260614	0.139130	0.171130	0.330292	0.912463	True	0 vs Rest	Rest
OR1A2	0.9556	0.0444	3.069100	1.031632e-06	0.000007	0.25	-2.211638	-2.214776	2.158102	-9.141550	5.730095	0.003478	0.017636	0.003478	0.016656	0.003897	0.063990	True	0 vs Rest	Rest
ACTC1	0.9548	0.0452	3.050405	2.031874e-04	0.000111	0.25	1.122663	1.213439	3.782610	-10.068547	9.856674	1.886959	0.481710	0.467826	0.222077	2.365514	1.137099	True	0 vs Rest	Rest
IGHG4	0.9538	0.0462	3.027474	5.267077e-04	0.001180	0.25	-2.009769	-2.003742	5.203235	-12.787839	14.891382	5.542608	5.183572	0.422609	0.589157	5.509517	10.980305	True	0 vs Rest	Rest
IGLV9-49	0.9536	0.0464	3.022945	2.598359e-05	0.000129	0.25	-2.360946	-2.420740	3.946391	-11.608186	10.199312	0.206956	0.628672	0.085217	0.191378	0.233483	1.104792	True	0 vs Rest	Rest
NAA11	0.9532	0.0468	3.013941	1.065899e-06	0.000008	0.25	-1.962109	-2.047962	3.067196	-10.694289	9.241531	0.010435	0.014696	0.010435	0.013390	0.009031	0.095068	True	0 vs Rest	Rest
ACAN	0.9532	0.0468	3.013941	4.309137e-07	0.000003	0.25	-2.434399	-2.467591	2.752172	-10.397037	7.943695	0.003478	0.006858	0.003478	0.006205	0.007703	0.020119	True	0 vs Rest	Rest
CHIT1	0.9530	0.0470	3.009467	2.191571e-06	0.000009	0.25	-1.990829	-2.111579	2.539829	-9.544723	6.760153	0.010435	0.034291	0.010435	0.031352	0.013659	0.095675	True	0 vs Rest	Rest
IGLC1	0.9528	0.0472	3.005011	2.073162e-04	0.000596	0.25	-2.049844	-2.378872	4.033913	-10.686590	12.256248	1.895654	2.242341	0.372174	0.555193	2.079543	5.583817	True	0 vs Rest	Rest
CBLL2	0.9520	0.0480	2.987364	2.197803e-06	0.000013	0.25	-1.821943	-1.919060	2.468560	-8.889462	7.225986	0.017391	0.026780	0.017391	0.026127	0.019305	0.116162	True	0 vs Rest	Rest
DES	0.9520	0.0480	2.987364	6.165525e-04	0.000327	0.25	1.998889	2.131765	4.721220	-12.210159	12.086747	6.502599	1.616600	0.613913	0.339974	8.122251	3.658348	True	0 vs Rest	Rest
HAVCR1	0.9518	0.0482	2.982996	2.777010e-06	0.000006	0.25	-1.968219	-2.368101	2.826574	-9.780298	7.036253	0.015652	0.018289	0.015652	0.016656	0.031786	0.059277	True	0 vs Rest	Rest
CKM	0.9514	0.0486	2.974311	1.391557e-04	0.000215	0.25	1.697646	1.937233	4.156939	-13.632011	10.582188	1.177394	0.671783	0.500870	0.259634	1.457761	2.155419	True	0 vs Rest	Rest
GPA33	0.9514	0.0486	2.974311	1.714058e-06	0.000007	0.25	-1.927718	-2.083632	2.597942	-8.731029	8.907793	0.006957	0.018289	0.006957	0.016982	0.011737	0.065187	True	0 vs Rest	Rest
IFNA8	0.9508	0.0492	2.961410	2.628014e-06	0.000009	0.25	-1.650592	-1.822725	2.898028	-11.695060	8.233589	0.015652	0.020248	0.015652	0.018615	0.026451	0.092085	True	0 vs Rest	Rest

	leiden	sample
AAACAGCTTTCAGAAG-1-0-0	2	A
AAACAGGGTCTATATT-1-0-0	2	A
AAACCGGGTAGGTACC-1-0-0	2	A
AAACCTCATGAAGTTG-1-0-0	2	A
AAACTTGCAAACGTAT-1-0-0	2	A
...	...	...
TTGTTAGCAAATTCGA-1-1	6	C
TTGTTCAGTGTGCTAC-1-1	8	C
TTGTTGTGTGTCAAGA-1-1	8	C
TTGTTTCACATCCAGG-1-1	1	C
TTGTTTCATTAGTCTA-1-1	5	C

	cluster	sample_A	sample_B	sample_C
AAACAGCTTTCAGAAG-1-0-0	2	1	0	0
AAACAGGGTCTATATT-1-0-0	2	1	0	0
AAACCGGGTAGGTACC-1-0-0	2	1	0	0
AAACCTCATGAAGTTG-1-0-0	2	1	0	0
AAACTTGCAAACGTAT-1-0-0	2	1	0	0
...	...	...	...	...
TTGTTAGCAAATTCGA-1-1	6	0	0	1
TTGTTCAGTGTGCTAC-1-1	8	0	0	1
TTGTTGTGTGTCAAGA-1-1	8	0	0	1
TTGTTTCACATCCAGG-1-1	1	0	0	1
TTGTTTCATTAGTCTA-1-1	5	0	0	1

	sample_A	sample_B	sample_C
cluster
0	257.0	137.0	181.0
1	164.0	30.0	282.0
2	168.0	172.0	11.0
3	87.0	73.0	177.0
4	47.0	57.0	219.0
5	29.0	23.0	270.0
6	35.0	3.0	245.0
7	17.0	20.0	229.0
8	5.0	7.0	243.0
9	59.0	3.0	107.0
10	27.0	25.0	115.0
11	88.0	22.0	3.0

	sample_A	sample_B	sample_C
cluster
0	44.695652	23.826087	31.478261
1	34.453782	6.302521	59.243697
2	47.863248	49.002849	3.133903
3	25.816024	21.661721	52.522255
4	14.551084	17.647059	67.801858
5	9.006211	7.142857	83.850932
6	12.367491	1.060071	86.572438
7	6.390977	7.518797	86.090226
8	1.960784	2.745098	95.294118
9	34.911243	1.775148	63.313609
10	16.167665	14.970060	68.862275
11	77.876106	19.469027	2.654867

	leiden	pathology
AAACAGCTTTCAGAAG-1-0-0	2	CA_PD
AAACAGGGTCTATATT-1-0-0	2	CA_PD
AAACCGGGTAGGTACC-1-0-0	2	CA_PD_&_fibrosis
AAACCTCATGAAGTTG-1-0-0	2	CA_PD_&_fibrosis
AAACTTGCAAACGTAT-1-0-0	2	CA_PD_&_fibrosis
...	...	...
TTGTTAGCAAATTCGA-1-1	6	CA_PD
TTGTTCAGTGTGCTAC-1-1	8	CA_PD
TTGTTGTGTGTCAAGA-1-1	8	CA_PD
TTGTTTCACATCCAGG-1-1	1	CA_PD
TTGTTTCATTAGTCTA-1-1	5	CA_PD

	CA_MD_to_WD	CA_PD	CA_PD_&_fibrosis	CA_PD_&_muscle	CA_fibrosis_&_lymphocyte
cluster
0	34.0	102.0	260.0	106.0	73.0
1	109.0	156.0	83.0	77.0	51.0
2	2.0	152.0	191.0	6.0	0.0
3	24.0	26.0	149.0	1.0	137.0
4	146.0	87.0	48.0	42.0	0.0
5	17.0	70.0	37.0	12.0	186.0
6	113.0	118.0	23.0	7.0	22.0
7	180.0	22.0	30.0	7.0	27.0
8	124.0	108.0	7.0	0.0	16.0
9	2.0	90.0	52.0	6.0	19.0
10	45.0	71.0	42.0	1.0	8.0
11	0.0	4.0	52.0	55.0	2.0