Ken Furudate

import scanpy as sc

import numpy as np
import pandas as pd

import matplotlib as mpl
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib.font_manager
plt.rcParams['font.sans-serif'] = ['Arial'] + plt.rcParams['font.sans-serif']
plt.rcParams["font.size"] = 40

import os
import seaborn as sns
from pathlib import Path

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import warnings
warnings.filterwarnings('ignore')

sc.logging.print_header()

scanpy==1.8.2 anndata==0.8.0 umap==0.5.3 numpy==1.21.6 scipy==1.4.1 pandas==1.3.5 scikit-learn==1.0.2 statsmodels==0.10.2 python-igraph==0.9.11 pynndescent==0.5.7

sample_name1="A"
sample_name2="B"
sample_name3="C"
sample_name4="D"

datadir="/data/spatial/"

in_f1 = "SampleA_cluster_patho.annot_region.txt"
in_f2 = "SampleB_cluster_patho.annot_region.txt"
in_f3 = "SampleC_cluster_patho.annot_region.txt"
in_f4 = "SampleD_cluster_patho.annot_region.txt"

adata = sc.read_visium(datadir + f"{sample_name1}/",
               genome=None, 
               count_file='filtered_feature_bc_matrix.h5',
               library_id=None,
               load_images=True,
               source_image_path=datadir + f"{sample_name1}/spatial")
adata.var_names_make_unique()
adata.var_names
adata

AnnData object with n_obs × n_vars = 2899 × 17943
    obs: 'in_tissue', 'array_row', 'array_col'
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'spatial'
    obsm: 'spatial'

bdata = sc.read_visium(datadir + f"{sample_name2}/",
               genome=None, 
               count_file='filtered_feature_bc_matrix.h5',
               library_id=None,
               load_images=True,
               source_image_path=datadir + f"{sample_name2}/spatial")
bdata.var_names_make_unique()
bdata.var_names
bdata

AnnData object with n_obs × n_vars = 3000 × 17943
    obs: 'in_tissue', 'array_row', 'array_col'
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'spatial'
    obsm: 'spatial'

cdata = sc.read_visium(datadir + f"{sample_name3}/",
               genome=None, 
               count_file='filtered_feature_bc_matrix.h5',
               library_id=None,
               load_images=True,
               source_image_path=datadir + f"{sample_name3}/spatial")
cdata.var_names_make_unique()
cdata.var_names
cdata

AnnData object with n_obs × n_vars = 2872 × 17943
    obs: 'in_tissue', 'array_row', 'array_col'
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'spatial'
    obsm: 'spatial'

ddata = sc.read_visium(datadir + f"{sample_name4}/",
               genome=None, 
               count_file='filtered_feature_bc_matrix.h5',
               library_id=None,
               load_images=True,
               source_image_path=datadir + f"{sample_name4}/spatial")
ddata.var_names_make_unique()
ddata.var_names
ddata

AnnData object with n_obs × n_vars = 3234 × 17943
    obs: 'in_tissue', 'array_row', 'array_col'
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'spatial'
    obsm: 'spatial'

a_annot = pd.read_table(os.path.join(datadir, "patho_annot", in_f1), index_col="Barcode")
b_annot = pd.read_table(os.path.join(datadir, "patho_annot", in_f2), index_col="Barcode")
c_annot = pd.read_table(os.path.join(datadir, "patho_annot", in_f3), index_col="Barcode")
d_annot = pd.read_table(os.path.join(datadir, "patho_annot", in_f4), index_col="Barcode")

adata.obs['pathology'] = a_annot["patho_diag"]
adata.obs['category'] = a_annot["category"]
adata.obs['cluster'] = a_annot["graph-based"]

bdata.obs['pathology'] = b_annot["patho_diag"]
bdata.obs['category'] = b_annot["category"]
bdata.obs['cluster'] = b_annot["graph-based"]

cdata.obs['pathology'] = c_annot["patho_diag"]
cdata.obs['category'] = c_annot["category"]
cdata.obs['cluster'] = c_annot["graph-based"]

ddata.obs['pathology'] = d_annot["patho_diag"]
ddata.obs['category'] = d_annot["category"]
ddata.obs['cluster'] = d_annot["graph-based"]

adata.var['mt'] = adata.var_names.str.startswith('mt-') 
adata.var['hb'] = adata.var_names.str.contains(("^Hb.*-"))
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt','hb'], percent_top=None, log1p=False, inplace=True)

bdata.var['mt'] = bdata.var_names.str.startswith('mt-') 
bdata.var['hb'] = bdata.var_names.str.contains(("^Hb.*-"))
sc.pp.calculate_qc_metrics(bdata, qc_vars=['mt','hb'], percent_top=None, log1p=False, inplace=True)

cdata.var['mt'] = cdata.var_names.str.startswith('mt-') 
cdata.var['hb'] = cdata.var_names.str.contains(("^Hb.*-"))
sc.pp.calculate_qc_metrics(cdata, qc_vars=['mt','hb'], percent_top=None, log1p=False, inplace=True)

ddata.var['mt'] = ddata.var_names.str.startswith('mt-')
ddata.var['hb'] = ddata.var_names.str.contains(("^Hb.*-"))
sc.pp.calculate_qc_metrics(ddata, qc_vars=['mt', 'hb'], percent_top=None, log1p=False, inplace=True)

sc.pl.violin(adata=adata, 
             keys=['total_counts', 'n_genes_by_counts'],
             jitter=0.3, 
             size=1.2, 
             ylabel="UMI count",
             xlabel=""
             )

sc.pl.spatial(adata=adata, 
              color=['total_counts', 'n_genes_by_counts'],
              color_map="viridis",
              vmax=[30000, 8000],
              )

sc.pl.violin(adata=bdata, 
             keys=['total_counts', 'n_genes_by_counts'],
             jitter=0.3, 
             size=1.2, 
             ylabel="UMI count",
             xlabel=""
             )

sc.pl.spatial(adata=bdata, 
              color=['total_counts', 'n_genes_by_counts'],
              color_map="viridis",
              vmax=[30000, 8000],
              )

sc.pl.violin(adata=cdata, 
             keys=['total_counts', 'n_genes_by_counts'],
             jitter=0.3, 
             size=1.2, 
             ylabel="UMI count",
             xlabel=""
             )

sc.pl.spatial(adata=cdata, 
              color=['total_counts', 'n_genes_by_counts'],
              color_map="viridis",
              vmax=[20000, 6000],
              )

sc.pl.violin(adata=ddata, 
             keys=['total_counts', 'n_genes_by_counts'],
             jitter=0.3, 
             size=1.2, 
             ylabel="UMI count",
             xlabel=""
             )

sc.pl.spatial(adata=ddata, 
              color=['total_counts', 'n_genes_by_counts'],
              color_map="viridis",
              vmax=[20000, 6000],
              )

def draw_boxplot(input_df, x_axis, y_axis, sample):
  input_df_ = input_df[input_df.obs['category']!="uncategorized"].copy()
  rotation = 0
  order = None
  
  if x_axis == "category": 
    order =["non_tumor", "peritumor", "tumor"]

  elif x_axis == "cluster": 
    import re
    
    def atoi(text):
      return int(text) if text.isdigit() else text

    def natural_keys(text):
      return [ atoi(c) for c in re.split(r'(\d+)', text) ]
    
    order = sorted(list(set(input_df_.obs["cluster"])), key=natural_keys)
    rotation = 45
    
  elif x_axis == "pathology":
    rotation = 90
    order = None
    
  sns.set(font_scale=2, style='white')
  #plt.figure(figsize=(10, 10))
  ax = sns.boxplot(data=input_df_.obs, x=x_axis, y=y_axis, order=order)
  sns.stripplot(data=input_df_.obs, x=x_axis, y=y_axis, color='k',size=1.5, ax=ax, order=order)
  plt.xlabel('')
  plt.xticks(rotation=rotation)
  #plt.savefig(outdir +f'{DAY}_{sample}.{x_axis}.{y_axis}.{Version}.pdf', bbox_inches='tight') 
  plt.show()

sample_lst = ["A", "B", "C", "D"]
x_axis_lst = ["category", "pathology", "cluster"]
y_axis_lst = ["total_counts", "n_genes_by_counts"]

for i in range(len(sample_lst)):
  sample_ = sample_lst[i]
  if sample_ == "A":
    data_ = adata.copy()
  if sample_ == "B":
    data_ = bdata.copy()
  if sample_ == "C":
    data_ = cdata.copy()
  if sample_ == "D":
    data_ = ddata.copy()
  for j in range(len(x_axis_lst)):
    x_ax_ = x_axis_lst[j]
    for k in range(len(y_axis_lst)):
      y_ax_ = y_axis_lst[k]
      print(sample_)
      draw_boxplot(input_df=data_, x_axis=x_ax_, y_axis=y_ax_, sample=sample_)

######################## Sample A: category total_counts ########################

######################## Sample A: category n_genes_by_counts ########################

######################## Sample A: pathology total_counts ########################

######################## Sample A: pathology n_genes_by_counts ########################

######################## Sample A: cluster total_counts ########################

######################## Sample A: cluster n_genes_by_counts ########################

######################## Sample B: category total_counts ########################

######################## Sample B: category n_genes_by_counts ########################

######################## Sample B: pathology total_counts ########################

######################## Sample B: pathology n_genes_by_counts ########################