This document takes the cellranger count data for two samples (a healthy control and a Uveitis patient) and performs standard single cell rnaseq analysis, using the SCassist package.
# Install hdf5 library if its not already installed
#sudo apt-get install libhdf5-dev or brew install hdf5
#install.packages("hdf5r")
# Load below libraries
library(Seurat)
library(ggplot2)
library(plotly)
library(tidyverse)
library(cowplot)
library(scProportionTest)
library(rollama)
library(SCassist)
library(httr)
library(jsonlite)
library(dplyr)
# Set the input, output paths
setwd("/singlecellassistant")
datapath="exampleData"
outputpath="/singlecellassistant/exampleOutput_G/BCRUV/"
sampleinfopath="/singlecellassistant/exampleData/BCRUV/BCRUV_SampleInformation.tab"
# Load RData for example dataset
load("/singlecellassistant/exampleOutput_G/BCRUV/bcruv_G_new.RData")
# Read sample information file
sampleinformation=read.csv(sampleinfopath,sep="\t")
head(sampleinformation)
## RunID file_name
## 1 HVHL7DRXX_19022128 HVHL7DRXX_19022128_S3_L001_R1_001.fastq.gz
## 2 HVHL7DRXX_19022128 HVHL7DRXX_19022128_S3_L001_R2_001.fastq.gz
## 3 HVHL7DRXX_19022126 HVHL7DRXX_19022126_S2_L001_R1_001.fastq.gz
## 4 HVHL7DRXX_19022126 HVHL7DRXX_19022126_S2_L001_R2_001.fastq.gz
## 5 HVHL7DRXX_19022124 HVHL7DRXX_19022124_S4_L001_R1_001.fastq.gz
## 6 HVHL7DRXX_19022124 HVHL7DRXX_19022124_S4_L001_R2_001.fastq.gz
## folder_location_in_biowulf submitted_name
## 1 /data/../TotalSeq/RawData/201230_INGENS_0255_B_HVHL7DRXX/ NS3R189BTS
## 2 /data/../TotalSeq/RawData/201230_INGENS_0255_B_HVHL7DRXX/ NS3R189BTS
## 3 /data/../TotalSeq/RawData/201230_INGENS_0255_B_HVHL7DRXX/ NS3R189BTS
## 4 /data/../TotalSeq/RawData/201230_INGENS_0255_B_HVHL7DRXX/ NS3R189BTS
## 5 /data/../TotalSeq/RawData/201230_INGENS_0255_B_HVHL7DRXX/ NS3R189BTS
## 6 /data/../TotalSeq/RawData/201230_INGENS_0255_B_HVHL7DRXX/ NS3R189BTS
## project machine_type pcnt_optical_duplicates clusters read_length
## 1 Sen Uveitis NovaSeq6000_SP NA 87268107 28,91
## 2 Sen Uveitis NovaSeq6000_SP NA 87268107 28,91
## 3 Sen Uveitis NovaSeq6000_SP NA 130870111 28,91
## 4 Sen Uveitis NovaSeq6000_SP NA 130870111 28,91
## 5 Sen Uveitis NovaSeq6000_SP NA 113258165 28,91
## 6 Sen Uveitis NovaSeq6000_SP NA 113258165 28,91
## experiment_type run_date RunType SampleType ShortSubmittedName
## 1 SCRNA-Seq 12/30/20 0:00 Gene Expression HC 3R189
## 2 SCRNA-Seq 12/30/20 0:00 Gene Expression HC 3R189
## 3 SCRNA-Seq 12/30/20 0:00 Gene Expression HC 3R189
## 4 SCRNA-Seq 12/30/20 0:00 Gene Expression HC 3R189
## 5 SCRNA-Seq 12/30/20 0:00 Gene Expression HC 3R189
## 6 SCRNA-Seq 12/30/20 0:00 Gene Expression HC 3R189
## ShortNameType SampleDescription Age Sex Race SampleBatch Sample.prepared
## 1 3R189_HC Healthy Control 37 FALSE C 1 12/2/20
## 2 3R189_HC Healthy Control 37 FALSE C 1 12/2/20
## 3 3R189_HC Healthy Control 37 FALSE C 1 12/2/20
## 4 3R189_HC Healthy Control 37 FALSE C 1 12/2/20
## 5 3R189_HC Healthy Control 37 FALSE C 1 12/2/20
## 6 3R189_HC Healthy Control 37 FALSE C 1 12/2/20
## DateReceived TreatmentCategory CMV_IgG CMV_IgM EBVCA_IgG
## 1 1/4/21 Healthy negative negative positive
## 2 1/4/21 Healthy negative negative positive
## 3 1/4/21 Healthy negative negative positive
## 4 1/4/21 Healthy negative negative positive
## 5 1/4/21 Healthy negative negative positive
## 6 1/4/21 Healthy negative negative positive
# Specify samples for analysis.
samples=c("NS3R189BTS","NS7R65BBTS")
# Read the data file
NS3R189BTS <- Read10X_h5(file.path("/BCRUV/BCRUV_NS3R189BTS_filtered_feature_bc_matrix.h5"), use.names = T)
NS7R65BBTS <- Read10X_h5(file.path("/BCRUV/BCRUV_NS7R65BBTS_filtered_feature_bc_matrix.h5"), use.names = T)
# Add sample names to columnnames
colnames(NS3R189BTS$`Gene Expression`) <- paste(sapply(strsplit(colnames(NS3R189BTS$`Gene Expression`),split="-"),'[[',1L),"NS3R189BTS",sep="-")
colnames(NS7R65BBTS$`Gene Expression`) <- paste(sapply(strsplit(colnames(NS7R65BBTS$`Gene Expression`),split="-"),'[[',1L),"NS7R65BBTS",sep="-")
# Create a Seurat data object from the gex matrix
NS3R189BTS <- CreateSeuratObject(counts = NS3R189BTS[["Gene Expression"]], names.field = 2,names.delim = "\\-")
NS7R65BBTS <- CreateSeuratObject(counts = NS7R65BBTS[["Gene Expression"]], names.field = 2,names.delim = "\\-")
# Merge seurat objects
allsamples <- merge(NS3R189BTS, NS7R65BBTS, project = "UV")
allsamples
## An object of class Seurat
## 36601 features across 39090 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
## 2 layers present: counts.1, counts.2
head(allsamples)
## orig.ident nCount_RNA nFeature_RNA
## AAACCCACAAGTCATC-NS3R189BTS NS3R189BTS 2431 1431
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS 2676 1078
## AAACCCACAGCACACC-NS3R189BTS NS3R189BTS 3394 1760
## AAACCCACAGGAGGAG-NS3R189BTS NS3R189BTS 2160 1279
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS 2329 1271
## AAACCCAGTAGTTACC-NS3R189BTS NS3R189BTS 5362 2261
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS 7926 2660
## AAACCCAGTCCATACA-NS3R189BTS NS3R189BTS 2239 1283
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS 2339 1291
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS 3537 1608
table(Idents(allsamples))
##
## NS3R189BTS NS7R65BBTS
## 24790 14300
# Merge counts data
allsamples = JoinLayers(allsamples)
This part of the workflow uses SCassist_analyze_quality(), to identify potential filtering cutoff values. SCassist provides the recommendation based on the summary statistics and the quantile statistic of the data.
# Generate Percent mito for each of the cells
allsamples$percent.mito <- PercentageFeatureSet(allsamples, pattern = "^MT-")
summary(allsamples$percent.mito)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.300 3.517 4.084 5.197 86.180
head(allsamples)
## orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCCACAAGTCATC-NS3R189BTS NS3R189BTS 2431 1431 4.1135335
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS 2676 1078 0.8221226
## AAACCCACAGCACACC-NS3R189BTS NS3R189BTS 3394 1760 5.1856217
## AAACCCACAGGAGGAG-NS3R189BTS NS3R189BTS 2160 1279 6.5277778
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS 2329 1271 4.3795620
## AAACCCAGTAGTTACC-NS3R189BTS NS3R189BTS 5362 2261 3.8418501
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS 7926 2660 4.6177139
## AAACCCAGTCCATACA-NS3R189BTS NS3R189BTS 2239 1283 5.0468959
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS 2339 1291 5.3014109
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS 3537 1608 2.2900763
# Generate Percent ribo for each of the cells
allsamples$percent.ribo <- PercentageFeatureSet(allsamples, pattern = "^RP[SL]")
summary(allsamples$percent.ribo)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5956 20.4335 25.8115 26.1630 32.7232 68.7378
head(allsamples)
## orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCCACAAGTCATC-NS3R189BTS NS3R189BTS 2431 1431 4.1135335
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS 2676 1078 0.8221226
## AAACCCACAGCACACC-NS3R189BTS NS3R189BTS 3394 1760 5.1856217
## AAACCCACAGGAGGAG-NS3R189BTS NS3R189BTS 2160 1279 6.5277778
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS 2329 1271 4.3795620
## AAACCCAGTAGTTACC-NS3R189BTS NS3R189BTS 5362 2261 3.8418501
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS 7926 2660 4.6177139
## AAACCCAGTCCATACA-NS3R189BTS NS3R189BTS 2239 1283 5.0468959
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS 2339 1291 5.3014109
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS 3537 1608 2.2900763
## percent.ribo
## AAACCCACAAGTCATC-NS3R189BTS 18.510901
## AAACCCACACTGGAAG-NS3R189BTS 42.526158
## AAACCCACAGCACACC-NS3R189BTS 19.151444
## AAACCCACAGGAGGAG-NS3R189BTS 8.842593
## AAACCCACATGACGTT-NS3R189BTS 18.978102
## AAACCCAGTAGTTACC-NS3R189BTS 18.985453
## AAACCCAGTCCAAATC-NS3R189BTS 30.052990
## AAACCCAGTCCATACA-NS3R189BTS 19.830281
## AAACCCAGTGAGTTTC-NS3R189BTS 21.675930
## AAACCCAGTTCTTAGG-NS3R189BTS 23.353124
# Generate Percent hemoglobin for each of the cells
allsamples$percent.hb <- PercentageFeatureSet(allsamples, pattern = "^HB[^(P)]")
summary(allsamples$percent.hb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07317 0.00000 96.37250
head(allsamples)
## orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCCACAAGTCATC-NS3R189BTS NS3R189BTS 2431 1431 4.1135335
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS 2676 1078 0.8221226
## AAACCCACAGCACACC-NS3R189BTS NS3R189BTS 3394 1760 5.1856217
## AAACCCACAGGAGGAG-NS3R189BTS NS3R189BTS 2160 1279 6.5277778
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS 2329 1271 4.3795620
## AAACCCAGTAGTTACC-NS3R189BTS NS3R189BTS 5362 2261 3.8418501
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS 7926 2660 4.6177139
## AAACCCAGTCCATACA-NS3R189BTS NS3R189BTS 2239 1283 5.0468959
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS 2339 1291 5.3014109
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS 3537 1608 2.2900763
## percent.ribo percent.hb
## AAACCCACAAGTCATC-NS3R189BTS 18.510901 0.00000000
## AAACCCACACTGGAAG-NS3R189BTS 42.526158 0.00000000
## AAACCCACAGCACACC-NS3R189BTS 19.151444 0.00000000
## AAACCCACAGGAGGAG-NS3R189BTS 8.842593 0.00000000
## AAACCCACATGACGTT-NS3R189BTS 18.978102 0.04293688
## AAACCCAGTAGTTACC-NS3R189BTS 18.985453 0.01864976
## AAACCCAGTCCAAATC-NS3R189BTS 30.052990 0.02523341
## AAACCCAGTCCATACA-NS3R189BTS 19.830281 0.04466280
## AAACCCAGTGAGTTTC-NS3R189BTS 21.675930 0.00000000
## AAACCCAGTTCTTAGG-NS3R189BTS 23.353124 0.00000000
# Check the total number of cells in this experiment
table(allsamples$orig.ident)
##
## NS3R189BTS NS7R65BBTS
## 24790 14300
api_key_file = "/singlecellassistant/api_keys.txt"
# Analyze quality of the cells using SCassist
allsamplesquality <- SCassist_analyze_quality("allsamples", percent_mt = "percent.mito", percent_ribo = "percent.ribo", percent_hb = "percent.hb", api_key_file = api_key_file, llm_server="google")
## Based on the data summary, below are my recommendations for the quality filtering of the data:
##
## **nCount_RNA:**
##
## * **Lower Cutoff:** 1000
## * **Upper Cutoff:** 15000
##
## **Rationale:** The 5th percentile suggests a lower limit of 1283, but considering the minimum value of 500, a lower cutoff of 1000 might be more appropriate to avoid removing potentially valid cells with low counts. The 95th percentile suggests an upper limit of 9600.55, but considering the maximum value of 80702, an upper cutoff of 15000 seems more reasonable to remove extreme outliers while retaining cells with high expression.
##
## **nFeature_RNA:**
##
## * **Lower Cutoff:** 800
## * **Upper Cutoff:** 4000
##
## **Rationale:** The 5th percentile suggests a lower limit of 780, but considering the minimum value of 46, a lower cutoff of 800 might be more appropriate to avoid removing potentially valid cells with low feature counts. The 95th percentile suggests an upper limit of 3069, but considering the maximum value of 8873, an upper cutoff of 4000 seems more reasonable to remove extreme outliers while retaining cells with high feature counts.
##
## **percent.mito:**
##
## * **Upper Cutoff:** 10
##
## **Rationale:** The 95th percentile suggests an upper limit of 8.68, but considering the maximum value of 86.18, an upper cutoff of 10 seems more reasonable to remove cells with extremely high mitochondrial content while retaining cells with moderate mitochondrial expression.
##
## **percent.ribo:**
##
## * **Upper Cutoff:** 50
##
## **Rationale:** The 95th percentile suggests an upper limit of 41.89, but considering the maximum value of 68.74, an upper cutoff of 50 seems more reasonable to remove cells with extremely high ribosomal content while retaining cells with moderate ribosomal expression.
##
## **percent.hb:**
##
## * **Upper Cutoff:** 0.1
##
## **Rationale:** The 95th percentile suggests an upper limit of 0.0499, but considering the maximum value of 96.37, an upper cutoff of 0.1 seems more reasonable to remove cells with extremely high hemoglobin content while retaining cells with moderate hemoglobin expression.
##
## **Important Note:** These are just recommendations based on the provided data. It is crucial to test a range of values around these recommendations to determine the optimal cutoffs for your specific dataset and analysis goals. Visualizing the distributions of each metric can help in making informed decisions about the cutoffs.
# The SCassist recommendation is also stored in "allsamplesquality"
#cat(allsamplesquality)
Here we use the SCassist recommended QC cutoff values obtained in the previous step to filter and visualize the before and after QC data.
# Filter data based on SCassists recommendation
allsamplesgood <- subset(allsamples, subset = nCount_RNA > 1000 & nCount_RNA < 15000 & nFeature_RNA > 800 & nFeature_RNA < 4000 & percent.mito < 10 & percent.ribo < 50 & percent.hb < 0.1)
# Counts before filtering
table(allsamples$orig.ident)
##
## NS3R189BTS NS7R65BBTS
## 24790 14300
# Counts after filtering
table(allsamplesgood$orig.ident)
##
## NS3R189BTS NS7R65BBTS
## 22195 12772
# Assign sample types
allsamples$type <- plyr::mapvalues(
x = allsamples$orig.ident,
from = sampleinformation$submitted_name,
to = sampleinformation$SampleType
)
head(allsamples)
## orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCCACAAGTCATC-NS3R189BTS NS3R189BTS 2431 1431 4.1135335
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS 2676 1078 0.8221226
## AAACCCACAGCACACC-NS3R189BTS NS3R189BTS 3394 1760 5.1856217
## AAACCCACAGGAGGAG-NS3R189BTS NS3R189BTS 2160 1279 6.5277778
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS 2329 1271 4.3795620
## AAACCCAGTAGTTACC-NS3R189BTS NS3R189BTS 5362 2261 3.8418501
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS 7926 2660 4.6177139
## AAACCCAGTCCATACA-NS3R189BTS NS3R189BTS 2239 1283 5.0468959
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS 2339 1291 5.3014109
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS 3537 1608 2.2900763
## percent.ribo percent.hb type
## AAACCCACAAGTCATC-NS3R189BTS 18.510901 0.00000000 HC
## AAACCCACACTGGAAG-NS3R189BTS 42.526158 0.00000000 HC
## AAACCCACAGCACACC-NS3R189BTS 19.151444 0.00000000 HC
## AAACCCACAGGAGGAG-NS3R189BTS 8.842593 0.00000000 HC
## AAACCCACATGACGTT-NS3R189BTS 18.978102 0.04293688 HC
## AAACCCAGTAGTTACC-NS3R189BTS 18.985453 0.01864976 HC
## AAACCCAGTCCAAATC-NS3R189BTS 30.052990 0.02523341 HC
## AAACCCAGTCCATACA-NS3R189BTS 19.830281 0.04466280 HC
## AAACCCAGTGAGTTTC-NS3R189BTS 21.675930 0.00000000 HC
## AAACCCAGTTCTTAGG-NS3R189BTS 23.353124 0.00000000 HC
allsamplesgood$type <- plyr::mapvalues(
x = allsamplesgood$orig.ident,
from = sampleinformation$submitted_name,
to = sampleinformation$SampleType
)
head(allsamplesgood)
## orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCCACAAGTCATC-NS3R189BTS NS3R189BTS 2431 1431 4.1135335
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS 2676 1078 0.8221226
## AAACCCACAGCACACC-NS3R189BTS NS3R189BTS 3394 1760 5.1856217
## AAACCCACAGGAGGAG-NS3R189BTS NS3R189BTS 2160 1279 6.5277778
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS 2329 1271 4.3795620
## AAACCCAGTAGTTACC-NS3R189BTS NS3R189BTS 5362 2261 3.8418501
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS 7926 2660 4.6177139
## AAACCCAGTCCATACA-NS3R189BTS NS3R189BTS 2239 1283 5.0468959
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS 2339 1291 5.3014109
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS 3537 1608 2.2900763
## percent.ribo percent.hb type
## AAACCCACAAGTCATC-NS3R189BTS 18.510901 0.00000000 HC
## AAACCCACACTGGAAG-NS3R189BTS 42.526158 0.00000000 HC
## AAACCCACAGCACACC-NS3R189BTS 19.151444 0.00000000 HC
## AAACCCACAGGAGGAG-NS3R189BTS 8.842593 0.00000000 HC
## AAACCCACATGACGTT-NS3R189BTS 18.978102 0.04293688 HC
## AAACCCAGTAGTTACC-NS3R189BTS 18.985453 0.01864976 HC
## AAACCCAGTCCAAATC-NS3R189BTS 30.052990 0.02523341 HC
## AAACCCAGTCCATACA-NS3R189BTS 19.830281 0.04466280 HC
## AAACCCAGTGAGTTTC-NS3R189BTS 21.675930 0.00000000 HC
## AAACCCAGTTCTTAGG-NS3R189BTS 23.353124 0.00000000 HC
# View type counts
table(allsamples$type)
##
## BCR-UV HC
## 14300 24790
table(allsamplesgood$type)
##
## BCR-UV HC
## 12772 22195
# View data
head(allsamplesgood)
## orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCCACAAGTCATC-NS3R189BTS NS3R189BTS 2431 1431 4.1135335
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS 2676 1078 0.8221226
## AAACCCACAGCACACC-NS3R189BTS NS3R189BTS 3394 1760 5.1856217
## AAACCCACAGGAGGAG-NS3R189BTS NS3R189BTS 2160 1279 6.5277778
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS 2329 1271 4.3795620
## AAACCCAGTAGTTACC-NS3R189BTS NS3R189BTS 5362 2261 3.8418501
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS 7926 2660 4.6177139
## AAACCCAGTCCATACA-NS3R189BTS NS3R189BTS 2239 1283 5.0468959
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS 2339 1291 5.3014109
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS 3537 1608 2.2900763
## percent.ribo percent.hb type
## AAACCCACAAGTCATC-NS3R189BTS 18.510901 0.00000000 HC
## AAACCCACACTGGAAG-NS3R189BTS 42.526158 0.00000000 HC
## AAACCCACAGCACACC-NS3R189BTS 19.151444 0.00000000 HC
## AAACCCACAGGAGGAG-NS3R189BTS 8.842593 0.00000000 HC
## AAACCCACATGACGTT-NS3R189BTS 18.978102 0.04293688 HC
## AAACCCAGTAGTTACC-NS3R189BTS 18.985453 0.01864976 HC
## AAACCCAGTCCAAATC-NS3R189BTS 30.052990 0.02523341 HC
## AAACCCAGTCCATACA-NS3R189BTS 19.830281 0.04466280 HC
## AAACCCAGTGAGTTTC-NS3R189BTS 21.675930 0.00000000 HC
## AAACCCAGTTCTTAGG-NS3R189BTS 23.353124 0.00000000 HC
# Generate before filtering quality plots
qcbefore=VlnPlot(allsamples,features = c("nFeature_RNA", "nCount_RNA","percent.mito","percent.ribo"),ncol = 2, pt.size = 0) +
NoLegend()
qcbefore
# Save the plot in current working directory
ggsave(paste(outputpath,"before-qc-violineplot.pdf",sep=""), plot = qcbefore, width = 15, height = 20, units = "cm")
# Generate after filtering quality plots
qcafter=VlnPlot(allsamplesgood,features = c("nFeature_RNA", "nCount_RNA","percent.mito","percent.ribo"),ncol = 2, pt.size = 0) +
NoLegend()
qcafter
# Save the plot in current working directory
ggsave(paste(outputpath,"after-qc-violineplot.pdf",sep=""), plot = qcafter, width = 15, height = 20, units = "cm")
Here we ask SCassist_recommend_normalization to use the number of cells, mean gene expression, standard deviation of gene expression, library size and the coefficient of library size variation, to analyze the data properties and recommend an appropriate normalization method among “LogNormalize”, “CLR”, “RC”, “SCTransform”, along with its reasoning for the recommendation.
# Ask SCassist to recommend normalization method
normalization_recommendation<-SCassist_recommend_normalization("allsamplesgood", llm_server="google", api_key_file = api_key_file)
## ## Recommended Normalization Method: SCTransform
##
## Based on the provided characteristics of your single-cell RNA-seq dataset, **SCTransform** is the most appropriate normalization method for this dataset. Here's why:
##
## **1. Large Number of Cells:** SCTransform is designed to handle large datasets efficiently. With 34,967 cells, computational efficiency becomes a significant factor, and SCTransform excels in this regard.
##
## **2. High Variability in Gene Expression:** The high standard deviation of 6.34 in gene expression values indicates significant variability across cells. SCTransform accounts for this variability by modeling both technical and biological sources of variation, leading to more accurate normalization.
##
## **3. Moderate Library Size Variation:** The coefficient of variation of 0.58 for library sizes suggests moderate variation in sequencing depth. SCTransform effectively addresses this by regressing out library size effects during normalization.
##
## **Why SCTransform is Preferred:**
##
## * **Comprehensive Normalization:** SCTransform combines several normalization steps, including library size correction, removal of unwanted variation (e.g., cell cycle effects), and gene-specific scaling. This comprehensive approach leads to more accurate and robust results.
## * **Model-Based Approach:** SCTransform utilizes a statistical model to estimate and correct for technical biases, resulting in more reliable normalization compared to simple scaling methods.
## * **Improved Downstream Analysis:** SCTransform-normalized data often leads to better performance in downstream analyses like dimensionality reduction, clustering, and differential gene expression analysis.
##
## **Potential Alternatives:**
##
## * **LogNormalize:** This method is a simple scaling method that divides each gene count by the total number of counts in a cell and then takes the logarithm. While it is computationally efficient, it does not account for technical biases or gene-specific variability.
## * **CLR (Centered Log Ratio):** This method is based on the log ratio of gene counts to the geometric mean of all genes in a cell. It is effective for removing library size effects but does not address other sources of variation.
## * **RC (Relative Counts):** This method scales gene counts by the total number of counts in a cell. It is a simple method but does not account for technical biases or gene-specific variability.
##
## **Conclusion:**
##
## While other normalization methods might be suitable for smaller datasets or datasets with less variability, SCTransform is the most appropriate choice for this dataset due to its ability to handle large datasets, account for technical and biological variation, and improve downstream analysis.
# Normalize Data
# Below we increase memory to 1GB for SCTransform run
options(future.globals.maxSize = 2000 * 1024^2)
# SCTransform normalization based on SCassist recommendation
allsamplesgood <- SCTransform(allsamplesgood, vars.to.regress = c("percent.mito", "nFeature_RNA", "percent.ribo", "nCount_RNA"))
Explore the variable features using the LLMs abilities to glean insights in to a list of genes, to explore what drives the cell-to-cell variation
# List top 10 variable genes
top10 <- head(VariableFeatures(allsamplesgood), 10)
top10
## [1] "IGKC" "IGLC1" "IGLC2" "IGHA1" "IGLC3" "PPBP" "S100A9" "IGHM"
## [9] "GNLY" "CCL4"
# Plot variable features, label the top 10, save the plot
vfp1 <- VariableFeaturePlot(allsamplesgood)
vfp1 <- LabelPoints(plot = vfp1, points = top10, repel = TRUE)
vfp1
ggsave(paste(outputpath,"variablefeatureplot.pdf",sep=""), plot = vfp1, width = 20, height = 15, units = "cm")
Here we ask SCassist_analyze_variable_features to explore the top 30 variable features, along with the experimental design statement and provide potential insights on the underlying system being studied.
# Experimental design statement
experimental_design = "pbmcs of one healthy control and one uveitis patient"
# Ask SCassist to analyze variable features
variable_feature_analysis <- SCassist_analyze_variable_features("allsamplesgood", experimental_design = experimental_design, api_key_file = api_key_file, llm_server="google")
## The provided list of genes suggests an enrichment in pathways related to **immune response, inflammation, and leukocyte activation**.
##
## Here's a breakdown:
##
## **Immune Response & Inflammation:**
##
## * **Immunoglobulin genes (IGKC, IGLC1, IGLC2, IGHA1, IGLC3, IGHM):** These genes encode components of antibodies, crucial for adaptive immune responses. Their presence suggests activation of B cells and antibody production.
## * **Cytokines (IL1B, CCL4, CCL3, CCL4L2, CXCL8):** These genes encode signaling molecules that regulate immune cell activity and inflammation. Their presence indicates a pro-inflammatory environment.
## * **HLA genes (HLA-DRA, HLA-DRB1):** These genes encode major histocompatibility complex (MHC) proteins, crucial for antigen presentation to T cells. Their presence suggests activation of T cell responses.
## * **S100 proteins (S100A8, S100A9):** These proteins are involved in inflammation and immune cell activation. Their presence suggests a pro-inflammatory state.
## * **JCHAIN:** This gene encodes a protein essential for the assembly of multimeric immunoglobulins, further supporting antibody production.
## * **CD74:** This gene encodes a receptor for the cytokine MIF, which is involved in inflammation and immune regulation.
## * **IFIT2:** This gene encodes an interferon-induced protein involved in antiviral responses and inflammation.
##
## **Leukocyte Activation & Function:**
##
## * **GNLY:** This gene encodes a cytotoxic granule protein found in NK cells and cytotoxic T cells, suggesting activation of these cell types.
## * **PPBP:** This gene encodes a platelet factor that promotes platelet aggregation and activation.
## * **PF4:** This gene encodes a chemokine that attracts and activates platelets.
## * **GP1BB:** This gene encodes a platelet receptor involved in platelet aggregation.
## * **LYZ:** This gene encodes lysozyme, an enzyme found in neutrophils and macrophages that plays a role in bacterial killing.
## * **CST3:** This gene encodes cystatin C, a cysteine protease inhibitor involved in immune regulation.
## * **CD79A:** This gene encodes a protein involved in B cell receptor signaling, suggesting B cell activation.
##
## **Other Relevant Genes:**
##
## * **NRGN:** This gene encodes a protein involved in neuronal development, but its presence in this context could suggest potential neuro-inflammatory aspects.
## * **PTGDS:** This gene encodes a prostaglandin synthase involved in inflammation and pain.
## * **EREG:** This gene encodes a growth factor involved in wound healing and inflammation.
## * **HSPA1A:** This gene encodes a heat shock protein involved in cellular stress response.
##
## **Relevance to the Experimental Design:**
##
## The presence of these genes in PBMCs from a healthy control and a uveitis patient suggests that the patient's immune system is activated and potentially dysregulated. The specific genes involved point towards a pro-inflammatory environment with activation of both innate and adaptive immune responses. This is consistent with the known pathology of uveitis, an inflammatory disease affecting the eye.
##
## The comparison between the healthy control and the uveitis patient allows for the identification of genes that are differentially expressed in the patient, potentially providing insights into the specific mechanisms underlying the disease.
Here we perform PCA and ask SCassist_recommend_pcs to analzye the PCs to provide recommendation for the number of PCs to use for downstream analysis
# Run PCA to generate 50 pcs
allsamplesgood <- RunPCA(object = allsamplesgood, npcs=50)
# Print genes in the top 5 pcs
print(allsamplesgood[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: S100A9, LYZ, S100A8, IL1B, HLA-DRA
## Negative: RPS26, RPS27, RPS12, EEF1A1, RPL10
## PC_ 2
## Positive: IGKC, IGHA1, IGLC2, IGLC1, JCHAIN
## Negative: S100A9, S100A8, PPBP, LYZ, IL1B
## PC_ 3
## Positive: PPBP, PF4, NRGN, GP1BB, GNG11
## Negative: IGLC1, GNLY, IL1B, CXCL8, CCL3
## PC_ 4
## Positive: IGKC, RPS26, HLA-DRA, CD74, RPS27
## Negative: IGLC1, IGLC3, IGHM, PPBP, JCHAIN
## PC_ 5
## Positive: IGLC1, HLA-DRA, IGLC3, CD74, RPS26
## Negative: GNLY, NKG7, IGKC, GZMB, CCL4
Here we ask SCassist_recommend_pcs to analzye the PCs to provide recommendation for the number of PCs to use for downstream analysis
pc_recommendation=SCassist_recommend_pcs("allsamplesgood", experimental_design = experimental_design, llm_server="google", api_key_file = api_key_file)
## Based on the provided variance explained by each PC, the optimal number of PCs to use for downstream analysis is **10**.
##
## Here's the reasoning:
##
## * **Elbow point:** While there isn't a super clear "elbow" in the scree plot, there's a noticeable change in the slope around PC10. The variance explained drops significantly after PC10, indicating that the subsequent PCs capture less substantial information.
## * **Variance explained:** The first 10 PCs capture approximately 65% of the total variance. This is a good balance between capturing a significant portion of the variation while avoiding excessive complexity.
## * **Balance between complexity and interpretability:** Using 10 PCs provides a reasonable level of detail without making the analysis overly complex. Including more PCs might capture subtle variations but could lead to overfitting and difficulty in interpreting the results.
##
## Therefore, using the first 10 PCs strikes a good balance between capturing a significant portion of the variance and maintaining interpretability.
# Identify number of PCs that explains majority of variations
ElbowPlot(allsamplesgood, ndims=50, reduction = "pca")
# Visualize the genes in the first PC
VizDimLoadings(allsamplesgood, dims = 1, ncol = 1) + theme_minimal(base_size = 8)
# Visualize the genes in the second PC
VizDimLoadings(allsamplesgood, dims = 2, ncol = 1) + theme_minimal(base_size = 8)
# Set the identity column to HC vs BCR-UV
Idents(allsamplesgood) <- "orig.ident"
# Visualize the cells after pca
DimPlot(object = allsamplesgood, reduction = "pca")
# Plot heatmap with cells=500 plotting cells with extreme cells on both ends of spectrum
DimHeatmap(object = allsamplesgood, dims = 1:5, cells = 500, balanced = TRUE)
Here we ask SCassist_analyze_pcs to analzye the SCassist recommended PCs and provide insights on the gene sets identified
pc_analyzed=SCassist_analyze_pcs("allsamplesgood", num_pcs = 5, experimental_design = experimental_design, llm_server="google", api_key_file = api_key_file)
##
## **PC1 Summary:**
## The top contributing genes for PC1 are primarily associated with immune response and inflammation. Genes like S100A9, LYZ, S100A8, IL1B, HLA-DRA, CST3, CXCL8, CD74, and IFI30 are all known to be involved in the activation and recruitment of immune cells, particularly neutrophils and macrophages. Additionally, genes like KYNU, TYROBP, and FCN1 are involved in antigen presentation and immune signaling. The presence of these genes suggests that PC1 might be capturing variations in the immune response, potentially reflecting differences in the activation state of immune cells between the healthy control and the uveitis patient. This is further supported by the presence of genes like ZEB2, SPI1, and NRGN, which are involved in the regulation of immune cell differentiation and function. Therefore, PC1 likely reflects differences in the immune response and inflammatory processes between the healthy control and the uveitis patient.
##
##
## **PC2 Summary:**
## The top contributing genes for PC2 are primarily involved in immune response and inflammation, particularly those related to B cell activation and antibody production. Genes like IGKC, IGHA1, IGLC2, IGLC1, JCHAIN, IGHM, and CD79A are components of immunoglobulin molecules and B cell signaling pathways. Additionally, genes like HLA-DRA, HLA-DQA1, HLA-DRB1, and MS4A1 are involved in antigen presentation and T cell activation. The presence of genes like IL1B, CXCL8, and S100A8 suggests an inflammatory response. These findings suggest that PC2 might be capturing variations in B cell activation and antibody production, potentially reflecting differences in immune response between the healthy control and the uveitis patient. This could be related to the pathogenesis of uveitis, an inflammatory condition affecting the eye.
##
##
## **PC3 Summary:**
## The top contributing genes for PC3 include several chemokines (CCL3, CCL4, CXCL8), inflammatory mediators (IL1B, PTGS1), and genes involved in immune cell activation and differentiation (GNLY, ITGA2B, CD9, TSPAN33). These genes are strongly associated with immune responses, particularly those related to inflammation and leukocyte recruitment. The presence of genes like PPBP, PF4, and SPARC suggests potential involvement of platelet activation and coagulation. Furthermore, genes like TUBB1, CAVIN2, and TLN1 point towards cytoskeletal remodeling and cell migration. Taken together, these findings suggest that PC3 might be capturing variations related to **immune cell activation, inflammation, and leukocyte trafficking**, potentially reflecting differences in immune responses between the healthy control and the uveitis patient.
##
##
## **PC4 Summary:**
## The top contributing genes for PC4 are primarily involved in immune response and cell signaling. Genes like IGLC1, IGKC, IGHM, and JCHAIN are components of immunoglobulin production, suggesting a strong B cell signature. Other genes like GNLY, PF4, GZMB, and NKG7 are associated with cytotoxic T cell activity and NK cell function. Furthermore, genes like CCL4 and CCL5 are chemokines involved in attracting immune cells to sites of inflammation. The presence of these genes suggests that PC4 might be capturing variations in the immune cell composition and activation state, potentially reflecting differences in the immune response between the healthy control and the uveitis patient.
##
##
## **PC5 Summary:**
## The top contributing genes for PC5 are primarily involved in immune response and inflammation, particularly those related to cytotoxic T lymphocytes (CTLs) and NK cells. Genes like GNLY, NKG7, GZMB, PRF1, and GZMA are known effector molecules of CTLs and NK cells, responsible for killing target cells. Additionally, genes like CCL4, CCL5, and IFNG are chemokines and cytokines that play a role in attracting and activating immune cells. The presence of HLA genes (HLA-DRA, HLA-DRB1, HLA-DPA1, HLA-DPB1, HLA-DQA1) suggests differences in antigen presentation and immune recognition between the healthy control and uveitis patient. Therefore, PC5 likely captures variations in the immune response, potentially reflecting the activation of CTLs and NK cells, and the associated inflammatory processes, which could be relevant to the pathogenesis of uveitis.
##
##
## ## Overall Summary of PCs:
##
## The five principal components (PCs) identified in this analysis all highlight significant variations in immune response and inflammation between healthy controls and uveitis patients.
##
## **Common Themes:**
##
## * **Immune Response:** All PCs show a strong enrichment of genes involved in immune response, particularly those related to B cell activation, antibody production, T cell activation, and cytotoxic T lymphocyte (CTL) and NK cell activity.
## * **Inflammation:** Genes associated with inflammation, including chemokines, inflammatory mediators, and genes involved in leukocyte recruitment, are consistently present across all PCs.
##
## **Specific Contributions of Each PC:**
##
## * **PC1:** Focuses on the activation and recruitment of immune cells, particularly neutrophils and macrophages.
## * **PC2:** Highlights variations in B cell activation and antibody production, potentially reflecting differences in immune response related to uveitis pathogenesis.
## * **PC3:** Captures variations in immune cell activation, inflammation, and leukocyte trafficking, suggesting differences in immune responses between groups.
## * **PC4:** Shows a strong B cell signature, along with cytotoxic T cell activity and NK cell function, indicating variations in immune cell composition and activation state.
## * **PC5:** Emphasizes the activation of CTLs and NK cells, and associated inflammatory processes, potentially relevant to uveitis pathogenesis.
##
## **Overall, the analysis suggests that uveitis is associated with distinct patterns of immune response and inflammation, potentially involving different immune cell populations and activation states.** Further investigation is needed to understand the specific roles of each PC in the pathogenesis of uveitis and to identify potential therapeutic targets.
Here we ask SCassist_recommend_k to identify optimum number for k, to use with FindNeighbors function, given the number of pcs we are going to use for the downstream analysis.
# Run SCassist_recommend_k
recommended_k <- SCassist_recommend_k("allsamplesgood", num_pcs = 10, llm_server="google", api_key_file = api_key_file)
## ## Recommended K: 10-50
##
## ## Reasoning:
##
## The `k.param` value in `FindNeighbors()` determines the number of nearest neighbors considered for each cell when constructing the k-nearest neighbor graph. This graph is then used for clustering. A higher `k.param` value leads to a more connected graph, potentially merging distinct populations. Conversely, a lower value might result in overly fragmented clusters.
##
## Given your dataset size of 34,967 cells and the use of 10 principal components, a range of `k.param` values between 10 and 50 is recommended. This range allows for exploration of different levels of connectivity while considering the dimensionality of your data. Starting with a lower value like 10 might reveal finer subpopulations, while increasing the value to 50 could capture broader cell types.
# Run findneighbors, with above identified pcs and k
allsamplesgood <- FindNeighbors(allsamplesgood, dims = 1:10, k.param = 30, return.neighbor = TRUE)
Here we ask SCassist_recommend_res to analyze the number of cells, number of genes, mean expression variability and median neighbor distance, and recommend appropriate cluster resolutions.
# Run SCassist_recommend_res
recommended_res <- SCassist_recommend_res("allsamplesgood", llm_server="google", api_key_file = api_key_file)
## Based on the data characteristics, I recommend:
##
## **Recommended Resolution:** seq(0.2, 1.2, 0.1)
##
## **Reasoning:**
##
## The mean expression variability of 0.868717336264306 suggests a moderate level of heterogeneity in your dataset. This indicates that there are likely distinct cell populations, but they may not be drastically different from each other. The median neighbor distance of 2.50502443313599 in the k-nearest neighbor graph further supports this notion, as it implies a moderate level of separation between cells.
##
## Therefore, a resolution range of 0.2 to 1.2 with increments of 0.1 is recommended. This range allows for the identification of both distinct and subtle cell populations. Lower resolutions (closer to 0.2) will capture broader, more general cell types, while higher resolutions (closer to 1.2) will reveal finer distinctions within those populations. By exploring this range, you can effectively identify the optimal resolution for your specific analysis goals.
allsamplesgood <- FindNeighbors(allsamplesgood, dims = 1:10, k.param = 30)
# Perform clustering using the SCassist recommended resolution
allsamplesgood <- FindClusters(allsamplesgood, resolution = seq(0.2,1.2,0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 34967
## Number of edges: 1730396
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9408
## Number of communities: 10
## Elapsed time: 8 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 34967
## Number of edges: 1730396
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9242
## Number of communities: 12
## Elapsed time: 10 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 34967
## Number of edges: 1730396
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9123
## Number of communities: 13
## Elapsed time: 8 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 34967
## Number of edges: 1730396
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9039
## Number of communities: 15
## Elapsed time: 8 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 34967
## Number of edges: 1730396
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8956
## Number of communities: 17
## Elapsed time: 8 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 34967
## Number of edges: 1730396
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8892
## Number of communities: 17
## Elapsed time: 7 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 34967
## Number of edges: 1730396
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8819
## Number of communities: 20
## Elapsed time: 8 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 34967
## Number of edges: 1730396
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8761
## Number of communities: 21
## Elapsed time: 7 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 34967
## Number of edges: 1730396
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8707
## Number of communities: 23
## Elapsed time: 6 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 34967
## Number of edges: 1730396
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8667
## Number of communities: 25
## Elapsed time: 7 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 34967
## Number of edges: 1730396
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8610
## Number of communities: 28
## Elapsed time: 7 seconds
head(allsamplesgood)
## orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCCACAAGTCATC-NS3R189BTS NS3R189BTS 2431 1431 4.1135335
## AAACCCACACTGGAAG-NS3R189BTS NS3R189BTS 2676 1078 0.8221226
## AAACCCACAGCACACC-NS3R189BTS NS3R189BTS 3394 1760 5.1856217
## AAACCCACAGGAGGAG-NS3R189BTS NS3R189BTS 2160 1279 6.5277778
## AAACCCACATGACGTT-NS3R189BTS NS3R189BTS 2329 1271 4.3795620
## AAACCCAGTAGTTACC-NS3R189BTS NS3R189BTS 5362 2261 3.8418501
## AAACCCAGTCCAAATC-NS3R189BTS NS3R189BTS 7926 2660 4.6177139
## AAACCCAGTCCATACA-NS3R189BTS NS3R189BTS 2239 1283 5.0468959
## AAACCCAGTGAGTTTC-NS3R189BTS NS3R189BTS 2339 1291 5.3014109
## AAACCCAGTTCTTAGG-NS3R189BTS NS3R189BTS 3537 1608 2.2900763
## percent.ribo percent.hb type nCount_SCT
## AAACCCACAAGTCATC-NS3R189BTS 18.510901 0.00000000 HC 2476
## AAACCCACACTGGAAG-NS3R189BTS 42.526158 0.00000000 HC 2660
## AAACCCACAGCACACC-NS3R189BTS 19.151444 0.00000000 HC 3016
## AAACCCACAGGAGGAG-NS3R189BTS 8.842593 0.00000000 HC 2321
## AAACCCACATGACGTT-NS3R189BTS 18.978102 0.04293688 HC 2431
## AAACCCAGTAGTTACC-NS3R189BTS 18.985453 0.01864976 HC 3350
## AAACCCAGTCCAAATC-NS3R189BTS 30.052990 0.02523341 HC 2781
## AAACCCAGTCCATACA-NS3R189BTS 19.830281 0.04466280 HC 2379
## AAACCCAGTGAGTTTC-NS3R189BTS 21.675930 0.00000000 HC 2440
## AAACCCAGTTCTTAGG-NS3R189BTS 23.353124 0.00000000 HC 3036
## nFeature_SCT SCT_snn_res.0.2 SCT_snn_res.0.3
## AAACCCACAAGTCATC-NS3R189BTS 1429 0 0
## AAACCCACACTGGAAG-NS3R189BTS 1078 1 6
## AAACCCACAGCACACC-NS3R189BTS 1758 0 0
## AAACCCACAGGAGGAG-NS3R189BTS 1278 3 3
## AAACCCACATGACGTT-NS3R189BTS 1271 6 8
## AAACCCAGTAGTTACC-NS3R189BTS 2104 4 4
## AAACCCAGTCCAAATC-NS3R189BTS 1373 2 5
## AAACCCAGTCCATACA-NS3R189BTS 1283 0 0
## AAACCCAGTGAGTTTC-NS3R189BTS 1291 0 0
## AAACCCAGTTCTTAGG-NS3R189BTS 1608 2 5
## SCT_snn_res.0.4 SCT_snn_res.0.5 SCT_snn_res.0.6
## AAACCCACAAGTCATC-NS3R189BTS 0 0 0
## AAACCCACACTGGAAG-NS3R189BTS 2 2 1
## AAACCCACAGCACACC-NS3R189BTS 0 0 0
## AAACCCACAGGAGGAG-NS3R189BTS 6 4 4
## AAACCCACATGACGTT-NS3R189BTS 9 11 13
## AAACCCAGTAGTTACC-NS3R189BTS 5 6 7
## AAACCCAGTCCAAATC-NS3R189BTS 4 7 3
## AAACCCAGTCCATACA-NS3R189BTS 0 0 0
## AAACCCAGTGAGTTTC-NS3R189BTS 0 0 7
## AAACCCAGTTCTTAGG-NS3R189BTS 4 7 3
## SCT_snn_res.0.7 SCT_snn_res.0.8 SCT_snn_res.0.9
## AAACCCACAAGTCATC-NS3R189BTS 0 0 5
## AAACCCACACTGGAAG-NS3R189BTS 9 13 4
## AAACCCACAGCACACC-NS3R189BTS 0 0 5
## AAACCCACAGGAGGAG-NS3R189BTS 6 4 8
## AAACCCACATGACGTT-NS3R189BTS 13 15 15
## AAACCCAGTAGTTACC-NS3R189BTS 4 11 17
## AAACCCAGTCCAAATC-NS3R189BTS 3 3 2
## AAACCCAGTCCATACA-NS3R189BTS 0 0 0
## AAACCCAGTGAGTTTC-NS3R189BTS 5 11 0
## AAACCCAGTTCTTAGG-NS3R189BTS 3 3 2
## SCT_snn_res.1 SCT_snn_res.1.1 SCT_snn_res.1.2
## AAACCCACAAGTCATC-NS3R189BTS 7 3 3
## AAACCCACACTGGAAG-NS3R189BTS 2 4 4
## AAACCCACAGCACACC-NS3R189BTS 7 3 3
## AAACCCACAGGAGGAG-NS3R189BTS 15 15 19
## AAACCCACATGACGTT-NS3R189BTS 16 18 17
## AAACCCAGTAGTTACC-NS3R189BTS 12 20 21
## AAACCCAGTCCAAATC-NS3R189BTS 5 17 16
## AAACCCAGTCCATACA-NS3R189BTS 0 6 2
## AAACCCAGTGAGTTTC-NS3R189BTS 12 6 2
## AAACCCAGTTCTTAGG-NS3R189BTS 5 17 16
## seurat_clusters
## AAACCCACAAGTCATC-NS3R189BTS 3
## AAACCCACACTGGAAG-NS3R189BTS 4
## AAACCCACAGCACACC-NS3R189BTS 3
## AAACCCACAGGAGGAG-NS3R189BTS 19
## AAACCCACATGACGTT-NS3R189BTS 17
## AAACCCAGTAGTTACC-NS3R189BTS 21
## AAACCCAGTCCAAATC-NS3R189BTS 16
## AAACCCAGTCCATACA-NS3R189BTS 2
## AAACCCAGTGAGTTTC-NS3R189BTS 2
## AAACCCAGTTCTTAGG-NS3R189BTS 16
# Count number of clusters at each resolution
sapply(grep("res",colnames(allsamplesgood@meta.data),value = TRUE),
function(x) length(unique(allsamplesgood@meta.data[,x])))
## SCT_snn_res.0.2 SCT_snn_res.0.3 SCT_snn_res.0.4 SCT_snn_res.0.5 SCT_snn_res.0.6
## 10 12 13 15 17
## SCT_snn_res.0.7 SCT_snn_res.0.8 SCT_snn_res.0.9 SCT_snn_res.1 SCT_snn_res.1.1
## 17 20 21 23 25
## SCT_snn_res.1.2
## 28
# change default identity
Idents(allsamplesgood) <- "SCT_snn_res.0.5"
# list cell number in each cluster for HC vs UV
table(Idents(allsamplesgood),allsamplesgood$type)
##
## BCR-UV HC
## 0 58 6404
## 1 3953 433
## 2 481 3506
## 3 3029 29
## 4 1467 1333
## 5 521 2193
## 6 887 1710
## 7 17 2129
## 8 890 885
## 9 43 1308
## 10 1020 68
## 11 3 923
## 12 9 817
## 13 389 84
## 14 5 373
# Run UMAP with identified dimensions
allsamplesgood <- RunUMAP(allsamplesgood, dims = 1:10)
# Plot umap
DimPlot(object = allsamplesgood, pt.size=0.5, reduction = "umap", label = T)
# Color by HC vs BCR-UV
DimPlot(object = allsamplesgood, pt.size=0.5, reduction = "umap", group.by = "type")
# Plot umap
dimplot1<-DimPlot(object = allsamplesgood, pt.size=0.5, reduction = "umap", label = T)
# Color by HC vs BCR-UV
dimplot2<-DimPlot(object = allsamplesgood, pt.size=0.5, reduction = "umap", group.by = "type")
# Save umaps
ggsave(paste(outputpath,"umap-clusters.pdf",sep=""), plot = dimplot1, width = 20, height = 15, units = "cm")
ggsave(paste(outputpath,"umap-clusters-type.pdf",sep=""), plot = dimplot2, width = 20, height = 15, units = "cm")
# Custom list of genes to visualize
mygenes = c("NKG7","CCL5")
RidgePlot(allsamplesgood, features = mygenes)
VlnPlot(allsamplesgood, features = mygenes, pt.size=0)
FeaturePlot(allsamplesgood, features = mygenes)
DotPlot(allsamplesgood, features = mygenes)
DoHeatmap(subset(allsamplesgood), features = mygenes, size = 3)
## 7. Marker Identification Here we run findallmarkers to identify
potential markers that are unique for each of the clusters.
########################### Find all markers (about 80 minutes)
#markersall=FindAllMarkers(allsamplesgood)
head(markersall)
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## C1orf56 0 1.4630371 0.591 0.252 0 0 C1orf56
## IL1B 0 0.6066749 0.564 0.177 0 0 IL1B
## CXCL8 0 0.5038149 0.362 0.123 0 0 CXCL8
## CMSS1 0 0.4504773 0.731 0.433 0 0 CMSS1
## S100A8 0 0.2773159 0.593 0.193 0 0 S100A8
## S100A9 0 0.2416242 0.782 0.247 0 0 S100A9
# Save all markers as tab delimited file
write.table(markersall, file=paste(outputpath,"markersall.txt",sep=""),quote=F, row.names = FALSE, sep = ",")
# Plot selected markers
VlnPlot(allsamplesgood,features=c("CYP1B1","IL1B"), pt.size = 0)
FeaturePlot(allsamplesgood, features="IL1B")
# Extract the top marker for each cluster
head(markersall)
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## C1orf56 0 1.4630371 0.591 0.252 0 0 C1orf56
## IL1B 0 0.6066749 0.564 0.177 0 0 IL1B
## CXCL8 0 0.5038149 0.362 0.123 0 0 CXCL8
## CMSS1 0 0.4504773 0.731 0.433 0 0 CMSS1
## S100A8 0 0.2773159 0.593 0.193 0 0 S100A8
## S100A9 0 0.2416242 0.782 0.247 0 0 S100A9
markersall %>%
group_by(cluster) %>%
slice(1) %>%
pull(gene) -> best.gene.per.cluster
best.gene.per.cluster
## [1] "C1orf56" "GZMK" "RPS10" "SLC35F1" "FGFBP2" "RPS3A"
## [7] "TMEM176A" "LEF1" "IGHD" "C1orf56" "KLF2" "IGKC"
## [13] "IGLC1" "SEPTIN5" "IGHEP1"
# Store plot as ggplot list
vlnplot1<-VlnPlot(allsamplesgood,features=best.gene.per.cluster, pt.size = 0, combine = FALSE)
# Set properties for each plot in the list
p1 <- list()
for (i in seq_along(vlnplot1)){
#Change x and y tick label font size.
p1[[i]] = vlnplot1[[i]] +
theme(legend.position = "none",
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8),
axis.title.y = element_text(size = 6),
axis.title.x = element_blank(),
title = element_text(size=8))
}
# Plot all the plots in the list
plot_grid(plotlist = p1, ncol = 3)
# Save plot to file
vlnplot1=plot_grid(plotlist = p1, ncol = 3)
ggsave(paste(outputpath,"vlnplot-best-per-cluster.pdf",sep=""), plot = vlnplot1, width = 20, height = 15, units = "cm")
# Plot features in umap
fplot1<-FeaturePlot(allsamplesgood,features=best.gene.per.cluster, pt.size = 0, combine = FALSE)
p2 <- list()
for (i in seq_along(fplot1)){
#Change x and y label font size.
p2[[i]] = fplot1[[i]] +
theme(legend.position = "none",
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
title = element_text(size=8))
}
plot_grid(plotlist = p2, ncol = 4)
fplot1=plot_grid(plotlist = p2, ncol = 4)
ggsave(paste(outputpath,"fplot-best-per-cluster.pdf",sep=""), plot = fplot1, width = 20, height = 15, units = "cm")
Here we use SCassist_analyze_and_annotate to analyze Top Markers, Predict Cell Types, and Optionally Annotate the Seurat Object.
Idents(allsamplesgood) <- "SCT_snn_res.0.5"
# Run SCassist_analyze_and_annotate
sca_annotation <- SCassist_analyze_and_annotate(markersall, top_genes = 30, seurat_object_name = "allsamplesgood", api_key_file = api_key_file, llm_server="google")
## [1] "The output of Analyze and Annotate is saved as a tab delimited text if your current working directory. If you provided the name of the seurat object, the annotation is also added to that object in the SCassist_annotation column"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 0"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 1"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 2"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 3"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 4"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 5"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 6"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 7"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 8"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 9"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 10"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 11"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 12"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 13"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 14"
## cluster_num cell_type
## 1 0 Neutrophil
## 2 1 T cell
## 3 2 NK cells
## 4 3 T cell
## 5 4 Cytotoxic T lymphocytes
## 6 5 Ribosomal protein-rich cells
## 7 6 Macrophage
## 8 7 Endothelial cells
## 9 8 B cell
## 10 9 Macrophage
## 11 10 Erythroid progenitor cells
## 12 11 Plasma B cells
## 13 12 B cell
## 14 13 Megakaryocyte-precursor cells
## 15 14 Plasma B cells
## scassist_reasoning
## 1 The presence of markers like IL1B, CXCL8, S100A8, S100A9, and LYZ strongly suggests a neutrophil lineage. These genes are involved in inflammation, immune response, and neutrophil activation, indicating a role in innate immunity.
## 2 The presence of markers like CD3E, TRAC, TRBC2, CD2, and CXCR4 strongly suggests a T cell lineage. Additionally, the expression of genes like GZMK, CXCR3, and TNFAIP3 points towards a cytotoxic or effector T cell phenotype.
## 3 The presence of markers like GNLY, NKG7, TYROBP, LYN, and FCER1G strongly suggests an NK cell identity. These markers are known to be involved in NK cell cytotoxicity, activation, and signaling pathways. Additionally, the presence of HLA-B, HLA-DRA, and HLA-DRB1 indicates potential involvement in immune regulation and antigen presentation.
## 4 The presence of markers like TRAC, TRBC2, CD3E, and FLT3LG strongly suggests a T cell lineage. These markers are involved in T cell receptor signaling, activation, and differentiation, indicating a mature or developing T cell population.
## 5 The presence of markers like GZMB, PRF1, GNLY, GZMH, FASLG, and NKG7 strongly suggests a cytotoxic T cell phenotype. These markers are associated with the cytotoxic machinery of T cells, indicating their ability to eliminate infected or cancerous cells. Additionally, the expression of IFNG, CD244, and KLRD1 further supports this classification, as these markers are commonly found on activated cytotoxic T cells.
## 6 The high expression of numerous ribosomal protein genes (RPS3A, RPL34, RPL26, RPS14, RPL28, RPL9, RPL13, RPL11, RPLP1, RPL12, RPS12, RPL39, EEF1A1, TPT1, RPL19, RPS23, RPL35A, RPS8, RPL18A, RPL30, RPS13, RPL32, RPS28, RPL37, FTL, RPL41, RPS6, RPL29, RPS27, RPL8) strongly suggests a cell type heavily involved in protein synthesis, likely a ribosome-rich cell type.
## 7 The presence of markers like CSF1R, FCAR, C5AR2, MARCO, LILRA1, and CD33 strongly suggests a macrophage lineage. These markers are involved in phagocytosis, antigen presentation, and immune regulation, which are key functions of macrophages.
## 8 The presence of markers like LEF1, ZEB1, and VIM, along with the expression of genes involved in cell adhesion and cytoskeletal organization (ACTB, GAPDH, PFN1, LMNA), suggests an endothelial cell identity. The expression of HLA-A, HLA-B, and HLA-C indicates a potential role in immune response and antigen presentation.
## 9 The presence of markers like CD19, CD79A, PAX5, MS4A1, and CD24 strongly suggests a B cell lineage. These markers are known to be involved in B cell development, activation, and signaling. Additionally, the presence of genes like IGHG2, IGHD, and FCRL1 further supports the B cell identity, indicating involvement in antibody production and immune response.
## 10 The presence of markers like S100A8, S100A9, IL1B, LYZ, TIMP1, CXCL8, HLA-DRA, PTPRC, and CD74 strongly suggests a macrophage identity. These markers are associated with phagocytosis, inflammation, and antigen presentation, all key functions of macrophages.
## 11 The presence of ribosomal protein genes (RPS26, RPL38, RPS29, RPS27, RPL37, RPS21, EEF1B2, RPS6, RPS28, RPL30, RPL36, RPS25, RPS18, RPS13, RPL32, RPS8, RPL37A, RPS3, RPL5, RPL10A, RPL35A, RPL3, RPL36A, RPS12, RPS9, RPL22, RPL19, RPS23) suggests active protein synthesis, a hallmark of erythroid progenitor cells. These cells are responsible for producing red blood cells and require high levels of protein synthesis to support their rapid proliferation and differentiation.
## 12 The presence of immunoglobulin genes (IGKC, IGHA1, IGHA2, IGHG1, IGHG4, IGHM), JCHAIN, and MZB1 strongly suggests a B cell lineage. The expression of inflammatory markers like IL1B, S100A8, S100A9, TNFRSF17, and CXCL8 indicates an activated state, consistent with plasma B cells, which are terminally differentiated B cells responsible for antibody production.
## 13 The presence of genes associated with B cell development and function, such as IGLC1, JCHAIN, IGHA1, MZB1, IGHM, TNFRSF17, and S100A9, strongly suggests a B cell identity. Additionally, the expression of genes involved in immune response and antigen presentation, like HLA-B, HLA-C, and B2M, further supports this classification.
## 14 The presence of markers like SEPTIN5, LY6G6F, GP9, TMEM40, CLEC1B, PF4, GP1BB, ITGB3, GP1BA, and MYL9 strongly suggests a megakaryocytic lineage. These markers are associated with platelet production, megakaryocyte maturation, and cytoskeletal organization, indicating a potential role in platelet development and function.
## 15 The presence of immunoglobulin genes (IGHG1, IGHE, IGLC2, IGHA1, IGLC3, IGHA2, IGKC, IGHV3-30, IGHM), B cell markers (MZB1, JCHAIN, CD79A), and genes involved in antibody production (TXNDC5, S100A9, DERL3, IL1B, C1orf56, MALAT1, S100A8, PLAAT2, VPREB1) strongly suggests a plasma B cell identity. The expression of GNLY, TIMP1, MT-CO3, RPL27A, CD74, and CCL3 further supports this conclusion, as these genes are commonly associated with immune responses and cell signaling pathways.
allsamplesgood = sca_annotation
# create and save umap with labels
an0=DimPlot(allsamplesgood, reduction = "umap", label = TRUE, repel = TRUE, group.by = 'scassist_annotation_SCT_snn_res.0.5', split.by = "type")
an0a=an0+theme(legend.position="none")
an0a
ggsave(paste(outputpath,"umap-with-sctype.pdf",sep=""),plot=an0a, width = 50, height = 30, units = "cm")
Here we perform differential expression analysis between HC vs BCR-UV within the NK cell populations, to identify genes that might be contributing to different subpopulations.
########################### tables
# Proportion and counts of cells in clusters and groups
# Change identity to 0.3 cluster resolution
Idents(allsamplesgood) <- "scassist_annotation_SCT_snn_res.0.5"
prop.table(table(Idents(allsamplesgood)))
##
## Neutrophil NK cells
## 0.18480281 0.11402179
## Cytotoxic T lymphocytes Plasma B cells
## 0.08007550 0.03729230
## Macrophage Endothelial cells
## 0.11290645 0.06137215
## B cell Erythroid progenitor cells
## 0.07438442 0.03111505
## T cell Megakaryocyte-precursor cells
## 0.21288644 0.01352704
## Ribosomal protein-rich cells
## 0.07761604
table(Idents(allsamplesgood))
##
## Neutrophil NK cells
## 6462 3987
## Cytotoxic T lymphocytes Plasma B cells
## 2800 1304
## Macrophage Endothelial cells
## 3948 2146
## B cell Erythroid progenitor cells
## 2601 1088
## T cell Megakaryocyte-precursor cells
## 7444 473
## Ribosomal protein-rich cells
## 2714
# Current identity levels
levels(allsamplesgood)
## [1] "Neutrophil" "NK cells"
## [3] "Cytotoxic T lymphocytes" "Plasma B cells"
## [5] "Macrophage" "Endothelial cells"
## [7] "B cell" "Erythroid progenitor cells"
## [9] "T cell" "Megakaryocyte-precursor cells"
## [11] "Ribosomal protein-rich cells"
head(Idents(allsamplesgood))
## AAACCCACAAGTCATC-NS3R189BTS AAACCCACACTGGAAG-NS3R189BTS
## Neutrophil NK cells
## AAACCCACAGCACACC-NS3R189BTS AAACCCACAGGAGGAG-NS3R189BTS
## Neutrophil Cytotoxic T lymphocytes
## AAACCCACATGACGTT-NS3R189BTS AAACCCAGTAGTTACC-NS3R189BTS
## Plasma B cells Macrophage
## 11 Levels: Neutrophil NK cells Cytotoxic T lymphocytes ... Ribosomal protein-rich cells
# Take all cells in cluster 'Natural Killer cells', and find markers that separate 'HC' and 'BCR-UV' that are in the 'type' column. subset.ident would be the default identity of clusters
markersingroup <- FindMarkers(allsamplesgood, ident.1 = "HC", ident.2 = "BCR-UV", group.by = 'type', subset.ident = "NK cells")
#markersingroup <- FindMarkers(allsamplesgood, ident.1 = "HC", ident.2 = "BCR-UV", group.by = 'type', subset.ident = "2")
head(markersingroup)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## SLC35F1 0.000000e+00 -5.482623 0.018 0.572 0.000000e+00
## IFITM1 0.000000e+00 -5.254630 0.028 0.599 0.000000e+00
## RPS26 4.289374e-289 -3.971019 0.703 1.000 1.072043e-284
## ARL17B 5.752066e-224 -6.234951 0.006 0.324 1.437614e-219
## HLA-B 1.671281e-223 -1.663662 0.968 1.000 4.177031e-219
## MALAT1 1.257242e-220 -1.282043 1.000 1.000 3.142226e-216
# Save markers
write.table(markersingroup, file="markersingroup.txt", quote=F, row.names = FALSE, sep = "\t")
VlnPlot(allsamplesgood,features=c("HLA-B","IFITM1"), group.by = "type", pt.size = 0)
FeaturePlot(allsamplesgood, features=c("HLA-B"), split.by = "type")
table(allsamplesgood$type)
##
## BCR-UV HC
## 12772 22195
Here we run the pathway enrichment analysis using SCassist_analyze_enrichment on the list of differentially expressed genes from the two NK populations found in HC and BCR-UV. SCassist does not perform enrichment analysis, but takes the clusterprofiler output for KEGG pathway enrichment and tries to understand how the identified pathways could represent the system.
########################### Marker genes between groups
# Run enrichment analysis using 'human' database, with experimental_design, pvalue and log2FC cut off values
api_key_file="/singlecellassistant/api_keys.txt"
scassist_pathway_summary <- SCassist_analyze_enrichment(markers=markersingroup,experimental_design = experimental_design, pvalue=0.05, log2FC=1, api_key_file=api_key_file, llm_server = "google")
##
## The KEGG analysis is presented below.
## -------------------------------------------
##
## The ClusterProfiler KEGG pathways result is saved as a text file in the current working directory.
##
## The list of significantly enriched KEGG pathways is also saved as a text file in the current working directory.
##
## KEGG Enrichment Insights:
## ## 1. Significant Pathways:
##
## The KEGG pathway enrichment analysis reveals a significant enrichment of pathways related to cancer, immune response, and cellular processes in the PBMCs of the uveitis patient compared to the healthy control. Several pathways associated with cancer development and progression are enriched, including "Central carbon metabolism in cancer", "Choline metabolism in cancer", "PD-L1 expression and PD-1 checkpoint pathway in cancer", and various cancer-specific pathways like "Breast cancer", "Chronic myeloid leukemia", "Colorectal cancer", "Endometrial cancer", "Glioma", "Hepatocellular carcinoma", "Melanoma", "Non-small cell lung cancer", "Pancreatic cancer", "Small cell lung cancer", and "Thyroid cancer". These findings suggest a potential link between the dysregulation of these pathways and the development of uveitis.
##
## Furthermore, the enrichment of pathways related to immune response, such as "Allograft rejection", "Asthma", "Autoimmune thyroid disease", "Graft-versus-host disease", "Inflammatory bowel disease", "Rheumatoid arthritis", "Antigen processing and presentation", "B cell receptor signaling pathway", "C-type lectin receptor signaling pathway", "Cytosolic DNA-sensing pathway", "Fc gamma R-mediated phagocytosis", "Intestinal immune network for IgA production", "Leukocyte transendothelial migration", "NOD-like receptor signaling pathway", "Natural killer cell mediated cytotoxicity", "Platelet activation", "T cell receptor signaling pathway", "Th1 and Th2 cell differentiation", "Th17 cell differentiation", "Epithelial cell signaling in Helicobacter pylori infection", "Pathogenic Escherichia coli infection", "Pertussis", "Salmonella infection", "Shigellosis", "Tuberculosis", "Vibrio cholerae infection", "Yersinia infection", "Chagas disease", "Leishmaniasis", "Toxoplasmosis", "Epstein-Barr virus infection", "Hepatitis B", "Hepatitis C", "Human T-cell leukemia virus 1 infection", "Human cytomegalovirus infection", "Human immunodeficiency virus 1 infection", "Human papillomavirus infection", "Influenza A", "Kaposi sarcoma-associated herpesvirus infection", and "Viral life cycle - HIV-1", highlights the involvement of the immune system in the pathogenesis of uveitis.
##
## ## 2. Regulators:
##
## The list of enriched pathways includes several transcription factors, such as JUN, FOS, NFKBIA, TP53, and STAT1, which are known to play crucial roles in regulating gene expression and cellular processes. These transcription factors may be involved in the dysregulation of pathways associated with inflammation, immune response, and cell cycle, contributing to the development of uveitis.
##
## ## 3. Key Genes or Potential Targets:
##
## Several genes from the enriched pathways could be important targets for further investigation in the context of uveitis. For example, genes involved in the "Central carbon metabolism in cancer" pathway, such as SLC7A5, GLS, and PIK3R1, could be potential targets for therapeutic intervention. Genes involved in the "PD-L1 expression and PD-1 checkpoint pathway in cancer", such as PIK3R1, CD4, and STAT1, could be investigated for their role in immune evasion and potential therapeutic targets. Furthermore, genes involved in the "Apoptosis" pathway, such as CASP8, BAX, and CASP3, could be investigated for their role in cell death and potential therapeutic targets. Finally, genes involved in the "TNF signaling pathway", such as TNF, NFKBIA, and CHUK, could be investigated for their role in inflammation and potential therapeutic targets. These genes, along with others from the enriched pathways, warrant further investigation to understand their specific roles in the pathogenesis of uveitis and their potential as therapeutic targets.
##
## The GO analysis is provided below.
## ----------------------------------------
##
## The ClusterProfiler GO enrichment result is saved as a text file in the current working directory.
##
## The list of significantly enriched GO terms is also saved as a text file in the current working directory.
##
## GO Enrichment Insights:
## ## 1. Significant Concepts:
##
## The GO enrichment analysis reveals a strong focus on immune system processes, particularly B cell and T cell activation, differentiation, and mediated immunity. Several GO terms related to DNA metabolism and repair are also enriched, suggesting alterations in genomic stability and response to DNA damage. Additionally, a significant number of GO terms are related to cellular response to various stimuli, including biotic, abiotic, and chemical stress, indicating a broader cellular response to the uveitis condition.
##
## The enrichment of GO terms related to RNA processing, particularly splicing and methylation, suggests potential dysregulation in gene expression and protein synthesis. Furthermore, the enrichment of GO terms related to organelle organization and transport, particularly those involving the Golgi apparatus, endosome, and lysosome, suggests alterations in cellular trafficking and protein processing.
##
## ## 2. Regulators:
##
## Several transcription factors are present in the enriched GO terms, potentially playing a role in the observed changes in PBMCs. For example, **KLF2, KLF4, KLF6, and ZBTB1** are involved in regulating gene expression and cell differentiation, potentially contributing to the observed differences in immune cell populations between the healthy control and uveitis patient. **FOXP3** is a key regulator of T cell differentiation, particularly for regulatory T cells, which are crucial for immune homeostasis. **IRF1** and **IRF8** are involved in regulating interferon production and immune response to viral infections. **TP53** is a tumor suppressor gene that plays a critical role in DNA damage response and cell cycle regulation.
##
## ## 3. Key Genes or Potential Targets:
##
## Several genes from the enriched ontologies could be important to the system or a target based on their potential impact on PBMCs in uveitis. **TNFAIP3** is a negative regulator of NF-κB signaling, which is crucial for inflammatory responses. **FOXP3** is a key regulator of regulatory T cell differentiation, which are crucial for immune homeostasis. **IRF1** and **IRF8** are involved in regulating interferon production and immune response to viral infections. **TP53** is a tumor suppressor gene that plays a critical role in DNA damage response and cell cycle regulation. **ATG5** is a key autophagy gene involved in cellular clearance and response to stress. **CTLA4** is a negative regulator of T cell activation, potentially playing a role in immune suppression. **CD40** is a key receptor involved in B cell activation and antibody production. **NLRP3** is a component of the inflammasome, which is involved in inflammatory responses. **MYD88** is a key adaptor protein involved in toll-like receptor signaling, which is crucial for innate immune responses. Targeting these genes could potentially modulate the immune response and disease progression in uveitis.
##
## Overall Summary:
## ## Comprehensive Summary of KEGG and GO Enrichment Results for Uveitis PBMCs
##
## This analysis of differentially expressed genes in PBMCs from a healthy control and a uveitis patient reveals significant dysregulation in pathways and processes related to cancer, immune response, and cellular processes in the uveitis patient.
##
## **KEGG Pathway Enrichment:**
##
## * **Cancer:** Multiple pathways associated with cancer development and progression are enriched, suggesting a potential link between these pathways and uveitis. This includes pathways related to central carbon metabolism, choline metabolism, PD-L1/PD-1 checkpoint, and specific cancer types like breast, colorectal, and lung cancer.
## * **Immune Response:** A wide range of pathways related to immune response are enriched, highlighting the involvement of the immune system in uveitis pathogenesis. This includes pathways related to allograft rejection, autoimmune diseases, antigen processing and presentation, B and T cell signaling, and various infectious diseases.
##
## **GO Enrichment:**
##
## * **Immune System Processes:** The analysis reveals a strong focus on immune system processes, particularly B cell and T cell activation, differentiation, and mediated immunity.
## * **DNA Metabolism and Repair:** Several GO terms related to DNA metabolism and repair are enriched, suggesting alterations in genomic stability and response to DNA damage.
## * **Cellular Response to Stimuli:** A significant number of GO terms are related to cellular response to various stimuli, indicating a broader cellular response to the uveitis condition.
## * **RNA Processing:** Enrichment of GO terms related to RNA processing, particularly splicing and methylation, suggests potential dysregulation in gene expression and protein synthesis.
## * **Organelle Organization and Transport:** Enrichment of GO terms related to organelle organization and transport, particularly those involving the Golgi apparatus, endosome, and lysosome, suggests alterations in cellular trafficking and protein processing.
##
## **Key Regulators:**
##
## * **Transcription Factors:** Several transcription factors are identified as potential regulators of the observed changes in PBMCs. These include JUN, FOS, NFKBIA, TP53, STAT1, KLF2, KLF4, KLF6, ZBTB1, FOXP3, IRF1, and IRF8. These factors are known to play crucial roles in regulating gene expression, cell differentiation, and immune responses.
##
## **Key Genes and Potential Targets:**
##
## * **Cancer-Related Genes:** Genes involved in the "Central carbon metabolism in cancer" pathway, such as SLC7A5, GLS, and PIK3R1, and genes involved in the "PD-L1 expression and PD-1 checkpoint pathway in cancer", such as PIK3R1, CD4, and STAT1, could be potential targets for therapeutic intervention.
## * **Apoptosis-Related Genes:** Genes involved in the "Apoptosis" pathway, such as CASP8, BAX, and CASP3, could be investigated for their role in cell death and potential therapeutic targets.
## * **Inflammation-Related Genes:** Genes involved in the "TNF signaling pathway", such as TNF, NFKBIA, and CHUK, could be investigated for their role in inflammation and potential therapeutic targets.
## * **Immune-Related Genes:** Genes like TNFAIP3, FOXP3, IRF1, IRF8, ATG5, CTLA4, CD40, NLRP3, and MYD88 are identified as potential targets due to their roles in regulating immune responses, cell differentiation, and inflammation.
##
## **Overall, this analysis highlights the complex interplay between cancer, immune response, and cellular processes in the pathogenesis of uveitis. The identified pathways, regulators, and genes provide valuable insights for further investigation and potential therapeutic targets for this disease.**
# Network style summary of the enrichment analysis
scassist_pathway_summary
# Sample proportion table
# Proportion and counts of cells in individual samples
# Proportion test
# assign sample group
allsamplesgood$sample=ifelse(grepl("HC",allsamplesgood$type), "HC", ifelse(grepl("UV",allsamplesgood$type), "UV", ""))
prop.table(table(Idents(allsamplesgood), allsamplesgood$scassist_annotation_SCT_snn_res.0.5))
##
## B cell Cytotoxic T lymphocytes
## Neutrophil 0.00000000 0.00000000
## NK cells 0.00000000 0.00000000
## Cytotoxic T lymphocytes 0.00000000 0.08007550
## Plasma B cells 0.00000000 0.00000000
## Macrophage 0.00000000 0.00000000
## Endothelial cells 0.00000000 0.00000000
## B cell 0.07438442 0.00000000
## Erythroid progenitor cells 0.00000000 0.00000000
## T cell 0.00000000 0.00000000
## Megakaryocyte-precursor cells 0.00000000 0.00000000
## Ribosomal protein-rich cells 0.00000000 0.00000000
##
## Endothelial cells Erythroid progenitor cells
## Neutrophil 0.00000000 0.00000000
## NK cells 0.00000000 0.00000000
## Cytotoxic T lymphocytes 0.00000000 0.00000000
## Plasma B cells 0.00000000 0.00000000
## Macrophage 0.00000000 0.00000000
## Endothelial cells 0.06137215 0.00000000
## B cell 0.00000000 0.00000000
## Erythroid progenitor cells 0.00000000 0.03111505
## T cell 0.00000000 0.00000000
## Megakaryocyte-precursor cells 0.00000000 0.00000000
## Ribosomal protein-rich cells 0.00000000 0.00000000
##
## Macrophage Megakaryocyte-precursor cells
## Neutrophil 0.00000000 0.00000000
## NK cells 0.00000000 0.00000000
## Cytotoxic T lymphocytes 0.00000000 0.00000000
## Plasma B cells 0.00000000 0.00000000
## Macrophage 0.11290645 0.00000000
## Endothelial cells 0.00000000 0.00000000
## B cell 0.00000000 0.00000000
## Erythroid progenitor cells 0.00000000 0.00000000
## T cell 0.00000000 0.00000000
## Megakaryocyte-precursor cells 0.00000000 0.01352704
## Ribosomal protein-rich cells 0.00000000 0.00000000
##
## Neutrophil NK cells Plasma B cells
## Neutrophil 0.18480281 0.00000000 0.00000000
## NK cells 0.00000000 0.11402179 0.00000000
## Cytotoxic T lymphocytes 0.00000000 0.00000000 0.00000000
## Plasma B cells 0.00000000 0.00000000 0.03729230
## Macrophage 0.00000000 0.00000000 0.00000000
## Endothelial cells 0.00000000 0.00000000 0.00000000
## B cell 0.00000000 0.00000000 0.00000000
## Erythroid progenitor cells 0.00000000 0.00000000 0.00000000
## T cell 0.00000000 0.00000000 0.00000000
## Megakaryocyte-precursor cells 0.00000000 0.00000000 0.00000000
## Ribosomal protein-rich cells 0.00000000 0.00000000 0.00000000
##
## Ribosomal protein-rich cells T cell
## Neutrophil 0.00000000 0.00000000
## NK cells 0.00000000 0.00000000
## Cytotoxic T lymphocytes 0.00000000 0.00000000
## Plasma B cells 0.00000000 0.00000000
## Macrophage 0.00000000 0.00000000
## Endothelial cells 0.00000000 0.00000000
## B cell 0.00000000 0.00000000
## Erythroid progenitor cells 0.00000000 0.00000000
## T cell 0.00000000 0.21288644
## Megakaryocyte-precursor cells 0.00000000 0.00000000
## Ribosomal protein-rich cells 0.07761604 0.00000000
table(Idents(allsamplesgood), allsamplesgood$scassist_annotation_SCT_snn_res.0.5)
##
## B cell Cytotoxic T lymphocytes
## Neutrophil 0 0
## NK cells 0 0
## Cytotoxic T lymphocytes 0 2800
## Plasma B cells 0 0
## Macrophage 0 0
## Endothelial cells 0 0
## B cell 2601 0
## Erythroid progenitor cells 0 0
## T cell 0 0
## Megakaryocyte-precursor cells 0 0
## Ribosomal protein-rich cells 0 0
##
## Endothelial cells Erythroid progenitor cells
## Neutrophil 0 0
## NK cells 0 0
## Cytotoxic T lymphocytes 0 0
## Plasma B cells 0 0
## Macrophage 0 0
## Endothelial cells 2146 0
## B cell 0 0
## Erythroid progenitor cells 0 1088
## T cell 0 0
## Megakaryocyte-precursor cells 0 0
## Ribosomal protein-rich cells 0 0
##
## Macrophage Megakaryocyte-precursor cells
## Neutrophil 0 0
## NK cells 0 0
## Cytotoxic T lymphocytes 0 0
## Plasma B cells 0 0
## Macrophage 3948 0
## Endothelial cells 0 0
## B cell 0 0
## Erythroid progenitor cells 0 0
## T cell 0 0
## Megakaryocyte-precursor cells 0 473
## Ribosomal protein-rich cells 0 0
##
## Neutrophil NK cells Plasma B cells
## Neutrophil 6462 0 0
## NK cells 0 3987 0
## Cytotoxic T lymphocytes 0 0 0
## Plasma B cells 0 0 1304
## Macrophage 0 0 0
## Endothelial cells 0 0 0
## B cell 0 0 0
## Erythroid progenitor cells 0 0 0
## T cell 0 0 0
## Megakaryocyte-precursor cells 0 0 0
## Ribosomal protein-rich cells 0 0 0
##
## Ribosomal protein-rich cells T cell
## Neutrophil 0 0
## NK cells 0 0
## Cytotoxic T lymphocytes 0 0
## Plasma B cells 0 0
## Macrophage 0 0
## Endothelial cells 0 0
## B cell 0 0
## Erythroid progenitor cells 0 0
## T cell 0 7444
## Megakaryocyte-precursor cells 0 0
## Ribosomal protein-rich cells 2714 0
# Save tables
write.table(prop.table(table(Idents(allsamplesgood), allsamplesgood$scassist_annotation_SCT_snn_res.0.5)), file=paste(outputpath,"cluster-proportions.txt",sep=""), quote=F, sep = "\t")
write.table(table(Idents(allsamplesgood), allsamplesgood$scassist_annotation_SCT_snn_res.0.5), file=paste(outputpath,"cluster-counts.txt",sep=""), quote=F, sep = "\t")
# Run permutation test
prop_test <- sc_utils(allsamplesgood)
prop_test <- permutation_test(
prop_test, cluster_identity = "scassist_annotation_SCT_snn_res.0.5",
sample_1 = "HC", sample_2 = "BCR-UV",
sample_identity = "type",
n_permutations = 10000
)
plot_data=prop_test@results$permutation
plot_data
## Key: <clusters>
## clusters BCR-UV HC obs_log2FD
## <char> <num> <num> <num>
## 1: B cell 0.0703883495 0.076683938 -0.1235878
## 2: Cytotoxic T lymphocytes 0.1148606326 0.060058572 0.9354423
## 3: Endothelial cells 0.0013310366 0.095922505 -6.1712471
## 4: Erythroid progenitor cells 0.0798621986 0.003063753 4.7041408
## 5: Macrophage 0.0728155340 0.135976571 -0.9010399
## 6: Megakaryocyte-precursor cells 0.0304572502 0.003784636 3.0085592
## 7: NK cells 0.0376605074 0.157963505 -2.0684669
## 8: Neutrophil 0.0045411838 0.288533453 -5.9895263
## 9: Plasma B cells 0.0006263702 0.058391530 -6.5425998
## 10: Ribosomal protein-rich cells 0.0407923583 0.098806037 -1.2763003
## 11: T cell 0.5466645788 0.020815499 4.7149259
## pval FDR boot_mean_log2FD boot_CI_2.5 boot_CI_97.5
## <num> <num> <num> <num> <num>
## 1: 0.01649835 0.016498350 -0.1238388 -0.2394087 -0.01032646
## 2: 0.00009999 0.000109989 0.9348009 0.8319372 1.03505340
## 3: 0.00009999 0.000109989 -6.2105337 -6.9643176 -5.59436282
## 4: 0.00009999 0.000109989 4.7162778 4.3735695 5.08265247
## 5: 0.00009999 0.000109989 -0.9011725 -1.0030494 -0.80228322
## 6: 0.00009999 0.000109989 3.0143311 2.6851774 3.36799889
## 7: 0.00009999 0.000109989 -2.0703577 -2.2071623 -1.94068825
## 8: 0.00009999 0.000109989 -6.0002551 -6.4033202 -5.64613015
## 9: 0.00009999 0.000109989 -6.6520279 -7.9742392 -5.73078518
## 10: 0.00009999 0.000109989 -1.2777645 -1.4138475 -1.14641911
## 11: 0.00009999 0.000109989 4.7170552 4.5868278 4.85229708
FDR_threshold = 0.05
log2FD_threshold = log2(1.5)
order_clusters = TRUE
plot_data[, significance := ifelse(
FDR < FDR_threshold & abs(obs_log2FD) > log2FD_threshold,
paste("FDR <", FDR_threshold, "& abs(Log2FD) >", round(log2FD_threshold, 2)),
"n.s."
)]
plot_data[, significance := factor(significance, levels = c(
paste("FDR <", FDR_threshold, "& abs(Log2FD) >", round(log2FD_threshold, 2)),
"n.s."
))]
#library(tidyverse)
## Order the clusters by observed log2FD if requested.
if (order_clusters) {
plot_data[, clusters := fct_reorder(factor(clusters), dplyr::desc(obs_log2FD))]
}
## Plot the results.
p <- ggplot(plot_data, aes(x = clusters, y = obs_log2FD)) +
geom_pointrange(aes(ymin = boot_CI_2.5, ymax = boot_CI_97.5, color = significance)) +
theme_bw() +
geom_hline(yintercept = log2FD_threshold, lty = 2) +
geom_hline(yintercept = -log2FD_threshold, lty = 2) +
geom_hline(yintercept = 0) +
scale_color_manual(values = c("salmon", "grey")) +
coord_flip()
p
ggsave(paste(outputpath,"cluster_proportion_test_labelled.pdf",sep=""), plot=p)
save.image(file=paste(outputpath,"bcruv_G_new.RData",sep=""))
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.7.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] org.Hs.eg.db_3.19.1 AnnotationDbi_1.66.0
## [3] IRanges_2.38.1 S4Vectors_0.42.1
## [5] Biobase_2.64.0 BiocGenerics_0.50.0
## [7] jsonlite_1.8.8 httr_1.4.7
## [9] SCassist_0.1.0 rollama_0.1.0
## [11] scProportionTest_0.0.0.9000 cowplot_1.1.3
## [13] lubridate_1.9.3 forcats_1.0.0
## [15] stringr_1.5.1 dplyr_1.1.4
## [17] purrr_1.0.2 readr_2.1.5
## [19] tidyr_1.3.1 tibble_3.2.1
## [21] tidyverse_2.0.0 plotly_4.10.4
## [23] ggplot2_3.5.1 Seurat_5.0.0
## [25] SeuratObject_5.0.2 sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] fs_1.6.4 matrixStats_1.3.0
## [3] spatstat.sparse_3.1-0 enrichplot_1.24.4
## [5] RColorBrewer_1.1-3 tools_4.4.1
## [7] sctransform_0.4.1 utf8_1.2.4
## [9] R6_2.5.1 lazyeval_0.2.2
## [11] uwot_0.2.2 withr_3.0.0
## [13] gridExtra_2.3 progressr_0.14.0
## [15] textshaping_0.4.0 cli_3.6.3
## [17] spatstat.explore_3.3-2 fastDummies_1.7.4
## [19] scatterpie_0.2.4 labeling_0.4.3
## [21] sass_0.4.9 spatstat.data_3.1-2
## [23] ggridges_0.5.6 pbapply_1.7-2
## [25] systemfonts_1.1.0 yulab.utils_0.1.7
## [27] gson_0.1.0 DOSE_3.30.5
## [29] R.utils_2.12.3 parallelly_1.38.0
## [31] limma_3.60.4 rstudioapi_0.16.0
## [33] RSQLite_2.3.7 visNetwork_2.1.2
## [35] generics_0.1.3 gridGraphics_0.5-1
## [37] ica_1.0-3 spatstat.random_3.3-2
## [39] GO.db_3.19.1 Matrix_1.7-0
## [41] ggbeeswarm_0.7.2 fansi_1.0.6
## [43] abind_1.4-5 R.methodsS3_1.8.2
## [45] lifecycle_1.0.4 yaml_2.3.9
## [47] SummarizedExperiment_1.34.0 SparseArray_1.4.8
## [49] glmGamPoi_1.16.0 qvalue_2.36.0
## [51] Rtsne_0.17 grid_4.4.1
## [53] blob_1.2.4 promises_1.3.0
## [55] crayon_1.5.3 miniUI_0.1.1.1
## [57] lattice_0.22-6 KEGGREST_1.44.1
## [59] pillar_1.9.0 knitr_1.48
## [61] GenomicRanges_1.56.1 fgsea_1.30.0
## [63] future.apply_1.11.2 codetools_0.2-20
## [65] fastmatch_1.1-4 leiden_0.4.3.1
## [67] glue_1.7.0 ggfun_0.1.5
## [69] spatstat.univar_3.0-1 data.table_1.15.4
## [71] vctrs_0.6.5 png_0.1-8
## [73] treeio_1.28.0 spam_2.10-0
## [75] gtable_0.3.5 cachem_1.1.0
## [77] xfun_0.45 S4Arrays_1.4.1
## [79] mime_0.12 tidygraph_1.3.1
## [81] survival_3.6-4 statmod_1.5.0
## [83] fitdistrplus_1.2-1 ROCR_1.0-11
## [85] nlme_3.1-164 ggtree_3.12.0
## [87] bit64_4.0.5 RcppAnnoy_0.0.22
## [89] GenomeInfoDb_1.40.1 bslib_0.7.0
## [91] irlba_2.3.5.1 vipor_0.4.7
## [93] KernSmooth_2.23-24 colorspace_2.1-0
## [95] DBI_1.2.3 ggrastr_1.0.2
## [97] tidyselect_1.2.1 bit_4.0.5
## [99] compiler_4.4.1 curl_5.2.1
## [101] httr2_1.0.1 hdf5r_1.3.11
## [103] DelayedArray_0.30.1 shadowtext_0.1.4
## [105] scales_1.3.0 lmtest_0.9-40
## [107] rappdirs_0.3.3 digest_0.6.36
## [109] goftest_1.2-3 spatstat.utils_3.1-0
## [111] rmarkdown_2.27 XVector_0.44.0
## [113] htmltools_0.5.8.1 pkgconfig_2.0.3
## [115] sparseMatrixStats_1.16.0 MatrixGenerics_1.16.0
## [117] highr_0.11 fastmap_1.2.0
## [119] rlang_1.1.4 htmlwidgets_1.6.4
## [121] UCSC.utils_1.0.0 DelayedMatrixStats_1.26.0
## [123] shiny_1.8.1.1 farver_2.1.2
## [125] jquerylib_0.1.4 zoo_1.8-12
## [127] BiocParallel_1.38.0 GOSemSim_2.30.2
## [129] R.oo_1.26.0 magrittr_2.0.3
## [131] GenomeInfoDbData_1.2.12 ggplotify_0.1.2
## [133] dotCall64_1.1-1 patchwork_1.2.0
## [135] munsell_0.5.1 Rcpp_1.0.12
## [137] ape_5.8 viridis_0.6.5
## [139] reticulate_1.39.0 stringi_1.8.4
## [141] ggraph_2.2.1 zlibbioc_1.50.0
## [143] MASS_7.3-60.2 plyr_1.8.9
## [145] parallel_4.4.1 listenv_0.9.1
## [147] ggrepel_0.9.5 deldir_2.0-4
## [149] Biostrings_2.72.1 graphlayouts_1.2.0
## [151] splines_4.4.1 tensor_1.5
## [153] hms_1.1.3 igraph_2.0.3
## [155] spatstat.geom_3.3-3 RcppHNSW_0.6.0
## [157] reshape2_1.4.4 evaluate_0.24.0
## [159] BiocManager_1.30.23 tzdb_0.4.0
## [161] tweenr_2.0.3 httpuv_1.6.15
## [163] RANN_2.6.2 polyclip_1.10-6
## [165] future_1.34.0 scattermore_1.2
## [167] ggforce_0.4.2 xtable_1.8-4
## [169] RSpectra_0.16-1 tidytree_0.4.6
## [171] later_1.3.2 ragg_1.3.2
## [173] viridisLite_0.4.2 clusterProfiler_4.12.6
## [175] aplot_0.2.3 memoise_2.0.1
## [177] beeswarm_0.4.0 cluster_2.1.6
## [179] timechange_0.3.0 globals_0.16.3