This document takes the cellranger count data for two samples (a WT and an Ifng CBS mutant mice) and performs standard single cell rnaseq analysis, using the SCassist package.
# Install hdf5 library if its not already installed
#sudo apt-get install libhdf5-dev or brew install hdf5
#install.packages("hdf5r")
# Load below libraries
library(Seurat)
library(ggplot2)
library(plotly)
library(tidyverse)
library(cowplot)
library(scProportionTest)
library(rollama)
library(SCassist)
library(httr)
library(jsonlite)
# Set the input, output paths
setwd("/singlecellassistant/exampleOutput_G/LCMV/")
#datapath="exampleData/LCMV"
outputpath="/singlecellassistant/exampleOutput_G/LCMV/"
# Load RData for example dataset
load("/singlecellassistant/exampleOutput_G/LCMV/LCMV_G.RData")
# Specify samples for analysis.
# Define LCMV Day4, CD4, CD8, NK samples
samples=c("WT","KO")
# Read the data file
WT <- Read10X_h5(file.path("/singlecellassistant/exampleData/LCMV/GSM6625298_scRNA_LCMV_Day4_CD4_CD8_NK_WT_filtered_feature_bc_matrix.h5"), use.names = T)
KO <- Read10X_h5(file.path("/singlecellassistant/exampleData/LCMV/GSM6625299_scRNA_LCMV_Day4_CD4_CD8_NK_KO_filtered_feature_bc_matrix.h5"), use.names = T)
# Add sample names to columnnames
colnames(WT$`Gene Expression`) <- paste(sapply(strsplit(colnames(WT$`Gene Expression`),split="-"),'[[',1L),"WT",sep="-")
colnames(KO$`Gene Expression`) <- paste(sapply(strsplit(colnames(KO$`Gene Expression`),split="-"),'[[',1L),"KO",sep="-")
# Create a Seurat data object from the gex matrix
WT <- CreateSeuratObject(counts = WT[["Gene Expression"]], names.field = 2,names.delim = "\\-")
KO <- CreateSeuratObject(counts = KO[["Gene Expression"]], names.field = 2,names.delim = "\\-")
# Merge seurat objects
allsamples <- merge(WT, KO, project = "LCMV")
allsamples
## An object of class Seurat
## 32285 features across 10551 samples within 1 assay
## Active assay: RNA (32285 features, 0 variable features)
## 2 layers present: counts.1, counts.2
head(allsamples)
## orig.ident nCount_RNA nFeature_RNA
## AAACCCAAGAGTTGTA-WT WT 17880 4120
## AAACCCAAGATTTGCC-WT WT 7916 2395
## AAACCCAAGTTGTAGA-WT WT 35993 5094
## AAACCCACAAGTATCC-WT WT 9059 2865
## AAACCCAGTCCAGGTC-WT WT 10948 2939
## AAACCCATCCTGGGAC-WT WT 16768 3484
## AAACGAACAGAGAGGG-WT WT 10467 3159
## AAACGAACAGCGACAA-WT WT 12996 2853
## AAACGAACAGTGTGGA-WT WT 26674 4654
## AAACGAATCTAGCCTC-WT WT 11434 3100
table(Idents(allsamples))
##
## WT KO
## 5666 4885
# Merge counts data
allsamples = JoinLayers(allsamples)
This part of the workflow uses SCassist_analyze_quality(), to identify potential filtering cutoff values. SCassist provides the recommendation based on the summary statistics and the quantile statistic of the data.
# Generate Percent mito for each of the cells
grep("^mt-",rownames(allsamples@assays$RNA$counts),value = TRUE)
## [1] "mt-Nd1" "mt-Nd2" "mt-Co1" "mt-Co2" "mt-Atp8" "mt-Atp6" "mt-Co3"
## [8] "mt-Nd3" "mt-Nd4l" "mt-Nd4" "mt-Nd5" "mt-Nd6" "mt-Cytb"
allsamples$percent.mito <- PercentageFeatureSet(allsamples, pattern = "^mt-")
summary(allsamples$percent.mito)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.819 2.425 5.053 3.310 83.600
head(allsamples)
## orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCCAAGAGTTGTA-WT WT 17880 4120 3.333333
## AAACCCAAGATTTGCC-WT WT 7916 2395 2.627590
## AAACCCAAGTTGTAGA-WT WT 35993 5094 2.008724
## AAACCCACAAGTATCC-WT WT 9059 2865 1.953858
## AAACCCAGTCCAGGTC-WT WT 10948 2939 2.831567
## AAACCCATCCTGGGAC-WT WT 16768 3484 0.942271
## AAACGAACAGAGAGGG-WT WT 10467 3159 3.668673
## AAACGAACAGCGACAA-WT WT 12996 2853 1.569714
## AAACGAACAGTGTGGA-WT WT 26674 4654 1.570818
## AAACGAATCTAGCCTC-WT WT 11434 3100 1.687948
# Generate Percent ribo for each of the cells
allsamples$percent.ribo <- PercentageFeatureSet(allsamples, pattern = "^Rp[sl]")
summary(allsamples$percent.ribo)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.002 18.156 24.731 24.894 32.868 56.275
head(allsamples)
## orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCCAAGAGTTGTA-WT WT 17880 4120 3.333333
## AAACCCAAGATTTGCC-WT WT 7916 2395 2.627590
## AAACCCAAGTTGTAGA-WT WT 35993 5094 2.008724
## AAACCCACAAGTATCC-WT WT 9059 2865 1.953858
## AAACCCAGTCCAGGTC-WT WT 10948 2939 2.831567
## AAACCCATCCTGGGAC-WT WT 16768 3484 0.942271
## AAACGAACAGAGAGGG-WT WT 10467 3159 3.668673
## AAACGAACAGCGACAA-WT WT 12996 2853 1.569714
## AAACGAACAGTGTGGA-WT WT 26674 4654 1.570818
## AAACGAATCTAGCCTC-WT WT 11434 3100 1.687948
## percent.ribo
## AAACCCAAGAGTTGTA-WT 22.80761
## AAACCCAAGATTTGCC-WT 16.28348
## AAACCCAAGTTGTAGA-WT 21.85147
## AAACCCACAAGTATCC-WT 16.81201
## AAACCCAGTCCAGGTC-WT 25.30142
## AAACCCATCCTGGGAC-WT 35.07276
## AAACGAACAGAGAGGG-WT 22.92921
## AAACGAACAGCGACAA-WT 42.22068
## AAACGAACAGTGTGGA-WT 26.22029
## AAACGAATCTAGCCTC-WT 28.44149
# Generate Percent hemoglobin for each of the cells
allsamples$percent.hb <- PercentageFeatureSet(allsamples, pattern = "^Hb[^(p)]")
summary(allsamples$percent.hb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.004744 0.005529 2.162162
head(allsamples)
## orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCCAAGAGTTGTA-WT WT 17880 4120 3.333333
## AAACCCAAGATTTGCC-WT WT 7916 2395 2.627590
## AAACCCAAGTTGTAGA-WT WT 35993 5094 2.008724
## AAACCCACAAGTATCC-WT WT 9059 2865 1.953858
## AAACCCAGTCCAGGTC-WT WT 10948 2939 2.831567
## AAACCCATCCTGGGAC-WT WT 16768 3484 0.942271
## AAACGAACAGAGAGGG-WT WT 10467 3159 3.668673
## AAACGAACAGCGACAA-WT WT 12996 2853 1.569714
## AAACGAACAGTGTGGA-WT WT 26674 4654 1.570818
## AAACGAATCTAGCCTC-WT WT 11434 3100 1.687948
## percent.ribo percent.hb
## AAACCCAAGAGTTGTA-WT 22.80761 0.011185682
## AAACCCAAGATTTGCC-WT 16.28348 0.000000000
## AAACCCAAGTTGTAGA-WT 21.85147 0.002778318
## AAACCCACAAGTATCC-WT 16.81201 0.000000000
## AAACCCAGTCCAGGTC-WT 25.30142 0.009134088
## AAACCCATCCTGGGAC-WT 35.07276 0.005963740
## AAACGAACAGAGAGGG-WT 22.92921 0.009553836
## AAACGAACAGCGACAA-WT 42.22068 0.000000000
## AAACGAACAGTGTGGA-WT 26.22029 0.000000000
## AAACGAATCTAGCCTC-WT 28.44149 0.000000000
# Check the total number of cells in this experiment
table(allsamples$orig.ident)
##
## KO WT
## 4885 5666
Analyze quality of the cells using SCassist and use the SCassist recommended QC cutoff values obtained in the previous step to filter and visualize the before and after QC data.
# Generate before filtering quality plots
qcbefore=VlnPlot(allsamples,features = c("nFeature_RNA", "nCount_RNA","percent.mito","percent.ribo"),ncol = 2, pt.size = 0) +
NoLegend()
qcbefore
# Save the plot in current working directory
ggsave(paste(outputpath,"LCMV-before-qc-violineplot.pdf",sep=""), plot = qcbefore, width = 15, height = 20, units = "cm")
api_key_file = "/singlecellassistant/api_keys.txt"
# Analyze quality of the cells using SCassist
allsamplesquality <- SCassist_analyze_quality("allsamples", percent_mt = "percent.mito", percent_ribo = "percent.ribo", percent_hb = "percent.hb", api_key_file=api_key_file, llm_server="google")
## Based on the data summary, below are my recommendations for the quality filtering of the data:
##
## **nCount_RNA:**
##
## * **Lower Cutoff:** 1500. This value is chosen to be slightly above the 5th percentile (947) to remove cells with very low counts, but still capture a significant portion of the data.
## * **Upper Cutoff:** 35000. This value is chosen to be slightly below the 95th percentile (26528) to remove cells with extremely high counts, which could indicate potential doublets or other artifacts.
##
## **nFeature_RNA:**
##
## * **Lower Cutoff:** 700. This value is chosen to be slightly above the 5th percentile (528) to remove cells with very few detected genes, but still capture a significant portion of the data.
## * **Upper Cutoff:** 5500. This value is chosen to be slightly below the 95th percentile (4793) to remove cells with an unusually high number of detected genes, which could indicate potential doublets or other artifacts.
##
## **percent.mito:**
##
## * **Upper Cutoff:** 15. This value is chosen to be significantly lower than the 95th percentile (23.01) to remove cells with a high percentage of mitochondrial reads, which could indicate cell stress or damage.
##
## **percent.ribo:**
##
## * **Upper Cutoff:** 45. This value is chosen to be slightly lower than the 95th percentile (41.97) to remove cells with a high percentage of ribosomal reads, which could indicate cell stress or damage.
##
## **percent.hb:**
##
## * **Upper Cutoff:** 0.025. This value is chosen to be slightly higher than the 95th percentile (0.019) to remove cells with a high percentage of hemoglobin reads, which could indicate contamination from blood cells.
##
## **Important Note:** These are just recommendations, and the researcher should test a range of values around these recommendations to determine the optimal cutoffs for their specific dataset. The optimal cutoffs will depend on the specific characteristics of the data and the research question being addressed.
# Filter data based on the above plot.
allsamplesgood <- subset(allsamples, subset = nCount_RNA > 1500 & nCount_RNA < 35000 & nFeature_RNA > 700 & nFeature_RNA < 5500 & percent.mito < 15 & percent.ribo < 45 & percent.hb < 0.025)
# Counts before filtering
table(allsamples$orig.ident)
##
## KO WT
## 4885 5666
# Counts after filtering
table(allsamplesgood$orig.ident)
##
## KO WT
## 3979 4576
# Generate after filtering quality plots
qcafter=VlnPlot(allsamplesgood,features = c("nFeature_RNA", "nCount_RNA","percent.mito","percent.ribo"),ncol = 2, pt.size = 0) +
NoLegend()
qcafter
# Save the plot in current working directory
ggsave(paste(outputpath,"LCMV-after-qc-violineplot.pdf",sep=""), plot = qcafter, width = 15, height = 20, units = "cm")
Here we ask SCassist_recommend_normalization to use the number of cells, mean gene expression, standard deviation of gene expression, library size and the coefficient of library size variation, to analyze the data properties and recommend an appropriate normalization method among “LogNormalize”, “CLR”, “RC”, “SCTransform”, along with its reasoning for the recommendation.
# Identify suitable normalization method
normalization_recommendation<-SCassist_recommend_normalization("allsamplesgood", api_key_file = api_key_file, llm_server="google")
## ## Recommended Normalization Method: SCTransform
##
## Based on the provided characteristics of the single-cell RNA-seq dataset, **SCTransform** is the most appropriate normalization method for this dataset. Here's why:
##
## **1. Large Number of Cells:** SCTransform is designed to handle large datasets efficiently. With 8555 cells, computational efficiency becomes a significant factor, and SCTransform excels in this regard.
##
## **2. High Variability in Gene Expression:** The high standard deviation of 14.8757353917764 in gene expression values indicates significant variability across cells. SCTransform accounts for this variability by modeling the technical noise associated with each gene and cell, leading to more accurate normalization.
##
## **3. Moderate Library Size Variation:** The coefficient of variation of 0.538648217306346 for library sizes suggests moderate variation. While not extremely high, SCTransform effectively handles library size variations by regressing out the effect of library size during normalization.
##
## **Why SCTransform is Preferred:**
##
## * **Comprehensive Normalization:** SCTransform combines normalization, variance stabilization, and feature selection into a single step, making it a powerful and efficient approach.
## * **Cell-Specific Modeling:** It models the technical noise associated with each cell and gene, leading to more accurate normalization and downstream analysis.
## * **Improved Downstream Analysis:** SCTransform has been shown to improve the performance of downstream analyses like dimensionality reduction, clustering, and differential gene expression analysis.
##
## **Potential Alternatives:**
##
## * **LogNormalize:** This method is a simple and commonly used normalization technique. However, it does not account for cell-specific variability or library size variations as effectively as SCTransform.
## * **CLR (Centered Log Ratio):** CLR is a robust normalization method that can handle zero values. However, it is not as widely used as SCTransform and may not be as effective in handling high variability in gene expression.
## * **RC (Relative Counts):** RC is a simple normalization method that divides each gene count by the total number of counts in the cell. It is not as sophisticated as SCTransform and may not be suitable for datasets with high variability.
##
## **Conclusion:**
##
## While other normalization methods are available, SCTransform is the most recommended option for this dataset due to its ability to handle large datasets, high variability in gene expression, and moderate library size variations. It provides a comprehensive and efficient approach to normalization, leading to improved downstream analysis.
# Normalize Data
# Below we increase memory to 1GB for SCTransform run
options(future.globals.maxSize = 2000 * 1024^2)
# SCTransform normalization based on SCassist recommendation
allsamplesgood <- SCTransform(allsamplesgood, vars.to.regress = c("percent.mito", "nFeature_RNA", "percent.ribo", "nCount_RNA"))
Explore the variable features using the LLMs abilities to glean insights in to a list of genes, to explore what drives the cell-to-cell variation. Here we ask SCassist_analyze_variable_features to explore the top 30 variable features, along with the experimental design statement and provide potential insights on the underlying system being studied.
# Experimental design statement
experimental_design = "NK, CD4+ and CD8+ T cells isolated from lymphocytic choriomeningitis virus infected WT and Ifng - CTCF binding site mutant mice"
# List top 10 variable genes
top10 <- head(VariableFeatures(allsamplesgood), 10)
top10
## [1] "Gzma" "Ccl5" "Hist1h2ap" "Hist1h1b" "Ccl4" "Gzmb"
## [7] "Hist1h2ae" "Tyrobp" "Klra4" "Xcl1"
# Plot variable features, label the top 10, save the plot
vfp1 <- VariableFeaturePlot(allsamplesgood)
vfp1 <- LabelPoints(plot = vfp1, points = top10, repel = TRUE)
vfp1
ggsave(paste(outputpath,"LCMV-variablefeatureplot.pdf",sep=""), plot = vfp1, width = 20, height = 15, units = "cm")
# Analyze variable features (top 30 default genes)
variable_feature_analysis <- SCassist_analyze_variable_features("allsamplesgood", experimental_design = experimental_design, api_key_file = api_key_file, llm_server = "google")
## ## Enriched Gene Ontologies and Pathways:
##
## The provided list of genes suggests enrichment in several key pathways and functions related to **immune response, cell cycle, and inflammation**.
##
## **1. Immune Response:**
##
## * **NK cell activation and cytotoxicity:** Genes like *Gzma, Gzmb, Nkg7, Klra1, Klra4, Klra7, Klra8, Klra9* are all involved in NK cell function. *Gzma* and *Gzmb* are granzyme proteins that induce apoptosis in target cells, while *Nkg7* is a receptor involved in NK cell activation. The *Klra* genes encode killer cell lectin-like receptors, which play a role in NK cell recognition and activation.
## * **Chemokine signaling:** Genes like *Ccl3, Ccl4, Ccl5, Cxcl10, Xcl1* are chemokines involved in attracting and activating immune cells, particularly T cells and NK cells.
## * **Antigen presentation:** *Tyrobp* encodes a protein involved in the signaling pathway of the Fc receptor, which plays a role in antigen presentation and immune cell activation.
## * **T cell activation and differentiation:** *Ikzf2* is a transcription factor involved in T cell development and differentiation.
##
## **2. Cell Cycle and Proliferation:**
##
## * **Cell cycle regulation:** *Mki67* is a marker of proliferating cells, while *Top2a* is involved in DNA replication and repair.
## * **Cell growth and differentiation:** *Stmn1* is a protein involved in microtubule dynamics and cell growth.
##
## **3. Inflammation and Tissue Repair:**
##
## * **Inflammation:** *S100a4, S100a6, Hmgb2* are involved in inflammation and immune response.
## * **Extracellular matrix remodeling:** *Spp1* encodes osteopontin, a protein involved in extracellular matrix remodeling and inflammation.
##
## **Relevance to Experimental Design:**
##
## The identified gene ontologies and pathways are highly relevant to the experimental design involving NK, CD4+, and CD8+ T cells isolated from lymphocytic choriomeningitis virus (LCMV) infected WT and Ifng - CTCF binding site mutant mice.
##
## * **LCMV infection:** LCMV is a virus that induces a robust immune response, involving both NK cells and T cells. The genes identified are likely involved in the immune response to LCMV infection.
## * **NK cell function:** The enrichment of genes involved in NK cell activation and cytotoxicity suggests that NK cells play a significant role in the immune response to LCMV infection.
## * **T cell function:** The presence of genes involved in T cell activation, differentiation, and chemokine signaling indicates that T cells are also actively involved in the immune response.
## * **Ifng - CTCF binding site mutant mice:** The mutation in the Ifng - CTCF binding site likely affects the expression of genes involved in the immune response, potentially altering the activation and function of NK and T cells.
##
## By analyzing the expression of these genes in WT and mutant mice, researchers can gain insights into the role of NK and T cells in the immune response to LCMV infection and how the mutation in the Ifng - CTCF binding site affects these processes.
Here we perform PCA and ask SCassist_recommend_pcs to analzye the PCs to provide recommendation for the number of PCs to use for downstream analysis.
# Run PCA to generate 50 pcs
allsamplesgood <- RunPCA(object = allsamplesgood, npcs=50)
# Print genes in the top 5 pcs
print(allsamplesgood[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: Gzma, Ccl5, Gzmb, Tyrobp, Fcer1g
## Negative: Ly6c1, Ltb, Ikzf2, Rps27, Cd3d
## PC_ 2
## Positive: Hist1h2ap, Hist1h1b, Hist1h2ae, Top2a, Mki67
## Negative: Ccl5, Gzma, Gzmb, Tyrobp, Ccl4
## PC_ 3
## Positive: Ccl5, Gzma, Cd8b1, Rps27, Rps24
## Negative: C1qa, Slc40a1, C1qc, Hmox1, Vcam1
## PC_ 4
## Positive: Ly6c2, Cd8b1, Ly6c1, Dapl1, Plac8
## Negative: Ikzf2, Tnfrsf4, Foxp3, Tnfrsf9, Izumo1r
## PC_ 5
## Positive: Ccl4, Xcl1, Ccl3, Gzmb, Spp1
## Negative: Ccl5, Gzma, Cma1, S100a6, Cenpf
Here we ask SCassist_recommend_pcs to analzye the PCs to provide recommendation for the number of PCs to use for downstream analysis. We also ask SCassist_analyze_pcs to analzye the SCassist recommended PCs and provide insights on the gene sets identified.
# Identify appropriate PCs to use
pc_recommendation=SCassist_recommend_pcs("allsamplesgood", experimental_design = experimental_design, api_key_file = api_key_file, llm_server = "google")
## Based on the provided variance explained by each PC, the optimal number of PCs to use for downstream analysis is **10**.
##
## Here's the reasoning:
##
## * **Elbow point:** While there isn't a super clear "elbow" in the scree plot, there's a noticeable change in slope around PC10. The variance explained drops significantly after PC10, indicating that the subsequent PCs capture less substantial information.
## * **Variance explained:** The first 10 PCs capture approximately 70% of the total variance (34.35% + 15.63% + 6.67% + 3.91% + 2.88% + 2.5% + 2.35% + 2.11% + 2.01% + 1.9% = 70.31%). This is a good balance between capturing most of the variation and avoiding excessive complexity.
## * **Balance between complexity and interpretability:** Using 10 PCs provides a reasonable level of detail while maintaining interpretability. Including more PCs might capture subtle variations but could lead to overfitting and make the analysis harder to understand.
##
## Therefore, using 10 PCs strikes a balance between capturing most of the variance and maintaining a manageable level of complexity for downstream analysis.
# Identify number of PCs that explains majority of variations
ElbowPlot(allsamplesgood, ndims=50, reduction = "pca")
# Visualize the genes in the first PC
VizDimLoadings(allsamplesgood, dims = 1, ncol = 1) + theme_minimal(base_size = 8)
# Visualize the genes in the second PC
VizDimLoadings(allsamplesgood, dims = 2, ncol = 1) + theme_minimal(base_size = 8)
# Set the identity column to WT vs KO
Idents(allsamplesgood) <- "orig.ident"
# Visualize the cells after pca
DimPlot(object = allsamplesgood, reduction = "pca", group.by = "orig.ident")
# Plot heatmap with cells=500 plotting cells with extreme cells on both ends of spectrum
DimHeatmap(object = allsamplesgood, dims = 1:5, cells = 500, balanced = TRUE)
# Here we are also analyzing the gene sets that make up the top 5 PCs, top_n_pc_contributing_genes = 50
pc_analyzed=SCassist_analyze_pcs("allsamplesgood", num_pcs = 5, experimental_design = experimental_design, api_key_file = api_key_file, llm_server = "google")
##
## **PC1 Summary:**
## The top contributing genes for PC1 are primarily associated with cytotoxic T cell function and inflammation. Genes like *Gzma*, *Gzmb*, *Prf1*, *Nkg7*, and *Klra* family members are all involved in granule exocytosis and killing of target cells. *Ccl3*, *Ccl4*, and *Ccl5* are chemokines that attract immune cells to sites of inflammation. *Tyrobp*, *Fcer1g*, and *Irf8* are involved in immune signaling and activation. The presence of multiple histone genes (e.g., *Hist1h2ap*, *Hist1h1b*, *Hist1h2ae*, *Hist1h3c*) suggests a potential role for changes in gene expression regulation.
##
## Taken together, these genes suggest that PC1 might be capturing variations in the cytotoxic activity and inflammatory response of CD4+ and CD8+ T cells in the context of lymphocytic choriomeningitis virus infection. The differences in gene expression between WT and Ifng-CTCF binding site mutant mice could reflect variations in the ability of these cells to control viral infection, potentially due to altered cytokine production, cytotoxic activity, or immune signaling.
##
##
## **PC2 Summary:**
## The top contributing genes for PC2 are primarily involved in cell cycle regulation, proliferation, and immune response. Genes like *Mki67*, *Ccna2*, *Cdk1*, *Cenpf*, *Kif11*, and *Tuba1b* are known to be crucial for cell division and proliferation. *Gzma*, *Gzmb*, *Klra8*, *Ccl5*, and *Ccl4* are associated with cytotoxic T cell activity and cytokine production, indicating an immune response. The presence of histone genes like *Hist1h2ap*, *Hist1h1b*, *Hist1h2ae*, and *Hist1h3c* suggests active transcription and chromatin remodeling.
##
## Therefore, PC2 likely captures variations in the cellular state related to T cell activation, proliferation, and cytotoxic activity. This could be driven by differences in the immune response to lymphocytic choriomeningitis virus infection between WT and Ifng - CTCF binding site mutant mice.
##
##
## **PC3 Summary:**
## The top contributing genes for PC3 are primarily involved in immune response, inflammation, and macrophage activation. Genes like C1qa, C1qc, C1qb, and Cfb are components of the complement system, which plays a crucial role in innate immunity and inflammation. Other genes like Hmox1, Vcam1, and Cd5l are involved in macrophage activation and recruitment. Additionally, genes like Ifitm3, Ifi207, and Ccr3 are associated with antiviral responses. These genes suggest that PC3 might be capturing variations in the immune response to lymphocytic choriomeningitis virus (LCMV) infection, particularly the activation and recruitment of macrophages and the induction of antiviral responses. The presence of genes like Apoe and Lpl, involved in lipid metabolism, might also indicate a potential role of lipid metabolism in the immune response to LCMV infection.
##
##
## **PC4 Summary:**
## The top contributing genes for PC4 include several key players in T cell activation, differentiation, and function. Notably, we see genes like **Ikzf2 (Helios)**, a transcription factor associated with regulatory T cell (Treg) development, **Foxp3**, the master regulator of Treg function, and **Ctla4**, a negative regulator of T cell activation. Additionally, genes like **Cd8a**, **Cd8b1**, and **Gzmb** point towards cytotoxic T cell (CTL) activity. The presence of **Tnfrsf4 (OX40)** and **Tnfrsf9 (4-1BB)**, both costimulatory molecules involved in T cell activation and survival, further supports this notion.
##
## Taken together, these genes suggest that PC4 might be capturing variations in the balance between Treg and CTL populations, potentially reflecting differences in immune responses to lymphocytic choriomeningitis virus (LCMV) infection between WT and Ifng-CTCF binding site mutant mice. This could be driven by altered T cell differentiation pathways, changes in T cell activation and suppression, or variations in the expression of effector molecules like granzyme B.
##
##
## **PC5 Summary:**
## The top contributing genes for PC5 are primarily associated with immune response and inflammation. Genes like *Ccl4, Xcl1, Ccl3, Ccl5, Cxcl10, Ifng, Gzmb, Gzma, Tnfrsf9* are chemokines and cytokines involved in attracting and activating immune cells, particularly T cells and NK cells. *Egr1, Egr3, Nr4a1, Nr4a3, Nr4a2* are transcription factors that regulate the expression of genes involved in immune responses. *Irf8, Isg15, Ifi204, Ifitm3* are interferon-stimulated genes involved in antiviral responses. *Cd69, Cd7, Cd160* are cell surface receptors involved in T cell activation and differentiation. *Prf1, Vim, Lgals1, S100a4, S100a6, S100a10* are involved in cell adhesion, migration, and cytotoxicity. These genes suggest that PC5 might be capturing variations in the activation, differentiation, and effector functions of T cells and NK cells during lymphocytic choriomeningitis virus infection. The presence of genes involved in both innate and adaptive immune responses suggests that PC5 might be reflecting the interplay between these two arms of the immune system in controlling viral infection.
##
##
## ## Overall Summary of Principal Components (PCs)
##
## The five principal components (PCs) identified in this study capture distinct aspects of the immune response to lymphocytic choriomeningitis virus (LCMV) infection, particularly in the context of differences between wild-type (WT) and Ifng-CTCF binding site mutant mice.
##
## **PC1:** Primarily reflects variations in **cytotoxic T cell function and inflammation**. This component highlights genes involved in granule exocytosis, target cell killing, chemokine production, and immune signaling. Differences in PC1 scores between WT and mutant mice could indicate variations in their ability to control viral infection through altered cytokine production, cytotoxic activity, or immune signaling.
##
## **PC2:** Captures variations in **T cell activation, proliferation, and cytotoxic activity**. This component includes genes crucial for cell division, proliferation, and immune response, suggesting differences in the cellular state of T cells between WT and mutant mice.
##
## **PC3:** Focuses on **immune response, inflammation, and macrophage activation**. This component highlights genes involved in the complement system, macrophage activation and recruitment, and antiviral responses. Differences in PC3 scores could reflect variations in the activation and recruitment of macrophages and the induction of antiviral responses during LCMV infection.
##
## **PC4:** Reflects variations in the **balance between regulatory T cells (Tregs) and cytotoxic T cells (CTLs)**. This component includes genes associated with Treg development and function, as well as CTL activity. Differences in PC4 scores could indicate altered T cell differentiation pathways, changes in T cell activation and suppression, or variations in the expression of effector molecules like granzyme B.
##
## **PC5:** Captures variations in the **activation, differentiation, and effector functions of T cells and NK cells**. This component includes genes involved in chemokine and cytokine production, T cell activation and differentiation, and antiviral responses. The presence of genes involved in both innate and adaptive immune responses suggests that PC5 might be reflecting the interplay between these two arms of the immune system in controlling viral infection.
##
## **Overall, these PCs provide a comprehensive view of the complex immune response to LCMV infection, highlighting the distinct roles of different immune cell populations and pathways in controlling viral infection. The differences observed between WT and Ifng-CTCF binding site mutant mice suggest that the CTCF binding site in the Ifng gene plays a crucial role in regulating these immune responses.**
Here we ask SCassist_recommend_k to identify optimum number for k, to use with FindNeighbors function, given the number of pcs we are going to use for the downstream analysis.
# Run SCassist_recommend_k to identify data based K
recommended_k <- SCassist_recommend_k("allsamplesgood", num_pcs = 10, api_key_file = api_key_file, llm_server = "google", experimental_design = experimental_design)
## ## Recommended K: 15-30
##
## ## Reasoning:
##
## The `k.param` value in `FindNeighbors()` determines the number of nearest neighbors considered for each cell when constructing the k-nearest neighbor graph. A higher `k.param` value leads to a more connected graph, potentially capturing broader relationships between cells. However, too high a value can blur distinct cell populations.
##
## Given your dataset of approximately 8555 cells and the use of 10 principal components, a range of `k.param` values between 15 and 30 is recommended. This range allows for a balance between capturing local relationships within clusters while still considering broader connections between different cell types. Starting with a lower value like 15 and gradually increasing to 30 can help identify the optimal `k.param` for your specific data and clustering goals.
# Run findneighbors
allsamplesgood <- FindNeighbors(allsamplesgood, dims = 1:10, k.param = 15, return.neighbor = TRUE)
Here we ask SCassist_recommend_res to analyze the number of cells, number of genes, mean expression variability and median neighbor distance, and recommend appropriate cluster resolutions.
# Run SCassist_recommend_res
recommended_res <- SCassist_recommend_res("allsamplesgood", api_key_file = api_key_file, llm_server = "google")
## Based on the data characteristics, I recommend the following resolution range:
##
## **Recommended Resolution:** seq(0.4, 1.2, 0.1)
##
## **Reasoning:**
##
## The mean expression variability of 8.66843315573589 suggests a moderate level of heterogeneity in your dataset. This indicates that there might be subtle differences in gene expression between cell populations. The median neighbor distance of 4.83575773239136 in the k-nearest neighbor graph further supports this notion, as it implies a relatively dense and interconnected cell landscape.
##
## Therefore, a resolution range of seq(0.4, 1.2, 0.1) is recommended. This range allows for the identification of both distinct and subtle cell populations. Lower resolutions (0.4-0.7) will capture broader, more general cell types, while higher resolutions (0.8-1.2) will enable the identification of finer, more specific subpopulations. By exploring this range, you can effectively delineate the cellular landscape of your dataset and uncover potentially interesting biological insights.
allsamplesgood <- FindNeighbors(allsamplesgood, dims = 1:10, k.param = 15)
# Perform clustering using the above identified resolution.
allsamplesgood <- FindClusters(allsamplesgood, resolution = seq(0.4,1.2,0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8555
## Number of edges: 241353
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8999
## Number of communities: 10
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8555
## Number of edges: 241353
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8877
## Number of communities: 12
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8555
## Number of edges: 241353
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8781
## Number of communities: 13
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8555
## Number of edges: 241353
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8688
## Number of communities: 14
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8555
## Number of edges: 241353
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8598
## Number of communities: 14
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8555
## Number of edges: 241353
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8519
## Number of communities: 17
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8555
## Number of edges: 241353
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8441
## Number of communities: 21
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8555
## Number of edges: 241353
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8370
## Number of communities: 21
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8555
## Number of edges: 241353
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8317
## Number of communities: 23
## Elapsed time: 0 seconds
head(allsamplesgood)
## orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCCAAGAGTTGTA-WT WT 17880 4120 3.333333
## AAACCCAAGATTTGCC-WT WT 7916 2395 2.627590
## AAACCCACAAGTATCC-WT WT 9059 2865 1.953858
## AAACCCAGTCCAGGTC-WT WT 10948 2939 2.831567
## AAACCCATCCTGGGAC-WT WT 16768 3484 0.942271
## AAACGAACAGAGAGGG-WT WT 10467 3159 3.668673
## AAACGAACAGCGACAA-WT WT 12996 2853 1.569714
## AAACGAACAGTGTGGA-WT WT 26674 4654 1.570818
## AAACGAATCTAGCCTC-WT WT 11434 3100 1.687948
## AAACGCTAGCGTTCCG-WT WT 28638 4519 3.579161
## percent.ribo percent.hb nCount_SCT nFeature_SCT
## AAACCCAAGAGTTGTA-WT 22.80761 0.011185682 10914 3831
## AAACCCAAGATTTGCC-WT 16.28348 0.000000000 9352 2394
## AAACCCACAAGTATCC-WT 16.81201 0.000000000 9513 2863
## AAACCCAGTCCAGGTC-WT 25.30142 0.009134088 10388 2936
## AAACCCATCCTGGGAC-WT 35.07276 0.005963740 10691 3307
## AAACGAACAGAGAGGG-WT 22.92921 0.009553836 10239 3158
## AAACGAACAGCGACAA-WT 42.22068 0.000000000 10582 2853
## AAACGAACAGTGTGGA-WT 26.22029 0.000000000 10379 2877
## AAACGAATCTAGCCTC-WT 28.44149 0.000000000 10519 3100
## AAACGCTAGCGTTCCG-WT 43.16642 0.000000000 9760 2387
## SCT_snn_res.0.4 SCT_snn_res.0.5 SCT_snn_res.0.6
## AAACCCAAGAGTTGTA-WT 5 4 2
## AAACCCAAGATTTGCC-WT 4 6 5
## AAACCCACAAGTATCC-WT 5 4 2
## AAACCCAGTCCAGGTC-WT 3 5 1
## AAACCCATCCTGGGAC-WT 0 2 7
## AAACGAACAGAGAGGG-WT 7 7 8
## AAACGAACAGCGACAA-WT 2 1 0
## AAACGAACAGTGTGGA-WT 8 8 9
## AAACGAATCTAGCCTC-WT 3 5 1
## AAACGCTAGCGTTCCG-WT 0 2 6
## SCT_snn_res.0.7 SCT_snn_res.0.8 SCT_snn_res.0.9
## AAACCCAAGAGTTGTA-WT 5 3 8
## AAACCCAAGATTTGCC-WT 4 5 4
## AAACCCACAAGTATCC-WT 13 13 14
## AAACCCAGTCCAGGTC-WT 1 1 3
## AAACCCATCCTGGGAC-WT 6 7 6
## AAACGAACAGAGAGGG-WT 7 8 7
## AAACGAACAGCGACAA-WT 2 0 2
## AAACGAACAGTGTGGA-WT 9 9 10
## AAACGAATCTAGCCTC-WT 1 1 3
## AAACGCTAGCGTTCCG-WT 6 6 5
## SCT_snn_res.1 SCT_snn_res.1.1 SCT_snn_res.1.2
## AAACCCAAGAGTTGTA-WT 6 6 9
## AAACCCAAGATTTGCC-WT 5 9 6
## AAACCCACAAGTATCC-WT 17 18 18
## AAACCCAGTCCAGGTC-WT 12 13 13
## AAACCCATCCTGGGAC-WT 4 5 3
## AAACGAACAGAGAGGG-WT 8 7 8
## AAACGAACAGCGACAA-WT 1 1 19
## AAACGAACAGTGTGGA-WT 9 8 11
## AAACGAATCTAGCCTC-WT 3 3 4
## AAACGCTAGCGTTCCG-WT 7 4 5
## seurat_clusters
## AAACCCAAGAGTTGTA-WT 9
## AAACCCAAGATTTGCC-WT 6
## AAACCCACAAGTATCC-WT 18
## AAACCCAGTCCAGGTC-WT 13
## AAACCCATCCTGGGAC-WT 3
## AAACGAACAGAGAGGG-WT 8
## AAACGAACAGCGACAA-WT 19
## AAACGAACAGTGTGGA-WT 11
## AAACGAATCTAGCCTC-WT 4
## AAACGCTAGCGTTCCG-WT 5
# Count number of clusters at each resolution
sapply(grep("res",colnames(allsamplesgood@meta.data),value = TRUE),
function(x) length(unique(allsamplesgood@meta.data[,x])))
## SCT_snn_res.0.4 SCT_snn_res.0.5 SCT_snn_res.0.6 SCT_snn_res.0.7 SCT_snn_res.0.8
## 10 12 13 14 14
## SCT_snn_res.0.9 SCT_snn_res.1 SCT_snn_res.1.1 SCT_snn_res.1.2
## 17 21 21 23
# change default identity
Idents(allsamplesgood) <- "SCT_snn_res.1.1"
# list cell number in each cluster for HC vs UV
table(Idents(allsamplesgood),allsamplesgood$orig.ident)
##
## KO WT
## 0 505 400
## 1 438 417
## 2 531 280
## 3 128 618
## 4 478 209
## 5 277 399
## 6 232 375
## 7 303 154
## 8 135 317
## 9 197 162
## 10 111 231
## 11 149 183
## 12 95 187
## 13 17 209
## 14 69 95
## 15 110 51
## 16 58 101
## 17 70 56
## 18 36 62
## 19 30 53
## 20 10 17
set.seed(20)
# Run UMAP with identified dimensions
allsamplesgood <- RunUMAP(allsamplesgood, dims = 1:10)
# Plot umap
DimPlot(object = allsamplesgood, pt.size=0.5, reduction = "umap", label = T)
# Color by WT vs KO
DimPlot(object = allsamplesgood, pt.size=0.5, reduction = "umap", group.by = "orig.ident")
# Plot umap
dimplot1<-DimPlot(object = allsamplesgood, pt.size=0.5, reduction = "umap", label = T)
# Color by WT vs KO
dimplot2<-DimPlot(object = allsamplesgood, pt.size=0.5, reduction = "umap", group.by = "orig.ident")
# Save umaps
ggsave(paste(outputpath,"LCMV-umap-clusters.pdf",sep=""), plot = dimplot1, width = 20, height = 15, units = "cm")
ggsave(paste(outputpath,"LCMV-umap-clusters-type.pdf",sep=""), plot = dimplot2, width = 20, height = 15, units = "cm")
# Custom list of genes to visualize
mygenes = c("Ly6c1", "Tbc1d4","Foxp3","Klrg1", "Irf8","Fcer1g","Gm44175","Cd8a","Cd8b1","Mcm3","Dut","Pclaf","Cd4","Klrb1c")
RidgePlot(allsamplesgood, features = mygenes)
VlnPlot(allsamplesgood, features = mygenes, pt.size=0)
FeaturePlot(allsamplesgood, features = mygenes)
DotPlot(allsamplesgood, features = mygenes) + coord_flip()
DoHeatmap(subset(allsamplesgood), features = mygenes, size = 3)
Here we run findallmarkers to identify potential markers that are unique for each of the clusters.
########################### Find all markers (about 80 minutes)
#markersall=FindAllMarkers(allsamplesgood)
head(markersall)
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## Tbc1d4 1.158042e-86 1.0800634 0.517 0.214 1.915055e-82 0 Tbc1d4
## Cd4 1.364313e-84 0.9398292 0.794 0.464 2.256164e-80 0 Cd4
## Ctsw 3.829336e-84 -2.4903435 0.220 0.522 6.332572e-80 0 Ctsw
## Slamf6 1.890603e-78 1.0467176 0.760 0.484 3.126491e-74 0 Slamf6
## Sell 8.866188e-77 -1.1435992 0.636 0.816 1.466202e-72 0 Sell
## Bcl11b 3.750989e-74 0.7344389 0.920 0.644 6.203011e-70 0 Bcl11b
# Save all markers as tab delimited file
write.table(markersall, file=paste(outputpath,"LCMV-markersall.txt",sep=""),quote=F, row.names = FALSE, sep = ",")
# Plot selected markers
VlnPlot(allsamplesgood,features=c("Cd4","Ikzf2", "Ly6c1","Ccl4","Cxcr6","Fcgrt","Ccl5","Cd8b1","Cx3cl1"), pt.size = 0)
FeaturePlot(allsamplesgood, features="Cd8b1")
# Extract the top marker for each cluster
head(markersall)
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## Tbc1d4 1.158042e-86 1.0800634 0.517 0.214 1.915055e-82 0 Tbc1d4
## Cd4 1.364313e-84 0.9398292 0.794 0.464 2.256164e-80 0 Cd4
## Ctsw 3.829336e-84 -2.4903435 0.220 0.522 6.332572e-80 0 Ctsw
## Slamf6 1.890603e-78 1.0467176 0.760 0.484 3.126491e-74 0 Slamf6
## Sell 8.866188e-77 -1.1435992 0.636 0.816 1.466202e-72 0 Sell
## Bcl11b 3.750989e-74 0.7344389 0.920 0.644 6.203011e-70 0 Bcl11b
markersall %>%
group_by(cluster) %>%
slice(1) %>%
pull(gene) -> best.gene.per.cluster
best.gene.per.cluster
## [1] "Tbc1d4" "Rps29" "Ikzf2" "Ly6c1"
## [5] "Fcgrt" "Cd8a" "Klrb1c" "Itgb8"
## [9] "Hist1h3b" "Itgam" "Slfn5" "Lat2"
## [13] "Lef1" "Ly6c1" "Batf3" "1500009L16Rik"
## [17] "Ccl4" "Cma1" "Hmmr" "Lpl"
## [21] "Cx3cl1"
# Store plot as ggplot list
vlnplot1<-VlnPlot(allsamplesgood,features=best.gene.per.cluster, pt.size = 0, combine = FALSE)
# Set properties for each plot in the list
p1 <- list()
for (i in seq_along(vlnplot1)){
#Change x and y tick label font size.
p1[[i]] = vlnplot1[[i]] +
theme(legend.position = "none",
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8),
axis.title.y = element_text(size = 6),
axis.title.x = element_blank(),
title = element_text(size=8))
}
# Plot all the plots in the list
plot_grid(plotlist = p1, ncol = 3)
# Save plot to file
vlnplot1=plot_grid(plotlist = p1, ncol = 3)
ggsave(paste(outputpath,"LCMV-vlnplot-best-per-cluster.pdf",sep=""), plot = vlnplot1, width = 20, height = 25, units = "cm")
# Plot features in umap
fplot1<-FeaturePlot(allsamplesgood,features=best.gene.per.cluster, pt.size = 0, combine = FALSE)
p2 <- list()
for (i in seq_along(fplot1)){
#Change x and y label font size.
p2[[i]] = fplot1[[i]] +
theme(legend.position = "none",
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
title = element_text(size=8))
}
plot_grid(plotlist = p2, ncol = 4)
fplot1=plot_grid(plotlist = p2, ncol = 4)
ggsave(paste(outputpath,"LCMV-fplot-best-per-cluster.pdf",sep=""), plot = fplot1, width = 20, height = 25, units = "cm")
Here we use SCassist_analyze_and_annotate to analyze Top Markers, Predict Cell Types, and Optionally Annotate the Seurat Object.
Idents(allsamplesgood) <- "SCT_snn_res.1.1"
head(allsamplesgood)
## orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCCAAGAGTTGTA-WT WT 17880 4120 3.333333
## AAACCCAAGATTTGCC-WT WT 7916 2395 2.627590
## AAACCCACAAGTATCC-WT WT 9059 2865 1.953858
## AAACCCAGTCCAGGTC-WT WT 10948 2939 2.831567
## AAACCCATCCTGGGAC-WT WT 16768 3484 0.942271
## AAACGAACAGAGAGGG-WT WT 10467 3159 3.668673
## AAACGAACAGCGACAA-WT WT 12996 2853 1.569714
## AAACGAACAGTGTGGA-WT WT 26674 4654 1.570818
## AAACGAATCTAGCCTC-WT WT 11434 3100 1.687948
## AAACGCTAGCGTTCCG-WT WT 28638 4519 3.579161
## percent.ribo percent.hb nCount_SCT nFeature_SCT
## AAACCCAAGAGTTGTA-WT 22.80761 0.011185682 10914 3831
## AAACCCAAGATTTGCC-WT 16.28348 0.000000000 9352 2394
## AAACCCACAAGTATCC-WT 16.81201 0.000000000 9513 2863
## AAACCCAGTCCAGGTC-WT 25.30142 0.009134088 10388 2936
## AAACCCATCCTGGGAC-WT 35.07276 0.005963740 10691 3307
## AAACGAACAGAGAGGG-WT 22.92921 0.009553836 10239 3158
## AAACGAACAGCGACAA-WT 42.22068 0.000000000 10582 2853
## AAACGAACAGTGTGGA-WT 26.22029 0.000000000 10379 2877
## AAACGAATCTAGCCTC-WT 28.44149 0.000000000 10519 3100
## AAACGCTAGCGTTCCG-WT 43.16642 0.000000000 9760 2387
## SCT_snn_res.0.4 SCT_snn_res.0.5 SCT_snn_res.0.6
## AAACCCAAGAGTTGTA-WT 5 4 2
## AAACCCAAGATTTGCC-WT 4 6 5
## AAACCCACAAGTATCC-WT 5 4 2
## AAACCCAGTCCAGGTC-WT 3 5 1
## AAACCCATCCTGGGAC-WT 0 2 7
## AAACGAACAGAGAGGG-WT 7 7 8
## AAACGAACAGCGACAA-WT 2 1 0
## AAACGAACAGTGTGGA-WT 8 8 9
## AAACGAATCTAGCCTC-WT 3 5 1
## AAACGCTAGCGTTCCG-WT 0 2 6
## SCT_snn_res.0.7 SCT_snn_res.0.8 SCT_snn_res.0.9
## AAACCCAAGAGTTGTA-WT 5 3 8
## AAACCCAAGATTTGCC-WT 4 5 4
## AAACCCACAAGTATCC-WT 13 13 14
## AAACCCAGTCCAGGTC-WT 1 1 3
## AAACCCATCCTGGGAC-WT 6 7 6
## AAACGAACAGAGAGGG-WT 7 8 7
## AAACGAACAGCGACAA-WT 2 0 2
## AAACGAACAGTGTGGA-WT 9 9 10
## AAACGAATCTAGCCTC-WT 1 1 3
## AAACGCTAGCGTTCCG-WT 6 6 5
## SCT_snn_res.1 SCT_snn_res.1.1 SCT_snn_res.1.2
## AAACCCAAGAGTTGTA-WT 6 6 9
## AAACCCAAGATTTGCC-WT 5 9 6
## AAACCCACAAGTATCC-WT 17 18 18
## AAACCCAGTCCAGGTC-WT 12 13 13
## AAACCCATCCTGGGAC-WT 4 5 3
## AAACGAACAGAGAGGG-WT 8 7 8
## AAACGAACAGCGACAA-WT 1 1 19
## AAACGAACAGTGTGGA-WT 9 8 11
## AAACGAATCTAGCCTC-WT 3 3 4
## AAACGCTAGCGTTCCG-WT 7 4 5
## seurat_clusters
## AAACCCAAGAGTTGTA-WT 9
## AAACCCAAGATTTGCC-WT 6
## AAACCCACAAGTATCC-WT 18
## AAACCCAGTCCAGGTC-WT 13
## AAACCCATCCTGGGAC-WT 3
## AAACGAACAGAGAGGG-WT 8
## AAACGAACAGCGACAA-WT 19
## AAACGAACAGTGTGGA-WT 11
## AAACGAATCTAGCCTC-WT 4
## AAACGCTAGCGTTCCG-WT 5
# Run computational annotation. If seurat_object_name is provided, annotation will be added to that object and returned as sca_annotated, otherwise, sca_annotated will have the annotation details. Annotation detail is also saved in the current working directory as G_sca_annotation_results.tsv.
sca_annotated <- SCassist_analyze_and_annotate(markersall, top_genes = 30, seurat_object_name = "allsamplesgood", api_key_file = api_key_file, llm_server = "google")
## [1] "The output of Analyze and Annotate is saved as a tab delimited text if your current working directory. If you provided the name of the seurat object, the annotation is also added to that object in the SCassist_annotation column"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 0"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 1"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 2"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 3"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 4"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 5"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 6"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 7"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 8"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 9"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 10"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 11"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 12"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 13"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 14"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 15"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 16"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 17"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 18"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 19"
## [1] "SCassistant is analyzing markers to predict cell types for cluster : 20"
## cluster_num cell_type
## 1 0 T cell
## 2 1 T cell
## 3 2 Regulatory T cells
## 4 3 Immune cells (likely macrophages or dendritic cells)
## 5 4 T cell
## 6 5 Cytotoxic T lymphocytes
## 7 6 NK cells
## 8 7 Regulatory T cells
## 9 8 Proliferating cells
## 10 9 Natural Killer (NK) cells
## 11 10 Interferon-stimulated cells
## 12 11 Natural Killer (NK) cells
## 13 12 T cell
## 14 13 Interferon-stimulated cells
## 15 14 NK cell
## 16 15 T helper 2 (Th2) cells
## 17 16 NK cells
## 18 17 NK cells
## 19 18 Neural progenitor cells
## 20 19 Macrophage
## 21 20 B cell
## scassist_reasoning
## 1 The presence of markers like Cd4, Ccr5, Slamf6, Slamf7, Cd28, Klrd1, Prf1, and Nkg7 strongly suggests a T cell lineage. These markers are associated with T cell activation, differentiation, and effector functions.
## 2 The presence of Il2rb, Lef1, and Fau suggests a T cell lineage, with Il2rb being a key receptor for T cell activation and Lef1 playing a role in T cell development. The abundance of ribosomal proteins (Rps and Rpl genes) indicates active protein synthesis, consistent with the metabolic demands of T cells.
## 3 The presence of markers like Foxp3, Ctla4, Tnfrsf4, and Pdcd1 strongly suggests a regulatory T cell population. These markers are known to be involved in immune suppression and maintaining immune homeostasis.
## 4 The presence of markers like Ly6c1, Isg15, Stat1, Gbp2, Slfn5, S100a10, Igtp, Oasl2, Il2rb, Tmsb4x, Iigp1, Ly6a, Parp14, Slfn1, Ifit3, Lef1, Ifit1, S100a11, Smchd1, Tgtp1, Rps29, Irf7, Gbp4, Rps19, Bst2, Isg20, Herc6, Zbp1, Lgals1, Cst7 strongly suggests an immune cell origin. These markers are associated with interferon signaling, antiviral response, and immune regulation, which are key functions of macrophages and dendritic cells.
## 5 The presence of markers like Cd8a, Cd8b1, Ifngas1, Cxcr6, and Hopx strongly suggests a T cell lineage. These markers are associated with cytotoxic T cells, particularly those involved in immune responses and cytokine production.
## 6 The presence of Cd8a and Cd8b1, along with other markers like Ly6c2 and Fam241a, strongly suggests a cytotoxic T lymphocyte population. These markers are known to be involved in immune responses and the elimination of infected or cancerous cells.
## 7 The presence of markers like Klrb1c, Ncr1, Klrk1, Cd244a, Klre1, Klrb1f, Klrc2, Klrd1, Klri2, and Prf1 strongly suggests an NK cell identity. These markers are known to be involved in NK cell activation, cytotoxicity, and immune regulation.
## 8 The presence of markers like Foxp3, Ctla4, Ikzf2, and Cd2 strongly suggests a regulatory T cell population. These markers are known to be involved in immune suppression and maintaining immune homeostasis.
## 9 The presence of a high number of histone genes (Hist1h3b, Hist1h2af, Hist1h3c, Hist1h1b, Hist1h2ab, Hist1h2ap, Hist1h2ae, Hist1h3g, Hist1h2bm, Hist1h2an, Hist1h2bb, Hist1h2bn, Hist1h2ak, Hist1h2bj) suggests active DNA replication and cell division. Additionally, genes like Esco2, E2f8, Mxd3, Neil3, Pbk, E2f7, Nusap1, Pclaf, Rrm2, Anln, Spc25, Bub1 are involved in cell cycle regulation and DNA repair, further supporting the hypothesis of a proliferating cell population.
## 10 The presence of markers like Itgam, Klrb1c, Ncr1, Cd244a, Klrk1, Klrc2, Prf1, and Irf8 strongly suggests a NK cell identity. These markers are known to be involved in NK cell activation, cytotoxicity, and immune regulation. The expression of chemokine receptors like S1pr5, Ccl5, and Ccl3 further supports the NK cell phenotype, indicating their role in migration and homing to specific tissues.
## 11 The presence of numerous interferon-stimulated genes (ISGs) such as Ifit1, Isg20, Ifit3, Cxcl10, Isg15, Stat1, Ddx60, Oasl2, Samhd1, Ifi203, Ifit3b, Ifih1, Irf7, Ifi27l2a, and Ly6e strongly suggests an interferon-stimulated cell type, potentially involved in antiviral responses or immune modulation.
## 12 The presence of markers like Klrb1c, Klrk1, Klre1, Klrd1, Nkg7, Klrg1, and Klrc2 strongly suggests a NK cell lineage. These markers are known to be involved in NK cell activation, cytotoxicity, and immune regulation. Additionally, the presence of markers like Gzma, Gzmb, Prf1, and Itgam further supports the NK cell identity, indicating their role in target cell lysis and immune response.
## 13 The presence of Lef1, Tcf7, Stat1, Il2rb, Ccr7, and Trib2 suggests a T cell lineage, with potential involvement in immune response and regulation.
## 14 The presence of numerous interferon-stimulated genes (ISGs) such as Isg15, Isg20, Ifitm3, Gbp2, Oasl2, Ifit1, Ifit2, Parp14, Herc6, Iigp1, Stat1, Gbp4, Ifit3, Bst2, Igtp, Rsad2, Daxx, Zbp1, Irf7, Oas2, Gbp7, Trim30b, and S100a10 strongly suggests an interferon-stimulated cell type, likely involved in antiviral responses.
## 15 The presence of markers like Klrb1f, Klrc2, Ncr1, Cd244a, and Ccr5 strongly suggests an NK cell identity. These markers are known to be involved in NK cell development, activation, and cytotoxicity, indicating a role in immune surveillance and response to pathogens.
## 16 The presence of markers like Il9r, Il1rl1, Maf, and Gpr55 strongly suggests a Th2 cell identity. These markers are known to be involved in Th2 cell differentiation and function, particularly in allergic responses and immune regulation.
## 17 The presence of markers like Ifng, Prf1, Gzmb, Nkg7, Klrb1c, Klrb1f, and Cd244a strongly suggests an NK cell identity. These markers are associated with cytotoxic activity, cytokine production, and NK cell receptor signaling, all characteristic of NK cells.
## 18 The presence of markers like Klrg1, Klrb1c, Klrb1a, Klrc2, Klre1, Cd244a, Klrk1, Prf1, Gzma, and Ncr1 strongly suggests an NK cell identity. These markers are known to be involved in NK cell activation, cytotoxicity, and immune regulation.
## 19 The presence of markers like Hmmr, Aspm, Cdc20, Cdkn3, Ccnb1, Kif2c, Tpx2, Cep55, Ccnb2, Cenpe, Cip2a, Cenpf, Cdca8, 2610318N02Rik, Kif20a, Nek2, Birc5, Prr11, Troap, Ckap2, Knstrn, Ckap2l, Bub1b, Cdca3, Knl1, Kif4, Cks1b, Ube2c, Ccna2, Dlgap5 strongly suggests a neural progenitor cell population. These genes are involved in cell cycle regulation, spindle assembly, and microtubule dynamics, all crucial processes for the proliferation and differentiation of neural progenitor cells.
## 20 The presence of markers like Lpl, Cd5l, Vcam1, Sirpa, Mertk, Mrc1, C1qa, C1qc, and Fcgr4 strongly suggests a macrophage identity. These markers are involved in lipid metabolism, adhesion, phagocytosis, and immune response, all of which are key functions of macrophages.
## 21 The presence of markers like Cd19, Ms4a1, Fcrla, and Blk strongly suggests a B cell lineage. These markers are known to be involved in B cell development, activation, and antibody production. The presence of other markers like H2-Aa, H2-Ab1, and H2-DMb2 further supports this identification, as they are associated with MHC class II expression, a key feature of B cells.
# Print out Annotation details
#sca_annotation_results <- read.delim("G_sca_annotation_results.tsv", sep = "\t")
#print(sca_annotation_results)
# Assign sca_annotated seurat object back to allsamplesgood object
allsamplesgood = sca_annotated
Idents(allsamplesgood) <- "scassist_annotation_SCT_snn_res.1.1"
# Proportions and counts
prop.table(table(Idents(allsamplesgood), allsamplesgood$orig.ident))
##
## KO WT
## NK cells 0.042080655 0.062185856
## Natural Killer (NK) cells 0.040444185 0.040327294
## Neural progenitor cells 0.004208065 0.007247224
## Interferon-stimulated cells 0.014962011 0.051431911
## Cytotoxic T lymphocytes 0.032378726 0.046639392
## Regulatory T cells 0.097486850 0.050730567
## T cell 0.177206312 0.141788428
## Proliferating cells 0.015780245 0.037054354
## Immune cells (likely macrophages or dendritic cells) 0.014962011 0.072238457
## NK cell 0.008065459 0.011104617
## Macrophage 0.003506721 0.006195207
## B cell 0.001168907 0.001987142
## T helper 2 (Th2) cells 0.012857978 0.005961426
table(Idents(allsamplesgood), allsamplesgood$orig.ident)
##
## KO WT
## NK cells 360 532
## Natural Killer (NK) cells 346 345
## Neural progenitor cells 36 62
## Interferon-stimulated cells 128 440
## Cytotoxic T lymphocytes 277 399
## Regulatory T cells 834 434
## T cell 1516 1213
## Proliferating cells 135 317
## Immune cells (likely macrophages or dendritic cells) 128 618
## NK cell 69 95
## Macrophage 30 53
## B cell 10 17
## T helper 2 (Th2) cells 110 51
DimPlot(allsamplesgood, reduction = "umap", label = TRUE, repel = TRUE, group.by = 'scassist_annotation_SCT_snn_res.1.1', split.by = "orig.ident")
# create and save umap with labels
an0=DimPlot(allsamplesgood, reduction = "umap", label = TRUE, repel = TRUE, group.by = 'scassist_annotation_SCT_snn_res.1.1', split.by = "orig.ident")
#an0a=an0+theme(legend.position="none")
#an0
ggsave(paste(outputpath,"LCMV-umap-with-scassist.pdf",sep=""),plot=an0, width = 35, height = 13, units = "cm")
Idents(allsamplesgood) <- "SCT_snn_res.1.2"
# Proportion test
# assign sample group
allsamplesgood$sample=ifelse(grepl("WT",allsamplesgood$orig.ident), "WT", ifelse(grepl("KO",allsamplesgood$orig.ident), "KO", ""))
prop.table(table(Idents(allsamplesgood), allsamplesgood$orig.ident))
##
## KO WT
## 0 0.061484512 0.048626534
## 1 0.061952075 0.032612507
## 2 0.049094097 0.042080655
## 3 0.029807130 0.047925190
## 4 0.009234366 0.064406780
## 5 0.048509643 0.022910579
## 6 0.034132086 0.032963179
## 7 0.020222092 0.036469901
## 8 0.035417884 0.017884278
## 9 0.020455874 0.031209819
## 10 0.015312683 0.032261835
## 11 0.013091759 0.024663939
## 12 0.008883694 0.020689655
## 13 0.002104033 0.020105202
## 14 0.008182350 0.011455289
## 15 0.012974868 0.005961426
## 16 0.006896552 0.011221508
## 17 0.008533022 0.006545880
## 18 0.004208065 0.007597896
## 19 0.003039158 0.007130333
## 20 0.003389831 0.006195207
## 21 0.007013442 0.001987142
## 22 0.001168907 0.001987142
table(Idents(allsamplesgood), allsamplesgood$orig.ident)
##
## KO WT
## 0 526 416
## 1 530 279
## 2 420 360
## 3 255 410
## 4 79 551
## 5 415 196
## 6 292 282
## 7 173 312
## 8 303 153
## 9 175 267
## 10 131 276
## 11 112 211
## 12 76 177
## 13 18 172
## 14 70 98
## 15 111 51
## 16 59 96
## 17 73 56
## 18 36 65
## 19 26 61
## 20 29 53
## 21 60 17
## 22 10 17
# Save tables
write.table(prop.table(table(Idents(allsamplesgood), allsamplesgood$orig.ident)), file=paste(outputpath,"LCMV-cluster-proportions.txt",sep=""), quote=F, sep = "\t")
write.table(table(Idents(allsamplesgood), allsamplesgood$orig.ident), file=paste(outputpath,"LCMV-cluster-counts.txt",sep=""), quote=F, sep = "\t")
# Run permutation test
prop_test <- sc_utils(allsamplesgood)
prop_test <- permutation_test(
prop_test, cluster_identity = "SCT_snn_res.1.2",
sample_1 = "WT", sample_2 = "KO",
sample_identity = "orig.ident",
n_permutations = 10000
)
plot_data=prop_test@results$permutation
plot_data
## Key: <clusters>
## clusters KO WT obs_log2FD pval FDR
## <char> <num> <num> <num> <num> <num>
## 1: 0 0.132194019 0.090909091 0.5401604 0.00009999 0.0001642693
## 2: 1 0.133199296 0.060970280 1.1274084 0.00009999 0.0001642693
## 3: 10 0.032922845 0.060314685 -0.8734203 0.00009999 0.0001642693
## 4: 11 0.028147776 0.046110140 -0.7120631 0.00009999 0.0001642693
## 5: 12 0.019100276 0.038680070 -1.0179969 0.00009999 0.0001642693
## 6: 13 0.004523750 0.037587413 -3.0546586 0.00009999 0.0001642693
## 7: 14 0.017592360 0.021416084 -0.2837457 0.11978802 0.1252329313
## 8: 15 0.027896456 0.011145105 1.3236717 0.00009999 0.0001642693
## 9: 16 0.014827846 0.020979021 -0.5006383 0.02089791 0.0240325967
## 10: 17 0.018346318 0.012237762 0.5841508 0.01289871 0.0174511961
## 11: 18 0.009047499 0.014204545 -0.6507617 0.01479852 0.0189092202
## 12: 19 0.006534305 0.013330420 -1.0286165 0.00059994 0.0009199080
## 13: 2 0.105554159 0.078671329 0.4240736 0.00009999 0.0001642693
## 14: 20 0.007288263 0.011582168 -0.6682583 0.02829717 0.0309921389
## 15: 21 0.015079166 0.003715035 2.0211089 0.00009999 0.0001642693
## 16: 22 0.002513194 0.003715035 -0.5638536 0.21337866 0.2133786621
## 17: 3 0.064086454 0.089597902 -0.4834455 0.00009999 0.0001642693
## 18: 4 0.019854235 0.120410839 -2.6004466 0.00009999 0.0001642693
## 19: 5 0.104297562 0.042832168 1.2839388 0.00009999 0.0001642693
## 20: 6 0.073385273 0.061625874 0.2519544 0.01629837 0.0197296060
## 21: 7 0.043478261 0.068181818 -0.6490928 0.00009999 0.0001642693
## 22: 8 0.076149786 0.033435315 1.1874673 0.00009999 0.0001642693
## 23: 9 0.043980900 0.058347902 -0.4078037 0.00229977 0.0033059194
## clusters KO WT obs_log2FD pval FDR
## boot_mean_log2FD boot_CI_2.5 boot_CI_97.5
## <num> <num> <num>
## 1: 0.5402189 0.36493735 0.71923636
## 2: 1.1268935 0.92616928 1.33330811
## 3: -0.8761208 -1.17584033 -0.58885374
## 4: -0.7151484 -1.04864770 -0.39424836
## 5: -1.0186437 -1.41302869 -0.64451351
## 6: -3.0874029 -3.87727019 -2.41980722
## 7: -0.2821696 -0.72584208 0.15045583
## 8: 1.3307259 0.86823254 1.83105512
## 9: -0.5032535 -0.97418051 -0.04520463
## 10: 0.5814466 0.08235734 1.08920642
## 11: -0.6587217 -1.27944554 -0.08775983
## 12: -1.0456985 -1.75984470 -0.39871139
## 13: 0.4233094 0.23089945 0.61779589
## 14: -0.6778360 -1.35903380 -0.02071127
## 15: 2.0529062 1.31715837 2.93864675
## 16: -0.5896947 -1.88638533 0.52360925
## 17: -0.4839698 -0.70399246 -0.26808520
## 18: -2.6096665 -2.96569816 -2.28481871
## 19: 1.2872022 1.04894646 1.53173713
## 20: 0.2529342 0.02501069 0.47565588
## 21: -0.6514605 -0.91955541 -0.39336755
## 22: 1.1898458 0.92296154 1.46820292
## 23: -0.4079698 -0.67701255 -0.14567488
## boot_mean_log2FD boot_CI_2.5 boot_CI_97.5
FDR_threshold = 0.05
log2FD_threshold = log2(1.5)
order_clusters = TRUE
plot_data[, significance := ifelse(
FDR < FDR_threshold & abs(obs_log2FD) > log2FD_threshold,
paste("FDR <", FDR_threshold, "& abs(Log2FD) >", round(log2FD_threshold, 2)),
"n.s."
)]
plot_data[, significance := factor(significance, levels = c(
paste("FDR <", FDR_threshold, "& abs(Log2FD) >", round(log2FD_threshold, 2)),
"n.s."
))]
#library(tidyverse)
## Order the clusters by observed log2FD if requested.
if (order_clusters) {
plot_data[, clusters := fct_reorder(factor(clusters), dplyr::desc(obs_log2FD))]
}
## Plot the results.
p <- ggplot(plot_data, aes(x = clusters, y = obs_log2FD)) +
geom_pointrange(aes(ymin = boot_CI_2.5, ymax = boot_CI_97.5, color = significance)) +
theme_bw() +
geom_hline(yintercept = log2FD_threshold, lty = 2) +
geom_hline(yintercept = -log2FD_threshold, lty = 2) +
geom_hline(yintercept = 0) +
scale_color_manual(values = c("salmon", "grey")) +
coord_flip()
p
ggsave(paste(outputpath,"LCMV-cluster_proportion_test_labelled_scassist.pdf",sep=""), plot=p)
Here we perform pathway enrichment analysis for CD4+ clusters 15 and 13 that are different between WT vs KO in proportion test, as well as expressed the CD4+ marker Ly6c1. NOTE: cluster 13 is also identified by computational methods as “Interferon-stimulated cells”, which seems to be close to absent in the KO. SCassist does not perform enrichment analysis, but takes the clusterprofiler output for KEGG pathway and GO enrichment and tries to understand how the identified pathways could represent the system.
########################### tables
# Current identity levels
levels(allsamplesgood)
## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [16] "15" "16" "17" "18" "19" "20" "21" "22"
head(Idents(allsamplesgood))
## AAACCCAAGAGTTGTA-WT AAACCCAAGATTTGCC-WT AAACCCACAAGTATCC-WT AAACCCAGTCCAGGTC-WT
## 9 6 18 13
## AAACCCATCCTGGGAC-WT AAACGAACAGAGAGGG-WT
## 3 8
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
# Extract markers for cluster
# Filter the dataframe for cluster 13 and 15 (CD4+)
# Filter the dataframe for cluster 13, p-value <= 0.05, and |avg_log2FC| > 1
cluster13_markers <- markersall[
markersall$cluster == 13 &
markersall$p_val <= 0.05 &
abs(markersall$avg_log2FC) > 1,
]
# Run enrichment analysis using 'mouse' database, with experimental_design, pvalue and log2FC cut off values
scassist_pathway_summary_13 <- SCassist_analyze_enrichment(markers=cluster13_markers,organism = "mouse", experimental_design = experimental_design, pvalue=0.05, log2FC=1, api_key_file = api_key_file, llm_server = "google")
##
## The KEGG analysis is presented below.
## -------------------------------------------
##
## The ClusterProfiler KEGG pathways result is saved as a text file in the current working directory.
##
## The list of significantly enriched KEGG pathways is also saved as a text file in the current working directory.
##
## KEGG Enrichment Insights:
## ## Analysis of KEGG Pathway Enrichment Results
##
## ### 1. Significant Pathways:
##
## The KEGG pathway enrichment analysis reveals a complex interplay of pathways involved in immune response, cancer, and cellular processes in NK, CD4+, and CD8+ T cells from lymphocytic choriomeningitis virus (LCMV) infected WT and Ifng-CTCF binding site mutant mice. Several pathways related to cancer are enriched, suggesting a potential link between LCMV infection and tumorigenesis. These pathways include "Central carbon metabolism in cancer", "Choline metabolism in cancer", "MicroRNAs in cancer", and "Proteoglycans in cancer". Furthermore, pathways related to immune cell signaling and function are prominent, such as "PD-L1 expression and PD-1 checkpoint pathway in cancer", "T cell receptor signaling pathway", "Chemokine signaling pathway", and "Natural killer cell mediated cytotoxicity". These pathways highlight the crucial role of immune cells in controlling LCMV infection and the potential impact of the Ifng-CTCF mutation on immune responses.
##
## ### 2. Regulators:
##
## Several transcription factors are present in the enriched pathways, suggesting their potential involvement in regulating the observed gene expression changes. These include STAT1, JUN, NFKBIA, and SMAD3, all known to play critical roles in immune cell activation and differentiation. STAT1 is a key mediator of interferon signaling, while JUN and NFKBIA are involved in inflammatory responses. SMAD3 is a downstream effector of TGF-beta signaling, which can modulate immune cell function and contribute to tumorigenesis. The specific roles of these transcription factors in the context of LCMV infection and the Ifng-CTCF mutation warrant further investigation.
##
## ### 3. Key Genes or Potential Targets:
##
## Several genes from the enriched pathways could be crucial to the system under investigation. For instance, *Pik3r1* (phosphoinositide-3-kinase regulatory subunit 1) is a key component of the PI3K-Akt signaling pathway, which is involved in cell survival, proliferation, and immune responses. *Ifng* (interferon gamma) is a crucial cytokine for antiviral immunity and is directly affected by the CTCF mutation. *Cd274* (PD-L1) is a key checkpoint molecule that regulates immune responses and is implicated in cancer progression. *Casp3* (caspase 3) is a central effector of apoptosis, a process that can be triggered by LCMV infection and contribute to immune cell depletion. *Tnf* (tumor necrosis factor) is a pro-inflammatory cytokine that plays a critical role in immune responses and can contribute to tissue damage. These genes represent potential targets for therapeutic interventions aimed at modulating immune responses and controlling LCMV infection.
##
## The GO analysis is provided below.
## ----------------------------------------
##
## The ClusterProfiler GO enrichment result is saved as a text file in the current working directory.
##
## The list of significantly enriched GO terms is also saved as a text file in the current working directory.
##
## GO Enrichment Insights:
## ## 1. Significant Concepts:
##
## The GO enrichment results reveal a complex interplay of cellular processes in NK, CD4+, and CD8+ T cells from lymphocytic choriomeningitis virus (LCMV) infected mice. A prominent theme is the **activation and differentiation of immune cells**, particularly T cells, with significant enrichment for terms like "T cell activation involved in immune response", "CD4-positive, alpha-beta T cell activation", and "T cell differentiation". This suggests a robust immune response to LCMV infection, potentially influenced by the Ifng-CTCF binding site mutation. Furthermore, the enrichment for terms related to **DNA damage response and repair** indicates a significant impact of LCMV infection on cellular integrity, potentially leading to cell cycle arrest and apoptosis. This is further supported by the enrichment for terms related to **apoptosis** in various cell types, including T cells, B cells, and macrophages.
##
## ## 2. Regulators:
##
## Several transcription factors are enriched in the GO analysis, suggesting their potential involvement in regulating the immune response to LCMV infection. For example, **Tbx21** (T-bet) is a key regulator of T helper 1 (Th1) cell differentiation, while **Foxp3** is crucial for regulatory T cell (Treg) development. **Irf1** and **Irf4** are transcription factors involved in interferon signaling and T cell development, respectively. **Eomes** is a transcription factor essential for CD8+ T cell differentiation and function. These transcription factors, along with **Gata3** (a Th2 cell differentiation factor), may be differentially regulated in the Ifng-CTCF mutant mice, potentially impacting the immune response to LCMV infection.
##
## ## 3. Key Genes or Potential Targets:
##
## Several genes from the enriched ontologies could be crucial to the system and potential targets for further investigation. **Ifng** (interferon-gamma) is a key cytokine involved in Th1 cell differentiation and antiviral immunity. Its dysregulation in the Ifng-CTCF mutant mice could significantly impact the immune response to LCMV infection. **Cd274** (PD-L1) is an immune checkpoint molecule that inhibits T cell activation. Its upregulation in the context of LCMV infection could contribute to immune evasion by the virus. **Cd44** is a cell adhesion molecule involved in T cell activation and migration. Its dysregulation could affect T cell trafficking and immune response. **Ripk2** is a kinase involved in both NF-κB signaling and necroptosis, a form of programmed cell death. Its role in LCMV infection could be complex, potentially contributing to both immune activation and cell death. **Il2ra** (CD25) is the alpha chain of the interleukin-2 receptor, crucial for T cell proliferation and survival. Its dysregulation could affect T cell expansion and immune response. Finally, **Tbx21** (T-bet) and **Foxp3** are transcription factors that could be targeted to modulate the balance between Th1 and Treg cells, potentially influencing the outcome of LCMV infection.
##
## Overall Summary:
## ## Comprehensive Summary of KEGG and GO Enrichment Results
##
## This analysis of differentially expressed genes from NK, CD4+, and CD8+ T cells in LCMV-infected WT and Ifng-CTCF mutant mice reveals a complex interplay of pathways and cellular processes involved in immune response, cancer, and cellular integrity.
##
## **KEGG Pathway Enrichment:**
##
## * **Cancer-related pathways:** Enrichment of pathways like "Central carbon metabolism in cancer", "Choline metabolism in cancer", "MicroRNAs in cancer", and "Proteoglycans in cancer" suggests a potential link between LCMV infection and tumorigenesis.
## * **Immune cell signaling and function:** Prominent pathways include "PD-L1 expression and PD-1 checkpoint pathway in cancer", "T cell receptor signaling pathway", "Chemokine signaling pathway", and "Natural killer cell mediated cytotoxicity", highlighting the crucial role of immune cells in controlling LCMV infection.
## * **Key regulators:** Transcription factors like STAT1, JUN, NFKBIA, and SMAD3 are present in the enriched pathways, suggesting their involvement in regulating gene expression changes related to immune cell activation and differentiation.
## * **Potential targets:** Genes like *Pik3r1*, *Ifng*, *Cd274*, *Casp3*, and *Tnf* are identified as potential targets for therapeutic interventions aimed at modulating immune responses and controlling LCMV infection.
##
## **GO Enrichment:**
##
## * **Immune cell activation and differentiation:** Significant enrichment for terms like "T cell activation involved in immune response", "CD4-positive, alpha-beta T cell activation", and "T cell differentiation" suggests a robust immune response to LCMV infection, potentially influenced by the Ifng-CTCF binding site mutation.
## * **DNA damage response and repair:** Enrichment for terms related to DNA damage response and repair indicates a significant impact of LCMV infection on cellular integrity, potentially leading to cell cycle arrest and apoptosis.
## * **Apoptosis:** Enrichment for terms related to apoptosis in various cell types, including T cells, B cells, and macrophages, further supports the impact of LCMV infection on cellular integrity.
## * **Key regulators:** Transcription factors like Tbx21, Foxp3, Irf1, Irf4, Eomes, and Gata3 are enriched, suggesting their potential involvement in regulating the immune response to LCMV infection.
## * **Potential targets:** Genes like *Ifng*, *Cd274*, *Cd44*, *Ripk2*, *Il2ra*, *Tbx21*, and *Foxp3* are identified as potential targets for further investigation due to their crucial roles in immune response and cellular processes.
##
## **Overall, the combined KEGG and GO enrichment results suggest a complex interplay of pathways and cellular processes involved in the immune response to LCMV infection, potentially influenced by the Ifng-CTCF binding site mutation. The identified key regulators and potential targets provide valuable insights for further investigation into the mechanisms underlying LCMV infection and the development of therapeutic strategies.**
# Network style summary of the enrichment analysis of Cluster 13
scassist_pathway_summary_13
# Filter the dataframe for cluster 15, p-value <= 0.05, and |avg_log2FC| > 1
cluster15_markers <- markersall[
markersall$cluster == 15 &
markersall$p_val <= 0.05 &
abs(markersall$avg_log2FC) > 1,
]
# Run enrichment analysis using 'mouse' database, with experimental_design, pvalue and log2FC cut off values
scassist_pathway_summary_15 <- SCassist_analyze_enrichment(markers=cluster15_markers,organism = "mouse", experimental_design = experimental_design, pvalue=0.05, log2FC=1, api_key_file = api_key_file, llm_server = "google")
##
## The KEGG analysis is presented below.
## -------------------------------------------
##
## The ClusterProfiler KEGG pathways result is saved as a text file in the current working directory.
##
## The list of significantly enriched KEGG pathways is also saved as a text file in the current working directory.
##
## KEGG Enrichment Insights:
## ## Analysis of KEGG Pathway Enrichment Results
##
## **1. Significant Pathways:**
##
## The KEGG pathway enrichment analysis reveals a complex interplay of pathways involved in immune response, cancer, and cellular processes in NK, CD4+, and CD8+ T cells from lymphocytic choriomeningitis virus (LCMV) infected WT and Ifng-CTCF binding site mutant mice. Several pathways related to cancer development and progression are enriched, including "Transcriptional Misregulation in Cancer", "Acute Myeloid Leukemia", and various cancer types like "Colorectal Cancer" and "Hepatocellular Carcinoma". This suggests that LCMV infection, particularly in the context of the Ifng-CTCF mutant, might influence cellular processes that contribute to cancer development. Furthermore, pathways related to immune response, such as "Natural Killer Cell Mediated Cytotoxicity", "T Cell Receptor Signaling Pathway", and "Th1 and Th2 Cell Differentiation", are also significantly enriched, indicating a strong impact of LCMV infection on immune cell function and differentiation.
##
## **2. Regulators:**
##
## Several transcription factors are identified within the enriched pathways, potentially playing a role in regulating the observed changes in gene expression. These include "Maf", "Hif1a", "Runx2", "Cebpa", "Tcf7", "Lef1", "E2f2", "Pparg", and "Gata3". These transcription factors are known to be involved in various cellular processes, including immune response, cell differentiation, and cancer development. Their involvement in the context of LCMV infection and the Ifng-CTCF mutant could be crucial in understanding the observed changes in immune cell function and potential cancer susceptibility.
##
## **3. Key Genes or Potential Targets:**
##
## Several genes from the enriched pathways stand out as potential targets for further investigation due to their potential impact on the system. "Ifng" itself, being the gene affected by the CTCF binding site mutation, is a key player in immune response and could be directly involved in the observed changes in immune cell function. Genes like "Cd28", "Cd80", "Cd86", "Prf1", "Fasl", and "Gzmb" are crucial for T cell activation and cytotoxicity, suggesting their potential role in the altered immune response observed in the mutant mice. Genes involved in cancer development, such as "Bcl2", "Cdkn1a", "Mmp9", "Pik3cb", and "Tgfb3", could be important targets for understanding the potential link between LCMV infection and cancer susceptibility. Furthermore, genes involved in cellular processes like "Actg1", "Itgb3", "Src", and "Jun" could be crucial for understanding the overall cellular response to LCMV infection and the impact of the Ifng-CTCF mutation.
##
## The GO analysis is provided below.
## ----------------------------------------
##
## The ClusterProfiler GO enrichment result is saved as a text file in the current working directory.
##
## The list of significantly enriched GO terms is also saved as a text file in the current working directory.
##
## GO Enrichment Insights:
## ## 1. Significant Concepts:
##
## The GO enrichment results reveal a complex interplay of biological processes in NK, CD4+, and CD8+ T cells from lymphocytic choriomeningitis virus (LCMV)-infected mice. Several concepts highlight the immune response to viral infection, including T cell activation, differentiation, and proliferation, as well as cytokine production and immune response regulation. Additionally, there is a strong emphasis on cellular processes related to cell migration, adhesion, and death, suggesting a dynamic interplay between immune cells and the surrounding environment.
##
## Furthermore, the enrichment analysis reveals significant involvement of metabolic processes, particularly those related to amino acids, lipids, and carbohydrates. This suggests that metabolic reprogramming plays a crucial role in the immune response to LCMV infection, potentially influencing cell survival, proliferation, and effector function.
##
## ## 2. Regulators:
##
## Several transcription factors are enriched in the GO analysis, suggesting their potential involvement in regulating the immune response to LCMV infection. For example, **Foxp3**, a key regulator of regulatory T cell (Treg) differentiation, is significantly enriched in several GO terms related to T cell activation, differentiation, and cytokine production. **Gata3**, a master regulator of T helper 2 (Th2) cell differentiation, is also enriched in several GO terms related to T cell activation, differentiation, and cytokine production. **Eomes**, a transcription factor crucial for CD8+ T cell differentiation, is enriched in GO terms related to CD8+ T cell activation and differentiation. **Batf**, a transcription factor involved in both Treg and Th17 cell differentiation, is also enriched in several GO terms related to T cell activation and differentiation. These transcription factors may play a crucial role in shaping the immune response to LCMV infection in both WT and Ifng-CTCF mutant mice.
##
## ## 3. Key Genes or Potential Targets:
##
## Several genes from the enriched ontologies could be important to the system or a target based on their potential impact on NK, CD4+, and CD8+ T cells from LCMV-infected mice. **Ifng**, the gene encoding interferon-gamma, is directly affected by the CTCF binding site mutation and is likely a key player in the altered immune response observed in mutant mice. **Il2ra**, encoding the interleukin-2 receptor alpha chain, is involved in T cell activation and proliferation, and its dysregulation could contribute to the altered immune response in mutant mice. **Cd28**, a co-stimulatory molecule crucial for T cell activation, is enriched in several GO terms related to T cell activation and proliferation, suggesting its potential role in the altered immune response. **Ripk3**, a key regulator of necroptosis, is enriched in GO terms related to T cell apoptosis and cell death, suggesting its potential role in the altered immune response and cell fate decisions. **Prdx4**, a peroxiredoxin involved in oxidative stress response, is enriched in GO terms related to metabolic processes and cellular response to oxidative stress, suggesting its potential role in the altered metabolic reprogramming and cell survival in mutant mice. **Hif1a**, a transcription factor involved in hypoxia response, is enriched in several GO terms related to metabolic processes and cellular response to hypoxia, suggesting its potential role in the altered metabolic reprogramming and cell survival in mutant mice. Further investigation of these genes could provide valuable insights into the mechanisms underlying the altered immune response to LCMV infection in Ifng-CTCF mutant mice.
##
## Overall Summary:
## ## Comprehensive Summary of KEGG and GO Enrichment Results
##
## This analysis of differentially expressed genes from NK, CD4+, and CD8+ T cells isolated from LCMV-infected WT and Ifng-CTCF mutant mice reveals a complex interplay of pathways and biological processes impacted by the viral infection and the genetic mutation.
##
## **KEGG Pathway Enrichment:**
##
## * **Cancer-related pathways:** Enrichment of pathways like "Transcriptional Misregulation in Cancer", "Acute Myeloid Leukemia", "Colorectal Cancer", and "Hepatocellular Carcinoma" suggests a potential link between LCMV infection, particularly in the context of the Ifng-CTCF mutation, and cancer development.
## * **Immune response pathways:** Significant enrichment of pathways like "Natural Killer Cell Mediated Cytotoxicity", "T Cell Receptor Signaling Pathway", and "Th1 and Th2 Cell Differentiation" indicates a strong impact of LCMV infection on immune cell function and differentiation.
## * **Key regulators:** Transcription factors like Maf, Hif1a, Runx2, Cebpa, Tcf7, Lef1, E2f2, Pparg, and Gata3 are identified within the enriched pathways, potentially playing a role in regulating the observed changes in gene expression.
##
## **GO Enrichment:**
##
## * **Immune response:** GO terms related to T cell activation, differentiation, proliferation, cytokine production, and immune response regulation highlight the complex immune response to viral infection.
## * **Cellular processes:** Enrichment of terms related to cell migration, adhesion, and death suggests a dynamic interplay between immune cells and their environment.
## * **Metabolic processes:** Significant involvement of metabolic processes, particularly those related to amino acids, lipids, and carbohydrates, suggests a crucial role for metabolic reprogramming in the immune response to LCMV infection.
## * **Key regulators:** Transcription factors like Foxp3, Gata3, Eomes, and Batf are enriched in GO terms related to T cell activation, differentiation, and cytokine production, suggesting their potential involvement in shaping the immune response.
##
## **Key Genes and Potential Targets:**
##
## * **Ifng:** The gene directly affected by the CTCF binding site mutation, likely playing a key role in the altered immune response observed in mutant mice.
## * **Il2ra:** Involved in T cell activation and proliferation, its dysregulation could contribute to the altered immune response in mutant mice.
## * **Cd28:** A co-stimulatory molecule crucial for T cell activation, potentially involved in the altered immune response.
## * **Ripk3:** A key regulator of necroptosis, potentially involved in the altered immune response and cell fate decisions.
## * **Prdx4:** A peroxiredoxin involved in oxidative stress response, potentially involved in the altered metabolic reprogramming and cell survival in mutant mice.
## * **Hif1a:** A transcription factor involved in hypoxia response, potentially involved in the altered metabolic reprogramming and cell survival in mutant mice.
##
## **Overall, this combined analysis suggests that LCMV infection, particularly in the context of the Ifng-CTCF mutation, significantly impacts immune cell function, differentiation, and metabolic reprogramming. Further investigation of the identified key genes and regulators could provide valuable insights into the mechanisms underlying these changes and the potential link between LCMV infection and cancer susceptibility.**
# Network style summary of the enrichment analysis
scassist_pathway_summary_15
#save.image(file=paste(outputpath,"LCMV_G.RData",sep=""))
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.7.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] org.Mm.eg.db_3.19.1 AnnotationDbi_1.66.0
## [3] IRanges_2.38.1 S4Vectors_0.42.1
## [5] Biobase_2.64.0 BiocGenerics_0.50.0
## [7] jsonlite_1.8.8 httr_1.4.7
## [9] SCassist_0.1.0 rollama_0.1.0
## [11] scProportionTest_0.0.0.9000 cowplot_1.1.3
## [13] lubridate_1.9.3 forcats_1.0.0
## [15] stringr_1.5.1 dplyr_1.1.4
## [17] purrr_1.0.2 readr_2.1.5
## [19] tidyr_1.3.1 tibble_3.2.1
## [21] tidyverse_2.0.0 plotly_4.10.4
## [23] ggplot2_3.5.1 Seurat_5.0.0
## [25] SeuratObject_5.0.2 sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] fs_1.6.4 matrixStats_1.3.0
## [3] spatstat.sparse_3.1-0 enrichplot_1.24.4
## [5] RColorBrewer_1.1-3 tools_4.4.1
## [7] sctransform_0.4.1 utf8_1.2.4
## [9] R6_2.5.1 lazyeval_0.2.2
## [11] uwot_0.2.2 withr_3.0.0
## [13] gridExtra_2.3 progressr_0.14.0
## [15] textshaping_0.4.0 cli_3.6.3
## [17] spatstat.explore_3.3-2 fastDummies_1.7.4
## [19] scatterpie_0.2.4 labeling_0.4.3
## [21] sass_0.4.9 spatstat.data_3.1-2
## [23] ggridges_0.5.6 pbapply_1.7-2
## [25] systemfonts_1.1.0 yulab.utils_0.1.7
## [27] gson_0.1.0 DOSE_3.30.5
## [29] R.utils_2.12.3 parallelly_1.38.0
## [31] rstudioapi_0.16.0 RSQLite_2.3.7
## [33] visNetwork_2.1.2 generics_0.1.3
## [35] gridGraphics_0.5-1 ica_1.0-3
## [37] spatstat.random_3.3-2 GO.db_3.19.1
## [39] Matrix_1.7-0 ggbeeswarm_0.7.2
## [41] fansi_1.0.6 abind_1.4-5
## [43] R.methodsS3_1.8.2 lifecycle_1.0.4
## [45] yaml_2.3.9 SummarizedExperiment_1.34.0
## [47] SparseArray_1.4.8 glmGamPoi_1.16.0
## [49] qvalue_2.36.0 Rtsne_0.17
## [51] grid_4.4.1 blob_1.2.4
## [53] promises_1.3.0 crayon_1.5.3
## [55] miniUI_0.1.1.1 lattice_0.22-6
## [57] KEGGREST_1.44.1 pillar_1.9.0
## [59] knitr_1.48 GenomicRanges_1.56.1
## [61] fgsea_1.30.0 future.apply_1.11.2
## [63] codetools_0.2-20 fastmatch_1.1-4
## [65] leiden_0.4.3.1 glue_1.7.0
## [67] ggfun_0.1.5 spatstat.univar_3.0-1
## [69] data.table_1.15.4 vctrs_0.6.5
## [71] png_0.1-8 treeio_1.28.0
## [73] spam_2.10-0 gtable_0.3.5
## [75] cachem_1.1.0 xfun_0.45
## [77] S4Arrays_1.4.1 mime_0.12
## [79] tidygraph_1.3.1 survival_3.6-4
## [81] fitdistrplus_1.2-1 ROCR_1.0-11
## [83] nlme_3.1-164 ggtree_3.12.0
## [85] bit64_4.0.5 RcppAnnoy_0.0.22
## [87] GenomeInfoDb_1.40.1 bslib_0.7.0
## [89] irlba_2.3.5.1 vipor_0.4.7
## [91] KernSmooth_2.23-24 colorspace_2.1-0
## [93] DBI_1.2.3 ggrastr_1.0.2
## [95] tidyselect_1.2.1 curl_5.2.1
## [97] bit_4.0.5 compiler_4.4.1
## [99] httr2_1.0.1 hdf5r_1.3.11
## [101] DelayedArray_0.30.1 shadowtext_0.1.4
## [103] scales_1.3.0 lmtest_0.9-40
## [105] rappdirs_0.3.3 digest_0.6.36
## [107] goftest_1.2-3 spatstat.utils_3.1-0
## [109] rmarkdown_2.27 XVector_0.44.0
## [111] htmltools_0.5.8.1 pkgconfig_2.0.3
## [113] sparseMatrixStats_1.16.0 MatrixGenerics_1.16.0
## [115] highr_0.11 fastmap_1.2.0
## [117] rlang_1.1.4 htmlwidgets_1.6.4
## [119] UCSC.utils_1.0.0 DelayedMatrixStats_1.26.0
## [121] shiny_1.8.1.1 farver_2.1.2
## [123] jquerylib_0.1.4 zoo_1.8-12
## [125] BiocParallel_1.38.0 GOSemSim_2.30.2
## [127] R.oo_1.26.0 magrittr_2.0.3
## [129] GenomeInfoDbData_1.2.12 ggplotify_0.1.2
## [131] dotCall64_1.1-1 patchwork_1.2.0
## [133] munsell_0.5.1 Rcpp_1.0.12
## [135] ape_5.8 viridis_0.6.5
## [137] reticulate_1.39.0 stringi_1.8.4
## [139] ggraph_2.2.1 zlibbioc_1.50.0
## [141] MASS_7.3-60.2 plyr_1.8.9
## [143] org.Hs.eg.db_3.19.1 parallel_4.4.1
## [145] listenv_0.9.1 ggrepel_0.9.5
## [147] deldir_2.0-4 Biostrings_2.72.1
## [149] graphlayouts_1.2.0 splines_4.4.1
## [151] tensor_1.5 hms_1.1.3
## [153] igraph_2.0.3 spatstat.geom_3.3-3
## [155] RcppHNSW_0.6.0 reshape2_1.4.4
## [157] evaluate_0.24.0 BiocManager_1.30.23
## [159] tzdb_0.4.0 tweenr_2.0.3
## [161] httpuv_1.6.15 RANN_2.6.2
## [163] polyclip_1.10-6 future_1.34.0
## [165] scattermore_1.2 ggforce_0.4.2
## [167] xtable_1.8-4 RSpectra_0.16-1
## [169] tidytree_0.4.6 later_1.3.2
## [171] viridisLite_0.4.2 ragg_1.3.2
## [173] clusterProfiler_4.12.6 aplot_0.2.3
## [175] memoise_2.0.1 beeswarm_0.4.0
## [177] cluster_2.1.6 timechange_0.3.0
## [179] globals_0.16.3