GSVA

2024-11-22 17:47:45

https://bioconductor.org/packages/release/bioc/html/TCGAbiolinks.html

library(TCGAbiolinks)

library(maftools)

getGDCprojects()

query <- GDCquery(

project = "TCGA-HNSC",

data.category = "Transcriptome Profiling", experimental.strategy = 'RNA-Seq',

data.type = "Gene Expression Quantification",

workflow.type = "HTSeq - Counts")

GDCdownload(query)

d <- getResults(query)

HNSC.Rnaseq.SE <- GDCprepare(query)

HNSCMatrix <- assay(HNSC.Rnaseq.SE,"unstranded")

HNSC.RNAseq_CorOutliers <- TCGAanalyze_Preprocessing(HNSC.Rnaseq.SE)

library(TCGAbiolinks)

# normalization of genes

dataNorm <- TCGAanalyze_Normalization(

tabDF = HNSC.RNAseq_CorOutliers,

geneInfo = geneInfoHT

)

# quantile filter of genes

dataFilt <- TCGAanalyze_Filtering(

tabDF = dataNorm,

method = "quantile",

qnt.cut = 0.25

)

# selection of normal samples "NT"

samplesNT <- TCGAquery_SampleTypes(

barcode = colnames(dataFilt),

typesample = c("NT")

)

# selection of tumor samples "TP"

samplesTP <- TCGAquery_SampleTypes(

barcode = colnames(dataFilt),

typesample = c("TP")

)

# Diff.expr.analysis (DEA)

dataDEGs <- TCGAanalyze_DEA(

mat1 = dataFilt[,samplesNT],

mat2 = dataFilt[,samplesTP],

Cond1type = "Normal",

Cond2type = "Tumor",

fdr.cut = 0.01 ,

logFC.cut = 1,

method = "glmLRT"

)

# DEGs table with expression values in normal and tumor samples

dataDEGsFiltLevel <- TCGAanalyze_LevelTab(

FC_FDR_table_mRNA = dataDEGs,

typeCond1 = "Tumor",

typeCond2 = "Normal",

TableCond1 = dataFilt[,samplesTP],

TableCond2 = dataFilt[,samplesNT]

)

https://alexslemonade.github.io/refinebio-examples/03-rnaseq/pathway-analysis_rnaseq_03_gsva.html#24_About_the_dataset_we_are_using_for_this_example

if (!("DESeq2" %in% installed.packages())) {

# Install this package if it isn't installed yet

BiocManager::install("DESeq2", update = FALSE)

}

if (!("GSVA" %in% installed.packages())) {

# Install this package if it isn't installed yet

BiocManager::install("GSVA", update = FALSE)

}

if (!("qusage" %in% installed.packages())) {

# Install this package if it isn't installed yet

BiocManager::install("qusage", update = FALSE)

}

if (!("org.Hs.eg.db" %in% installed.packages())) {

# Install this package if it isn't installed yet

BiocManager::install("org.Hs.eg.db", update = FALSE)

}

if (!("pheatmap" %in% installed.packages())) {

# Install pheatmap

install.packages("pheatmap", update = FALSE)

}

library(DESeq2)

# Attach the `qusage` library

library(qusage)

# Attach the `GSVA` library

library(GSVA)

# Human annotation package we'll use for gene identifier conversion

library(org.Hs.eg.db)

# We will need this so we can use the pipe: %>%

library(magrittr)

library(msigdbr)

Prepare data for DESeq2

dds <- DESeqDataSetFromMatrix(

countData = expression_df, # Our prepped data frame with counts

colData = metadata, # Data frame with annotation for our samples

design = ~1 # Here we are not specifying a model

)

dds_norm <- vst(dds)

vst_df <- assay(dds_norm) %>%

as.data.frame() %>% # Make into a data frame

tibble::rownames_to_column("gene")

names(vst_df)[1] <- "SYMBOL"

hallmark_gene_sets <- msigdbr::msigdbr(

species = "Homo sapiens", # Can change this to what species you need

category = "H" # Only hallmark gene sets

)

head(hallmark_gene_sets)

gs_cat <chr>	gs_subcat <chr>	gs_name <chr>	gene_symbol <chr>	entrez_gene <int>	ensembl_gene <chr>
H		HALLMARK_ADIPOGENESIS	ABCA1	19	ENSG00000165029
H		HALLMARK_ADIPOGENESIS	ABCB8	11194	ENSG00000197150

gs_cat

<chr>

gs_subcat

<chr>

gs_name

<chr>

gene_symbol

<chr>

entrez_gene

<int>

ensembl_gene

<chr>

HALLMARK_ADIPOGENESIS

ABCA1

ENSG00000165029

HALLMARK_ADIPOGENESIS

ABCB8

11194

ENSG00000197150

hallmarks_list <- split(

hallmark_gene_sets$entrez_gene, # The genes we want split into pathways

hallmark_gene_sets$gs_name # The pathways made as the higher levels of the list

)

## $HALLMARK_ADIPOGENESIS

## [1] 19 11194 10449 33 34 35 47 50 51

## [10] 112 149685 9370 79602 79602 56894 9131 204 217

keytypes(org.Hs.eg.db)

## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"

## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"

## [9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"

## [13] "IPI" "MAP" "OMIM" "ONTOLOGY"

## [17] "ONTOLOGYALL" "PATH" "PFAM" "PMID"

## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"

## [25] "UNIGENE" "UNIPROT"

msigdbr_collections()

# A tibble: 23 x 3

gs_cat gs_subcat num_genesets

1 C1 "" 299

2 C2 "CGP" 3384

3 C2 "CP" 29

4 C2 "CP:BIOCARTA" 292

5 C2 "CP:REACTOME" 186

6 C2 "CP:PID" 196

7 C2 "CP:REACTOME" 1615

8 C2 "CP:WIKIPATHWAYS" 664

9 C3 "MIR:MIRDB" 2377

10 C3 "MIR:MIR_Legacy" 221

cgp_gene_sets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CGP")

cgp_list <- split(

cgp_gene_sets$entrez_gene, # The genes we want split into pathways

cgp_gene_sets$gs_name # The pathways made as the higher levels of the list

)

REACTOME_gene_sets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:REACTOME")

REACTOME_list <- split(

REACTOME_gene_sets$entrez_gene, # The genes we want split into pathways

REACTOME_gene_sets$gs_name # The pathways made as the higher levels of the list

)

BIOCARTA_gene_sets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:BIOCARTA")

BIOCARTA_list <- split(

BIOCARTA_gene_sets$entrez_gene, # The genes we want split into pathways

BIOCARTA_gene_sets$gs_name # The pathways made as the higher levels of the list

)

mapped_df <- data.frame(

"entrez_id" = mapIds(

# Replace with annotation package for the organism relevant to your data

org.Hs.eg.db,

keys = vst_df$SYMBOL,

# Replace with the type of gene identifiers in your data

keytype = "SYMBOL",

# Replace with the type of gene identifiers you would like to map to

column = "ENTREZID",

# This will keep only the first mapped value for each Ensembl ID

multiVals = "first"

)

) %>%

# If an Ensembl gene identifier doesn't map to a Entrez gene identifier,

# drop that from the data frame

dplyr::filter(!is.na(entrez_id)) %>%

# Make an `Ensembl` column to store the row names

tibble::rownames_to_column("SYMBOL") %>%

# Now let's join the rest of the expression data

dplyr::inner_join(vst_df, by = c("SYMBOL" = "SYMBOL"))

mapped_df[1:3, 1:5]

entrez_id[1945] <- NULL #remove list

entrez_id1 <- as_tibble(entrez_id)

which(is.na(filtered_mapped_matrix$entrez_id)) # check missing value

SYMBOL entrez_id TCGA.4P.AA8J.01A.11R.A39I.07 TCGA.BA.4074.01A.01R.1436.07 TCGA.BA.4075.01A.01R.1436.07

1 A1BG 1 7.761813 7.828986 7.020303

2 A1CF 29974 5.174723 5.178919 5.186378

3 A2ML1 144568 11.034088 8.121422 6.562782

gene_means <- rowMeans(mapped_df %>% dplyr::select(-SYMBOL, -entrez_id))

mapped_df <- mapped_df %>%

# Add gene_means as a column called gene_means

dplyr::mutate(gene_means) %>%

# Reorder the columns so `gene_means` column is upfront

dplyr::select(SYMBOL, entrez_id, gene_means, dplyr::everything())

filtered_mapped_df <- mapped_df %>%

# Sort so that the highest mean expression values are at the top

dplyr::arrange(dplyr::desc(gene_means)) %>%

# Filter out the duplicated rows using `dplyr::distinct()`

dplyr::distinct(entrez_id, .keep_all = TRUE)

sum(duplicated(filtered_mapped_df$SYMBOL))

filtered_mapped_matrix <- filtered_mapped_df %>%

# GSVA can't the Ensembl IDs so we should drop this column as well as the means

dplyr::select(-SYMBOL, -gene_means) %>%

# We need to store our gene identifiers as row names

tibble::column_to_rownames("entrez_id") %>%

# Now we can convert our object into a matrix

as.matrix()

gsva_results <- gsva(

filtered_mapped_matrix,

hallmarks_list,

method = "gsva",

# Appropriate for our vst transformed data

kcdf = "Gaussian",

# Minimum gene set size

min.sz = 15,

# Maximum gene set size

max.sz = 500,

# Compute Gaussian-distributed scores

mx.diff = TRUE,

# Don't print out the progress bar

verbose = FALSE

)

tgsva_REACTOME_results <- as.data.frame(t(gsva_REACTOME_results))

tgsva_REACTOME_results$ID <- rownames(tgsva_REACTOME_results)

gsva_REACTOME_results1 <- merge(metadata, tgsva_REACTOME_results, by = "ID")

ddd1 <- gsva_REACTOME_results1[, 2:52] %>%

gather(key = variable, value = value, - group) %>%

group_by(group, variable) %>%

summarise(value = list(value)) %>%

spread(group, value) %>%

group_by(variable) %>%

mutate(p_value = t.test(unlist(TT), unlist(NT))$p.value,

t_value = t.test(unlist(TT), unlist(NT))$statistic)

ggplot(ddd1, aes(reorder(variable, t_value), t_value)) +

geom_col(aes(fill=p_value<0.05)) +

coord_flip() +

labs(x="Pathway", y="t_value of GSVA score ",

title="GSVA_REACTOME in HNSC_TT_vs_NT ") +

theme_minimal()

library(annotables)

grch38 %>% filter(ensgene %in% )

登录后才可评论.

切换到网页版

longteng24

GSVA

longteng24 (2024-11-22 17:47:45) 评论 (0)

https://bioconductor.org/packages/release/bioc/html/TCGAbiolinks.html

library(TCGAbiolinks)

library(maftools)

getGDCprojects()

query <- GDCquery(

project = "TCGA-HNSC",

data.category = "Transcriptome Profiling", experimental.strategy = 'RNA-Seq',

data.type = "Gene Expression Quantification",

workflow.type = "HTSeq - Counts")

GDCdownload(query)

d <- getResults(query)

HNSC.Rnaseq.SE <- GDCprepare(query)

HNSCMatrix <- assay(HNSC.Rnaseq.SE,"unstranded")

HNSC.RNAseq_CorOutliers <- TCGAanalyze_Preprocessing(HNSC.Rnaseq.SE)

library(TCGAbiolinks)

# normalization of genes

dataNorm <- TCGAanalyze_Normalization(

tabDF = HNSC.RNAseq_CorOutliers,

geneInfo = geneInfoHT

)

# quantile filter of genes

dataFilt <- TCGAanalyze_Filtering(

tabDF = dataNorm,

method = "quantile",

qnt.cut = 0.25

)

# selection of normal samples "NT"

samplesNT <- TCGAquery_SampleTypes(

barcode = colnames(dataFilt),

typesample = c("NT")

)

# selection of tumor samples "TP"

samplesTP <- TCGAquery_SampleTypes(

barcode = colnames(dataFilt),

typesample = c("TP")

)

# Diff.expr.analysis (DEA)

dataDEGs <- TCGAanalyze_DEA(

mat1 = dataFilt[,samplesNT],

mat2 = dataFilt[,samplesTP],

Cond1type = "Normal",

Cond2type = "Tumor",

fdr.cut = 0.01 ,

logFC.cut = 1,

method = "glmLRT"

)

# DEGs table with expression values in normal and tumor samples

dataDEGsFiltLevel <- TCGAanalyze_LevelTab(

FC_FDR_table_mRNA = dataDEGs,

typeCond1 = "Tumor",

typeCond2 = "Normal",

TableCond1 = dataFilt[,samplesTP],

TableCond2 = dataFilt[,samplesNT]

)

https://alexslemonade.github.io/refinebio-examples/03-rnaseq/pathway-analysis_rnaseq_03_gsva.html#24_About_the_dataset_we_are_using_for_this_example

if (!("DESeq2" %in% installed.packages())) {

# Install this package if it isn't installed yet

BiocManager::install("DESeq2", update = FALSE)

}

if (!("GSVA" %in% installed.packages())) {

# Install this package if it isn't installed yet

BiocManager::install("GSVA", update = FALSE)

}

if (!("qusage" %in% installed.packages())) {

# Install this package if it isn't installed yet

BiocManager::install("qusage", update = FALSE)

}

if (!("org.Hs.eg.db" %in% installed.packages())) {

# Install this package if it isn't installed yet

BiocManager::install("org.Hs.eg.db", update = FALSE)

}

if (!("pheatmap" %in% installed.packages())) {

# Install pheatmap

install.packages("pheatmap", update = FALSE)

}

library(DESeq2)

# Attach the `qusage` library

library(qusage)

# Attach the `GSVA` library

library(GSVA)

# Human annotation package we'll use for gene identifier conversion

library(org.Hs.eg.db)

# We will need this so we can use the pipe: %>%

library(magrittr)

library(msigdbr)

Prepare data for DESeq2

dds <- DESeqDataSetFromMatrix(

countData = expression_df, # Our prepped data frame with counts

colData = metadata, # Data frame with annotation for our samples

design = ~1 # Here we are not specifying a model

)

dds_norm <- vst(dds)

vst_df <- assay(dds_norm) %>%

as.data.frame() %>% # Make into a data frame

tibble::rownames_to_column("gene")

names(vst_df)[1] <- "SYMBOL"

hallmark_gene_sets <- msigdbr::msigdbr(

species = "Homo sapiens", # Can change this to what species you need

category = "H" # Only hallmark gene sets

)

head(hallmark_gene_sets)

gs_cat <chr>	gs_subcat <chr>	gs_name <chr>	gene_symbol <chr>	entrez_gene <int>	ensembl_gene <chr>
H		HALLMARK_ADIPOGENESIS	ABCA1	19	ENSG00000165029
H		HALLMARK_ADIPOGENESIS	ABCB8	11194	ENSG00000197150

gs_cat

<chr>

gs_subcat

<chr>

gs_name

<chr>

gene_symbol

<chr>

entrez_gene

<int>

ensembl_gene

<chr>

HALLMARK_ADIPOGENESIS

ABCA1

ENSG00000165029

HALLMARK_ADIPOGENESIS

ABCB8

11194

ENSG00000197150

hallmarks_list <- split(

hallmark_gene_sets$entrez_gene, # The genes we want split into pathways

hallmark_gene_sets$gs_name # The pathways made as the higher levels of the list

)

## $HALLMARK_ADIPOGENESIS

## [1] 19 11194 10449 33 34 35 47 50 51

## [10] 112 149685 9370 79602 79602 56894 9131 204 217

keytypes(org.Hs.eg.db)

## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"

## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"

## [9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"

## [13] "IPI" "MAP" "OMIM" "ONTOLOGY"

## [17] "ONTOLOGYALL" "PATH" "PFAM" "PMID"

## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"

## [25] "UNIGENE" "UNIPROT"

msigdbr_collections()

# A tibble: 23 x 3

gs_cat gs_subcat num_genesets

1 C1 "" 299

2 C2 "CGP" 3384

3 C2 "CP" 29

4 C2 "CP:BIOCARTA" 292

5 C2 "CP:REACTOME" 186

6 C2 "CP:PID" 196

7 C2 "CP:REACTOME" 1615

8 C2 "CP:WIKIPATHWAYS" 664

9 C3 "MIR:MIRDB" 2377

10 C3 "MIR:MIR_Legacy" 221

cgp_gene_sets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CGP")

cgp_list <- split(

cgp_gene_sets$entrez_gene, # The genes we want split into pathways

cgp_gene_sets$gs_name # The pathways made as the higher levels of the list

)

REACTOME_gene_sets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:REACTOME")

REACTOME_list <- split(

REACTOME_gene_sets$entrez_gene, # The genes we want split into pathways

REACTOME_gene_sets$gs_name # The pathways made as the higher levels of the list

)

BIOCARTA_gene_sets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:BIOCARTA")

BIOCARTA_list <- split(

BIOCARTA_gene_sets$entrez_gene, # The genes we want split into pathways

BIOCARTA_gene_sets$gs_name # The pathways made as the higher levels of the list

)

mapped_df <- data.frame(

"entrez_id" = mapIds(

# Replace with annotation package for the organism relevant to your data

org.Hs.eg.db,

keys = vst_df$SYMBOL,

# Replace with the type of gene identifiers in your data

keytype = "SYMBOL",

# Replace with the type of gene identifiers you would like to map to

column = "ENTREZID",

# This will keep only the first mapped value for each Ensembl ID

multiVals = "first"

)

) %>%

# If an Ensembl gene identifier doesn't map to a Entrez gene identifier,

# drop that from the data frame

dplyr::filter(!is.na(entrez_id)) %>%

# Make an `Ensembl` column to store the row names

tibble::rownames_to_column("SYMBOL") %>%

# Now let's join the rest of the expression data

dplyr::inner_join(vst_df, by = c("SYMBOL" = "SYMBOL"))

mapped_df[1:3, 1:5]

entrez_id[1945] <- NULL #remove list

entrez_id1 <- as_tibble(entrez_id)

which(is.na(filtered_mapped_matrix$entrez_id)) # check missing value

SYMBOL entrez_id TCGA.4P.AA8J.01A.11R.A39I.07 TCGA.BA.4074.01A.01R.1436.07 TCGA.BA.4075.01A.01R.1436.07

1 A1BG 1 7.761813 7.828986 7.020303

2 A1CF 29974 5.174723 5.178919 5.186378

3 A2ML1 144568 11.034088 8.121422 6.562782

gene_means <- rowMeans(mapped_df %>% dplyr::select(-SYMBOL, -entrez_id))

mapped_df <- mapped_df %>%

# Add gene_means as a column called gene_means

dplyr::mutate(gene_means) %>%

# Reorder the columns so `gene_means` column is upfront

dplyr::select(SYMBOL, entrez_id, gene_means, dplyr::everything())

filtered_mapped_df <- mapped_df %>%

# Sort so that the highest mean expression values are at the top

dplyr::arrange(dplyr::desc(gene_means)) %>%

# Filter out the duplicated rows using `dplyr::distinct()`

dplyr::distinct(entrez_id, .keep_all = TRUE)

sum(duplicated(filtered_mapped_df$SYMBOL))

filtered_mapped_matrix <- filtered_mapped_df %>%

# GSVA can't the Ensembl IDs so we should drop this column as well as the means

dplyr::select(-SYMBOL, -gene_means) %>%

# We need to store our gene identifiers as row names

tibble::column_to_rownames("entrez_id") %>%

# Now we can convert our object into a matrix

as.matrix()

gsva_results <- gsva(

filtered_mapped_matrix,

hallmarks_list,

method = "gsva",

# Appropriate for our vst transformed data

kcdf = "Gaussian",

# Minimum gene set size

min.sz = 15,

# Maximum gene set size

max.sz = 500,

# Compute Gaussian-distributed scores

mx.diff = TRUE,

# Don't print out the progress bar

verbose = FALSE

)

tgsva_REACTOME_results <- as.data.frame(t(gsva_REACTOME_results))

tgsva_REACTOME_results$ID <- rownames(tgsva_REACTOME_results)

gsva_REACTOME_results1 <- merge(metadata, tgsva_REACTOME_results, by = "ID")

ddd1 <- gsva_REACTOME_results1[, 2:52] %>%

gather(key = variable, value = value, - group) %>%

group_by(group, variable) %>%

summarise(value = list(value)) %>%

spread(group, value) %>%

group_by(variable) %>%

mutate(p_value = t.test(unlist(TT), unlist(NT))$p.value,

t_value = t.test(unlist(TT), unlist(NT))$statistic)

ggplot(ddd1, aes(reorder(variable, t_value), t_value)) +

geom_col(aes(fill=p_value<0.05)) +

coord_flip() +

labs(x="Pathway", y="t_value of GSVA score ",

title="GSVA_REACTOME in HNSC_TT_vs_NT ") +

theme_minimal()

library(annotables)

grch38 %>% filter(ensgene %in% )

GSVA

今日热点

一周热点

longteng24

GSVA