在 R 中从 RNAseq 数据集提取基因名称
Extracting Gene Names from an RNAseq Dataset in R
我遇到了一个自己无法理解也无法解决的问题。我从 GEO 下载了 GSE115262(https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE115262 )。我想从 GSM3172784HC$annotation.gene_name 中提取基因名称,但这样做时得到的是数字而不是基因名称。如何才能得到字符值?如果我运行 str(),得到的是 $ annotation.gene_name : Factor w/ 56233 levels "5_8S_rRNA","5S_rRNA",..: 53514 52750 11836 48738,可以看到显示的是数字。如果我运行 head() 查看 GSM3172784HC$annotation.gene_name,则能看到基因名称,这正是我想要的。我该如何获取这些名称?
#### Load the required libraries
# General Bioconductor packages (getGEOSuppFiles() below comes from GEOquery)
library("GEOquery");
library("Biobase");
# Download the supplementary files for every entry of tmp$V1.
# NOTE(review): `tmp` is not defined in this snippet — presumably a table of
# GEO accessions read earlier; confirm against the full script.
for(i in 1:length(tmp$V1)){
getGEOSuppFiles(tmp$V1[i])
};
######## Healthy Controls GSE115262 ##########
## Author's note: the file may need to be read several times before it loads.
GSM3172784HC<-read.table(gzfile("FilePath.txt.gz"), header=T)
## Build a new data frame from the gene names and the expected counts.
## NOTE: cbind() builds a matrix here, and a factor enters a matrix as its
## integer level codes, so the gene names are replaced by numbers. This is the
## behaviour the question is asking about.
HCData<- cbind(GSM3172784HC$annotation.gene_name, GSM3172784HC$expected_count);
HCData<- as.data.frame(HCData)
# The matrix had no column names, so the data frame columns are V1 and V2;
# V1 holds the (integer-coded) gene names.
# NOTE(review): the str() output reports 56233 distinct gene names for 57955
# rows, so V1 contains repeats; row names must be unique — confirm this
# assignment actually succeeds.
row.names(HCData) <- HCData$V1
# NOTE(review): only one name is supplied for a two-column data frame —
# confirm the intended column names.
colnames(HCData) <- c("HC1")
str(GSM3172784HC)
'data.frame': 57955 obs. of 11 variables:
$ X : int 1 2 3 4 5 6 7 8 9 10 ...
$ annotation.gene_id : Factor w/ 57955 levels "ENSG00000000003",..: 1 2 3 4 5 6 7 8 9 10 ...
$ annotation.gene_biotype: Factor w/ 43 levels "3prime_overlapping_ncRNA",..: 20 20 20 20 20 20 20 20 20 20 ...
$ annotation.gene_name : Factor w/ 56233 levels "5_8S_rRNA","5S_rRNA",..: 53514 52750 11836 48738 5916 13731 7375 14125 14433 24521 ...
$ annotation.source : Factor w/ 4 levels "ensembl","ensembl_havana",..: 2 2 2 2 2 2 2 2 2 2 ...
$ transcript_id.s. : Factor w/ 57955 levels "ENST00000000233,ENST00000415666,ENST00000459680,ENST00000463733,ENST00000467281,ENST00000489673",..: 17666 17669 17397 16695 5799 17850 14301 7 1276 12553 ...
$ length : num 1749 940 1073 1538 2430 ...
$ effective_length : num 1623 814 947 1412 2304 ...
$ expected_count : num 0 0 1 1 0 2 2 0 1 1 ...
$ TPM : num 0 0 0.27 0.18 0 0.23 0.07 0 0.65 0.17 ...
$ FPKM : num 0 0 0.41 0.27 0 0.35 0.11 0 0.98 0.25 ...
head(GSM3172784HC)
X annotation.gene_id annotation.gene_biotype annotation.gene_name
1 1 ENSG00000000003 protein_coding TSPAN6
2 2 ENSG00000000005 protein_coding TNMD
3 3 ENSG00000000419 protein_coding DPM1
4 4 ENSG00000000457 protein_coding SCYL3
5 5 ENSG00000000460 protein_coding C1orf112
6 6 ENSG00000000938 protein_coding FGR
annotation.source
1 ensembl_havana
2 ensembl_havana
3 ensembl_havana
4 ensembl_havana
5 ensembl_havana
6 ensembl_havana
transcript_id.s.
1 ENST00000373020,ENST00000494424,ENST00000496771,ENST00000612152,ENST00000614008
2 ENST00000373031,ENST00000485971
3 ENST00000371582,ENST00000371584,ENST00000371588,ENST00000413082,ENST00000466152,ENST00000494752
4 ENST00000367770,ENST00000367771,ENST00000367772,ENST00000423670,ENST00000470238
5 ENST00000286031,ENST00000359326,ENST00000413811,ENST00000459772,ENST00000466580,ENST00000472795,ENST00000481744,ENST00000496973,ENST00000498289
6 ENST00000374003,ENST00000374004,ENST00000374005,ENST00000399173,ENST00000457296,ENST00000468038,ENST00000475472
length effective_length expected_count TPM FPKM
1 1749.40 1623.17 0 0.00 0.00
2 940.50 814.28 0 0.00 0.00
3 1073.00 946.77 1 0.27 0.41
4 1538.00 1411.77 1 0.18 0.27
5 2430.11 2303.88 0 0.00 0.00
6 2350.00 2223.77 2 0.23 0.35
我们可以将列转换为 character
# dplyr provides mutate_if().
library(dplyr)
# Convert every factor column to character; all other columns are left as-is.
GSM3172784HC <- mutate_if(GSM3172784HC, is.factor, as.character)
或 mutate/across
# Same conversion with across(): where(is.factor) selects the factor columns
# and as.character is applied to each of them.
GSM3172784HC <- mutate(GSM3172784HC, across(where(is.factor), as.character))
在 base R 中,我们可以这样做
# Flag the factor columns. vapply() always returns a logical vector (one
# TRUE/FALSE per column), whereas sapply() returns an empty list for a data
# frame without columns, which cannot be used as an index below.
i1 <- vapply(GSM3172784HC, is.factor, logical(1))
# Replace only the flagged columns with their character labels.
GSM3172784HC[i1] <- lapply(GSM3172784HC[i1], as.character)
注意:对于 R >= 4.0.0,默认情况下 stringsAsFactors = FALSE,因此 read.table() 会把字符串列直接读成 character 而不是 factor
我遇到了一个自己无法理解也无法解决的问题。我从 GEO 下载了 GSE115262(https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE115262 )。我想从 GSM3172784HC$annotation.gene_name 中提取基因名称,但这样做时得到的是数字而不是基因名称。如何才能得到字符值?如果我运行 str(),得到的是 $ annotation.gene_name : Factor w/ 56233 levels "5_8S_rRNA","5S_rRNA",..: 53514 52750 11836 48738,可以看到显示的是数字。如果我运行 head() 查看 GSM3172784HC$annotation.gene_name,则能看到基因名称,这正是我想要的。我该如何获取这些名称?
#### Load the required libraries
# General Bioconductor packages (getGEOSuppFiles() below comes from GEOquery)
library("GEOquery");
library("Biobase");
# Download the supplementary files for every entry of tmp$V1.
# NOTE(review): `tmp` is not defined in this snippet — presumably a table of
# GEO accessions read earlier; confirm against the full script.
for(i in 1:length(tmp$V1)){
getGEOSuppFiles(tmp$V1[i])
};
######## Healthy Controls GSE115262 ##########
## Author's note: the file may need to be read several times before it loads.
GSM3172784HC<-read.table(gzfile("FilePath.txt.gz"), header=T)
## Build a new data frame from the gene names and the expected counts.
## NOTE: cbind() builds a matrix here, and a factor enters a matrix as its
## integer level codes, so the gene names are replaced by numbers. This is the
## behaviour the question is asking about.
HCData<- cbind(GSM3172784HC$annotation.gene_name, GSM3172784HC$expected_count);
HCData<- as.data.frame(HCData)
# The matrix had no column names, so the data frame columns are V1 and V2;
# V1 holds the (integer-coded) gene names.
# NOTE(review): the str() output reports 56233 distinct gene names for 57955
# rows, so V1 contains repeats; row names must be unique — confirm this
# assignment actually succeeds.
row.names(HCData) <- HCData$V1
# NOTE(review): only one name is supplied for a two-column data frame —
# confirm the intended column names.
colnames(HCData) <- c("HC1")
str(GSM3172784HC)
'data.frame': 57955 obs. of 11 variables:
$ X : int 1 2 3 4 5 6 7 8 9 10 ...
$ annotation.gene_id : Factor w/ 57955 levels "ENSG00000000003",..: 1 2 3 4 5 6 7 8 9 10 ...
$ annotation.gene_biotype: Factor w/ 43 levels "3prime_overlapping_ncRNA",..: 20 20 20 20 20 20 20 20 20 20 ...
$ annotation.gene_name : Factor w/ 56233 levels "5_8S_rRNA","5S_rRNA",..: 53514 52750 11836 48738 5916 13731 7375 14125 14433 24521 ...
$ annotation.source : Factor w/ 4 levels "ensembl","ensembl_havana",..: 2 2 2 2 2 2 2 2 2 2 ...
$ transcript_id.s. : Factor w/ 57955 levels "ENST00000000233,ENST00000415666,ENST00000459680,ENST00000463733,ENST00000467281,ENST00000489673",..: 17666 17669 17397 16695 5799 17850 14301 7 1276 12553 ...
$ length : num 1749 940 1073 1538 2430 ...
$ effective_length : num 1623 814 947 1412 2304 ...
$ expected_count : num 0 0 1 1 0 2 2 0 1 1 ...
$ TPM : num 0 0 0.27 0.18 0 0.23 0.07 0 0.65 0.17 ...
$ FPKM : num 0 0 0.41 0.27 0 0.35 0.11 0 0.98 0.25 ...
head(GSM3172784HC)
X annotation.gene_id annotation.gene_biotype annotation.gene_name
1 1 ENSG00000000003 protein_coding TSPAN6
2 2 ENSG00000000005 protein_coding TNMD
3 3 ENSG00000000419 protein_coding DPM1
4 4 ENSG00000000457 protein_coding SCYL3
5 5 ENSG00000000460 protein_coding C1orf112
6 6 ENSG00000000938 protein_coding FGR
annotation.source
1 ensembl_havana
2 ensembl_havana
3 ensembl_havana
4 ensembl_havana
5 ensembl_havana
6 ensembl_havana
transcript_id.s.
1 ENST00000373020,ENST00000494424,ENST00000496771,ENST00000612152,ENST00000614008
2 ENST00000373031,ENST00000485971
3 ENST00000371582,ENST00000371584,ENST00000371588,ENST00000413082,ENST00000466152,ENST00000494752
4 ENST00000367770,ENST00000367771,ENST00000367772,ENST00000423670,ENST00000470238
5 ENST00000286031,ENST00000359326,ENST00000413811,ENST00000459772,ENST00000466580,ENST00000472795,ENST00000481744,ENST00000496973,ENST00000498289
6 ENST00000374003,ENST00000374004,ENST00000374005,ENST00000399173,ENST00000457296,ENST00000468038,ENST00000475472
length effective_length expected_count TPM FPKM
1 1749.40 1623.17 0 0.00 0.00
2 940.50 814.28 0 0.00 0.00
3 1073.00 946.77 1 0.27 0.41
4 1538.00 1411.77 1 0.18 0.27
5 2430.11 2303.88 0 0.00 0.00
6 2350.00 2223.77 2 0.23 0.35
我们可以将列转换为 character
# dplyr provides mutate_if().
library(dplyr)
# Convert every factor column to character; all other columns are left as-is.
GSM3172784HC <- mutate_if(GSM3172784HC, is.factor, as.character)
或 mutate/across
# Same conversion with across(): where(is.factor) selects the factor columns
# and as.character is applied to each of them.
GSM3172784HC <- mutate(GSM3172784HC, across(where(is.factor), as.character))
在 base R 中,我们可以这样做
# Flag the factor columns. vapply() always returns a logical vector (one
# TRUE/FALSE per column), whereas sapply() returns an empty list for a data
# frame without columns, which cannot be used as an index below.
i1 <- vapply(GSM3172784HC, is.factor, logical(1))
# Replace only the flagged columns with their character labels.
GSM3172784HC[i1] <- lapply(GSM3172784HC[i1], as.character)
注意:对于 R >= 4.0.0,默认情况下 stringsAsFactors = FALSE,因此 read.table() 会把字符串列直接读成 character 而不是 factor