在 R 中使用 melt 函数整理并映射数据框以绘制折线图
Arranging and mapping dataframe in R using melt function for line plot
我有一个数据框,包含 4 个时间点(行)和 18 个基因(列),另外还有一行保存着每个基因对应的 ID,在用 ggplot2 绘制折线图时需要用到它。我导入数据后,用 melt() 函数把数据从宽格式转换为长格式,结果发现 ID 被夹在了长格式数据框的中间(混在 value 列里)。我希望把这些 ID 单独放到最后一列,以便用 ggplot2 绘制折线图。请帮帮我。
谢谢,
图菲克
数据导入
# Load the wide-format table from the working directory; text columns stay
# character vectors instead of being converted to factors.
B1_Test <- read.csv("./B1_Test.csv", stringsAsFactors = FALSE)
# Print a copy-pasteable representation of the first rows.
dput(head(B1_Test))
structure(list(Timepoints = c("1", "2", "3", "5", "ID"), Gene_A = c("-2.05066",
"-0.657222", "-1.49477", "-1.80191", "A1.1"), Gene_B = c("-8.35787",
"-9.52402", "-10.6604", "-10.516", "A1.2"), Gene_C = c("-2.06287",
"-0.846725", "-1.63796", "-1.31922", "A1.3"), Gene_D = c("-3.83545",
"-1.19723", "-1.53115", "-3.25903", "A1.4"), Gene_E = c("-6.59039",
"-5.98822", "-6.23785", "-5.00584", "A1.5"), Gene_F = c("-5.02469",
"-4.41637", "-5.46219", "-3.97594", "A1.1"), Gene_G = c("-7.75424",
"-8.17158", "-7.90569", "-8.01352", "A1.6"), Gene_H = c("-4.65703",
"-3.42328", "-4.08867", "-3.76642", "A1.2"), Gene_I = c("-11.7749",
"-11.649", "-11.3751", "-10.3728", "A1.3"), Gene_K = c("-4.08981",
"-3.09873", "-3.95986", "-3.97249", "A1.4"), Gene_L = c(NA, "-19.7923",
NA, "-15.1216", "A1.5"), Gene_M = c("-4.11469", "-3.19647", "-3.99615",
"-3.06183", "A1.6"), Gene_N = c("-6.53017", "-6.16685", "-6.865",
"-6.44303", "A1.9"), Gene_O = c("-4.58034", "-3.45153", "-4.86697",
"-5.25414", "A2.2"), Gene_P = c("-3.45614", "-2.72413", "-2.75492",
"-2.76479", "A2.2"), Gene_R = c("-5.24809", "-4.15782", "-5.28192",
"-5.72024", "A2.6"), Gene_S = c("-7.73098", "-7.20226", "-8.04388",
"-7.68191", "A2.6"), Gene_T = c("-5.09079", "-4.52039", "-4.75427",
"-5.4321", "A1.9")), row.names = c(NA, 5L), class = "data.frame")
利用 melt() 将数据从宽(wide)格式转换为长(long)格式
# Attach reshape2, which provides melt(). library() stops with an error when
# the package is missing, whereas require() only returns FALSE and lets the
# script fail later at the melt() call.
library(reshape2)
# Reshape wide -> long: one row per (Timepoints, gene column) pair, with the
# gene column name stored in `Genes` and the cell content in `value`.
# The "ID" row is still part of B1_Test here, so its labels (e.g. "A1.1") end
# up interleaved with the measurements in the character column `value`.
B1_Test_melt <- melt(B1_Test, id.vars = "Timepoints", variable.name = "Genes")
# Print a copy-pasteable representation of the melted data frame.
dput(B1_Test_melt)
structure(list(Timepoints = c("1", "2", "3", "5", "ID", "1",
"2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3",
"5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID",
"1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2",
"3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5",
"ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1",
"2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3",
"5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID"
), Genes = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 9L, 9L,
9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L,
12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L,
14L, 14L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L, 17L,
17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L), .Label = c("Gene_A",
"Gene_B", "Gene_C", "Gene_D", "Gene_E", "Gene_F", "Gene_G", "Gene_H",
"Gene_I", "Gene_K", "Gene_L", "Gene_M", "Gene_N", "Gene_O", "Gene_P",
"Gene_R", "Gene_S", "Gene_T"), class = "factor"), value = c("-2.05066",
"-0.657222", "-1.49477", "-1.80191", "A1.1", "-8.35787", "-9.52402",
"-10.6604", "-10.516", "A1.2", "-2.06287", "-0.846725", "-1.63796",
"-1.31922", "A1.3", "-3.83545", "-1.19723", "-1.53115", "-3.25903",
"A1.4", "-6.59039", "-5.98822", "-6.23785", "-5.00584", "A1.5",
"-5.02469", "-4.41637", "-5.46219", "-3.97594", "A1.1", "-7.75424",
"-8.17158", "-7.90569", "-8.01352", "A1.6", "-4.65703", "-3.42328",
"-4.08867", "-3.76642", "A1.2", "-11.7749", "-11.649", "-11.3751",
"-10.3728", "A1.3", "-4.08981", "-3.09873", "-3.95986", "-3.97249",
"A1.4", NA, "-19.7923", NA, "-15.1216", "A1.5", "-4.11469", "-3.19647",
"-3.99615", "-3.06183", "A1.6", "-6.53017", "-6.16685", "-6.865",
"-6.44303", "A1.9", "-4.58034", "-3.45153", "-4.86697", "-5.25414",
"A2.2", "-3.45614", "-2.72413", "-2.75492", "-2.76479", "A2.2",
"-5.24809", "-4.15782", "-5.28192", "-5.72024", "A2.6", "-7.73098",
"-7.20226", "-8.04388", "-7.68191", "A2.6", "-5.09079", "-4.52039",
"-4.75427", "-5.4321", "A1.9")), row.names = c(NA, -90L), class = "data.frame")
预期输出
dput((B1_Test_v1))
structure(list(Timepoints = c(1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L,
1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L
), Genes = c("Gene_A", "Gene_A", "Gene_A", "Gene_A", "Gene_B",
"Gene_B", "Gene_B", "Gene_B", "Gene_C", "Gene_C", "Gene_C", "Gene_C",
"Gene_D", "Gene_D", "Gene_D", "Gene_D", "Gene_E", "Gene_E", "Gene_E",
"Gene_E", "Gene_F", "Gene_F", "Gene_F", "Gene_F"), value = c(-2.05066,
-0.657222, -1.49477, -1.80191, -8.35787, -9.52402, -10.6604,
-10.516, -2.06287, -0.846725, -1.63796, -1.31922, -3.83545, -1.19723,
-1.53115, -3.25903, -6.59039, -5.98822, -6.23785, -5.00584, -5.02469,
-4.41637, -5.46219, -3.97594), ID = c("A1.1", "A1.1", "A1.1",
"A1.1", "A1.2", "A1.2", "A1.2", "A1.2", "A1.3", "A1.3", "A1.3",
"A1.3", "A1.4", "A1.4", "A1.4", "A1.4", "A1.5", "A1.5", "A1.5",
"A1.5", "A1.1", "A1.1", "A1.1", "A1.1")), class = "data.frame", row.names = c(NA,
-24L))
源文件中的数据格式有问题:数据里的 `ID` 行使所有本应是数值的列都被读成了字符串。你首先应该说服提供该数据文件的人给出一份合理的数据集(在我看来,同一列中混用不同类型的数据是不合理的)。
如果做不到,就把 `ID` 行单独取出并重塑,然后把它合并回其余数据重塑后的结果。
# Melt only the "ID" row so that each gene column becomes one row whose `ID`
# column carries that gene's label, then keep just the Genes/ID lookup columns.
B1_IDs <- melt(
  B1_Test[B1_Test$Timepoints == "ID", ],
  id.vars = "Timepoints",
  variable.name = "Genes",
  value.name = "ID"
)[, c("Genes", "ID")]
head(B1_IDs)
# Genes ID
# 1 Gene_A A1.1
# 2 Gene_B A1.2
# 3 Gene_C A1.3
# 4 Gene_D A1.4
# 5 Gene_E A1.5
# 6 Gene_F A1.1
现在对非 `ID` 行进行重塑:
# Melt every row except the "ID" row: one output row per
# (Timepoints, gene column) pair, gene name in `Genes`, measurement in `value`.
B1_Test_melt <- melt(
  B1_Test[B1_Test$Timepoints != "ID", ],
  id.vars = "Timepoints",
  variable.name = "Genes"
)
B1_Test_melt
# *** output flushed ***
head(B1_Test_melt)
# Timepoints Genes value
# 1 1 Gene_A -2.05066
# 2 2 Gene_A -0.657222
# 3 3 Gene_A -1.49477
# 4 5 Gene_A -1.80191
# 5 1 Gene_B -8.35787
# 6 2 Gene_B -9.52402
并将两者合并:
# Attach each gene's ID to its measurements by joining on `Genes`;
# all = TRUE keeps rows from either table even when the other has no match.
B1_merged <- merge(
  x = B1_Test_melt,
  y = B1_IDs,
  by = "Genes",
  all = TRUE
)
head(B1_merged)
# Genes Timepoints value ID
# 1 Gene_A 1 -2.05066 A1.1
# 2 Gene_A 2 -0.657222 A1.1
# 3 Gene_A 3 -1.49477 A1.1
# 4 Gene_A 5 -1.80191 A1.1
# 5 Gene_B 1 -8.35787 A1.2
# 6 Gene_B 2 -9.52402 A1.2
(除非我遗漏了什么,否则你可能还需要执行 `B1_merged$value <- as.numeric(B1_merged$value)`,把 value 列转换回数值。另外请注意 `Genes` 列是 factor,如有需要可以用 `as.character` 转换为字符型。)
首先要做的是将 ID 与数据分开:
# Pull out the row labelled "ID" (selected by label instead of a hard-coded
# row number; %in% never yields NA), drop the Timepoints column, and transpose
# so that the gene names become row names. The single resulting column is
# named after the original row name of the ID row (row 5 -> "X5").
Gene_ID <- data.frame(t(B1_Test[B1_Test$Timepoints %in% "ID", -1]))
> Gene_ID
X5
Gene_A A1.1
Gene_B A1.2
Gene_C A1.3
Gene_D A1.4
Gene_E A1.5
snip....
然后熔化非ID行:
> Gene_vals <- melt( B1_Test[-5,], id.vars = 'Timepoints', variable.name = 'Genes')
> head(Gene_vals)
Timepoints Genes value
1 1 Gene_A -2.05066
2 2 Gene_A -0.657222
3 3 Gene_A -1.49477
4 5 Gene_A -1.80191
5 1 Gene_B -8.35787
6 2 Gene_B -9.52402
> str(Gene_vals)
'data.frame': 72 obs. of 3 variables:
$ Timepoints: chr "1" "2" "3" "5" ...
$ Genes : Factor w/ 18 levels "Gene_A","Gene_B",..: 1 1 1 1 2 2 2 2 3 3 ...
$ value : chr "-2.05066" "-0.657222" "-1.49477" "-1.80191" ...
> Gene_vals$value <- as.numeric(Gene_vals$value)
> str(Gene_vals)
'data.frame': 72 obs. of 3 variables:
$ Timepoints: chr "1" "2" "3" "5" ...
$ Genes : Factor w/ 18 levels "Gene_A","Gene_B",..: 1 1 1 1 2 2 2 2 3 3 ...
$ value : num -2.051 -0.657 -1.495 -1.802 -8.358 ...
并合并它们:
> final <- merge(Gene_vals, Gene_ID, by.x="Genes", by.y="row.names")
> head(final)
Genes Timepoints value X5
1 Gene_A 1 -2.050660 A1.1
2 Gene_A 2 -0.657222 A1.1
3 Gene_A 3 -1.494770 A1.1
4 Gene_A 5 -1.801910 A1.1
5 Gene_B 1 -8.357870 A1.2
6 Gene_B 2 -9.524020 A1.2
基础 R 解决方案:
# `df` is the wide input data frame: a Timepoints column plus one column per
# gene, whose last row holds the gene IDs instead of measurements.

# Create a data frame comprised of the ID and gene vectors: transpose the
# last row, then drop the first entry (the Timepoints cell) so that only the
# gene name / ID pairs remain.
ID <- data.frame(t(df[nrow(df), ]), stringsAsFactors = FALSE)
ID <- data.frame(
  cbind(Genes = row.names(ID)[-1], ID = ID[-1, ]),
  stringsAsFactors = FALSE,
  row.names = NULL
)

# Melt the original data frame (less the ID row) into long format.
# seq_len() yields an empty selection when only the ID row is present,
# whereas 1:(n - 1) would count backwards and pick up the ID row.
gene_cols <- names(df)[names(df) != "Timepoints"]
df_long <- data.frame(
  reshape(
    df[seq_len(nrow(df) - 1), ],
    direction = "long",
    varying = gene_cols,
    v.names = "value",
    times = gene_cols,
    timevar = "Genes"
  ),
  row.names = NULL
)

# The ID row forced every gene column to character; with that row removed
# the measurements can be converted back to numbers (NA stays NA).
df_long$value <- as.numeric(df_long$value)

# Left join the data frame holding the IDs onto the long data frame:
df_long <- merge(df_long, ID, by = "Genes", all.x = TRUE)
数据:
# Sample data in wide format: a Timepoints column plus one character column
# per gene (Gene_A ... Gene_T). Rows 1-4 hold the measurements, stored as
# strings, for time points "1", "2", "3" and "5"; row 5 (Timepoints == "ID")
# holds each gene's ID label. Gene_L has missing (NA) measurements.
df <-
structure(
list(
Timepoints = c("1", "2", "3", "5", "ID"),
Gene_A = c("-2.05066",
"-0.657222", "-1.49477", "-1.80191", "A1.1"),
Gene_B = c("-8.35787",
"-9.52402", "-10.6604", "-10.516", "A1.2"),
Gene_C = c("-2.06287",
"-0.846725", "-1.63796", "-1.31922", "A1.3"),
Gene_D = c("-3.83545",
"-1.19723", "-1.53115", "-3.25903", "A1.4"),
Gene_E = c("-6.59039",
"-5.98822", "-6.23785", "-5.00584", "A1.5"),
Gene_F = c("-5.02469",
"-4.41637", "-5.46219", "-3.97594", "A1.1"),
Gene_G = c("-7.75424",
"-8.17158", "-7.90569", "-8.01352", "A1.6"),
Gene_H = c("-4.65703",
"-3.42328", "-4.08867", "-3.76642", "A1.2"),
Gene_I = c("-11.7749",
"-11.649", "-11.3751", "-10.3728", "A1.3"),
Gene_K = c("-4.08981",
"-3.09873", "-3.95986", "-3.97249", "A1.4"),
Gene_L = c(NA, "-19.7923",
NA, "-15.1216", "A1.5"),
Gene_M = c("-4.11469", "-3.19647", "-3.99615",
"-3.06183", "A1.6"),
Gene_N = c("-6.53017", "-6.16685", "-6.865",
"-6.44303", "A1.9"),
Gene_O = c("-4.58034", "-3.45153", "-4.86697",
"-5.25414", "A2.2"),
Gene_P = c("-3.45614", "-2.72413", "-2.75492",
"-2.76479", "A2.2"),
Gene_R = c("-5.24809", "-4.15782", "-5.28192",
"-5.72024", "A2.6"),
Gene_S = c("-7.73098", "-7.20226", "-8.04388",
"-7.68191", "A2.6"),
Gene_T = c("-5.09079", "-4.52039", "-4.75427",
"-5.4321", "A1.9")
),
row.names = c(NA, 5L),
class = "data.frame"
)
我有一个数据框,包含 4 个时间点(行)和 18 个基因(列),另外还有一行保存着每个基因对应的 ID,在用 ggplot2 绘制折线图时需要用到它。我导入数据后,用 melt() 函数把数据从宽格式转换为长格式,结果发现 ID 被夹在了长格式数据框的中间(混在 value 列里)。我希望把这些 ID 单独放到最后一列,以便用 ggplot2 绘制折线图。请帮帮我。
谢谢,
图菲克
数据导入
# Load the wide-format table from the working directory; text columns stay
# character vectors instead of being converted to factors.
B1_Test <- read.csv("./B1_Test.csv", stringsAsFactors = FALSE)
# Print a copy-pasteable representation of the first rows.
dput(head(B1_Test))
structure(list(Timepoints = c("1", "2", "3", "5", "ID"), Gene_A = c("-2.05066",
"-0.657222", "-1.49477", "-1.80191", "A1.1"), Gene_B = c("-8.35787",
"-9.52402", "-10.6604", "-10.516", "A1.2"), Gene_C = c("-2.06287",
"-0.846725", "-1.63796", "-1.31922", "A1.3"), Gene_D = c("-3.83545",
"-1.19723", "-1.53115", "-3.25903", "A1.4"), Gene_E = c("-6.59039",
"-5.98822", "-6.23785", "-5.00584", "A1.5"), Gene_F = c("-5.02469",
"-4.41637", "-5.46219", "-3.97594", "A1.1"), Gene_G = c("-7.75424",
"-8.17158", "-7.90569", "-8.01352", "A1.6"), Gene_H = c("-4.65703",
"-3.42328", "-4.08867", "-3.76642", "A1.2"), Gene_I = c("-11.7749",
"-11.649", "-11.3751", "-10.3728", "A1.3"), Gene_K = c("-4.08981",
"-3.09873", "-3.95986", "-3.97249", "A1.4"), Gene_L = c(NA, "-19.7923",
NA, "-15.1216", "A1.5"), Gene_M = c("-4.11469", "-3.19647", "-3.99615",
"-3.06183", "A1.6"), Gene_N = c("-6.53017", "-6.16685", "-6.865",
"-6.44303", "A1.9"), Gene_O = c("-4.58034", "-3.45153", "-4.86697",
"-5.25414", "A2.2"), Gene_P = c("-3.45614", "-2.72413", "-2.75492",
"-2.76479", "A2.2"), Gene_R = c("-5.24809", "-4.15782", "-5.28192",
"-5.72024", "A2.6"), Gene_S = c("-7.73098", "-7.20226", "-8.04388",
"-7.68191", "A2.6"), Gene_T = c("-5.09079", "-4.52039", "-4.75427",
"-5.4321", "A1.9")), row.names = c(NA, 5L), class = "data.frame")
利用 melt() 将数据从宽(wide)格式转换为长(long)格式
# Attach reshape2, which provides melt(). library() stops with an error when
# the package is missing, whereas require() only returns FALSE and lets the
# script fail later at the melt() call.
library(reshape2)
# Reshape wide -> long: one row per (Timepoints, gene column) pair, with the
# gene column name stored in `Genes` and the cell content in `value`.
# The "ID" row is still part of B1_Test here, so its labels (e.g. "A1.1") end
# up interleaved with the measurements in the character column `value`.
B1_Test_melt <- melt(B1_Test, id.vars = "Timepoints", variable.name = "Genes")
# Print a copy-pasteable representation of the melted data frame.
dput(B1_Test_melt)
structure(list(Timepoints = c("1", "2", "3", "5", "ID", "1",
"2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3",
"5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID",
"1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2",
"3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5",
"ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1",
"2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3",
"5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID"
), Genes = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 9L, 9L,
9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L,
12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L,
14L, 14L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L, 17L,
17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L), .Label = c("Gene_A",
"Gene_B", "Gene_C", "Gene_D", "Gene_E", "Gene_F", "Gene_G", "Gene_H",
"Gene_I", "Gene_K", "Gene_L", "Gene_M", "Gene_N", "Gene_O", "Gene_P",
"Gene_R", "Gene_S", "Gene_T"), class = "factor"), value = c("-2.05066",
"-0.657222", "-1.49477", "-1.80191", "A1.1", "-8.35787", "-9.52402",
"-10.6604", "-10.516", "A1.2", "-2.06287", "-0.846725", "-1.63796",
"-1.31922", "A1.3", "-3.83545", "-1.19723", "-1.53115", "-3.25903",
"A1.4", "-6.59039", "-5.98822", "-6.23785", "-5.00584", "A1.5",
"-5.02469", "-4.41637", "-5.46219", "-3.97594", "A1.1", "-7.75424",
"-8.17158", "-7.90569", "-8.01352", "A1.6", "-4.65703", "-3.42328",
"-4.08867", "-3.76642", "A1.2", "-11.7749", "-11.649", "-11.3751",
"-10.3728", "A1.3", "-4.08981", "-3.09873", "-3.95986", "-3.97249",
"A1.4", NA, "-19.7923", NA, "-15.1216", "A1.5", "-4.11469", "-3.19647",
"-3.99615", "-3.06183", "A1.6", "-6.53017", "-6.16685", "-6.865",
"-6.44303", "A1.9", "-4.58034", "-3.45153", "-4.86697", "-5.25414",
"A2.2", "-3.45614", "-2.72413", "-2.75492", "-2.76479", "A2.2",
"-5.24809", "-4.15782", "-5.28192", "-5.72024", "A2.6", "-7.73098",
"-7.20226", "-8.04388", "-7.68191", "A2.6", "-5.09079", "-4.52039",
"-4.75427", "-5.4321", "A1.9")), row.names = c(NA, -90L), class = "data.frame")
预期输出
dput((B1_Test_v1))
structure(list(Timepoints = c(1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L,
1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L
), Genes = c("Gene_A", "Gene_A", "Gene_A", "Gene_A", "Gene_B",
"Gene_B", "Gene_B", "Gene_B", "Gene_C", "Gene_C", "Gene_C", "Gene_C",
"Gene_D", "Gene_D", "Gene_D", "Gene_D", "Gene_E", "Gene_E", "Gene_E",
"Gene_E", "Gene_F", "Gene_F", "Gene_F", "Gene_F"), value = c(-2.05066,
-0.657222, -1.49477, -1.80191, -8.35787, -9.52402, -10.6604,
-10.516, -2.06287, -0.846725, -1.63796, -1.31922, -3.83545, -1.19723,
-1.53115, -3.25903, -6.59039, -5.98822, -6.23785, -5.00584, -5.02469,
-4.41637, -5.46219, -3.97594), ID = c("A1.1", "A1.1", "A1.1",
"A1.1", "A1.2", "A1.2", "A1.2", "A1.2", "A1.3", "A1.3", "A1.3",
"A1.3", "A1.4", "A1.4", "A1.4", "A1.4", "A1.5", "A1.5", "A1.5",
"A1.5", "A1.1", "A1.1", "A1.1", "A1.1")), class = "data.frame", row.names = c(NA,
-24L))
源文件中的数据格式有问题:数据里的 `ID` 行使所有本应是数值的列都被读成了字符串。你首先应该说服提供该数据文件的人给出一份合理的数据集(在我看来,同一列中混用不同类型的数据是不合理的)。
如果做不到,就把 `ID` 行单独取出并重塑,然后把它合并回其余数据重塑后的结果。
# Melt only the "ID" row so that each gene column becomes one row whose `ID`
# column carries that gene's label, then keep just the Genes/ID lookup columns.
B1_IDs <- melt(
  B1_Test[B1_Test$Timepoints == "ID", ],
  id.vars = "Timepoints",
  variable.name = "Genes",
  value.name = "ID"
)[, c("Genes", "ID")]
head(B1_IDs)
# Genes ID
# 1 Gene_A A1.1
# 2 Gene_B A1.2
# 3 Gene_C A1.3
# 4 Gene_D A1.4
# 5 Gene_E A1.5
# 6 Gene_F A1.1
现在对非 `ID` 行进行重塑:
# Melt every row except the "ID" row: one output row per
# (Timepoints, gene column) pair, gene name in `Genes`, measurement in `value`.
B1_Test_melt <- melt(
  B1_Test[B1_Test$Timepoints != "ID", ],
  id.vars = "Timepoints",
  variable.name = "Genes"
)
B1_Test_melt
# *** output flushed ***
head(B1_Test_melt)
# Timepoints Genes value
# 1 1 Gene_A -2.05066
# 2 2 Gene_A -0.657222
# 3 3 Gene_A -1.49477
# 4 5 Gene_A -1.80191
# 5 1 Gene_B -8.35787
# 6 2 Gene_B -9.52402
并将两者合并:
# Attach each gene's ID to its measurements by joining on `Genes`;
# all = TRUE keeps rows from either table even when the other has no match.
B1_merged <- merge(
  x = B1_Test_melt,
  y = B1_IDs,
  by = "Genes",
  all = TRUE
)
head(B1_merged)
# Genes Timepoints value ID
# 1 Gene_A 1 -2.05066 A1.1
# 2 Gene_A 2 -0.657222 A1.1
# 3 Gene_A 3 -1.49477 A1.1
# 4 Gene_A 5 -1.80191 A1.1
# 5 Gene_B 1 -8.35787 A1.2
# 6 Gene_B 2 -9.52402 A1.2
(除非我遗漏了什么,否则你可能还需要执行 `B1_merged$value <- as.numeric(B1_merged$value)`,把 value 列转换回数值。另外请注意 `Genes` 列是 factor,如有需要可以用 `as.character` 转换为字符型。)
首先要做的是将 ID 与数据分开:
# Pull out the row labelled "ID" (selected by label instead of a hard-coded
# row number; %in% never yields NA), drop the Timepoints column, and transpose
# so that the gene names become row names. The single resulting column is
# named after the original row name of the ID row (row 5 -> "X5").
Gene_ID <- data.frame(t(B1_Test[B1_Test$Timepoints %in% "ID", -1]))
> Gene_ID
X5
Gene_A A1.1
Gene_B A1.2
Gene_C A1.3
Gene_D A1.4
Gene_E A1.5
snip....
然后熔化非ID行:
> Gene_vals <- melt( B1_Test[-5,], id.vars = 'Timepoints', variable.name = 'Genes')
> head(Gene_vals)
Timepoints Genes value
1 1 Gene_A -2.05066
2 2 Gene_A -0.657222
3 3 Gene_A -1.49477
4 5 Gene_A -1.80191
5 1 Gene_B -8.35787
6 2 Gene_B -9.52402
> str(Gene_vals)
'data.frame': 72 obs. of 3 variables:
$ Timepoints: chr "1" "2" "3" "5" ...
$ Genes : Factor w/ 18 levels "Gene_A","Gene_B",..: 1 1 1 1 2 2 2 2 3 3 ...
$ value : chr "-2.05066" "-0.657222" "-1.49477" "-1.80191" ...
> Gene_vals$value <- as.numeric(Gene_vals$value)
> str(Gene_vals)
'data.frame': 72 obs. of 3 variables:
$ Timepoints: chr "1" "2" "3" "5" ...
$ Genes : Factor w/ 18 levels "Gene_A","Gene_B",..: 1 1 1 1 2 2 2 2 3 3 ...
$ value : num -2.051 -0.657 -1.495 -1.802 -8.358 ...
并合并它们:
> final <- merge(Gene_vals, Gene_ID, by.x="Genes", by.y="row.names")
> head(final)
Genes Timepoints value X5
1 Gene_A 1 -2.050660 A1.1
2 Gene_A 2 -0.657222 A1.1
3 Gene_A 3 -1.494770 A1.1
4 Gene_A 5 -1.801910 A1.1
5 Gene_B 1 -8.357870 A1.2
6 Gene_B 2 -9.524020 A1.2
基础 R 解决方案:
# `df` is the wide input data frame: a Timepoints column plus one column per
# gene, whose last row holds the gene IDs instead of measurements.

# Create a data frame comprised of the ID and gene vectors: transpose the
# last row, then drop the first entry (the Timepoints cell) so that only the
# gene name / ID pairs remain.
ID <- data.frame(t(df[nrow(df), ]), stringsAsFactors = FALSE)
ID <- data.frame(
  cbind(Genes = row.names(ID)[-1], ID = ID[-1, ]),
  stringsAsFactors = FALSE,
  row.names = NULL
)

# Melt the original data frame (less the ID row) into long format.
# seq_len() yields an empty selection when only the ID row is present,
# whereas 1:(n - 1) would count backwards and pick up the ID row.
gene_cols <- names(df)[names(df) != "Timepoints"]
df_long <- data.frame(
  reshape(
    df[seq_len(nrow(df) - 1), ],
    direction = "long",
    varying = gene_cols,
    v.names = "value",
    times = gene_cols,
    timevar = "Genes"
  ),
  row.names = NULL
)

# The ID row forced every gene column to character; with that row removed
# the measurements can be converted back to numbers (NA stays NA).
df_long$value <- as.numeric(df_long$value)

# Left join the data frame holding the IDs onto the long data frame:
df_long <- merge(df_long, ID, by = "Genes", all.x = TRUE)
数据:
# Sample data in wide format: a Timepoints column plus one character column
# per gene (Gene_A ... Gene_T). Rows 1-4 hold the measurements, stored as
# strings, for time points "1", "2", "3" and "5"; row 5 (Timepoints == "ID")
# holds each gene's ID label. Gene_L has missing (NA) measurements.
df <-
structure(
list(
Timepoints = c("1", "2", "3", "5", "ID"),
Gene_A = c("-2.05066",
"-0.657222", "-1.49477", "-1.80191", "A1.1"),
Gene_B = c("-8.35787",
"-9.52402", "-10.6604", "-10.516", "A1.2"),
Gene_C = c("-2.06287",
"-0.846725", "-1.63796", "-1.31922", "A1.3"),
Gene_D = c("-3.83545",
"-1.19723", "-1.53115", "-3.25903", "A1.4"),
Gene_E = c("-6.59039",
"-5.98822", "-6.23785", "-5.00584", "A1.5"),
Gene_F = c("-5.02469",
"-4.41637", "-5.46219", "-3.97594", "A1.1"),
Gene_G = c("-7.75424",
"-8.17158", "-7.90569", "-8.01352", "A1.6"),
Gene_H = c("-4.65703",
"-3.42328", "-4.08867", "-3.76642", "A1.2"),
Gene_I = c("-11.7749",
"-11.649", "-11.3751", "-10.3728", "A1.3"),
Gene_K = c("-4.08981",
"-3.09873", "-3.95986", "-3.97249", "A1.4"),
Gene_L = c(NA, "-19.7923",
NA, "-15.1216", "A1.5"),
Gene_M = c("-4.11469", "-3.19647", "-3.99615",
"-3.06183", "A1.6"),
Gene_N = c("-6.53017", "-6.16685", "-6.865",
"-6.44303", "A1.9"),
Gene_O = c("-4.58034", "-3.45153", "-4.86697",
"-5.25414", "A2.2"),
Gene_P = c("-3.45614", "-2.72413", "-2.75492",
"-2.76479", "A2.2"),
Gene_R = c("-5.24809", "-4.15782", "-5.28192",
"-5.72024", "A2.6"),
Gene_S = c("-7.73098", "-7.20226", "-8.04388",
"-7.68191", "A2.6"),
Gene_T = c("-5.09079", "-4.52039", "-4.75427",
"-5.4321", "A1.9")
),
row.names = c(NA, 5L),
class = "data.frame"
)