从列中提取 Chr 编号
Extract the Chr number from the column
我有一个数据框,其中有一列包含染色体详细信息(1 到 22)。我想创建另一个只有 Chr 数字的列
没有可重现的例子很难回答。使用 stringr 包和正则表达式(regex)可以实现你想要的效果,但你需要了解数据中所有可能出现的格式。如果你需要的部分和多余的信息之间只用下划线分隔,你可以使用 str_split 并以 "_" 作为 pattern 参数来解决你的问题。
请参考 https://stackoverflow.com/help/how-to-ask
library(stringr)

# Example data: chromosome labels, some carrying an "_<contig>_alt" suffix.
df <- data.frame(
  chromosome = c(
    "chr6_GL000253v2_alt", "chr6_GL000254v2_alt",
    "chr6_GL000255v2_alt", "chr6_GL000256v2_alt", "chr4", "chr11",
    "chr8", "chr12", "chr2", "chr12", "chr4", "chr6", "chr15", "chr4",
    "chr2"
  )
)

# Split each label on "_" and keep only the first piece. The result still
# carries the "chr" prefix (e.g. "chr6"). TRUE is spelled out because `T` is
# an ordinary variable that can be reassigned.
df$chromosome_fixed <- str_split(df$chromosome, "_", simplify = TRUE)[, 1]
请参见下面使用 data.table 软件包的解决方案:
REPREX
- 代码
library(data.table)
library(stringr)

# Add Chr_ID by reference: the run of digits that directly follows a leading
# "chr" in the chromosome column. The backslash is doubled because "\d" is
# not a valid escape sequence inside an R string literal. str_extract()
# returns NA for labels that do not match the pattern.
DT[, Chr_ID := str_extract(chromosome, "(?<=^chr)\\d+")]
- 输出
DT
#> chromosome Chr_ID
#> 1: chr6_GL000253v2_alt 6
#> 2: chr6_GL000254v2_alt 6
#> 3: chr6_GL000255v2_alt 6
#> 4: chr6_GL000256v2_alt 6
#> 5: chr4 4
#> 6: chr11 11
#> 7: chr8 8
#> 8: chr12 12
#> 9: chr2 2
#> 10: chr12 12
#> 11: chr4 4
#> 12: chr6 6
#> 13: chr15 15
#> 14: chr4 4
#> 15: chr2 2
- 您的数据
# Sample input: a one-column data.table of chromosome labels, some of which
# carry an "_<contig>_alt" suffix.
DT <- data.table(
  chromosome = c(
    "chr6_GL000253v2_alt",
    "chr6_GL000254v2_alt",
    "chr6_GL000255v2_alt",
    "chr6_GL000256v2_alt",
    "chr4",
    "chr11",
    "chr8",
    "chr12",
    "chr2",
    "chr12",
    "chr4",
    "chr6",
    "chr15",
    "chr4",
    "chr2"
  )
)
DT
#> chromosome
#> 1: chr6_GL000253v2_alt
#> 2: chr6_GL000254v2_alt
#> 3: chr6_GL000255v2_alt
#> 4: chr6_GL000256v2_alt
#> 5: chr4
#> 6: chr11
#> 7: chr8
#> 8: chr12
#> 9: chr2
#> 10: chr12
#> 11: chr4
#> 12: chr6
#> 13: chr15
#> 14: chr4
#> 15: chr2
由 reprex package (v2.0.1) 于 2021-10-12 创建
因为您没有共享数据,我创建了一个类似的列,并将数字提取到名为 Numbers 的新列中:
import pandas as pd

# Populate a dummy table with chromosome-style labels.
df = pd.DataFrame(
    data=['chr6_GL', 'chr6_GL00', 'chr4', 'chr11', 'chr8', 'chr12'],
    columns=['Data'],
)

# Extract the digits that follow "chr" into a new column called 'Numbers'.
# "+" (rather than "*") requires at least one digit, so a label with no
# number yields NaN instead of an empty string; expand=False returns a
# Series for the single capture group.
df['Numbers'] = df['Data'].str.extract(r'chr([0-9]+)', expand=False)
数据数量
我有一个数据框,其中有一列包含染色体详细信息(1 到 22)。我想创建另一个只有 Chr 数字的列
没有可重现的例子很难回答。使用 stringr 包和正则表达式(regex)可以实现你想要的效果,但你需要了解数据中所有可能出现的格式。如果你需要的部分和多余的信息之间只用下划线分隔,你可以使用 str_split 并以 "_" 作为 pattern 参数来解决你的问题。
请参考 https://stackoverflow.com/help/how-to-ask
library(stringr)

# Example data: chromosome labels, some carrying an "_<contig>_alt" suffix.
df <- data.frame(
  chromosome = c(
    "chr6_GL000253v2_alt", "chr6_GL000254v2_alt",
    "chr6_GL000255v2_alt", "chr6_GL000256v2_alt", "chr4", "chr11",
    "chr8", "chr12", "chr2", "chr12", "chr4", "chr6", "chr15", "chr4",
    "chr2"
  )
)

# Split each label on "_" and keep only the first piece. The result still
# carries the "chr" prefix (e.g. "chr6"). TRUE is spelled out because `T` is
# an ordinary variable that can be reassigned.
df$chromosome_fixed <- str_split(df$chromosome, "_", simplify = TRUE)[, 1]
请参见下面使用 data.table 软件包的解决方案:
REPREX
- 代码
library(data.table)
library(stringr)

# Add Chr_ID by reference: the run of digits that directly follows a leading
# "chr" in the chromosome column. The backslash is doubled because "\d" is
# not a valid escape sequence inside an R string literal. str_extract()
# returns NA for labels that do not match the pattern.
DT[, Chr_ID := str_extract(chromosome, "(?<=^chr)\\d+")]
- 输出
DT
#> chromosome Chr_ID
#> 1: chr6_GL000253v2_alt 6
#> 2: chr6_GL000254v2_alt 6
#> 3: chr6_GL000255v2_alt 6
#> 4: chr6_GL000256v2_alt 6
#> 5: chr4 4
#> 6: chr11 11
#> 7: chr8 8
#> 8: chr12 12
#> 9: chr2 2
#> 10: chr12 12
#> 11: chr4 4
#> 12: chr6 6
#> 13: chr15 15
#> 14: chr4 4
#> 15: chr2 2
- 您的数据
# Sample input: a one-column data.table of chromosome labels, some of which
# carry an "_<contig>_alt" suffix.
DT <- data.table(
  chromosome = c(
    "chr6_GL000253v2_alt",
    "chr6_GL000254v2_alt",
    "chr6_GL000255v2_alt",
    "chr6_GL000256v2_alt",
    "chr4",
    "chr11",
    "chr8",
    "chr12",
    "chr2",
    "chr12",
    "chr4",
    "chr6",
    "chr15",
    "chr4",
    "chr2"
  )
)
DT
#> chromosome
#> 1: chr6_GL000253v2_alt
#> 2: chr6_GL000254v2_alt
#> 3: chr6_GL000255v2_alt
#> 4: chr6_GL000256v2_alt
#> 5: chr4
#> 6: chr11
#> 7: chr8
#> 8: chr12
#> 9: chr2
#> 10: chr12
#> 11: chr4
#> 12: chr6
#> 13: chr15
#> 14: chr4
#> 15: chr2
由 reprex package (v2.0.1) 于 2021-10-12 创建
因为您没有共享数据,我创建了一个类似的列,并将数字提取到名为 Numbers 的新列中:
#Populate a dummy table
import pandas as pd

df = pd.DataFrame(
    data=['chr6_GL', 'chr6_GL00', 'chr4', 'chr11', 'chr8', 'chr12'],
    columns=['Data'],
)

# Extract the digits that follow "chr" into a new column called 'Numbers'.
# "+" (rather than "*") requires at least one digit, so a label with no
# number yields NaN instead of an empty string; expand=False returns a
# Series for the single capture group.
df['Numbers'] = df['Data'].str.extract(r'chr([0-9]+)', expand=False)
数据数量