导入多个ascii文本文件——为每个数据框添加原始文件名——格式化需要的数据
Import multiple ascii text files - add original file name to each data frame - format the required data
我想一次导入多个包含选定行的 ascii 文本文件,我想添加文件名作为 ID 号(变量),然后根据需要格式化数据(见下图)
# Import a fixed window of rows from every .asc file in the working directory,
# tag each row with the file it came from, and export the stacked table.
setwd("working_dir")
library(data.table)
library(WriteXLS)
# The pattern is a regular expression, not a glob. The original ".*.asc" left
# the dot unescaped and the end unanchored, so names such as "basc.txt" or
# "x.ascii" were matched as well.
files <- list.files(pattern = "\\.asc$")
if (length(files) == 0L) {
  stop("No .asc files found in ", getwd(), call. = FALSE)
}
# Skip the first 10 lines of each file and read at most the next 10 rows.
# The list is named by file so rbindlist() can turn the names into a column.
file.list <- lapply(
  setNames(files, files),
  function(x) {
    read.csv(x, header = FALSE, skip = 10, nrows = 10,
             stringsAsFactors = FALSE)
  }
)
# Stack all files; the id is the first seven characters of the file name.
df1 <- rbindlist(file.list, idcol = "id")[, id := substr(id, 1, 7)]
WriteXLS(df1, "all_1.xls", Encoding = "latin1")
下面是 df1 数据框的内容;我只想从中筛选并整理出"期望结果"图片中显示的那些数据。
# Reproducible copy of df1 as produced by dput().
# dput() prints a data.table's internal self-reference as
# ".internal.selfref = <pointer: 0x...>", which is not valid R syntax. That
# attribute is dropped here so the snippet can be parsed and run.
# NOTE(review): data.table is expected to rebuild the pointer itself (for
# example via setDT(df1)); confirm if a selfref warning appears.
df1 <- structure(
  list(
    id = c(
      "9864707", "9864707", "9864707", "9864707", "9864707",
      "9864707", "9864707", "9864707", "9864707", "9864707",
      "9864708", "9864708", "9864708", "9864708", "9864708",
      "9864708", "9864708", "9864708", "9864708", "9864708"
    ),
    V1 = c(
      "Mean irradiance (kW/m²)", "1.000", "Pmax", "267.793",
      "Module voltage", "Voc", "37.552", "Module current", "Isc", "9.217",
      "Mean irradiance (kW/m²)", "1.000", "Pmax", "268.211",
      "Module voltage", "Voc", "38.234", "Module current", "Isc", "9.181"
    ),
    V2 = c(
      "Cell efficiency (%)", "18.4", "Module temperature (°C)", "22.2",
      "", "Vmp", "31.159", "", "Imp", "8.735",
      "Cell efficiency (%)", "18.4", "Module temperature (°C)", "22.2",
      "", "Vmp", "31.208", "", "Imp", "8.735"
    ),
    V3 = c(
      "Module efficiency (%)", "16.4", "", "", "",
      "Series resistance", "0.1256", "", "Shunt resistance", "191.7",
      "Module efficiency (%)", "16.5", "", "", "",
      "Series resistance", "0.3718", "", "Shunt resistance", "309.8"
    ),
    V4 = c(
      "Fill factor (%)", "77.4", "", "", "", "", "", "", "", "",
      "Fill factor (%)", "76.4", "", "", "", "", "", "", "", ""
    )
  ),
  .Names = c("id", "V1", "V2", "V3", "V4"),
  row.names = c(NA, -20L),
  class = c("data.table", "data.frame")
)
我的实际结果是这样的
我想要的结果
下面是示例 ascii 文本文件的链接。你们谁能帮我得到我想要的结果。
R 会话信息
这是一个粗略的解决方法,但它会起作用...至少根据您提供的信息。
library(dplyr)
library(jsonlite)
# Reproducing the data frame
# Sample data from the question: ten raw rows for each of the two files,
# with label rows and value rows alternating inside every column.
a <- data.frame(
  id = rep(c("9864707", "9864708"), each = 10),
  V1 = c(
    # file 9864707
    "Mean irradiance (kW/m²)", "1.000", "Pmax", "267.793", "Module voltage",
    "Voc", "37.552", "Module current", "Isc", "9.217",
    # file 9864708
    "Mean irradiance (kW/m²)", "1.000", "Pmax", "268.211", "Module voltage",
    "Voc", "38.234", "Module current", "Isc", "9.181"
  ),
  V2 = c(
    # file 9864707
    "Cell efficiency (%)", "18.4", "Module temperature (°C)", "22.2", "",
    "Vmp", "31.159", "", "Imp", "8.735",
    # file 9864708
    "Cell efficiency (%)", "18.4", "Module temperature (°C)", "22.2", "",
    "Vmp", "31.208", "", "Imp", "8.735"
  ),
  V3 = c(
    # file 9864707
    "Module efficiency (%)", "16.4", "", "", "",
    "Series resistance", "0.1256", "", "Shunt resistance", "191.7",
    # file 9864708
    "Module efficiency (%)", "16.5", "", "", "",
    "Series resistance", "0.3718", "", "Shunt resistance", "309.8"
  ),
  V4 = c(
    "Fill factor (%)", "77.4", rep("", 8),
    "Fill factor (%)", "76.4", rep("", 8)
  ),
  stringsAsFactors = FALSE
)
# Split the stacked rows into a list with one data frame per file id.
b <- split(a, a$id)
# Reshape each file's alternating label/value rows into a single wide row.
c <- lapply(b, function(i) {
  # Rows whose V2 cell is empty carry only a section heading in V1 and no
  # values; keeping them would leave more labels than values. Drop them,
  # remove the id column, and flatten the remaining cells column by column
  # into one character vector.
  aa <- i %>%
    filter(nchar(V2) > 0) %>%
    select(-id) %>%
    unlist() %>%
    as.character()
  # Remove the empty padding cells.
  bb <- aa[nchar(aa) > 0]
  # A cell containing a letter is a column label; every other cell is a value.
  # "[A-Za-z]" is used instead of "[A-z]" because the latter range also
  # matches the punctuation characters [ \ ] ^ _ and `.
  is_label <- grepl("[A-Za-z]", bb)
  c_name <- bb[is_label]
  d_val <- as.numeric(bb[!is_label])
  # Labels and values are paired by position, so the counts must agree.
  if (length(c_name) != length(d_val)) {
    stop("Found ", length(c_name), " labels but ", length(d_val),
         " values for id ", i[["id"]][[1]], call. = FALSE)
  }
  # Bind the numeric values into a one-row data frame.
  df <- as.data.frame(rbind(d_val), stringsAsFactors = FALSE)
  # Name the columns with the extracted labels.
  colnames(df) <- c_name
  # Prepend the file id that this group was split on.
  df <- cbind(id = i[["id"]][[1]], df)
  # Drop the row name left over from rbind().
  row.names(df) <- NULL
  df
})
# Bind the per-file rows into one data frame; rbind_pages() is the current
# jsonlite name for the legacy rbind.pages().
d <- rbind_pages(c)
> str(d)
# 'data.frame': 2 obs. of 13 variables:
# $ id : chr "9864707" "9864708"
# $ Mean irradiance (kW/m²): num 1 1
# $ Pmax : num 268 268
# $ Voc : num 37.6 38.2
# $ Isc : num 9.22 9.18
# $ Cell efficiency (%) : num 18.4 18.4
# $ Module temperature (°C): num 22.2 22.2
# $ Vmp : num 31.2 31.2
# $ Imp : num 8.73 8.73
# $ Module efficiency (%) : num 16.4 16.5
# $ Series resistance : num 0.126 0.372
# $ Shunt resistance : num 192 310
# $ Fill factor (%) : num 77.4 76.4
我想一次导入多个包含选定行的 ascii 文本文件,我想添加文件名作为 ID 号(变量),然后根据需要格式化数据(见下图)
# Import a fixed window of rows from every .asc file in the working directory,
# tag each row with the file it came from, and export the stacked table.
setwd("working_dir")
library(data.table)
library(WriteXLS)
# The pattern is a regular expression, not a glob. The original ".*.asc" left
# the dot unescaped and the end unanchored, so names such as "basc.txt" or
# "x.ascii" were matched as well.
files <- list.files(pattern = "\\.asc$")
if (length(files) == 0L) {
  stop("No .asc files found in ", getwd(), call. = FALSE)
}
# Skip the first 10 lines of each file and read at most the next 10 rows.
# The list is named by file so rbindlist() can turn the names into a column.
file.list <- lapply(
  setNames(files, files),
  function(x) {
    read.csv(x, header = FALSE, skip = 10, nrows = 10,
             stringsAsFactors = FALSE)
  }
)
# Stack all files; the id is the first seven characters of the file name.
df1 <- rbindlist(file.list, idcol = "id")[, id := substr(id, 1, 7)]
WriteXLS(df1, "all_1.xls", Encoding = "latin1")
下面是 df1 数据框的内容;我只想从中筛选并整理出"期望结果"图片中显示的那些数据。
# Reproducible copy of df1 as produced by dput().
# dput() prints a data.table's internal self-reference as
# ".internal.selfref = <pointer: 0x...>", which is not valid R syntax. That
# attribute is dropped here so the snippet can be parsed and run.
# NOTE(review): data.table is expected to rebuild the pointer itself (for
# example via setDT(df1)); confirm if a selfref warning appears.
df1 <- structure(
  list(
    id = c(
      "9864707", "9864707", "9864707", "9864707", "9864707",
      "9864707", "9864707", "9864707", "9864707", "9864707",
      "9864708", "9864708", "9864708", "9864708", "9864708",
      "9864708", "9864708", "9864708", "9864708", "9864708"
    ),
    V1 = c(
      "Mean irradiance (kW/m²)", "1.000", "Pmax", "267.793",
      "Module voltage", "Voc", "37.552", "Module current", "Isc", "9.217",
      "Mean irradiance (kW/m²)", "1.000", "Pmax", "268.211",
      "Module voltage", "Voc", "38.234", "Module current", "Isc", "9.181"
    ),
    V2 = c(
      "Cell efficiency (%)", "18.4", "Module temperature (°C)", "22.2",
      "", "Vmp", "31.159", "", "Imp", "8.735",
      "Cell efficiency (%)", "18.4", "Module temperature (°C)", "22.2",
      "", "Vmp", "31.208", "", "Imp", "8.735"
    ),
    V3 = c(
      "Module efficiency (%)", "16.4", "", "", "",
      "Series resistance", "0.1256", "", "Shunt resistance", "191.7",
      "Module efficiency (%)", "16.5", "", "", "",
      "Series resistance", "0.3718", "", "Shunt resistance", "309.8"
    ),
    V4 = c(
      "Fill factor (%)", "77.4", "", "", "", "", "", "", "", "",
      "Fill factor (%)", "76.4", "", "", "", "", "", "", "", ""
    )
  ),
  .Names = c("id", "V1", "V2", "V3", "V4"),
  row.names = c(NA, -20L),
  class = c("data.table", "data.frame")
)
我的实际结果是这样的
我想要的结果
下面是示例 ascii 文本文件的链接。你们谁能帮我得到我想要的结果。
R 会话信息
这是一个粗略的解决方法,但它会起作用...至少根据您提供的信息。
library(dplyr)
library(jsonlite)
# Reproducing the data frame
# Sample data from the question: ten raw rows for each of the two files,
# with label rows and value rows alternating inside every column.
a <- data.frame(
  id = rep(c("9864707", "9864708"), each = 10),
  V1 = c(
    # file 9864707
    "Mean irradiance (kW/m²)", "1.000", "Pmax", "267.793", "Module voltage",
    "Voc", "37.552", "Module current", "Isc", "9.217",
    # file 9864708
    "Mean irradiance (kW/m²)", "1.000", "Pmax", "268.211", "Module voltage",
    "Voc", "38.234", "Module current", "Isc", "9.181"
  ),
  V2 = c(
    # file 9864707
    "Cell efficiency (%)", "18.4", "Module temperature (°C)", "22.2", "",
    "Vmp", "31.159", "", "Imp", "8.735",
    # file 9864708
    "Cell efficiency (%)", "18.4", "Module temperature (°C)", "22.2", "",
    "Vmp", "31.208", "", "Imp", "8.735"
  ),
  V3 = c(
    # file 9864707
    "Module efficiency (%)", "16.4", "", "", "",
    "Series resistance", "0.1256", "", "Shunt resistance", "191.7",
    # file 9864708
    "Module efficiency (%)", "16.5", "", "", "",
    "Series resistance", "0.3718", "", "Shunt resistance", "309.8"
  ),
  V4 = c(
    "Fill factor (%)", "77.4", rep("", 8),
    "Fill factor (%)", "76.4", rep("", 8)
  ),
  stringsAsFactors = FALSE
)
# Split the stacked rows into a list with one data frame per file id.
b <- split(a, a$id)
# Reshape each file's alternating label/value rows into a single wide row.
c <- lapply(b, function(i) {
  # Rows whose V2 cell is empty carry only a section heading in V1 and no
  # values; keeping them would leave more labels than values. Drop them,
  # remove the id column, and flatten the remaining cells column by column
  # into one character vector.
  aa <- i %>%
    filter(nchar(V2) > 0) %>%
    select(-id) %>%
    unlist() %>%
    as.character()
  # Remove the empty padding cells.
  bb <- aa[nchar(aa) > 0]
  # A cell containing a letter is a column label; every other cell is a value.
  # "[A-Za-z]" is used instead of "[A-z]" because the latter range also
  # matches the punctuation characters [ \ ] ^ _ and `.
  is_label <- grepl("[A-Za-z]", bb)
  c_name <- bb[is_label]
  d_val <- as.numeric(bb[!is_label])
  # Labels and values are paired by position, so the counts must agree.
  if (length(c_name) != length(d_val)) {
    stop("Found ", length(c_name), " labels but ", length(d_val),
         " values for id ", i[["id"]][[1]], call. = FALSE)
  }
  # Bind the numeric values into a one-row data frame.
  df <- as.data.frame(rbind(d_val), stringsAsFactors = FALSE)
  # Name the columns with the extracted labels.
  colnames(df) <- c_name
  # Prepend the file id that this group was split on.
  df <- cbind(id = i[["id"]][[1]], df)
  # Drop the row name left over from rbind().
  row.names(df) <- NULL
  df
})
# Bind the per-file rows into one data frame; rbind_pages() is the current
# jsonlite name for the legacy rbind.pages().
d <- rbind_pages(c)
> str(d)
# 'data.frame': 2 obs. of 13 variables:
# $ id : chr "9864707" "9864708"
# $ Mean irradiance (kW/m²): num 1 1
# $ Pmax : num 268 268
# $ Voc : num 37.6 38.2
# $ Isc : num 9.22 9.18
# $ Cell efficiency (%) : num 18.4 18.4
# $ Module temperature (°C): num 22.2 22.2
# $ Vmp : num 31.2 31.2
# $ Imp : num 8.73 8.73
# $ Module efficiency (%) : num 16.4 16.5
# $ Series resistance : num 0.126 0.372
# $ Shunt resistance : num 192 310
# $ Fill factor (%) : num 77.4 76.4