根据列中的字符条件删除行
Removing rows based on character conditions in a column
早上好,我创建了以下 R 代码:
setwd("xxx")
library(reshape)
##Insert needed year
url <- "./Quarterly/1990_qtrly.csv"
##Writes data in R with applicable columns
qtrly_data <- read.csv(url, header = TRUE, sep = ",", quote="\"", dec=".", na.strings=" ", skip=0)
relevant_cols <- c("area_fips", "industry_code", "own_code", "agglvl_code", "year", "qtr")
overall <- c(relevant_cols, colnames(qtrly_data)[8:16])
lq <- c(relevant_cols, colnames(qtrly_data)[17:25])
oty <- c(relevant_cols, colnames(qtrly_data)[18:42])
types <- c("overall", "lq", "oty")
overallx <- colnames(qtrly_data)[9:16]
lqx <- colnames(qtrly_data)[18:25]
otyx <- colnames(qtrly_data)[seq(27,42,2)]
###Adding in the disclosure codes from each section
disc_codes <- c("disclosure_code", "lq_disclosure_code", "oty_disclosure_code")
cols_list = list(overall, lq, oty)
denom_list = list(overallx, lqx, otyx)
##Uses a two-loop peice of code to go through data denominations and categories, while melting it into the correct format
for (j in 1:length(types))
{
cat("Working on type: " , types[j], "\n")
these_denominations <- denom_list[[j]]
type_data <- qtrly_data[ , cols_list[[j]] ]
QCEW_County <- melt(type_data, id=c(relevant_cols, disc_codes[j]))
colnames(QCEW_County) <- c(relevant_cols, "disclosure_code", "text_denomination", "value")
Data_Cat <- j
for (k in 1:length(these_denominations))
{
cat("Working on type: " , types[j], "and denomination: ", these_denominations[k], "\n")
QCEW_County_Denominated <- QCEW_County[QCEW_County[, "text_denomination"] == these_denominations[k], ]
QCEW_County_Denominated$disclosure_code <- ifelse(QCEW_County_Denominated$disclosure_code == "", 0, 1)
Data_Denom <- k
QCEW_County_Denominated <- cbind(QCEW_County_Denominated, Data_Cat, Data_Denom)
QCEW_County_Denominated$Source_ID <- 1
QCEW_County_Denominated$text_denomination <- NULL
colnames(QCEW_County_Denominated) <- NULL
###Actually writes the txt file to the QCEW folder
write.table(QCEW_County_Denominated, file="C:\Users\jjackson\Downloads\QCEW\1990_test.txt", append=TRUE, quote=FALSE, sep=',', row.names=FALSE)
}
}
现在,我需要删除一些东西,即 QCEW_County_Denominated 数据框中 "area_fips" 列以字符 "C" 开头的所有行,在在同一列中,还有一些以 US 开头的代码,我想将其替换为 0。最后,我还有 "industry_code" 列,在我的最终数据框中有 3 个值需要替换。 31-33对31,44-45对44,48-49对48。我明白这是一项艰巨的任务。我正在慢慢地自己弄清楚,但是如果有人可以在我自己弄清楚这个问题的同时给我一个正确方向的帮助,我将不胜感激。 R 中的条件语句看起来像是我的致命弱点,因为它总是让我开始对其语法与其他统计包的不同之处感到困惑。
谢谢,祝你有愉快的一天。
您可以使用 regex
和子集删除和重新编码您的数据。
使用 grepl
,您可以 select 列 area_fips 中不以 C 开头的行。
QCEW_County_Denominated <- QCEW_County_Denominated[!grepl("^C", QCEW_County_Denominated$area_fips), ]
使用 gsub
,您可以将 area_fips 列中以 0 开头的值替换为 0。
QCEW_County_Denominated$area_fips <- as.numeric(gsub("^US", 0, QCEW_County_Denominated$area_fips))
最后,您可以使用子集替换 industry_code.
中的值
QCEW_County_Denominated$industry_code[QCEW_County_Denominated$industry_code == "31-33"] <- 31
QCEW_County_Denominated$industry_code[QCEW_County_Denominated$industry_code == "44-45"] <- 44
QCEW_County_Denominated$industry_code[QCEW_County_Denominated$industry_code == "48-49"] <- 48
早上好,我创建了以下 R 代码:
setwd("xxx")
library(reshape)
##Insert needed year
url <- "./Quarterly/1990_qtrly.csv"
##Writes data in R with applicable columns
qtrly_data <- read.csv(url, header = TRUE, sep = ",", quote="\"", dec=".", na.strings=" ", skip=0)
relevant_cols <- c("area_fips", "industry_code", "own_code", "agglvl_code", "year", "qtr")
overall <- c(relevant_cols, colnames(qtrly_data)[8:16])
lq <- c(relevant_cols, colnames(qtrly_data)[17:25])
oty <- c(relevant_cols, colnames(qtrly_data)[18:42])
types <- c("overall", "lq", "oty")
overallx <- colnames(qtrly_data)[9:16]
lqx <- colnames(qtrly_data)[18:25]
otyx <- colnames(qtrly_data)[seq(27,42,2)]
###Adding in the disclosure codes from each section
disc_codes <- c("disclosure_code", "lq_disclosure_code", "oty_disclosure_code")
cols_list = list(overall, lq, oty)
denom_list = list(overallx, lqx, otyx)
##Uses a two-loop peice of code to go through data denominations and categories, while melting it into the correct format
for (j in 1:length(types))
{
cat("Working on type: " , types[j], "\n")
these_denominations <- denom_list[[j]]
type_data <- qtrly_data[ , cols_list[[j]] ]
QCEW_County <- melt(type_data, id=c(relevant_cols, disc_codes[j]))
colnames(QCEW_County) <- c(relevant_cols, "disclosure_code", "text_denomination", "value")
Data_Cat <- j
for (k in 1:length(these_denominations))
{
cat("Working on type: " , types[j], "and denomination: ", these_denominations[k], "\n")
QCEW_County_Denominated <- QCEW_County[QCEW_County[, "text_denomination"] == these_denominations[k], ]
QCEW_County_Denominated$disclosure_code <- ifelse(QCEW_County_Denominated$disclosure_code == "", 0, 1)
Data_Denom <- k
QCEW_County_Denominated <- cbind(QCEW_County_Denominated, Data_Cat, Data_Denom)
QCEW_County_Denominated$Source_ID <- 1
QCEW_County_Denominated$text_denomination <- NULL
colnames(QCEW_County_Denominated) <- NULL
###Actually writes the txt file to the QCEW folder
write.table(QCEW_County_Denominated, file="C:\Users\jjackson\Downloads\QCEW\1990_test.txt", append=TRUE, quote=FALSE, sep=',', row.names=FALSE)
}
}
现在,我需要删除一些东西,即 QCEW_County_Denominated 数据框中 "area_fips" 列以字符 "C" 开头的所有行,在在同一列中,还有一些以 US 开头的代码,我想将其替换为 0。最后,我还有 "industry_code" 列,在我的最终数据框中有 3 个值需要替换。 31-33对31,44-45对44,48-49对48。我明白这是一项艰巨的任务。我正在慢慢地自己弄清楚,但是如果有人可以在我自己弄清楚这个问题的同时给我一个正确方向的帮助,我将不胜感激。 R 中的条件语句看起来像是我的致命弱点,因为它总是让我开始对其语法与其他统计包的不同之处感到困惑。 谢谢,祝你有愉快的一天。
您可以使用 regex
和子集删除和重新编码您的数据。
使用 grepl
,您可以 select 列 area_fips 中不以 C 开头的行。
QCEW_County_Denominated <- QCEW_County_Denominated[!grepl("^C", QCEW_County_Denominated$area_fips), ]
使用 gsub
,您可以将 area_fips 列中以 0 开头的值替换为 0。
QCEW_County_Denominated$area_fips <- as.numeric(gsub("^US", 0, QCEW_County_Denominated$area_fips))
最后,您可以使用子集替换 industry_code.
中的值QCEW_County_Denominated$industry_code[QCEW_County_Denominated$industry_code == "31-33"] <- 31
QCEW_County_Denominated$industry_code[QCEW_County_Denominated$industry_code == "44-45"] <- 44
QCEW_County_Denominated$industry_code[QCEW_County_Denominated$industry_code == "48-49"] <- 48