删除表达式数据集中的值 python
Drop values in expression dataset python
我有这个微阵列数据集。我想绕过我在这个管道的早期版本中遇到的一个问题(https://geoparse.readthedocs.io/en/latest/Analyse_hsa-miR-124a-3p_transfection_time-course.html)。我已经创建了一个实验文件并将其作为数据框读入。我想从表达式 table 中删除这样的每一列:其列名(字符串值)已不再出现在我读入的数据框的 Accession 列中。
# Import tools
import GEOparse
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Download the three GEO series into the local project directory.
gse1 = GEOparse.get_GEO(geo="GSE99039", destdir="C:/Users/Highf_000/PycharmProjects/TFTest")
gse2 = GEOparse.get_GEO(geo="GSE6613", destdir="C:/Users/Highf_000/PycharmProjects/TFTest")
gse3 = GEOparse.get_GEO(geo="GSE72267", destdir="C:/Users/Highf_000/PycharmProjects/TFTest")

# Read the GSM list for each GSE/platform file, one entry per line.
# The GSE99039 list is used further down as the sample column labels.
with open("GSE99039_GPL570.csv") as f:
    GSE99039_GPL570 = f.read().splitlines()
with open("GSE6613_GPL96.csv") as f:
    GSE6613_GPL96 = f.read().splitlines()
with open("GSE72267_GPL571.csv") as f:
    GSE72267_GPL571 = f.read().splitlines()

# gse1: keep a handle on the phenotype table shipped with the series.
gse1.gsm = gse1.phenotype_data
print(gse1.gsm.head())

# gse1: load the hand-made experiment details file.
gse1.details = pd.read_csv('GSE99039_MicroarrayDetails.csv', delimiter=',')
print(gse1.details.head())

# Keep only the rows in which some cell equals one of the disease labels of
# interest. An explicit 1-D row mask is built because indexing a DataFrame
# with the 2-D array produced by `details.values == label` is not a plain
# row filter.
disease_labels = ["CONTROL", "IPD", "GPD"]
gse1.detailsv1 = gse1.details[gse1.details.isin(disease_labels).any(axis=1)]
print(gse1.detailsv1.head())

# gse1: expression table restricted to the samples listed in the GSM file.
pivoted_control_samples = gse1.pivot_samples('VALUE')[GSE99039_GPL570]
print(pivoted_control_samples)

# gse1: per-probe median across the selected samples.
pivoted_control_samples_average = pivoted_control_samples.median(axis=1)
# Print number of probes before filtering
print("Number of probes before filtering: ", len(pivoted_control_samples_average))

# Keep the probes whose median is at or above the 25th percentile of all
# per-probe medians (the cutoff is a quantile, not the literal value 0.25).
expression_threshold = pivoted_control_samples_average.quantile(0.25)
expressed_probes = pivoted_control_samples_average[pivoted_control_samples_average >= expression_threshold].index.tolist()
# Print probes above cut off
print("Number of probes above threshold: ", len(expressed_probes))

# Confirm filtering worked: all samples, expressed probes only.
samples = gse1.pivot_samples("VALUE").loc[expressed_probes]
print(samples.head())

# Print phenotype data
print(gse1.phenotype_data[["title", "source_name_ch1", "Disease_Label", "Sex"]])
这是我创建的数据框的样子,在脚本中命名为 gse1.detailsv1:
Accession Title Source name ... Subject_id Disease label Sex
0 GSM2630758 E7R_039a01 Whole blood ... L3012 CONTROL Female
1 GSM2630759 E7R_039a02 Whole blood ... L2838 IPD Male
2 GSM2630760 E7R_039a03 Whole blood ... L2540 IPD Female
3 GSM2630761 E7R_039a04 Whole blood ... L3015 CONTROL Female
4 GSM2630762 E7R_039a05 Whole blood ... L2884 IPD Female
[5 rows x 7 columns]
这就是我的表达式 table 的样子,在脚本中命名为 samples:
name GSM2630758 GSM2630759 ... GSM2631314 GSM2631315
ID_REF ...
1007_s_at 5.397 4.952 ... 5.567 5.529
1053_at 5.199 5.198 ... 5.706 5.078
117_at 8.327 8.589 ... 8.511 8.458
121_at 7.042 6.935 ... 7.526 7.673
1294_at 7.753 8.210 ... 7.537 7.418
[5 rows x 558 columns]
举例来说,如果 GSM2630758 不在第一个数据框的 Accession 列中,我就想从表达式 table 中删除 GSM2630758 这一列。我需要遍历表达式 table 的所有列,并删除所有不再出现在 Accession 列中的样本。
# Drop every sample column of `samples` whose GSM accession is not listed in
# the Accession column of gse1.detailsv1. All columns are checked: "name" in
# the printed table appears to be the label of the column index, not a data
# column, so no leading column has to be skipped. drop() returns a new frame,
# hence the reassignment.
missing = samples.columns.difference(gse1.detailsv1["Accession"])
samples = samples.drop(columns=missing)
如果 gse1.detailsv1 数据集足够小,您可以创建所有 Accession 的列表并选择这些列:
# Select only the sample columns whose accession appears in gse1.detailsv1.
# The key is "Accession", matching the column header shown for the details
# frame. A list is used as the indexer (not a set) so the original column
# order of `samples` is preserved and pandas accepts it.
accessions = set(gse1.detailsv1["Accession"])
cols = [col for col in samples.columns if col in accessions]
samples = samples[cols]
我有这个微阵列数据集。我想绕过我在这个管道的早期版本中遇到的一个问题(https://geoparse.readthedocs.io/en/latest/Analyse_hsa-miR-124a-3p_transfection_time-course.html)。我已经创建了一个实验文件并将其作为数据框读入。我想从表达式 table 中删除这样的每一列:其列名(字符串值)已不再出现在我读入的数据框的 Accession 列中。
# Import tools
import GEOparse
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Download the three GEO series into the local project directory.
gse1 = GEOparse.get_GEO(geo="GSE99039", destdir="C:/Users/Highf_000/PycharmProjects/TFTest")
gse2 = GEOparse.get_GEO(geo="GSE6613", destdir="C:/Users/Highf_000/PycharmProjects/TFTest")
gse3 = GEOparse.get_GEO(geo="GSE72267", destdir="C:/Users/Highf_000/PycharmProjects/TFTest")

# Read the GSM list for each GSE/platform file, one entry per line.
# The GSE99039 list is used further down as the sample column labels.
with open("GSE99039_GPL570.csv") as f:
    GSE99039_GPL570 = f.read().splitlines()
with open("GSE6613_GPL96.csv") as f:
    GSE6613_GPL96 = f.read().splitlines()
with open("GSE72267_GPL571.csv") as f:
    GSE72267_GPL571 = f.read().splitlines()

# gse1: keep a handle on the phenotype table shipped with the series.
gse1.gsm = gse1.phenotype_data
print(gse1.gsm.head())

# gse1: load the hand-made experiment details file.
gse1.details = pd.read_csv('GSE99039_MicroarrayDetails.csv', delimiter=',')
print(gse1.details.head())

# Keep only the rows in which some cell equals one of the disease labels of
# interest. An explicit 1-D row mask is built because indexing a DataFrame
# with the 2-D array produced by `details.values == label` is not a plain
# row filter.
disease_labels = ["CONTROL", "IPD", "GPD"]
gse1.detailsv1 = gse1.details[gse1.details.isin(disease_labels).any(axis=1)]
print(gse1.detailsv1.head())

# gse1: expression table restricted to the samples listed in the GSM file.
pivoted_control_samples = gse1.pivot_samples('VALUE')[GSE99039_GPL570]
print(pivoted_control_samples)

# gse1: per-probe median across the selected samples.
pivoted_control_samples_average = pivoted_control_samples.median(axis=1)
# Print number of probes before filtering
print("Number of probes before filtering: ", len(pivoted_control_samples_average))

# Keep the probes whose median is at or above the 25th percentile of all
# per-probe medians (the cutoff is a quantile, not the literal value 0.25).
expression_threshold = pivoted_control_samples_average.quantile(0.25)
expressed_probes = pivoted_control_samples_average[pivoted_control_samples_average >= expression_threshold].index.tolist()
# Print probes above cut off
print("Number of probes above threshold: ", len(expressed_probes))

# Confirm filtering worked: all samples, expressed probes only.
samples = gse1.pivot_samples("VALUE").loc[expressed_probes]
print(samples.head())

# Print phenotype data
print(gse1.phenotype_data[["title", "source_name_ch1", "Disease_Label", "Sex"]])
这是我创建的数据框的样子,在脚本中命名为 gse1.detailsv1:
Accession Title Source name ... Subject_id Disease label Sex
0 GSM2630758 E7R_039a01 Whole blood ... L3012 CONTROL Female
1 GSM2630759 E7R_039a02 Whole blood ... L2838 IPD Male
2 GSM2630760 E7R_039a03 Whole blood ... L2540 IPD Female
3 GSM2630761 E7R_039a04 Whole blood ... L3015 CONTROL Female
4 GSM2630762 E7R_039a05 Whole blood ... L2884 IPD Female
[5 rows x 7 columns]
这就是我的表达式 table 的样子,在脚本中命名为 samples:
name GSM2630758 GSM2630759 ... GSM2631314 GSM2631315
ID_REF ...
1007_s_at 5.397 4.952 ... 5.567 5.529
1053_at 5.199 5.198 ... 5.706 5.078
117_at 8.327 8.589 ... 8.511 8.458
121_at 7.042 6.935 ... 7.526 7.673
1294_at 7.753 8.210 ... 7.537 7.418
[5 rows x 558 columns]
举例来说,如果 GSM2630758 不在第一个数据框的 Accession 列中,我就想从表达式 table 中删除 GSM2630758 这一列。我需要遍历表达式 table 的所有列,并删除所有不再出现在 Accession 列中的样本。
# Drop every sample column of `samples` whose GSM accession is not listed in
# the Accession column of gse1.detailsv1. All columns are checked: "name" in
# the printed table appears to be the label of the column index, not a data
# column, so no leading column has to be skipped. drop() returns a new frame,
# hence the reassignment.
missing = samples.columns.difference(gse1.detailsv1["Accession"])
samples = samples.drop(columns=missing)
如果 gse1.detailsv1 数据集足够小,您可以创建所有 Accession 的列表并选择这些列:
# Select only the sample columns whose accession appears in gse1.detailsv1.
# The key is "Accession", matching the column header shown for the details
# frame. A list is used as the indexer (not a set) so the original column
# order of `samples` is preserved and pandas accepts it.
accessions = set(gse1.detailsv1["Accession"])
cols = [col for col in samples.columns if col in accessions]
samples = samples[cols]