使用 Python 删除异常值
Remove outlier with Python
我有一个包含 30 行和 9 列的 DataFrame。我想去除 2 sigma 离群值。
我这样做:
from scipy import stats
df[(np.abs(stats.zscore(df)) < 2).all(axis=1)]
但是如果单个列中存在离群值,它会删除整行。我只想删除这个单一值。我怎样才能做到这一点?第一列包含时间。这永远不应该被触及。如何排除这一列?
数据是这样的:
Trace for Mass: 60Ni 61Ni 62Ni 63Cu 64Ni 65Cu 66Zn
Resolution: High High High High High High High
Time Intensity Intensity Intensity Intensity Intensity Intensity Intensity
[sec] [cps] [cps] [cps] [cps] [cps] [cps] [cps]
0. 4.246875178068876e-003 4.550645244307816e-004 8.364085806533694e-004 3.21496045216918e-003 3.215973265469074e-003 1.595904817804694e-003 1.983924303203821e-003
1.051999807357788 4.264393821358681e-003 5.171436932869256e-004 8.292743586935103e-004 3.154967911541462e-003 3.216561861336231e-003 1.622977200895548e-003 1.874359208159149e-003
2.102999925613403 4.27544629201293e-003 4.796394787263125e-004 8.318902109749615e-004 3.211528761312366e-003 3.147452371194959e-003 1.622740761376917e-003 1.879810937680304e-003
3.154999971389771 4.278738517314196e-003 4.829006502404809e-004 7.972901221364737e-004 3.218628698959947e-003 3.22998408228159e-003 1.604416524060071e-003 1.938240835443139e-003
4.206999778747559 4.211603198200464e-003 4.424861108418554e-004 8.007381693460047e-004 3.2428870908916e-003 3.166524693369865e-003 1.590821426361799e-003 1.903632888570428e-003
5.257999897003174 4.267803858965635e-003 5.1306706154719e-004 8.309389813803136e-004 3.144200425595045e-003 3.117314074188471e-003 1.603707205504179e-003 1.815222087316215e-003
6.309999942779541 4.182798787951469e-003 5.052632768638432e-004 7.896805764175952e-004 3.130593337118626e-003 3.10095027089119e-003 1.570251770317555e-003 1.817710697650909e-003
7.361000061035156 4.296375438570976e-003 4.910536226816475e-004 8.9122453937307e-004 3.204192267730832e-003 3.028199542313814e-003 1.533132861368358e-003 1.788084045983851e-003
8.413000106811523 4.335530567914248e-003 6.025235052220523e-004 8.631621603854001e-004 3.268211148679256e-003 2.987353131175041e-003 1.608435995876789e-003 1.796260941773653e-003
9.463999748229981 4.290143493562937e-003 4.839488829020411e-004 8.525795419700444e-004 3.222533734515309e-003 3.005951410159469e-003 1.583610195666552e-003 1.700276043266058e-003
10.51599979400635 4.287909716367722e-003 5.497571546584368e-004 9.083477198146284e-004 3.219338599592447e-003 2.950039459392428e-003 1.682562520727515e-003 1.783343963325024e-003
11.56699943542481 4.260278772562742e-003 4.665948799811304e-004 7.738673011772335e-004 3.193542594090104e-003 2.853760728612542e-003 1.568833249621093e-003 1.736654434353113e-003
12.61899948120117 4.26474679261446e-003 5.00720867421478e-004 8.611407829448581e-004 3.217800287529826e-003 2.865647897124291e-003 1.595077337697148e-003 1.658685388974845e-003
13.67099952697754 4.222772549837828e-003 4.647313617169857e-004 8.633999968878925e-004 3.159464336931706e-003 2.801976399496198e-003 1.629361184313893e-003 1.673259655945003e-003
14.72200012207031 4.23405971378088e-003 4.880253691226244e-004 8.320091292262077e-004 3.10550956055522e-003 2.766199875622988e-003 1.57923623919487e-003 1.671363832429051e-003
15.77400016784668 4.263806156814098e-003 5.268111126497388e-004 8.335548918694258e-004 3.150589996948838e-003 2.747958991676569e-003 1.52225757483393e-003 1.638660905882716e-003
16.82500076293945 4.173276014626026e-003 5.153965321369469e-004 7.848058012314141e-004 3.132368205115199e-003 2.736426191404462e-003 1.501098275184631e-003 1.646955031901598e-003
17.87699890136719 4.209604579955339e-003 4.582091642078012e-004 7.977656787261367e-004 3.183129709213972e-003 2.714420203119516e-003 1.604771241545677e-003 1.606788486242294e-003
18.92900085449219 4.214542452245951e-003 4.919854109175503e-004 8.5032032802701e-004 3.177686594426632e-003 2.588512841612101e-003 1.560558215714991e-003 1.607973361387849e-003
19.97999954223633 4.171629901975393e-003 4.438837058842182e-004 8.449696470052004e-004 3.142070723697543e-003 2.649111207574606e-003 1.58833886962384e-003 1.547667197883129e-003
21.0310001373291 4.234999883919954e-003 5.094563821330667e-004 8.215457201004028e-004 3.189756069332361e-003 2.645698608830571e-003 1.556538976728916e-003 1.515797688625753e-003
22.08300018310547 4.159520845860243e-003 5.21336798556149e-004 7.7945546945557e-004 3.093914361670613e-003 2.504269825294614e-003 1.597914495505393e-003 1.550629152916372e-003
23.13399887084961 4.095097538083792e-003 5.284418002702296e-004 8.160762954503298e-004 3.164552384987474e-003 2.605574205517769e-003 1.5143376076594e-003 1.545534702017903e-003
24.18600082397461 4.190911073237658e-003 4.741653683595359e-004 8.253505802713335e-004 3.078178269788623e-003 2.457562601193786e-003 1.61718437448144e-003 1.502647297456861e-003
25.23799896240234 4.155758768320084e-003 4.477270995266736e-004 8.012137841433287e-004 3.119352972134948e-003 2.549331868067384e-003 1.551455701701343e-003 1.538307638838887e-003
26.28899955749512 4.055834375321865e-003 4.267746699042618e-004 8.247561054304242e-004 3.050019731745124e-003 2.364743268117309e-003 1.565523212775588e-003 1.418655156157911e-003
27.34099960327148 4.160813987255096e-003 4.637996316887438e-004 8.405701955780387e-004 3.15011665225029e-003 2.621341263875365e-003 1.558548538014293e-003 1.534871873445809e-003
28.39200019836426 4.123781807720661e-003 5.418366636149585e-004 8.308201213367283e-004 3.128936979919672e-003 2.427210099995136e-003 1.607372076250613e-003 1.475754892453551e-003
29.44400024414063 4.185620695352554e-003 4.987408174201846e-004 7.421225891448557e-004 3.080426249653101e-003 2.371448557823896e-003 1.567532890476286e-003 1.444243011064827e-003
30.49600028991699 4.092158749699593e-003 5.319360643625259e-004 8.368841372430325e-004 3.113200422376394e-003 2.385094529017806e-003 1.580300158821046e-003 1.433581346645951e-003
此文件由以下人员读取:
pd.options.display.float_format = '{:.4f}'.format
data = pd.read_csv(dateiname, sep='\t', names=['Time', '60Ni', '61Ni', '62Ni', '63Cu', '64Ni', '65Cu', '66Zn'], skiprows=6, nrows=30, index_col=False, dtype=float)
最好提供你的数据,但是 IIUC,使用 mask
来掩盖你的异常值 NaN
:
from scipy import stats
cols = list(df.drop(columns='Time').columns)
# or
# cols = ['60Ni', '61Ni', '62Ni', '63Cu', '64Ni', '65Cu', '66Zn']
df[cols] = df[cols].mask(np.abs(stats.zscore(df[cols])) >= 2)
或 where
from scipy import stats
cols = list(df.drop(columns='Time').columns)
# or
# cols = ['60Ni', '61Ni', '62Ni', '63Cu', '64Ni', '65Cu', '66Zn']
df[cols] = df[cols].where(np.abs(stats.zscore(df[cols])) < 2)
输出:
Time 60Ni 61Ni 62Ni 63Cu 64Ni 65Cu 66Zn
0 0.000000 0.004247 0.000455 0.000836 0.003215 0.003216 0.001596 0.001984
1 1.052000 0.004264 0.000517 0.000829 0.003155 0.003217 0.001623 0.001874
2 2.103000 0.004275 0.000480 0.000832 0.003212 0.003147 0.001623 0.001880
3 3.155000 0.004279 0.000483 0.000797 0.003219 0.003230 0.001604 0.001938
4 4.207000 0.004212 0.000442 0.000801 0.003243 0.003167 0.001591 0.001904
5 5.258000 0.004268 0.000513 0.000831 0.003144 0.003117 0.001604 0.001815
6 6.310000 0.004183 0.000505 0.000790 0.003131 0.003101 0.001570 0.001818
7 7.361000 0.004296 0.000491 0.000891 0.003204 0.003028 0.001533 0.001788
8 8.413000 0.004336 NaN 0.000863 NaN 0.002987 0.001608 0.001796
9 9.464000 0.004290 0.000484 0.000853 0.003223 0.003006 0.001584 0.001700
10 10.516000 0.004288 0.000550 NaN 0.003219 0.002950 NaN 0.001783
11 11.566999 0.004260 0.000467 0.000774 0.003194 0.002854 0.001569 0.001737
12 12.618999 0.004265 0.000501 0.000861 0.003218 0.002866 0.001595 0.001659
13 13.671000 0.004223 0.000465 0.000863 0.003159 0.002802 0.001629 0.001673
14 14.722000 0.004234 0.000488 0.000832 0.003106 0.002766 0.001579 0.001671
15 15.774000 0.004264 0.000527 0.000834 0.003151 0.002748 0.001522 0.001639
16 16.825001 0.004173 0.000515 0.000785 0.003132 0.002736 NaN 0.001647
17 17.876999 0.004210 0.000458 0.000798 0.003183 0.002714 0.001605 0.001607
18 18.929001 0.004215 0.000492 0.000850 0.003178 0.002589 0.001561 0.001608
19 19.980000 0.004172 0.000444 0.000845 0.003142 0.002649 0.001588 0.001548
20 21.031000 0.004235 0.000509 0.000822 0.003190 0.002646 0.001557 0.001516
21 22.083000 0.004160 0.000521 0.000779 0.003094 0.002504 0.001598 0.001551
22 23.133999 0.004095 0.000528 0.000816 0.003165 0.002606 0.001514 0.001546
23 24.186001 0.004191 0.000474 0.000825 0.003078 0.002458 0.001617 0.001503
24 25.237999 0.004156 0.000448 0.000801 0.003119 0.002549 0.001551 0.001538
25 26.289000 NaN 0.000427 0.000825 NaN 0.002365 0.001566 0.001419
26 27.341000 0.004161 0.000464 0.000841 0.003150 0.002621 0.001559 0.001535
27 28.392000 0.004124 0.000542 0.000831 0.003129 0.002427 0.001607 0.001476
28 29.444000 0.004186 0.000499 NaN 0.003080 0.002371 0.001568 0.001444
29 30.496000 0.004092 0.000532 0.000837 0.003113 0.002385 0.001580 0.001434
如果您需要用缺失值替换离群值,请使用DataFrame.mask
:
df = df.mask(np.abs(stats.zscore(df)) < 2)
#working for replace outlier by missing values
#df = df.mask(np.abs(stats.zscore(df)) < 2, np.nan)
I just want to get this single value deleted.
这是不可能的,我们只能像您的解决方案一样删除行。
这个异常值校正是一个模块内的函数,它被另一个函数调用,因为我有 ~30 个输入文件。
如果我使用@mozway 的解决方案执行此操作,那么我想分别将每个文件中每一列的平均值保存到一个文件中。但是 header(包含“文件名”、“时间”、“60Ni”、“61Ni”等)丢失了。我做错了什么?
这是离群值函数的完整代码:
import modules.config as conf
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from pathlib import Path
outfile = (conf.outdir)
infile = (conf.WorkspaceVariableInput)
def outlier_Cu_blk(file, append=True):
global outfile # get the path of the output folder
"""Create an output folder if it does not exist"""
try:
os.makedirs(outfile + '/blk')
except FileExistsError:
# directory already exists
pass
outfile_blk = outfile + '/blk' #open this folder
fullname = os.path.join(outfile_blk, 'Cu_export_blk.csv') # Export file
entries = Path(infile + '/Blk')
plot_name = Path(entries).stem
basename = os.path.basename(file)
"""Reading of the input files"""
pd.options.display.float_format = '{:.4f}'.format # Parameter for pandas
data = pd.read_csv(file, sep='\t', names=['Time', '60Ni', '61Ni', '62Ni', '63Cu', '64Ni', '65Cu', '66Zn'], skiprows=6, nrows=30, index_col=False, dtype=float) # reading of the textfile, 6 rows are skipped and 30 rows read. Index column is deactivated
"""Outlier correction"""
cols = list(data.drop(columns='Time').columns)
datao = pd.DataFrame({'Time':data['Time']})
datao[cols] = data[cols].where(np.abs(stats.zscore(data[cols])) < 2)
"""calculating the mean of the corrected data and save into a single file"""
datao.to_csv('Cu_export_blk.csv', sep='\t', header = True, index_label='Index_name')
mean_filtered_transposed = pd.DataFrame(data=np.mean(data)).T
mean_filtered_transposed['Time'] = pd.to_datetime(mean_filtered_transposed["Time"], unit='s')
clean = mean_filtered_transposed.drop(mean_filtered_transposed.columns[[0]], axis=1)
clean.insert(0, 'Inputfile', file)
print(mean_filtered_transposed)
print(clean)
if append:
clean.to_csv(fullname, sep=' ', mode="a", header=False, index_label='Index_name')
else:
clean.to_csv(fullname, sep=' ', mode="w", header=True, index_label='Index_name')
我知道这可能很乱,但我只是个初学者:-)
我有一个包含 30 行和 9 列的 DataFrame。我想去除 2 sigma 离群值。
我这样做:
from scipy import stats
df[(np.abs(stats.zscore(df)) < 2).all(axis=1)]
但是如果单个列中存在离群值,它会删除整行。我只想删除这个单一值。我怎样才能做到这一点?第一列包含时间。这永远不应该被触及。如何排除这一列?
数据是这样的:
Trace for Mass: 60Ni 61Ni 62Ni 63Cu 64Ni 65Cu 66Zn
Resolution: High High High High High High High
Time Intensity Intensity Intensity Intensity Intensity Intensity Intensity
[sec] [cps] [cps] [cps] [cps] [cps] [cps] [cps]
0. 4.246875178068876e-003 4.550645244307816e-004 8.364085806533694e-004 3.21496045216918e-003 3.215973265469074e-003 1.595904817804694e-003 1.983924303203821e-003
1.051999807357788 4.264393821358681e-003 5.171436932869256e-004 8.292743586935103e-004 3.154967911541462e-003 3.216561861336231e-003 1.622977200895548e-003 1.874359208159149e-003
2.102999925613403 4.27544629201293e-003 4.796394787263125e-004 8.318902109749615e-004 3.211528761312366e-003 3.147452371194959e-003 1.622740761376917e-003 1.879810937680304e-003
3.154999971389771 4.278738517314196e-003 4.829006502404809e-004 7.972901221364737e-004 3.218628698959947e-003 3.22998408228159e-003 1.604416524060071e-003 1.938240835443139e-003
4.206999778747559 4.211603198200464e-003 4.424861108418554e-004 8.007381693460047e-004 3.2428870908916e-003 3.166524693369865e-003 1.590821426361799e-003 1.903632888570428e-003
5.257999897003174 4.267803858965635e-003 5.1306706154719e-004 8.309389813803136e-004 3.144200425595045e-003 3.117314074188471e-003 1.603707205504179e-003 1.815222087316215e-003
6.309999942779541 4.182798787951469e-003 5.052632768638432e-004 7.896805764175952e-004 3.130593337118626e-003 3.10095027089119e-003 1.570251770317555e-003 1.817710697650909e-003
7.361000061035156 4.296375438570976e-003 4.910536226816475e-004 8.9122453937307e-004 3.204192267730832e-003 3.028199542313814e-003 1.533132861368358e-003 1.788084045983851e-003
8.413000106811523 4.335530567914248e-003 6.025235052220523e-004 8.631621603854001e-004 3.268211148679256e-003 2.987353131175041e-003 1.608435995876789e-003 1.796260941773653e-003
9.463999748229981 4.290143493562937e-003 4.839488829020411e-004 8.525795419700444e-004 3.222533734515309e-003 3.005951410159469e-003 1.583610195666552e-003 1.700276043266058e-003
10.51599979400635 4.287909716367722e-003 5.497571546584368e-004 9.083477198146284e-004 3.219338599592447e-003 2.950039459392428e-003 1.682562520727515e-003 1.783343963325024e-003
11.56699943542481 4.260278772562742e-003 4.665948799811304e-004 7.738673011772335e-004 3.193542594090104e-003 2.853760728612542e-003 1.568833249621093e-003 1.736654434353113e-003
12.61899948120117 4.26474679261446e-003 5.00720867421478e-004 8.611407829448581e-004 3.217800287529826e-003 2.865647897124291e-003 1.595077337697148e-003 1.658685388974845e-003
13.67099952697754 4.222772549837828e-003 4.647313617169857e-004 8.633999968878925e-004 3.159464336931706e-003 2.801976399496198e-003 1.629361184313893e-003 1.673259655945003e-003
14.72200012207031 4.23405971378088e-003 4.880253691226244e-004 8.320091292262077e-004 3.10550956055522e-003 2.766199875622988e-003 1.57923623919487e-003 1.671363832429051e-003
15.77400016784668 4.263806156814098e-003 5.268111126497388e-004 8.335548918694258e-004 3.150589996948838e-003 2.747958991676569e-003 1.52225757483393e-003 1.638660905882716e-003
16.82500076293945 4.173276014626026e-003 5.153965321369469e-004 7.848058012314141e-004 3.132368205115199e-003 2.736426191404462e-003 1.501098275184631e-003 1.646955031901598e-003
17.87699890136719 4.209604579955339e-003 4.582091642078012e-004 7.977656787261367e-004 3.183129709213972e-003 2.714420203119516e-003 1.604771241545677e-003 1.606788486242294e-003
18.92900085449219 4.214542452245951e-003 4.919854109175503e-004 8.5032032802701e-004 3.177686594426632e-003 2.588512841612101e-003 1.560558215714991e-003 1.607973361387849e-003
19.97999954223633 4.171629901975393e-003 4.438837058842182e-004 8.449696470052004e-004 3.142070723697543e-003 2.649111207574606e-003 1.58833886962384e-003 1.547667197883129e-003
21.0310001373291 4.234999883919954e-003 5.094563821330667e-004 8.215457201004028e-004 3.189756069332361e-003 2.645698608830571e-003 1.556538976728916e-003 1.515797688625753e-003
22.08300018310547 4.159520845860243e-003 5.21336798556149e-004 7.7945546945557e-004 3.093914361670613e-003 2.504269825294614e-003 1.597914495505393e-003 1.550629152916372e-003
23.13399887084961 4.095097538083792e-003 5.284418002702296e-004 8.160762954503298e-004 3.164552384987474e-003 2.605574205517769e-003 1.5143376076594e-003 1.545534702017903e-003
24.18600082397461 4.190911073237658e-003 4.741653683595359e-004 8.253505802713335e-004 3.078178269788623e-003 2.457562601193786e-003 1.61718437448144e-003 1.502647297456861e-003
25.23799896240234 4.155758768320084e-003 4.477270995266736e-004 8.012137841433287e-004 3.119352972134948e-003 2.549331868067384e-003 1.551455701701343e-003 1.538307638838887e-003
26.28899955749512 4.055834375321865e-003 4.267746699042618e-004 8.247561054304242e-004 3.050019731745124e-003 2.364743268117309e-003 1.565523212775588e-003 1.418655156157911e-003
27.34099960327148 4.160813987255096e-003 4.637996316887438e-004 8.405701955780387e-004 3.15011665225029e-003 2.621341263875365e-003 1.558548538014293e-003 1.534871873445809e-003
28.39200019836426 4.123781807720661e-003 5.418366636149585e-004 8.308201213367283e-004 3.128936979919672e-003 2.427210099995136e-003 1.607372076250613e-003 1.475754892453551e-003
29.44400024414063 4.185620695352554e-003 4.987408174201846e-004 7.421225891448557e-004 3.080426249653101e-003 2.371448557823896e-003 1.567532890476286e-003 1.444243011064827e-003
30.49600028991699 4.092158749699593e-003 5.319360643625259e-004 8.368841372430325e-004 3.113200422376394e-003 2.385094529017806e-003 1.580300158821046e-003 1.433581346645951e-003
此文件由以下人员读取:
pd.options.display.float_format = '{:.4f}'.format
data = pd.read_csv(dateiname, sep='\t', names=['Time', '60Ni', '61Ni', '62Ni', '63Cu', '64Ni', '65Cu', '66Zn'], skiprows=6, nrows=30, index_col=False, dtype=float)
最好提供你的数据,但是 IIUC,使用 mask
来掩盖你的异常值 NaN
:
from scipy import stats
cols = list(df.drop(columns='Time').columns)
# or
# cols = ['60Ni', '61Ni', '62Ni', '63Cu', '64Ni', '65Cu', '66Zn']
df[cols] = df[cols].mask(np.abs(stats.zscore(df[cols])) >= 2)
或 where
from scipy import stats
cols = list(df.drop(columns='Time').columns)
# or
# cols = ['60Ni', '61Ni', '62Ni', '63Cu', '64Ni', '65Cu', '66Zn']
df[cols] = df[cols].where(np.abs(stats.zscore(df[cols])) < 2)
输出:
Time 60Ni 61Ni 62Ni 63Cu 64Ni 65Cu 66Zn
0 0.000000 0.004247 0.000455 0.000836 0.003215 0.003216 0.001596 0.001984
1 1.052000 0.004264 0.000517 0.000829 0.003155 0.003217 0.001623 0.001874
2 2.103000 0.004275 0.000480 0.000832 0.003212 0.003147 0.001623 0.001880
3 3.155000 0.004279 0.000483 0.000797 0.003219 0.003230 0.001604 0.001938
4 4.207000 0.004212 0.000442 0.000801 0.003243 0.003167 0.001591 0.001904
5 5.258000 0.004268 0.000513 0.000831 0.003144 0.003117 0.001604 0.001815
6 6.310000 0.004183 0.000505 0.000790 0.003131 0.003101 0.001570 0.001818
7 7.361000 0.004296 0.000491 0.000891 0.003204 0.003028 0.001533 0.001788
8 8.413000 0.004336 NaN 0.000863 NaN 0.002987 0.001608 0.001796
9 9.464000 0.004290 0.000484 0.000853 0.003223 0.003006 0.001584 0.001700
10 10.516000 0.004288 0.000550 NaN 0.003219 0.002950 NaN 0.001783
11 11.566999 0.004260 0.000467 0.000774 0.003194 0.002854 0.001569 0.001737
12 12.618999 0.004265 0.000501 0.000861 0.003218 0.002866 0.001595 0.001659
13 13.671000 0.004223 0.000465 0.000863 0.003159 0.002802 0.001629 0.001673
14 14.722000 0.004234 0.000488 0.000832 0.003106 0.002766 0.001579 0.001671
15 15.774000 0.004264 0.000527 0.000834 0.003151 0.002748 0.001522 0.001639
16 16.825001 0.004173 0.000515 0.000785 0.003132 0.002736 NaN 0.001647
17 17.876999 0.004210 0.000458 0.000798 0.003183 0.002714 0.001605 0.001607
18 18.929001 0.004215 0.000492 0.000850 0.003178 0.002589 0.001561 0.001608
19 19.980000 0.004172 0.000444 0.000845 0.003142 0.002649 0.001588 0.001548
20 21.031000 0.004235 0.000509 0.000822 0.003190 0.002646 0.001557 0.001516
21 22.083000 0.004160 0.000521 0.000779 0.003094 0.002504 0.001598 0.001551
22 23.133999 0.004095 0.000528 0.000816 0.003165 0.002606 0.001514 0.001546
23 24.186001 0.004191 0.000474 0.000825 0.003078 0.002458 0.001617 0.001503
24 25.237999 0.004156 0.000448 0.000801 0.003119 0.002549 0.001551 0.001538
25 26.289000 NaN 0.000427 0.000825 NaN 0.002365 0.001566 0.001419
26 27.341000 0.004161 0.000464 0.000841 0.003150 0.002621 0.001559 0.001535
27 28.392000 0.004124 0.000542 0.000831 0.003129 0.002427 0.001607 0.001476
28 29.444000 0.004186 0.000499 NaN 0.003080 0.002371 0.001568 0.001444
29 30.496000 0.004092 0.000532 0.000837 0.003113 0.002385 0.001580 0.001434
如果您需要用缺失值替换离群值,请使用DataFrame.mask
:
df = df.mask(np.abs(stats.zscore(df)) < 2)
#working for replace outlier by missing values
#df = df.mask(np.abs(stats.zscore(df)) < 2, np.nan)
I just want to get this single value deleted.
这是不可能的,我们只能像您的解决方案一样删除行。
这个异常值校正是一个模块内的函数,它被另一个函数调用,因为我有 ~30 个输入文件。
如果我使用@mozway 的解决方案执行此操作,那么我想分别将每个文件中每一列的平均值保存到一个文件中。但是 header(包含“文件名”、“时间”、“60Ni”、“61Ni”等)丢失了。我做错了什么?
这是离群值函数的完整代码:
import modules.config as conf
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from pathlib import Path
outfile = (conf.outdir)
infile = (conf.WorkspaceVariableInput)
def outlier_Cu_blk(file, append=True):
global outfile # get the path of the output folder
"""Create an output folder if it does not exist"""
try:
os.makedirs(outfile + '/blk')
except FileExistsError:
# directory already exists
pass
outfile_blk = outfile + '/blk' #open this folder
fullname = os.path.join(outfile_blk, 'Cu_export_blk.csv') # Export file
entries = Path(infile + '/Blk')
plot_name = Path(entries).stem
basename = os.path.basename(file)
"""Reading of the input files"""
pd.options.display.float_format = '{:.4f}'.format # Parameter for pandas
data = pd.read_csv(file, sep='\t', names=['Time', '60Ni', '61Ni', '62Ni', '63Cu', '64Ni', '65Cu', '66Zn'], skiprows=6, nrows=30, index_col=False, dtype=float) # reading of the textfile, 6 rows are skipped and 30 rows read. Index column is deactivated
"""Outlier correction"""
cols = list(data.drop(columns='Time').columns)
datao = pd.DataFrame({'Time':data['Time']})
datao[cols] = data[cols].where(np.abs(stats.zscore(data[cols])) < 2)
"""calculating the mean of the corrected data and save into a single file"""
datao.to_csv('Cu_export_blk.csv', sep='\t', header = True, index_label='Index_name')
mean_filtered_transposed = pd.DataFrame(data=np.mean(data)).T
mean_filtered_transposed['Time'] = pd.to_datetime(mean_filtered_transposed["Time"], unit='s')
clean = mean_filtered_transposed.drop(mean_filtered_transposed.columns[[0]], axis=1)
clean.insert(0, 'Inputfile', file)
print(mean_filtered_transposed)
print(clean)
if append:
clean.to_csv(fullname, sep=' ', mode="a", header=False, index_label='Index_name')
else:
clean.to_csv(fullname, sep=' ', mode="w", header=True, index_label='Index_name')
我知道这可能很乱,但我只是个初学者:-)