使用 Python 删除异常值

Question

我有一个包含 30 行和 9 列的 DataFrame。我想去除 2 sigma 离群值。

我这样做：

from scipy import stats

# True where a cell lies within 2 standard deviations of its column mean.
within_2_sigma = np.abs(stats.zscore(df)) < 2
# Boolean row selection: a row survives only if all of its cells pass.
df[within_2_sigma.all(axis=1)]

但是如果单个列中存在离群值，它会删除整行。我只想删除这个单一值。我怎样才能做到这一点？第一列包含时间。这永远不应该被触及。如何排除这一列？

数据是这样的：

Trace for Mass: 60Ni    61Ni    62Ni    63Cu    64Ni    65Cu    66Zn
Resolution: High    High    High    High    High    High    High
                            
Time    Intensity   Intensity   Intensity   Intensity   Intensity   Intensity   Intensity
[sec]   [cps]   [cps]   [cps]   [cps]   [cps]   [cps]   [cps]

0.  4.246875178068876e-003  4.550645244307816e-004  8.364085806533694e-004  3.21496045216918e-003   3.215973265469074e-003  1.595904817804694e-003  1.983924303203821e-003  
1.051999807357788   4.264393821358681e-003  5.171436932869256e-004  8.292743586935103e-004  3.154967911541462e-003  3.216561861336231e-003  1.622977200895548e-003  1.874359208159149e-003  
2.102999925613403   4.27544629201293e-003   4.796394787263125e-004  8.318902109749615e-004  3.211528761312366e-003  3.147452371194959e-003  1.622740761376917e-003  1.879810937680304e-003  
3.154999971389771   4.278738517314196e-003  4.829006502404809e-004  7.972901221364737e-004  3.218628698959947e-003  3.22998408228159e-003   1.604416524060071e-003  1.938240835443139e-003  
4.206999778747559   4.211603198200464e-003  4.424861108418554e-004  8.007381693460047e-004  3.2428870908916e-003    3.166524693369865e-003  1.590821426361799e-003  1.903632888570428e-003  
5.257999897003174   4.267803858965635e-003  5.1306706154719e-004    8.309389813803136e-004  3.144200425595045e-003  3.117314074188471e-003  1.603707205504179e-003  1.815222087316215e-003  
6.309999942779541   4.182798787951469e-003  5.052632768638432e-004  7.896805764175952e-004  3.130593337118626e-003  3.10095027089119e-003   1.570251770317555e-003  1.817710697650909e-003  
7.361000061035156   4.296375438570976e-003  4.910536226816475e-004  8.9122453937307e-004    3.204192267730832e-003  3.028199542313814e-003  1.533132861368358e-003  1.788084045983851e-003  
8.413000106811523   4.335530567914248e-003  6.025235052220523e-004  8.631621603854001e-004  3.268211148679256e-003  2.987353131175041e-003  1.608435995876789e-003  1.796260941773653e-003  
9.463999748229981   4.290143493562937e-003  4.839488829020411e-004  8.525795419700444e-004  3.222533734515309e-003  3.005951410159469e-003  1.583610195666552e-003  1.700276043266058e-003  
10.51599979400635   4.287909716367722e-003  5.497571546584368e-004  9.083477198146284e-004  3.219338599592447e-003  2.950039459392428e-003  1.682562520727515e-003  1.783343963325024e-003  
11.56699943542481   4.260278772562742e-003  4.665948799811304e-004  7.738673011772335e-004  3.193542594090104e-003  2.853760728612542e-003  1.568833249621093e-003  1.736654434353113e-003  
12.61899948120117   4.26474679261446e-003   5.00720867421478e-004   8.611407829448581e-004  3.217800287529826e-003  2.865647897124291e-003  1.595077337697148e-003  1.658685388974845e-003  
13.67099952697754   4.222772549837828e-003  4.647313617169857e-004  8.633999968878925e-004  3.159464336931706e-003  2.801976399496198e-003  1.629361184313893e-003  1.673259655945003e-003  
14.72200012207031   4.23405971378088e-003   4.880253691226244e-004  8.320091292262077e-004  3.10550956055522e-003   2.766199875622988e-003  1.57923623919487e-003   1.671363832429051e-003  
15.77400016784668   4.263806156814098e-003  5.268111126497388e-004  8.335548918694258e-004  3.150589996948838e-003  2.747958991676569e-003  1.52225757483393e-003   1.638660905882716e-003  
16.82500076293945   4.173276014626026e-003  5.153965321369469e-004  7.848058012314141e-004  3.132368205115199e-003  2.736426191404462e-003  1.501098275184631e-003  1.646955031901598e-003  
17.87699890136719   4.209604579955339e-003  4.582091642078012e-004  7.977656787261367e-004  3.183129709213972e-003  2.714420203119516e-003  1.604771241545677e-003  1.606788486242294e-003  
18.92900085449219   4.214542452245951e-003  4.919854109175503e-004  8.5032032802701e-004    3.177686594426632e-003  2.588512841612101e-003  1.560558215714991e-003  1.607973361387849e-003  
19.97999954223633   4.171629901975393e-003  4.438837058842182e-004  8.449696470052004e-004  3.142070723697543e-003  2.649111207574606e-003  1.58833886962384e-003   1.547667197883129e-003  
21.0310001373291    4.234999883919954e-003  5.094563821330667e-004  8.215457201004028e-004  3.189756069332361e-003  2.645698608830571e-003  1.556538976728916e-003  1.515797688625753e-003  
22.08300018310547   4.159520845860243e-003  5.21336798556149e-004   7.7945546945557e-004    3.093914361670613e-003  2.504269825294614e-003  1.597914495505393e-003  1.550629152916372e-003  
23.13399887084961   4.095097538083792e-003  5.284418002702296e-004  8.160762954503298e-004  3.164552384987474e-003  2.605574205517769e-003  1.5143376076594e-003    1.545534702017903e-003  
24.18600082397461   4.190911073237658e-003  4.741653683595359e-004  8.253505802713335e-004  3.078178269788623e-003  2.457562601193786e-003  1.61718437448144e-003   1.502647297456861e-003  
25.23799896240234   4.155758768320084e-003  4.477270995266736e-004  8.012137841433287e-004  3.119352972134948e-003  2.549331868067384e-003  1.551455701701343e-003  1.538307638838887e-003  
26.28899955749512   4.055834375321865e-003  4.267746699042618e-004  8.247561054304242e-004  3.050019731745124e-003  2.364743268117309e-003  1.565523212775588e-003  1.418655156157911e-003  
27.34099960327148   4.160813987255096e-003  4.637996316887438e-004  8.405701955780387e-004  3.15011665225029e-003   2.621341263875365e-003  1.558548538014293e-003  1.534871873445809e-003  
28.39200019836426   4.123781807720661e-003  5.418366636149585e-004  8.308201213367283e-004  3.128936979919672e-003  2.427210099995136e-003  1.607372076250613e-003  1.475754892453551e-003  
29.44400024414063   4.185620695352554e-003  4.987408174201846e-004  7.421225891448557e-004  3.080426249653101e-003  2.371448557823896e-003  1.567532890476286e-003  1.444243011064827e-003  
30.49600028991699   4.092158749699593e-003  5.319360643625259e-004  8.368841372430325e-004  3.113200422376394e-003  2.385094529017806e-003  1.580300158821046e-003  1.433581346645951e-003

该文件通过以下代码读取：

# Print floats with four decimals whenever pandas displays a frame.
pd.options.display.float_format = '{:.4f}'.format

# Skip the 6 header lines, then read 30 tab-separated rows as floats.
column_names = ['Time', '60Ni', '61Ni', '62Ni', '63Cu', '64Ni', '65Cu', '66Zn']
data = pd.read_csv(
    dateiname,
    sep='\t',
    names=column_names,
    skiprows=6,
    nrows=30,
    index_col=False,
    dtype=float,
)

Answer 1

最好能提供你的数据；不过如果我理解正确（IIUC），可以使用 mask 将异常值替换为 NaN：

from scipy import stats

# Every column except 'Time' takes part in the outlier test.
# The columns can also be listed by hand:
# cols = ['60Ni', '61Ni', '62Ni', '63Cu', '64Ni', '65Cu', '66Zn']
cols = df.drop(columns='Time').columns.tolist()

# mask() puts NaN where the condition is True, so only the outlying
# cells (|z| >= 2) are blanked and all rows are kept.
abs_z = np.abs(stats.zscore(df[cols]))
df[cols] = df[cols].mask(abs_z >= 2)

或者使用 where（条件取反）：

from scipy import stats

# Every column except 'Time' takes part in the outlier test.
# The columns can also be listed by hand:
# cols = ['60Ni', '61Ni', '62Ni', '63Cu', '64Ni', '65Cu', '66Zn']
cols = df.drop(columns='Time').columns.tolist()

# where() keeps the cells for which the condition is True and puts NaN
# everywhere else, so the test is the complement of the mask() variant.
is_inlier = np.abs(stats.zscore(df[cols])) < 2
df[cols] = df[cols].where(is_inlier)

输出：

         Time      60Ni      61Ni      62Ni      63Cu      64Ni      65Cu      66Zn
0    0.000000  0.004247  0.000455  0.000836  0.003215  0.003216  0.001596  0.001984
1    1.052000  0.004264  0.000517  0.000829  0.003155  0.003217  0.001623  0.001874
2    2.103000  0.004275  0.000480  0.000832  0.003212  0.003147  0.001623  0.001880
3    3.155000  0.004279  0.000483  0.000797  0.003219  0.003230  0.001604  0.001938
4    4.207000  0.004212  0.000442  0.000801  0.003243  0.003167  0.001591  0.001904
5    5.258000  0.004268  0.000513  0.000831  0.003144  0.003117  0.001604  0.001815
6    6.310000  0.004183  0.000505  0.000790  0.003131  0.003101  0.001570  0.001818
7    7.361000  0.004296  0.000491  0.000891  0.003204  0.003028  0.001533  0.001788
8    8.413000  0.004336       NaN  0.000863       NaN  0.002987  0.001608  0.001796
9    9.464000  0.004290  0.000484  0.000853  0.003223  0.003006  0.001584  0.001700
10  10.516000  0.004288  0.000550       NaN  0.003219  0.002950       NaN  0.001783
11  11.566999  0.004260  0.000467  0.000774  0.003194  0.002854  0.001569  0.001737
12  12.618999  0.004265  0.000501  0.000861  0.003218  0.002866  0.001595  0.001659
13  13.671000  0.004223  0.000465  0.000863  0.003159  0.002802  0.001629  0.001673
14  14.722000  0.004234  0.000488  0.000832  0.003106  0.002766  0.001579  0.001671
15  15.774000  0.004264  0.000527  0.000834  0.003151  0.002748  0.001522  0.001639
16  16.825001  0.004173  0.000515  0.000785  0.003132  0.002736       NaN  0.001647
17  17.876999  0.004210  0.000458  0.000798  0.003183  0.002714  0.001605  0.001607
18  18.929001  0.004215  0.000492  0.000850  0.003178  0.002589  0.001561  0.001608
19  19.980000  0.004172  0.000444  0.000845  0.003142  0.002649  0.001588  0.001548
20  21.031000  0.004235  0.000509  0.000822  0.003190  0.002646  0.001557  0.001516
21  22.083000  0.004160  0.000521  0.000779  0.003094  0.002504  0.001598  0.001551
22  23.133999  0.004095  0.000528  0.000816  0.003165  0.002606  0.001514  0.001546
23  24.186001  0.004191  0.000474  0.000825  0.003078  0.002458  0.001617  0.001503
24  25.237999  0.004156  0.000448  0.000801  0.003119  0.002549  0.001551  0.001538
25  26.289000       NaN  0.000427  0.000825       NaN  0.002365  0.001566  0.001419
26  27.341000  0.004161  0.000464  0.000841  0.003150  0.002621  0.001559  0.001535
27  28.392000  0.004124  0.000542  0.000831  0.003129  0.002427  0.001607  0.001476
28  29.444000  0.004186  0.000499       NaN  0.003080  0.002371  0.001568  0.001444
29  30.496000  0.004092  0.000532  0.000837  0.003113  0.002385  0.001580  0.001434

Answer 2

如果您需要用缺失值替换离群值，请使用DataFrame.mask:

# mask() replaces the cells where the condition is True with NaN, so the
# condition has to select the outliers (|z| >= 2), not the inliers.
df = df.mask(np.abs(stats.zscore(df)) >= 2)

# Same result with the replacement value spelled out explicitly:
#df = df.mask(np.abs(stats.zscore(df)) >= 2, np.nan)

> 我只想删除这个单一值。

这是不可能的：DataFrame 是矩形结构，无法只“删除”单个单元格；只能像上面那样把该值替换为缺失值（NaN），或者像您的解决方案一样删除整行。

Answer 3

这个异常值校正是一个模块内的函数，它被另一个函数调用，因为我有 ~30 个输入文件。

如果我使用@mozway 的解决方案执行此操作，那么我想分别将每个文件中每一列的平均值保存到一个文件中。但是 header（包含“文件名”、“时间”、“60Ni”、“61Ni”等）丢失了。我做错了什么？

这是离群值函数的完整代码：

import modules.config as conf
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from pathlib import Path

# Paths taken from the imported `conf` module; read by outlier_Cu_blk below.
outfile = conf.outdir
infile = conf.WorkspaceVariableInput

def outlier_Cu_blk(file, append=True):
    """Mask 2-sigma outliers in one input file and export the column means.

    The file is read as tab-separated text: 6 header lines are skipped and
    30 data rows are read into the columns ``Time`` plus seven intensity
    traces. In every intensity column, values whose absolute z-score is 2
    or more are replaced by NaN; ``Time`` is never modified. The mean of
    each corrected column (NaN cells are skipped) is written as a single
    row to ``<outfile>/blk/Cu_export_blk.csv``, where ``outfile`` is the
    module-level output directory.

    Side effects: sets ``pd.options.display.float_format``, writes the
    corrected table to ``Cu_export_blk.csv`` in the current working
    directory, and prints the computed means.

    Args:
        file: Path of the input text file. It is also stored in the
            ``Inputfile`` column of the exported row.
        append: If true (default), append the row to the export file and
            write the header line only when the file does not exist yet
            or is empty. If false, overwrite the export file and always
            write the header.
    """
    # Create the output folder if needed; exist_ok avoids a try/except.
    outfile_blk = os.path.join(outfile, 'blk')
    os.makedirs(outfile_blk, exist_ok=True)
    fullname = os.path.join(outfile_blk, 'Cu_export_blk.csv')  # export file

    # Display option only: affects how frames are printed, not the CSV output.
    pd.options.display.float_format = '{:.4f}'.format

    # Skip the 6 header lines and read the 30 data rows as floats.
    data = pd.read_csv(
        file,
        sep='\t',
        names=['Time', '60Ni', '61Ni', '62Ni', '63Cu', '64Ni', '65Cu', '66Zn'],
        skiprows=6,
        nrows=30,
        index_col=False,
        dtype=float,
    )

    # Outlier correction: keep cells with |z| < 2, blank the others to NaN.
    # 'Time' is excluded so it is never touched.
    cols = [name for name in data.columns if name != 'Time']
    within_2_sigma = np.abs(stats.zscore(data[cols].to_numpy())) < 2
    datao = data.copy()
    datao[cols] = data[cols].where(within_2_sigma)

    # NOTE(review): this writes to the current working directory and is
    # overwritten on every call - confirm that this is intended.
    datao.to_csv('Cu_export_blk.csv', sep='\t', header=True, index_label='Index_name')

    # Mean of the *corrected* data. DataFrame.mean() returns one value per
    # column on every pandas version, unlike np.mean() on a DataFrame.
    mean_filtered_transposed = datao.mean().to_frame().T
    mean_filtered_transposed['Time'] = pd.to_datetime(mean_filtered_transposed['Time'], unit='s')
    clean = mean_filtered_transposed.drop(columns='Time')
    clean.insert(0, 'Inputfile', file)
    print(mean_filtered_transposed)
    print(clean)

    # In append mode the header must still be written once, when the export
    # file is first created; otherwise the column names are lost.
    file_has_content = os.path.exists(fullname) and os.path.getsize(fullname) > 0
    write_header = not (append and file_has_content)
    clean.to_csv(
        fullname,
        sep=' ',
        mode='a' if append else 'w',
        header=write_header,
        index_label='Index_name',
    )

我知道这可能很乱，但我只是个初学者:-)

使用 Python 删除异常值

Remove outlier with Python

python

numpy

scipy

pandas