如何根据 python 中不同文件的变量名添加值
How to add values based on variable name from different files in python
我有两个不同的文件:第一个文件包含 'InstanceType' 列以及其他列;第二个文件也包含同名的 'InstanceType' 列,以及基于分类的其他列。我想以 'InstanceType' 为键,把第二个文件中的数据作为新列添加到第一个文件的数据中。
这是我的代码:
# Load the Excel workbook from in-memory bytes into a DataFrame.
# NOTE(review): `uploaded`, `pd` and `io` are defined outside this snippet; `uploaded` is presumably a filename -> bytes mapping — confirm.
df=pd.read_excel(io.BytesIO(uploaded['spot-prices-2021-05-16.xlsx']))
# Bare expression: only displays the DataFrame in an interactive/notebook session.
df
输入:
AvailabilityZone InstanceType ProductDescription SpotPrice ymd_hms(Timestamp)
0 us-east-1f r5a.4xlarge Windows 1.0210 2021-05-16 21:14:12
1 us-east-1c r5a.4xlarge Windows 1.0210 2021-05-16 21:14:12
2 us-east-1b r5a.4xlarge Windows 1.0210 2021-05-16 21:14:12
3 us-east-1a r5a.4xlarge Windows 1.0210 2021-05-16 21:14:12
4 us-east-1d p3.8xlarge Red Hat Enterprise Linux 3.8020 2021-05-16 21:14:07
第一个文件 csv
第二个文件输入
InstanceType vCPU GPUs Memory (GiB) Baseline Performance / vCPU CPU Credits Earned / Hr GPU Mem (Gib)
0 t4g.nano 2 NaN 0.5 0.05 6.0 NaN
1 t4g.micro 2 NaN 1.0 0.10 12.0 NaN
2 t4g.small 2 NaN 2.0 0.20 24.0 NaN
3 t4g.medium 2 NaN 4.0 0.20 24.0 NaN
4 t4g.large 2 NaN 8.0 0.30 36.0 NaN
# Outer-join df and df2 on their shared 'InstanceType' column; str('outer') evaluates to the plain string 'outer'.
df3=pd.merge(df,df2, on='InstanceType', how= str('outer'))
输出:
AvailabilityZone InstanceType ProductDescription SpotPrice ymd_hms(Timestamp) vCPU GPUs Memory (GiB) Baseline Performance / vCPU CPU Credits Earned / Hr GPU Mem (Gib)
0 us-east-1f r5a.4xlarge Windows 1.0210 2021-05-16 21:14:12 NaN NaN NaN NaN NaN
预期结果是 - 它应该根据 'InstanceType' 获取所有值并显示在每一行中。
提前致谢。
你的数据有问题:你放在评论中的 CSV 文件是用分号(;)分隔的,而不是默认的逗号(,)。
另外,建议使用 pandas.read_csv;即使你的数据是 Excel 工作表,也可以先用 to_csv 方法将其转换成 CSV,然后再作为 CSV 读取。
我假设您已经读取了这两个 CSV 文件,所以这是我的解决方案:
import pandas as pd

# The spot-price export is semicolon-delimited; the instance table uses the default comma.
df1 = pd.read_csv('spot-prices-2021-05-16.csv', delimiter=';')
df2 = pd.read_csv('InstanceAWS.csv')
# Outer join on the shared 'InstanceType' column keeps rows from both files.
df_merged = pd.merge(df1, df2, on='InstanceType', how='outer')
# head is a method and must be called; printing the bare attribute shows the
# bound-method repr instead of the first rows of the merged frame.
print(df_merged.head())
我已经实现了你想要的效果。您可以将输出文件导入 pandas,它将准确显示您想要的内容。首先生成“combined.csv”,然后运行导入代码。
这是要导入的代码:
import pandas as pd
# Load the file 'combined.csv' from the current working directory into a DataFrame.
df = pd.read_csv('combined.csv')
# Print the DataFrame to stdout.
print(df)
这里是生成“combined.csv”的代码。运行大约需要 30 秒。请确保两个源 CSV 文件与代码位于同一文件夹中。
import re
import os
from contextlib import redirect_stdout
import io
# Matches every run of characters outside the whitelist: ASCII letters, digits,
# '_', ':', '.' and '-'. Written as a raw string so that the escaped hyphen is
# not an invalid string escape, and compiled once because the function is
# called for every cell of both CSV files.
_NOT_WHITELISTED = re.compile(r'[^a-zA-Z0-9_:.\-]+')


def enforceWhitelist(temp5):
    """Return *temp5* with every non-whitelisted character removed.

    Only ASCII letters, digits, ``_``, ``:``, ``.`` and ``-`` are kept;
    everything else (spaces, quotes, separators, newlines, non-ASCII
    characters) is dropped.
    """
    return _NOT_WHITELISTED.sub('', temp5)
def getInfo(temp9):
    """Return the row of the 2nd (sub) csv that contains *temp9*, as a string.

    The module-level ``listnamesY`` holds the sub csv column-wise (one list of
    cell strings per column). Rows are scanned in order and the first row with
    a cell exactly equal to *temp9* is returned with every cell followed by a
    comma, e.g. ``"t3.large,2,8,"``.

    Cells are compared for equality rather than by substring search on the
    joined row, so a partial key (or an empty string) no longer matches an
    unrelated row.

    When no row matches, the result is *temp9* followed by one ``NOT_FOUND``
    placeholder for each remaining column, again comma-terminated. An empty
    ``listnamesY`` yields ``temp9 + ","`` instead of raising IndexError.
    """
    # zip(*columns) transposes the column lists into row tuples.
    for row in zip(*listnamesY):
        if temp9 in row:  # tuple membership: exact cell match
            return "".join(cell + "," for cell in row)
    return temp9 + "," + "NOT_FOUND," * (len(listnamesY) - 1)
# Read both source files as raw lines. Blank lines are skipped because the
# per-column indexing below would raise IndexError on them (e.g. a trailing
# empty line at the end of a file).
with open("spot-prices-2021-05-16.csv") as main_in:
    tempmain = [line for line in main_in if line.strip()]  # lines of the 1st (main) csv
with open("InstanceAWS.csv") as sub_in:
    tempsub = [line for line in sub_in if line.strip()]  # lines of the 2nd (sub) csv


def _load_columns(lines, delimiter, suffix):
    """Split raw csv *lines* into (whitelisted column names, column data).

    The first line is the header; *suffix* is appended to each header cell
    before whitelisting. Column data is a list with one list of whitelisted
    cell strings per header column. Columns are kept in plain lists rather
    than in exec()-created variables, so header text is never executed and
    headers that are not valid identifiers no longer crash the script.
    """
    header = lines[0].split(delimiter)
    names = [enforceWhitelist(cell + suffix) for cell in header]
    rows = [line.split(delimiter) for line in lines[1:]]
    columns = [[enforceWhitelist(row[idx]) for row in rows]
               for idx in range(len(header))]
    return names, columns


# listnames1 / listnames2: column names of the main / sub csv (with "_1" / "_2" suffix).
# listnamesX / listnamesY: column data of the main / sub csv; getInfo() reads listnamesY.
listnames1, listnamesX = _load_columns(tempmain, ";", "_1")
listnames2, listnamesY = _load_columns(tempsub, ",", "_2")

# Header line: the main csv's columns, with the InstanceType column replaced
# by all columns of the sub csv. name[:-2] strips the "_1" / "_2" suffix.
header_cells = []
for name in listnames1:
    if name[:-2] == "InstanceType":
        header_cells.extend(sub_name[:-2] for sub_name in listnames2)
    else:
        header_cells.append(name[:-2])
# The first column of the main csv has no name, so "No." is prefixed to the header.
out_lines = ["No." + ",".join(header_cells)]

# Data lines: every main-csv row, with its InstanceType cell expanded into the
# matching sub-csv row (or NOT_FOUND placeholders) by getInfo().
for row in zip(*listnamesX):
    parts = []
    for name, value in zip(listnames1, row):
        if name[:-2] == "InstanceType":
            parts.append(getInfo(value))  # already comma-terminated
        else:
            parts.append(value + ",")
    out_lines.append("".join(parts)[:-1])  # drop the trailing comma

try:
    os.remove("combined.csv")  # delete the old file to see the latest generation
except OSError:
    # Best effort only: opening with "w" below truncates an existing file anyway.
    pass
with open("combined.csv", "w") as f2:
    f2.writelines(line + "\n" for line in out_lines)  # write to output csv file
我有两个不同的文件:第一个文件包含 'InstanceType' 列以及其他列;第二个文件也包含同名的 'InstanceType' 列,以及基于分类的其他列。我想以 'InstanceType' 为键,把第二个文件中的数据作为新列添加到第一个文件的数据中。这是我的代码:
# Load the Excel workbook from in-memory bytes into a DataFrame.
# NOTE(review): `uploaded`, `pd` and `io` are defined outside this snippet; `uploaded` is presumably a filename -> bytes mapping — confirm.
df=pd.read_excel(io.BytesIO(uploaded['spot-prices-2021-05-16.xlsx']))
# Bare expression: only displays the DataFrame in an interactive/notebook session.
df
输入:
AvailabilityZone InstanceType ProductDescription SpotPrice ymd_hms(Timestamp)
0 us-east-1f r5a.4xlarge Windows 1.0210 2021-05-16 21:14:12
1 us-east-1c r5a.4xlarge Windows 1.0210 2021-05-16 21:14:12
2 us-east-1b r5a.4xlarge Windows 1.0210 2021-05-16 21:14:12
3 us-east-1a r5a.4xlarge Windows 1.0210 2021-05-16 21:14:12
4 us-east-1d p3.8xlarge Red Hat Enterprise Linux 3.8020 2021-05-16 21:14:07
第一个文件 csv
第二个文件输入
InstanceType vCPU GPUs Memory (GiB) Baseline Performance / vCPU CPU Credits Earned / Hr GPU Mem (Gib)
0 t4g.nano 2 NaN 0.5 0.05 6.0 NaN
1 t4g.micro 2 NaN 1.0 0.10 12.0 NaN
2 t4g.small 2 NaN 2.0 0.20 24.0 NaN
3 t4g.medium 2 NaN 4.0 0.20 24.0 NaN
4 t4g.large 2 NaN 8.0 0.30 36.0 NaN
# Outer-join df and df2 on their shared 'InstanceType' column; str('outer') evaluates to the plain string 'outer'.
df3=pd.merge(df,df2, on='InstanceType', how= str('outer'))
输出:
AvailabilityZone InstanceType ProductDescription SpotPrice ymd_hms(Timestamp) vCPU GPUs Memory (GiB) Baseline Performance / vCPU CPU Credits Earned / Hr GPU Mem (Gib)
0 us-east-1f r5a.4xlarge Windows 1.0210 2021-05-16 21:14:12 NaN NaN NaN NaN NaN
预期结果是 - 它应该根据 'InstanceType' 获取所有值并显示在每一行中。
提前致谢。
你的数据有问题:你放在评论中的 CSV 文件是用分号(;)分隔的,而不是默认的逗号(,)。
另外,建议使用 pandas.read_csv;即使你的数据是 Excel 工作表,也可以先用 to_csv 方法将其转换成 CSV,然后再作为 CSV 读取。
我假设您已经读取了这两个 CSV 文件,所以这是我的解决方案:
import pandas as pd

# The spot-price export is semicolon-delimited; the instance table uses the default comma.
df1 = pd.read_csv('spot-prices-2021-05-16.csv', delimiter=';')
df2 = pd.read_csv('InstanceAWS.csv')
# Outer join on the shared 'InstanceType' column keeps rows from both files.
df_merged = pd.merge(df1, df2, on='InstanceType', how='outer')
# head is a method and must be called; printing the bare attribute shows the
# bound-method repr instead of the first rows of the merged frame.
print(df_merged.head())
我已经实现了你想要的效果。您可以将输出文件导入 pandas,它将准确显示您想要的内容。首先生成“combined.csv”,然后运行导入代码。
这是要导入的代码:
import pandas as pd
# Load the file 'combined.csv' from the current working directory into a DataFrame.
df = pd.read_csv('combined.csv')
# Print the DataFrame to stdout.
print(df)
这里是生成“combined.csv”的代码。运行大约需要 30 秒。请确保两个源 CSV 文件与代码位于同一文件夹中。
import re
import os
from contextlib import redirect_stdout
import io
# Matches every run of characters outside the whitelist: ASCII letters, digits,
# '_', ':', '.' and '-'. Written as a raw string so that the escaped hyphen is
# not an invalid string escape, and compiled once because the function is
# called for every cell of both CSV files.
_NOT_WHITELISTED = re.compile(r'[^a-zA-Z0-9_:.\-]+')


def enforceWhitelist(temp5):
    """Return *temp5* with every non-whitelisted character removed.

    Only ASCII letters, digits, ``_``, ``:``, ``.`` and ``-`` are kept;
    everything else (spaces, quotes, separators, newlines, non-ASCII
    characters) is dropped.
    """
    return _NOT_WHITELISTED.sub('', temp5)
def getInfo(temp9):
    """Return the row of the 2nd (sub) csv that contains *temp9*, as a string.

    The module-level ``listnamesY`` holds the sub csv column-wise (one list of
    cell strings per column). Rows are scanned in order and the first row with
    a cell exactly equal to *temp9* is returned with every cell followed by a
    comma, e.g. ``"t3.large,2,8,"``.

    Cells are compared for equality rather than by substring search on the
    joined row, so a partial key (or an empty string) no longer matches an
    unrelated row.

    When no row matches, the result is *temp9* followed by one ``NOT_FOUND``
    placeholder for each remaining column, again comma-terminated. An empty
    ``listnamesY`` yields ``temp9 + ","`` instead of raising IndexError.
    """
    # zip(*columns) transposes the column lists into row tuples.
    for row in zip(*listnamesY):
        if temp9 in row:  # tuple membership: exact cell match
            return "".join(cell + "," for cell in row)
    return temp9 + "," + "NOT_FOUND," * (len(listnamesY) - 1)
# Read both source files as raw lines. Blank lines are skipped because the
# per-column indexing below would raise IndexError on them (e.g. a trailing
# empty line at the end of a file).
with open("spot-prices-2021-05-16.csv") as main_in:
    tempmain = [line for line in main_in if line.strip()]  # lines of the 1st (main) csv
with open("InstanceAWS.csv") as sub_in:
    tempsub = [line for line in sub_in if line.strip()]  # lines of the 2nd (sub) csv


def _load_columns(lines, delimiter, suffix):
    """Split raw csv *lines* into (whitelisted column names, column data).

    The first line is the header; *suffix* is appended to each header cell
    before whitelisting. Column data is a list with one list of whitelisted
    cell strings per header column. Columns are kept in plain lists rather
    than in exec()-created variables, so header text is never executed and
    headers that are not valid identifiers no longer crash the script.
    """
    header = lines[0].split(delimiter)
    names = [enforceWhitelist(cell + suffix) for cell in header]
    rows = [line.split(delimiter) for line in lines[1:]]
    columns = [[enforceWhitelist(row[idx]) for row in rows]
               for idx in range(len(header))]
    return names, columns


# listnames1 / listnames2: column names of the main / sub csv (with "_1" / "_2" suffix).
# listnamesX / listnamesY: column data of the main / sub csv; getInfo() reads listnamesY.
listnames1, listnamesX = _load_columns(tempmain, ";", "_1")
listnames2, listnamesY = _load_columns(tempsub, ",", "_2")

# Header line: the main csv's columns, with the InstanceType column replaced
# by all columns of the sub csv. name[:-2] strips the "_1" / "_2" suffix.
header_cells = []
for name in listnames1:
    if name[:-2] == "InstanceType":
        header_cells.extend(sub_name[:-2] for sub_name in listnames2)
    else:
        header_cells.append(name[:-2])
# The first column of the main csv has no name, so "No." is prefixed to the header.
out_lines = ["No." + ",".join(header_cells)]

# Data lines: every main-csv row, with its InstanceType cell expanded into the
# matching sub-csv row (or NOT_FOUND placeholders) by getInfo().
for row in zip(*listnamesX):
    parts = []
    for name, value in zip(listnames1, row):
        if name[:-2] == "InstanceType":
            parts.append(getInfo(value))  # already comma-terminated
        else:
            parts.append(value + ",")
    out_lines.append("".join(parts)[:-1])  # drop the trailing comma

try:
    os.remove("combined.csv")  # delete the old file to see the latest generation
except OSError:
    # Best effort only: opening with "w" below truncates an existing file anyway.
    pass
with open("combined.csv", "w") as f2:
    f2.writelines(line + "\n" for line in out_lines)  # write to output csv file