如何创建特定大小的数据框,其中包含具有均匀随机分布的连续值和分类值
How to create a dataframe of a particular size containing both continuous and categorical values with a uniform random distribution
所以,我正在尝试生成一些给定维度大小的假随机数据。本质上,我想要一个数据框,其中的数据具有均匀的随机分布。数据由连续值和分类值组成。我已经编写了以下代码,但它没有按照我想要的方式工作。
import random
import pandas as pd
import time
from datetime import datetime
# declare global variables
adv_name = ['soft toys', 'kitchenware', 'electronics',
'mobile phones', 'laptops']
adv_loc = ['location_1', 'location_2', 'location_3',
'location_4', 'location_5']
adv_prod = ['baby product', 'kitchenware', 'electronics',
'mobile phones', 'laptops']
adv_size = [1, 2, 3, 4, 10]
adv_layout = ['static', 'dynamic'] # advertisment layout type on website
# adv_date, start_time, end_time = []
num = 10 # the given dimension
# define function to generate random advert locations
def rand_shuf_loc(str_lst, num):
lst = adv_loc
# using list comprehension
rand_shuf_str = [item for item in lst for i in range(num)]
return(rand_shuf_str)
# define function to generate random advert names
def rand_shuf_prod(loc_list, num):
rand_shuf_str = [item for item in loc_list for i in range(num)]
random.shuffle(rand_shuf_str)
return(rand_shuf_str)
# define function to generate random impression and click data
def rand_clic_impr(num):
rand_impr_lst = []
click_lst = []
for i in range(num):
rand_impr_lst.append(random.randint(0, 100))
click_lst.append(random.randint(0, 100))
return {'rand_impr_lst': rand_impr_lst, 'rand_click_lst': click_lst}
# define function to generate random product price and discount
def rand_prod_price_discount(num):
prod_price_lst = [] # advertised product price
prod_discnt_lst = [] # advertised product discount
for i in range(num):
prod_price_lst.append(random.randint(10, 100))
prod_discnt_lst.append(random.randint(10, 100))
return {'prod_price_lst': prod_price_lst, 'prod_discnt_lst': prod_discnt_lst}
def rand_prod_click_timestamp(stime, etime, num):
prod_clik_tmstmp = []
frmt = '%d-%m-%Y %H:%M:%S'
for i in range(num):
rtime = int(random.random()*86400)
hours = int(rtime/3600)
minutes = int((rtime - hours*3600)/60)
seconds = rtime - hours*3600 - minutes*60
time_string = '%02d:%02d:%02d' % (hours, minutes, seconds)
prod_clik_tmstmp.append(time_string)
time_stmp = [item for item in prod_clik_tmstmp for i in range(num)]
return {'prod_clik_tmstmp_lst':time_stmp}
def main():
print('generating data...')
# print('generating random geographic coordinates...')
# get the impressions and click data
impression = rand_clic_impr(num)
clicks = rand_clic_impr(num)
product_price = rand_prod_price_discount(num)
product_discount = rand_prod_price_discount(num)
prod_clik_tmstmp = rand_prod_click_timestamp("20-01-2018 13:30:00",
"23-01-2018 04:50:34",num)
lst_dict = {"ad_loc": rand_shuf_loc(adv_loc, num),
"prod": rand_shuf_prod(adv_prod, num),
"imprsn": impression['rand_impr_lst'],
"cliks": clicks['rand_click_lst'],
"prod_price": product_price['prod_price_lst'],
"prod_discnt": product_discount['prod_discnt_lst'],
"prod_clik_stmp": prod_clik_tmstmp['prod_clik_tmstmp_lst']}
fake_data = pd.DataFrame.from_dict(lst_dict, orient="index")
res = fake_data.apply(lambda x: x.fillna(0)
if x.dtype.kind in 'biufc'
# where 'biufc' means boolean, integer,
# unicode, float & complex data types
else x.fillna(random.randint(0, 100)
)
)
print(res.transpose())
res.to_csv("fake_data.csv", sep=",")
# invoke the main function
if __name__ == "__main__":
main()
问题 1
当我执行上面的代码片段时,它打印正常,但是当写入 csv 格式时,它是水平放置的;即,它看起来像这样..。写入 csv 文件时如何垂直放置它?我想要的是 7 列(参见上面的 lst_dict 变量 ),其中 n 行数 ?
问题 2
我不明白为什么前 50 列生成随机日期而其余列填充数值?
要回答您的第一个问题,请替换
print(res.transpose())
与
res.transpose() print(res)
要回答您的第二个问题,请查看方法输出的长度
rand_shuf_loc()
它以及其他辅助函数只生成一个包含 50 个项目的列表。
使用方法创建res
fake_data.apply
用随机数字替换所有 nan,因此它也将数字应用于没有任何预定义值的列。
所以,我正在尝试生成一些给定维度大小的假随机数据。本质上,我想要一个数据框,其中的数据具有均匀的随机分布。数据由连续值和分类值组成。我已经编写了以下代码,但它没有按照我想要的方式工作。
import random
import pandas as pd
import time
from datetime import datetime
# declare global variables
adv_name = ['soft toys', 'kitchenware', 'electronics',
'mobile phones', 'laptops']
adv_loc = ['location_1', 'location_2', 'location_3',
'location_4', 'location_5']
adv_prod = ['baby product', 'kitchenware', 'electronics',
'mobile phones', 'laptops']
adv_size = [1, 2, 3, 4, 10]
adv_layout = ['static', 'dynamic'] # advertisment layout type on website
# adv_date, start_time, end_time = []
num = 10 # the given dimension
# define function to generate random advert locations
def rand_shuf_loc(str_lst, num):
lst = adv_loc
# using list comprehension
rand_shuf_str = [item for item in lst for i in range(num)]
return(rand_shuf_str)
# define function to generate random advert names
def rand_shuf_prod(loc_list, num):
rand_shuf_str = [item for item in loc_list for i in range(num)]
random.shuffle(rand_shuf_str)
return(rand_shuf_str)
# define function to generate random impression and click data
def rand_clic_impr(num):
rand_impr_lst = []
click_lst = []
for i in range(num):
rand_impr_lst.append(random.randint(0, 100))
click_lst.append(random.randint(0, 100))
return {'rand_impr_lst': rand_impr_lst, 'rand_click_lst': click_lst}
# define function to generate random product price and discount
def rand_prod_price_discount(num):
prod_price_lst = [] # advertised product price
prod_discnt_lst = [] # advertised product discount
for i in range(num):
prod_price_lst.append(random.randint(10, 100))
prod_discnt_lst.append(random.randint(10, 100))
return {'prod_price_lst': prod_price_lst, 'prod_discnt_lst': prod_discnt_lst}
def rand_prod_click_timestamp(stime, etime, num):
prod_clik_tmstmp = []
frmt = '%d-%m-%Y %H:%M:%S'
for i in range(num):
rtime = int(random.random()*86400)
hours = int(rtime/3600)
minutes = int((rtime - hours*3600)/60)
seconds = rtime - hours*3600 - minutes*60
time_string = '%02d:%02d:%02d' % (hours, minutes, seconds)
prod_clik_tmstmp.append(time_string)
time_stmp = [item for item in prod_clik_tmstmp for i in range(num)]
return {'prod_clik_tmstmp_lst':time_stmp}
def main():
print('generating data...')
# print('generating random geographic coordinates...')
# get the impressions and click data
impression = rand_clic_impr(num)
clicks = rand_clic_impr(num)
product_price = rand_prod_price_discount(num)
product_discount = rand_prod_price_discount(num)
prod_clik_tmstmp = rand_prod_click_timestamp("20-01-2018 13:30:00",
"23-01-2018 04:50:34",num)
lst_dict = {"ad_loc": rand_shuf_loc(adv_loc, num),
"prod": rand_shuf_prod(adv_prod, num),
"imprsn": impression['rand_impr_lst'],
"cliks": clicks['rand_click_lst'],
"prod_price": product_price['prod_price_lst'],
"prod_discnt": product_discount['prod_discnt_lst'],
"prod_clik_stmp": prod_clik_tmstmp['prod_clik_tmstmp_lst']}
fake_data = pd.DataFrame.from_dict(lst_dict, orient="index")
res = fake_data.apply(lambda x: x.fillna(0)
if x.dtype.kind in 'biufc'
# where 'biufc' means boolean, integer,
# unicode, float & complex data types
else x.fillna(random.randint(0, 100)
)
)
print(res.transpose())
res.to_csv("fake_data.csv", sep=",")
# invoke the main function
if __name__ == "__main__":
main()
问题 1
当我执行上面的代码片段时,它打印正常,但是当写入 csv 格式时,它是水平放置的;即,它看起来像这样..
问题 2 我不明白为什么前 50 列生成随机日期而其余列填充数值?
要回答您的第一个问题,请替换
print(res.transpose())
与
res.transpose() print(res)
要回答您的第二个问题,请查看方法输出的长度
rand_shuf_loc()
它以及其他辅助函数只生成一个包含 50 个项目的列表。
使用方法创建res
fake_data.apply
用随机数字替换所有 nan,因此它也将数字应用于没有任何预定义值的列。