How to change the directory of MLflow logs?
I am logging metrics with MLflow, but I want to change the default directory where the logs are saved. Instead of writing the log files next to my main file, I want to store them in /path/outputs/lg. I don't know how to change this, and I am not setting it anywhere in my Model class below.
import os
from time import time
import mlflow
import numpy as np
import torch
import tqdm
# from segmentation_models_pytorch.utils import metrics
from AICore.emergency_landing.metrics import IoU, F1
from AICore.emergency_landing.utils import AverageMeter
from AICore.emergency_landing.utils import TBLogger
class Model:
def __init__(self, model, num_classes=5, ignore_index=0, optimizer=None, scheduler=None, criterion=None,
device=None, epochs=30, train_loader=None, val_loader=None, tb_logger: TBLogger = None,
logger=None,
best_model_path=None,
model_check_point_path=None,
load_from_best_model=None,
load_from_model_checkpoint=None,
early_stopping=None,
debug=False):
self.debug = debug
self.early_stopping = {
'init': early_stopping,
'changed': 0
}
self.optimizer = optimizer
self.scheduler = scheduler
self.criterion = criterion
self.device = device
self.epochs = epochs
self.train_loader = train_loader
self.val_loader = val_loader
self.model = model.to(device)
self.tb_logger = tb_logger
self.logger = logger
self.best_loss = np.Inf
if not os.path.exists(best_model_path):
os.makedirs(best_model_path)
self.best_model_path = best_model_path
if not os.path.exists(model_check_point_path):
os.makedirs(model_check_point_path)
self.model_check_point_path = model_check_point_path
self.load_from_best_model = load_from_best_model
self.load_from_model_checkpoint = load_from_model_checkpoint
if self.load_from_best_model is not None:
self.load_model(path=self.load_from_best_model)
if self.load_from_model_checkpoint is not None:
self.load_model_checkpoint(path=self.load_from_model_checkpoint)
self.train_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
self.val_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
self.test_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
self.train_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')
self.val_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')
self.test_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')
def metrics(self, is_train=True):
if is_train:
train_losses = AverageMeter('Training Loss', ':.4e')
train_iou = AverageMeter('Training iou', ':6.2f')
train_f_score = AverageMeter('Training F_score', ':6.2f')
return train_losses, train_iou, train_f_score
else:
val_losses = AverageMeter('Validation Loss', ':.4e')
val_iou = AverageMeter('Validation mean iou', ':6.2f')
val_f_score = AverageMeter('Validation F_score', ':6.2f')
return val_losses, val_iou, val_f_score
def fit(self):
self.logger.info("\nStart training\n\n")
start_training_time = time()
with mlflow.start_run():
for e in range(self.epochs):
start_training_epoch_time = time()
self.model.train()
train_losses_avg, train_iou_avg, train_f_score_avg = self.metrics(is_train=True)
with tqdm.tqdm(self.train_loader, unit="batch") as tepoch:
tepoch.set_description(f"Epoch {e}")
for image, target in tepoch:
# Transfer Data to GPU if available
image = image.to(self.device)
target = target.to(self.device)
# Clear the gradients
self.optimizer.zero_grad()
# Forward Pass
# out = self.model(image)['out']
# if unet == true => remove ['out']
out = self.model(image)
# Find the Loss
loss = self.criterion(out, target)
# Calculate Loss
train_losses_avg.update(loss.item(), image.size(0))
# Calculate gradients
loss.backward()
# Update Weights
self.optimizer.step()
iou = self.train_iou(out.cpu(), target.cpu()).item()
train_iou_avg.update(iou)
f1_score = self.train_f1(out.cpu(), target.cpu()).item()
train_f_score_avg.update(f1_score)
tepoch.set_postfix(loss=train_losses_avg.avg,
iou=train_iou_avg.avg,
f_score=train_f_score_avg.avg)
if self.debug:
break
self.tb_logger.log(log_type='criterion/training', value=train_losses_avg.avg, epoch=e)
self.tb_logger.log(log_type='iou/training', value=train_iou_avg.avg, epoch=e)
self.tb_logger.log(log_type='f_score/training', value=train_f_score_avg.avg, epoch=e)
mlflow.log_metric('criterion/training', train_losses_avg.avg, step=e)
mlflow.log_metric('iou/training', train_iou_avg.avg, step=e)
mlflow.log_metric('f_score/training', train_f_score_avg.avg, step=e)
end_training_epoch_time = time() - start_training_epoch_time
print('\n')
self.logger.info(
f'Training Results - [{end_training_epoch_time:.3f}s] Epoch: {e}:'
f' f_score: {train_f_score_avg.avg:.3f},'
f' IoU: {train_iou_avg.avg:.3f},'
f' Loss: {train_losses_avg.avg:.3f}')
# validation step
val_loss = self.evaluation(e)
# apply scheduler
if self.scheduler:
self.scheduler.step()
# early stopping
if self.early_stopping['init'] >= self.early_stopping['changed']:
self._early_stopping_model(val_loss=val_loss)
else:
print(f'The model can not learn more, Early Stopping at epoch[{e}]')
break
# save best model
if self.best_model_path is not None:
self._best_model(val_loss=val_loss, path=self.best_model_path)
# model check points
if self.model_check_point_path is not None:
self.save_model_check_points(path=self.model_check_point_path, epoch=e, net=self.model,
optimizer=self.optimizer, loss=self.criterion,
avg_loss=train_losses_avg.avg)
# log mlflow
if self.scheduler:
mlflow.log_param("get_last_lr", self.scheduler.get_last_lr())
mlflow.log_param("scheduler", self.scheduler.state_dict())
self.tb_logger.flush()
if self.debug:
break
end_training_time = time() - start_training_time
print(f'Finished Training after {end_training_time:.3f}s')
self.tb_logger.close()
def evaluation(self, epoch):
print('Validating...')
start_validation_epoch_time = time()
self.model.eval() # Optional when not using Model Specific layer
with torch.no_grad():
val_losses_avg, val_iou_avg, val_f_score_avg = self.metrics(is_train=False)
with tqdm.tqdm(self.val_loader, unit="batch") as tepoch:
for image, target in tepoch:
# Transfer Data to GPU if available
image = image.to(self.device)
target = target.to(self.device)
# out = self.model(image)['out']
# if unet == true => remove ['out']
out = self.model(image)
# Find the Loss
loss = self.criterion(out, target)
# Calculate Loss
val_losses_avg.update(loss.item(), image.size(0))
iou = self.val_iou(out.cpu(), target.cpu()).item()
val_iou_avg.update(iou)
f1_score = self.val_f1(out.cpu(), target.cpu()).item()
val_f_score_avg.update(f1_score)
tepoch.set_postfix(loss=val_losses_avg.avg,
iou=val_iou_avg.avg,
f_score=val_f_score_avg.avg)
if self.debug:
break
print('\n')
self.tb_logger.log(log_type='criterion/validation', value=val_losses_avg.avg, epoch=epoch)
self.tb_logger.log(log_type='iou/validation', value=val_iou_avg.avg, epoch=epoch)
self.tb_logger.log(log_type='f_score/validation', value=val_f_score_avg.avg, epoch=epoch)
mlflow.log_metric('criterion/validation', val_losses_avg.avg, step=epoch)
mlflow.log_metric('iou/validation', val_iou_avg.avg, step=epoch)
mlflow.log_metric('f_score/validation', val_f_score_avg.avg, step=epoch)
end_validation_epoch_time = time() - start_validation_epoch_time
self.logger.info(
f'validation Results - [{end_validation_epoch_time:.3f}s] Epoch: {epoch}:'
f' f_score: {val_f_score_avg.avg:.3f},'
f' IoU: {val_iou_avg.avg:.3f},'
f' Loss: {val_losses_avg.avg:.3f}')
print('\n')
return val_losses_avg.avg
def _save_model(self, name, path, params):
torch.save(params, path)
def _early_stopping_model(self, val_loss):
if self.best_loss < val_loss:
self.early_stopping['changed'] += 1
else:
self.early_stopping['changed'] = 0
def _best_model(self, val_loss, path):
if self.best_loss > val_loss:
self.best_loss = val_loss
name = f'/best_model_loss_{self.best_loss:.2f}'.replace('.', '_')
self._save_model(name, path=f'{path}/{name}.pt', params={
'model_state_dict': self.model.state_dict(),
})
print(f'The best model is saved with criterion: {self.best_loss:.2f}')
def save_model_check_points(self, path, epoch, net, optimizer, loss, avg_loss):
name = f'/model_epoch_{epoch}_loss_{avg_loss:.2f}'.replace('.', '_')
self._save_model(name, path=f'{path}/{name}.pt', params={
'epoch': epoch,
'model_state_dict': net.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'criterion': loss,
})
print(f'model checkpoint is saved at model_epoch_{epoch}_loss_{avg_loss:.2f}')
def load_model_checkpoint(self, path):
checkpoint = torch.load(path)
self.model.load_state_dict(checkpoint['model_state_dict'])
self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
self.criterion = checkpoint['criterion']
return epoch
def load_model(self, path):
best_model = torch.load(path)
self.model.load_state_dict(best_model['model_state_dict'])
The solution is:
import hydra
import mlflow

# Point MLflow at a file store outside the working directory
mlflow.set_tracking_uri(uri=f'file://{hydra.utils.to_absolute_path("../output/mlruns")}')

exp = mlflow.get_experiment_by_name(name='Emegency_landing')
if not exp:
    experiment_id = mlflow.create_experiment(
        name='Emegency_landing',
        artifact_location=f'file://{hydra.utils.to_absolute_path("../output/mlruns")}')
else:
    experiment_id = exp.experiment_id
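With a file:// tracking URI, MLflow writes all run metadata and metrics under ../output/mlruns instead of creating an mlruns folder next to the script. hydra.utils.to_absolute_path resolves the relative path against the original working directory, so the target does not move when Hydra switches into its own output directory.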
Then you should pass the experiment ID to:
with mlflow.start_run(experiment_id=experiment_id):
    pass
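Putting it together, here is a minimal sketch of how the setup could be wired up before training; the helper name setup_mlflow and the path /path/outputs/lg/mlruns are illustrative assumptions, not part of the original code:

import mlflow

def setup_mlflow(tracking_dir, experiment_name):
    # Point MLflow at a local file store and return the experiment id.
    mlflow.set_tracking_uri(f'file://{tracking_dir}')
    exp = mlflow.get_experiment_by_name(experiment_name)
    if exp is None:
        return mlflow.create_experiment(experiment_name,
                                        artifact_location=f'file://{tracking_dir}')
    return exp.experiment_id

experiment_id = setup_mlflow('/path/outputs/lg/mlruns', 'Emegency_landing')
# Inside Model.fit, open the run against that experiment, e.g.
# with mlflow.start_run(experiment_id=experiment_id):
# instead of the bare mlflow.start_run() used above.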
If you do not point MLflow at /path/mlruns, then when you run the mlflow ui command it will automatically create another folder named mlruns in the current directory. So take care that your log folder keeps that same name, mlruns.
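The mlflow ui command also accepts a --backend-store-uri option, so it can be pointed at the chosen store explicitly, for example mlflow ui --backend-store-uri file:///path/outputs/lg/mlruns (the exact path here is only an example).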