import os
import sys
import time
import argparse
import logging
import glob
import pathlib

import torch
from torch.utils.data import DataLoader
from torch.nn import DataParallel
from torch.optim import lr_scheduler
from torch.optim import Adam
import torch.nn.functional as F
import torch.nn as nn
#from torch.utils.tensorboard import SummaryWriter

sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../')
from data.dataset import SubjectDataset
#from model.net_cls_1024u_13 import Net
from net.net_cls_1024u_231025 import Net
#from net.net_cls_1024u_2d import Net
from cls_utils.data import save_summary_data, load_json

cur_path = pathlib.Path.cwd()

#torch.manual_seed(0)
#torch.cuda.manual_seed_all(0)

# cfg = load_json("/home/lung/ai-project/cls_train/config/train.json")
cfg = load_json("/df_lung/ai-project/cls_train/config/train_20241112.json")

parser = argparse.ArgumentParser(description='train data')
parser.add_argument('--num_workers', default=1, type=int, help='Number of workers for each data loader')
parser.add_argument('--GPU', default='0', type=str, help='GPU index')


# Run one full pass of the model over the training set (one epoch).
def train_epoch(epoch, device, model, loss, optimizer, dataloader, summary):
    print('Epoch {}'.format(epoch + 1))
    model.train()
    loss_sum = 0
    time_now = time.time()
    for step, (data, target) in enumerate(dataloader):
        """pos_weight = 1.0 / (target == 1).sum().item()
        neg_weight = 1.0 / (target == 0).sum().item()
        print(pos_weight, (target == 1).sum().item())
        print(neg_weight, (target == 0).sum().item())
        weights = torch.tensor([pos_weight if label == 1 else neg_weight for label in target]).reshape(target.shape[0], 1)
        loss = nn.BCEWithLogitsLoss(weight=weights)
        loss.to(device)"""
        data, target = data.to(device), target.to(device)
        output = model(data)
        """print('Output and target:')
        print("output: ", output.detach().reshape(1, -1))
        print('target: ', target.detach().reshape(1, -1))"""
        #l = loss(output.sigmoid(), target)
        l = loss(output, target)
        # Clear accumulated gradients before backprop.
        optimizer.zero_grad()
        l.backward()
        # Apply the parameter update.
        optimizer.step()

        loss_data = l.item()
        loss_sum += loss_data
        time_spent = time.time() - time_now
        time_now = time.time()
        logging.info(
            '{}, Epoch : {:3d}, Step : {:3d}, Training Loss : {:.3f}/{:.3f}/{:.3f} '
            .format(time.strftime('%Y-%m-%d %H:%M:%S'), epoch, step + 1,
                    l.item(), loss_data, time_spent))

    summary['loss'] = loss_sum / len(dataloader)
    return summary


def run(args, cfg, folder_name='', model_index=0, is_2d=False, load_pretrain=False):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.GPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    # Count the comma-separated GPU ids (len(args.GPU) would miscount e.g. '0,1').
    gpus = len(args.GPU.split(','))
    num_workers = args.num_workers * gpus
    batch_size = cfg['batch_size'] * gpus

    # Initialize the model.
    model = Net(n_channels=cfg['n_channels'], n_diff_classes=cfg['n_diff_classes'])

    # Define the optimizer and the step-wise learning-rate schedule.
    optimizer = Adam(model.parameters(), lr=cfg['lr'])
    scheduler = lr_scheduler.StepLR(optimizer, step_size=cfg['patience'], gamma=cfg['learning_rate_drop'])

    # Resume from a previously saved checkpoint and continue training from it.
    """base_path = cfg['ckpt_save_path'] + folder_name if len(folder_name) > 1 else cfg['ckpt_save_path']
    if is_2d:
        ckpt_folder = os.path.join(base_path, cfg['train_csv_file'], cfg['train_csv_file'] + '_' + str(model_index))
    else:
        ckpt_folder = os.path.join(base_path, cfg['train_csv_file'])
    ckpt_path = os.listdir(ckpt_folder)
    pretrain_ckpt_path = os.path.join(ckpt_folder, ckpt_path[0])
    print('checkpoint filename:', pretrain_ckpt_path)
    #pretrain_ckpt_path = os.path.join(cfg['ckpt_pretrain_path'], cfg["ckpt_file"])
    model_param = torch.load(pretrain_ckpt_path)
    model.load_state_dict(model_param['state_dict'], strict=True)"""
    if load_pretrain:
        pretrain_ckpt_path = cfg['ckpt_pretrain_path']
        model_param = torch.load(pretrain_ckpt_path)
        model.load_state_dict(model_param['state_dict'], strict=True)
        print(f"Loaded previously trained weights: {pretrain_ckpt_path}")

    # Put the model and the loss function on the same device.
    model = model.to(device=device)

    # Define the loss function.
    loss = nn.BCELoss()
    loss.to(device)

    # Multi-GPU training via DataParallel.
    if device == 'cuda':
        device_ids = list(range(gpus))
        model = DataParallel(model, device_ids=device_ids)

    # Build the training dataset (csv_path is the same for 2D and 3D).
    csv_path = os.path.join(cfg['csv_path'], folder_name) if len(folder_name) > 1 else cfg['csv_path']
    if is_2d:
        """train_csv_path = os.path.join(cfg['train_data_path'], csv_path, cfg["train_csv_file"], 'train.csv')"""
        train_csv_path = os.path.join(cfg['train_data_path'], csv_path, cfg["train_csv_file"],
                                      cfg["train_csv_file"] + '_' + str(model_index), 'train.csv')
    else:
        #print('3d')
        train_csv_path = os.path.join(cfg['train_data_path'], csv_path, cfg["train_csv_file"], 'train.csv')
    train_npy_path = os.path.join(cfg['train_data_path'], cfg["npy_folder"])
    dataset_train = SubjectDataset(train_npy_path, train_csv_path, is_train=True, is_2d=is_2d,
                                   augment=True, permute=True)

    # Report the number of training samples.
    print('train number:', len(dataset_train))
    dataloader_train = DataLoader(dataset_train, batch_size=batch_size, num_workers=num_workers, shuffle=True)

    # Collect every per-epoch training loss.
    summary_train_loss = []
    summary_train = {'loss': float('inf')}

    base_path = cfg['ckpt_save_path'] + folder_name if len(folder_name) > 1 else cfg['ckpt_save_path']
    if is_2d:
        ckpt_folder = os.path.join(base_path, cfg['train_csv_file'], cfg['train_csv_file'] + '_' + str(model_index))
    else:
        ckpt_folder = os.path.join(base_path, cfg['train_csv_file'])
    if not os.path.exists(ckpt_folder):
        os.makedirs(ckpt_folder)

    for epoch in range(cfg['epoch']):
        logging.info('{}, Epoch : {:3d}, lr : {}'
                     .format(time.strftime('%Y-%m-%d %H:%M:%S'), epoch, scheduler.get_last_lr()))
        time_now = time.time()
        summary_train = train_epoch(epoch=epoch, device=device, model=model, loss=loss,
                                    optimizer=optimizer, dataloader=dataloader_train, summary=summary_train)
        # Step the LR schedule after the epoch's optimizer updates (PyTorch >= 1.1 ordering).
        scheduler.step()
        time_spent = time.time() - time_now
        logging.info(
            '{}, Epoch : {:3d}, Summary Training Loss : {:.3f}, '
            'Run Time : {:.2f}'
            .format(time.strftime('%Y-%m-%d %H:%M:%S'), epoch, summary_train['loss'], time_spent)
        )
        summary_train_loss.append(summary_train['loss'])
        if (epoch + 1) % 200 == 0:
            # Periodic checkpoint; the model is only wrapped in DataParallel on CUDA.
            state_dict = model.module.state_dict() if isinstance(model, DataParallel) else model.state_dict()
            ckpt_path = os.path.join(ckpt_folder, cfg['train_csv_file'] + time.strftime('%Y%m%d-%H%M') + '.ckpt')
            torch.save({'epoch': epoch, 'loss': summary_train['loss'], 'state_dict': state_dict}, ckpt_path)

    # Save the final checkpoint.
    state_dict = model.module.state_dict() if isinstance(model, DataParallel) else model.state_dict()
    ckpt_path = os.path.join(ckpt_folder, cfg['train_csv_file'] + time.strftime('%Y%m%d-%H%M') + '.ckpt')
    torch.save({'epoch': epoch, 'loss': summary_train['loss'], 'state_dict': state_dict}, ckpt_path)
    print(f"train done, save to: {ckpt_path}")

    # Save the loss curve of the whole run as an image.
    img_path = os.path.join(cfg['train_data_path'], cfg['image_path'], cfg['train_csv_file'])
    if not os.path.exists(img_path):
        os.makedirs(img_path)
    result_img_path = os.path.join(img_path, time.strftime('%Y%m%d-%H%M') + '.png')
    save_summary_data(summary_trains=summary_train_loss, result_img_path=result_img_path)
    print(f"save_summary_data: {result_img_path}")


def main(train_csv_file, folder_name='', load_pretrain=False):
    filename = cfg['training_filename']
    directory = os.path.dirname(filename)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    if not os.path.exists(filename):
        with open(filename, 'w') as file:
            pass
        print(f"File '{filename}' created.")
    else:
        print(f"File '{filename}' already exists.")
    logging.basicConfig(level=logging.INFO, filename=cfg['training_filename'], filemode='a')
    args = parser.parse_args()

    # Train every batched split belonging to the same binary-classification task;
    # the number of splits is discovered automatically from the directory layout.
    cfg['train_csv_file'] = train_csv_file
    csv_path = os.path.join(cfg['csv_path'], folder_name)
    print(f"cur_path: {cur_path}")
    paths = os.path.join(cur_path, cfg['train_data_path'], csv_path, cfg['train_csv_file'], '*')
    print(f"paths: {paths}")
    model_sum = len(glob.glob(paths))
    for i in range(model_sum):
        print(f"run: {i}")
        run(args, cfg, model_index=i + 1, folder_name=folder_name, is_2d=False, load_pretrain=load_pretrain)


def run_signal_3d():
    cfg = load_json("/home/lung/ai-project/cls_train/config/train.json")
    logging.basicConfig(level=logging.INFO, filename=cfg['training_filename'], filemode='a')
    args = parser.parse_args()
    run(args, cfg, is_2d=False)


if __name__ == '__main__':
    folder_names = ['08']
    # base_path = '/home/lung/ai-project/cls_train/data/train_data/plus_3d_0818/subject_all_csv/'
    base_path = '/df_lung/ai-project/cls_train/data/train_data/plus_3d_0818/subject_all_csv/'
    # train_csv_file = "cls_20241112_2041"
    # main(train_csv_file, folder_name="08", load_pretrain = True)
    for name in folder_names:
        train_path = base_path + name
        for train_csv_file in os.listdir(train_path):
            if train_csv_file == 'cls_1_2041':
                main(train_csv_file, folder_name=name, load_pretrain=True)
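
# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the config JSON this script expects
# (e.g. train_20241112.json). The keys are exactly the ones read from `cfg`
# in this file; every value shown is an illustrative assumption, not the
# project's actual setting.
#
# {
#     "batch_size": 4,
#     "lr": 1e-4,
#     "patience": 50,
#     "learning_rate_drop": 0.5,
#     "epoch": 1000,
#     "n_channels": 1,
#     "n_diff_classes": 1,
#     "csv_path": "subject_all_csv",
#     "train_data_path": "/df_lung/ai-project/cls_train/data/train_data/plus_3d_0818",
#     "train_csv_file": "cls_1_2041",
#     "npy_folder": "npy_data",
#     "ckpt_save_path": "/df_lung/ai-project/cls_train/ckpt/",
#     "ckpt_pretrain_path": "/df_lung/ai-project/cls_train/ckpt/pretrain.ckpt",
#     "image_path": "images",
#     "training_filename": "/df_lung/ai-project/cls_train/log/train.log"
# }
#
# 'patience' and 'learning_rate_drop' feed StepLR's step_size and gamma above.
# ---------------------------------------------------------------------------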