import os
import sys
import json
import numpy as np
import pandas as pd
import glob
import torch
import collections
import random
from torchvision import transforms
from matplotlib import pyplot as plt

sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../')
from cls_utils.utils import check_and_makedirs
from cls_utils.augement import random_permutation_data, augment_data
from net.component_c import get_data


# Read the json file at the given json_path
def load_json(json_path):
    with open(json_path) as f:
        cfg = json.load(f)
    return cfg


# Append new data to the given csv file
def save_supplement_data_csv(content, output_file):
    data = {}
    data['npy_path'] = [content[0]]
    data['label'] = [content[1]]
    if not os.path.exists(output_file):
        check_and_makedirs(output_file=output_file)
    data = pd.DataFrame(data)
    # mode='a' appends; index is the row index, header is the column header
    data.to_csv(output_file, mode='a', index=False, header=False)


# Save a numpy array to an npy file
def save_data_to_npy(original_data=None, output_file=None):
    check_and_makedirs(output_file)
    np.save(output_file, original_data)
    print('File', output_file, 'saved successfully')


# Load data back from an npy file
def load_npy_to_data(input_file=None):
    original_data = np.load(input_file)
    return original_data


# Read the given csv file
def load_data_csv(input_file):
    ids = pd.read_csv(input_file, header=None, encoding='utf-8')
    return ids


# Save data to a csv file
def save_data_csv(content, output_file):
    data = pd.DataFrame(content)
    data.to_csv(output_file, index=False, header=None, encoding='utf-8')


def save_data_npy(npy_path, file_prefix, data=None, truth=None, affine=None, patch_index=None):
    check_and_makedirs(os.path.join(npy_path, file_prefix))
    if data is not None:
        np.save(os.path.join(npy_path, file_prefix + '_data.npy'), data.astype(np.float32))
    if truth is not None:
        np.save(os.path.join(npy_path, file_prefix + '_truth.npy'), truth.astype(np.uint8))
    if affine is not None:
        np.save(os.path.join(npy_path, file_prefix + '_affine.npy'), affine.astype(np.float32))
    if patch_index is not None:
        np.save(os.path.join(npy_path, file_prefix + '_patch_index.npy'), patch_index.astype(np.int16))
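# Minimal usage sketch (not part of the original pipeline) for the npy/csv helpers
# above; the paths, the dummy volume and the label value are hypothetical examples.
def _example_npy_csv_roundtrip():
    example_npy = './tmp/example/cls_demo/0001.npy'              # hypothetical output path
    example_csv = './tmp/example/supplement_train.csv'           # hypothetical csv path
    volume = np.full((48, 256, 256), -1000, dtype=np.float32)    # dummy volume
    save_data_to_npy(original_data=volume, output_file=example_npy)
    restored = load_npy_to_data(example_npy)
    assert restored.shape == volume.shape
    # Append one (npy_path, label) row to the supplement csv
    save_supplement_data_csv([example_npy, 1], example_csv)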
def load_all_dicom_file(dirname, prefix='', postfix=''):
    file_path = os.path.join(dirname, prefix + '.' + postfix)
    all_file_path = glob.glob(file_path)
    return all_file_path


# Collect all label_ids from the given csv file
def train_all_label_id(csv_path):
    data = load_data_csv(csv_path)
    all_labels = data[0].tolist()
    all_labels = [label.split('/')[1] for label in all_labels]
    train_all_labels = [int(label.split('.')[0]) for label in all_labels]
    return train_all_labels


# Get the corresponding class for a given id
def find_lable_by_id(lable_id, subject_all_df):
    row = subject_all_df[subject_all_df['lable_id'] == lable_id]
    lable = row.iloc[0, 1]
    file_path = 'cls_' + str(lable) + '/' + lable_id + '.npy'
    return file_path


# Add mis-classified samples directly to the training set, producing a new
# (class-imbalanced) csv file; positive=True adds them as positive samples
def add_label_ids(label_ids, csv_path, positive=True, cls_name=''):
    add_lable = 1 if positive else 0
    original_csv_path = os.path.join(csv_path, '08', cls_name, 'train.csv')
    subject_all_csv_path = os.path.join(csv_path, 'subject_all.csv')
    print(original_csv_path)
    data = load_data_csv(original_csv_path)
    subject_all_df = pd.read_csv(subject_all_csv_path, header=None, names=['lable_id', 'lable'])

    # Split the strings in the first column to extract the lable_id
    def get_lable_id(path_str):
        lable_file = path_str.split('/')[1]
        lable_id = lable_file.split('.')[0]
        return lable_id

    subject_all_df['lable_id'] = subject_all_df['lable_id'].apply(get_lable_id)
    for label_id in label_ids:
        print(label_id)
        result = find_lable_by_id(label_id, subject_all_df)
    file_paths = [find_lable_by_id(label_id, subject_all_df) for label_id in label_ids]
    #find_lable_by_id(label_ids[0], subject_all_df)
    #label_ids = ['cls_2047/'+str(label_id)+'.npy' for label_id in label_ids]
    node1_data = data[data[1] == 1]
    node2_data = data[data[1] == 0]
    add_data = pd.DataFrame(file_paths)
    add_data[1] = add_lable
    node_all_data = pd.concat([node1_data, node2_data, add_data])
    """node1_name = '2047'
    node2_name = '1016'
    cls_unite_csv_path = os.path.join(csv_path, 'cls_' + node1_name + '_' + node2_name, 'train.csv')
    check_and_makedirs(cls_unite_csv_path)"""
    save_data_csv(node_all_data, original_csv_path)


# Replace the label_ids in a list into the corresponding class of the given csv file
def replace_label_ids(label_ids, csv_path, tabel_id):
    original_csv_path = os.path.join(csv_path, 'cls_2047_1016_1', 'train.csv')
    print(original_csv_path)
    data = load_data_csv(original_csv_path)
    label_ids = ['cls_2047/' + str(label_id) + '.npy' for label_id in label_ids]
    node1_data = data[data[1] == 1]
    node2_data = data[data[1] == 0]
    node1_data.iloc[:len(label_ids), 0] = label_ids
    node_all_data = pd.concat([node1_data, node2_data])
    node1_name = '2047'
    node2_name = '1016'
    cls_unite_csv_path = os.path.join(csv_path, 'cls_' + node1_name + '_' + node2_name, 'train.csv')
    check_and_makedirs(cls_unite_csv_path)
    save_data_csv(node_all_data, cls_unite_csv_path)
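# Hedged usage sketch for add_label_ids above: csv_path is assumed to be the directory
# that holds subject_all.csv plus the 08/<cls_name>/train.csv it updates; the ids and
# the directory below are hypothetical examples only.
def _example_add_label_ids():
    csv_path = './cls_train/data/train_data/example_subject_all_csv'   # hypothetical directory
    misclassified_ids = ['432', '433']                                 # hypothetical label_ids
    # Append the two samples to 08/cls_2047_1016/train.csv as positive samples
    add_label_ids(misclassified_ids, csv_path, positive=True, cls_name='cls_2047_1016')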
def create_cls_train_last_3d(node_times, tabel_id=None, csv_path=None, csv_name=None, pretrain_csv_path=''):
    # Read the csv file used in the previous round of training
    pretrain_data = load_data_csv(pretrain_csv_path)
    pretrain_data_labels = pretrain_data[0].tolist()
    subject_all_path = os.path.join(csv_path, csv_name)
    data = load_data_csv(subject_all_path)
    node1_data = data[data[1].isin(node_times[0])]
    node1_data = node1_data[~node1_data[0].isin(pretrain_data_labels)]
    node2_data = data[data[1].isin(node_times[1])]
    node1_data.loc[:, 1] = 1
    node2_data.loc[:, 1] = 0
    node_all_data = pd.concat([node1_data, node2_data])
    node1_name = '1'
    #node1_name = str(node_times[0][0]) if len(node_times[0])==1 else '-'.join([str(time) for time in node_times[0]])
    node2_name = str(node_times[1][0]) if len(node_times[1])==1 else '-'.join([str(time) for time in node_times[1]])
    cls_unite_csv_path = os.path.join(csv_path, tabel_id, 'cls_' + node1_name + '_' + node2_name, 'train.csv')
    check_and_makedirs(cls_unite_csv_path)
    save_data_csv(node_all_data, cls_unite_csv_path)


def create_cls_train_csv_3d(node_times, tabel_id=None, csv_path=None, csv_name=None, max_len=None):
    """If max_len is not None, take the same number (max_len) of samples for both the positive and the negative class."""
    subject_all_path = os.path.join(csv_path, csv_name)
    data = load_data_csv(subject_all_path)
    node1_data = pd.DataFrame()
    for node_time in node_times[0]:
        node_data = data[data[1] == node_time]
        node_data = node_data[:(len(node_data) * 3)//4]
        node1_data = pd.concat([node1_data, node_data])
    node2_data = pd.DataFrame()
    for node_time in node_times[1]:
        node_data = data[data[1] == node_time]
        node_data = node_data[:(len(node_data) * 3)//4]
        node2_data = pd.concat([node2_data, node_data])
    """node1_data = data[data[1].isin(node_times[0])]
    node2_data = data[data[1].isin(node_times[1])]"""
    node1_data.loc[:, 1] = 1
    node2_data.loc[:, 1] = 0
    # node_all_data = pd.concat([node1_data, node2_data])
    node_all_data = node1_data
    node1_name = '20241112'
    #node1_name = str(node_times[0][0]) if len(node_times[0])==1 else '-'.join([str(time) for time in node_times[0]])
    node2_name = str(node_times[1][0]) if len(node_times[1])==1 else '-'.join([str(time) for time in node_times[1]])
    cls_unite_csv_path = os.path.join(csv_path, tabel_id, 'cls_' + node1_name + '_' + node2_name, 'train.csv')
    check_and_makedirs(cls_unite_csv_path)
    save_data_csv(node_all_data, cls_unite_csv_path)
    print(f"save_data_csv: {cls_unite_csv_path}")


def create_cls_train_all_csv(node_times, tabel_id=None, csv_path=None, csv_name=None):
    subject_all_path = os.path.join(csv_path, csv_name)
    data = load_data_csv(subject_all_path)
    node1_data = data[data[1].isin(node_times[0])]
    node2_data = data[data[1].isin(node_times[1])]
    node1_data.loc[:, 1] = 1
    node2_data.loc[:, 1] = 0
    node_all_data = pd.concat([node1_data, node2_data])
    node1_name = str(node_times[0][0]) if len(node_times[0])==1 else '-'.join([str(time) for time in node_times[0]])
    node2_name = str(node_times[1][0]) if len(node_times[1])==1 else '-'.join([str(time) for time in node_times[1]])
    cls_unite_csv_path = os.path.join(csv_path, tabel_id, 'cls_' + node1_name + '_' + node2_name, 'train.csv')
    check_and_makedirs(cls_unite_csv_path)
    #node_all_data.to_csv(cls_unite_csv_path, index=False, header=None, encoding='utf-8')
    save_data_csv(node_all_data, cls_unite_csv_path)
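# Hedged sketch of how node_times is structured for the csv builders above:
# node_times[0] holds the node_time values mapped to label 1 and node_times[1]
# those mapped to label 0. The directory, csv name and tabel_id are hypothetical.
def _example_create_train_csv():
    node_times = [[2047], [1016]]   # positive class vs. negative class
    create_cls_train_all_csv(node_times,
                             tabel_id='08',
                             csv_path='./cls_train/data/train_data/example_subject_all_csv',
                             csv_name='subject_all.csv')
    # With the node_times above the result is written to <csv_path>/08/cls_2047_1016/train.csv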
def create_cls_train_csv(node_times, node2=None, csv_path=None, csv_name=None, tabel_id='',
                         node1_end=False, node2_end=False, min=0):
    """
    Find the data of the specified classes in subject_all.csv and generate the csv file for the dataset.

    Parameters:
        node_times: node_times[0] is the first class of the binary classification; all other values form the other class
    """
    # Path of the subject_all.csv file
    subject_all_path = os.path.join(csv_path, csv_name)
    data = load_data_csv(subject_all_path)
    data = data.sample(frac=1, replace=False)
    """indices = data.index.to_numpy()
    indices = np.random.shuffle(indices)
    data = data.reindex(indices).reset_index(drop=True)"""
    # Use the class with fewer samples as the size standard for the training data
    node1_data = data[data[1].isin(node_times[0])]
    node2_data = data[data[1].isin(node_times[1])]
    indices1, indices2 = node1_data.index.to_numpy(), node2_data.index.to_numpy()
    np.random.shuffle(indices1)
    np.random.shuffle(indices2)
    node1_data = node1_data.reindex(indices1).reset_index(drop=True)
    node2_data = node2_data.reindex(indices2).reset_index(drop=True)
    if len(node1_data) < len(node2_data):
        min = len(node1_data)
        max_len = len(node2_data)
        min_is_first = True
    else:
        min = len(node2_data)
        max_len = len(node1_data)
        min_is_first = False
    # Split the larger class into chunks of size min
    for i in range(int(max_len/min) + 1):
        only_one_data = node1_data if min_is_first else node2_data
        if i == int(max_len/min):
            # The last chunk takes all remaining samples of the larger class
            second_data = node1_data[i*min:] if not min_is_first else node2_data[i*min:]
        else:
            second_data = node1_data[i*min:(i+1)*min] if not min_is_first else node2_data[i*min:(i+1)*min]
        only_one_data.loc[:, 1] = 1 if min_is_first else 0
        second_data.loc[:, 1] = 0 if min_is_first else 1
        node_all_data = pd.concat([only_one_data, second_data])
        # Save the data
        node1_name = str(node_times[0][0]) if len(node_times[0])==1 else '-'.join([str(time) for time in node_times[0]])
        node2_name = str(node_times[1][0]) if len(node_times[1])==1 else '-'.join([str(time) for time in node_times[1]])
        cls_unite_csv_path = os.path.join(csv_path, tabel_id, 'cls_' + node1_name + '_' + node2_name,
                                          'cls_' + node1_name + '_' + node2_name + '_' + str(i+1), 'train.csv')
        check_and_makedirs(cls_unite_csv_path)
        #node_all_data.to_csv(cls_unite_csv_path, index=False, header=None, encoding='utf-8')
        save_data_csv(node_all_data, cls_unite_csv_path)
    """node1_data = node1_data.iloc[9*min :9*min+min]
    #node2_data = node2_data.head(min) if node2_end is False else node2_data.tail(min)
    node1_data = node1_data.head(min) if node1_end is False else node1_data.tail(min)
    #data = data[data[1].isin(node_times)]
    print('Total number of samples:', len(node1_data)+len(node2_data))
    # Randomly sample 0.5 of the data
    #data = data.sample(frac=0.5)
    #print('Number of samples after sampling: ', len(data))
    # Treat the first class in node_times as the positive class
    node1_data.loc[:, 1] = 1
    node2_data.loc[:, 1] = 0
    node_all_data = pd.concat([node1_data, node2_data])
    print("Number of positive samples:", len(node1_data))
    print("Number of negative samples:", len(node2_data))
    # Save the data
    if len(node_times) == 2:
        node2 = node_times[1]
    cls_unite_csv_path = os.path.join(csv_path, 'cls_' + str(node_times[0]) + '_' + str(node2),
                                      'cls_' + str(node_times[0]) + '_' + str(node2), 'train.csv')
    check_and_makedirs(cls_unite_csv_path)
    #node_all_data.to_csv(cls_unite_csv_path, index=False, header=None, encoding='utf-8')
    save_data_csv(node_all_data, cls_unite_csv_path)"""


def get_data_from_file(npy_path, subject_id):
    """
    Load one file from the given path.
    """
    subject_npy_path = os.path.join(npy_path, str(subject_id))
    #print(subject_npy_path)
    data = load_npy_to_data(subject_npy_path)
    return data
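# Worked sketch of the chunking arithmetic used in create_cls_train_csv above;
# the sample counts are illustrative only.
def _example_chunk_count(n_small=100, n_large=350):
    # create_cls_train_csv keeps the smaller class fixed and pairs it with
    # min-sized slices of the larger class, so int(max_len / min) + 1 csv files
    # are produced, the last one holding the remainder.
    min_len, max_len = min(n_small, n_large), max(n_small, n_large)
    return int(max_len / min_len) + 1   # 4 for 100 vs. 350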
def get_data_and_label(npy_path, subject_id, is_2d=False, augment=False, permute=False):
    data = get_data_from_file(npy_path, subject_id)
    data = torch.from_numpy(data)
    # Randomly translate (shift) the data; empty regions are filled with -1000
    lower_bound, upper_bound = -10, 10
    shift_x, shift_y = random.randint(lower_bound, upper_bound), random.randint(lower_bound, upper_bound)
    new_array = np.full(data.shape, -1000, dtype=float)
    start_x, start_y = max(0, shift_x), max(0, shift_y)
    end_x, end_y = min(256, 256 + shift_x), min(256, 256 + shift_y)
    #print(start_x, start_y, end_x, end_y)
    new_array[:, start_x:end_x, start_y:end_y] = data[:, max(0, -shift_x):min(256, 256-shift_x), max(0, -shift_y):min(256, 256-shift_y)]
    data = new_array
    #print(data.shape)
    if is_2d:
        # Convert back to a tensor so that unsqueeze and the torchvision transforms below work
        data = torch.from_numpy(data).unsqueeze(0)
        result_data = data
        # Randomly flip/translate the data here
        transform_data = transforms.Compose([transforms.RandomVerticalFlip(),
                                             transforms.RandomHorizontalFlip(),
                                             transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), fill=-1000),
                                             transforms.RandomAffine(degrees=0, translate=(0.05, 0.05), fill=-1000)])
        for _ in range(7):
            transforms_new_data = transform_data(data)
            result_data = torch.cat((result_data, transforms_new_data), dim=0)
        return result_data.numpy()
    else:
        if augment:
            #data = augment_data(data)
            data = data[np.newaxis]
        if permute:
            """if data.shape[-3] != data.shape[-2] or data.shape[-2] != data.shape[-1]:
                raise ValueError('To utilize permutations, data array must be in 3d cube shape with all dimensions having '
                                 'the same length')"""
            data = random_permutation_data(data)
        #data = data.transpose(0, 3, 1, 2)
        #data = data[:, 104:152, :, :]
        #print(data.shape)
        return data


def save_model_ckpt(ckpt_path, ckpt_name, new_ckpt_name, model):
    """
    Extract the parameters saved from a previously trained model into a new parameter file.
    Every parameter name of the model must have a corresponding key in the parameter file.
    """
    pretrain_ckpt_path = os.path.join(ckpt_path, ckpt_name)
    new_ckpt_path = os.path.join(ckpt_path, new_ckpt_name)
    # Load the ckpt file
    model_param = torch.load(pretrain_ckpt_path)
    new_model_param = dict()
    # Create a collections.OrderedDict() to hold the new parameters
    state_dict = collections.OrderedDict()
    for name, param in model.named_parameters():
        if name not in model_param['state_dict']:
            raise ValueError('Ensure that the model structure is consistent with the parameter file model structure')
        state_dict[name] = model_param['state_dict'][name]
    # Write the data out
    new_model_param['state_dict'] = state_dict
    #print(new_model_param['state_dict']['diff_classifier.weight'])
    torch.save(new_model_param, new_ckpt_path)


# Plot the recorded losses as an image at the given path after training finishes
def save_summary_data(summary_trains=None, summary_valids=None, result_img_path=None):
    # First make sure the folder exists
    check_and_makedirs(result_img_path)
    plt.figure()
    plt.xlim(1, len(summary_trains))
    plt.ylim(0, max(summary_trains))
    plt.xlabel('epoch')
    plt.ylabel('loss')
    x = np.arange(1, len(summary_trains)+1, 1)
    plt.plot(x, summary_trains, 'b--', label='train loss')
    if summary_valids is not None:
        plt.plot(x, summary_valids, 'r-', label='valid loss')
    plt.legend()
    plt.savefig(result_img_path, dpi=120)
    #plt.show()


# This function is not used
# Rename the parameters of a trained model to the corresponding names
def save_new_ckpt(ckpt_path, ckpt_name, new_ckpt_name):
    pretrain_ckpt_path = os.path.join(ckpt_path, ckpt_name)
    new_ckpt_path = os.path.join(ckpt_path, new_ckpt_name)
    # Load the parameter file to be processed
    model_param = torch.load(pretrain_ckpt_path)
    new_model_param = dict()
    # Create a collections.OrderedDict() to hold the new parameters
    state_dict = collections.OrderedDict()
    for key in model_param['state_dict'].keys():
        # Rename the key
        name = key.replace('module.', '')
        state_dict[name] = model_param['state_dict'][key]
        print(name)
    new_model_param['state_dict'] = state_dict
    torch.save(new_model_param, new_ckpt_path)


# Read a csv file and collect the label_ids whose label is 1 or 0
def get_csv_all_label_ids_bylabel(csv_path, node_times, label=1):
    node_times = [str(node_time) for node_time in node_times]
    all_datas = load_data_csv(csv_path)
    all_datas_label = all_datas[all_datas[1] == label][0].tolist()
    all_nodes_list = [x.split('/')[0].split('_')[1] for x in all_datas_label]
    all_label_ids_list = [x.split('/')[1].split('.')[0] for x in all_datas_label]
    result_label_ids = []
    for index in range(len(all_nodes_list)):
        if all_nodes_list[index] in node_times:
            result_label_ids.append(int(all_label_ids_list[index]))
    return result_label_ids


#-------------------------------------------------------------------
# Test functions
def test_load_data():
    npy_path = './cls_train/data/train_data/plus_0815/npy_data'
    subject_id = 'cls_1016/2268_49.npy'
    data = get_data_and_label(npy_path=npy_path, subject_id=subject_id)
    print(data.shape)


# Process a trained parameter file: keep what the model needs and delete the rest
def test_save_ckpt():
    ckpt_path = './cls_train/best_cls'
    ckpt_name = 'train_test.ckpt'
    new_ckpt_name = 'test.ckpt'
    #save_model_ckpt(ckpt_path=ckpt_path, ckpt_name=ckpt_name, new_ckpt_name=new_ckpt_name, model=model)
    save_new_ckpt(ckpt_path=ckpt_path, ckpt_name=ckpt_name, new_ckpt_name=new_ckpt_name)
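# Hedged usage sketch for save_summary_data above: the loss values and the
# output path are made up for illustration.
def _example_save_summary():
    train_losses = [0.9, 0.7, 0.55, 0.48, 0.45]
    valid_losses = [0.95, 0.8, 0.65, 0.6, 0.58]
    save_summary_data(summary_trains=train_losses,
                      summary_valids=valid_losses,
                      result_img_path='./cls_train/result/example_loss_curve.png')   # hypothetical path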
# Read a ckpt file and inspect its parameters
def read_ckpt():
    ckpt_path = './cls_train/best_cls'
    ckpt_name = 'train_test.ckpt'
    pretrain_ckpt_path = os.path.join(ckpt_path, ckpt_name)
    model_param = torch.load(pretrain_ckpt_path)
    #print(type(model_param['state_dict']))
    for key in model_param['state_dict'].keys():
        print(model_param['state_dict'][key].shape)
    #print(model_param['state_dict']['diff_classifier.weight'])


# Delete the data with the given node_time from subject_all.csv
def delete_node_csv(node_time):
    csv_path = os.path.join("./cls_train/data/train_data/plus_0617", "subject_all_csv", "subject_all.csv")
    data = load_data_csv(csv_path)
    data = data[data[1] != node_time]
    save_data_csv(data, csv_path)


if __name__ == '__main__':
    #delete_node_csv(node_time=2046)
    """csv_path = '/home/lung/project/ai-project/cls_train/data/train_data/plus_3d_0818/subject_all_csv/test/cls_1_5001-6001/train.csv'
    train_all_id = train_all_label_id(csv_path)
    print(train_all_id)"""
    #test_load_data()
    """npy_path = '/home/lung/project/ai-project/cls_train/data/train_data/plus_3d_0818/npy_data'
    subject_id = 'cls_2047/432.npy'
    data = get_data_and_label(npy_path, subject_id, is_2d=False, augment=True, permute=True)
    data = data[0, 29].astype(np.float32)
    print(data)
    plt.imshow(data, cmap='gray')
    plt.show()"""
    csv_path = '/home/lung/project/ai-project/cls_train/data/train_data/plus_3d_0818/subject_all_csv/08/cls_234567_1016/train.csv'
    node_times = [2041]
    label = 1
    result = get_csv_all_label_ids_bylabel(csv_path, node_times, label)
    print(result)