"""Prediction / validation driver for the lung-nodule classification models.

Reads DICOM series referenced by the project database, crops the labelled
nodule region, runs the 2D or 3D torch classifier over it, and logs
per-slice / per-nodule results plus aggregate accuracy statistics.
"""
import os
import sys
import argparse
import logging
import numpy as np
import torch
import time
import re
import copy
import cv2
import random
from matplotlib import pyplot as plt

sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../')

from data.db import read_series_dicom, process_single_ct
from pytorch_train.torch_model import TorchModel
from pytorch_train.torch_model_2d import TorchModel_2d
from cls_utils.data_utils import crop_ct_data, get_crop_data_padding, get_crop_data_2d
from cls_utils.utils import hu_value_to_uint8, normalize, base64_to_list, hu_normalize
from data.db import select_signal_series, select_series_by_node_time, get_all_contours_by_labelId, extract_error_label
from cls_utils.data import load_json, train_all_label_id, get_csv_all_label_ids_bylabel
from cls_utils.augement import generate_general_permute_keys, generate_general_indexs, permute_data, augment_data

cfg = load_json("/df_lung/ai-project/cls_train/config/predict.json")

parser = argparse.ArgumentParser(description='predict data')
parser.add_argument('--GPU', default='0', type=str, help='GPU index')


def compute_accuracy(log_path, threshold=0.5, positive=False, train=True):
    """Scan a validation log and report the accuracy of the summary results.

    Each nodule's final log line contains ``result: [x]``.  A positive sample
    counts as an error when x < threshold, a negative one when x > threshold.

    :param log_path: path of the validation log to scan
    :param threshold: decision threshold applied to the logged mean result
    :param positive: True when the log holds positive-class samples
    :param train: True for the training split (affects the log label only)
    """
    with open(log_path, 'r') as f:
        log_contents = f.readlines()
    error_num = 0
    sum_num = 0
    # Each nodule's averaged score is logged once as "result: [0.123]".
    for line in log_contents:
        match = re.search(r"result: \[(.*?)\]\n", line)
        if match:
            sum_num += 1
            result = float(match.group(1))
            if positive:
                if result < threshold:
                    error_num += 1
            else:
                if result > threshold:
                    error_num += 1
    if sum_num == 0:
        # Guard the accuracy division below: an empty log has no statistics.
        logging.info("no 'result: [...]' lines found in %s", log_path)
        return
    label = '正类' if positive else '负类'
    train_or_test = '训练' if train else '测试'
    logging.info("{}{}, 阈值:{}, 总个数: {:3}, 出错个数: {:3}, summary result 准确率: {:5}%".format(
        train_or_test, label, round(threshold, 2), sum_num, error_num,
        round(1 - error_num / sum_num, 4) * 100))


def load_all_pretrain_ckpt(base):
    """Collect every pretrain checkpoint below *base*, ordered by fold index.

    Example: with base='./cls_train/best_cls_0704/cls_1010_2046' the files in
    .../cls_1010_2046_1/, .../cls_1010_2046_2/, ... are returned ordered by
    the trailing ``_N`` suffix of their directory.
    """
    all_cakpt_path = {}
    result = []
    for root, dirs, names in os.walk(base):
        for name in names:
            # The fold index is the number after the last '_' in the dir name.
            index = root.split('/')[-1].split('_')[-1]
            path = os.path.join(root, name)
            all_cakpt_path[int(index)] = path
    for key in sorted(all_cakpt_path):
        result.append(all_cakpt_path[key])
    return result


def predict_all_series(model, folder_name, select_box=None, label_id=None, seg=False, is_2d=False, threshold=0):
    """Treat every slice containing the nodule as a centre slice and predict.

    seg=True keeps only the segmented nodule pixels (everything else is
    filled with -1000 HU); is_2d=True prepares data for the 2D model;
    *threshold* is the fraction of boundary slices dropped from both ends.
    All per-slice crops are batched into a single forward pass and the
    outputs are averaged.

    :return: (patient_id, mean_result) where mean_result is a scalar.
    """
    dicom_folder = os.path.join(cfg['dicom_folder'], folder_name)
    ct_data = read_series_dicom(dicom_folder=dicom_folder)
    patient_id = folder_name.split('-')[0]
    z_min = int(select_box[0, 0])
    z_max = int(select_box[0, 1])
    contours = get_all_contours_by_labelId(label_id) if seg else None
    # Fix: origin_data_2d used to be assigned only inside the seg branch, so
    # seg=False together with is_2d=True raised NameError.
    origin_data_2d = ct_data.get_raw_image()
    if contours is not None:
        data = ct_data.get_raw_image()
        img_np = np.zeros((data.shape[0], data.shape[1], data.shape[2]))
        for i in range(z_max - z_min + 1):
            _, _, img = base64_to_list(contours[i])
            img_np[z_min + i] = img
        # Fill everything outside the segmented contour with air (-1000 HU).
        data[img_np == 0] = -1000
        # NOTE(review): this assigns an attribute instead of calling
        # set_raw_image(data); it appears to work only because the raw array
        # was modified in place above — confirm against the CT data class.
        ct_data.set_raw_image = data
        origin_data_2d = data
    # Number of slices discarded at each end of the nodule's z-range.
    eliminate_num = int((threshold * (z_max - z_min + 1)) / 2)
    # Seed tensor so torch.cat can extend it; the dummy first row is sliced off.
    in_data = torch.zeros((1, 8, 256, 256))
    for z_index in range(z_min + eliminate_num, z_max - eliminate_num + 1):
        temp_select_box = copy.deepcopy(select_box)
        temp_select_box[0, 0], temp_select_box[0, 1] = z_index, z_index
        if is_2d:
            data_2d = origin_data_2d[z_index]
            original_data = get_crop_data_2d(data=data_2d, select_box=temp_select_box,
                                             crop_size=cfg['train_crop_size_2d'])
        else:
            original_data = get_crop_data_padding(ct_data=ct_data, select_box=temp_select_box,
                                                  crop_size=cfg['train_crop_size'])
        data = hu_normalize(original_data)
        # Replicate the single slice across the 8 input channels.
        data = np.tile(data, (1, 8, 1, 1))
        data = torch.from_numpy(data).type(torch.float32)
        in_data = torch.cat((in_data, data), dim=0)
    # One batched forward pass over all centre slices, then average.
    result = model.predict(in_data[1:])
    logging.info('time: {}, patiend_id: {}, result: {}'
                 .format(time.strftime('%Y-%m-%d %H:%M:%S'), patient_id, result))
    result = np.mean(result)
    return patient_id, result


def predict_all_series_orignal(model, folder_name, select_box=None, label_id=None, seg=False, is_2d=False, threshold=0):
    """Per-slice variant of predict_all_series.

    Runs the model once per centre slice (instead of one batched call), logs
    every slice result, and returns (patient_id, mean_result) where
    mean_result keeps the model's output shape (e.g. a (1, 1) tensor).
    """
    dicom_folder = os.path.join(cfg['dicom_folder'], folder_name)
    ct_data = read_series_dicom(dicom_folder=dicom_folder)
    patient_id = folder_name.split('-')[0]
    z_min = int(select_box[0, 0])
    z_max = int(select_box[0, 1])
    contours = get_all_contours_by_labelId(label_id) if seg else None
    # Fix: origin_data_2d used to be assigned only inside the seg branch, so
    # seg=False together with is_2d=True raised NameError.
    origin_data_2d = ct_data.get_raw_image()
    if contours is not None:
        data = ct_data.get_raw_image()
        img_np = np.zeros((data.shape[0], data.shape[1], data.shape[2]))
        for i in range(z_max - z_min + 1):
            _, _, img = base64_to_list(contours[i])
            img_np[z_min + i] = img
        data[img_np == 0] = -1000
        # NOTE(review): attribute assignment, not a set_raw_image(data) call;
        # see predict_all_series.
        ct_data.set_raw_image = data
        origin_data_2d = data
    sum_result = torch.zeros(1, 1)
    eliminate_num = int((threshold * (z_max - z_min + 1)) / 2)
    for z_index in range(z_min + eliminate_num, z_max - eliminate_num + 1):
        temp_select_box = copy.deepcopy(select_box)
        temp_select_box[0, 0], temp_select_box[0, 1] = z_index, z_index
        if is_2d:
            data_2d = origin_data_2d[z_index]
            original_data = get_crop_data_2d(data=data_2d, select_box=temp_select_box,
                                             crop_size=cfg['train_crop_size_2d'])
        else:
            original_data = get_crop_data_padding(ct_data=ct_data, select_box=temp_select_box,
                                                  crop_size=cfg['train_crop_size'])
        data = hu_normalize(original_data)
        data = np.tile(data, (1, 8, 1, 1))
        data = torch.from_numpy(data).type(torch.float32)
        result = model.predict(data)
        sum_result = sum_result + result
        logging.info('time: {}, patiend_id: {}, z_index: {}, result: {}'
                     .format(time.strftime('%Y-%m-%d %H:%M:%S'), patient_id, z_index, result))
    mean_result = sum_result / (z_max - z_min + 1 - 2 * eliminate_num)
    return patient_id, mean_result


def predict_3d(model, label_id, folder_name, select_box=None, seg=False):
    """Run the 3D model on one series; seg=True masks out non-nodule voxels."""
    dicom_folder = os.path.join(cfg['dicom_folder'], folder_name)
    ct_data = read_series_dicom(dicom_folder=dicom_folder)
    z_min = int(select_box[0, 0])
    z_max = int(select_box[0, 1])
    contours = get_all_contours_by_labelId(label_id) if seg else None
    if contours is not None:
        data = ct_data.get_raw_image()
        img_np = np.zeros((data.shape[0], data.shape[1], data.shape[2]))
        for i in range(z_max - z_min + 1):
            _, _, img = base64_to_list(contours[i])
            img_np[z_min + i] = img
        data[img_np == 0] = -1000
        # NOTE(review): attribute assignment, see predict_all_series.
        ct_data.set_raw_image = data
    result = predict(model, ct_data, select_box)
    return result


def predict(model, ct_data, select_box=None):
    """Crop the CT volume around *select_box*, run test-time-augmented
    predictions with the 3D model, and return the averaged result.

    A single-slice box uses the padding crop; a multi-slice box uses the
    direct crop.  The crop target size comes from cfg['train_crop_size'].
    """
    if select_box[0, 0] == select_box[0, 1]:
        original_data = get_crop_data_padding(ct_data=ct_data, select_box=select_box,
                                              crop_size=cfg['train_crop_size'])
    else:
        print("预测")
        original_data = crop_ct_data(ct_data=ct_data, select_box=select_box,
                                     crop_size=cfg['train_crop_size'])
    original_data = augment_data(original_data)
    original_data = original_data[np.newaxis]
    keys = generate_general_permute_keys()
    indexes_list = generate_general_indexs()
    predict_num = len(indexes_list)
    batch_size = 1
    n_channels = 1
    result = None
    for start_size in range(0, predict_num, batch_size):
        length = min(batch_size, predict_num - start_size)
        cnn_datas = []
        for i in range(length):
            indexes = indexes_list[start_size + i]
            cnn_data = []
            for j in indexes[:n_channels]:
                data = permute_data(original_data, keys[j])
                data = data.transpose(0, 3, 1, 2)
                # Keep the central 48 slices along the depth axis.
                data = data[:, 104:152, :, :]
                cnn_data.append(data)
            cnn_datas.append(cnn_data)
        cnn_datas = cnn_datas[0]
        cnn_datas = np.array(cnn_datas, np.float32)
        cnn_datas = hu_normalize(cnn_datas)
        cnn_datas = torch.from_numpy(cnn_datas)
        cnn_datas = cnn_datas.to('cuda')
        temp_result = model.predict(cnn_datas)
        temp_result = torch.from_numpy(temp_result)
        # Fix: was `result == None`, which on a torch tensor is an elementwise
        # comparison rather than an identity test.
        if result is None:
            result = temp_result
        else:
            result = torch.cat((result, temp_result), 0)
    result = torch.mean(result, dim=0).numpy()
    return result


def predict_all_train_data(models, node_time, mode=None, start_label_id=0, end_label_id=0, seg=False, is_2d=False, threshold=0):
    """Predict every series recorded for *node_time*.

    mode=None predicts each bounding-box slice with every model in *models*
    and logs the per-model results; any other mode delegates to predict().
    """
    folder_names, select_boxs, label_ids, patient_ids, series_instance_uids = select_series_by_node_time(node_time)
    select_boxs = np.array(select_boxs)
    error_num = 0
    logging.info('time: {}, node_time : {} 训练集测试结果展示:\n'
                 .format(time.strftime('%Y-%m-%d %H:%M:%S'), node_time))
    print(len(folder_names))
    for index in range(len(folder_names)):
        print(index)
        # NOTE(review): 7764 is a hard-coded upper label_id bound — confirm.
        if label_ids[index] > start_label_id and label_ids[index] < 7764:
            if mode is None:
                folder_name = str(patient_ids[index]) + '-' + str(series_instance_uids[index])
                sum_result = 0
                for i in range(len(models)):
                    patient_id, mean_result = predict_all_series_orignal(
                        models[i], folder_name, select_boxs[index], label_id=label_ids[index],
                        seg=seg, is_2d=is_2d, threshold=threshold)
                    logging.info(
                        'time: {}, patiend_id: {}, label_id: {}, model_{},result: {}'
                        .format(time.strftime('%Y-%m-%d %H:%M:%S'), patient_id,
                                label_ids[index], i + 1, mean_result))
            else:
                result = predict(models, folder_names[index], select_boxs[index])
                print(folder_names[index].split('-')[0], ' : ', result[0, 0])
    logging.info(
        'time: {}, 预测总个数: {}, 出错个数: {}, 正确率:{}%'
        .format(time.strftime('%Y-%m-%d %H:%M:%S'), len(label_ids), error_num,
                (1 - round(error_num / len(label_ids), 4)) * 100))


def predict_all_train_data_orignal(models, node_time, mode=None, start_label_id=0, end_label_id=0, seg=False, is_2d=False, threshold=0):
    """Variant of predict_all_train_data that averages the per-model results
    and counts a summary result >= 0.5 as an error (negative-class run).
    """
    folder_names, select_boxs, label_ids, patient_ids, series_instance_uids = select_series_by_node_time(node_time)
    select_boxs = np.array(select_boxs)
    error_num = 0
    logging.info('time: {}, node_time : {} 训练集测试结果展示:\n'
                 .format(time.strftime('%Y-%m-%d %H:%M:%S'), node_time))
    print(len(folder_names))
    for index in range(len(folder_names)):
        print(index)
        if label_ids[index] > start_label_id and label_ids[index] < 7764:
            if mode is None:
                folder_name = str(patient_ids[index]) + '-' + str(series_instance_uids[index])
                sum_result = 0
                for i in range(len(models)):
                    # NOTE(review): predict_all_series returns a scalar mean,
                    # so mean_result[0, 0] looks like it would fail — confirm
                    # which predict_all_series* variant is intended here.
                    patient_id, mean_result = predict_all_series(
                        models[i], folder_name, select_boxs[index], label_id=label_ids[index],
                        seg=seg, is_2d=is_2d, threshold=threshold)
                    sum_result += mean_result[0, 0]
                    logging.info(
                        'time: {}, patiend_id: {}, label_id: {}, model_{},result: {}'
                        .format(time.strftime('%Y-%m-%d %H:%M:%S'), patient_id,
                                label_ids[index], i + 1, mean_result[0, 0]))
                summary_result = sum_result / len(models)
                logging.info(
                    'time: {}, patiend_id: {}, label_id: {}, summary result: {}\n'
                    .format(time.strftime('%Y-%m-%d %H:%M:%S'), patient_id,
                            label_ids[int(index)], summary_result))
                if summary_result >= 0.5:
                    error_num += 1
            else:
                result = predict(models, folder_names[index], select_boxs[index])
                print(folder_names[index].split('-')[0], ' : ', result[0, 0])
    logging.info(
        'time: {}, 预测总个数: {}, 出错个数: {}, 正确率:{}%'
        .format(time.strftime('%Y-%m-%d %H:%M:%S'), len(label_ids), error_num,
                (1 - round(error_num / len(label_ids), 4)) * 100))


def predict_by_label_id(model, label_id, mode=None, seg=False):
    """Predict a single nodule identified by *label_id*.

    mode=None runs the per-slice path and returns None; any other mode runs
    the 3D model and returns result[0].
    """
    folder_name, select_box, patient_id, series_instance_uid = select_signal_series(label_id=label_id)
    folder_name = str(patient_id) + '-' + str(series_instance_uid)
    select_box = np.array(select_box)
    if mode is None:
        predict_all_series(model, folder_name, select_box, label_id=label_id, seg=seg)
    else:
        result = predict_3d(model, label_id=label_id, folder_name=folder_name,
                            select_box=select_box, seg=True)
        logging.info('time: {}, patiend_id: {}, label_id: {}, result: {}\n'
                     .format(time.strftime('%Y-%m-%d %H:%M:%S'), patient_id, label_id, result))
        return result[0]


def _validate(model, label_ids, index):
    """Predict every label_id with the 3D model and log the split accuracy.

    index 0 is the negative class (error when result > 0.5); index 1 is the
    positive class (error when result < 0.5).
    """
    # Renamed from `sum`, which shadowed the builtin.
    total = 0
    error = 0
    for label_id in label_ids:
        total += 1
        result = predict_by_label_id(model=model, label_id=label_id, mode='3d', seg=True)
        if (index == 0 and result > 0.5) or (index == 1 and result < 0.5):
            error += 1
    logging.info('time: {}, 总数: {}, 出错个数: {}, 正确率: {}%\n'
                 .format(time.strftime('%Y-%m-%d %H:%M:%S'), total, error,
                         (1 - round(error / total, 4)) * 100))


def run(args, is_2d=False, threshold=0.4):
    """Validate either the 2D model ensemble or the single 3D model on the
    node_times configured in the predict config.
    """
    models = []
    if is_2d:
        pretrain_ckpt_list = load_all_pretrain_ckpt(cfg['pretrain_folder'])
        for i in range(len(pretrain_ckpt_list)):
            model = TorchModel_2d(pretrain_ckpt_list[i], args.GPU)
            models.append(model)
        node_times = [1016]
        for node_time in node_times:
            predict_all_train_data(models, node_time=node_time, mode=None,
                                   start_label_id=7762, seg=True, is_2d=is_2d, threshold=threshold)
    else:
        model = TorchModel(cfg['pretrain_ckpt'], args.GPU)
        # Convention: node_times[0] is the negative class, node_times[1] the
        # positive class; `index` doubles as the class label.
        node_times = cfg['node_times']
        csv_path = cfg['csv_path']
        for index in range(len(node_times)):
            node_list = node_times[index]
            # label_ids used for training on this node_list ...
            all_train_label_ids = get_csv_all_label_ids_bylabel(csv_path, node_list, label=index)
            # ... and every remaining label_id (above 432) forms the test set.
            all_label_ids = []
            for node_time in node_list:
                _, _, label_ids, _, _ = select_series_by_node_time(node_time)
                all_label_ids += label_ids
            all_test_label_ids = [label_id for label_id in all_label_ids
                                  if label_id not in all_train_label_ids and label_id > 432]
            logging.info('类别:{} 训练数据集------------------------------------\n'.format(index))
            _validate(model, all_train_label_ids, index)
            logging.info('类别:{} 测试数据集------------------------------------\n'.format(index))
            _validate(model, all_test_label_ids, index)


def main():
    """Run validation, appending results to the configured log file."""
    logging.basicConfig(level=logging.INFO, filename=cfg['validation_filename'], filemode='a')
    args = parser.parse_args()
    run(args, is_2d=False)


def summary_acc():
    """Sweep thresholds over the four validation logs and record accuracies."""
    logging.basicConfig(level=logging.INFO, filename=cfg['compute_accuracy_filename'], filemode='a')
    validation_folder = '/home/lung/ai-project/cls_train/log/validation/cls_234567_2031/20240815'
    result_1_train_log_path = os.path.join(validation_folder, '1_train.log')
    result_1_test_log_path = os.path.join(validation_folder, '1_test.log')
    result_0_train_log_path = os.path.join(validation_folder, '0_train.log')
    result_0_test_log_path = os.path.join(validation_folder, '0_test.log')
    for threshold in np.arange(0.4, 0.9, 0.01):
        logging.info("------------------------------------------------------------------------")
        compute_accuracy(result_1_train_log_path, threshold, positive=True, train=True)
        compute_accuracy(result_0_train_log_path, threshold, positive=False, train=True)
        compute_accuracy(result_1_test_log_path, threshold, positive=True, train=False)
        compute_accuracy(result_0_test_log_path, threshold, positive=False, train=False)


if __name__ == '__main__':
    # Switch the entry point here: main() runs validation, summary_acc()
    # post-processes the resulting logs.
    #main()
    summary_acc()