import sys
import os
import pathlib

current_dir = pathlib.Path(__file__).parent.resolve()
while "cls_train" != current_dir.name:
    current_dir = current_dir.parent
sys.path.append(current_dir.as_posix())

from cls_utils.log_utils import get_logger
from sqlalchemy import create_engine, and_
from sqlalchemy.orm import sessionmaker, scoped_session
import numpy as np
import argparse
import threading
from tqdm import tqdm
import pandas as pd
from datetime import datetime
import json
import re
from pathlib import Path
import scipy
from joblib import Parallel, delayed
from scipy.ndimage import rotate as scipy_rotate
import torch
import torchio as tio
import torch.nn.functional as F
from multiprocessing import Process
from torchvision import transforms as T
import cupy as cp
from cupyx.scipy.ndimage import rotate as cupy_rotate
from data.domain import DicomStudy, PatientInfo, UserLabel, UserLabelDelineation, DicomSeries
from data.data_process_utils.test_sitk_utils import CTSeries, base64_to_list, meta_to_list
from PIL import Image

logger = get_logger(log_file="/df_lung/ai-project/cls_train/log/data/get_db_data_to_npy.log")

# from cls_utils.sitk_utils import CTSeries
# from cls_utils.data_utils import crop_ct_data, get_crop_data_padding, get_crop_data_2d
# from cls_utils.utils import hu_value_to_uint8, normalize, base64_to_list
# from cls_utils.data import save_supplement_data_csv, save_data_to_npy, load_npy_to_data, create_cls_train_csv, \
#     load_all_dicom_file, load_json, create_cls_train_all_csv, create_cls_train_csv_3d, \
#     replace_label_ids, add_label_ids, create_cls_train_last_3d

MYSQL_SERVER = 'mysql+pymysql://lung:lung1qaz2wsx@127.0.0.1:3306/ct_file?charset=utf8'


def conect_mysql():
    """Connect to the database and return a session."""
    engine = create_engine(MYSQL_SERVER, pool_recycle=3600)
    # connection = engine.connect()
    db_session = sessionmaker(bind=engine)
    session = scoped_session(db_session)
    return session


def get_cts(dicom_path=None):
    cts = CTSeries()
    cts.load_dicoms(dicom_path)
    return cts


def rotate_dicom_scipy(data, num_rotations=10):
    angle = 360 / num_rotations
    rotated_data = []
    for i in range(num_rotations):
        rotated = scipy_rotate(data, angle * (i + 1), axes=(1, 2), reshape=False, order=3)
        rotated_data.append(rotated)
    return np.stack(rotated_data)

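# Illustrative sketch (not part of the original pipeline, values are hypothetical):
# rotate_dicom_scipy() rotates the whole volume in the H-W plane once per 360/num_rotations
# degrees and stacks the results along a new leading axis.
def _demo_rotate_dicom_scipy():
    volume = np.random.uniform(-1000, 800, size=(4, 32, 32)).astype(np.float32)  # hypothetical (D, H, W) volume
    rotated = rotate_dicom_scipy(volume, num_rotations=10)
    assert rotated.shape == (10, 4, 32, 32)  # (num_rotations, D, H, W)
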
def rotate_dicom_torch(data, num_rotations=10):
    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
    angle = 360 / num_rotations
    # to a 5D tensor [N, C, D, H, W]
    data_tensor = torch.tensor(data, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)

    # Build the rotation matrix and the sampling grid
    def build_rotation_grid(shape, angle_deg):
        angle_rad = torch.tensor(angle_deg * np.pi / 180, dtype=torch.float32, device=device)
        cos_a = torch.cos(angle_rad)
        sin_a = torch.sin(angle_rad)
        # 3D rotation matrix (rotation in the H-W plane, z unchanged)
        rotation_matrix = torch.tensor([
            [cos_a, -sin_a, 0, 0],
            [sin_a, cos_a, 0, 0],
            [0, 0, 1, 0],
        ], dtype=torch.float32, device=device)
        d, h, w = shape
        z, y, x = torch.meshgrid(
            torch.linspace(-1, 1, d, device=device),
            torch.linspace(-1, 1, h, device=device),
            torch.linspace(-1, 1, w, device=device),
            indexing='ij'
        )
        coords = torch.stack([x.flatten(), y.flatten(), z.flatten(), torch.ones_like(x.flatten())], dim=0)
        rotated_coords = torch.matmul(rotation_matrix, coords).view(3, d, h, w)
        return rotated_coords.permute(1, 2, 3, 0)

    rotated_data = []
    for i in range(num_rotations):
        current_angle = angle * (i + 1)
        grid = build_rotation_grid(data_tensor.shape[2:], current_angle)
        rotated_volume = F.grid_sample(data_tensor, grid.unsqueeze(0), mode='bilinear', padding_mode='zeros',
                                       align_corners=True)
        rotated_data.append(rotated_volume.squeeze(0).squeeze(0))
        torch.cuda.empty_cache()
    result = torch.stack(rotated_data).cpu().numpy()
    del rotated_data, grid, rotated_volume, data_tensor, device, angle, current_angle, build_rotation_grid
    torch.cuda.empty_cache()
    return result


def rotate_dicom_cupy(data, num_rotations=10):
    '''
    Earlier per-slice variant, kept for reference:
        rotated_data.append(rotated.get())  # move the data back to the CPU
        cp.get_default_memory_pool().free_all_blocks()  # free GPU memory
        return np.stack(rotated_data)
    '''
    data_gpu = cp.asarray(data)
    angle = 360 / num_rotations
    rotated_data = []
    for i in range(num_rotations):
        rotated = cupy_rotate(data_gpu, angle * (i + 1), axes=(1, 2), reshape=False, order=1, prefilter=False)
        rotated_data.append(rotated)
    result = cp.stack(rotated_data).get()
    del rotated_data, rotated, data_gpu, data
    cp.get_default_memory_pool().free_all_blocks()
    return result


def rotate_dicom_data(dicom_data=None, rotate_count=10, logger=None):
    rotate_data = None
    try:
        # pass rotate_count through so every backend honors the requested rotation count
        rotate_data = rotate_dicom_cupy(dicom_data, num_rotations=rotate_count)
        if logger is not None:
            logger.info(f"rotate_dicom_cupy success")
    except Exception:
        if logger is not None:
            logger.error(f"rotate_dicom_cupy error, dicom_data: {dicom_data[0][:10]}")
    if rotate_data is None:
        try:
            rotate_data = rotate_dicom_torch(dicom_data, num_rotations=rotate_count)
            if logger is not None:
                logger.info(f"rotate_dicom_torch success")
        except Exception:
            if logger is not None:
                logger.error(f"next rotate_dicom_torch error, dicom_data: {dicom_data[0][:10]}")
    if rotate_data is None:
        try:
            rotate_data = rotate_dicom_scipy(dicom_data, rotate_count)
            if logger is not None:
                logger.info(f"rotate_dicom_scipy success")
        except Exception:
            if logger is not None:
                logger.error(f"next rotate_dicom_scipy error, dicom_data: {dicom_data[0][:10]}")
    return rotate_data


def get_crop_start_end_index(center, lens):
    center = int(center)
    index_dict = {}
    index_list = [center]
    start = center - 1
    while len(index_list) <= lens // 2:
        index_list = [start] + index_list
        start -= 1
    start = center + 1
    while len(index_list) < lens:
        index_list.append(start)
        start += 1
    for idx, idx_index in enumerate(index_list):
        index_dict[int(idx_index)] = idx
    return index_list, index_dict

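# Illustrative note (hypothetical values, not in the original code): get_crop_start_end_index()
# builds a window of `lens` consecutive indices centered on `center`, plus a mapping from
# original index -> position inside the window. For example:
#     index_list, index_dict = get_crop_start_end_index(10, 5)
#     index_list == [8, 9, 10, 11, 12]
#     index_dict == {8: 0, 9: 1, 10: 2, 11: 3, 12: 4}
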
def extend_crop_data_3d(data, select_box, crop_size=(48, 400, 400), fill_value=-1000,
                        z_expand_flag=True, y_expand_flag=True, x_expand_flag=True,
                        return_updated_box_flag=True):
    """
    Expand and pad the data according to the expand flags, and return the updated select_box.
    Args:
        data: original 3D data, shape (D, H, W).
        select_box: original bounding box.
        crop_size: expanded size.
        fill_value: fill value for the out-of-bounds regions.
    Returns:
        padded_data: expanded data.
        new_select_box: updated bounding box info.
    """
    select_box = select_box.astype(np.int32)
    assert len(data.shape) == 3, "data must be 3D"
    assert len(select_box) == 3, "select_box must contain ranges for 3 axes"
    assert len(crop_size) == 3, "crop_size must specify 3 dimensions"
    z_center = (select_box[0][0] + select_box[0][1]) // 2
    y_center = (select_box[1][0] + select_box[1][1]) // 2
    x_center = (select_box[2][0] + select_box[2][1]) // 2
    crop_z_size, crop_y_size, crop_x_size = crop_size[0], crop_size[1], crop_size[2]
    z_index_list = list(range(select_box[0][0], select_box[0][1]))
    y_index_list = list(range(select_box[1][0], select_box[1][1]))
    x_index_list = list(range(select_box[2][0], select_box[2][1]))
    z_index_dict = dict(zip(z_index_list, z_index_list))
    y_index_dict = dict(zip(y_index_list, y_index_list))
    x_index_dict = dict(zip(x_index_list, x_index_list))
    if z_expand_flag:
        z_index_list, z_index_dict = get_crop_start_end_index(z_center, crop_z_size)
    if y_expand_flag:
        y_index_list, y_index_dict = get_crop_start_end_index(y_center, crop_y_size)
    if x_expand_flag:
        x_index_list, x_index_dict = get_crop_start_end_index(x_center, crop_x_size)
    z_valid_start_index, z_valid_end_index = max(0, z_index_list[0]), min(data.shape[0] - 1, z_index_list[-1])
    y_valid_start_index, y_valid_end_index = max(0, y_index_list[0]), min(data.shape[1] - 1, y_index_list[-1])
    x_valid_start_index, x_valid_end_index = max(0, x_index_list[0]), min(data.shape[2] - 1, x_index_list[-1])
    z_padded_start_index, z_padded_end_index = z_index_dict[z_valid_start_index], z_index_dict[z_valid_end_index]
    y_padded_start_index, y_padded_end_index = y_index_dict[y_valid_start_index], y_index_dict[y_valid_end_index]
    x_padded_start_index, x_padded_end_index = x_index_dict[x_valid_start_index], x_index_dict[x_valid_end_index]
    padded_data = np.full(crop_size, fill_value, dtype=data.dtype)
    padded_data[
        z_padded_start_index:z_padded_end_index + 1,
        y_padded_start_index:y_padded_end_index + 1,
        x_padded_start_index:x_padded_end_index + 1
    ] = data[
        z_valid_start_index:z_valid_end_index + 1,
        y_valid_start_index:y_valid_end_index + 1,
        x_valid_start_index:x_valid_end_index + 1
    ]
    assert np.all(padded_data[crop_size[0] // 2, crop_size[1] // 2, crop_size[2] // 2] ==
                  data[z_center, y_center, x_center]), \
        "center voxel of the cropped region does not match the center voxel of the original data"
    updated_box_info_dict = None
    if return_updated_box_flag:
        updated_box_info_dict = {
            "z_index_list": z_index_list,
            "y_index_list": y_index_list,
            "x_index_list": x_index_list,
            "z_index_dict": z_index_dict,
            "y_index_dict": y_index_dict,
            "x_index_dict": x_index_dict
        }
    return padded_data, updated_box_info_dict


def data_rotation_3d(data, select_box, crop_size=(48, 256, 256), expand=40, num_rotations=10):
    """
    Augment 3D data.
    Args:
        data: original 3D image, shape (D, H, W).
        select_box: initial bounding box of the lung nodule.
        crop_size: crop size.
        expand: expansion margin.
        num_rotations: number of rotations.
    Returns:
        final_data: augmented data.
    """
    first_crop_lens = int(np.sqrt(crop_size[1]**2 + crop_size[2]**2))
    first_crop_expand_lens = first_crop_lens + expand + 1 if (first_crop_lens + expand) % 2 == 0 else first_crop_lens + expand
    first_crop_size = (crop_size[0], first_crop_expand_lens, first_crop_expand_lens)
    logger.info(f"original crop size: {crop_size}, expanded crop size: {first_crop_size}")
    first_extended_data, first_updated_box_info_dict = extend_crop_data_3d(
        data, select_box, crop_size=first_crop_size, fill_value=-1000,
        z_expand_flag=True, y_expand_flag=True, x_expand_flag=True,
        return_updated_box_flag=True
    )
    rotated_data = rotate_dicom_data(dicom_data=first_extended_data, rotate_count=num_rotations, logger=logger)
    logger.info(f"rotated_data: {rotated_data.shape}")
    y_index_lens, x_index_lens = rotated_data.shape[2], rotated_data.shape[3]
    y_center, x_center = y_index_lens // 2, x_index_lens // 2
    y_start, y_end = y_center - crop_size[1] // 2, y_center + crop_size[1] // 2
    x_start, x_end = x_center - crop_size[2] // 2, x_center + crop_size[2] // 2
    result_data = rotated_data[:, :, y_start:y_end, x_start:x_end]
    return result_data, first_updated_box_info_dict


def fill_data_default(input_data, fill_data=0.0001):
    return np.full_like(input_data, fill_data)


def min_max_normalize_2d(input_data, min_value=-1000, max_value=800, fill_value=-1000, fill_data=0.0001):
    '''2D min-max normalization'''
    if np.all(input_data == fill_value):
        return fill_data_default(input_data, fill_data)
    corrected_data = np.copy(input_data)
    corrected_data[corrected_data < min_value] = min_value
    corrected_data[corrected_data > max_value] = max_value
    corrected_data = corrected_data.astype(np.float32)
    actual_min = min_value
    actual_max = max_value
    if actual_min == actual_max:
        return fill_data_default(input_data, fill_data)
    data = (corrected_data - actual_min) / (actual_max - actual_min)
    return data

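# Illustrative sketch (hypothetical values): min_max_normalize_2d() clamps HU values to
# [min_value, max_value] and maps them linearly to [0, 1], so -1000 -> 0.0, 800 -> 1.0,
# and the window midpoint -100 -> 0.5; an all-fill slice is replaced by fill_data instead.
def _demo_min_max_normalize_2d():
    slice_2d = np.array([[-1200.0, -1000.0], [-100.0, 900.0]])
    out = min_max_normalize_2d(slice_2d)
    assert np.allclose(out, [[0.0, 0.0], [0.5, 1.0]])
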
def min_max_normalize_2d_expand(input_data, min_value=-1000, max_value=800, fill_value=-1000,
                                fill_data=0.0001):
    '''2D min-max normalization, expanded to [-1, 1]'''
    if np.all(input_data == fill_value):
        return fill_data_default(input_data, fill_data)
    corrected_data = np.copy(input_data)
    corrected_data[corrected_data < min_value] = min_value
    corrected_data[corrected_data > max_value] = max_value
    corrected_data = corrected_data.astype(np.float32)
    actual_min = min_value
    actual_max = max_value
    if actual_min == actual_max:
        return fill_data_default(input_data, fill_data)
    data = (corrected_data - actual_min) / (actual_max - actual_min)
    return 2 * data - 1


def z_score_to_normalize_2d(input_data, fill_value=-1000, fill_data=0.0001):
    '''2D z-score normalization'''
    if np.all(input_data == fill_value):
        return fill_data_default(input_data, fill_data)
    std = np.std(input_data)
    if std == 0:
        return fill_data_default(input_data, fill_data)
    pt_data = torch.from_numpy(input_data).to(torch.float32)
    image = tio.ScalarImage(tensor=pt_data.unsqueeze(0).unsqueeze(0))
    znorm = tio.ZNormalization()
    data = znorm(image).tensor.squeeze(0).squeeze(0).numpy()
    return data


def z_score_T_normalize_2d(input_data, fill_value=-1000, fill_data=0.0001):
    '''2D z-score normalization for pretrained backbones (3-channel output)'''
    if np.all(input_data == fill_value):
        # stack to 3 channels so this branch matches the shape of the normal output
        return np.stack([fill_data_default(input_data, fill_data)] * 3)
    data = min_max_normalize_2d(input_data)
    mean = np.mean(data)
    std = np.std(data)
    if std == 0:
        return np.stack([fill_data_default(input_data, fill_data)] * 3)
    normalize = T.Normalize(mean=[mean], std=[std])
    data = torch.from_numpy(data).to(torch.float32).unsqueeze(0)
    transform = T.Compose([
        normalize
    ])
    data = transform(data)
    data = data.repeat(3, 1, 1).numpy()
    return data


def normalize_net_2d(input_data, min_value=-1000, max_value=800):
    '''2D network normalization'''
    x1, x2, x3, x4 = input_data.shape
    data = np.zeros_like(input_data)
    for idx in range(x1):
        for jdx in range(x2):
            data[idx, jdx] = z_score_to_normalize_2d(input_data[idx, jdx])
    return data


def normalize_net_3d(input_data, min_value=-1000, max_value=800):
    '''3D network normalization'''
    x1, x2, x3, x4 = input_data.shape
    data = np.zeros_like(input_data)
    for idx in range(x1):
        for jdx in range(x2):
            data[idx, jdx] = z_score_to_normalize_2d(input_data[idx, jdx])
    return data


def d2d_normalize(input_data, min_value=-1000, max_value=800):
    '''2D pretrained-network normalization'''
    x1, x2, x3, x4 = input_data.shape
    x5 = 3
    data = np.zeros((x1, x2, x5, x3, x4))
    for idx in range(x1):
        for jdx in range(x2):
            data[idx, jdx] = z_score_T_normalize_2d(input_data[idx, jdx])
    return data


def s3d_normalize_3d(input_data, min_value=-1000, max_value=800):
    '''3D pretrained-network normalization'''
    x1, x2, x3, x4 = input_data.shape
    data = np.zeros_like(input_data)
    for idx in range(x1):
        for jdx in range(x2):
            data[idx, jdx] = z_score_to_normalize_2d(input_data[idx, jdx])
    return data

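# Illustrative sketch (hypothetical shapes): the wrappers above all take a 4D array
# (rotations, slices, H, W) and normalize each 2D slice independently; d2d_normalize()
# additionally inserts a 3-channel axis for 2D pretrained backbones.
def _demo_normalize_shapes():
    batch = np.random.uniform(-1000, 800, size=(2, 3, 16, 16)).astype(np.float32)
    assert normalize_net_2d(batch).shape == (2, 3, 16, 16)
    assert d2d_normalize(batch).shape == (2, 3, 3, 16, 16)
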
def generate_node_all_label_id_df(node_time=None):
    '''
    Query conditions:
    1. Visible in the system:
       dicom_file_study.status != 5
       patient_info.status != 1
    2. The annotation status is normal:
       user_label.status != 1
    Join conditions:
       user_label.study_id = dicom_file_study.id
       # user_label.pid == Null
       dicom_file_study.patient_info_id = patient_info.id
    Query steps:
    1. Query all user_label rows first, then filter.
    2. Filter by dicom_file_study and patient_info.
    3. Filter conditions:
       user_label.study_id = dicom_file_study.id
       dicom_file_study.patient_info_id = patient_info.id
       dicom_file_study.status != 5
       patient_info.status != 1
       user_label.status != 1
       user_label.deleted_time == None
       user_label.node_time == node_time
    Returns:
       label_ids: all label ids, with the related patient/study/series columns
    '''
    if node_time is None:
        return None
    session = conect_mysql()
    logger.info(f"start query")
    query = session.query(
        UserLabel.node_time,
        UserLabel.id,
        PatientInfo.patient_id,
        UserLabel.study_id,
        UserLabel.series_id,
        DicomStudy.study_uid,
        DicomStudy.folder_name,
        DicomSeries.series_instance_uid
    ).join(
        DicomStudy, UserLabel.study_id == DicomStudy.id
    ).join(
        PatientInfo, DicomStudy.patient_info_id == PatientInfo.id
    ).join(
        DicomSeries, UserLabel.series_id == DicomSeries.id
    ).filter(
        and_(
            DicomStudy.status != 5,
            PatientInfo.status != 1,
            UserLabel.status != 1,
            UserLabel.deleted_time == None,
            UserLabel.node_time == node_time
        )
    )
    result = query.all()
    node_times = [row[0] for row in result]
    label_ids = [row[1] for row in result]
    patient_ids = [row[2] for row in result]
    study_ids = [row[3] for row in result]
    series_ids = [row[4] for row in result]
    study_uids = [row[5] for row in result]
    folder_names = [row[6] for row in result]
    series_instance_uids = [row[7] for row in result]
    session.close()
    df = pd.DataFrame({
        'node_time': node_times,
        'label_id': label_ids,
        'patient_id': patient_ids,
        'study_id': study_ids,
        'series_id': series_ids,
        'study_uid': study_uids,
        'folder_name': folder_names,
        'series_instance_uid': series_instance_uids
    })
    df["patient_id"] = df["patient_id"].astype(str)
    df["study_id"] = df["study_id"].astype(str)
    df["series_id"] = df["series_id"].astype(str)
    df["study_uid"] = df["study_uid"].astype(str)
    df["folder_name"] = df["folder_name"].astype(str)
    df["series_instance_uid"] = df["series_instance_uid"].astype(str)
    return df


def select_single_label_id(label_id=None):
    session = conect_mysql()
    label = session.query(UserLabel).filter(
        and_(UserLabel.id == label_id)).first()
    if label is None:
        return (None, None, None), f"{label_id}, annotation data does not exist"
    node_time = label.node_time
    bundle = session.query(DicomSeries).filter(
        and_(DicomSeries.id == label.series_id)).first()
    if bundle is None:
        return (None, None, None), f"{label_id}, associated dicom data does not exist"
    delineations = session.query(UserLabelDelineation).filter(
        and_(UserLabelDelineation.label_id == label_id,
             UserLabelDelineation.status == 0)).order_by(
        UserLabelDelineation.z_index.asc()).all()
    session.close()
    return (label, bundle, delineations), "success"

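# Illustrative note (hypothetical id): select_single_label_id() returns either the ORM rows
# or a (None, None, None) triple plus an error message, so callers can branch on the message:
#     (label, bundle, delineations), msg = select_single_label_id(label_id=123)
#     if msg != "success":
#         logger.error(msg)
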
def generate_single_series_data_by_label_id(label_id=None, dicom_folder="", crop_size_3d=[48, 256, 256],
                                            expand=40, rotate_count=10, return_2d_data_flag=False):
    (label, bundle, delineations), result = select_single_label_id(label_id=label_id)
    data, selected_box, rotated_data_3d, node_time, rotated_data_2d, update_select_box_info_dict = None, None, None, None, None, None
    if result != "success":
        # keep the same arity as the success return so callers can always unpack
        return data, selected_box, rotated_data_3d, node_time, rotated_data_2d, update_select_box_info_dict, None, None, [], []
    patient_id = bundle.patient_id
    series_instance_uid = bundle.series_instance_uid
    dicom_path = f"{dicom_folder}/{patient_id}-{series_instance_uid}"
    cts = get_cts(dicom_path)
    data = cts.get_raw_data()
    spacing = cts.get_raw_spacing()
    mask = np.zeros((len(data), len(data[1]), len(data[1][1])), np.uint8)
    node_time = label.node_time
    z_count = 0
    for delineation in delineations:
        if (delineation.contour is None or len(delineation.contour) == 0) and delineation.meta is None:
            continue
        # decode the contour when present, otherwise fall back to the meta payload
        if delineation.contour is not None and len(delineation.contour) > 0:
            indexlist, indexs, img_np = base64_to_list(delineation.contour)
        else:
            indexlist, indexs, img_np = meta_to_list(delineation.meta, mask[0].copy())
        mask[delineation.z_index] = img_np
        z_count += 1
    if mask is not None and np.sum(mask == 1) > 0:
        coords = np.asarray(np.where(mask == 1))
        selected_box = np.zeros((3, 2), np.float32)
        selected_box[:, 0] = coords.min(axis=1)
        selected_box[:, 1] = coords.max(axis=1) + 1
        if selected_box[0][1] - selected_box[0][0] != z_count:
            logger.info(f"z extent inconsistent, selected_box: {selected_box}, z_count: {z_count}")
            selected_box[0][1] = selected_box[0][0] + z_count
        if selected_box[0][1] - selected_box[0][0] > crop_size_3d[0]:
            logger.info(f"z extent exceeds crop_size_3d, selected_box: {selected_box}, crop_size_3d: {crop_size_3d}")
            selected_box[0][1] = selected_box[0][0] + crop_size_3d[0]
    logger.info(f"selected_box: {selected_box}")
    rotated_data_3d, update_select_box_info_dict = data_rotation_3d(data=data, select_box=selected_box,
                                                                    crop_size=crop_size_3d, expand=expand,
                                                                    num_rotations=rotate_count)
    z_index_list_3d = update_select_box_info_dict["z_index_list"]
    z_index_list_2d = []
    if return_2d_data_flag:
        rotated_data_2d = np.transpose(rotated_data_3d, (1, 0, 2, 3))
        z_index_dict = update_select_box_info_dict["z_index_dict"]
        z_index_list = update_select_box_info_dict["z_index_list"]
        z_min, z_max = selected_box[0][0], min(selected_box[0][1], z_index_list[-1])
        if z_min > z_max:
            raise Exception(f"generate 2d data, z_min: {z_min} > z_max: {z_max}")
        update_z_index_list = list(range(z_index_dict[z_min], z_index_dict[z_max]))
        rotated_data_2d = rotated_data_2d[update_z_index_list]
        z_index_list_2d = [idx_z for idx_z in range(int(z_min), int(z_max))]
    return data, selected_box, rotated_data_3d, node_time, rotated_data_2d, update_select_box_info_dict, patient_id, series_instance_uid, z_index_list_3d, z_index_list_2d

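# Illustrative note (shapes are hypothetical): generate_single_series_data_by_label_id()
# returns rotated_data_3d with shape (rotate_count, D, H, W); when return_2d_data_flag is
# set, rotated_data_2d is the slice-major transpose (selected slices, rotate_count, H, W),
# restricted to the annotated z range recorded in z_index_list_2d.
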
def generate_npy_data_by_single_label_id(label_id=None, generate_3d_npy_data_flag=True, generate_2d_npy_data_flag=True,
                                         dicom_folder="", crop_size_3d=None, crop_size_2d=None, rotate_count=10,
                                         expand=40, save_path="", regular_class_3d=None, regular_class_2d=None):
    if label_id is None:
        return "label_id is None"
    data, selected_box, rotated_data_3d, node_time, rotated_data_2d, update_select_box_info_dict, patient_id, series_instance_uid, z_index_list_3d, z_index_list_2d = generate_single_series_data_by_label_id(
        label_id=label_id, dicom_folder=dicom_folder, crop_size_3d=crop_size_3d,
        rotate_count=rotate_count, expand=expand, return_2d_data_flag=generate_2d_npy_data_flag)
    if not os.path.exists(save_path):
        Path(save_path).mkdir(parents=True, exist_ok=True)
    npy_data_3d_file_list = []
    npy_data_3d_z_index_list = []
    npy_data_3d_rotate_count_list = []
    npy_data_2d_file_list = []
    npy_data_2d_z_index_list = []
    npy_data_2d_rotate_count_list = []
    if generate_3d_npy_data_flag and regular_class_3d:
        regular_str = ""
        if regular_class_3d is normalize_net_3d:
            regular_str = "normalize_net_3d"
        elif regular_class_3d is s3d_normalize_3d:
            regular_str = "s3d_normalize_3d"
        logger.info(f"data_3d shape: {rotated_data_3d.shape}, {regular_str}")
        regular_data_3d = regular_class_3d(rotated_data_3d)
        for idx_rotate_count in range(rotate_count):
            idx_data_3d = regular_data_3d[idx_rotate_count, :, :, :]
            idx_npy_data_3d_file = f"{save_path}/{node_time}_{label_id}_3d_rotate_10_{crop_size_3d[0]}_{crop_size_3d[1]}_{crop_size_3d[2]}_current_rotate_{idx_rotate_count+1}.npy"
            np.save(idx_npy_data_3d_file, idx_data_3d)
            logger.info(f"save 3d npy data -> {idx_npy_data_3d_file}, current rotate: {idx_rotate_count+1}")
            npy_data_3d_file_list.append(idx_npy_data_3d_file)
            npy_data_3d_z_index_list.append(z_index_list_3d)
            npy_data_3d_rotate_count_list.append(idx_rotate_count + 1)
    if generate_2d_npy_data_flag and regular_class_2d:
        regular_str = ""
        if regular_class_2d is normalize_net_2d:
            regular_str = "normalize_net_2d"
        elif regular_class_2d is d2d_normalize:
            regular_str = "d2d_normalize"
        logger.info(f"data_2d shape: {rotated_data_2d.shape}, {regular_str}")
        regular_data_2d = regular_class_2d(rotated_data_2d)
        for idx_z_index_count, idx_z_index_2d in enumerate(z_index_list_2d):
            for idx_rotate_count in range(rotate_count):
                idx_data_2d = regular_data_2d[idx_z_index_count, idx_rotate_count, :, :]
                idx_npy_data_2d_file = f"{save_path}/{node_time}_{label_id}_2d_rotate_10_{crop_size_2d[0]}_{crop_size_2d[1]}_z_{idx_z_index_2d}_current_rotate_{idx_rotate_count+1}.npy"
                np.save(idx_npy_data_2d_file, idx_data_2d)
                logger.info(f"save 2d npy data -> {idx_npy_data_2d_file}, current z_index: {idx_z_index_2d}, current rotate: {idx_rotate_count+1}")
                npy_data_2d_file_list.append(idx_npy_data_2d_file)
                npy_data_2d_z_index_list.append(idx_z_index_2d)
                npy_data_2d_rotate_count_list.append(idx_rotate_count + 1)
    return patient_id, series_instance_uid, update_select_box_info_dict, npy_data_3d_file_list, npy_data_2d_file_list, npy_data_3d_z_index_list, npy_data_3d_rotate_count_list, npy_data_2d_z_index_list, npy_data_2d_rotate_count_list

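# Illustrative note (hypothetical values): the saved npy file names encode node_time,
# label_id, dimensionality, crop size, and rotation index. For node_time=2031, label_id=7
# and crop_size_3d=[48, 256, 256], the first rotation is saved as
#     {save_path}/2031_7_3d_rotate_10_48_256_256_current_rotate_1.npy
# and a 2d slice at z=120 with crop_size_2d=[256, 256] as
#     {save_path}/2031_7_2d_rotate_10_256_256_z_120_current_rotate_1.npy
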
def generate_npy_data_by_all_label_id_df(csv_file=None, npy_data_3d_file=None, npy_data_2d_file=None,
                                         dicom_folder="/opt/lung/ai", generate_3d_npy_data_flag=None,
                                         generate_2d_npy_data_flag=None, crop_size_3d=None, crop_size_2d=None,
                                         rotate_count=10, expand=40, regular_class_3d=None, regular_class_2d=None,
                                         save_path=""):
    node_df = pd.read_csv(csv_file)
    count = 0
    data_3d_node_list = []
    data_3d_label_id_list = []
    data_3d_file_list = []
    data_3d_z_index_list = []
    data_3d_patient_id_list = []
    data_3d_series_instance_uid_list = []
    data_3d_rotate_count_list = []
    data_2d_node_list = []
    data_2d_label_id_list = []
    data_2d_file_list = []
    data_2d_z_index_list = []
    data_2d_patient_id_list = []
    data_2d_rotate_count_list = []
    data_2d_series_instance_uid_list = []
    for idx in tqdm(range(len(node_df))):
        node_time = node_df.loc[idx, 'node_time']
        label_id = node_df.loc[idx, 'label_id']
        idx_patient_id = node_df.loc[idx, 'patient_id']
        idx_series_instance_uid = node_df.loc[idx, 'series_instance_uid']
        patient_id, series_instance_uid, update_select_box_info_dict, npy_data_3d_file_list, npy_data_2d_file_list, npy_data_3d_z_index_list, npy_data_3d_rotate_count_list, npy_data_2d_z_index_list, npy_data_2d_rotate_count_list = generate_npy_data_by_single_label_id(
            label_id=label_id,
            dicom_folder=dicom_folder,
            generate_3d_npy_data_flag=generate_3d_npy_data_flag,
            generate_2d_npy_data_flag=generate_2d_npy_data_flag,
            crop_size_3d=crop_size_3d,
            crop_size_2d=crop_size_2d,
            rotate_count=rotate_count,
            expand=expand,
            save_path=save_path,
            regular_class_3d=regular_class_3d,
            regular_class_2d=regular_class_2d
        )
        if generate_3d_npy_data_flag:
            assert len(npy_data_3d_file_list) == len(npy_data_3d_z_index_list) == len(npy_data_3d_rotate_count_list)
            data_3d_node_list += [node_time] * len(npy_data_3d_file_list)
            data_3d_label_id_list += [label_id] * len(npy_data_3d_file_list)
            data_3d_file_list += npy_data_3d_file_list
            data_3d_z_index_list += npy_data_3d_z_index_list
            data_3d_rotate_count_list += npy_data_3d_rotate_count_list
            data_3d_patient_id_list += [patient_id] * len(npy_data_3d_file_list)
            data_3d_series_instance_uid_list += [series_instance_uid] * len(npy_data_3d_file_list)
        if generate_2d_npy_data_flag:
            assert len(npy_data_2d_file_list) == len(npy_data_2d_z_index_list) == len(npy_data_2d_rotate_count_list)
            data_2d_node_list += [node_time] * len(npy_data_2d_file_list)
            data_2d_label_id_list += [label_id] * len(npy_data_2d_file_list)
            data_2d_file_list += npy_data_2d_file_list
            data_2d_z_index_list += npy_data_2d_z_index_list
            data_2d_rotate_count_list += npy_data_2d_rotate_count_list
            data_2d_patient_id_list += [patient_id] * len(npy_data_2d_file_list)
            data_2d_series_instance_uid_list += [series_instance_uid] * len(npy_data_2d_file_list)
    if generate_3d_npy_data_flag:
        npy_data_3d_df = pd.DataFrame({
            "node": data_3d_node_list,
            "label_id": data_3d_label_id_list,
            "z_index": data_3d_z_index_list,
            "rotate_count": data_3d_rotate_count_list,
            "patient_id": data_3d_patient_id_list,
            "series_instance_uid": data_3d_series_instance_uid_list,
            "npy_file": data_3d_file_list,
        })
        npy_data_3d_df.to_csv(npy_data_3d_file, index=False, encoding="utf-8")
    if generate_2d_npy_data_flag:
        npy_data_2d_df = pd.DataFrame({
            "node": data_2d_node_list,
            "label_id": data_2d_label_id_list,
            "z_index": data_2d_z_index_list,
            "rotate_count": data_2d_rotate_count_list,
            "patient_id": data_2d_patient_id_list,
            "series_instance_uid": data_2d_series_instance_uid_list,
            "npy_file": data_2d_file_list,
        })
        npy_data_2d_df.to_csv(npy_data_2d_file, index=False, encoding="utf-8")
    if generate_3d_npy_data_flag:
        logger.info(f"data processed, saved npy csv, npy data_3d -> {npy_data_3d_file}")
    if generate_2d_npy_data_flag:
        logger.info(f"data processed, saved npy csv, npy data_2d -> {npy_data_2d_file}")
    return


def get_node_time_all_label_ids_df(node_time=None, csv_data_dir=""):
    if node_time is None:
        return None
    df = generate_node_all_label_id_df(node_time=node_time)
    task_info = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_file = f"{csv_data_dir}/{node_time}/{node_time}_{task_info}_rotate_10.csv"
    Path(csv_file).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(csv_file, index=False, encoding="utf-8")
    logger.info(f"save csv data -> {csv_file}")
    return csv_file


def generate_train_npy_csv_file(node_npy_pos_neg_list=None, net_id_list=None, net_id_crop_size_dict=None,
                                node_net_id_npy_file_dict=None, csv_data_dir="", train_csv_dir="",
                                train_ratio=0.8, val_ratio=0.1, test_ratio=0.1,
                                is_pad_df=True, is_save_csv=False, seed=100004):
    def check_npy_file(node=None, net_id=None, crop_size=None, npy_file_list=None):
        check_list = []
        if net_id == "3d":
            crop_size_str = f"_{crop_size[0]}_{crop_size[1]}_{crop_size[2]}_"
        elif net_id == "2d":
            crop_size_str = f"_{crop_size[0]}_{crop_size[1]}_"
        elif net_id == "2d3d":
            crop_size_str_2d = f"_{crop_size[1]}_{crop_size[2]}_"
            crop_size_str_3d = f"_{crop_size[0]}_{crop_size[1]}_{crop_size[2]}_"
        elif net_id == "s3d":
            crop_size_str = f"_{crop_size[0]}_{crop_size[1]}_{crop_size[2]}_"
        elif net_id == "d2d":
            crop_size_str = f"_{crop_size[0]}_{crop_size[1]}_"
        if net_id == "2d3d":
            for idx_npy_file in npy_file_list:
                if f"{node}_" in idx_npy_file and (crop_size_str_2d in idx_npy_file or crop_size_str_3d in idx_npy_file):
                    check_list.append(True)
        else:
            for idx_npy_file in npy_file_list:
                if f"{node}_" in idx_npy_file and crop_size_str in idx_npy_file:
                    check_list.append(True)
        return len(check_list) == len(npy_file_list)

    for node_net, npy_file_list in node_net_id_npy_file_dict.items():
        node, net_id = node_net[0], node_net[1]
        crop_size = net_id_crop_size_dict[net_id]
        if net_id == "2d3d":
            npy_file_list = npy_file_list["2d"] + npy_file_list["3d"]
        if not check_npy_file(node=node, net_id=net_id, crop_size=crop_size, npy_file_list=npy_file_list):
            print(f"{node_net} npy_file_list check failed")

    from sklearn.model_selection import train_test_split

    def pad_df(df, max_len):
        if len(df) == max_len:
            return df
        elif len(df) > max_len:
            return df[:max_len]
        else:
            pad_df_list = [df]
            lens = len(df)
            while lens < max_len:
                pad_df_list.append(df)
                lens += len(df)
            pad_df = pd.concat(pad_df_list, ignore_index=True)
            return pad_df[:max_len]

    # Iterate over the positive/negative nodes and net_ids, collecting the matching training files
    node_net_file_dict = {}
    for idx_node_pos_neg_dict in node_npy_pos_neg_list:
        idx_node_pos_list = idx_node_pos_neg_dict["pos"]
        idx_node_neg_list = idx_node_pos_neg_dict["neg"]
        idx_node_pos_list_str = "_".join(list(map(str, idx_node_pos_list)))
        idx_node_neg_list_str = "_".join(list(map(str, idx_node_neg_list)))
        for idx_net_id in net_id_list:
            idx_task_str = f"{idx_node_neg_list_str}_{idx_node_pos_list_str}_net_id_{idx_net_id}"
            if idx_net_id == "2d3d":
                idx_pos_npy_file_list_2d = []
                idx_pos_npy_file_list_3d = []
                idx_neg_npy_file_list_2d = []
                idx_neg_npy_file_list_3d = []
                for idx_node_pos in idx_node_pos_list:
                    idx_pos_npy_file_list_2d += node_net_id_npy_file_dict[(idx_node_pos, idx_net_id)]["2d"]
                    idx_pos_npy_file_list_3d += node_net_id_npy_file_dict[(idx_node_pos, idx_net_id)]["3d"]
                for idx_node_neg in idx_node_neg_list:
                    idx_neg_npy_file_list_2d += node_net_id_npy_file_dict[(idx_node_neg, idx_net_id)]["2d"]
                    idx_neg_npy_file_list_3d += node_net_id_npy_file_dict[(idx_node_neg, idx_net_id)]["3d"]
                node_net_file_dict[(idx_task_str, idx_net_id)] = {
                    "pos_2d": idx_pos_npy_file_list_2d,
                    "pos_3d": idx_pos_npy_file_list_3d,
                    "neg_2d": idx_neg_npy_file_list_2d,
                    "neg_3d": idx_neg_npy_file_list_3d
                }
            else:
                idx_pos_npy_file_list = []
                idx_neg_npy_file_list = []
                for idx_node_pos in idx_node_pos_list:
                    idx_pos_npy_file_list += node_net_id_npy_file_dict[(idx_node_pos, idx_net_id)]
                for idx_node_neg in idx_node_neg_list:
                    idx_neg_npy_file_list += node_net_id_npy_file_dict[(idx_node_neg, idx_net_id)]
                node_net_file_dict[(idx_task_str, idx_net_id)] = {
                    "pos": idx_pos_npy_file_list,
                    "neg": idx_neg_npy_file_list
                }

    for idx_task_str_net_id, idx_pos_neg_npy_file_dict in node_net_file_dict.items():
        idx_task_str, idx_net_id = idx_task_str_net_id[0], idx_task_str_net_id[1]
        if idx_net_id == "2d3d":
            idx_pos_2d_npy_file_list = idx_pos_neg_npy_file_dict["pos_2d"]
            idx_pos_3d_npy_file_list = idx_pos_neg_npy_file_dict["pos_3d"]
            idx_neg_2d_npy_file_list = idx_pos_neg_npy_file_dict["neg_2d"]
            idx_neg_3d_npy_file_list = idx_pos_neg_npy_file_dict["neg_3d"]
            idx_pos_2d_node_list = [idx_pos_2d_npy_file.split('_')[0] for idx_pos_2d_npy_file in idx_pos_2d_npy_file_list]
            idx_pos_3d_node_list = [idx_pos_3d_npy_file.split('_')[0] for idx_pos_3d_npy_file in idx_pos_3d_npy_file_list]
            idx_neg_2d_node_list = [idx_neg_2d_npy_file.split('_')[0] for idx_neg_2d_npy_file in idx_neg_2d_npy_file_list]
            idx_neg_3d_node_list = [idx_neg_3d_npy_file.split('_')[0] for idx_neg_3d_npy_file in idx_neg_3d_npy_file_list]
            logger.info(f"idx_task_str: {idx_task_str}, idx_pos_2d_node_list: {idx_pos_2d_node_list}, {idx_pos_2d_npy_file_list}\nidx_pos_3d_node_list: {idx_pos_3d_node_list}, {idx_pos_3d_npy_file_list}\nidx_neg_2d_node_list: {idx_neg_2d_node_list}, {idx_neg_2d_npy_file_list}\nidx_neg_3d_node_list: {idx_neg_3d_node_list}, {idx_neg_3d_npy_file_list}")
            _idx_pos_2d_df_list = [pd.read_csv(f"{csv_data_dir}/{idx_pos_2d_npy_file.split('_')[0]}/{idx_pos_2d_npy_file}") for idx_pos_2d_npy_file in idx_pos_2d_npy_file_list]
            _idx_pos_3d_df_list = [pd.read_csv(f"{csv_data_dir}/{idx_pos_3d_npy_file.split('_')[0]}/{idx_pos_3d_npy_file}") for idx_pos_3d_npy_file in idx_pos_3d_npy_file_list]
            _idx_neg_2d_df_list = [pd.read_csv(f"{csv_data_dir}/{idx_neg_2d_npy_file.split('_')[0]}/{idx_neg_2d_npy_file}") for idx_neg_2d_npy_file in idx_neg_2d_npy_file_list]
            _idx_neg_3d_df_list = [pd.read_csv(f"{csv_data_dir}/{idx_neg_3d_npy_file.split('_')[0]}/{idx_neg_3d_npy_file}") for idx_neg_3d_npy_file in idx_neg_3d_npy_file_list]
            idx_pos_2d_df_list = []
            idx_pos_3d_df_list = []
            idx_neg_2d_df_list = []
            idx_neg_3d_df_list = []
            for idx_node, idx_df in zip(idx_pos_2d_node_list, _idx_pos_2d_df_list):
                idx_df["node"] = idx_node
                idx_pos_2d_df_list.append(idx_df)
            for idx_node, idx_df in zip(idx_pos_3d_node_list, _idx_pos_3d_df_list):
                idx_df["node"] = idx_node
                idx_pos_3d_df_list.append(idx_df)
            for idx_node, idx_df in zip(idx_neg_2d_node_list, _idx_neg_2d_df_list):
                idx_df["node"] = idx_node
                idx_neg_2d_df_list.append(idx_df)
            for idx_node, idx_df in zip(idx_neg_3d_node_list, _idx_neg_3d_df_list):
                idx_df["node"] = idx_node
                idx_neg_3d_df_list.append(idx_df)
            idx_pos_2d_train_df_list = []
            idx_pos_2d_val_df_list = []
            idx_pos_2d_test_df_list = []
            idx_pos_3d_train_df_list = []
            idx_pos_3d_val_df_list = []
            idx_pos_3d_test_df_list = []
            idx_neg_2d_train_df_list = []
            idx_neg_2d_val_df_list = []
            idx_neg_2d_test_df_list = []
            idx_neg_3d_train_df_list = []
            idx_neg_3d_val_df_list = []
            idx_neg_3d_test_df_list = []
            for idx_pos_2d_df in idx_pos_2d_df_list:
                idx_pos_2d_train_df, idx_pos_2d_test_val_df = train_test_split(idx_pos_2d_df, test_size=1-train_ratio, random_state=seed)
                idx_pos_2d_val_df, idx_pos_2d_test_df = train_test_split(idx_pos_2d_test_val_df, test_size=test_ratio/(val_ratio+test_ratio), random_state=seed)
                idx_pos_2d_train_df_list.append(idx_pos_2d_train_df)
                idx_pos_2d_val_df_list.append(idx_pos_2d_val_df)
                idx_pos_2d_test_df_list.append(idx_pos_2d_test_df)
            for idx_pos_3d_df in idx_pos_3d_df_list:
                idx_pos_3d_train_df, idx_pos_3d_test_val_df = train_test_split(idx_pos_3d_df, test_size=1-train_ratio, random_state=seed)
                idx_pos_3d_val_df, idx_pos_3d_test_df = train_test_split(idx_pos_3d_test_val_df, test_size=test_ratio/(val_ratio+test_ratio), random_state=seed)
                idx_pos_3d_train_df_list.append(idx_pos_3d_train_df)
                idx_pos_3d_val_df_list.append(idx_pos_3d_val_df)
                idx_pos_3d_test_df_list.append(idx_pos_3d_test_df)
            for idx_neg_2d_df in idx_neg_2d_df_list:
                idx_neg_2d_train_df, idx_neg_2d_test_val_df = train_test_split(idx_neg_2d_df, test_size=1-train_ratio, random_state=seed)
                idx_neg_2d_val_df, idx_neg_2d_test_df = train_test_split(idx_neg_2d_test_val_df, test_size=test_ratio/(val_ratio+test_ratio), random_state=seed)
                idx_neg_2d_train_df_list.append(idx_neg_2d_train_df)
                idx_neg_2d_val_df_list.append(idx_neg_2d_val_df)
                idx_neg_2d_test_df_list.append(idx_neg_2d_test_df)
            for idx_neg_3d_df in idx_neg_3d_df_list:
                idx_neg_3d_train_df, idx_neg_3d_test_val_df = train_test_split(idx_neg_3d_df, test_size=1-train_ratio, random_state=seed)
                idx_neg_3d_val_df, idx_neg_3d_test_df = train_test_split(idx_neg_3d_test_val_df, test_size=test_ratio/(val_ratio+test_ratio), random_state=seed)
                idx_neg_3d_train_df_list.append(idx_neg_3d_train_df)
                idx_neg_3d_val_df_list.append(idx_neg_3d_val_df)
                idx_neg_3d_test_df_list.append(idx_neg_3d_test_df)
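            # Split arithmetic: the first split holds out (1 - train_ratio) of the rows; the
            # second split divides that holdout so test gets test_ratio / (val_ratio + test_ratio)
            # of it, e.g. ratios 0.8/0.1/0.1 yield 80% train, 10% val, 10% test per node.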
            idx_pos_2d_train_df_lens_list = [len(idx_df) for idx_df in idx_pos_2d_train_df_list]
            idx_pos_2d_val_df_lens_list = [len(idx_df) for idx_df in idx_pos_2d_val_df_list]
            idx_pos_2d_test_df_lens_list = [len(idx_df) for idx_df in idx_pos_2d_test_df_list]
            idx_pos_3d_train_df_lens_list = [len(idx_df) for idx_df in idx_pos_3d_train_df_list]
            idx_pos_3d_val_df_lens_list = [len(idx_df) for idx_df in idx_pos_3d_val_df_list]
            idx_pos_3d_test_df_lens_list = [len(idx_df) for idx_df in idx_pos_3d_test_df_list]
            idx_neg_2d_train_df_lens_list = [len(idx_df) for idx_df in idx_neg_2d_train_df_list]
            idx_neg_2d_val_df_lens_list = [len(idx_df) for idx_df in idx_neg_2d_val_df_list]
            idx_neg_2d_test_df_lens_list = [len(idx_df) for idx_df in idx_neg_2d_test_df_list]
            idx_neg_3d_train_df_lens_list = [len(idx_df) for idx_df in idx_neg_3d_train_df_list]
            idx_neg_3d_val_df_lens_list = [len(idx_df) for idx_df in idx_neg_3d_val_df_list]
            idx_neg_3d_test_df_lens_list = [len(idx_df) for idx_df in idx_neg_3d_test_df_list]
            print(f"idx_task_str: {idx_task_str}, within-class padding data_2d, before\ntrain: {idx_pos_2d_train_df_lens_list}, {idx_neg_2d_train_df_lens_list}\nval: {idx_pos_2d_val_df_lens_list}, {idx_neg_2d_val_df_lens_list}\ntest: {idx_pos_2d_test_df_lens_list}, {idx_neg_2d_test_df_lens_list}")
            print(f"idx_task_str: {idx_task_str}, within-class padding data_3d, before\ntrain: {idx_pos_3d_train_df_lens_list}, {idx_neg_3d_train_df_lens_list}\nval: {idx_pos_3d_val_df_lens_list}, {idx_neg_3d_val_df_lens_list}\ntest: {idx_pos_3d_test_df_lens_list}, {idx_neg_3d_test_df_lens_list}")
            if is_pad_df:
                idx_pos_2d_train_df_list = [pad_df(idx_df, max(idx_pos_2d_train_df_lens_list)) for idx_df in idx_pos_2d_train_df_list]
                # idx_pos_2d_val_df_list = [pad_df(idx_df, max(idx_pos_2d_val_df_lens_list)) for idx_df in idx_pos_2d_val_df_list]
                # idx_pos_2d_test_df_list = [pad_df(idx_df, max(idx_pos_2d_test_df_lens_list)) for idx_df in idx_pos_2d_test_df_list]
                idx_pos_3d_train_df_list = [pad_df(idx_df, max(idx_pos_3d_train_df_lens_list)) for idx_df in idx_pos_3d_train_df_list]
                # idx_pos_3d_val_df_list = [pad_df(idx_df, max(idx_pos_3d_val_df_lens_list)) for idx_df in idx_pos_3d_val_df_list]
                # idx_pos_3d_test_df_list = [pad_df(idx_df, max(idx_pos_3d_test_df_lens_list)) for idx_df in idx_pos_3d_test_df_list]
                idx_neg_2d_train_df_list = [pad_df(idx_df, max(idx_neg_2d_train_df_lens_list)) for idx_df in idx_neg_2d_train_df_list]
                # idx_neg_2d_val_df_list = [pad_df(idx_df, max(idx_neg_2d_val_df_lens_list)) for idx_df in idx_neg_2d_val_df_list]
                # idx_neg_2d_test_df_list = [pad_df(idx_df, max(idx_neg_2d_test_df_lens_list)) for idx_df in idx_neg_2d_test_df_list]
                idx_neg_3d_train_df_list = [pad_df(idx_df, max(idx_neg_3d_train_df_lens_list)) for idx_df in idx_neg_3d_train_df_list]
                # idx_neg_3d_val_df_list = [pad_df(idx_df, max(idx_neg_3d_val_df_lens_list)) for idx_df in idx_neg_3d_val_df_list]
                # idx_neg_3d_test_df_list = [pad_df(idx_df, max(idx_neg_3d_test_df_lens_list)) for idx_df in idx_neg_3d_test_df_list]
            idx_pos_2d_train_df_lens_list = [len(idx_df) for idx_df in idx_pos_2d_train_df_list]
            idx_pos_2d_val_df_lens_list = [len(idx_df) for idx_df in idx_pos_2d_val_df_list]
            idx_pos_2d_test_df_lens_list = [len(idx_df) for idx_df in idx_pos_2d_test_df_list]
            idx_pos_3d_train_df_lens_list = [len(idx_df) for idx_df in idx_pos_3d_train_df_list]
            idx_pos_3d_val_df_lens_list = [len(idx_df) for idx_df in idx_pos_3d_val_df_list]
            idx_pos_3d_test_df_lens_list = [len(idx_df) for idx_df in idx_pos_3d_test_df_list]
            idx_neg_2d_train_df_lens_list = [len(idx_df) for idx_df in idx_neg_2d_train_df_list]
            idx_neg_2d_val_df_lens_list = [len(idx_df) for idx_df in idx_neg_2d_val_df_list]
            idx_neg_2d_test_df_lens_list = [len(idx_df) for idx_df in idx_neg_2d_test_df_list]
            idx_neg_3d_train_df_lens_list = [len(idx_df) for idx_df in idx_neg_3d_train_df_list]
            idx_neg_3d_val_df_lens_list = [len(idx_df) for idx_df in idx_neg_3d_val_df_list]
            idx_neg_3d_test_df_lens_list = [len(idx_df) for idx_df in idx_neg_3d_test_df_list]
            print(f"idx_task_str: {idx_task_str}, within-class padding data_2d, after\ntrain: {idx_pos_2d_train_df_lens_list}, {idx_neg_2d_train_df_lens_list}\nval: {idx_pos_2d_val_df_lens_list}, {idx_neg_2d_val_df_lens_list}\ntest: {idx_pos_2d_test_df_lens_list}, {idx_neg_2d_test_df_lens_list}")
            print(f"idx_task_str: {idx_task_str}, within-class padding data_3d, after\ntrain: {idx_pos_3d_train_df_lens_list}, {idx_neg_3d_train_df_lens_list}\nval: {idx_pos_3d_val_df_lens_list}, {idx_neg_3d_val_df_lens_list}\ntest: {idx_pos_3d_test_df_lens_list}, {idx_neg_3d_test_df_lens_list}")
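            # Cross-class balancing (training set only): the smaller class is padded until its
            # total row count matches the larger one, spreading the target count evenly over
            # that class's per-node dataframes via pad_df().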
            if is_pad_df:
                # training set
                idx_pos_2d_train_df_lens = sum(idx_pos_2d_train_df_lens_list)
                idx_pos_3d_train_df_lens = sum(idx_pos_3d_train_df_lens_list)
                idx_neg_2d_train_df_lens = sum(idx_neg_2d_train_df_lens_list)
                idx_neg_3d_train_df_lens = sum(idx_neg_3d_train_df_lens_list)
                if idx_pos_2d_train_df_lens > idx_neg_2d_train_df_lens:
                    idx_neg_2d_each_lens = idx_pos_2d_train_df_lens // len(idx_neg_2d_train_df_lens_list)
                    idx_neg_2d_train_df_list = [pad_df(idx_df, idx_neg_2d_each_lens) for idx_df in idx_neg_2d_train_df_list]
                elif idx_pos_2d_train_df_lens < idx_neg_2d_train_df_lens:
                    idx_pos_2d_each_lens = idx_neg_2d_train_df_lens // len(idx_pos_2d_train_df_lens_list)
                    idx_pos_2d_train_df_list = [pad_df(idx_df, idx_pos_2d_each_lens) for idx_df in idx_pos_2d_train_df_list]
                if idx_pos_3d_train_df_lens > idx_neg_3d_train_df_lens:
                    idx_neg_3d_each_lens = idx_pos_3d_train_df_lens // len(idx_neg_3d_train_df_lens_list)
                    idx_neg_3d_train_df_list = [pad_df(idx_df, idx_neg_3d_each_lens) for idx_df in idx_neg_3d_train_df_list]
                elif idx_pos_3d_train_df_lens < idx_neg_3d_train_df_lens:
                    idx_pos_3d_each_lens = idx_neg_3d_train_df_lens // len(idx_pos_3d_train_df_lens_list)
                    idx_pos_3d_train_df_list = [pad_df(idx_df, idx_pos_3d_each_lens) for idx_df in idx_pos_3d_train_df_list]
                # # validation set
                # idx_pos_2d_val_df_lens = sum(idx_pos_2d_val_df_lens_list)
                # idx_neg_2d_val_df_lens = sum(idx_neg_2d_val_df_lens_list)
                # idx_pos_3d_val_df_lens = sum(idx_pos_3d_val_df_lens_list)
                # idx_neg_3d_val_df_lens = sum(idx_neg_3d_val_df_lens_list)
                # if idx_pos_2d_val_df_lens > idx_neg_2d_val_df_lens:
                #     idx_neg_2d_each_lens = idx_pos_2d_val_df_lens // len(idx_neg_2d_val_df_lens_list)
                #     idx_neg_2d_val_df_list = [pad_df(idx_df, idx_neg_2d_each_lens) for idx_df in idx_neg_2d_val_df_list]
                # elif idx_pos_2d_val_df_lens < idx_neg_2d_val_df_lens:
                #     idx_pos_2d_each_lens = idx_neg_2d_val_df_lens // len(idx_pos_2d_val_df_lens_list)
                #     idx_pos_2d_val_df_list = [pad_df(idx_df, idx_pos_2d_each_lens) for idx_df in idx_pos_2d_val_df_list]
                # if idx_pos_3d_val_df_lens > idx_neg_3d_val_df_lens:
                #     idx_neg_3d_each_lens = idx_pos_3d_val_df_lens // len(idx_neg_3d_val_df_lens_list)
                #     idx_neg_3d_val_df_list = [pad_df(idx_df, idx_neg_3d_each_lens) for idx_df in idx_neg_3d_val_df_list]
                # elif idx_pos_3d_val_df_lens < idx_neg_3d_val_df_lens:
                #     idx_pos_3d_each_lens = idx_neg_3d_val_df_lens // len(idx_pos_3d_val_df_lens_list)
                #     idx_pos_3d_val_df_list = [pad_df(idx_df, idx_pos_3d_each_lens) for idx_df in idx_pos_3d_val_df_list]
                # # test set
                # idx_pos_2d_test_df_lens = sum(idx_pos_2d_test_df_lens_list)
                # idx_neg_2d_test_df_lens = sum(idx_neg_2d_test_df_lens_list)
                # idx_pos_3d_test_df_lens = sum(idx_pos_3d_test_df_lens_list)
                # idx_neg_3d_test_df_lens = sum(idx_neg_3d_test_df_lens_list)
                # if idx_pos_2d_test_df_lens > idx_neg_2d_test_df_lens:
                #     idx_neg_2d_each_lens = idx_pos_2d_test_df_lens // len(idx_neg_2d_test_df_lens_list)
                #     idx_neg_2d_test_df_list = [pad_df(idx_df, idx_neg_2d_each_lens) for idx_df in idx_neg_2d_test_df_list]
                # elif idx_pos_2d_test_df_lens < idx_neg_2d_test_df_lens:
                #     idx_pos_2d_each_lens = idx_neg_2d_test_df_lens // len(idx_pos_2d_test_df_lens_list)
                #     idx_pos_2d_test_df_list = [pad_df(idx_df, idx_pos_2d_each_lens) for idx_df in idx_pos_2d_test_df_list]
                # if idx_pos_3d_test_df_lens > idx_neg_3d_test_df_lens:
                #     idx_neg_3d_each_lens = idx_pos_3d_test_df_lens // len(idx_neg_3d_test_df_lens_list)
                #     idx_neg_3d_test_df_list = [pad_df(idx_df, idx_neg_3d_each_lens) for idx_df in idx_neg_3d_test_df_list]
                # elif idx_pos_3d_test_df_lens < idx_neg_3d_test_df_lens:
                #     idx_pos_3d_each_lens = idx_neg_3d_test_df_lens // len(idx_pos_3d_test_df_lens_list)
                #     idx_pos_3d_test_df_list = [pad_df(idx_df, idx_pos_3d_each_lens) for idx_df in idx_pos_3d_test_df_list]
            idx_pos_2d_train_df_lens_list = [len(idx_df) for idx_df in idx_pos_2d_train_df_list]
            idx_pos_2d_val_df_lens_list = [len(idx_df) for idx_df in idx_pos_2d_val_df_list]
            idx_pos_2d_test_df_lens_list = [len(idx_df) for idx_df in idx_pos_2d_test_df_list]
            idx_pos_3d_train_df_lens_list = [len(idx_df) for idx_df in idx_pos_3d_train_df_list]
            idx_pos_3d_val_df_lens_list = [len(idx_df) for idx_df in idx_pos_3d_val_df_list]
            idx_pos_3d_test_df_lens_list = [len(idx_df) for idx_df in idx_pos_3d_test_df_list]
            idx_neg_2d_train_df_lens_list = [len(idx_df) for idx_df in idx_neg_2d_train_df_list]
            idx_neg_2d_val_df_lens_list = [len(idx_df) for idx_df in idx_neg_2d_val_df_list]
            idx_neg_2d_test_df_lens_list = [len(idx_df) for idx_df in idx_neg_2d_test_df_list]
            idx_neg_3d_train_df_lens_list = [len(idx_df) for idx_df in idx_neg_3d_train_df_list]
            idx_neg_3d_val_df_lens_list = [len(idx_df) for idx_df in idx_neg_3d_val_df_list]
            idx_neg_3d_test_df_lens_list = [len(idx_df) for idx_df in idx_neg_3d_test_df_list]
            print(f"idx_task_str: {idx_task_str}, balanced data_2d\ntrain: {idx_pos_2d_train_df_lens_list}, {idx_neg_2d_train_df_lens_list}\nval: {idx_pos_2d_val_df_lens_list}, {idx_neg_2d_val_df_lens_list}\ntest: {idx_pos_2d_test_df_lens_list}, {idx_neg_2d_test_df_lens_list}")
            print(f"idx_task_str: {idx_task_str}, balanced data_3d\ntrain: {idx_pos_3d_train_df_lens_list}, {idx_neg_3d_train_df_lens_list}\nval: {idx_pos_3d_val_df_lens_list}, {idx_neg_3d_val_df_lens_list}\ntest: {idx_pos_3d_test_df_lens_list}, {idx_neg_3d_test_df_lens_list}")
            idx_pos_2d_train_df = pd.concat(idx_pos_2d_train_df_list, ignore_index=True)
            idx_pos_2d_train_df['label'] = 1
            idx_neg_2d_train_df = pd.concat(idx_neg_2d_train_df_list, ignore_index=True)
            idx_neg_2d_train_df['label'] = 0
            idx_data_2d_train_df = pd.concat([idx_pos_2d_train_df, idx_neg_2d_train_df], ignore_index=True)
            idx_pos_3d_train_df = pd.concat(idx_pos_3d_train_df_list, ignore_index=True)
            idx_pos_3d_train_df['label'] = 1
            idx_neg_3d_train_df = pd.concat(idx_neg_3d_train_df_list, ignore_index=True)
            idx_neg_3d_train_df['label'] = 0
            idx_data_3d_train_df = pd.concat([idx_pos_3d_train_df, idx_neg_3d_train_df], ignore_index=True)
            idx_pos_2d_val_df = pd.concat(idx_pos_2d_val_df_list, ignore_index=True)
            idx_pos_2d_val_df['label'] = 1
            idx_neg_2d_val_df = pd.concat(idx_neg_2d_val_df_list, ignore_index=True)
            idx_neg_2d_val_df['label'] = 0
            idx_data_2d_val_df = pd.concat([idx_pos_2d_val_df, idx_neg_2d_val_df], ignore_index=True)
            idx_pos_3d_val_df = pd.concat(idx_pos_3d_val_df_list, ignore_index=True)
            idx_pos_3d_val_df['label'] = 1
            idx_neg_3d_val_df = pd.concat(idx_neg_3d_val_df_list, ignore_index=True)
            idx_neg_3d_val_df['label'] = 0
            idx_data_3d_val_df = pd.concat([idx_pos_3d_val_df, idx_neg_3d_val_df], ignore_index=True)
            idx_pos_2d_test_df = pd.concat(idx_pos_2d_test_df_list, ignore_index=True)
            idx_pos_2d_test_df['label'] = 1
            idx_neg_2d_test_df = pd.concat(idx_neg_2d_test_df_list, ignore_index=True)
            idx_neg_2d_test_df['label'] = 0
            idx_data_2d_test_df = pd.concat([idx_pos_2d_test_df, idx_neg_2d_test_df], ignore_index=True)
            idx_pos_3d_test_df = pd.concat(idx_pos_3d_test_df_list, ignore_index=True)
            idx_pos_3d_test_df['label'] = 1
            idx_neg_3d_test_df = pd.concat(idx_neg_3d_test_df_list, ignore_index=True)
            idx_neg_3d_test_df['label'] = 0
            idx_data_3d_test_df = pd.concat([idx_pos_3d_test_df, idx_neg_3d_test_df], ignore_index=True)
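            # The 2d and 3d training tables are then padded to a common length so the paired
            # "2d3d" loaders can iterate them in lockstep (enforced by the assert below).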
f"{train_csv_dir}/{idx_task_str}_data_2d_train.csv" idx_2d_val_df_file = f"{train_csv_dir}/{idx_task_str}_data_2d_val.csv" idx_2d_test_df_file = f"{train_csv_dir}/{idx_task_str}_data_2d_test.csv" idx_3d_train_df_file = f"{train_csv_dir}/{idx_task_str}_data_3d_train.csv" idx_3d_val_df_file = f"{train_csv_dir}/{idx_task_str}_data_3d_val.csv" idx_3d_test_df_file = f"{train_csv_dir}/{idx_task_str}_data_3d_test.csv" data_2d_train_lens = len(idx_data_2d_train_df) data_3d_train_lens = len(idx_data_3d_train_df) print(f"idx_task_str: {idx_task_str}, data_2d, data_3d before 训练集\n{data_2d_train_lens}, {data_3d_train_lens}") if data_2d_train_lens > data_3d_train_lens: idx_data_3d_train_df = pad_df(idx_data_3d_train_df, data_2d_train_lens) elif data_2d_train_lens < data_3d_train_lens: idx_data_2d_train_df = pad_df(idx_data_2d_train_df, data_3d_train_lens) data_2d_train_lens = len(idx_data_2d_train_df) data_3d_train_lens = len(idx_data_3d_train_df) print(f"idx_task_str: {idx_task_str}, data_2d, data_3d after 训练集\n{data_2d_train_lens}, {data_3d_train_lens}") print(f"idx_task_str: {idx_task_str}\ntrain_2d_df: {len(idx_data_2d_train_df)}\nval_2d_df: {len(idx_data_2d_val_df)}\ntest_2d_df: {len(idx_data_2d_test_df)}\n") print(f"idx_task_str: {idx_task_str}\ntrain_3d_df: {len(idx_data_3d_train_df)}\nval_3d_df: {len(idx_data_3d_val_df)}\ntest_3d_df: {len(idx_data_3d_test_df)}\n") assert idx_data_2d_train_df['label'].isnull().sum() == 0 assert idx_data_3d_train_df['label'].isnull().sum() == 0 assert len(idx_data_2d_train_df) == len(idx_data_3d_train_df) if is_save_csv: idx_data_2d_train_df.to_csv(idx_2d_train_df_file, index=False, encoding="utf-8") idx_data_2d_val_df.to_csv(idx_2d_val_df_file, index=False, encoding="utf-8") idx_data_2d_test_df.to_csv(idx_2d_test_df_file, index=False, encoding="utf-8") idx_data_3d_train_df.to_csv(idx_3d_train_df_file, index=False, encoding="utf-8") idx_data_3d_val_df.to_csv(idx_3d_val_df_file, index=False, encoding="utf-8") idx_data_3d_test_df.to_csv(idx_3d_test_df_file, index=False, encoding="utf-8") logger.info(f"task_info: {idx_task_str}\ntrain_2d_df_file: {idx_2d_train_df_file}\nval_2d_df_file: {idx_2d_val_df_file}\ntest_2d_df_file: {idx_2d_test_df_file}\n") logger.info(f"task_info: {idx_task_str}\ntrain_3d_df_file: {idx_3d_train_df_file}\nval_3d_df_file: {idx_3d_val_df_file}\ntest_3d_df_file: {idx_3d_test_df_file}\n") else: idx_pos_npy_file_list = idx_pos_neg_npy_file_dict["pos"] idx_neg_npy_file_list = idx_pos_neg_npy_file_dict["neg"] idx_pos_node_list = [idx_pos_npy_file.split('_')[0] for idx_pos_npy_file in idx_pos_npy_file_list] idx_neg_node_list = [idx_neg_npy_file.split('_')[0] for idx_neg_npy_file in idx_neg_npy_file_list] logger.info(f"idx_task_str: {idx_task_str}, idx_pos_node_list: {idx_pos_node_list}, {idx_pos_npy_file_list}\nidx_neg_node_list: {idx_neg_node_list}, {idx_neg_npy_file_list}") _idx_pos_df_list = [pd.read_csv(f"{csv_data_dir}/{idx_pos_npy_file.split('_')[0]}/{idx_pos_npy_file}") for idx_pos_npy_file in idx_pos_npy_file_list] _idx_neg_df_list = [pd.read_csv(f"{csv_data_dir}/{idx_neg_npy_file.split('_')[0]}/{idx_neg_npy_file}") for idx_neg_npy_file in idx_neg_npy_file_list] idx_pos_df_list = [] idx_neg_df_list = [] for idx_node, idx_df in zip(idx_pos_node_list, _idx_pos_df_list): idx_df["node"] = idx_node idx_pos_df_list.append(idx_df) for idx_node, idx_df in zip(idx_neg_node_list, _idx_neg_df_list): idx_df["node"] = idx_node idx_neg_df_list.append(idx_df) idx_pos_train_df_list = [] idx_pos_val_df_list = [] idx_pos_test_df_list = [] 
            idx_neg_train_df_list = []
            idx_neg_val_df_list = []
            idx_neg_test_df_list = []
            for idx_pos_df in idx_pos_df_list:
                logger.info(f"idx_task_str: {idx_task_str}, idx_pos_df: {len(idx_pos_df)}")
                idx_pos_train_df, idx_pos_test_val_df = train_test_split(idx_pos_df, test_size=1-train_ratio, random_state=seed)
                idx_pos_val_df, idx_pos_test_df = train_test_split(idx_pos_test_val_df, test_size=test_ratio/(val_ratio+test_ratio), random_state=seed)
                idx_pos_train_df_list.append(idx_pos_train_df)
                idx_pos_val_df_list.append(idx_pos_val_df)
                idx_pos_test_df_list.append(idx_pos_test_df)
            for idx_neg_df in idx_neg_df_list:
                logger.info(f"idx_task_str: {idx_task_str}, idx_neg_df: {len(idx_neg_df)}")
                idx_neg_train_df, idx_neg_test_val_df = train_test_split(idx_neg_df, test_size=1-train_ratio, random_state=seed)
                idx_neg_val_df, idx_neg_test_df = train_test_split(idx_neg_test_val_df, test_size=test_ratio/(val_ratio+test_ratio), random_state=seed)
                idx_neg_train_df_list.append(idx_neg_train_df)
                idx_neg_val_df_list.append(idx_neg_val_df)
                idx_neg_test_df_list.append(idx_neg_test_df)
            idx_pos_train_df_lens_list = [len(idx_df) for idx_df in idx_pos_train_df_list]
            idx_pos_val_df_lens_list = [len(idx_df) for idx_df in idx_pos_val_df_list]
            idx_pos_test_df_lens_list = [len(idx_df) for idx_df in idx_pos_test_df_list]
            idx_neg_train_df_lens_list = [len(idx_df) for idx_df in idx_neg_train_df_list]
            idx_neg_val_df_lens_list = [len(idx_df) for idx_df in idx_neg_val_df_list]
            idx_neg_test_df_lens_list = [len(idx_df) for idx_df in idx_neg_test_df_list]
            print(f"idx_task_str: {idx_task_str}, within-class padding, before\ntrain: {idx_pos_train_df_lens_list}, {idx_neg_train_df_lens_list}\nval: {idx_pos_val_df_lens_list}, {idx_neg_val_df_lens_list}\ntest: {idx_pos_test_df_lens_list}, {idx_neg_test_df_lens_list}")
            if is_pad_df:
                idx_pos_train_df_list = [pad_df(idx_df, max(idx_pos_train_df_lens_list)) for idx_df in idx_pos_train_df_list]
                idx_neg_train_df_list = [pad_df(idx_df, max(idx_neg_train_df_lens_list)) for idx_df in idx_neg_train_df_list]
                # idx_pos_val_df_list = [pad_df(idx_df, max(idx_pos_val_df_lens_list)) for idx_df in idx_pos_val_df_list]
                # idx_neg_val_df_list = [pad_df(idx_df, max(idx_neg_val_df_lens_list)) for idx_df in idx_neg_val_df_list]
                # idx_pos_test_df_list = [pad_df(idx_df, max(idx_pos_test_df_lens_list)) for idx_df in idx_pos_test_df_list]
                # idx_neg_test_df_list = [pad_df(idx_df, max(idx_neg_test_df_lens_list)) for idx_df in idx_neg_test_df_list]
            idx_pos_train_df_lens_list = [len(idx_df) for idx_df in idx_pos_train_df_list]
            idx_pos_val_df_lens_list = [len(idx_df) for idx_df in idx_pos_val_df_list]
            idx_pos_test_df_lens_list = [len(idx_df) for idx_df in idx_pos_test_df_list]
            idx_neg_train_df_lens_list = [len(idx_df) for idx_df in idx_neg_train_df_list]
            idx_neg_val_df_lens_list = [len(idx_df) for idx_df in idx_neg_val_df_list]
            idx_neg_test_df_lens_list = [len(idx_df) for idx_df in idx_neg_test_df_list]
            print(f"idx_task_str: {idx_task_str}, within-class padding, after\ntrain: {idx_pos_train_df_lens_list}, {idx_neg_train_df_lens_list}\nval: {idx_pos_val_df_lens_list}, {idx_neg_val_df_lens_list}\ntest: {idx_pos_test_df_lens_list}, {idx_neg_test_df_lens_list}")
            if is_pad_df:
                # training set
                idx_pos_train_df_lens = sum(idx_pos_train_df_lens_list)
                idx_neg_train_df_lens = sum(idx_neg_train_df_lens_list)
                if idx_pos_train_df_lens > idx_neg_train_df_lens:
                    idx_neg_each_lens = idx_pos_train_df_lens // len(idx_neg_train_df_lens_list)
                    idx_neg_train_df_list = [pad_df(idx_df, idx_neg_each_lens) for idx_df in idx_neg_train_df_list]
                elif idx_pos_train_df_lens < idx_neg_train_df_lens:
                    idx_pos_each_lens = idx_neg_train_df_lens // len(idx_pos_train_df_lens_list)
                    idx_pos_train_df_list = [pad_df(idx_df, idx_pos_each_lens) for idx_df in idx_pos_train_df_list]
                # # validation set
                # idx_pos_val_df_lens = sum(idx_pos_val_df_lens_list)
                # idx_neg_val_df_lens = sum(idx_neg_val_df_lens_list)
                # if idx_pos_val_df_lens > idx_neg_val_df_lens:
                #     idx_neg_each_lens = idx_pos_val_df_lens // len(idx_neg_val_df_lens_list)
                #     idx_neg_val_df_list = [pad_df(idx_df, idx_neg_each_lens) for idx_df in idx_neg_val_df_list]
                # elif idx_pos_val_df_lens < idx_neg_val_df_lens:
                #     idx_pos_each_lens = idx_neg_val_df_lens // len(idx_pos_val_df_lens_list)
                #     idx_pos_val_df_list = [pad_df(idx_df, idx_pos_each_lens) for idx_df in idx_pos_val_df_list]
                # # test set
                # idx_pos_test_df_lens = sum(idx_pos_test_df_lens_list)
                # idx_neg_test_df_lens = sum(idx_neg_test_df_lens_list)
                # if idx_pos_test_df_lens > idx_neg_test_df_lens:
                #     idx_neg_each_lens = idx_pos_test_df_lens // len(idx_neg_test_df_lens_list)
                #     idx_neg_test_df_list = [pad_df(idx_df, idx_neg_each_lens) for idx_df in idx_neg_test_df_list]
                # elif idx_pos_test_df_lens < idx_neg_test_df_lens:
                #     idx_pos_each_lens = idx_neg_test_df_lens // len(idx_pos_test_df_lens_list)
                #     idx_pos_test_df_list = [pad_df(idx_df, idx_pos_each_lens) for idx_df in idx_pos_test_df_list]
            idx_pos_train_df_lens_list = [len(idx_df) for idx_df in idx_pos_train_df_list]
            idx_pos_val_df_lens_list = [len(idx_df) for idx_df in idx_pos_val_df_list]
            idx_pos_test_df_lens_list = [len(idx_df) for idx_df in idx_pos_test_df_list]
            idx_neg_train_df_lens_list = [len(idx_df) for idx_df in idx_neg_train_df_list]
            idx_neg_val_df_lens_list = [len(idx_df) for idx_df in idx_neg_val_df_list]
            idx_neg_test_df_lens_list = [len(idx_df) for idx_df in idx_neg_test_df_list]
            print(f"idx_task_str: {idx_task_str}, balanced data\ntrain: {idx_pos_train_df_lens_list}, {idx_neg_train_df_lens_list}\nval: {idx_pos_val_df_lens_list}, {idx_neg_val_df_lens_list}\ntest: {idx_pos_test_df_lens_list}, {idx_neg_test_df_lens_list}")
            idx_pos_train_df = pd.concat(idx_pos_train_df_list, ignore_index=True)
            idx_pos_train_df['label'] = 1
            idx_neg_train_df = pd.concat(idx_neg_train_df_list, ignore_index=True)
            idx_neg_train_df['label'] = 0
            idx_train_df = pd.concat([idx_pos_train_df, idx_neg_train_df], ignore_index=True)
            idx_pos_val_df = pd.concat(idx_pos_val_df_list, ignore_index=True)
            idx_pos_val_df['label'] = 1
            idx_neg_val_df = pd.concat(idx_neg_val_df_list, ignore_index=True)
            idx_neg_val_df['label'] = 0
            idx_val_df = pd.concat([idx_pos_val_df, idx_neg_val_df], ignore_index=True)
            idx_pos_test_df = pd.concat(idx_pos_test_df_list, ignore_index=True)
            idx_pos_test_df['label'] = 1
            idx_neg_test_df = pd.concat(idx_neg_test_df_list, ignore_index=True)
            idx_neg_test_df['label'] = 0
            idx_test_df = pd.concat([idx_pos_test_df, idx_neg_test_df], ignore_index=True)
            idx_train_df_file = f"{train_csv_dir}/{idx_task_str}_train.csv"
            idx_val_df_file = f"{train_csv_dir}/{idx_task_str}_val.csv"
            idx_test_df_file = f"{train_csv_dir}/{idx_task_str}_test.csv"
            print(f"idx_task_str: {idx_task_str}\ntrain_df: {len(idx_train_df)}\nval_df: {len(idx_val_df)}\ntest_df: {len(idx_test_df)}\n")
            assert idx_train_df['label'].isnull().sum() == 0
            if is_save_csv:
                idx_train_df.to_csv(idx_train_df_file, index=False, encoding="utf-8")
                idx_val_df.to_csv(idx_val_df_file, index=False, encoding="utf-8")
                idx_test_df.to_csv(idx_test_df_file, index=False, encoding="utf-8")
                logger.info(f"task_info: {idx_task_str}\ntrain_df_file: {idx_train_df_file}\nval_df_file: {idx_val_df_file}\ntest_df_file: {idx_test_df_file}\n")
    return

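# Illustrative sketch (assumption: train_csv_dir here is hypothetical): with the module-level
# net_id_list / node_npy_pos_neg_list / net_id_crop_size_dict / node_net_id_npy_file_dict
# defined below, the training csvs would be produced by a call like
# generate_train_npy_csv_file(
#     node_npy_pos_neg_list=node_npy_pos_neg_list,
#     net_id_list=net_id_list,
#     net_id_crop_size_dict=net_id_crop_size_dict,
#     node_net_id_npy_file_dict=node_net_id_npy_file_dict,
#     csv_data_dir=csv_data_dir,
#     train_csv_dir="/df_lung/cls_train_data/train_csv_data",
#     is_save_csv=True,
# )
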
csv_data_dir = "/df_lung/cls_train_data/csv_data" npy_data_dir = "/df_lung/cls_train_data/npy_data" def get_train_data_info_csv(node_time_list=[]): for node_time in node_time_list: csv_file = get_node_time_all_label_ids_df(node_time=node_time, csv_data_dir=csv_data_dir) logger.info(f"{node_time}: {csv_file}\n") def process_npy(args): generate_npy_data_by_all_label_id_df( csv_file=args[0], npy_data_3d_file=args[1], npy_data_2d_file=args[2], dicom_folder=args[3], generate_3d_npy_data_flag=args[4], generate_2d_npy_data_flag=args[5], crop_size_3d=args[6], crop_size_2d=args[7], rotate_count=args[8], expand=args[9], regular_class_3d=args[10], regular_class_2d=args[11], save_path=args[12] ) logger.info(f"process_npy finished: csv_file: {args[0]}, crop_size_3d: {args[6]}, crop_size_2d: {args[7]}") def get_npy_data(node_csv_file_list=[], crop_size_list=[]): if False in [False for _ in node_csv_file_list if f"{_[0]}_" not in _[1]]: raise ValueError(f"node_csv_file_list: {node_csv_file_list}") process_args_list = [] for node_time, csv_file in node_csv_file_list: for idx_crop_size_dict in crop_size_list: crop_size_3d = idx_crop_size_dict["crop_size_3d"] crop_size_2d = idx_crop_size_dict["crop_size_2d"] generate_3d_npy_data_flag = idx_crop_size_dict["generate_3d_npy_data_flag"] generate_2d_npy_data_flag = idx_crop_size_dict["generate_2d_npy_data_flag"] regular_class_3d = idx_crop_size_dict["regular_class_3d"] regular_class_2d = idx_crop_size_dict["regular_class_2d"] save_path = f"{npy_data_dir}/{node_time}" Path(save_path).mkdir(parents=True, exist_ok=True) idx_npy_data_3d_file = None if generate_3d_npy_data_flag: idx_npy_data_3d_file = f"{csv_file.replace('.csv', f'_{crop_size_3d[0]}_{crop_size_3d[1]}_{crop_size_3d[2]}_npy_data_3d')}.csv" idx_npy_data_3d_file = os.path.join(save_path, idx_npy_data_3d_file) idx_npy_data_2d_file = None if generate_2d_npy_data_flag: idx_npy_data_2d_file = f"{csv_file.replace('.csv', f'_{crop_size_2d[0]}_{crop_size_2d[1]}_npy_data_2d')}.csv" idx_npy_data_2d_file = os.path.join(save_path, idx_npy_data_2d_file) process_args_list.append( [ csv_file, idx_npy_data_3d_file, idx_npy_data_2d_file, "/opt/lung/ai", generate_3d_npy_data_flag, generate_2d_npy_data_flag, crop_size_3d, crop_size_2d, 10, 40, regular_class_3d, regular_class_2d, save_path ] ) # generate_npy_data_by_all_label_id_df( # csv_file=csv_file, # npy_data_3d_file=idx_npy_data_3d_file, # npy_data_2d_file=idx_npy_data_2d_file, # dicom_folder="/opt/lung/ai", # generate_3d_npy_data_flag=generate_3d_npy_data_flag, # generate_2d_npy_data_flag=generate_2d_npy_data_flag, # crop_size_3d=crop_size_3d, # crop_size_2d=crop_size_2d, # rotate_count=10, # expand=40, # regular_class_3d=regular_class_3d, # regular_class_2d=regular_class_2d, # save_path=save_path # ) # 多进程数据 process_count = len(process_args_list) process_list = [] for idx in range(process_count): process_args = process_args_list[idx] idx_process = Process(target=process_npy, args=(process_args,)) idx_process.start() process_list.append(idx_process) for idx_process in process_list: idx_process.join() return # # 生成csv数据 # node_time_list = [2046, 2047, 2048, 2060, 2061, 2062, 3001, 4001, 5001, 6001, 1016] # get_train_data_info_csv(node_time_list=node_time_list) ''' 2021: /df_lung/cls_train_data/csv_data/2021/2021_20241204_094025_rotate_10.csv 2031: /df_lung/cls_train_data/csv_data/2031/2031_20241204_094025_rotate_10.csv 2041: /df_lung/cls_train_data/csv_data/2041/2041_20241204_094026_rotate_10.csv 1010: 
/df_lung/cls_train_data/csv_data/1010/1010_20241204_093726_rotate_10.csv 1020: /df_lung/cls_train_data/csv_data/1020/1020_20241204_093726_rotate_10.csv 2011: /df_lung/cls_train_data/csv_data/2011/2011_20241204_093726_rotate_10.csv 2046: /df_lung/cls_train_data/csv_data/2046/2046_20241211_155642_rotate_10.csv 2047: /df_lung/cls_train_data/csv_data/2047/2047_20241211_155642_rotate_10.csv 2048: /df_lung/cls_train_data/csv_data/2048/2048_20241211_155642_rotate_10.csv 2060: /df_lung/cls_train_data/csv_data/2060/2060_20241211_155643_rotate_10.csv 2061: /df_lung/cls_train_data/csv_data/2061/2061_20241211_155643_rotate_10.csv 2062: /df_lung/cls_train_data/csv_data/2062/2062_20241211_155643_rotate_10.csv 3001: /df_lung/cls_train_data/csv_data/3001/3001_20241211_155643_rotate_10.csv 4001: /df_lung/cls_train_data/csv_data/4001/4001_20241211_155643_rotate_10.csv 5001: /df_lung/cls_train_data/csv_data/5001/5001_20241211_155643_rotate_10.csv 6001: /df_lung/cls_train_data/csv_data/6001/6001_20241211_155643_rotate_10.csv 1016: /df_lung/cls_train_data/csv_data/1016/1016_20241211_155643_rotate_10.csv ''' # # 生成npy数据 # node_csv_file_list = [ # (2046, "/df_lung/cls_train_data/csv_data/2046/2046_20241211_155642_rotate_10.csv"), # (2047, "/df_lung/cls_train_data/csv_data/2047/2047_20241211_155642_rotate_10.csv"), # (2048, "/df_lung/cls_train_data/csv_data/2048/2048_20241211_155642_rotate_10.csv"), # (2060, "/df_lung/cls_train_data/csv_data/2060/2060_20241211_155643_rotate_10.csv"), # (2061, "/df_lung/cls_train_data/csv_data/2061/2061_20241211_155643_rotate_10.csv"), # (2062, "/df_lung/cls_train_data/csv_data/2062/2062_20241211_155643_rotate_10.csv"), # (3001, "/df_lung/cls_train_data/csv_data/3001/3001_20241211_155643_rotate_10.csv"), # (4001, "/df_lung/cls_train_data/csv_data/4001/4001_20241211_155643_rotate_10.csv"), # (5001, "/df_lung/cls_train_data/csv_data/5001/5001_20241211_155643_rotate_10.csv"), # (6001, "/df_lung/cls_train_data/csv_data/6001/6001_20241211_155643_rotate_10.csv"), # (1016, "/df_lung/cls_train_data/csv_data/1016/1016_20241211_155643_rotate_10.csv") # ] # crop_size_list = [ # { # "crop_size_3d": [48, 256, 256], # "crop_size_2d": [256, 256], # "generate_3d_npy_data_flag": True, # "generate_2d_npy_data_flag": True, # "regular_class_3d": normalize_net_3d, # "regular_class_2d": normalize_net_2d # }, # { # "crop_size_3d": [128, 128, 128], # "crop_size_2d": [128, 128], # "generate_3d_npy_data_flag": True, # "generate_2d_npy_data_flag": False, # "regular_class_3d": s3d_normalize_3d, # "regular_class_2d": None # }, # { # "crop_size_3d": [48, 280, 280], # "crop_size_2d": [280, 280], # "generate_3d_npy_data_flag": False, # "generate_2d_npy_data_flag": True, # "regular_class_3d": None, # "regular_class_2d": d2d_normalize # } # ] # get_npy_data(node_csv_file_list=node_csv_file_list, crop_size_list=crop_size_list) ''' ''' # 生成训练数据 net_id_list = ["3d", "2d", "2d3d", "s3d", "d2d"] node_npy_pos_neg_list = [ { "pos": [2031], "neg": [2021] }, { "pos": [2031], "neg": [2041] }, { "pos": [2031], "neg": [1010,1020,2011,2021,2041] }, { "pos": [2041], "neg": [1010,1020,2011,2021,2031] }, { "pos": [2031], "neg": [1020, 2011, 2021] }, { "pos": [2041], "neg": [2046, 2047, 2048, 2060, 2061, 2062, 3001, 4001, 5001, 6001, 2021, 2031] }, { "pos": [1010, 1020, 1016], "neg": [2011, 2021, 2031, 2041] }, { "pos": [2041], "neg": [2011, 2021, 2031] }, ] net_id_crop_size_dict = { "3d": [48, 256, 256], "2d": [256, 256], "2d3d": [48, 256, 256], "s3d": [128, 128, 128], "d2d": [280, 280] } node_net_id_npy_file_dict = { (2021, 
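
# Consistency-check sketch -- illustration only. Assumption: every npy csv
# filename embeds its crop size as underscore-separated integers (e.g.
# "..._48_256_256_npy_data_3d.csv"). For the list-valued entries of
# node_net_id_npy_file_dict (built just below), this verifies that a registered
# filename agrees with net_id_crop_size_dict:
def _crop_size_matches_filename(net_id, npy_file_name):
    expected = "_".join(str(size) for size in net_id_crop_size_dict[net_id])
    return expected in npy_file_name
# e.g. _crop_size_matches_filename("s3d", "2031_20241204_094025_rotate_10_128_128_128_npy_data_3d.csv") -> True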
"3d"): ["2021_20241204_094025_rotate_10_48_256_256_npy_data_3d.csv"], (2021, "2d"): ["2021_20241204_094025_rotate_10_256_256_npy_data_2d.csv"], (2021, "2d3d"): { "2d": ["2021_20241204_094025_rotate_10_256_256_npy_data_2d.csv"], "3d": ["2021_20241204_094025_rotate_10_48_256_256_npy_data_3d.csv"] }, (2021, "s3d"): ["2021_20241204_094025_rotate_10_128_128_128_npy_data_3d.csv"], (2021, "d2d"): ["2021_20241204_094025_rotate_10_280_280_npy_data_2d.csv"], (2031, "3d"): ["2031_20241204_094025_rotate_10_48_256_256_npy_data_3d.csv"], (2031, "2d"): ["2031_20241204_094025_rotate_10_256_256_npy_data_2d.csv"], (2031, "2d3d"): { "2d": ["2031_20241204_094025_rotate_10_256_256_npy_data_2d.csv"], "3d": ["2031_20241204_094025_rotate_10_48_256_256_npy_data_3d.csv"] }, (2031, "s3d"): ["2031_20241204_094025_rotate_10_128_128_128_npy_data_3d.csv"], (2031, "d2d"): ["2031_20241204_094025_rotate_10_280_280_npy_data_2d.csv"], (2041, "3d"): ["2041_20241204_094026_rotate_10_48_256_256_npy_data_3d.csv"], (2041, "2d"): ["2041_20241204_094026_rotate_10_256_256_npy_data_2d.csv"], (2041, "2d3d"): { "2d": ["2041_20241204_094026_rotate_10_256_256_npy_data_2d.csv"], "3d": ["2041_20241204_094026_rotate_10_48_256_256_npy_data_3d.csv"] }, (2041, "s3d"): ["2041_20241204_094026_rotate_10_128_128_128_npy_data_3d.csv"], (2041, "d2d"): ["2041_20241204_094026_rotate_10_280_280_npy_data_2d.csv"], (1010, "3d"): ["1010_20241204_093726_rotate_10_48_256_256_npy_data_3d.csv"], (1010, "2d"): ["1010_20241204_093726_rotate_10_256_256_npy_data_2d.csv"], (1010, "2d3d"): { "2d": ["1010_20241204_093726_rotate_10_256_256_npy_data_2d.csv"], "3d": ["1010_20241204_093726_rotate_10_48_256_256_npy_data_3d.csv"] }, (1010, "s3d"): ["1010_20241204_093726_rotate_10_128_128_128_npy_data_3d.csv"], (1010, "d2d"): ["1010_20241204_093726_rotate_10_280_280_npy_data_2d.csv"], (1020, "3d"): ["1020_20241204_093726_rotate_10_48_256_256_npy_data_3d.csv"], (1020, "2d"): ["1020_20241204_093726_rotate_10_256_256_npy_data_2d.csv"], (1020, "2d3d"): { "2d": ["1020_20241204_093726_rotate_10_256_256_npy_data_2d.csv"], "3d": ["1020_20241204_093726_rotate_10_48_256_256_npy_data_3d.csv"] }, (1020, "s3d"): ["1020_20241204_093726_rotate_10_128_128_128_npy_data_3d.csv"], (1020, "d2d"): ["1020_20241204_093726_rotate_10_280_280_npy_data_2d.csv"], (2011, "3d"): ["2011_20241204_093726_rotate_10_48_256_256_npy_data_3d.csv"], (2011, "2d"): ["2011_20241204_093726_rotate_10_256_256_npy_data_2d.csv"], (2011, "2d3d"): { "2d": ["2011_20241204_093726_rotate_10_256_256_npy_data_2d.csv"], "3d": ["2011_20241204_093726_rotate_10_48_256_256_npy_data_3d.csv"] }, (2011, "s3d"): ["2011_20241204_093726_rotate_10_128_128_128_npy_data_3d.csv"], (2011, "d2d"): ["2011_20241204_093726_rotate_10_280_280_npy_data_2d.csv"], (2046, "3d"): ["2046_20241211_155642_rotate_10_48_256_256_npy_data_3d.csv"], (2046, "2d"): ["2046_20241211_155642_rotate_10_256_256_npy_data_2d.csv"], (2046, "2d3d"): { "2d": ["2046_20241211_155642_rotate_10_256_256_npy_data_2d.csv"], "3d": ["2046_20241211_155642_rotate_10_48_256_256_npy_data_3d.csv"] }, (2046, "s3d"): ["2046_20241211_155642_rotate_10_128_128_128_npy_data_3d.csv"], (2046, "d2d"): ["2046_20241211_155642_rotate_10_280_280_npy_data_2d.csv"], (2047, "3d"): ["2047_20241211_155642_rotate_10_48_256_256_npy_data_3d.csv"], (2047, "2d"): ["2047_20241211_155642_rotate_10_256_256_npy_data_2d.csv"], (2047, "2d3d"): { "2d": ["2047_20241211_155642_rotate_10_256_256_npy_data_2d.csv"], "3d": ["2047_20241211_155642_rotate_10_48_256_256_npy_data_3d.csv"] }, (2047, "s3d"): 
["2047_20241211_155642_rotate_10_128_128_128_npy_data_3d.csv"], (2047, "d2d"): ["2047_20241211_155642_rotate_10_280_280_npy_data_2d.csv"], (2048, "3d"): ["2048_20241211_155642_rotate_10_48_256_256_npy_data_3d.csv"], (2048, "2d"): ["2048_20241211_155642_rotate_10_256_256_npy_data_2d.csv"], (2048, "2d3d"): { "2d": ["2048_20241211_155642_rotate_10_256_256_npy_data_2d.csv"], "3d": ["2048_20241211_155642_rotate_10_48_256_256_npy_data_3d.csv"] }, (2048, "s3d"): ["2048_20241211_155642_rotate_10_128_128_128_npy_data_3d.csv"], (2048, "d2d"): ["2048_20241211_155642_rotate_10_280_280_npy_data_2d.csv"], (2060, "3d"): ["2060_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"], (2060, "2d"): ["2060_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], (2060, "2d3d"): { "2d": ["2060_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], "3d": ["2060_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"] }, (2060, "s3d"): ["2060_20241211_155643_rotate_10_128_128_128_npy_data_3d.csv"], (2060, "d2d"): ["2060_20241211_155643_rotate_10_280_280_npy_data_2d.csv"], (2061, "3d"): ["2061_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"], (2061, "2d"): ["2061_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], (2061, "2d3d"): { "2d": ["2061_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], "3d": ["2061_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"] }, (2061, "s3d"): ["2061_20241211_155643_rotate_10_128_128_128_npy_data_3d.csv"], (2061, "d2d"): ["2061_20241211_155643_rotate_10_280_280_npy_data_2d.csv"], (2062, "3d"): ["2062_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"], (2062, "2d"): ["2062_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], (2062, "2d3d"): { "2d": ["2062_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], "3d": ["2062_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"] }, (2062, "s3d"): ["2062_20241211_155643_rotate_10_128_128_128_npy_data_3d.csv"], (2062, "d2d"): ["2062_20241211_155643_rotate_10_280_280_npy_data_2d.csv"], (3001, "3d"): ["3001_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"], (3001, "2d"): ["3001_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], (3001, "2d3d"): { "2d": ["3001_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], "3d": ["3001_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"] }, (3001, "s3d"): ["3001_20241211_155643_rotate_10_128_128_128_npy_data_3d.csv"], (3001, "d2d"): ["3001_20241211_155643_rotate_10_280_280_npy_data_2d.csv"], (4001, "3d"): ["4001_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"], (4001, "2d"): ["4001_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], (4001, "2d3d"): { "2d": ["4001_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], "3d": ["4001_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"] }, (4001, "s3d"): ["4001_20241211_155643_rotate_10_128_128_128_npy_data_3d.csv"], (4001, "d2d"): ["4001_20241211_155643_rotate_10_280_280_npy_data_2d.csv"], (5001, "3d"): ["5001_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"], (5001, "2d"): ["5001_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], (5001, "2d3d"): { "2d": ["5001_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], "3d": ["5001_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"] }, (5001, "s3d"): ["5001_20241211_155643_rotate_10_128_128_128_npy_data_3d.csv"], (5001, "d2d"): ["5001_20241211_155643_rotate_10_280_280_npy_data_2d.csv"], (6001, "3d"): ["6001_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"], (6001, "2d"): ["6001_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], (6001, "2d3d"): { "2d": 
["6001_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], "3d": ["6001_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"] }, (6001, "s3d"): ["6001_20241211_155643_rotate_10_128_128_128_npy_data_3d.csv"], (6001, "d2d"): ["6001_20241211_155643_rotate_10_280_280_npy_data_2d.csv"], (1016, "3d"): ["1016_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"], (1016, "2d"): ["1016_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], (1016, "2d3d"): { "2d": ["1016_20241211_155643_rotate_10_256_256_npy_data_2d.csv"], "3d": ["1016_20241211_155643_rotate_10_48_256_256_npy_data_3d.csv"] }, (1016, "s3d"): ["1016_20241211_155643_rotate_10_128_128_128_npy_data_3d.csv"], (1016, "d2d"): ["1016_20241211_155643_rotate_10_280_280_npy_data_2d.csv"], } csv_data_dir = "/df_lung/cls_train_data/csv_data" train_csv_dir = "/df_lung/cls_train_data/train_csv_data" is_pad_df = True is_save_csv = False seed = 100004 generate_train_npy_csv_file( node_npy_pos_neg_list = node_npy_pos_neg_list, net_id_list = net_id_list, net_id_crop_size_dict = net_id_crop_size_dict, node_net_id_npy_file_dict = node_net_id_npy_file_dict, csv_data_dir=csv_data_dir, train_csv_dir=train_csv_dir, is_pad_df=is_pad_df, is_save_csv=is_save_csv, seed=seed )