"""Train and evaluate classical classifiers on 3D lung-nodule features.

For every ordered pair of node types, loads the per-pair train/test CSVs,
fits each supported classifier (LR, DT, NB, RF, GBDT, XGBoost, MLP) on the
z-score feature columns, and logs a classification report.
"""
import os
import pickle
import sys
import pathlib
from pathlib import Path

# Make the project root ("cls_train") importable regardless of where this
# script lives inside the tree.
current_dir = pathlib.Path(__file__).parent.resolve()
while current_dir.name != "cls_train":
    if current_dir.parent == current_dir:
        # Reached the filesystem root without finding "cls_train"; the
        # original loop would spin forever here.
        raise RuntimeError("could not locate 'cls_train' ancestor directory")
    current_dir = current_dir.parent
sys.path.append(current_dir.as_posix())

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from cls_utils.log_utils import get_logger

logger = get_logger(log_file="/df_lung/ai-project/cls_train/log/data/train_test_data_3d_feat.log")

# Epoch tag baked into every checkpoint filename by the one-shot sklearn fits.
# Value preserved from the original code (was a hard-coded 100004).
FINAL_EPOCH = 100004


class _PickleCls:
    """Shared wrapper around a picklable sklearn-style estimator.

    Subclasses assign ``self.cls`` (the estimator) in their ``__init__`` and
    set ``_label`` to the human-readable name used in saved-model log lines.
    """

    # Name used in "<label> saved to <file>" log messages; overridden per class.
    _label = "model"

    def __init__(self, in_feature=None, n_class=None, cls_info=None,
                 save_path=None, logger=None):
        # in_feature / n_class are kept for interface parity; the sklearn
        # estimators infer both from the data at fit time.
        self.in_feature = in_feature
        self.n_class = n_class
        self.cls_info = cls_info
        self.save_path = save_path
        self.logger = logger

    def train(self, train_np_data, train_np_class):
        """Fit on the full arrays and persist; return the checkpoint path."""
        self.cls.fit(train_np_data, train_np_class)
        return self.save(FINAL_EPOCH)

    def predict(self, np_data):
        """Return per-class probabilities, shape (n_samples, n_classes)."""
        return self.cls.predict_proba(np_data)

    def save(self, epoch):
        """Pickle the estimator to ``{cls_info}_epoch_{epoch}.pkl``."""
        current_epoch_file = os.path.join(
            self.save_path, f"{self.cls_info}_epoch_{epoch}.pkl")
        with open(current_epoch_file, 'wb') as f:
            pickle.dump(self.cls, f)
        if self.logger:
            self.logger.info(f"{self._label} saved to {current_epoch_file}")
        return current_epoch_file

    def load_predict(self, pkl_file):
        """Replace the estimator with one unpickled from ``pkl_file``.

        Raises if the file is missing. NOTE(review): pickle.load on an
        untrusted file executes arbitrary code — only load trusted checkpoints.
        """
        if pkl_file and os.path.exists(pkl_file):
            with open(pkl_file, 'rb') as f:
                self.cls = pickle.load(f)
        else:
            raise Exception(f'cannot load file: {pkl_file}')


class LR(_PickleCls):
    """Logistic-regression wrapper."""

    _label = "LR"

    def __init__(self, in_feature=None, n_class=None, cls_info=None,
                 save_path=None, logger=None):
        super().__init__(in_feature, n_class, cls_info, save_path, logger)
        self.cls = LogisticRegression(max_iter=1000)


class DT(_PickleCls):
    """Decision-tree wrapper."""

    _label = "DT"

    def __init__(self, in_feature=None, n_class=None, cls_info=None,
                 save_path=None, logger=None):
        super().__init__(in_feature, n_class, cls_info, save_path, logger)
        self.cls = DecisionTreeClassifier()


class NB(_PickleCls):
    """Gaussian naive-Bayes wrapper."""

    _label = "NB"

    def __init__(self, in_feature=None, n_class=None, cls_info=None,
                 save_path=None, logger=None):
        super().__init__(in_feature, n_class, cls_info, save_path, logger)
        self.cls = GaussianNB()


class RF(_PickleCls):
    """Random-forest wrapper."""

    _label = "Random Forest"

    def __init__(self, in_feature=None, n_class=None, cls_info=None,
                 save_path=None, logger=None, n_estimators=100):
        super().__init__(in_feature, n_class, cls_info, save_path, logger)
        self.cls = RandomForestClassifier(n_estimators=n_estimators, random_state=42)


class GBDT(_PickleCls):
    """Gradient-boosted decision-tree wrapper."""

    _label = "GBDT"

    def __init__(self, in_feature=None, n_class=None, cls_info=None,
                 save_path=None, logger=None, n_estimators=100):
        super().__init__(in_feature, n_class, cls_info, save_path, logger)
        self.cls = GradientBoostingClassifier(n_estimators=n_estimators, random_state=42)


class XGB(_PickleCls):
    """XGBoost wrapper; uses xgboost's native save/load instead of pickle."""

    _label = "XGBoost"

    def __init__(self, in_feature=None, n_class=None, cls_info=None,
                 save_path=None, logger=None, n_estimators=100):
        super().__init__(in_feature, n_class, cls_info, save_path, logger)
        self.cls = xgb.XGBClassifier(n_estimators=n_estimators,
                                     use_label_encoder=False,
                                     eval_metric='logloss',
                                     random_state=42)

    def save(self, epoch):
        """Save via xgboost's native format (filename keeps the .pkl suffix
        for consistency with the other classifiers' checkpoints)."""
        current_epoch_file = os.path.join(
            self.save_path, f"{self.cls_info}_epoch_{epoch}.pkl")
        self.cls.save_model(current_epoch_file)
        if self.logger:
            self.logger.info(f"XGBoost saved to {current_epoch_file}")
        return current_epoch_file

    def load_predict(self, pkl_file):
        """Load model weights into the existing estimator in place."""
        if pkl_file and os.path.exists(pkl_file):
            self.cls.load_model(pkl_file)
        else:
            raise Exception(f'cannot load file: {pkl_file}')


class Linear(_PickleCls):
    """MLP ("linear") wrapper; hidden sizes scale with the input width."""

    _label = "MLP"

    def __init__(self, in_feature=None, n_class=None, cls_info=None,
                 save_path=None, logger=None):
        super().__init__(in_feature, n_class, cls_info, save_path, logger)
        self.cls = MLPClassifier(
            hidden_layer_sizes=(10 * in_feature, in_feature, 40),
            activation='relu',
            solver='adam',
            max_iter=200,
            random_state=42,
        )


class FeatCls:
    """Facade that selects, trains, and evaluates one named classifier.

    Parameters
    ----------
    classification_name : one of 'lr', 'dt', 'nb', 'rf', 'gbdt', 'xgb', 'linear'.
    in_feature_col_list : DataFrame columns used as input features.
    class_col           : DataFrame column holding the integer class label.
    pkl_file            : optional checkpoint to load for prediction/testing.
    cls_task_info       : tag used in checkpoint filenames and the log file.
    usg                 : usage tag ('train' / 'test') for log naming.
    """

    def __init__(self, classification_name=None, n_estimators=100,
                 in_feature_col_list=None, class_col=None, n_class=None,
                 pkl_file=None, cls_task_info=None, usg=None,
                 save_dir=None, log_dir=None):
        self.classification_name = classification_name
        self.n_estimators = n_estimators
        self.in_feature_col_list = in_feature_col_list
        self.in_feature = len(self.in_feature_col_list)
        self.class_col = class_col
        self.n_class = n_class
        self.pkl_file = pkl_file
        self.cls_task_info = cls_task_info
        self.usg = usg
        self.save_path = os.path.join(save_dir, self.cls_task_info)
        self.log_file = os.path.join(log_dir, f"{self.usg}_{self.cls_task_info}.log")
        Path(self.save_path).mkdir(parents=True, exist_ok=True)
        self.logger = get_logger(self.log_file)
        self.model = self.get_model()
        # None when no checkpoint was given (training-only usage).
        self.predict_model = self.load_predict()

    def get_model(self):
        """Instantiate all supported wrappers and return the selected one.

        NOTE(review): all seven models are constructed eagerly even though
        only one is used; preserved from the original behavior (the dict is
        also exposed as ``self.support_classification``).
        """
        common = dict(in_feature=self.in_feature,
                      n_class=self.n_class,
                      cls_info=self.cls_task_info,
                      save_path=self.save_path,
                      logger=self.logger)
        self.support_classification = {
            "lr": LR(**common),
            "dt": DT(**common),
            "nb": NB(**common),
            "rf": RF(**common, n_estimators=self.n_estimators),
            "gbdt": GBDT(**common, n_estimators=self.n_estimators),
            "xgb": XGB(**common, n_estimators=self.n_estimators),
            # The MLP checkpoint name carries an extra "_mlp" suffix.
            "linear": Linear(**{**common, "cls_info": f"{self.cls_task_info}_mlp"}),
        }
        if self.classification_name not in self.support_classification:
            raise Exception(f"Classification name not supported: {self.classification_name}")
        return self.support_classification[self.classification_name]

    def load_predict(self):
        """Build the prediction model from ``self.pkl_file``, if any.

        Returns the raw unpickled sklearn estimator for .pkl checkpoints
        (so ``.predict`` yields class labels), the XGB wrapper for xgboost
        (so ``.predict`` yields probabilities — ``test`` argmaxes either way),
        or None when no checkpoint was provided.
        """
        if (self.classification_name != 'xgb' and self.pkl_file
                and os.path.exists(self.pkl_file)
                and self.pkl_file.endswith('pkl')):
            with open(self.pkl_file, 'rb') as f:
                model = pickle.load(f)
            return model
        elif self.pkl_file and os.path.exists(self.pkl_file) and '.pt' in self.pkl_file:
            # NOTE(review): load_state_dict returns a key-mismatch report,
            # not the model; preserved from the original — confirm intent
            # before relying on the '.pt' path.
            return self.model.load_state_dict(torch.load(self.pkl_file))
        elif self.classification_name == 'xgb' and self.pkl_file:
            self.model.load_predict(self.pkl_file)
            return self.model
        elif self.pkl_file:
            raise Exception(f'cannot load file: {self.pkl_file}')

    def load_data(self, csv_file):
        """Accept either a CSV path or an already-loaded DataFrame."""
        if isinstance(csv_file, str) and os.path.isfile(csv_file):
            return pd.read_csv(csv_file, header=0)
        return csv_file

    def train(self, train_data):
        """Fit the selected model on ``train_data``; return checkpoint path."""
        self.train_df = self.load_data(train_data)
        train_input_np_feat = self.train_df[self.in_feature_col_list].to_numpy()
        train_input_np_class = self.train_df[self.class_col].to_numpy()
        self.pkl_file = self.model.train(train_input_np_feat, train_input_np_class)
        self.logger.info(f"save to {self.pkl_file}")
        return self.pkl_file

    def test(self, test_data):
        """Evaluate the loaded prediction model; return the report string."""
        test_df = self.load_data(test_data)
        test_input_np_feat = test_df[self.in_feature_col_list].to_numpy()
        test_input_np_class = test_df[self.class_col].to_numpy()
        test_predict_prob = self.predict_model.predict(test_input_np_feat)
        # Probability matrices (wrapper models) are argmaxed to labels;
        # raw sklearn estimators already return 1-D labels.
        if len(test_predict_prob.shape) > 1 and test_predict_prob.shape[1] > 1:
            test_predict_prob = np.argmax(test_predict_prob, axis=1)
        report = classification_report(
            test_input_np_class,
            test_predict_prob,
            zero_division=0,
            output_dict=False,
        )
        if self.logger:
            # Log message text (including the historical "resport" spelling)
            # kept byte-identical for downstream log consumers.
            self.logger.info(f"{self.usg}_{self.cls_task_info}, classification_resport:\n{report}")
        return report

    def predict(self, np_feat):
        """Predict with the loaded model and return a plain Python list."""
        result = self.predict_model.predict(np_feat)
        return result.tolist()


def main():
    """Grid over node-type pairs x classifiers: train, test, log reports."""
    n_features = 136  # feature dimensionality (informational; not used below)
    node_list = [2021, 2031, 2041, 1010, 1020, 2011, 2046, 2047, 2048,
                 2060, 2061, 2062, 3001, 4001, 5001, 6001, 1016]
    save_dir = "/df_lung/ai-project/cls_train/cls_ckpt"
    train_csv_dir = "/df_lung/cls_train_data/train_csv_data"
    log_dir = "/df_lung/ai-project/cls_train/log/train"

    for idx_pos_node in node_list:
        for idx_neg_node in node_list:
            if idx_pos_node == idx_neg_node:
                continue
            idx_train_file = os.path.join(
                train_csv_dir, f"{idx_neg_node}_{idx_pos_node}_data_3d_feature_train.csv")
            idx_val_file = os.path.join(
                train_csv_dir, f"{idx_neg_node}_{idx_pos_node}_data_3d_feature_val.csv")
            idx_test_file = os.path.join(
                train_csv_dir, f"{idx_neg_node}_{idx_pos_node}_data_3d_feature_test.csv")
            if not (os.path.exists(idx_train_file) and os.path.exists(idx_test_file)):
                logger.info(f"{idx_pos_node}_{idx_neg_node} train_data_3d not exists")
                continue
            idx_train_df = pd.read_csv(idx_train_file)
            idx_test_df = pd.read_csv(idx_test_file)
            logger.info(f"{idx_neg_node}_{idx_pos_node}, train: {idx_train_df['class'].value_counts()}, test: {idx_test_df['class'].value_counts()}")
            # Only z-score-normalized columns are used as input features.
            idx_feature_list = [col for col in idx_train_df.columns.tolist()
                                if col.endswith('zscore')]
            for idx_classification_name in ['lr', 'dt', 'nb', 'rf', 'gbdt', 'xgb', 'linear']:
                idx_train_cls = FeatCls(
                    classification_name=idx_classification_name,
                    in_feature_col_list=idx_feature_list,
                    class_col='class',
                    n_class=2,
                    pkl_file=None,
                    cls_task_info=f"{idx_classification_name}_cls_20241215",
                    usg="train",
                    save_dir=save_dir,
                    log_dir=log_dir,
                )
                print(f"{idx_neg_node}_{idx_pos_node}_{idx_classification_name} start train")
                idx_pkl_file = idx_train_cls.train(idx_train_df)
                logger.info(f"{idx_neg_node}_{idx_pos_node}_{idx_classification_name}, idx_pkl_file: {idx_pkl_file}")
                idx_test_cls = FeatCls(
                    classification_name=idx_classification_name,
                    in_feature_col_list=idx_feature_list,
                    class_col='class',
                    n_class=2,
                    pkl_file=idx_pkl_file,
                    cls_task_info=f"{idx_classification_name}_cls_20241215",
                    usg="test",
                    save_dir=save_dir,
                    log_dir=log_dir,
                )
                print(f"{idx_neg_node}_{idx_pos_node}_{idx_classification_name} start test ")
                idx_classification_report = idx_test_cls.test(idx_test_df)
                print(f"classification_report:\n{idx_classification_report}")
                with open("test_result.txt", 'a') as f:
                    f.write(f"{idx_neg_node}_{idx_pos_node}_{idx_classification_name}, classification_report:\n{idx_classification_report}\n\n\n\n")
                logger.info(f"{idx_neg_node}_{idx_pos_node}_{idx_classification_name}, classification_report:\n{idx_classification_report}")


if __name__ == "__main__":
    main()