"""Convert classification annotation tables into per-lesion label files.

The module defines :class:`ClsFormProcessor`, which loads one or more
annotation tables, extracts a bounding box per record, optionally drops
overlapping boxes, converts box centers to voxel indices of the matching
``.nii.gz`` image and finally writes a CSV plus a uid list.
"""
import os
import sys
import glob
import logging
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import *
import SimpleITK as sitk

from form_process import *

# logging.FileHandler opens the file immediately and does not create missing
# parent directories, so make sure the log directory exists first.
os.makedirs('./log', exist_ok=True)

logger = logging.getLogger()
fh = logging.FileHandler('./log/ClsFormProcessor.log', encoding='utf-8')
sh = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
fh.setFormatter(formatter)
sh.setFormatter(formatter)
logger.addHandler(fh)
logger.addHandler(sh)
logger.setLevel(logging.DEBUG)


class ClsFormProcessor(object):
    """Pipeline that turns annotation tables into a flat per-lesion table.

    Calling an instance runs, in order: ``_getInfoMapDict``, ``_FilterForm``,
    ``_SplitForm``, ``_CombineInfo``, ``_RemoveOverlap``,
    ``_ConverToPixelCoord`` and ``_saveResult``.

    Keyword arguments (all read with ``kwargs.get``):
        paths: table paths handed to ``LoadDFs``; the input tables should all
            share the same format.
        type_kws: source column names that hold the category labels.
        type_cor_kws: output column names, paired one-to-one with
            ``type_kws``.
        form_save_path: base directory for the output CSV.
        uid_save_path: base directory for the output uid list.
        data_base_path: directory holding the original images; each case is
            looked up as ``<data_base_path>/<uid>.nii.gz``.
        folder_name: sub-folder created under both save paths.
        iou_th: IoU threshold for overlap removal (default ``1.0``).
        data_kw: source column holding the annotation payload.
        uid_kw: source column holding the series uid.
        task_id_kw: source column holding the task id.
    """

    def __init__(self, **kwargs):
        self.paths = kwargs.get('paths')
        # The input tables should all share the same format.
        # NOTE(review): LoadDFs is not defined in this file (presumably it
        # comes from form_process); it is used below as a pandas DataFrame.
        self.df = LoadDFs(self.paths)
        # Column names that hold the category labels, and their new names.
        self.type_kws = kwargs.get('type_kws')
        self.type_cor_kws = kwargs.get('type_cor_kws')

        self.form_save_path = kwargs.get('form_save_path')
        self.uid_save_path = kwargs.get('uid_save_path')
        # Storage path of the original images (dicom / nii.gz data).
        self.data_base_path = kwargs.get('data_base_path')
        self.folder_name = kwargs.get('folder_name')

        # Output columns filled from the comma-separated position string,
        # in this order.
        self.bbox_kws = ['x1', 'x2', 'y1', 'y2', 'z1', 'z2']
        self.iou_th = kwargs.get('iou_th', 1.0)

        self.data_kw = kwargs.get('data_kw', u'影像结果')
        self.uid_kw = kwargs.get('uid_kw', u'序列编号')
        self.task_id_kw = kwargs.get('task_id_kw', u'任务ID')
        # Key of the parsed annotation payload that carries the bounding box.
        self.data_kw_convert = 'position'

    def _getInfoMapDict(self):
        """Build ``self.rename_dict`` (source column -> output column).

        Subclasses are expected to override this when the column layout
        differs.
        """
        # Missing label-column lists are treated as empty instead of
        # crashing in zip().
        type_map_dict = dict(zip(self.type_kws or [], self.type_cor_kws or []))
        self.rename_dict = {
            self.task_id_kw: 'task_id',
            self.uid_kw: 'uid',
            self.data_kw: 'data',
        }
        # Add the label columns to the renaming table.
        self.rename_dict.update(type_map_dict)

    def __removeChars(self, x):
        """Return ``str(x)`` without square brackets and one trailing comma."""
        x = str(x).replace('[', '').replace(']', '')
        # endswith() is safe on an empty string, unlike x[-1].
        if x.endswith(','):
            x = x[:-1]
        return x

    def __splitComma(self, x, idx):
        """Return the ``idx``-th comma-separated field of ``x`` as a float."""
        return float(str(x).split(',')[idx])

    def __GetSaveName(self):
        """Set ``self.task_ids`` to the output file stem.

        The stem is ``ALGRESULT_`` followed by the sorted, de-duplicated
        task ids joined with underscores.
        """
        task_ids = self.df[self.task_id_kw].drop_duplicates().values
        task_ids.sort()
        self.task_ids = 'ALGRESULT_' + '_'.join(str(val) for val in task_ids)

    def __call__(self):
        """Run the whole pipeline and write the results to disk."""
        self._getInfoMapDict()
        self._FilterForm()
        self._SplitForm()
        self._CombineInfo()
        self._RemoveOverlap()
        self._ConverToPixelCoord()
        self._saveResult()

    def _FilterForm(self):
        """Hook for dropping unwanted rows; a no-op here.

        Subclasses are expected to override this.
        """
        return

    def _SplitForm(self):
        """Extract the useful columns and split the bounding box.

        Steps:
            1. Keep and rename the columns listed in ``self.rename_dict``.
            2. Expand the ``data`` payload into columns.
            3. Where the position entry is missing, fall back to ``bounds``.
            4. Split the position string into the ``self.bbox_kws`` columns.

        On return ``self.df`` has a fresh 0..n-1 index.
        """
        self.__GetSaveName()
        position_kw = self.data_kw_convert

        target_columns = list(self.rename_dict)
        self.df = self.df[target_columns]
        self.df = self.df.rename(index=str, columns=self.rename_dict)

        # SECURITY NOTE(review): eval() executes whatever expression is stored
        # in the 'data' cell. This is only acceptable for trusted tables;
        # consider ast.literal_eval or json.loads if the payload format
        # allows it.
        parsed = self.df['data'].astype(str).apply(eval).apply(pd.Series)
        self.df = self.df.assign(**parsed)

        final_columns = list(self.rename_dict.values()) + [position_kw]

        missing_position = self.df[position_kw].isnull()
        # Explicit copy: the fallback below writes into this frame.
        df1 = self.df[missing_position].copy()
        df2 = self.df[~missing_position]
        if len(df1) > 0:
            df1[position_kw] = df1['bounds']
        # Always rebuild the frame so the index is 0..n-1; the later stages
        # collect row labels from this index.
        self.df = pd.concat([df1, df2], ignore_index=True)
        self.df = self.df[final_columns].copy()
        self.df[position_kw] = self.df[position_kw].apply(self.__removeChars)

        # Split the position string into one column per bbox coordinate.
        for idx, bbox_kw in enumerate(self.bbox_kws):
            self.df[bbox_kw] = self.df[position_kw].apply(
                lambda x, idx=idx: self.__splitComma(x, idx))

    def _CombineInfo(self):
        """Derive per-lesion geometry columns from the bbox columns.

        Adds:
            1. center_x, center_y, center_z
            2. diameter_x_mm, diameter_y_mm, diameter_z_mm
            3. diameter_mm, the largest of the three diameters
        """
        for axis in ('x', 'y', 'z'):
            low = self.df['%s1' % axis]
            high = self.df['%s2' % axis]
            self.df['center_%s' % axis] = (low + high) / 2.0
            self.df['diameter_%s_mm' % axis] = (low - high).abs()
        self.df['diameter_mm'] = self.df[
            ['diameter_x_mm', 'diameter_y_mm', 'diameter_z_mm']].max(axis=1)

    def __RemoveOverlapSingleCase(self, uid):
        """Return the index labels of the lesions of ``uid`` to keep.

        Current logic: a lesion is discarded as soon as its IoU with any
        other lesion of the same case exceeds ``self.iou_th``.

        NOTE(review): bbox_iou is not defined in this file (presumably it
        comes from form_process); it receives the float lists parsed from
        the position strings.
        """
        df = self.df
        case_positions = df.loc[df['uid'] == uid, self.data_kw_convert]
        # Parse every position string once instead of once per pair.
        bboxes = [
            (idx, [float(val) for val in position.split(',')])
            for idx, position in case_positions.items()
        ]

        valid_keys = []
        for idx_1, bbox_1 in bboxes:
            overlapped = any(
                idx_2 != idx_1 and bbox_iou(bbox_1, bbox_2) > self.iou_th
                for idx_2, bbox_2 in bboxes
            )
            if not overlapped:
                valid_keys.append(idx_1)
        return valid_keys

    def _RemoveOverlap(self):
        """Drop overlapping lesions case by case.

        Also stores the uids seen before filtering in ``self.uids``. The
        previous index is kept as an ``index`` column because the index is
        reset without ``drop``.
        """
        uids = self.df['uid'].drop_duplicates().values
        self.uids = uids

        indices = []
        for uid in tqdm(uids):
            indices += self.__RemoveOverlapSingleCase(uid)

        # The collected values are index labels, so select by label (.loc)
        # rather than by position (.take).
        self.df = self.df.loc[indices].reset_index()
        logger.info('Number of records after remove overlap %d ', len(self.df))

    def __converToPixelCoordSingleCase(self, uid):
        """Convert the lesion centers of one case to continuous voxel indices.

        Returns a list with one record per lesion:
        ``[row label, image_path, spacing x/y/z, index x/y/z]``.
        The list is empty when ``<data_base_path>/<uid>.nii.gz`` is missing.
        """
        image_path = str('%s/%s.nii.gz' % (self.data_base_path, uid))
        if not os.path.exists(image_path):
            logger.warning('Image not found, case skipped: %s', image_path)
            return []

        image = sitk.ReadImage(image_path)
        spacing = list(image.GetSpacing())
        case_centers = self.df.loc[self.df['uid'] == uid,
                                   ['center_x', 'center_y', 'center_z']]

        records = []
        for row in case_centers.itertuples():
            row_label = row[0]
            center = [float(val) for val in row[1:]]
            px_x, px_y, px_z = image.TransformPhysicalPointToContinuousIndex(center)
            records.append([row_label, image_path] + spacing + [px_x, px_y, px_z])
        return records

    def _ConverToPixelCoord(self):
        """Attach image path, spacing and voxel coordinates to every lesion.

        Adds ``image_path``, ``spac_x/y/z`` and ``coordX/Y/Z``. Lesions whose
        image file is missing are dropped. ``center_x/y/z`` are rounded to
        two decimals.
        """
        row_kw = '_row_id'
        center_kws = ['center_x', 'center_y', 'center_z']

        # Make the row labels unique so they can serve as the join key.
        self.df = self.df.reset_index(drop=True)

        all_records = []
        for uid in tqdm(self.df['uid'].drop_duplicates().values):
            all_records += self.__converToPixelCoordSingleCase(uid)

        df0 = pd.DataFrame(
            all_records,
            columns=[row_kw, 'image_path', 'spac_x', 'spac_y', 'spac_z',
                     'coordX', 'coordY', 'coordZ'])

        self.df[center_kws] = self.df[center_kws].astype('float').round(2)

        # Join on the row id rather than on rounded center coordinates:
        # lesions that share a rounded center (in the same or in different
        # cases) would otherwise be cross-joined and duplicated.
        left = self.df.assign(**{row_kw: self.df.index})
        df0[row_kw] = df0[row_kw].astype(left[row_kw].dtype)
        self.df = left.merge(df0, on=row_kw).drop(columns=row_kw)

    def _saveResult(self):
        """Write the final table as CSV and the uid list as text."""
        form_save_base_path = '%s/%s/' % (self.form_save_path, self.folder_name)
        uid_save_base_path = '%s/%s/' % (self.uid_save_path, self.folder_name)

        form_save_path = '%s/%s.csv' % (form_save_base_path, self.task_ids)
        uid_save_path = '%s/%s.txt' % (uid_save_base_path, self.task_ids)

        os.makedirs(form_save_base_path, exist_ok=True)
        os.makedirs(uid_save_base_path, exist_ok=True)

        self.df.to_csv(form_save_path, index=False)
        # NOTE(review): self.uids is collected in _RemoveOverlap before the
        # overlap and missing-image filtering, so it can contain uids that
        # have no row in the CSV. Confirm this is intended.
        # WriteTxt is not defined in this file (presumably from form_process).
        WriteTxt(self.uids, uid_save_path)


if __name__ == "__main__":
    paths = ['/fileser/xupl/cls_data_preprocess/NoduleSur/20200710/CZ_LobuSameResult.csv']
    type_kws = [u'结节表征1', u'结节表征2', u'结节表征3-是否分叶', u'结节表征4-胸膜是否凹陷', u'结节表征5-卫星灶',
                u'结节表征6-空泡', u'结节表征7-囊状多选']
    type_cor_kws = ['regular', 'smooth', 'lobulated', 'p_sunken', 'satelliteFocal', 'Vacuolus', 'sacciform']
    form_save_path = '/fileser/xupl/cls_data_preprocess/NoduleSur/preprocess/label'
    uid_save_path = '/fileser/xupl/cls_data_preprocess/NoduleSur/preprocess/uid'
    data_base_path = '/fileser/DATA/IMAGE/SOURCE/PROXIMA/RAW_NII'
    folder_name = '20210107_test'
    iou_th = 1.0
    data_kw = u'影像结果'
    uid_kw = u'序列号'

    param_list = {
        'paths': paths,
        'type_kws': type_kws,
        'type_cor_kws': type_cor_kws,
        'form_save_path': form_save_path,
        'uid_save_path': uid_save_path,
        'data_base_path': data_base_path,
        'folder_name': folder_name,
        'iou_th': iou_th,
        'data_kw': data_kw,
        'uid_kw': uid_kw,
    }

    processor = ClsFormProcessor(**param_list)
    processor()