"""Preprocessing script for the CT COVID data set.

The pipeline (see ``COVIDPre.__call__``) copies raw volumes into a common
folder layout, resamples them to a fixed spacing, records the resulting file
paths in CSV files, derives bounding boxes from masks and finally dumps the
resampled volumes as ``.npy`` arrays.
"""
import glob
import logging
import os
import shutil
import sys
from collections import defaultdict

import numpy as np
import pandas as pd
import SimpleITK as sitk
from skimage.measure import *
from tqdm import tqdm

# The helper module is presumably located in ../utils, so that directory is
# appended to the import path before the wildcard import below.
sys.path.append('../utils')
from respacing_func import *

# Root logger writing both to PrepData.log and to the console.
logger = logging.getLogger()
fh = logging.FileHandler('PrepData.log', encoding='utf-8')
sh = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
fh.setFormatter(formatter)
sh.setFormatter(formatter)
logger.addHandler(fh)
logger.addHandler(sh)
logger.setLevel(logging.DEBUG)


def ReadITKImage(path):
    """Read an image file and return ``(sitk image, numpy array)``."""
    itk_img = sitk.ReadImage(path)
    img_np = sitk.GetArrayFromImage(itk_img)
    return itk_img, img_np


def WriteMask(mask, origin=None, spacing=None, path=''):
    """Write ``mask`` to ``path`` with compression enabled.

    Args:
        mask: When both ``origin`` and ``spacing`` are ``None`` this must
            already be a SimpleITK image and is written unchanged. Otherwise
            it is treated as an array, converted to a SimpleITK image and
            cast to ``sitkUInt8``.
        origin: Optional origin applied to the converted image.
        spacing: Optional spacing applied to the converted image.
        path: Destination file name.
    """
    if origin is None and spacing is None:
        itk_mask = mask
    else:
        itk_mask = sitk.GetImageFromArray(mask)
        itk_mask = sitk.Cast(itk_mask, sitk.sitkUInt8)
        # Only apply the metadata that was actually supplied; passing None to
        # SetSpacing/SetOrigin would raise.
        if spacing is not None:
            itk_mask.SetSpacing(spacing)
        if origin is not None:
            itk_mask.SetOrigin(origin)
    print('Saving img to %s ' % path)
    itkWriter = sitk.ImageFileWriter()
    itkWriter.SetUseCompression(True)
    itkWriter.SetFileName(path)
    itkWriter.Execute(itk_mask)


def ReadCSV(path):
    """Load a table from a csv or xlsx file.

    The file type is chosen by substring match on ``path`` ('csv' is checked
    first). Returns ``None`` when the path matches neither.
    """
    if 'csv' in path:
        return pd.read_csv(path, encoding='utf_8_sig')
    if 'xlsx' in path:
        # read_excel has no ``encoding`` keyword in pandas >= 1.0; passing it
        # raises TypeError, so it is intentionally omitted.
        return pd.read_excel(path)
    return None


def LoadDFs(paths):
    """Concatenate every csv/xlsx table in ``paths`` into one DataFrame.

    Paths that contain neither 'csv' nor 'xlsx' are skipped. Returns ``None``
    when no table was loaded.
    """
    frames = [ReadCSV(path) for path in paths if 'csv' in path or 'xlsx' in path]
    if frames:
        return pd.concat(frames, ignore_index=True)
    return None


class COVIDPre(object):
    """Callable that runs the whole preprocessing pipeline.

    Args:
        base_paths: Three output folders, in order: images, masks, lung masks.
        df_paths: Paths of the input info tables; the first one is also the
            file ``_ShutilData`` writes the generated table to.
        respac_df_path: CSV that receives the paths of the resampled files.
        area_th: Connected components with an area not larger than this are
            dropped by ``_ExtractConnectedComponents``.
        bbox_df_path: CSV that receives the bounding-box table.
        uid_paths: Three text files (train, val, test) holding one uid per line.
        health_uid_path: Text file with the uids passed to
            ``_GenerateInfoWithLungMask`` with keyword 'health'.
        pos_uid_path: Text file with the uids passed to
            ``_GenerateInfoWithLungMask`` with keyword 'pos'.
        health_save_path: Folder for the 'health' CSV.
        pos_save_path: Folder for the 'pos' CSV.
    """

    def __init__(self, base_paths, df_paths, respac_df_path, area_th, bbox_df_path, uid_paths,
                 health_uid_path, pos_uid_path, health_save_path=None, pos_save_path=None):
        self.img_base_path = base_paths[0]
        self.mask_base_path = base_paths[1]
        self.lung_mask_base_path = base_paths[2]
        self.target_spac = [1.0 for _ in range(3)]
        self.valtest_ratio = 0
        self.test_ratio = 0.5
        self.df_path = df_paths[0]
        try:
            self.df = LoadDFs(df_paths)
        except Exception:
            # Best effort: the table is regenerated by _ShutilData, so a
            # missing or unreadable input is not fatal here.
            logger.warning('Could not load info tables %s', df_paths, exc_info=True)
            self.df = None
        self.respac_df_path = respac_df_path
        self.area_th = area_th
        self.bbox_df_path = bbox_df_path
        self.train_uid_path, self.val_uid_path, self.test_uid_path = uid_paths
        #####################
        self.health_uid_path = health_uid_path
        self.pos_uid_path = pos_uid_path
        self.health_save_path = health_save_path
        self.pos_save_path = pos_save_path
        self.info_list = []
        self.connected_component_choice = False

    def __call__(self):
        """Run all preprocessing steps in order."""
        # 1. Move the newly processed data to another location.
        logger.info('step 1: shutildata')
        self._ShutilData()

        # 2. Resample the data listed in self.df.
        logger.info('step 2: resample image')
        self._ResampleBatch()

        # 3. Store the information related to the respaced data.
        logger.info('step 3: Record respacing related info')
        self._GatherInfo()

        # 4. Run connected-component analysis on the respaced data (this can
        #    be skipped when there are no annotated masks). When skipped,
        #    generate reasonable info from the uid lists instead.
        logger.info('step 4: Get bbox info')
        if self.connected_component_choice:
            self._ExtractConnectedComponents()
        else:
            health_uids = self._LoadUid(self.health_uid_path)
            pos_uids = self._LoadUid(self.pos_uid_path)
            self._GenerateInfoWithLungMask(uids=health_uids, base_save_path=self.health_save_path,
                                           kw='health', area=10000000000000)
            self._GenerateInfoWithLungMask(uids=pos_uids, base_save_path=self.pos_save_path,
                                           kw='pos', area=float('inf'))

        # 5. Store the respaced data in npy format.
        logger.info('step 5: Convert To NPY')
        self._ConvertToNPY()

    def _GatherInfo(self):
        """Save the collected ``info_list`` rows to ``respac_df_path``."""
        columns = ['uid', 'image_path', 'mask_path', 'lung_mask_path']
        # reshape keeps the array 2-D even when nothing was collected, so an
        # empty run yields an empty table instead of a shape error.
        self.info_list = np.array(self.info_list).reshape(-1, len(columns))
        output_df = pd.DataFrame(self.info_list, columns=columns)
        output_df.to_csv(self.respac_df_path)
        self.output_df = output_df

    def _ResampleBatch(self):
        """Resample every row of ``self.df``; a failing row is logged and skipped."""
        for idx in tqdm(np.arange(len(self.df))):
            try:
                self._ResampleImage(idx, self.target_spac)
            except Exception:
                # One broken volume must not abort the whole batch.
                logger.exception('Resampling failed for row %d', idx)

    def _ResampleImage(self, idx, target_spacing):
        """Resample image, mask and lung mask of row ``idx`` and record the outputs.

        Rows whose source files are missing are skipped. Rows whose three
        target files already exist are recorded without being recomputed.
        """
        df = self.df
        uid = df.uid[idx]
        image_path = df.image_path[idx]
        mask_path = df.mask_path[idx]
        lung_mask_path = df.lung_mask_path[idx]

        missing = [p for p in (image_path, mask_path, lung_mask_path) if not os.path.exists(p)]
        if missing:
            logger.warning('Skip %s: missing source file(s) %s', uid, missing)
            return

        target_img_save_path = '%s/%s.nii.gz' % (self.img_base_path, uid)
        target_mask_save_path = '%s/%s.nii.gz' % (self.mask_base_path, uid)
        target_lung_mask_save_path = '%s/%s.nii.gz' % (self.lung_mask_base_path, uid)
        record = [uid, target_img_save_path, target_mask_save_path, target_lung_mask_save_path]

        if all(os.path.exists(p) for p in record[1:]):
            self.info_list.append(record)
            return

        # Only the SimpleITK images are needed; the numpy arrays are not used.
        image_itk = sitk.ReadImage(image_path)
        mask_itk = sitk.ReadImage(mask_path)
        lung_mask_itk = sitk.ReadImage(lung_mask_path)

        # LinearResample / NearestResample presumably come from
        # respacing_func; only the first element of LinearResample's result
        # is kept.
        respac_img = LinearResample(image_itk, target_spacing)[0]
        respac_mask = NearestResample(mask_itk, target_spacing)
        respac_lung_mask = NearestResample(lung_mask_itk, target_spacing)

        WriteMask(respac_img, path=target_img_save_path)
        WriteMask(respac_mask, path=target_mask_save_path)
        WriteMask(respac_lung_mask, path=target_lung_mask_save_path)
        self.info_list.append(record)

    def _LoadUid(self, path):
        """Return the stripped, non-empty lines of the text file ``path``."""
        with open(path, 'r') as f:
            stripped = (line.strip() for line in f)
            return [uid for uid in stripped if uid]

    def _WriteUid(self, path, uids):
        """Write ``uids`` to ``path``, one per line."""
        with open(path, 'w') as f:
            f.writelines('%s\n' % uid for uid in uids)

    def _ComponentBBoxes(self, mask_path):
        """Return ``bbox + [area]`` for each component of the mask larger than ``area_th``."""
        mask = sitk.GetArrayFromImage(sitk.ReadImage(mask_path))
        mask = np.array(mask > 0).astype(np.uint8)
        boxes = []
        for region in regionprops(label(mask)):
            if region.area > self.area_th:
                boxes.append([int(val) for val in region.bbox] + [region.area])
        return boxes

    def _SplitUids(self, uids):
        """Return ``(train, val, test)`` uid lists; val/test are ``None`` when not produced.

        Existing uid files are reused when all three are present. Otherwise
        the split is drawn with a fixed random state and written to the uid
        files.
        """
        uid_files = (self.train_uid_path, self.val_uid_path, self.test_uid_path)
        if all(os.path.exists(p) for p in uid_files):
            train_uids, val_uids, test_uids = [self._LoadUid(p) for p in uid_files]
            return train_uids, val_uids, test_uids

        # Without a validation ratio every uid goes to the training split.
        train_uids, val_uids, test_uids = uids, None, None
        if self.valtest_ratio > 0:
            from sklearn.model_selection import train_test_split
            train_uids, val_uids = train_test_split(
                uids, test_size=self.valtest_ratio, random_state=42)
            if self.test_ratio > 0:
                val_uids, test_uids = train_test_split(
                    val_uids, test_size=self.test_ratio, random_state=42)

        self._WriteUid(self.train_uid_path, train_uids)
        if val_uids is not None:
            self._WriteUid(self.val_uid_path, val_uids)
        if test_uids is not None:
            self._WriteUid(self.test_uid_path, test_uids)
        return train_uids, val_uids, test_uids

    def _SaveSplit(self, bbox_df, uids, kw):
        """Save the bbox and info rows belonging to ``uids`` with suffix ``_<kw>.csv``."""
        bbox_path = self.bbox_df_path.replace('.csv', '_%s.csv' % kw)
        respac_path = self.respac_df_path.replace('.csv', '_%s.csv' % kw)
        bbox_part = bbox_df[bbox_df['uid'].isin(uids)]
        output_part = self.output_df[self.output_df['uid'].isin(uids)]
        bbox_part.to_csv(bbox_path)
        output_part.to_csv(respac_path)
        logger.info('Save bbox_%s to %s', kw, bbox_path)
        logger.info('Save output_%s to %s', kw, respac_path)
        return bbox_part, output_part

    def _ExtractConnectedComponents(self):
        """Build the bounding-box table from the masks and save it per split."""
        map_dict = defaultdict(list)
        for mask_path in tqdm(self.output_df['mask_path'].drop_duplicates().values):
            map_dict[mask_path] = self._ComponentBBoxes(mask_path)

        info_columns = ['uid', 'image_path', 'mask_path', 'lung_mask_path']
        records = self.output_df[info_columns].values
        # First-appearance order keeps the seeded split reproducible; a plain
        # set would reorder string uids from run to run.
        uids = self.output_df['uid'].drop_duplicates().tolist()

        outputs = []
        for record in records:
            mask_path = record[2]
            for bbox in map_dict[mask_path]:
                outputs.append(list(record) + list(bbox))

        # NOTE(review): scikit-image documents region.bbox as all minima
        # followed by all maxima, which does not match the min/max
        # interleaving of these labels. Left unchanged because downstream
        # readers may rely on the column positions - confirm.
        output_columns = info_columns + ['z_min', 'z_max', 'y_min', 'y_max', 'x_min', 'x_max', 'area']
        bbox_df = pd.DataFrame(outputs, columns=output_columns)
        bbox_df.to_csv(self.bbox_df_path, index=False)

        train_uids, val_uids, test_uids = self._SplitUids(uids)

        # Split both tables based on the uids and save them.
        self.bbox_train, self.output_train = self._SaveSplit(bbox_df, train_uids, 'train')
        if val_uids is not None:
            self.bbox_val, self.output_val = self._SaveSplit(bbox_df, val_uids, 'val')
        if test_uids is not None:
            self.bbox_test, self.output_test = self._SaveSplit(bbox_df, test_uids, 'test')

    def _GenerateInfoWithLungMask(self, uids, base_save_path, kw, area=10000000000000):
        """Write a bbox table derived from lung masks for ``uids``.

        For every uid whose image and lung mask exist and whose lung mask has
        at least two connected components, one row is emitted for each of the
        first two components. The ``area`` column is filled with the given
        constant. The table is saved as ``<base_save_path>/<kw>_<len(uids)>.csv``.
        """
        columns = ['uid', 'image_path', 'mask_path', 'lung_mask_path',
                   'z_min', 'z_max', 'y_min', 'y_max', 'x_min', 'x_max', 'area']
        records = []
        for uid in tqdm(uids):
            # NOTE(review): these locations are hard-coded rather than taken
            # from self.img_base_path / self.lung_mask_base_path - confirm.
            img_path = '/fileser/CT_COVID/IMAGE/NII_1.0/%s.nii.gz' % uid
            lungmask_path = '/fileser/CT_COVID/LUNGMASK/NII_1.0/%s.nii.gz' % uid
            if not (os.path.exists(img_path) and os.path.exists(lungmask_path)):
                continue
            mask = sitk.GetArrayFromImage(sitk.ReadImage(lungmask_path))
            mask = np.array(mask > 0).astype(np.uint8)
            regions = regionprops(label(mask))
            if len(regions) < 2:
                continue
            # The first two regions in label order are used, not the two largest.
            for region in regions[:2]:
                bbox = [int(val) for val in region.bbox]
                # The lung mask path is stored in both mask columns.
                records.append([uid, img_path, lungmask_path, lungmask_path] + bbox + [area])
        df = pd.DataFrame(records, columns=columns)
        df.to_csv('%s/%s_%d.csv' % (base_save_path, kw, len(uids)))

    def _ConvertToNPY(self):
        """Save every file of the three output folders as ``.npy`` unless it already exists."""
        for base_path in [self.img_base_path, self.mask_base_path, self.lung_mask_base_path]:
            for path in tqdm(sorted(glob.glob('%s/*' % base_path))):
                target_path = path.replace('NII_1.0', '1.0_NPY').replace('.nii.gz', '.npy')
                if os.path.exists(target_path):
                    continue
                img = sitk.GetArrayFromImage(sitk.ReadImage(path))
                print('save path to %s' % target_path)
                np.save(target_path, img)

    def _ShutilData(self):
        """Copy raw images and lung masks into the RAW folders and rebuild ``self.df``.

        Every sub-folder of the ALPHAMASK directory is treated as one uid.
        The resulting table lists all uids (whether or not their files were
        copied) and is written to ``self.df_path``.
        """
        paths = sorted(glob.glob('/fileser/CT_COVID/ALPHAMASK/*'))
        img_path_name = 'm_ptrRawImage.nii.gz'
        lungmask_path_name = 'm_seperate_lung_mask.nii.gz'
        target_img_base_path = '/fileser/CT_COVID/IMAGE/RAW/'
        target_lungmask_base_path = '/fileser/CT_COVID/LUNGMASK/RAW/'

        for path in tqdm(paths):
            lung_path = '%s/%s' % (path, lungmask_path_name)
            img_path = '%s/%s' % (path, img_path_name)
            if not (os.path.exists(lung_path) and os.path.exists(img_path)):
                continue
            uid = lung_path.split('/')[-2]
            lungmask_des_path = '%s/%s.nii.gz' % (target_lungmask_base_path, uid)
            img_des_path = '%s/%s.nii.gz' % (target_img_base_path, uid)
            if not os.path.exists(lungmask_des_path):
                shutil.copy(lung_path, lungmask_des_path)
            if not os.path.exists(img_des_path):
                shutil.copy(img_path, img_des_path)

        uids = [x.split('/')[-1] for x in paths]
        print('length of uids ', len(uids))
        # The lung mask path is stored in both the mask and lung-mask columns.
        records = [
            [
                uid,
                '%s/%s.nii.gz' % (target_img_base_path, uid),
                '%s/%s.nii.gz' % (target_lungmask_base_path, uid),
                '%s/%s.nii.gz' % (target_lungmask_base_path, uid),
            ]
            for uid in uids
        ]
        self.df = pd.DataFrame(records, columns=['uid', 'image_path', 'mask_path', 'lung_mask_path'])
        self.df.to_csv(self.df_path, index=False)


def main():
    """Run the pipeline with the paths used for the '279' label set."""
    base_paths = ['/fileser/CT_COVID/%s/NII_1.0' % kw for kw in ['IMAGE', 'MASK', 'LUNGMASK']]
    df_paths = ['/fileser/zrx/COVID/label/279/279_info.csv']
    respac_df_path = '/fileser/zrx/COVID/label/279/279_info_respac.csv'
    area_th = 50
    bbox_df_path = '/fileser/zrx/COVID/label/279/279_info_respac_bbox.csv'
    uid_base_path = '/fileser/zrx/COVID/label/279/uids'
    uid_paths = ['%s/%s_uid.txt' % (uid_base_path, kw) for kw in ['train', 'val', 'test']]
    health_uid_path = '/fileser/zrx/COVID/label/279/task_4069_health_uids.txt'
    pos_uid_path = '/fileser/zrx/COVID/label/279/task_4069_pos_uids.txt'
    health_save_path = '/fileser/CT_COVID/INFO/ALG_HEADER/287/NII_1.0_FORTRAIN/NEGDATA/'
    pos_save_path = '/fileser/CT_COVID/INFO/ALG_HEADER/287/NII_1.0_FORTRAIN/FAKEPOS/'
    COVID_obj = COVIDPre(base_paths, df_paths, respac_df_path, area_th, bbox_df_path, uid_paths,
                         health_uid_path, pos_uid_path, health_save_path, pos_save_path)
    COVID_obj()


if __name__ == '__main__':
    main()