#!/bin/bash
#
# Launches NoduleDensityClassifier/run_train.py from this script's directory,
# forwarding the master address/port, world size, rank and job data root
# taken from the environment.
#
# Environment read:
#   MASTER_ADDR, MASTER_PORT  forwarded as --master_ip / --port
#   WORLD_SIZE, RANK          forwarded as --world_size / --rank
#   JOB_DATA_ROOT             forwarded as --job_data_root
#   TEST                      only echoed by this script
# Environment exported:
#   NCCL_IB_DISABLE=1, LOCAL_RANK=0
# Exit status:
#   1 if the script directory cannot be entered or the GPU check status is
#   non-zero; otherwise the exit status of run_train.py.

# Resolve the directory containing this script (following symlinks) and run
# from there, because the python entry points below are relative paths.
SHELL_FOLDER=$(dirname "$(readlink -f "$0")")
echo "cd to ${SHELL_FOLDER}"
cd "${SHELL_FOLDER}" || {
  echo "cannot cd to ${SHELL_FOLDER}" >&2
  exit 1
}

######################################################
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
# export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=1
# export OMP_NUM_THREADS=1
######################################################

export LOCAL_RANK=0

# NOTE(review): the five variables below are neither exported nor referenced
# anywhere else in this script; presumably left over from an earlier launcher
# configuration. Confirm before removing.
MIN_SIZE=${WORLD_SIZE}
MAX_SIZE=${WORLD_SIZE}
TRAINERS_PER_NODE=1
JOB_ID=ddppod
master_addr=${MASTER_ADDR}
port=${MASTER_PORT}
HOST_NODE_ADDR="${master_addr}:${port}"

echo "current settings"
echo "WORLD_SIZE: ${WORLD_SIZE}"
echo "MASTER_ADDR: ${MASTER_ADDR}"
echo "MASTER_PORT: ${MASTER_PORT}"
echo "RANK: ${RANK}"
echo "LOCAL_RANK: ${LOCAL_RANK}"
echo "TEST: ${TEST}"

echo "running python"
# python local_run_train_lung.py \
#   --master_ip ${master_addr} \
#   --port $port \
#   --config 'webui' \
#   --test $TEST \
#   --model_type "mobilenet" \
#   --use_webui \
#   --use_ddp \
#   --job_data_root $JOB_DATA_ROOT

# GPU pre-flight check. The status is held in an explicit variable instead of
# being read from $?, so that commands placed between the check and the test
# cannot silently overwrite it.
# NOTE(review): the run_check.py invocation is currently commented out, so the
# status stays 0 and training always starts. Confirm whether the check should
# be re-enabled; if so, uncomment the line below.
gpu_check_status=0
# python NoduleDensityClassifier/run_check.py || gpu_check_status=$?

if [[ "${gpu_check_status}" -eq 0 ]]; then
  echo "check gpu is ok !"
  # Last command on this path: its exit status becomes the script's.
  python NoduleDensityClassifier/run_train.py \
    --master_ip "${master_addr}" \
    --port "${port}" \
    --world_size "${WORLD_SIZE}" \
    --rank "${RANK}" \
    --job_data_root "${JOB_DATA_ROOT}"
else
  echo "No gpus"
  exit 1
fi

# python NoduleDensityClassifier/run_train.py \
#   --master_ip ${master_addr} \
#   --port $port \
#   --world_size $WORLD_SIZE \
#   --rank $RANK \
#   --job_data_root $JOB_DATA_ROOT