#!/bin/bash
#
# Launch TorchNoduleSeg/test.py for one distributed rank.
#
# The script changes into its own directory (so the relative path to
# TorchNoduleSeg/test.py resolves regardless of the caller's cwd), prints
# the settings it is about to use, and then runs the Python entry point.
#
# Required environment variables (the script aborts if any is unset/empty):
#   WORLD_SIZE     - forwarded as --world_size
#   MASTER_ADDR    - forwarded as --master_ip
#   MASTER_PORT    - forwarded as --port
#   RANK           - forwarded as --rank
#   JOB_DATA_ROOT  - forwarded as --job_data_root
# Optional environment variables:
#   TEST           - only echoed in the settings summary
#
# Exit status: that of the python process, or non-zero if the working
# directory cannot be entered or a required variable is missing.

# Resolve the directory containing this script, following symlinks.
SHELL_FOLDER=$(dirname "$(readlink -f "$0")")
printf '%s\n' "cd to ${SHELL_FOLDER}"
# Abort instead of running python from an unexpected directory.
cd "${SHELL_FOLDER}" || {
  printf '%s\n' "error: cannot cd to ${SHELL_FOLDER}" >&2
  exit 1
}

######################################################
# NCCL tuning knobs; uncomment as needed for debugging.
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
# export NCCL_SOCKET_IFNAME=eth0
# Per the NCCL documentation this disables the InfiniBand transport.
export NCCL_IB_DISABLE=1
# export OMP_NUM_THREADS=1
######################################################

export LOCAL_RANK=0
# Example values for a manual run; normally supplied by the environment.
# export WORLD_SIZE=2
# export MASTER_ADDR='1.0.0.0'
# export MASTER_PORT='12345'
# export RANK=0

# Fail fast with a clear message rather than passing empty arguments,
# which would make each flag swallow the following flag as its value.
: "${WORLD_SIZE:?WORLD_SIZE must be set}"
: "${MASTER_ADDR:?MASTER_ADDR must be set}"
: "${MASTER_PORT:?MASTER_PORT must be set}"
: "${RANK:?RANK must be set}"
: "${JOB_DATA_ROOT:?JOB_DATA_ROOT must be set}"

master_addr=${MASTER_ADDR}
port=${MASTER_PORT}

# NOTE(review): the five variables below are neither exported nor used by
# the command at the end of this script; presumably left over from an
# elastic/torchrun style launch - confirm before removing.
MIN_SIZE=${WORLD_SIZE}
MAX_SIZE=${WORLD_SIZE}
TRAINERS_PER_NODE=1
JOB_ID=ddppod
HOST_NODE_ADDR="${master_addr}:${port}"

printf '%s\n' "current settings"
printf '%s\n' "WORLD_SIZE: ${WORLD_SIZE}"
printf '%s\n' "MASTER_ADDR: ${MASTER_ADDR}"
printf '%s\n' "MASTER_PORT: ${MASTER_PORT}"
printf '%s\n' "RANK: ${RANK}"
printf '%s\n' "LOCAL_RANK: ${LOCAL_RANK}"
printf '%s\n' "TEST: ${TEST:-}"
printf '%s\n' "job_data_root: ${JOB_DATA_ROOT}"

printf '%s\n' "running python"
# Quoted so that values containing spaces or glob characters reach
# test.py as single, unmodified arguments.
python TorchNoduleSeg/test.py \
  --master_ip "${master_addr}" \
  --port "${port}" \
  --world_size "${WORLD_SIZE}" \
  --rank "${RANK}" \
  --job_data_root "${JOB_DATA_ROOT}"