herrius
/

unit_test

Model card Files Files and versions

unit_test / slurm_run.sh

herrius's picture

Upload 259 files

32b542e about 3 years ago

history blame contribute delete

2.72 kB

	#!/bin/bash

	a=$(echo $HOSTNAME \| cut -c12-16)

	CONFIG=$1
	JOB_NAME=${2:-"experiments"}
	GPUS=${3:-8}

	SRUN=${4:-'reserved'}

	GPUS_PER_NODE=${GPUS:-8}
	if [ $GPUS_PER_NODE -ge 8 ]; then
	GPUS_PER_NODE=8
	fi
	CPUS_PER_TASK=${CPUS_PER_TASK:-4}
	SRUN_ARGS=${SRUN_ARGS:-""}

	PY_ARGS=${@:5}

	WORK_DIR=${CONFIG//configs/work_dirs}
	WORK_DIR=${WORK_DIR//.yaml//$JOB_NAME}
	echo $WORK_DIR
	mkdir -p $WORK_DIR
	mkdir -p data/temp

	now=$(date +"%Y%m%d_%H%M%S")

	a=$(echo $HOSTNAME \| cut -c12-16)


	if [ $a == '140-0' ]; then
	export DATA_PATH='/mnt/lustre/share_data/zhujinguo'
	export LD_LIBRARY_PATH=/mnt/cache/zhujinguo/anaconda3/envs/py36/lib:$LD_LIBRARY_PATH
	export TORCH_EXTENSIONS_DIR='/mnt/lustre/zhujinguo/.cache/torch_extensions'
	export NO_NVRTC=0
	partition='INTERN'
	CEPH_CONFIG='slurm_tools/petreloss_1400.config'
	SRUNreal=${SRUN}

	if [ ${SRUN} == 'vcspot' ]; then
	SRUNreal='spot --async'
	partition=VC
	elif [ ${SRUN} == 'vcauto' ]; then
	SRUNreal='auto --async'
	partition=VC
	elif [ ${SRUN} == 'vcreserved' ]; then
	SRUNreal='reserved'
	partition=VC
	elif [ ${SRUN} == 'spot' ]; then
	SRUNreal='spot --async'
	elif [ ${SRUN} == 'auto' ]; then
	SRUNreal='auto --async'

	fi

	elif [ $a == '142-4' ]; then
	# 1424
	export DATA_PATH='/mnt/lustre/share_data/zhujinguo'
	export LD_LIBRARY_PATH=/mnt/cache/zhujinguo/anaconda3/envs/py36/lib:$LD_LIBRARY_PATH
	export TORCH_EXTENSIONS_DIR='/mnt/lustre/zhujinguo/.cache/torch_extensions'
	export NO_NVRTC=0
	partition='vc_research_5'
	CEPH_CONFIG='slurm_tools/petreloss_1424.config'

	SRUNreal=${SRUN}

	if [ ${SRUN} == 'vc4spot' ]; then
	SRUNreal='spot --async'
	partition=vc_research_4
	elif [ ${SRUN} == 'vc4auto' ]; then
	SRUNreal='auto --async -x SH-IDC1-10-142-4-76'
	partition=vc_research_4
	elif [ ${SRUN} == 'vc4reserved' ]; then
	SRUNreal='reserved'
	partition=vc_research_4
	elif [ ${SRUN} == 'spot' ]; then
	SRUNreal='spot --async'
	elif [ ${SRUN} == 'auto' ]; then
	SRUNreal='auto --async'
	fi

	else
	echo only SH1424 and SH1400 supported now

	fi

	srun --partition=${partition} $SRUN_ARGS --quotatype=${SRUNreal} -o $WORK_DIR/phoenix-slurm-%j-$now.out \
	--job-name=${JOB_NAME} -n$GPUS --gres=gpu:${GPUS_PER_NODE} \
	--ntasks-per-node=${GPUS_PER_NODE} \
	--kill-on-bad-exit=1 --cpus-per-task 12 \
	python -u main.py --num-gpus $GPUS \
	--config-file ${CONFIG} --init_method slurm --resume \
	${PY_ARGS} OUTPUT_DIR $WORK_DIR DATALOADER.USE_CEPH True \
	DATALOADER.TCS_CONF_PATH $CEPH_CONFIG SOLVER.CHECKPOINT_PERIOD 10000 SOLVER.CHECKPOINT_MAX_SAVE 1 \
	${OTHERARGS} 2>&1

	# SOLVER.ACCUM_ITER 2 SOLVER.CHECKPOINT_PERIOD 1000 SOLVER.CHECKPOINT_MAX_SAVE 1 MODEL.BERT.DROP_PATH_PROB 0.1