Files
SAPFOR/dvm/tools/tester/trunk/main/task-processor.sh
2024-05-02 17:08:55 +03:00

367 lines
12 KiB
Bash

#!/bin/bash
# Bash is required due to usage of 'disown' command
# Directory the tester was started from; same-named config files found here
# override the defaults that live next to this script.
SAVE_DIR=$(pwd)
# Directory containing this script together with its config and helper files.
MY_DIR=$(cd "$(dirname "$(which "$0")")" && pwd)
# First command-line argument: root directory of the *.result files.
RESULTS_DIR="$1"

# Load each configuration file from the script directory first, then let an
# optional copy in the start directory override it.
. "$MY_DIR/machine-config.sh"
if [ -f "$SAVE_DIR/machine-config.sh" ]; then
  . "$SAVE_DIR/machine-config.sh"
fi
. "$MY_DIR/configure-run.sh"
if [ -f "$SAVE_DIR/configure-run.sh" ]; then
  . "$SAVE_DIR/configure-run.sh"
fi
. "$MY_DIR/test-utils.sh"

# NOTE(review): INTERACTIVE and HAS_RES_MANAGER are not set in this file;
# presumably they come from the config files sourced above - confirm.
# In interactive mode this script redirects task output itself, so the output
# file names are simply "<launch name>.stdout" / "<launch name>.stderr".
if [ "$INTERACTIVE" -ne 0 ]; then
  stdout_fn() {
    echo "$1.stdout"
  }
  stderr_fn() {
    echo "$1.stderr"
  }
fi

# Without an external resource manager, finished tasks report the resources
# they release by dropping small files into this private directory.
if [ "$HAS_RES_MANAGER" -eq 0 ]; then
  # Stop early if the directory cannot be created: an empty RES_MAN_DIR would
  # make later "mv ... $RES_MAN_DIR/" and "cd $RES_MAN_DIR" hit wrong places.
  RES_MAN_DIR=$(mktemp -d) || {
    echo "Cannot create a temporary directory for resource bookkeeping" >&2
    exit 1
  }
fi
#######################################
# Publishes the amount of CPUs/CUDA devices released by a finished task as a
# small file of shell assignments placed into "$RES_MAN_DIR".
# Globals read:    SHARE_RESOURCES, CPUS_PER_NODE, CUDAS_PER_NODE,
#                  MAX_CPU_SHARING_FACTOR, MAX_CUDA_SHARING_FACTOR,
#                  totalProcs, CPUS_PER_PROC, CUDAS_PER_PROC, RES_MAN_DIR
# Globals written: FREED_CPUS, FREED_CUDAS
# Returns: non-zero if the temporary file cannot be created or moved.
#######################################
resources_freed() {
  local report_file
  report_file=$(mktemp) || {
    echo "resources_freed: cannot create a temporary file" >&2
    return 1
  }
  if [ "$SHARE_RESOURCES" -eq 0 ]; then
    # Exclusive mode: a task owns the whole node, so all of it is released.
    FREED_CPUS=$(( CPUS_PER_NODE * MAX_CPU_SHARING_FACTOR ))
    FREED_CUDAS=$(( CUDAS_PER_NODE * MAX_CUDA_SHARING_FACTOR ))
  else
    # Shared mode: only what the task's processes were using is released.
    FREED_CPUS=$(( totalProcs * CPUS_PER_PROC ))
    FREED_CUDAS=$(( totalProcs * CUDAS_PER_PROC ))
  fi
  # The file is filled in completely before it is moved into the directory,
  # so whoever scans that directory does not pick up a report in progress.
  printf 'FREED_CPUS=%s\nFREED_CUDAS=%s\n' "$FREED_CPUS" "$FREED_CUDAS" \
    >"$report_file"
  mv -- "$report_file" "$RES_MAN_DIR/"
}
#######################################
# Runs one task directly on this machine, waits for it and records the
# outcome in "$TASK_EXE.finished" (inside LAUNCH_DIR) as
# "<exit status> <elapsed seconds>".
# Changes the current directory and enables job control, so it is meant to
# be run in its own (background) shell.
# Globals read:    LAUNCH_DIR, LAUNCH_NAME, LAUNCH_PPN, PROC_GRID, TASK_EXE,
#                  CPUS_PER_PROC, CUDAS_PER_PROC, TEST_MAX_TIME,
#                  HAS_RES_MANAGER
# Globals written: STDOUT_FN, STDERR_FN, START_T, END_T, LAUNCH_PID,
#                  KILLER_PID, START_RES, CALC_TIME
# Calls:           stdout_fn, stderr_fn, proc_killer, resources_freed
#######################################
interactive_launcher() {
  cd "$LAUNCH_DIR"
  STDOUT_FN=$(stdout_fn "$LAUNCH_NAME")
  STDERR_FN=$(stderr_fn "$LAUNCH_NAME")
  # Quoted so that a launch path containing whitespace is not word-split
  # into an "ambiguous redirect".
  : >"$STDOUT_FN"
  : >"$STDERR_FN"
  # Job control is switched on before the task is started in the background;
  # proc_killer below is handed the negated PID of that job.
  set -m
  START_T=$(date +%s)
  if [ -f "run.sh" ]; then
    # A task-provided run.sh takes precedence over the default "dvm run".
    PATH="$LAUNCH_DIR:$PATH" PROC_GRID="$PROC_GRID" DVMH_PPN=$LAUNCH_PPN DVMH_NUM_THREADS=$CPUS_PER_PROC DVMH_NUM_CUDAS=$CUDAS_PER_PROC ./run.sh </dev/null >"$STDOUT_FN" 2>"$STDERR_FN" &
  else
    # PROC_GRID is deliberately unquoted: each grid dimension is a separate
    # argument of "dvm run".
    DVMH_PPN=$LAUNCH_PPN DVMH_NUM_THREADS=$CPUS_PER_PROC DVMH_NUM_CUDAS=$CUDAS_PER_PROC ./dvm run $PROC_GRID "$TASK_EXE" </dev/null >"$STDOUT_FN" 2>"$STDERR_FN" &
  fi
  LAUNCH_PID=$!
  if [ "$TEST_MAX_TIME" -gt 0 ]; then
    # Watchdog for the time limit; disowned so the shell does not track it.
    proc_killer "-$LAUNCH_PID" "$TEST_MAX_TIME" </dev/null >/dev/null 2>&1 &
    KILLER_PID=$!
    disown
  fi
  wait "$LAUNCH_PID"
  START_RES=$?
  END_T=$(date +%s)
  CALC_TIME=$(( END_T - START_T ))
  if [ "$TEST_MAX_TIME" -gt 0 ]; then
    # The task is over: stop the watchdog (best effort, errors ignored).
    kill -2 "$KILLER_PID" >/dev/null 2>&1
    kill -15 "$KILLER_PID" >/dev/null 2>&1
    kill -9 "$KILLER_PID" >/dev/null 2>&1
  fi
  if [ "$HAS_RES_MANAGER" -eq 0 ]; then
    resources_freed
  fi
  echo "$START_RES $CALC_TIME" >"$TASK_EXE.finished"
}
#######################################
# Starts one task through the non-interactive launch path, waits until the
# helpers report it finished, and records the outcome in
# "$TASK_EXE.finished" (inside LAUNCH_DIR) as
# "<launch exit status> <elapsed time>".
# "$TASK_EXE.committed" is created as soon as the launch command returns.
# Changes the current directory, so it is meant to be run in its own
# (background) shell.
# Globals read:    LAUNCH_DIR, LAUNCH_NAME, LAUNCH_PPN, PROC_GRID, TASK_EXE,
#                  CPUS_PER_PROC, CUDAS_PER_PROC, TEST_MAX_TIME,
#                  HAS_RES_MANAGER
# Globals written: STDOUT_FN, STDERR_FN, START_RES, IS_LAUNCHED, CALC_TIME
# Calls:           is_launched, is_finished, get_elapsed_time,
#                  resources_freed
#######################################
non_interactive_launcher() {
  cd "$LAUNCH_DIR"
  # Scratch files holding only the output of the launch command itself.
  STDOUT_FN=$(mktemp)
  STDERR_FN=$(mktemp)
  if [ "$TEST_MAX_TIME" -gt 0 ]; then
    # The limit is exported as "maxtime" in whole minutes, rounded up.
    export maxtime=$(( (TEST_MAX_TIME + 59) / 60 ))
  fi
  # Redirect targets are quoted: with a TMPDIR containing whitespace the
  # unquoted form is an "ambiguous redirect" and nothing gets launched.
  if [ -f "run.sh" ]; then
    # A task-provided run.sh takes precedence over the default "dvm run".
    PATH="$LAUNCH_DIR:$PATH" PROC_GRID="$PROC_GRID" DVMH_PPN=$LAUNCH_PPN DVMH_NUM_THREADS=$CPUS_PER_PROC DVMH_NUM_CUDAS=$CUDAS_PER_PROC ./run.sh >"$STDOUT_FN" 2>"$STDERR_FN"
    START_RES=$?
  else
    # PROC_GRID is deliberately unquoted: each grid dimension is a separate
    # argument of "dvm run".
    DVMH_PPN=$LAUNCH_PPN DVMH_NUM_THREADS=$CPUS_PER_PROC DVMH_NUM_CUDAS=$CUDAS_PER_PROC ./dvm run $PROC_GRID "$TASK_EXE" >"$STDOUT_FN" 2>"$STDERR_FN"
    START_RES=$?
  fi
  unset maxtime
  # Marker file telling the caller that the launch command has returned.
  : >"$TASK_EXE.committed"
  IS_LAUNCHED=$(is_launched "$STDOUT_FN" "$STDERR_FN")
  rm -- "$STDOUT_FN" "$STDERR_FN"
  if [ "$START_RES" -eq 0 ] && [ "$IS_LAUNCHED" -ne 0 ]; then
    # Poll once per second until is_finished reports a non-zero value.
    while [ "$(is_finished "$LAUNCH_NAME")" -eq 0 ]; do
      sleep 1
    done
    CALC_TIME=$(get_elapsed_time "$LAUNCH_NAME")
  fi
  if [ "$HAS_RES_MANAGER" -eq 0 ]; then
    resources_freed
  fi
  echo "$START_RES $CALC_TIME" >"$TASK_EXE.finished"
}
#######################################
# Tells whether "$RESULTS_DIR/$TEST_SHORT_PATH.result" already holds a line
# for the current task configuration.
# A line matches when it carries all of these fields with exactly the
# current values:
#   PLATFORM="$TEST_PLATFORM" NOH_FLAG=$TASK_NOH_FLAG
#   AUTOTFM_FLAG=$TASK_AUTOTFM_FLAG PROC_GRID="$PROC_GRID"
#   CPUS_PER_PROC=$CPUS_PER_PROC CUDAS_PER_PROC=$CUDAS_PER_PROC
# Globals read: RESULTS_DIR, TEST_SHORT_PATH and the variables listed above.
# Outputs: "1" if such a line exists, "0" otherwise (also when the result
#          file is missing).
#######################################
already_analyzed() {
  local res=0
  local result_file="$RESULTS_DIR/$TEST_SHORT_PATH.result"
  if [ -f "$result_file" ]; then
    # Quoted values are matched as fixed strings (the closing quote already
    # delimits them). Bare values must be followed by a space or the end of
    # the line, otherwise CPUS_PER_PROC=1 would also match CPUS_PER_PROC=12.
    if grep -F -- "PLATFORM=\"$TEST_PLATFORM\"" "$result_file" |
      grep -F -- "PROC_GRID=\"$PROC_GRID\"" |
      grep -E -- "(^| )NOH_FLAG=${TASK_NOH_FLAG}( |\$)" |
      grep -E -- "(^| )AUTOTFM_FLAG=${TASK_AUTOTFM_FLAG}( |\$)" |
      grep -E -- "(^| )CPUS_PER_PROC=${CPUS_PER_PROC}( |\$)" |
      grep -E -- "(^| )CUDAS_PER_PROC=${CUDAS_PER_PROC}( |\$)" >/dev/null; then
      res=1
    fi
  fi
  echo "$res"
}
# Reads task specifications from standard input (one per line), starts every
# runnable task in the background and forwards each accepted specification to
# the file named by $1 (opened for writing on descriptor 4). A single ":" line
# is written after the last specification.
# Arguments: $1 - path to write accepted task specifications to.
# Globals read: HAS_RES_MANAGER, MAX_NODES_PER_TASK, CPUS_PER_NODE,
#   CUDAS_PER_NODE, MAX_CPU_SHARING_FACTOR, MAX_CUDA_SHARING_FACTOR,
#   SHARE_RESOURCES, MAX_PPN, INTERACTIVE, SAVE_DIR, MY_DIR, RES_MAN_DIR,
#   plus whatever each task specification assigns.
# NOTE(review): a specification is a line of shell assignments; judging by
# the variables used below it is expected to set TASK_TYPE, TASK_DIR, TASK_EXE
# and TEST_SHORT_PATH among others - confirm against the producer of the list.
# Side effects: may lower MAX_NODES_PER_TASK to 1, changes the current
# directory to RES_MAN_DIR when there is no resource manager, creates one
# temporary launch directory per launched task.
launcher() {
# Number of tasks actually launched by this function.
counter=0
if [ $HAS_RES_MANAGER -eq 0 ]; then
# Built-in bookkeeping tracks a single pool of CPUs/CUDAs, i.e. one node.
if [ $MAX_NODES_PER_TASK -gt 1 ]; then
echo "Can manage resources only for one-node system"
MAX_NODES_PER_TASK=1
fi
# Initially the whole node (times the sharing factors) is free.
FREE_CPUS=$(( CPUS_PER_NODE * MAX_CPU_SHARING_FACTOR ))
FREE_CUDAS=$(( CUDAS_PER_NODE * MAX_CUDA_SHARING_FACTOR ))
fi
# Descriptor 4 carries the accepted specifications to whoever reads $1.
exec 4>$1
while IFS= read -r TASK_SPEC; do
# Defaults for fields a specification may leave out.
TEST_PLATFORM=Unknown
TASK_NOH_FLAG=0
TASK_AUTOTFM_FLAG=0
PROC_GRID=0
CPUS_PER_PROC=0
CUDAS_PER_PROC=0
# The specification is executed as shell code in the current shell.
# NOTE(review): the expansion is unquoted, so it is word-split (and glob-
# expanded) before eval re-joins it with single spaces.
eval $TASK_SPEC
# Set to 1 when the specification must be forwarded on descriptor 4.
LAUNCHED_FLAG=0
ALREADY_ANALYZED=$( already_analyzed )
# TASK_TYPE 1: a task to run, unless a matching result line already exists.
if [ $TASK_TYPE -eq 1 -a $ALREADY_ANALYZED -eq 0 ]; then
# Per-node capacity available to this task.
CAN_CPUS=$CPUS_PER_NODE
CAN_CUDAS=$CUDAS_PER_NODE
if [ $SHARE_RESOURCES -ne 0 ]; then
CAN_CPUS=$(( CAN_CPUS * MAX_CPU_SHARING_FACTOR ))
CAN_CUDAS=$(( CAN_CUDAS * MAX_CUDA_SHARING_FACTOR ))
fi
# Processes per node: the smallest of MAX_PPN, the CPU-imposed limit and the
# CUDA-imposed limit. A per-process requirement of 0 imposes no limit (CUR_PPN
# then keeps its previous value, which makes the comparison a no-op).
LAUNCH_PPN=$MAX_PPN
CUR_PPN=$LAUNCH_PPN
if [ $CPUS_PER_PROC -gt 0 ]; then
CUR_PPN=$(( CAN_CPUS / $CPUS_PER_PROC ))
fi
if [ $CUR_PPN -lt $LAUNCH_PPN ]; then
LAUNCH_PPN=$CUR_PPN
fi
if [ $CUDAS_PER_PROC -gt 0 ]; then
CUR_PPN=$(( CAN_CUDAS / $CUDAS_PER_PROC ))
fi
if [ $CUR_PPN -lt $LAUNCH_PPN ]; then
LAUNCH_PPN=$CUR_PPN
fi
# Total process count is the product of the grid dimensions.
totalProcs=1
for proc in $PROC_GRID; do
totalProcs=$(( totalProcs * proc ))
done
if [ $LAUNCH_PPN -gt 0 ]; then
# Nodes needed: totalProcs / LAUNCH_PPN, rounded up.
USE_NODES=$(( ( totalProcs + LAUNCH_PPN - 1 ) / LAUNCH_PPN ))
else
# Not even one process fits on a node: force the "too big" branch below.
LAUNCH_PPN=1
USE_NODES=$(( MAX_NODES_PER_TASK + 1 ))
fi
NEED_CPUS=$(( totalProcs * CPUS_PER_PROC ))
NEED_CUDAS=$(( totalProcs * CUDAS_PER_PROC ))
if [ $USE_NODES -le $MAX_NODES_PER_TASK ]; then
# The task fits: run it from a private copy of its directory.
counter=$(( counter + 1 ))
LAUNCH_DIR=`mktemp -d`
cp -r $TASK_DIR/* $LAUNCH_DIR/
# Append the launch location to the specification that gets forwarded.
TASK_SPEC=$( echo -n "$TASK_SPEC" ; echo " LAUNCH_DIR=\"$LAUNCH_DIR\"" )
if [ $HAS_RES_MANAGER -eq 0 ]; then
LAUNCH_NAME="$LAUNCH_DIR/$TASK_EXE"
else
# NOTE(review): the ".<totalProcs>.1" suffix presumably mirrors the name the
# resource manager gives to the job's files - confirm.
LAUNCH_NAME="$LAUNCH_DIR/$TASK_EXE.$totalProcs.1"
fi
TASK_SPEC=$( echo -n "$TASK_SPEC" ; echo " LAUNCH_NAME=\"$LAUNCH_NAME\"" )
# Pause support: while a dvm-tester.pause file containing "Immediate" exists
# in the start directory or in the script directory, re-check every minute.
while true; do
if [ -f "$SAVE_DIR/dvm-tester.pause" ] && [ "$(cat "$SAVE_DIR/dvm-tester.pause")" = "Immediate" ]; then
:
elif [ -f "$MY_DIR/dvm-tester.pause" ] && [ "$(cat "$MY_DIR/dvm-tester.pause")" = "Immediate" ]; then
:
else
break
fi
sleep 60
done
if [ $HAS_RES_MANAGER -ne 0 ]; then
# With a resource manager, poll can_launch until it prints a non-zero value.
while [ `can_launch` -eq 0 ]; do
sleep 1
done
else
if [ $SHARE_RESOURCES -eq 0 ]; then
# Exclusive mode: the task reserves the whole node.
NEED_CPUS=$(( CPUS_PER_NODE * MAX_CPU_SHARING_FACTOR ))
NEED_CUDAS=$(( CUDAS_PER_NODE * MAX_CUDA_SHARING_FACTOR ))
fi
# Note: the current directory stays RES_MAN_DIR from here on.
cd "$RES_MAN_DIR"
# Wait until enough resources are free, collecting the reports that finished
# tasks leave in RES_MAN_DIR (see resources_freed).
while [ $FREE_CPUS -lt $NEED_CPUS -o $FREE_CUDAS -lt $NEED_CUDAS ]; do
FOUND_SMTH=0
for f in `ls`; do
FREED_CPUS=
FREED_CUDAS=
. ./$f
# A report is consumed (and deleted) only if it set both values.
if [ -n "$FREED_CPUS" -a -n "$FREED_CUDAS" ]; then
FOUND_SMTH=1
FREE_CPUS=$(( FREE_CPUS + FREED_CPUS ))
FREE_CUDAS=$(( FREE_CUDAS + FREED_CUDAS ))
rm $f
fi
done
# Nothing new was reported during this pass: back off before rescanning.
if [ $FOUND_SMTH -eq 0 ]; then
sleep 1
fi
done
# Reserve the resources for the task about to start.
FREE_CPUS=$(( FREE_CPUS - NEED_CPUS ))
FREE_CUDAS=$(( FREE_CUDAS - NEED_CUDAS ))
fi
# Start the task in a background subshell.
if [ $INTERACTIVE -ne 0 ]; then
interactive_launcher &
else
non_interactive_launcher &
if [ $HAS_RES_MANAGER -ne 0 ]; then
# Do not move on to the next task before the launch command of this one has
# returned (non_interactive_launcher creates the .committed marker then).
while [ ! -f "$LAUNCH_DIR/$TASK_EXE.committed" ]; do
sleep 1
done
fi
fi
LAUNCHED_FLAG=1
else
# The task needs more nodes than MAX_NODES_PER_TASK allows.
echo "Discarding too big task: $TASK_SPEC"
fi
elif [ $TASK_TYPE -eq 0 ]; then
# TASK_TYPE 0 is not launched here; it is only forwarded. The analyzer uses
# it to report a missing executable and to remove TASK_DIR.
LAUNCHED_FLAG=1
else
# Either an unknown TASK_TYPE or a task that already has a result line.
echo "Discarding task: $TASK_SPEC"
fi
if [ $LAUNCHED_FLAG -ne 0 ]; then
echo "$TASK_SPEC" >& 4
fi
done
# End-of-stream marker for the reader, then close the descriptor.
echo ":" >& 4
exec 4>&-
echo "Total tasks launched: $counter"
}
#######################################
# Writes one result record to stdout: a single line of space-separated
# KEY=value fields (PLATFORM, PROC_GRID and RESULT_COMMENT values are
# wrapped in double quotes) terminated by a newline.
# Globals read: TEST_PLATFORM, TASK_NOH_FLAG, TASK_AUTOTFM_FLAG, PROC_GRID,
#               CPUS_PER_PROC, CUDAS_PER_PROC, TASK_CALC_TIME, TEST_PASSED,
#               RESULT_COMMENT, ERROR_LEVEL
#######################################
print_result_line() {
  # One conversion per field, in the order the fields appear on the line.
  local layout='PLATFORM="%s" NOH_FLAG=%s AUTOTFM_FLAG=%s PROC_GRID="%s"'
  layout="$layout"' CPUS_PER_PROC=%s CUDAS_PER_PROC=%s CALC_TIME=%s'
  layout="$layout"' TEST_PASSED=%s RESULT_COMMENT="%s" ERROR_LEVEL=%s\n'
  printf "$layout" \
    "$TEST_PLATFORM" \
    "$TASK_NOH_FLAG" \
    "$TASK_AUTOTFM_FLAG" \
    "$PROC_GRID" \
    "$CPUS_PER_PROC" \
    "$CUDAS_PER_PROC" \
    "$TASK_CALC_TIME" \
    "$TEST_PASSED" \
    "$RESULT_COMMENT" \
    "$ERROR_LEVEL"
}
analyzer() {
counter=0
FIFO_NAME="$1"
while IFS= read -r TASK_SPEC; do
if [ "$TASK_SPEC" = ":" ]; then
break
fi
CPUS_PER_PROC=0
CUDAS_PER_PROC=0
TASK_NOH_FLAG=0
TASK_AUTOTFM_FLAG=0
eval $TASK_SPEC
if [ $TASK_TYPE -eq 0 ]; then
if [ ! -f "$TASK_DIR/$TASK_EXE" ]; then
# Report compilation error
if [ `basename "$TEST_SHORT_PATH"` != "$TEST_SHORT_PATH" ]; then
mkdir -p "$RESULTS_DIR/$(dirname "$TEST_SHORT_PATH")"
fi
PROC_GRID=
CPUS_PER_PROC=
CUDAS_PER_PROC=
TASK_CALC_TIME=
TEST_PASSED=0
RESULT_COMMENT="Compilation error"
ERROR_LEVEL=255
print_result_line >>"$RESULTS_DIR/$TEST_SHORT_PATH.result"
fi
# Cleanup all the test's stuff
rm -rf "$TASK_DIR"
else
counter=$(( counter + 1 ))
cd "$LAUNCH_DIR"
while [ ! -f "$TASK_EXE.finished" ]; do
sleep 1
done
read LAUNCH_EXIT_CODE TASK_CALC_TIME <"$TASK_EXE.finished"
STDOUT_FN=`stdout_fn "$LAUNCH_NAME"`
STDERR_FN=`stderr_fn "$LAUNCH_NAME"`
SUBTEST_COUNT=0
. $TEST_ANALYZER
if [ `basename "$TEST_SHORT_PATH"` != "$TEST_SHORT_PATH" ]; then
mkdir -p "$RESULTS_DIR/$(dirname "$TEST_SHORT_PATH")"
fi
print_result_line >>"$RESULTS_DIR/$TEST_SHORT_PATH.result"
if [ $SUBTEST_COUNT -gt 0 ]; then
mkdir -p $RESULTS_DIR/$TEST_SHORT_PATH
for i in `seq $SUBTEST_COUNT`; do
SUBTEST_NAME=$i
analyze_subtest $i
print_result_line >>"$RESULTS_DIR/$TEST_SHORT_PATH/$SUBTEST_NAME.result"
done
fi
# if [ $LAUNCH_EXIT_CODE -ne 0 -o "$RESULT_COMMENT" = "Crash" ]; then
# echo "Test's $TEST_SHORT_PATH stdout:"
# cat "$STDOUT_FN"
# echo "Test's $TEST_SHORT_PATH stderr:"
# cat "$STDERR_FN"
# fi
rm -rf "$LAUNCH_DIR"
fi
done <$FIFO_NAME
echo "Total tasks analyzed: $counter"
}
# Driver: the launcher (running in this shell) sends the specification of
# every accepted task through a named pipe to the analyzer, which runs
# concurrently in the background.
FIFO_NAME="$(mktemp -u).launch-fifo"
if ! mkfifo "$FIFO_NAME"; then
  # Without the pipe the analyzer could never see the launched tasks.
  echo "Cannot create FIFO $FIFO_NAME" >&2
  if [ "$HAS_RES_MANAGER" -eq 0 ] && [ -n "$RES_MAN_DIR" ]; then
    rm -rf "$RES_MAN_DIR"
  fi
  exit 1
fi
analyzer "$FIFO_NAME" &
launcher "$FIFO_NAME"
# Waits for the analyzer and for the task jobs the launcher left running in
# the background of this shell.
wait
rm "$FIFO_NAME"
if [ "$HAS_RES_MANAGER" -eq 0 ]; then
  # Source whatever resource reports are still left, then drop the
  # bookkeeping directory. The loop runs only if the cd succeeded, so a
  # failed cd can no longer cause files of some other directory to be sourced.
  if cd "$RES_MAN_DIR"; then
    for f in $(ls); do
      . "./$f"
    done
  fi
  cd "$SAVE_DIR"
  rm -rf "$RES_MAN_DIR"
fi