"Device or resource busy" NFS error when training with --multiproc-data-loading=2
The following error occurs when multiprocessing data loading is enabled:
OSError: [Errno 16] Device or resource busy: '.nfs00000000040e0c9200000784'
Here is a bash script that reproduces the error with a simple lwnet on the drive dataset, using jman:
# Root folder for experiment outputs and logs.
# NOTE: there must be no whitespace around "=" in a shell assignment.
OUTDIR="path/to/output" # change the path
# Path to the bob executable used to launch the experiment.
BOB="Paths/to/bob" # change the path to your path to bob
#######################################
# Run one `binseg experiment` locally, or submit it through jman.
# Globals:
#   BOB    (read) - path to the bob executable
#   OUTDIR (read) - root folder for experiment outputs and logs
# Arguments:
#   $1 - first positional argument forwarded to the experiment
#   $2 - second positional argument forwarded to the experiment
#   $3 - batch size (forwarded as --batch-size)
#   $4 - optional device (forwarded as --device), defaults to "cpu"
#   $5 - optional jman queue; when given, the command is submitted with
#        `jman submit` instead of being executed locally
# Outputs:
#   Local runs write stdout.log and stderr.log into ${OUTDIR}/$1/$2.
#   NOTE: because of the descriptor swap used to tee both streams, the
#   experiment's stdout is echoed on this shell's stderr and its stderr on
#   this shell's stdout; the log files themselves are not swapped.
# Returns:
#   The exit status of the experiment (local run) or of jman (submission);
#   non-zero if the output folder cannot be created.
#######################################
run() {
  local device="cpu"
  if (( $# > 3 )); then
    device="${4}"
  fi

  local outdir="${OUTDIR}/${1}/${2}"

  # Build the command as an array so arguments containing whitespace survive.
  local cmd=("${BOB}" binseg experiment)
  cmd+=("-vv" "--device=${device}" "${1}" "${2}")
  cmd+=("--batch-size=${3}" "--output-folder=${outdir}" "--multiproc-data-loading=2")

  mkdir -pv -- "${outdir}" || return

  if (( $# > 4 )); then
    # A queue was given: wrap the command in a jman submission.
    local name
    name="$(basename -- "${OUTDIR}")-${1}-${2}"
    cmd=(jman submit "--log-dir=${outdir}" "--name=${name}" "--memory=24G" "--queue=${5}" -- "${cmd[@]}")
    "${cmd[@]}"
    return
  fi

  # Executing locally: capture stdout and stderr in separate log files.
  # The inner tee records stdout; swapping descriptors 1 and 2 (through the
  # temporary descriptor 3, closed afterwards) lets the outer tee record
  # stderr. PIPESTATUS carries the experiment's own exit status past both
  # tee processes so that a failed run is not reported as a success.
  (
    "${cmd[@]}" | tee "${outdir}/stdout.log"
    exit "${PIPESTATUS[0]}"
  ) 3>&1 1>&2 2>&3 3>&- | tee "${outdir}/stderr.log"
  return "${PIPESTATUS[0]}"
}
# Run (or submit) the baseline experiment.
# Drop the trailing arguments starting at "cuda:0" to run on CPU
# (the device defaults to "cpu" when no fourth argument is given).
# Drop the trailing "sgpu" to run locally instead of submitting through jman
# (submission only happens when a fifth argument is given).
run lwnet drive 4 cuda:0 sgpu