Added option to handle more than one extension per sharc job.

This commit is contained in:
Michael Herzberg 2019-05-15 21:59:59 +01:00
parent 3796ca2a3f
commit 26e5067042
1 changed files with 22 additions and 8 deletions

View File

@ -17,9 +17,10 @@ usage() {
echo " -s \"<args>\" (add qsub arguments, default: ${SGE_EXTRA_ARGS})"
echo " -p \"<args>\" (add python script arguments, default: ${PY_EXTRA_ARGS})"
echo " -e <path> (set path to extension id list, default: crawl from archive)"
echo " -l <N> (limit number of sharc tasks, default: number of extensions)"
}
while getopts ":a:t:s:p:m:e:" o; do
while getopts ":a:t:s:p:m:e:l:" o; do
case "${o}" in
a)
REMOTE_ARCHIVE=${OPTARG}
@ -39,6 +40,9 @@ while getopts ":a:t:s:p:m:e:" o; do
e)
EXTENSION_IDS="${OPTARG}"
;;
l)
MAX_TASKS="${OPTARG}"
;;
*)
usage
exit 1
@ -59,13 +63,19 @@ echo "Pushing sge script ..."
scp "$BASEDIR/sge/create-db.sge" sharc.shef.ac.uk:"$TARGETDIR/create-db.sge"
echo "Building image..."
if [ -f "$BASEDIR/singularity/create-db.img" ]; then
rm -f "$BASEDIR/singularity/create-db.img"
if [ -f "$BASEDIR/scripts/singularity/create-db.img" ]; then
rm -f "$BASEDIR/scripts/singularity/create-db.img"
fi
sudo singularity build "$BASEDIR/singularity/create-db.img" "$BASEDIR/singularity/ExtensionCrawler-dev.def"
(
cd "$BASEDIR/scripts/singularity"
if [[ "$(docker images -q singularitybuilder-arch 2> /dev/null)" == "" ]]; then
docker build --tag=singularitybuilder -f singularitybuilder-arch.Dockerfile .
fi
docker run -it -v "$(pwd):$(pwd)" -w "$(pwd)" --privileged singularitybuilder-arch:latest singularity build create-db.img ExtensionCrawler.def
)
echo "Pushing image..."
scp "$BASEDIR/singularity/create-db.img" sharc.shef.ac.uk:"$TARGETDIR/create-db.img"
scp "$BASEDIR/scripts/singularity/create-db.img" sharc.shef.ac.uk:"$TARGETDIR/create-db.img"
if [[ -z $EXTENSION_IDS ]]; then
@ -86,8 +96,12 @@ fi
echo "Pushing extension IDs..."
scp ${TEMP_FOLDER}/extension.ids sharc.shef.ac.uk:$TARGETDIR/
NO_BATCH_JOBS=$(((NO_IDS+1)/75000+1))
JOBS_PER_BATCH=$((NO_IDS/NO_BATCH_JOBS+1))
if [[ ! -v MAX_TASKS ]]; then
MAX_TASKS=NO_IDS
fi
NO_BATCH_JOBS=$(((MAX_TASKS+1)/75000+1))
JOBS_PER_BATCH=$((MAX_TASKS/NO_BATCH_JOBS+1))
for run_no in $(seq 1 $NO_BATCH_JOBS); do
FIRST_ID=$(((run_no-1) * $JOBS_PER_BATCH + 1))
@ -100,5 +114,5 @@ for run_no in $(seq 1 $NO_BATCH_JOBS); do
-wd "$TARGETDIR" \
-o "$TARGETDIR/logs" \
${SGE_EXTRA_ARGS} \
"$TARGETDIR/create-db.sge" -a "$REMOTE_ARCHIVE" -e "${TARGETDIR}/extension.ids" -N $NO_IDS ${PY_EXTRA_ARGS})
"$TARGETDIR/create-db.sge" -a "$REMOTE_ARCHIVE" -e "${TARGETDIR}/extension.ids" -N $MAX_TASKS ${PY_EXTRA_ARGS})
done