#!/bin/bash
#
# Builds the create-db Singularity image, pushes it together with the SGE
# script and a list of extension IDs to sharc.shef.ac.uk, and submits the
# create-db array job(s) there via qsub. Pass an unknown option to print the
# list of supported flags.

# Abort on any use of an unset variable and on any unhandled command failure.
set -o nounset
set -o errexit

# Archive location on the cluster (override with -a).
REMOTE_ARCHIVE=/shared/brucker_research1/Shared/BrowserExtensions/archive

# Prefix of the per-run target directory on the cluster (override with -t).
# The dollar sign is escaped on purpose so that "$USER" stays literal locally;
# presumably it is meant to be expanded by the remote shell instead.
REMOTE_TARGET_DIR_PREFIX=/data/\$USER

# Degree of parallelism; divided by the number of batches to form the qsub
# "-tc" value (override with -m).
NUM_THREADS=48

# Arguments always handed to qsub; -s appends further ones.
SGE_EXTRA_ARGS='-P rse -l h_rt=01:00:00,rmem=4G,h=\!sharc-node126 -j yes'

# Arguments appended to the create-db.sge invocation; -p appends further ones.
PY_EXTRA_ARGS=''

# Local path to a file with extension IDs (set with -e). Empty means the IDs
# are gathered from the remote archive instead.
EXTENSION_IDS=

#######################################
# Prints the supported command-line options together with their current
# default values.
# Globals:   REMOTE_ARCHIVE, REMOTE_TARGET_DIR_PREFIX, NUM_THREADS,
#            SGE_EXTRA_ARGS, PY_EXTRA_ARGS (all read)
# Arguments: none
# Outputs:   the help text on stdout
#######################################
usage() {
  # Unquoted delimiter: the defaults are interpolated into the help text.
  cat <<EOF
Usage:
 -a <path> (set archive path, default: ${REMOTE_ARCHIVE})
 -t <path> (set target directory, default: ${REMOTE_TARGET_DIR_PREFIX})
 -m <num_threads> (set degree of parallelism, default: ${NUM_THREADS})
 -s "<args>" (add qsub arguments, default: ${SGE_EXTRA_ARGS})
 -p "<args>" (add python script arguments, default: ${PY_EXTRA_ARGS})
 -e <path> (set path to extension id list, default: crawl from archive)
 -l <N> (limit number of sharc tasks, default: number of extensions)
EOF
}

# Parse the command-line options. The leading ':' in the option string puts
# getopts into silent mode; unknown options and missing arguments both end up
# in the catch-all arm, which prints the help text and exits with status 1.
while getopts ":a:t:s:p:m:e:l:" o; do
  case "${o}" in
    a)
      REMOTE_ARCHIVE=${OPTARG}
      ;;
    t)
      REMOTE_TARGET_DIR_PREFIX=${OPTARG}
      ;;
    m | l)
      # Both values are used in shell arithmetic further down, so only plain
      # digits are accepted here.
      if [[ ! ${OPTARG} =~ ^[0-9]+$ ]]; then
        echo "Option -${o} expects a non-negative integer, got: ${OPTARG}" >&2
        usage
        exit 1
      fi
      # Force base 10 so that a leading zero is not read as octal.
      if [[ ${o} == m ]]; then
        NUM_THREADS=$((10#${OPTARG}))
      else
        MAX_TASKS=$((10#${OPTARG}))
      fi
      ;;
    s)
      # Appended to (not replacing) the built-in qsub arguments.
      SGE_EXTRA_ARGS+=" ${OPTARG}"
      ;;
    p)
      PY_EXTRA_ARGS+=" ${OPTARG}"
      ;;
    e)
      EXTENSION_IDS="${OPTARG}"
      ;;
    *)
      usage
      exit 1
      ;;
  esac
done

# Drop the parsed options, leaving only positional arguments in "$@".
shift $((OPTIND - 1))

# Repository root: the parent of the directory that contains this script.
# Chained with && so that a failing cd aborts instead of yielding a wrong path.
BASEDIR=$(cd "$(dirname "$0")" && cd .. && pwd -P)

# Local scratch directory for the extension id list; removed again when the
# script exits, whatever the exit path.
TEMP_FOLDER=$(mktemp -d)
trap 'rm -rf -- "${TEMP_FOLDER}"' EXIT

# One fresh, timestamped directory per run below the configured prefix.
TARGETDIR="${REMOTE_TARGET_DIR_PREFIX}/create-db-$(date +%Y%m%d-%H%M%S)"

echo "Using target dir: $TARGETDIR"
# Quoted locally only to avoid local word splitting and globbing; the path is
# still interpreted by the shell on the remote side.
ssh sharc.shef.ac.uk mkdir -p "$TARGETDIR/logs"

echo "Pushing sge script ..."
scp "$BASEDIR/sge/create-db.sge" sharc.shef.ac.uk:"$TARGETDIR/create-db.sge"

echo "Building image..."
# Always rebuild: remove the image left over from a previous run, if any.
if [[ -f "$BASEDIR/scripts/singularity/create-db.img" ]]; then
  rm -f "$BASEDIR/scripts/singularity/create-db.img"
fi
(
  # Subshell: the directory change does not leak into the rest of the script.
  cd "$BASEDIR/scripts/singularity"

  # Build the builder image only when no image of that name exists locally.
  # The tag has to be the same name that is checked here and run below,
  # otherwise the freshly built image would never be the one that is used.
  if [[ -z "$(docker images -q singularitybuilder-arch 2> /dev/null)" ]]; then
    docker build --tag=singularitybuilder-arch -f singularitybuilder-arch.Dockerfile .
  fi

  # Mount the current directory at the same path inside the container so that
  # create-db.img is written straight into this directory.
  docker run -it -v "${PWD}:${PWD}" -w "${PWD}" --privileged singularitybuilder-arch:latest singularity build create-db.img ExtensionCrawler.def
)

echo "Pushing image..."
scp "$BASEDIR/scripts/singularity/create-db.img" sharc.shef.ac.uk:"$TARGETDIR/create-db.img"

# Produce ${TEMP_FOLDER}/extension.ids, either by crawling the remote archive
# or by copying the list given with -e.
if [[ -z "$EXTENSION_IDS" ]]; then
  echo "Gathering extension IDs..."
  # The inner single quotes travel to the remote side, so the remote shell
  # passes the pattern to find instead of glob-expanding it itself.
  # grep exits with 1 when nothing matches; that is not an error here (the
  # empty list is handled below), so only other statuses abort the script.
  ssh sharc.shef.ac.uk find "${REMOTE_ARCHIVE}/data" -name "'*.tar'" \
    | grep -Po "[a-p]{32}" > "${TEMP_FOLDER}/extension.ids" \
    || [[ $? -eq 1 ]]
else
  cp "$EXTENSION_IDS" "${TEMP_FOLDER}/extension.ids"
fi

# One ID per line, so the line count is the number of IDs.
NO_IDS=$(wc -l < "${TEMP_FOLDER}/extension.ids")

echo "Found $NO_IDS IDs!"
# Numeric comparison, so padding in the wc output cannot defeat the check.
if [[ "$NO_IDS" -eq 0 ]]; then
  echo "Nothing to do!"
  exit 0
fi

echo "Pushing extension IDs..."
scp "${TEMP_FOLDER}/extension.ids" sharc.shef.ac.uk:"$TARGETDIR/"

# Without -l, run one task per extension ID. The value (not the variable
# name) must be stored, because MAX_TASKS is later passed on verbatim.
if [[ ! -v MAX_TASKS ]]; then
  MAX_TASKS=$NO_IDS
fi

# Split the tasks over several array jobs and spread them evenly.
# NOTE(review): 75000 is presumably the scheduler's limit on tasks per array
# job - confirm against the cluster configuration.
NO_BATCH_JOBS=$(((MAX_TASKS + 1) / 75000 + 1))
JOBS_PER_BATCH=$((MAX_TASKS / NO_BATCH_JOBS + 1))

# Submit one qsub array job per batch; batch k covers the task ids
# (k-1)*JOBS_PER_BATCH+1 .. k*JOBS_PER_BATCH.
# NOTE(review): JOBS_PER_BATCH is rounded up, so the last LAST_ID can exceed
# MAX_TASKS - confirm that create-db.sge tolerates task ids above -N.
for ((run_no = 1; run_no <= NO_BATCH_JOBS; run_no++)); do
  FIRST_ID=$(((run_no - 1) * JOBS_PER_BATCH + 1))
  LAST_ID=$((run_no * JOBS_PER_BATCH))

  echo "Starting job $run_no ..."
  (
    # Trace the exact submission command; the subshell keeps tracing local.
    set -x
    # SGE_EXTRA_ARGS and PY_EXTRA_ARGS are deliberately unquoted: each holds
    # several arguments in a single string.
    # MAX_TASKS is expanded arithmetically so that the remote script always
    # receives a plain integer.
    # shellcheck disable=SC2086
    ssh sharc.shef.ac.uk qsub \
      -tc $((NUM_THREADS / NO_BATCH_JOBS)) \
      -t "${FIRST_ID}-${LAST_ID}" \
      -wd "$TARGETDIR" \
      -o "$TARGETDIR/logs" \
      ${SGE_EXTRA_ARGS} \
      "$TARGETDIR/create-db.sge" -a "$REMOTE_ARCHIVE" -e "${TARGETDIR}/extension.ids" -N "$((MAX_TASKS))" ${PY_EXTRA_ARGS}
  )
done