Compare commits
7 Commits
9d615760f1
...
a57c0af208
Author | SHA1 | Date |
---|---|---|
Achim D. Brucker | a57c0af208 | |
Achim D. Brucker | 49bb2d4690 | |
Achim D. Brucker | 4ffc51e6b9 | |
Achim D. Brucker | 7a0f7ea496 | |
Achim D. Brucker | e81065aecc | |
Achim D. Brucker | cbe93ddeb6 | |
Achim D. Brucker | 7994c96c2a |
|
@ -1,93 +0,0 @@
|
|||
#!/usr/bin/env python3.7
|
||||
#
|
||||
# Copyright (C) 2018 The University of Sheffield, UK
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
import sqlite3
|
||||
import MySQLdb.cursors
|
||||
import sys
|
||||
import os
|
||||
|
||||
def usage():
    """Print the one-line command synopsis to standard output."""
    program = sys.argv[0]
    synopsis = "(app_join|db_join) (sqlite|mysql) (sqlite_path|my.cnf)"
    print("Usage: " + program + " " + synopsis)
|
||||
|
||||
# Exactly three positional arguments are required.  Any other count cannot be
# unpacked into (method, db_kind, db_path) below, so reject it here with the
# usage text instead of failing later with a ValueError.
if len(sys.argv) != 4:
    usage()
    sys.exit(1)

method, db_kind, db_path = sys.argv[1:]

# Only the two join strategies and the two database backends are supported.
if method not in ("app_join", "db_join") or db_kind not in ("mysql", "sqlite"):
    usage()
    sys.exit(1)
|
||||
|
||||
# Define the backend-specific pair dbobj()/query() used by the rest of the
# script.  Queries are written with '?' placeholders in both cases.
if db_kind == "mysql":
    def dbobj():
        """Open a new MySQL connection using the option file named by db_path."""
        option_file = os.path.expanduser(db_path)
        return MySQLdb.connect(read_default_file=option_file,
                               cursorclass=MySQLdb.cursors.SSCursor)

    def query(db, q, args=None):
        """Execute q on db and lazily yield each result row."""
        # Double literal percent signs first, then turn every '?' into '%s'
        # before handing the statement to execute().
        escaped = q.replace("%", "%%")
        db.execute(escaped.replace("?", "%s"), args)
        yield from db
else:
    # A single SQLite connection is opened once and shared by every dbobj() call.
    db = sqlite3.connect(db_path)

    def dbobj():
        """Return the shared SQLite connection."""
        return db

    def query(db, q, args=None):
        """Execute q on db, with args bound when given, and return the cursor."""
        return db.execute(q) if args is None else db.execute(q, args)
|
||||
|
||||
def app_join():
    """Yield md5 values by joining the tables in application code.

    Four nested queries are issued, each on its own dbobj() handle: the
    latest date per extension, the crx_etag for that (extid, date), the
    JavaScript files of that crx, and the matching libdet rows.  One md5 is
    yielded per libdet row found.
    """
    latest_sql = "select extid, max(date) as date from extension where date <= '2018-05-01' group by extid order by extid limit 10000 offset 10000"
    etag_sql = "select crx_etag from extension where extid=? and date=? order by crx_etag"
    files_sql = "select path, md5, typ, simhash from crxfile where crx_etag=? and simhash is not null and path like '%.js' order by path, md5, typ"
    sizes_sql = "select size from libdet where md5=? and typ=? and size >= 1024 order by size"

    with dbobj() as db1, dbobj() as db2, dbobj() as db3, dbobj() as db4:
        for extid, date in query(db1, latest_sql):
            for (crx_etag,) in query(db2, etag_sql, (extid, date)):
                for _path, md5, typ, _simhash in query(db3, files_sql, (crx_etag,)):
                    # Only the number of libdet rows matters; the size itself is unused.
                    for (_size,) in query(db4, sizes_sql, (md5, typ)):
                        yield md5
|
||||
|
||||
def db_join():
    """Yield md5 values produced by a single SQL statement that joins in the database."""
    joined_sql = (
        "select md5 from ((((select extid, max(date) as date from extension where date <= '2018-05-01' group by extid order by extid limit 10000 offset 10000) as e1 "
        "join (select extid, date, crx_etag from extension) as e2 using (extid, date)) "
        "join (select path, crx_etag, md5, typ from crxfile where simhash is not null and path like '%.js') as d2 using (crx_etag)) "
        "join (select md5, typ, size from libdet where size >= 1024) as d3 using (md5, typ)) order by extid, crx_etag, path, md5, typ, size"
    )
    with dbobj() as db:
        for (digest,) in query(db, joined_sql):
            yield digest
|
||||
|
||||
# Load up to 10000 cdnjs rows into a dictionary keyed by md5.
# NOTE(review): the original indentation was lost; this assumes only the
# cdnjs load sits inside the `with` block -- confirm against the real file.
with dbobj() as db:
    s = {
        md5: (library, path, typ)
        for (md5, library, path, typ)
        in query(db, "select md5, library, path, typ from cdnjs limit 10000")
    }

# Pick the join strategy requested on the command line.
f = app_join if method == "app_join" else db_join

# Count how many of the produced md5 values are known cdnjs files.
hit = 0
miss = 0
for md5 in f():
    if md5 in s:
        hit += 1
    else:
        miss += 1

print(f"Hit: {hit}")
print(f"Miss: {miss}")
|
44
comparemd5
44
comparemd5
|
@ -1,44 +0,0 @@
|
|||
#!/usr/bin/env python3.7
|
||||
#
|
||||
# Copyright (C) 2018 The University of Sheffield, UK
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
# The SQLite database path is the only required argument.
if len(sys.argv) < 2:
    print(f"Usage: {sys.argv[0]} sqlite_path", file=sys.stderr)
    sys.exit(1)

db_path = sys.argv[1]

# Using a sqlite3 connection as a context manager only commits or rolls back;
# it does not close the connection.  The script only reads, so close the
# connection explicitly when done instead.
db = sqlite3.connect(db_path)
try:
    hit = 0
    miss = 0

    # Known cdnjs files, keyed by md5.
    s = {}
    for (md5, library, path, typ) in db.execute("select md5, library, path, typ from cdnjs"):
        s[md5] = (library, path, typ)

    # For the latest date of every extension, visit each JavaScript file that
    # has a libdet row of at least 1024 bytes and check its md5 against cdnjs.
    for (extid, date) in db.execute("select extid, max(date) as date from extension group by extid order by extid"):
        for (crx_etag,) in db.execute("select crx_etag from extension where extid=? and date=? order by crx_etag", (extid, date)):
            for (path, md5, typ, simhash) in db.execute("select path, md5, typ, simhash from crxfile where crx_etag=? and simhash is not null and path like '%.js' order by path, md5, typ", (crx_etag,)):
                for (size,) in db.execute("select size from libdet where md5=? and typ=? and size >= 1024 order by size", (md5, typ)):
                    if md5 in s:
                        hit += 1
                        # library, path, typ = s[md5]
                        # print("|".join((library, path, typ, extid, date, path, typ)))
                    else:
                        miss += 1
                        # Report every file that is not a known cdnjs file.
                        print("|".join((extid, date, path, typ)))
finally:
    db.close()

print(f"Hit: {hit}")
print(f"Miss: {miss}")
|
110
extfind
110
extfind
|
@ -1,110 +0,0 @@
|
|||
#!/usr/bin/env python3.7
|
||||
#
|
||||
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
import getopt
|
||||
import glob
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
import re
|
||||
|
||||
from ExtensionCrawler import config
|
||||
|
||||
|
||||
def print_help():
    """Print the extfind option summary to standard output, one line per option."""
    help_lines = (
        "extfind [OPTION]",
        " -h print this help text",
        " -a <DIR> archive directory",
        " -g <GLOB> glob on the extension id, don't use with -e ",
        " -e <EXTIDFILELIST> file with extension ids, don't use with -g",
        " -n <TASKID> process chunk n where n in [1,N]",
        " -N <MAXTASKID> ",
    )
    for text in help_lines:
        print(text)
|
||||
|
||||
|
||||
def split(l, n, N):
    """Return the n-th of N consecutive chunks of the sequence l.

    Chunks are numbered from 1.  Every chunk has ceil(len(l) / N) elements
    except possibly the trailing ones, which may be shorter or empty;
    together the N chunks cover l exactly once.

    Raises:
        ValueError: if n is not in the range [1, N].
    """
    if n < 1 or n > N:
        raise ValueError("n must be between 1 and N")

    # Ceiling division in pure integer arithmetic.  Rounding up only when
    # needed keeps the work balanced: 10 items over 10 tasks gives one item
    # each instead of five tasks with two items and five with none.
    chunksize = -(-len(l) // N)

    # Slicing beyond the list contents returns the empty list
    return l[chunksize * (n - 1):chunksize * n]
|
||||
|
||||
|
||||
def iter_extension_paths_from_file(archive, n, N, extidlistfile):
    """Return the n-th of N chunks of archive paths for the ids listed in a file.

    Each non-blank line of extidlistfile is taken as one extension id.  An id
    is kept when it consists of exactly 32 characters from a-p and the file
    <archive>/data/<first three id characters>/<id>.tar exists; otherwise a
    warning is logged and the id is skipped.

    Raises:
        ValueError: propagated from split() if n is not in [1, N].
    """
    paths = []
    with open(extidlistfile, 'r') as f:
        for line in f:
            # Lines read from a file keep their trailing newline; without
            # stripping it the id can never fully match the pattern and the
            # newline ends up inside the path.
            extid = line.strip()
            if not extid:
                continue
            path = os.path.join(archive, "data", extid[:3], extid + ".tar")
            if re.fullmatch("[a-p]{32}", extid) and os.path.exists(path):
                paths.append(path)
            else:
                logging.warning("WARNING: {} is not a valid extension path!".format(path))
    return split(paths, n, N)
|
||||
|
||||
|
||||
def iter_extension_paths(archive, n, N, extidglob="[a-p]"*32):
    """Return the n-th of N chunks of the archive tar files matching extidglob.

    Files are searched as <archive>/data/<three a-p characters>/<extidglob>.tar.

    Raises:
        ValueError: propagated from split() if n is not in [1, N].
    """
    pattern = os.path.join(archive, "data", "[a-p]" * 3, extidglob + ".tar")
    # glob.glob() returns matches in arbitrary order.  Sort them so that every
    # task, each running this function on its own, cuts the same list into the
    # same chunks.
    paths = sorted(glob.glob(pattern))
    return split(paths, n, N)
|
||||
|
||||
|
||||
def main(argv):
    """Parse the command-line options in argv and print the selected archive paths.

    Exits with status 2 on an invalid option or when both a glob and an
    extension id list file are given, and with status 0 after printing the
    help text for -h/--help.
    """
    archive = config.const_basedir()
    extidglob = None
    extidlistfile = None
    taskid = 1
    maxtaskid = 1

    long_options = [
        "archive=", "glob=", "extidlistfile=", "taskid=",
        "maxtaskid=", "help"
    ]
    try:
        opts, args = getopt.getopt(argv, "ha:g:e:n:N:", long_options)
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-g", "--glob"):
            extidglob = arg
        elif opt in ("-e", "--extidlistfile"):
            extidlistfile = arg
        elif opt in ("-n", "--taskid"):
            taskid = int(arg)
        elif opt in ("-N", "--maxtaskid"):
            maxtaskid = int(arg)

    have_glob = extidglob is not None
    have_list = extidlistfile is not None

    # A glob and an id list file are mutually exclusive.
    if have_glob and have_list:
        print_help()
        sys.exit(2)

    if have_list:
        paths = iter_extension_paths_from_file(archive, taskid, maxtaskid, extidlistfile)
    elif have_glob:
        paths = iter_extension_paths(archive, taskid, maxtaskid, extidglob)
    else:
        # Neither given: fall back to the default glob of iter_extension_paths.
        paths = iter_extension_paths(archive, taskid, maxtaskid)

    for path in paths:
        print(path)
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script,
# passing everything after the program name to main().
if __name__ == "__main__":
    main(sys.argv[1:])
|
|
@ -1 +0,0 @@
|
|||
extfind
|
|
@ -0,0 +1,189 @@
|
|||
#!/bin/bash
|
||||
|
||||
set -o errexit
|
||||
set -o nounset
|
||||
|
||||
# Print the usage text.  Reads the globals prog, jobs, prefix, name, wrapper,
# mem, timelimit, host and srcdir to show the current defaults, so it must
# only be called after they have been set.
# NOTE(review): --copy-from/-f is listed below, but the option parser in this
# script has no matching case -- confirm whether it should be removed here or
# handled there.
print_help()
{
    echo "Usage: $prog [OPTION] ... -- COMMAND ... %INPUT% ..."
    echo ""
    echo "Run ..."
    echo ""
    echo " --help, -h display this help message"
    echo " --jobs, -j num number of jobs (default: $jobs)"
    echo " --input, -i file file with input data"
    echo " --prefix, -p prefix prefix path for job directory (default: $prefix)"
    echo " --jobname, -n name job name (default: $name)"
    echo " --wrapper, -w wrapper exec wrapper (default: $wrapper)"
    echo " --copy-from, -f copy command from directory (default: src)"
    echo " --max-memory, -m mem max mem (default: $mem)"
    echo " --max-time, -t timelimit (default: $timelimit)"
    echo " --host, -s remote host (default: $host)"
    echo " --srcdir, -d src for copying binary (default: $srcdir)"
    echo " assumed to be remote, if it starts with a \":\""
    echo ""
    echo " COMMAND is the command that should be executed on the HPC cluster, where"
    echo " %INPUT% will be replaced with a file containing the job-specific input data."
}
|
||||
|
||||
|
||||
|
||||
# Create the local staging tree for the job below $workdir:
# bin, cfg, input, output and tmp.
mk_jobdir(){
    echo "Creating temporary job directory in $workdir."
    local subdir
    for subdir in bin cfg input output tmp; do
        mkdir -p "$workdir/$subdir"
    done
}
|
||||
|
||||
|
||||
# Remove the local temporary job directory ($workdir) and everything in it.
clean_jobdir(){
    rm -rf "$workdir"
}
|
||||
|
||||
# Split $input line-wise into $jobs pieces below $workdir/input/, named with
# 8-digit numeric suffixes.
split_input(){
    echo "Splitting input."
    # Number the pieces from 1: the generated job script looks up its input
    # as `printf %08d $SGE_TASK_ID`, and SGE task ids run from 1 to $jobs.
    # With the default start of 0 the first piece was never processed and
    # the last task had no input file.
    split --numeric-suffixes=1 -a 8 -e -n "l/$jobs" "$input" "$workdir/input/"
}
|
||||
|
||||
# Write the SGE job script to $workdir/job.sge.  Unescaped variables in the
# here-document (prog, version, timestamp, invokation, jobs, mem, timelimit,
# prefix, name, wrapper, cmd) are expanded now; the escaped ones
# (\$SGE_TASK_ID and the backquoted printf) are left for the job to evaluate.
mk_hpc_script(){
    local fqdn
    # Fall back to the short host name if the FQDN cannot be determined, so a
    # failing lookup does not abort the whole run under errexit.
    fqdn=$(hostname -f 2>/dev/null || hostname)
    echo "Creating HPC script."
    # The shebang must be the very first line of the generated file, so the
    # here-document starts with it directly.
    cat <<EOF > "$workdir/job.sge"
#!/bin/bash
## This script was generated by $prog (version: $version)
## on $timestamp
## by $USER@$fqdn
## in $PWD
## using the following command:
## $invokation
##
## SGE configuration:
#$ -V
#$ -t 1-$jobs
#$ -l rmem=$mem
#$ -l h_rt=$timelimit
#$ -j yes
#$ -o "$prefix"/"$name"/output

export JOBINPUT="$prefix"/"$name"/input/\`printf %08d \$SGE_TASK_ID\`

## Stop at the first failing command so that the success message below is
## only printed when the command really succeeded.
set -o errexit
set -o nounset
set -x

/usr/bin/time -v $wrapper "$prefix"/"$name"/"bin"/$cmd
echo "Execution successful."
EOF
}
|
||||
|
||||
|
||||
# Create the directory $prefix on the remote host.
mk_remote_jobdir(){
    echo "Create remote working directory ($host:$prefix)."
    # Quote the expansions so the local shell neither word-splits nor globs
    # them; the remote shell still evaluates $prefix (e.g. a leading "~").
    ssh "$host" mkdir -p "$prefix"
}
|
||||
|
||||
# Copy the staged job directory to $host:$prefix/$name and install the command
# ($srccmd) into its bin directory.  A $srcdir starting with ":" names a
# directory on the remote host; anything else is a local directory.
install_hpc_script(){
    echo "Installing HPC Script"
    scp -q -r "$workdir" "$host":"$prefix"/"$name"

    if [[ $srcdir == ":"* ]]; then
        echo " Copying cmd from remote src."
        # ${srcdir:1} drops the leading ":" marker.
        ssh "$host" cp "${srcdir:1}"/"$srccmd" "$prefix"/"$name"/bin
    else
        echo " Copying cmd from local src."
        # Copy only $srcdir/$srccmd.  The former extra bare "$srccmd" source
        # made scp fail whenever the command was not also present in the
        # current directory.
        scp "$srcdir"/"$srccmd" "$host":"$prefix"/"$name"/bin
    fi
}
|
||||
|
||||
# Submit the installed job script on the remote host via qsub.
submit_job(){
    echo "Submitting job."
    # Quote $host so the local shell does not word-split or glob it.
    ssh "$host" qsub "$prefix"/"$name"/job.sge
}
|
||||
|
||||
## global configuration
version="0.0"
# Program name: $0 with any leading directory components removed.
prog=`echo $0 | sed 's|.*/||'`;
# Shell-quoted copy of the command line, recorded in the generated job script.
invokation="$prog $(printf "%q " "$@")"
# UTC timestamp with ':' replaced by '_' and ',' by '.'.
timestamp=`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
# Local host name; it only feeds the default job name below, because host is
# reassigned to the remote machine in the defaults section.
host=`hostname`
# Local staging directory; its path is printed for reference.
workdir=`mktemp -d`
echo $workdir

## default values
# The tilde is inside quotes, so it is stored literally and not expanded by
# this shell.
prefix="~/hpc/"
jobs=1
name="$host-$USER-$timestamp"
input=""
# \$TMPDIR is kept literal for the job script, while $prefix and $name are
# expanded right here: a later --prefix or --jobname does not change this
# default wrapper.
wrapper="singularity exec -B \$TMPDIR:$prefix/$name/tmp"
cmd=""
mem="2G"
timelimit="01:00:00"
# NOTE(review): "local" is not read anywhere in the visible script -- confirm
# it is unused.
local="false";
host="sharc.shef.ac.uk"
srcdir="."
|
||||
|
||||
# Parse the options up to "--"; everything after it is the COMMAND.
while [ $# -gt 0 ]
do
    # First pass: handle options without a value and make sure every other
    # option is known and actually followed by its value.  Without this
    # check an option given last died with a bare "unbound variable" error.
    case "$1" in
        --help|-h)
            print_help
            exit 0;;
        --) shift; break;;
        --jobs|-j|--input|-i|--jobname|-n|--max-memory|-m|--max-time|-t|--host|-s|--srcdir|-d|--wrapper|-w|--prefix|-p)
            if [ $# -lt 2 ]; then
                echo "$prog: option '$1' requires an argument" >&2
                exit 1
            fi;;
        *) print_help
           exit 1;;
    esac

    # Second pass: store the value of the option.
    case "$1" in
        --jobs|-j)       jobs="$2";;
        --input|-i)      input="$2";;
        --jobname|-n)    name="$2";;
        --max-memory|-m) mem="$2";;
        --max-time|-t)   timelimit="$2";;
        --host|-s)       host="$2";;
        --srcdir|-d)     srcdir="$2";;
        --wrapper|-w)    wrapper="$2";;
        --prefix|-p)     prefix="$2";;
    esac
    shift 2
done
|
||||
# Remove the local staging directory on every exit path, so a failing step
# (which aborts the script under errexit) does not leave it behind.
trap clean_jobdir EXIT

# A command is mandatory: its first word names the binary to install.
if [ $# -eq 0 ]; then
    echo "$prog: no COMMAND given" >&2
    print_help
    exit 1
fi

# Shell-quote the command and replace every %INPUT% placeholder by a literal
# $JOBINPUT, which the job script sets per task.  The second sed expression
# drops the trailing blank left by printf.
cmd=$(printf "%q " "$@" | sed -e 's/%INPUT%/$JOBINPUT/g' -e 's/ $//')
cmdarray=("$@")
srccmd=${cmdarray[0]}

mk_jobdir;

# Splitting is only done when an input file was given.
if [ -n "$input" ]; then
    if [ ! -f "$input" ]; then
        echo "Input file \"$input\" not found!"
        exit 1
    fi
    split_input;
fi

mk_hpc_script;

mk_remote_jobdir;

install_hpc_script;

clean_jobdir;

submit_job;
|
||||
|
Loading…
Reference in New Issue