Merge branch 'master' of logicalhacking.com:BrowserSecurity/ExtensionCrawler
This commit is contained in:
commit
2344d2ac97
|
@ -66,3 +66,4 @@ target/
|
|||
|
||||
archive
|
||||
.ropeproject
|
||||
excrawl.img
|
||||
|
|
|
@ -0,0 +1,109 @@
|
|||
#!/usr/bin/env python3.5
|
||||
#
|
||||
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
import getopt
|
||||
import glob
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
|
||||
from ExtensionCrawler import config
|
||||
|
||||
def help():
    """Print the command-line usage summary for extfind to stdout."""
    usage_lines = (
        "extfind [OPTION]",
        " -h print this help text",
        " -a <DIR> archive directory",
        " -g <GLOB> glob on the extension id, don't use with -e ",
        " -e <EXTIDFILELIST> file with extension ids, don't use with -g",
        " -n <TASKID> process chunk n where n in [1,N]",
        " -N <MAXTASKID> ",
    )
    # One print call per line, exactly as many lines as there are options.
    for usage_line in usage_lines:
        print(usage_line)
|
||||
|
||||
|
||||
def split(l, n, N):
    """Return the n-th of N consecutive chunks of the sequence l.

    Chunks are numbered from 1.  Every chunk has the nominal size
    ``len(l) // N + 1``; the trailing chunks may therefore be shorter or
    empty.

    Raises:
        ValueError: if n is not within 1..N.
    """
    if not 1 <= n <= N:
        raise ValueError("n must be between 1 and N")

    size = len(l) // N + 1
    start = size * (n - 1)
    # A slice that reaches past the end of the list is simply truncated,
    # so no explicit bounds handling is needed for the last chunks.
    return l[start:start + size]
|
||||
|
||||
|
||||
def iter_extension_paths_from_file(archive, n, N, extidlistfile):
    """Return chunk n of N of the archive tar paths for the ids in a file.

    The file is read line by line; each non-empty line is taken as one
    extension id.  An id is accepted when it consists of exactly 32
    characters from a-p and ``<archive>/data/<first 3 chars>/<id>.tar``
    exists; every other entry is reported with a logged warning and skipped.

    Args:
        archive: base directory of the archive.
        n: 1-based number of the chunk to return.
        N: total number of chunks.
        extidlistfile: path of the text file holding one extension id per line.

    Returns:
        The slice of accepted paths selected by ``split(paths, n, N)``.
    """
    # Imported locally because the module header does not import re.
    import re

    paths = []
    with open(extidlistfile, 'r') as f:
        for line in f:
            # Lines keep their line terminator; without stripping it the id
            # can never match the pattern and the path would contain "\n".
            extid = line.strip()
            if not extid:
                continue
            path = os.path.join(archive, "data", extid[:3], extid + ".tar")
            if re.fullmatch("[a-p]{32}", extid) and os.path.exists(path):
                paths.append(path)
            else:
                logging.warning("WARNING: {} is not a valid extension path!".format(path))
    return split(paths, n, N)
|
||||
|
||||
|
||||
def iter_extension_paths(archive, n, N, extidglob="[a-p]"*32):
    """Return chunk n of N of the archive tar files matching an id glob.

    The search pattern is ``<archive>/data/<three a-p chars>/<extidglob>.tar``;
    by default extidglob matches any 32-character id made of a-p.
    """
    pattern = os.path.join(archive, "data", "[a-p]" * 3, extidglob + ".tar")
    matches = glob.glob(pattern)
    return split(matches, n, N)
|
||||
|
||||
|
||||
def main(argv):
    """Parse the command line and print the selected extension paths.

    Recognised options (see help()): -a/--archive, -g/--glob,
    -e/--extidlistfile, -n/--taskid, -N/--maxtaskid and -h/--help.
    The archive directory defaults to config.const_basedir().

    Exits with status 2 after printing the usage text when the options
    cannot be parsed, when -n/-N is not an integer, when the task id is not
    within 1..maxtaskid, or when -g and -e are combined.

    Args:
        argv: the command-line arguments without the program name.
    """
    archive = config.const_basedir()
    extidglob = None
    extidlistfile = None
    taskid = 1
    maxtaskid = 1

    try:
        opts, _ = getopt.getopt(argv, "ha:g:e:n:N:", [
            "archive=", "glob=", "extidlistfile=", "taskid=",
            "maxtaskid=", "help"
        ])
    except getopt.GetoptError:
        help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-g", "--glob"):
            extidglob = arg
        elif opt in ("-e", "--extidlistfile"):
            extidlistfile = arg
        elif opt in ("-n", "--taskid", "-N", "--maxtaskid"):
            # A non-numeric value is a usage error, not a crash.
            try:
                number = int(arg)
            except ValueError:
                help()
                sys.exit(2)
            if opt in ("-n", "--taskid"):
                taskid = number
            else:
                maxtaskid = number

    # split() rejects task ids outside 1..maxtaskid; report that as a usage
    # error up front instead of letting the ValueError escape as a traceback.
    if not 1 <= taskid <= maxtaskid:
        help()
        sys.exit(2)

    # -g and -e are mutually exclusive.
    if extidglob is not None and extidlistfile is not None:
        help()
        sys.exit(2)

    if extidlistfile is not None:
        paths = iter_extension_paths_from_file(archive, taskid, maxtaskid, extidlistfile)
    elif extidglob is not None:
        paths = iter_extension_paths(archive, taskid, maxtaskid, extidglob)
    else:
        paths = iter_extension_paths(archive, taskid, maxtaskid)

    for path in paths:
        print(path)


if __name__ == "__main__":
    main(sys.argv[1:])
|
|
@ -8,3 +8,4 @@ beautifulsoup4==4.6.0
|
|||
python_dateutil==2.6.1
|
||||
GitPython==2.1.5
|
||||
python_magic==0.4.13
|
||||
jsbeautifier==1.7.3
|
||||
|
|
2
setup.py
2
setup.py
|
@ -5,5 +5,5 @@ setup(
|
|||
description='A collection of utilities for downloading and analyzing browser extension from the Chrome Web store.',
|
||||
author='Achim D. Brucker, Michael Herzberg',
|
||||
license='GPL 3.0',
|
||||
install_requires=['GitPython', 'python_magic', 'tabulate', 'requests', 'pycrypto', 'beautifulsoup4', 'python_dateutil', 'mysqlclient', 'cchardet']
|
||||
install_requires=['GitPython', 'python_magic', 'tabulate', 'requests', 'pycrypto', 'beautifulsoup4', 'python_dateutil', 'mysqlclient', 'cchardet', 'jsbeautifier']
|
||||
)
|
||||
|
|
|
@ -1,16 +1,19 @@
|
|||
#!/bin/bash
|
||||
|
||||
module -s load apps/python/conda 2> /dev/null
|
||||
source activate mypython35
|
||||
|
||||
export PATH=~/bin:$PATH
|
||||
export LD_LIBRARY_PATH=~/lib:$LD_LIBRARY_PATH
|
||||
|
||||
set -o nounset
|
||||
set -x
|
||||
|
||||
SING_EXEC="singularity exec --pwd /opt/ExtensionCrawler -B $TMPDIR:/tmp $SING_IMG"
|
||||
|
||||
printenv
|
||||
|
||||
echo "The following parameter were passed: $*"
|
||||
echo "Printing the content of $ARCHIVE to force mounting:"
|
||||
ls "$ARCHIVE"
|
||||
|
||||
cd "$BASEDIR"
|
||||
./ExtensionCrawler/create-db -t 1 -a "$ARCHIVE" -n $SGE_TASK_ID -N $MAX_SGE_TASK_ID $*
|
||||
if [ -f "$BASEDIR/ids" ]; then
|
||||
EXT_SELECT="-e $BASEDIR/ids"
|
||||
else
|
||||
EXT_SELECT=
|
||||
fi
|
||||
|
||||
|
||||
/usr/bin/time $SING_EXEC ./create-db -t 1 -a "$ARCHIVE" -n $SGE_TASK_ID -N $MAX_SGE_TASK_ID $EXT_SELECT $*
|
||||
|
|
|
@ -2,34 +2,61 @@
|
|||
set -o nounset
|
||||
set -o errexit
|
||||
|
||||
BASEDIR=$( cd $(dirname "$0"); cd ..; pwd -P )
|
||||
|
||||
NRJOBS=${NRJOBS:-256}
|
||||
echo "Using $NRJOBS jobs"
|
||||
|
||||
JOBRANGE=${JOBRANGE:-1-$NRJOBS}
|
||||
echo "Executing jobs $JOBRANGE"
|
||||
|
||||
ARCHIVE=${ARCHIVE:-$(ssh sharc.shef.ac.uk find /shared/brucker_research1/Shared/BrowserExtensions/archive/.snapshot -maxdepth 1 -name \"D*\" | sort -r | head -n1)}
|
||||
echo "Using archive: $ARCHIVE"
|
||||
|
||||
TARGETDIR="${TARGETDIR:-/data/\$USER}/create-db-$(date +%Y%m%d-%H%M%S)"
|
||||
echo "Using target dir: $TARGETDIR"
|
||||
BASEDIR=$( cd $(dirname "$0"); cd ..; pwd -P )
|
||||
|
||||
SING_IMG_SRC="${SING_IMG_SRC:-/shared/brucker_research1/Shared/BrowserExtensions/excrawl.img}"
|
||||
SING_IMG="$TARGETDIR/excrawl.img"
|
||||
if ! ssh sharc.shef.ac.uk [ -f "$SING_IMG_SRC" ]; then
|
||||
echo -n "$SING_IMG_SRC does not exist! Generate new image and push? (yes/abort): "
|
||||
read confirm
|
||||
if [ "$confirm" != yes ]; then
|
||||
exit 0
|
||||
fi
|
||||
echo "Creating new image ..."
|
||||
(cd "$BASEDIR/singularity"; ./build.sh)
|
||||
echo "Pushing new image ..."
|
||||
scp "$BASEDIR/singularity/excrawl.img" sharc.shef.ac.uk:"$SING_IMG_SRC"
|
||||
rm "$BASEDIR/singularity/excrawl.img"
|
||||
fi
|
||||
echo "Creating dirs ..."
|
||||
ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/ExtensionCrawler
|
||||
ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/logs
|
||||
|
||||
echo "Pushing $BASEDIR to sharc.shef.ac.uk:$TARGETDIR/ExtensionCrawler ..."
|
||||
rsync -zr --exclude "$BASEDIR/archive" "$BASEDIR/" sharc.shef.ac.uk:"$TARGETDIR/ExtensionCrawler"
|
||||
echo "Copying $SING_IMG_SRC to $SING_IMG"
|
||||
ssh sharc.shef.ac.uk cp "$SING_IMG_SRC" "$SING_IMG"
|
||||
|
||||
echo "Pushing sge script ..."
|
||||
scp "$BASEDIR/sge/create-db.sge" sharc.shef.ac.uk:"$TARGETDIR/create-db.sge"
|
||||
|
||||
if ! [ -z "${EXTIDLISTFILE:-}" ]; then
|
||||
echo "Pushing list with extension ids ..."
|
||||
scp "$EXTIDLISTFILE" sharc.shef.ac.uk:"$TARGETDIR/ids"
|
||||
fi
|
||||
|
||||
echo "Starting job ..."
|
||||
ssh sharc.shef.ac.uk \
|
||||
SING_IMG=\"$SING_IMG\" \
|
||||
ARCHIVE=\"$ARCHIVE\" \
|
||||
BASEDIR=\"$TARGETDIR\" \
|
||||
MAX_SGE_TASK_ID=\"$NRJOBS\" \
|
||||
qsub \
|
||||
-V \
|
||||
-m a \
|
||||
-l rmem=4G \
|
||||
-M "msherzberg1@sheffield.ac.uk" \
|
||||
-t 1-$NRJOBS \
|
||||
-t $JOBRANGE \
|
||||
-j yes \
|
||||
-o "$TARGETDIR/logs" \
|
||||
"$TARGETDIR/ExtensionCrawler/sge/create-db.sge" \
|
||||
"$TARGETDIR/create-db.sge" \
|
||||
$*
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
#!/bin/bash
|
||||
|
||||
module -s load apps/python/conda 2> /dev/null
|
||||
source activate mypython35
|
||||
|
||||
set -o nounset
|
||||
|
||||
SING_EXEC="singularity exec --pwd /opt/ExtensionCrawler -B $TMPDIR:/tmp $SING_IMG"
|
||||
|
||||
printenv
|
||||
|
||||
echo "The following parameter were passed: $*"
|
||||
echo "Printing the content of $ARCHIVE to force mounting:"
|
||||
ls "$ARCHIVE"
|
||||
|
||||
"$BASEDIR/ExtensionCrawler/grepper" -a "$ARCHIVE" -n $SGE_TASK_ID -N $MAX_SGE_TASK_ID $* | bzip2 > "$BASEDIR/out/$SGE_TASK_ID.bz2"
|
||||
/usr/bin/time $SING_EXEC ./grepper -t 1 -a "$ARCHIVE" -n $SGE_TASK_ID -N $MAX_SGE_TASK_ID $* | bzip2 > "$BASEDIR/out/$SGE_TASK_ID.bz2"
|
||||
|
|
|
@ -2,33 +2,55 @@
|
|||
set -o nounset
|
||||
set -o errexit
|
||||
|
||||
BASEDIR=$( cd $(dirname "$0"); cd ..; pwd -P )
|
||||
|
||||
NRJOBS=${NRJOBS:-256}
|
||||
echo "Using $NRJOBS jobs"
|
||||
|
||||
JOBRANGE=${JOBRANGE:-1-$NRJOBS}
|
||||
echo "Executing jobs $JOBRANGE"
|
||||
|
||||
ARCHIVE=${ARCHIVE:-$(ssh sharc.shef.ac.uk find /shared/brucker_research1/Shared/BrowserExtensions/archive/.snapshot -maxdepth 1 -name \"D*\" | sort -r | head -n1)}
|
||||
echo "Using archive: $ARCHIVE"
|
||||
|
||||
TARGETDIR="${TARGETDIR:-/data/\$USER}/grepper-$(date +%Y%m%d-%H%M%S)"
|
||||
echo "Using target dir: $TARGETDIR"
|
||||
BASEDIR=$( cd $(dirname "$0"); cd ..; pwd -P )
|
||||
|
||||
SING_IMG_SRC="${SING_IMG_SRC:-/shared/brucker_research1/Shared/BrowserExtensions/excrawl.img}"
|
||||
SING_IMG="$TARGETDIR/excrawl.img"
|
||||
if ! ssh sharc.shef.ac.uk [ -f "$SING_IMG_SRC" ]; then
|
||||
echo -n "$SING_IMG_SRC does not exist! Generate new image and push? (yes/abort): "
|
||||
read confirm
|
||||
if [ "$confirm" != yes ]; then
|
||||
exit 0
|
||||
fi
|
||||
echo "Creating new image ..."
|
||||
(cd "$BASEDIR/singularity"; ./build.sh)
|
||||
echo "Pushing new image ..."
|
||||
scp "$BASEDIR/singularity/excrawl.img" sharc.shef.ac.uk:"$SING_IMG_SRC"
|
||||
rm "$BASEDIR/singularity/excrawl.img"
|
||||
fi
|
||||
echo "Creating dirs ..."
|
||||
ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/ExtensionCrawler
|
||||
ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/logs
|
||||
ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/out
|
||||
ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/{logs,out}
|
||||
|
||||
echo "Pushing $BASEDIR to sharc.shef.ac.uk:$TARGETDIR/ExtensionCrawler ..."
|
||||
rsync -zr --exclude "$BASEDIR/archive" "$BASEDIR/" sharc.shef.ac.uk:"$TARGETDIR/ExtensionCrawler"
|
||||
echo "Copying $SING_IMG_SRC to $SING_IMG"
|
||||
ssh sharc.shef.ac.uk cp "$SING_IMG_SRC" "$SING_IMG"
|
||||
|
||||
echo "Pushing sge script ..."
|
||||
scp "$BASEDIR/sge/grepper.sge" sharc.shef.ac.uk:"$TARGETDIR/grepper.sge"
|
||||
|
||||
echo "Starting job ..."
|
||||
ssh sharc.shef.ac.uk \
|
||||
SING_IMG=\"$SING_IMG\" \
|
||||
ARCHIVE=\"$ARCHIVE\" \
|
||||
BASEDIR=\"$TARGETDIR\" \
|
||||
MAX_SGE_TASK_ID=\"$NRJOBS\" \
|
||||
qsub \
|
||||
-V \
|
||||
-t 1-$NRJOBS \
|
||||
-m a \
|
||||
-M "msherzberg1@sheffield.ac.uk" \
|
||||
-t $JOBRANGE \
|
||||
-j yes \
|
||||
-o "$TARGETDIR/logs" \
|
||||
"$TARGETDIR/ExtensionCrawler/sge/grepper.sge" \
|
||||
"$TARGETDIR/grepper.sge" \
|
||||
$*
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
#!/bin/sh
|
||||
# Copyright 2017 The University of Sheffield, UK
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Refuse to overwrite an existing image: the user has to move it away first.
if [ -f excrawl.img ]; then
    echo "Image excrawl.img exists already."
    echo "Please remove/rename the image and restart this script"
    exit 1
fi

# Create an empty image file and populate it from the recipe excrawl.def
# (the bootstrap step is run through sudo).
singularity create --size 600 excrawl.img
sudo singularity bootstrap excrawl.img excrawl.def
|
|
@ -0,0 +1,116 @@
|
|||
#!/bin/sh
|
||||
# Copyright 2017 The University of Sheffield, UK
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
bootstrap:debootstrap
|
||||
OSVersion: testing
|
||||
MirrorURL: https://mirror.bytemark.co.uk/debian/
|
||||
|
||||
%labels
|
||||
Maintainer The LogicalHacking Team (https://logicalhacking.com)
|
||||
|
||||
%setup
|
||||
|
||||
%post
|
||||
|
||||
###################################################################
|
||||
# Add Debian unstable as a secondary (lower priority) source
|
||||
# and update the database of available packages.
|
||||
cat >> /etc/apt/sources.list << EOF
|
||||
deb http://ftp.us.debian.org/debian unstable main
|
||||
EOF
|
||||
|
||||
cat > /etc/apt/preferences << EOF
|
||||
Package: *
|
||||
Pin: release a=testing
|
||||
Pin-Priority: 900
|
||||
|
||||
Package: *
|
||||
Pin: release a=unstable
|
||||
Pin-Priority: 800
|
||||
EOF
|
||||
|
||||
cat > /etc/apt/apt.conf.d/01norecommend << EOF
|
||||
APT::Install-Recommends "0";
|
||||
APT::Install-Suggests "0";
|
||||
EOF
|
||||
|
||||
chmod go+r /etc/apt/preferences
|
||||
apt-get update
|
||||
###################################################################
|
||||
|
||||
###################################################################
|
||||
# Configure locales
|
||||
apt-get install -y locales
|
||||
echo "en_GB.UTF-8 UTF-8" >> /etc/locale.gen
|
||||
locale-gen en_GB.UTF-8
|
||||
echo "LANG=en_GB.UTF-8" > /etc/default/locale
|
||||
###################################################################
|
||||
|
||||
###################################################################
|
||||
# Install the core dependencies (Python 3.5 or later)
|
||||
# from the Debian Testing repository
|
||||
apt-get install -y python3-magic python3-crypto python3-minimal python3-pip python3-setuptools python3-mysqldb
|
||||
apt-get clean
|
||||
apt-get install -y git
|
||||
apt-get clean
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
###################################################################
|
||||
|
||||
###################################################################
|
||||
# Create /opt for local software (mainly cloned git repositories
|
||||
# from logicalhacking.com)
|
||||
mkdir -p /opt
|
||||
chmod 755 /opt
|
||||
###################################################################
|
||||
|
||||
###################################################################
|
||||
# Add the Extension Crawler repository, for more details, visit
|
||||
# https://git.logicalhacking.com/BrowserSecurity/ExtensionCrawler
|
||||
cd /opt
|
||||
git clone https://git.logicalhacking.com/BrowserSecurity/ExtensionCrawler.git
|
||||
cd ExtensionCrawler
|
||||
git checkout production
|
||||
cd ..
|
||||
pip3 install --system -e ExtensionCrawler
|
||||
ln -s /opt/ExtensionCrawler/crx-* /usr/local/bin/
|
||||
cd /
|
||||
chmod -R go+u-w /opt/ExtensionCrawler
|
||||
chmod -R go+u-w /usr/local/lib/
|
||||
chmod -R go+u-w /usr/local/bin/
|
||||
###################################################################
|
||||
|
||||
###################################################################
|
||||
# Create mount/bind points for the various network drives
|
||||
# on SHARC (only useful when using the Singularity image on
|
||||
# the High-Performance Cluster of The University of Sheffield)
|
||||
mkdir /scratch
|
||||
mkdir /fastdata
|
||||
mkdir /data
|
||||
mkdir /shared
|
||||
|
||||
# Create nvidia driver directories to get rid of the singularity
|
||||
# warnings on sharc
|
||||
mkdir /nvbin
|
||||
mkdir /nvlib
|
||||
chmod go+u-w /scratch /fastdata /data /shared
|
||||
###################################################################
|
||||
|
||||
%environment
|
||||
|
||||
# We install all python modules into the container, so we do not want
|
||||
# to use any packages that the user might have installed in their home
|
||||
# directory.
|
||||
export PYTHONNOUSERSITE=1
|
Loading…
Reference in New Issue