Merge branch 'master' of logicalhacking.com:BrowserSecurity/ExtensionCrawler
This commit is contained in:
commit
0248f30d04
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python3.6
|
||||
#!/usr/bin/env python3.7
|
||||
#
|
||||
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
||||
#
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python3.6
|
||||
#!/usr/bin/env python3.7
|
||||
#
|
||||
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
||||
#
|
||||
|
|
|
@ -40,7 +40,7 @@ They will use the host, database, and credentials found in `~/.my.cnf`.
|
|||
Since they make use of various JSON features, it is recommended to use at
|
||||
least version 10.2.8 of MariaDB.
|
||||
|
||||
All utilities are written in Python 3.6. The required modules are listed
|
||||
All utilities are written in Python 3.7. The required modules are listed
|
||||
in the file `requirements.txt`.
|
||||
|
||||
## Installation
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python3.6
|
||||
#!/usr/bin/env python3.7
|
||||
#
|
||||
# Copyright (C) 2018 The University of Sheffield, UK
|
||||
#
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python3.6
|
||||
#!/usr/bin/env python3.7
|
||||
#
|
||||
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
||||
#
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python3.6
|
||||
#!/usr/bin/env python3.7
|
||||
#
|
||||
# Copyright (C) 2018 The University of Sheffield, UK
|
||||
#
|
||||
|
|
3
crawler
3
crawler
|
@ -37,6 +37,7 @@ from ExtensionCrawler.util import log_info, log_exception, setup_logger
|
|||
def write_log(dirname, fname, text):
|
||||
"""Write text into the file with name fname in directory dirname."""
|
||||
os.makedirs(dirname, exist_ok=True)
|
||||
fname = fname.replace(":", "_")
|
||||
with open(os.path.join(dirname, fname), 'w') as logfile:
|
||||
logfile.write(text)
|
||||
|
||||
|
@ -191,7 +192,7 @@ def main(argv):
|
|||
conf_dir = os.path.join(basedir, "conf")
|
||||
os.makedirs(conf_dir, exist_ok=True)
|
||||
open(os.path.join(conf_dir, "forums.conf"), 'a').close()
|
||||
log_dir = os.path.join(basedir, "log")
|
||||
log_dir = os.path.join(basedir, "log",datetime.datetime.today().strftime("%Y-%m"))
|
||||
os.makedirs(log_dir, exist_ok=True)
|
||||
|
||||
start_time = time.time()
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python3.6
|
||||
#!/usr/bin/env python3.7
|
||||
#
|
||||
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
||||
#
|
||||
|
|
|
@ -8,7 +8,7 @@ CRAWLERHOME=${2:-~/ExtensionCrawler}
|
|||
IMAGE=${3:-/shared/brucker_research1/Shared/BrowserExtensions/bin/ExtensionCrawler.img}
|
||||
LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"`
|
||||
mkdir -p $LOGDIR
|
||||
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g'`
|
||||
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
|
||||
LOG=$LOGPREFIX-global.log
|
||||
|
||||
date --utc +'* Start Updating Extensions Archive (%c)' | tee $LOG
|
||||
|
|
|
@ -25,8 +25,8 @@ case $key in
|
|||
esac
|
||||
done
|
||||
|
||||
LATESTLOG=`ls $ARCHIVE/log/*/*0.log $ARCHIVE/log/*0.log | tail -n 1`
|
||||
LATESTGLOBALLOG=`ls $ARCHIVE/log/*/*-global.log $ARCHIVE/log/*-global.log| tail -n 1`
|
||||
LATESTLOG=`ls $ARCHIVE/log/*/*0.log | tail -n 1`
|
||||
LATESTGLOBALLOG=`ls $ARCHIVE/log/*/*-global.log | tail -n 1`
|
||||
BASEDIR=$(dirname "$0")
|
||||
|
||||
PIDS=""
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
#!/bin/bash
|
||||
|
||||
ACTION=${1:-MAIN}
|
||||
ARCHIVE=${2:-/srv/Shared/BrowserExtensions/archive}
|
||||
|
||||
LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"`
|
||||
mkdir -p $LOGDIR
|
||||
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
|
||||
LOG=${3:-$LOGPREFIX-maintain-archive-$ACTION.log}
|
||||
|
||||
SELF=$0
|
||||
SRC=$4
|
||||
|
||||
next_generation(){
|
||||
local src=$1
|
||||
local filebase=`basename $src .tar`
|
||||
local dir=`dirname $src`
|
||||
|
||||
# Check next free file name:
|
||||
if ls $dir/$filebase.[0-9][0-9][0-9].tar.xz &> /dev/null; then
|
||||
latest=`ls $dir/$filebase.[0-9][0-9][0-9].tar.xz | \
|
||||
sort -r | head -1 | \
|
||||
sed -e "s/.*\([0-9][0-9][0-9]\).tar.xz/\1/"`
|
||||
next=`printf %03d $((latest+1))`
|
||||
else
|
||||
next=000
|
||||
fi
|
||||
|
||||
dest=$dir/$filebase.$next.tar
|
||||
echo "Processing: $src -> $dest" | tee -a $LOG
|
||||
mv -n $src $dest
|
||||
if [ ! -f $src ]; then
|
||||
tar -cf $src -T /dev/null
|
||||
if [ ! -f $src ]; then
|
||||
echo "ERROR: cannot create empty tar archive ($src)" | tee -a $LOG
|
||||
fi
|
||||
else
|
||||
echo "ERROR: old archive exists ($src)" | tee -a $LOG
|
||||
fi
|
||||
}
|
||||
|
||||
zge_compress(){
|
||||
mkdir -p $LOG.dir
|
||||
find $ARCHIVE/data/ \
|
||||
-type d \
|
||||
-name "[a-p][a-p][a-p]" \
|
||||
-exec qsub -o $LOG.dir `dirname $SELF`/xz.sge {} \;
|
||||
}
|
||||
|
||||
main(){
|
||||
find $ARCHIVE/data/ \
|
||||
-name "[a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p].tar" \
|
||||
-exec $SELF MOVE $ARCHIVE $LOG {} \;
|
||||
}
|
||||
|
||||
case "$ACTION" in
|
||||
MAIN)
|
||||
main;;
|
||||
MOVE)
|
||||
next_generation $SRC;;
|
||||
COMPRESS)
|
||||
zge_compress;;
|
||||
esac
|
|
@ -0,0 +1,10 @@
|
|||
#!/bin/bash
|
||||
#$ -V
|
||||
#$ -l rmem=2G
|
||||
#$ -j yes
|
||||
set -o nounset
|
||||
set -x
|
||||
|
||||
find $1 \
|
||||
-name "[a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p].[0-9][0-9][0-9].tar" \
|
||||
-exec xz {} \;
|
|
@ -2,7 +2,10 @@
|
|||
|
||||
ARCHIVE=${1:-/srv/Shared/BrowserExtensions/archive}
|
||||
TMPDIR=${TMPDIR:-/tmp}
|
||||
LOGPREFIX=$ARCHIVE/log/`date --utc --iso-8601=ns`
|
||||
|
||||
LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"`
|
||||
mkdir -p $LOGDIR
|
||||
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
|
||||
LOG=$LOGPREFIX-cdnjs.log
|
||||
|
||||
SING_IMG=/shared/brucker_research1/Shared/BrowserExtensions/archive/filedb/ExtensionCrawler-cdnjs.img
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python3.6
|
||||
#!/usr/bin/env python3.7
|
||||
#
|
||||
# Copyright (C) 2018 The University of Sheffield, UK
|
||||
#
|
||||
|
|
|
@ -1,180 +0,0 @@
|
|||
#!/bin/sh
|
||||
# Copyright 2017 The University of Sheffield, UK
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
Bootstrap: docker
|
||||
From: debian
|
||||
|
||||
%labels
|
||||
Maintainer The LogicalHacking Team (https://logicalhacking.com)
|
||||
|
||||
%setup
|
||||
|
||||
%files
|
||||
.. /opt/ExtensionCrawler
|
||||
|
||||
%post
|
||||
|
||||
###################################################################
|
||||
# Add Debian unstable as a secondary (lower priority) source
|
||||
# and update the data base of available packages.
|
||||
cat >> /etc/apt/sources.list << EOF
|
||||
deb http://ftp.us.debian.org/debian unstable main
|
||||
EOF
|
||||
|
||||
cat > /etc/apt/preferences << EOF
|
||||
Package: *
|
||||
Pin: release a=testing
|
||||
Pin-Priority: 900
|
||||
|
||||
Package: *
|
||||
Pin: release a=unstable
|
||||
Pin-Priority: 800
|
||||
EOF
|
||||
|
||||
cat > /etc/apt/apt.conf.d/01norecommend << EOF
|
||||
APT::Install-Recommends "0";
|
||||
APT::Install-Suggests "0";
|
||||
EOF
|
||||
|
||||
chmod go+r /etc/apt/preferences
|
||||
apt-get update
|
||||
###################################################################
|
||||
|
||||
###################################################################
|
||||
# Configure locales
|
||||
apt-get install -y locales
|
||||
echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen
|
||||
echo "en_GB.UTF-8 UTF-8" >> /etc/locale.gen
|
||||
locale-gen
|
||||
echo "LANG=en_US.UTF-8" > /etc/default/locale
|
||||
###################################################################
|
||||
|
||||
###################################################################
|
||||
# Install the core dependencies (Python 3.6 or later)
|
||||
# from the Debian Testing repository
|
||||
apt-get install -y python3-magic python3-crypto python3-minimal python3-pip python3-setuptools python3-mysqldb python3-jsbeautifier python3-tabulate
|
||||
apt-get install -y build-essential libgmp3-dev python3-dev # For pycryptodome
|
||||
apt-get clean
|
||||
apt-get install -y git
|
||||
apt-get clean
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
###################################################################
|
||||
|
||||
###################################################################
|
||||
# Create /opt for local software (mainly cloned git repositories
|
||||
# from logicalhacking.com)
|
||||
mkdir -p /opt
|
||||
chmod 755 /opt
|
||||
###################################################################
|
||||
|
||||
###################################################################
|
||||
# Add the Extension Crawler repository, for more details, visit
|
||||
# https://git.logicalhacking.com/BrowserSecurity/ExtensionCrawler
|
||||
cd /opt
|
||||
# git clone https://git.logicalhacking.com/BrowserSecurity/ExtensionCrawler.git
|
||||
# cd ExtensionCrawler
|
||||
# git checkout production
|
||||
# cd ..
|
||||
pip3 install wheel # simhash needs wheel to build properly, still works without it though
|
||||
pip3 install --system -e ExtensionCrawler
|
||||
cd /
|
||||
chmod -R go+u-w /opt/ExtensionCrawler
|
||||
chmod -R go+u-w /usr/local/lib/
|
||||
chmod -R go+u-w /usr/local/bin/
|
||||
###################################################################
|
||||
|
||||
###################################################################
|
||||
# Clone the cdnjs repository or create a link to the external archive dir
|
||||
ARCHIVE=/shared/brucker_research1/Shared/BrowserExtensions/archive
|
||||
case ${SINGULARITY_IMAGE} in
|
||||
*-cdnjs.img)
|
||||
mkdir -p /opt/archive/filedb
|
||||
cd /opt/archive/filedb
|
||||
git clone https://github.com/cdnjs/cdnjs.git cdnjs-git
|
||||
cd cdnjs-git
|
||||
git pull
|
||||
ln -s ${ARCHIVE}/conf . > /dev/null
|
||||
ln -s ${ARCHIVE}/data > /dev/null
|
||||
ln -s ${ARCHIVE}/log > /dev/null
|
||||
;;
|
||||
*)
|
||||
cd /opt/
|
||||
ln -s ${ARCHIVE} .
|
||||
;;
|
||||
esac
|
||||
chmod -R go+u /opt
|
||||
###################################################################
|
||||
|
||||
###################################################################
|
||||
# Create mount/bind points for the various network drives
|
||||
# on SHARC (only useful when using the Singularity image on
|
||||
# the High-Performance Cluster of The University of Sheffield)
|
||||
mkdir /scratch
|
||||
mkdir /fastdata
|
||||
mkdir /data
|
||||
mkdir /shared
|
||||
|
||||
# Create nvidia driver directories to get rid of the singularity
|
||||
# warnings on sharc
|
||||
mkdir /nvbin
|
||||
mkdir /nvlib
|
||||
chmod go+u-w /scratch /fastdata /data /shared
|
||||
###################################################################
|
||||
|
||||
%environment
|
||||
|
||||
export EXTENSION_ARCHIVE=/opt/archive
|
||||
export PATH=/opt/ExtensionCrawler/:${PATH}
|
||||
|
||||
# We install all python modules into the container, so we do not want
|
||||
# to use any packages that the user might have installed in their home
|
||||
# directory.
|
||||
export PYTHONNOUSERSITE=1
|
||||
|
||||
%runscript
|
||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
# this text will get copied to /singularity and will run whenever the container
|
||||
# is called as an executable
|
||||
usage() {
|
||||
cat <<EOF
|
||||
NAME
|
||||
ExtensionCrawler
|
||||
SYNOPSIS
|
||||
ExtensionCrawler tool [tool options]
|
||||
ExtensionCrawler list
|
||||
ExtensionCrawler help
|
||||
DESCRIPTION
|
||||
A collection of utilities for downloading and analyzing browser extension
|
||||
from the Chrome Web store.
|
||||
ENVIRIONMENT
|
||||
EXTENSION_ARCHIVE=${EXTENSION_ARCHIVE}
|
||||
EOF
|
||||
}
|
||||
|
||||
tools() {
|
||||
find /opt/ExtensionCrawler -maxdepth 1 -executable -type f -exec sh -c "{} -h | head -n 1" \;
|
||||
}
|
||||
|
||||
arg="${1:-none}"
|
||||
|
||||
case "$arg" in
|
||||
none) usage; exit 1;;
|
||||
help) usage; exit 0;;
|
||||
list) tools; exit 0;;
|
||||
# just try to execute it then
|
||||
*) $@;;
|
||||
esac
|
||||
|
|
@ -64,7 +64,10 @@ else
|
|||
fi
|
||||
|
||||
BINDIR=$(dirname "$ARCHIVE")/bin
|
||||
LOGPREFIX=$ARCHIVE/log/`date --utc --iso-8601=ns`
|
||||
|
||||
LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"`
|
||||
mkdir -p $LOGDIR
|
||||
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g'`
|
||||
LOG="$LOGPREFIX-$IMAGE.log"
|
||||
|
||||
if [ -f ${IMAGE} ]; then
|
||||
|
|
Loading…
Reference in New Issue