Merge branch 'master' of logicalhacking.com:BrowserSecurity/ExtensionCrawler

This commit is contained in:
Michael Herzberg 2019-02-13 22:52:12 +00:00
commit 0248f30d04
16 changed files with 94 additions and 194 deletions

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python3.6
#!/usr/bin/env python3.7
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python3.6
#!/usr/bin/env python3.7
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#

View File

@ -40,7 +40,7 @@ They will use the host, database, and credentials found in `~/.my.cnf`.
Since they make use of various JSON features, it is recommended to use at
least version 10.2.8 of MariaDB.
All utilities are written in Python 3.6. The required modules are listed
All utilities are written in Python 3.7. The required modules are listed
in the file `requirements.txt`.
## Installation

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python3.6
#!/usr/bin/env python3.7
#
# Copyright (C) 2018 The University of Sheffield, UK
#

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python3.6
#!/usr/bin/env python3.7
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python3.6
#!/usr/bin/env python3.7
#
# Copyright (C) 2018 The University of Sheffield, UK
#

View File

@ -37,6 +37,7 @@ from ExtensionCrawler.util import log_info, log_exception, setup_logger
def write_log(dirname, fname, text):
"""Write text into the file with name fname in directory dirname."""
os.makedirs(dirname, exist_ok=True)
fname = fname.replace(":", "_")
with open(os.path.join(dirname, fname), 'w') as logfile:
logfile.write(text)
@ -191,7 +192,7 @@ def main(argv):
conf_dir = os.path.join(basedir, "conf")
os.makedirs(conf_dir, exist_ok=True)
open(os.path.join(conf_dir, "forums.conf"), 'a').close()
log_dir = os.path.join(basedir, "log")
log_dir = os.path.join(basedir, "log",datetime.datetime.today().strftime("%Y-%m"))
os.makedirs(log_dir, exist_ok=True)
start_time = time.time()

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python3.6
#!/usr/bin/env python3.7
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#

View File

@ -8,7 +8,7 @@ CRAWLERHOME=${2:-~/ExtensionCrawler}
IMAGE=${3:-/shared/brucker_research1/Shared/BrowserExtensions/bin/ExtensionCrawler.img}
LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"`
mkdir -p $LOGDIR
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g'`
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
LOG=$LOGPREFIX-global.log
date --utc +'* Start Updating Extensions Archive (%c)' | tee $LOG

View File

@ -25,8 +25,8 @@ case $key in
esac
done
LATESTLOG=`ls $ARCHIVE/log/*/*0.log $ARCHIVE/log/*0.log | tail -n 1`
LATESTGLOBALLOG=`ls $ARCHIVE/log/*/*-global.log $ARCHIVE/log/*-global.log| tail -n 1`
LATESTLOG=`ls $ARCHIVE/log/*/*0.log | tail -n 1`
LATESTGLOBALLOG=`ls $ARCHIVE/log/*/*-global.log | tail -n 1`
BASEDIR=$(dirname "$0")
PIDS=""

View File

@ -0,0 +1,63 @@
#!/bin/bash
# Maintenance helper for the extension archive.
#
# Positional arguments (all optional):
#   $1  ACTION   one of MAIN, MOVE or COMPRESS (default: MAIN)
#   $2  ARCHIVE  root directory of the archive
#   $3  LOG      log file to append to (default: a fresh, timestamped file
#                below $ARCHIVE/log/<YYYY-MM>/)
#   $4  SRC      tar file to rotate; only read by the MOVE action
ACTION=${1:-MAIN}
ARCHIVE=${2:-/srv/Shared/BrowserExtensions/archive}
# Logs are grouped into one directory per (UTC) month.
LOGDIR=$ARCHIVE/log/$(date --utc +"%Y-%m")
mkdir -p -- "$LOGDIR"
# ISO-8601 timestamp with ':' replaced by '_' and the first ',' by '.',
# so that the resulting file name contains neither character.
LOGPREFIX=$LOGDIR/$(date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./')
LOG=${3:-$LOGPREFIX-maintain-archive-$ACTION.log}
# Remember how this script was invoked: main() re-executes it via find -exec
# and zge_compress() looks for xz.sge next to it.
SELF=$0
SRC=$4
#######################################
# Rotate a working tar archive into its next free numbered generation and
# leave a fresh, empty tar archive in its place.
#
# The generation number is one more than the highest existing
# "<name>.NNN.tar.xz" next to the archive, or 000 if there is none.
# Note: beyond generation 999 the number no longer fits the three-digit
# pattern and will not be picked up by later runs.
#
# Globals:   LOG (read) - log file that messages are appended to
# Arguments: $1 - path of the working archive ("<dir>/<name>.tar")
# Outputs:   progress and error messages on stdout and in $LOG
#######################################
next_generation() {
  local src=$1
  local filebase dir latest next dest candidate
  filebase=$(basename -- "$src" .tar)
  dir=$(dirname -- "$src")

  # Let the shell glob instead of parsing ls output. Glob results are sorted
  # and the counter has a fixed width, so the last match is the newest one.
  latest=""
  for candidate in "$dir/$filebase".[0-9][0-9][0-9].tar.xz; do
    # An unmatched glob stays literal; skip it.
    [ -e "$candidate" ] || continue
    latest=$candidate
  done

  if [ -n "$latest" ]; then
    latest=${latest%.tar.xz}
    latest=${latest##*.}
    # Force base 10: a leading zero would otherwise make bash read the
    # counter as octal and fail on 008 and 009.
    next=$(printf '%03d' "$((10#$latest + 1))")
  else
    next=000
  fi

  dest=$dir/$filebase.$next.tar
  echo "Processing: $src -> $dest" | tee -a "$LOG"
  # -n: never overwrite an uncompressed generation that is still around.
  mv -n -- "$src" "$dest"
  if [ ! -f "$src" ]; then
    # The move succeeded; recreate the working archive as an empty tar file.
    tar -cf "$src" -T /dev/null
    if [ ! -f "$src" ]; then
      echo "ERROR: cannot create empty tar archive ($src)" | tee -a "$LOG"
    fi
  else
    echo "ERROR: old archive exists ($src)" | tee -a "$LOG"
  fi
}
#######################################
# Submit one xz.sge job (via qsub) for every directory below $ARCHIVE/data/
# whose name consists of exactly three letters in the range a-p.
#
# Globals:   ARCHIVE (read), LOG (read), SELF (read)
# Outputs:   creates "$LOG.dir", which is passed to qsub as its -o argument
#######################################
zge_compress() {
  local script_dir
  mkdir -p -- "$LOG.dir"
  # xz.sge is expected to live next to this script.
  script_dir=$(dirname -- "$SELF")
  find "$ARCHIVE/data/" \
    -type d \
    -name "[a-p][a-p][a-p]" \
    -exec qsub -o "$LOG.dir" "$script_dir/xz.sge" {} \;
}
#######################################
# Rotate every working archive below $ARCHIVE/data/.
#
# A working archive is any entry whose name is exactly 32 letters in the
# range a-p followed by ".tar" (presumably an extension id - confirm).
# For each one this script re-invokes itself as
#   $SELF MOVE $ARCHIVE $LOG <path>
# so that all rotations append to the same log file.
#
# Globals:   ARCHIVE (read), LOG (read), SELF (read)
#######################################
main() {
  find "$ARCHIVE/data/" \
    -name "[a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p].tar" \
    -exec "$SELF" MOVE "$ARCHIVE" "$LOG" {} \;
}
# Dispatch on the requested action.
case "$ACTION" in
  MAIN)
    main
    ;;
  MOVE)
    # Invoked by main() through find -exec, once per archive.
    next_generation "$SRC"
    ;;
  COMPRESS)
    zge_compress
    ;;
  *)
    # Report a mistyped action instead of silently doing nothing.
    echo "ERROR: unknown action '$ACTION' (expected MAIN, MOVE or COMPRESS)" >&2
    exit 1
    ;;
esac

View File

@ -0,0 +1,10 @@
#!/bin/bash
#$ -V
#$ -l rmem=2G
#$ -j yes
# Grid Engine job: xz-compress every rotated archive generation found
# below the directory given as $1.
# The "#$" lines above are qsub options (export the submission environment,
# request the rmem=2G resource, merge stderr into stdout) - NOTE(review):
# the exact meaning of "rmem" is site-specific; confirm against the cluster
# documentation.
set -o nounset
set -x
# Matches "<32 letters a-p>.NNN.tar", i.e. generations that have been rotated
# but not yet compressed. The quoting keeps directories with whitespace intact.
find "$1" \
  -name "[a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p].[0-9][0-9][0-9].tar" \
  -exec xz {} \;

View File

@ -2,7 +2,10 @@
ARCHIVE=${1:-/srv/Shared/BrowserExtensions/archive}
TMPDIR=${TMPDIR:-/tmp}
LOGPREFIX=$ARCHIVE/log/`date --utc --iso-8601=ns`
LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"`
mkdir -p $LOGDIR
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
LOG=$LOGPREFIX-cdnjs.log
SING_IMG=/shared/brucker_research1/Shared/BrowserExtensions/archive/filedb/ExtensionCrawler-cdnjs.img

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python3.6
#!/usr/bin/env python3.7
#
# Copyright (C) 2018 The University of Sheffield, UK
#

View File

@ -1,180 +0,0 @@
#!/bin/sh
# Copyright 2017 The University of Sheffield, UK
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Bootstrap: docker
From: debian
%labels
Maintainer The LogicalHacking Team (https://logicalhacking.com)
%setup
%files
.. /opt/ExtensionCrawler
%post
###################################################################
# Add Debian unstable as a secondary (lower priority) source
# and update the data base of available packages.
cat >> /etc/apt/sources.list << EOF
deb http://ftp.us.debian.org/debian unstable main
EOF
cat > /etc/apt/preferences << EOF
Package: *
Pin: release a=testing
Pin-Priority: 900
Package: *
Pin: release a=unstable
Pin-Priority: 800
EOF
cat > /etc/apt/apt.conf.d/01norecommend << EOF
APT::Install-Recommends "0";
APT::Install-Suggests "0";
EOF
chmod go+r /etc/apt/preferences
apt-get update
###################################################################
###################################################################
# Configure locales
apt-get install -y locales
echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen
echo "en_GB.UTF-8 UTF-8" >> /etc/locale.gen
locale-gen
echo "LANG=en_US.UTF-8" > /etc/default/locale
###################################################################
###################################################################
# Install the core dependencies (Python 3.6 or later)
# from the Debian Testing repository
apt-get install -y python3-magic python3-crypto python3-minimal python3-pip python3-setuptools python3-mysqldb python3-jsbeautifier python3-tabulate
apt-get install -y build-essential libgmp3-dev python3-dev # For pycryptodome
apt-get clean
apt-get install -y git
apt-get clean
rm -rf /var/lib/apt/lists/*
###################################################################
###################################################################
# Create /opt for local software (mainly cloned git repositories
# from logicalhacking.com)
mkdir -p /opt
chmod 755 /opt
###################################################################
###################################################################
# Add the Extension Crawler repository, for more details, visit
# https://git.logicalhacking.com/BrowserSecurity/ExtensionCrawler
cd /opt
# git clone https://git.logicalhacking.com/BrowserSecurity/ExtensionCrawler.git
# cd ExtensionCrawler
# git checkout production
# cd ..
pip3 install wheel # simhash needs wheel to build properly, still works without it though
pip3 install --system -e ExtensionCrawler
cd /
chmod -R go+u-w /opt/ExtensionCrawler
chmod -R go+u-w /usr/local/lib/
chmod -R go+u-w /usr/local/bin/
###################################################################
###################################################################
# Clone the cdnjs repository or create a link to the external archive dir
ARCHIVE=/shared/brucker_research1/Shared/BrowserExtensions/archive
case ${SINGULARITY_IMAGE} in
*-cdnjs.img)
mkdir -p /opt/archive/filedb
cd /opt/archive/filedb
git clone https://github.com/cdnjs/cdnjs.git cdnjs-git
cd cdnjs-git
git pull
ln -s ${ARCHIVE}/conf . > /dev/null
ln -s ${ARCHIVE}/data > /dev/null
ln -s ${ARCHIVE}/log > /dev/null
;;
*)
cd /opt/
ln -s ${ARCHIVE} .
;;
esac
chmod -R go+u /opt
###################################################################
###################################################################
# Create mount/bind points for the various network drives
# on SHARC (only useful when using the Singularity image on
# the High-Performance Cluster of The University of Sheffield)
mkdir /scratch
mkdir /fastdata
mkdir /data
mkdir /shared
# Create nvidia driver directories to get rid of the singularity
# warnings on sharc
mkdir /nvbin
mkdir /nvlib
chmod go+u-w /scratch /fastdata /data /shared
###################################################################
%environment
export EXTENSION_ARCHIVE=/opt/archive
export PATH=/opt/ExtensionCrawler/:${PATH}
# We install all python modules into the container, so we do not want
# to use any packages that the user might have installed in their home
# directory.
export PYTHONNOUSERSITE=1
%runscript
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# this text will get copied to /singularity and will run whenever the container
# is called as an executable
# Print the container's help text to stdout. The here-document is unquoted
# on purpose so that the current value of EXTENSION_ARCHIVE is shown.
usage() {
cat <<EOF
NAME
ExtensionCrawler
SYNOPSIS
ExtensionCrawler tool [tool options]
ExtensionCrawler list
ExtensionCrawler help
DESCRIPTION
A collection of utilities for downloading and analyzing browser extensions
from the Chrome Web Store.
ENVIRONMENT
EXTENSION_ARCHIVE=${EXTENSION_ARCHIVE}
EOF
}
# List the available tools: run every executable regular file directly inside
# /opt/ExtensionCrawler with -h and print the first line of its output.
tools() {
  # Pass the file name as a positional parameter instead of splicing it into
  # the command string, so names with spaces or shell metacharacters are
  # executed as-is rather than re-parsed as shell code.
  find /opt/ExtensionCrawler -maxdepth 1 -executable -type f \
    -exec sh -c '"$1" -h | head -n 1' sh {} \;
}
# Entry point of the container: dispatch on the first argument.
arg="${1:-none}"
case "$arg" in
  none) usage; exit 1 ;;
  help) usage; exit 0 ;;
  list) tools; exit 0 ;;
  # Anything else is treated as a command line and executed; the quoting
  # preserves arguments that contain whitespace or glob characters.
  *) "$@" ;;
esac

View File

@ -64,7 +64,10 @@ else
fi
BINDIR=$(dirname "$ARCHIVE")/bin
LOGPREFIX=$ARCHIVE/log/`date --utc --iso-8601=ns`
LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"`
mkdir -p $LOGDIR
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g'`
LOG="$LOGPREFIX-$IMAGE.log"
if [ -f ${IMAGE} ]; then