ExtensionCrawler/grepper

146 lines
4.8 KiB
Python
Executable File

#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import getopt
import os
import sys
import glob
import tarfile
import traceback
import jsbeautifier
from multiprocessing import Pool
from zipfile import ZipFile
from functools import partial
import re
from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
def help():
    """Print command-line usage information for grepper to stdout."""
    # NOTE: intentionally shadows the builtin help(); this script's
    # option handling calls it by this name.
    usage_lines = (
        "grepper [OPTION] BASEDIR GREP1 [GREP2...]",
        " GREP1 [GREP2...] regex patterns",
        " BASEDIR directory for output",
        " -h print this help text",
        " -b beautify JavaScript before matching",
        " -a <DIR> archive directory",
        " -p <PREFIX> three-letter-prefix",
        " -t <THREADS> number of threads to use",
    )
    for usage_line in usage_lines:
        print(usage_line)
def process_crx(ext_id, date, crx, greps, beautify, out_f):
    """Grep the JavaScript files inside one crx (zip) archive.

    Scans every ``*.js`` member of *crx* line by line against each regex
    in *greps* and writes one pipe-separated record per match to *out_f*
    in the form ``ext_id|date|pattern|file (line N)|matching line``.

    When *beautify* is true the JavaScript is run through jsbeautifier
    before matching, so reported line numbers refer to the beautified
    text.
    """
    with ZipFile(crx) as archive:
        js_members = (m for m in archive.infolist()
                      if m.filename.endswith(".js"))
        for member in js_members:
            with archive.open(member) as js_file:
                # surrogateescape keeps undecodable bytes round-trippable
                # instead of raising on odd encodings.
                text = js_file.read().decode(errors="surrogateescape")
                if beautify:
                    text = jsbeautifier.beautify(text)
                for lineno, line in enumerate(text.splitlines(), start=1):
                    for pattern in greps:
                        if re.search(pattern, line):
                            location = (member.filename + " (line " +
                                        str(lineno) + ")")
                            record = [ext_id, date, pattern, location, line]
                            print("|".join(record), file=out_f)
def process_id(archivedir, outdir, greps, beautify, verbose, ext_id):
    """Grep every crx version of one extension id.

    Opens the extension's tar archive under *archivedir*, runs
    process_crx over each non-empty ``*.crx`` member, and writes the
    matches to ``<outdir>/<ext_id>.grep`` (replacing any existing file).

    Returns the accumulated log text produced via logmsg; exceptions
    from individual crx members are logged and skipped so one corrupt
    archive member does not abort the whole extension.
    """
    txt = ""
    txt = logmsg(verbose, txt, "Processing {} ...\n".format(ext_id))
    tarpath = archive_file(archivedir, ext_id)
    greppath = os.path.join(outdir, ext_id + ".grep")
    if os.path.exists(greppath):
        os.remove(greppath)
    with open(greppath, "w") as out_f:
        with tarfile.open(tarpath, 'r') as t:
            for tar_info in t.getmembers():
                # Bug fix: was `tar_info.size is 0`, an identity check that
                # only worked via CPython's small-int caching (and warns on
                # Python 3.8+); use a proper equality comparison.
                if not tar_info.name.endswith(".crx") or tar_info.size == 0:
                    continue
                # Archive layout is <id>/<date>/....crx, so the second
                # path component is the version date.
                date = tar_info.name.split("/")[1]
                try:
                    with t.extractfile(tar_info) as crx:
                        process_crx(ext_id, date, crx, greps, beautify, out_f)
                except Exception:
                    # Best effort: record the failure and continue with the
                    # remaining archive members.
                    txt = logmsg(
                        verbose, txt,
                        "Exception when handling {}:\n".format(tar_info.name))
                    txt = logmsg(verbose, txt, traceback.format_exc())
    return txt
def main(argv):
    """Parse command-line options and grep matching extension archives.

    *argv* is the command-line argument list without the program name.
    Positional arguments are the output directory followed by one or
    more regex patterns; extensions under the archive's three-letter
    directories (optionally filtered by prefix) are processed in a
    multiprocessing pool.  Exits with status 2 on usage errors.
    """
    archive = "archive"
    prefix = ""
    beautify = False
    parallel = 8
    try:
        opts, args = getopt.getopt(
            argv, "ha:p:bt:", ["archive=", "prefix=", "beautify", "threads="])
    except getopt.GetoptError:
        help()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-p", "--prefix"):
            prefix = arg
        elif opt in ("-b", "--beautify"):
            beautify = True
        elif opt in ("-t", "--threads"):
            parallel = int(arg)
    if len(args) < 2:
        help()
        sys.exit(2)
    outdir = args[0]
    greps = args[1:]
    archivedir = os.path.join(archive, "data")
    threeletterdirs = glob.glob(os.path.join(archivedir, prefix + "*"))
    print("Using archive '{}'".format(archive))
    print("Using prefix '{}'".format(prefix))
    print("Using beautifier? '{}'".format(beautify))
    print("Using {} threads".format(parallel))
    print("Found {} three-letter-dirs".format(len(threeletterdirs)))
    for threeletterdir in threeletterdirs:
        # Extension ids are the first 32 characters of each entry name;
        # dedupe because several files may belong to the same id.
        ext_ids = list(set([d[:32] for d in os.listdir(threeletterdir)]))
        with Pool(parallel) as p:
            for txt in p.imap(
                    partial(process_id, archivedir, outdir, greps, beautify,
                            True), ext_ids):
                sys.stdout.write(txt)
                sys.stdout.flush()
    # Bug fix: corrected typo in the completion message ("Sucessfully").
    print("Successfully finished grepping")
# Script entry point: forward the command-line arguments (without the
# program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])