ExtensionCrawler/grepper

146 lines
4.8 KiB
Plaintext
Raw Permalink Normal View History

2017-07-05 19:55:41 +00:00
#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import getopt
import os
import sys
import glob
import tarfile
import traceback
2017-07-10 12:17:48 +00:00
import jsbeautifier
2017-07-05 19:55:41 +00:00
from multiprocessing import Pool
from zipfile import ZipFile
2017-07-31 13:19:52 +00:00
from functools import partial
import re
2017-07-05 19:55:41 +00:00
from ExtensionCrawler.config import *
2017-07-31 13:19:52 +00:00
from ExtensionCrawler.util import *
2017-07-05 19:55:41 +00:00
def help():
    """Print the command-line usage summary for grepper to stdout."""
    usage_lines = (
        "grepper [OPTION] BASEDIR GREP1 [GREP2...]",
        " GREP1 [GREP2...] regex patterns",
        " BASEDIR directory for output",
        " -h print this help text",
        " -b beautify JavaScript before matching",
        " -a <DIR> archive directory",
        " -p <PREFIX> three-letter-prefix",
        " -t <THREADS> number of threads to use",
    )
    # One print per entry keeps the output identical to separate print calls.
    for usage_line in usage_lines:
        print(usage_line)
def process_crx(ext_id, date, crx, greps, beautify, out_f):
    """Search every ``.js`` member of a CRX (zip) archive for regex matches.

    For each line of each JavaScript file and each pattern in ``greps`` that
    matches the line (via ``re.search``), one record is printed to ``out_f``
    in the form ``ext_id|date|pattern|filename (line N)|line``.

    Args:
        ext_id: Extension identifier, copied verbatim into each record.
        date: Date string, copied verbatim into each record.
        crx: Path or seekable binary file object accepted by ``ZipFile``.
        greps: Re-iterable collection of regex pattern strings.
        beautify: If true, run the source through ``jsbeautifier`` before
            splitting it into lines.
        out_f: Writable text stream receiving the records.
    """
    with ZipFile(crx) as archive:
        js_members = (member for member in archive.infolist()
                      if member.filename.endswith(".js"))
        for member in js_members:
            with archive.open(member) as js_file:
                # surrogateescape keeps undecodable bytes instead of raising.
                source = js_file.read().decode(errors="surrogateescape")
            if beautify:
                source = jsbeautifier.beautify(source)
            # Line numbers in the output are 1-based.
            for line_no, text in enumerate(source.splitlines(), start=1):
                for pattern in greps:
                    if re.search(pattern, text):
                        location = (member.filename + " (line " +
                                    str(line_no) + ")")
                        record = [ext_id, date, pattern, location, text]
                        print("|".join(record), file=out_f)
2017-08-14 13:40:10 +00:00
def process_id(archivedir, outdir, greps, beautify, verbose, ext_id):
    """Grep all archived CRX files of one extension into ``<ext_id>.grep``.

    Opens the extension's tar archive, and for every non-empty ``.crx``
    member calls ``process_crx`` so that matches are written to
    ``<outdir>/<ext_id>.grep``. An existing output file is removed first.
    Failures while handling a single member are recorded in the returned
    log text (together with the traceback) and do not stop the remaining
    members from being processed.

    Args:
        archivedir: Base directory handed to ``archive_file`` to locate the
            tar archive (helper defined outside this file; presumably it
            returns the tar path for ``ext_id`` -- confirm).
        outdir: Directory in which the ``.grep`` file is created.
        greps: Regex pattern strings forwarded to ``process_crx``.
        beautify: Forwarded to ``process_crx``.
        verbose: Forwarded to ``logmsg`` (helper defined outside this file;
            presumably it only appends messages when true -- confirm).
        ext_id: Extension identifier.

    Returns:
        The log text accumulated through ``logmsg``.
    """
    txt = ""
    txt = logmsg(verbose, txt, "Processing {} ...\n".format(ext_id))

    tarpath = archive_file(archivedir, ext_id)
    greppath = os.path.join(outdir, ext_id + ".grep")
    if os.path.exists(greppath):
        os.remove(greppath)

    # process_crx decodes JavaScript with errors="surrogateescape"; using the
    # same handler on output lets lines with undecodable bytes be written
    # back instead of raising UnicodeEncodeError and aborting that CRX.
    with open(greppath, "w", errors="surrogateescape") as out_f:
        with tarfile.open(tarpath, 'r') as t:
            for tar_info in t.getmembers():
                # Compare by value: "is 0" tests identity, not equality.
                if not tar_info.name.endswith(".crx") or tar_info.size == 0:
                    continue
                try:
                    # The second path component is used as the date; kept
                    # inside the try so an unexpected member name is logged
                    # rather than crashing the worker.
                    date = tar_info.name.split("/")[1]
                    with t.extractfile(tar_info) as crx:
                        process_crx(ext_id, date, crx, greps, beautify, out_f)
                except Exception:
                    txt = logmsg(
                        verbose, txt,
                        "Exception when handling {}:\n".format(tar_info.name))
                    txt = logmsg(verbose, txt, traceback.format_exc())

    return txt
def main(argv):
    """Parse command-line arguments and grep all matching extensions.

    Positional arguments are the output directory followed by one or more
    regex patterns. Options: ``-h`` (help), ``-a/--archive DIR``,
    ``-p/--prefix PREFIX``, ``-b/--beautify`` and ``-t/--threads N``.
    Prints help and exits with status 2 on invalid options, on an invalid
    thread count, or when fewer than two positional arguments are given.

    Args:
        argv: Argument list without the program name.
    """
    archive = "archive"
    prefix = ""
    beautify = False
    parallel = 8

    try:
        opts, args = getopt.getopt(
            argv, "ha:p:bt:", ["archive=", "prefix=", "beautify", "threads="])
    except getopt.GetoptError:
        help()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-p", "--prefix"):
            prefix = arg
        elif opt in ("-b", "--beautify"):
            beautify = True
        elif opt in ("-t", "--threads"):
            # Treat a non-integer or non-positive count like any other bad
            # option instead of failing later with a traceback.
            try:
                parallel = int(arg)
            except ValueError:
                help()
                sys.exit(2)
            if parallel < 1:
                help()
                sys.exit(2)

    if len(args) < 2:
        help()
        sys.exit(2)

    outdir = args[0]
    greps = args[1:]

    archivedir = os.path.join(archive, "data")
    threeletterdirs = glob.glob(os.path.join(archivedir, prefix + "*"))

    print("Using archive '{}'".format(archive))
    print("Using prefix '{}'".format(prefix))
    print("Using beautifier? '{}'".format(beautify))
    print("Using {} threads".format(parallel))
    print("Found {} three-letter-dirs".format(len(threeletterdirs)))

    # Verbose logging is always enabled for the workers.
    worker = partial(process_id, archivedir, outdir, greps, beautify, True)

    # One pool serves every directory; spawning a fresh pool per directory
    # only adds process start-up and tear-down cost.
    with Pool(parallel) as p:
        for threeletterdir in threeletterdirs:
            # Directory entries are reduced to their first 32 characters and
            # de-duplicated; sorting makes the processing order deterministic.
            ext_ids = sorted({d[:32] for d in os.listdir(threeletterdir)})
            for txt in p.imap(worker, ext_ids):
                sys.stdout.write(txt)
                sys.stdout.flush()

    print("Sucessfully finished grepping")
2017-07-05 19:55:41 +00:00
if __name__ == "__main__":
    # Run the CLI only when executed as a script; the program name is dropped.
    cli_args = sys.argv[1:]
    main(cli_args)