ExtensionCrawler/grepper

184 lines
5.6 KiB
Plaintext
Raw Normal View History

2017-07-05 19:55:41 +00:00
#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import getopt
import os
import sys
import tarfile
import traceback
2017-07-10 12:17:48 +00:00
import jsbeautifier
2017-08-23 15:52:18 +00:00
import fnmatch
from multiprocessing import Pool, Lock
2017-07-05 19:55:41 +00:00
from zipfile import ZipFile
2017-07-31 13:19:52 +00:00
from functools import partial
import re
2017-07-05 19:55:41 +00:00
def help():
    """Print the command-line usage summary of grepper to stdout."""
    usage_lines = (
        "grepper [OPTION] GREP [FILE]",
        " GREP regex pattern",
        " [FILE] path(s) to extension tar",
        " -h print this help text",
        " -b beautify JavaScript before matching",
        " -a <DIR> archive directory",
        " -p <PREFIX> three-letter-prefix",
        " -e <EXTIDFILELIST> file with extension ids",
        " -t <THREADS> number of threads to use",
        " -n <TASKID> process chunk n where n in [1,N]",
        " -N <MAXTASKID> ",
    )
    # One print() per entry, so the output is identical to a sequence of
    # individual print statements.
    for usage_line in usage_lines:
        print(usage_line)
def guarded_stdout(string):
    """Write *string* to stdout while holding the shared output lock.

    The lock is the module-level global ``lock`` (installed in each worker
    by ``init``); holding it keeps output written by concurrent workers
    from interleaving.

    Args:
        string: Text to write; it is written as-is, no newline is appended.

    Raises:
        Whatever ``sys.stdout.write`` raises (e.g. ``UnicodeEncodeError``).
    """
    # ``with`` releases the lock even when the write raises. A bare
    # acquire()/release() pair would leave the lock held after a failed
    # write and block every other worker forever.
    with lock:
        sys.stdout.write(string)
def guarded_stderr(string):
    """Write *string* to stderr while holding the shared output lock.

    Uses the same module-level global ``lock`` as ``guarded_stdout``, so
    diagnostics and results written by concurrent workers do not interleave.

    Args:
        string: Text to write; it is written as-is, no newline is appended.

    Raises:
        Whatever ``sys.stderr.write`` raises.
    """
    # ``with`` releases the lock even when the write raises; otherwise a
    # failed write would leave the lock held and stall all other workers.
    with lock:
        sys.stderr.write(string)
def process_crx(ext_id, date, crx, pattern, beautify):
    """Grep every ``.js`` member of one CRX (zip) archive for *pattern*.

    Each matching line is written through ``guarded_stdout`` as
    ``<ext_id>|<date>|<member name> (line <n>)|<line>`` followed by a
    newline, with 1-based line numbers.

    Args:
        ext_id: Extension id, used as the first output column.
        date: Crawl date string, used as the second output column.
        crx: Path or seekable binary file object accepted by ``ZipFile``.
        pattern: Regular expression (string or compiled) searched per line.
        beautify: If true, run the JavaScript through ``jsbeautifier``
            before splitting it into lines.

    Any exception (bad zip, invalid regex, beautifier failure, ...) is
    caught and reported with its traceback through ``guarded_stderr``; the
    function itself never raises ``Exception`` subclasses.
    """
    try:
        # Compile once per archive instead of looking the pattern up again
        # for every single line; kept inside the try so an invalid regex is
        # reported like any other per-archive failure.
        regex = re.compile(pattern)
        with ZipFile(crx) as z:
            for zip_file_info in z.infolist():
                if not zip_file_info.filename.endswith(".js"):
                    continue
                with z.open(zip_file_info) as f:
                    # surrogateescape keeps undecodable bytes instead of
                    # failing on files that are not valid in the default
                    # codec.
                    content = f.read().decode(errors="surrogateescape")
                if beautify:
                    content = jsbeautifier.beautify(content)
                for lineno, line in enumerate(content.splitlines(), start=1):
                    if regex.search(line) is None:
                        continue
                    args = [
                        ext_id, date, zip_file_info.filename + " (line " +
                        str(lineno) + ")", line
                    ]
                    guarded_stdout("|".join(args) + "\n")
    except Exception:
        guarded_stderr("Exception when handling {}, {}:\n{}".format(
            ext_id, date, traceback.format_exc()))
2017-07-31 13:19:52 +00:00
2017-07-05 19:55:41 +00:00
2017-08-23 15:52:18 +00:00
def process_id(pattern, beautify, path):
    """Grep all non-empty ``.crx`` members of one extension tar archive.

    Member names are split on ``/``; the first two components are passed to
    ``process_crx`` as extension id and date.

    Args:
        pattern: Regular expression forwarded to ``process_crx``.
        beautify: Beautify flag forwarded to ``process_crx``.
        path: Path of the tar archive to open.

    Any exception raised while reading the archive is caught and reported
    with its traceback through ``guarded_stderr``, so one broken archive
    does not abort the whole run.
    """
    try:
        with tarfile.open(path, 'r') as t:
            for tar_info in t.getmembers():
                # Compare the size by value: ``is 0`` tests object identity,
                # which only works by accident of small-int caching and is a
                # SyntaxWarning on current Python versions.
                if not tar_info.name.endswith(".crx") or tar_info.size == 0:
                    continue
                extid, date = tar_info.name.split("/")[:2]
                with t.extractfile(tar_info) as crx:
                    process_crx(extid, date, crx, pattern, beautify)
    except Exception:
        guarded_stderr("Exception when handling {}:\n{}".format(
            path, traceback.format_exc()))
2017-07-05 19:55:41 +00:00
2017-08-23 15:52:18 +00:00
def find(archive, pattern):
    """Yield paths of tar files below ``<archive>/data`` matching *pattern*.

    Args:
        archive: Archive root directory; only its ``data`` subtree is walked.
        pattern: fnmatch-style glob for the file name without the ``.tar``
            suffix (the suffix is appended here).

    Yields:
        The joined directory and file name of every matching file.
    """
    tar_glob = pattern + ".tar"
    data_dir = os.path.join(archive, "data")
    for dirpath, _dirnames, filenames in os.walk(data_dir):
        for matching_name in fnmatch.filter(filenames, tar_glob):
            yield os.path.join(dirpath, matching_name)
2017-07-05 19:55:41 +00:00
2017-08-23 15:52:18 +00:00
def find_from_file(archive, extidlistfile):
    """Yield tar files below ``<archive>/data`` for the ids listed in a file.

    Args:
        archive: Archive root directory; only its ``data`` subtree is walked.
        extidlistfile: Path of a text file with one extension id per line.
            Surrounding whitespace is stripped and blank lines are ignored.
            Entries are matched fnmatch-style against the file name with
            ``.tar`` appended, so glob characters in an entry are honoured.

    Yields:
        The path of each matching tar file, at most once per file even when
        the list contains duplicate or overlapping entries.
    """
    with open(extidlistfile, 'r') as f:
        # A set drops repeated ids; discarding "" avoids turning blank lines
        # into a bogus ".tar" pattern.
        extids = {line.strip() for line in f}
    extids.discard("")
    tar_globs = [extid + ".tar" for extid in extids]

    for root, _, files in os.walk(os.path.join(archive, "data")):
        for file in files:
            # any() stops at the first hit, so a file matched by several
            # entries is still yielded (and later grepped) only once.
            if any(fnmatch.fnmatch(file, tar_glob) for tar_glob in tar_globs):
                yield os.path.join(root, file)
def init(l):
    """Store *l* as the module-level global ``lock``.

    ``main`` passes this function as the ``Pool`` initializer, so every
    worker gets the lock that ``guarded_stdout`` / ``guarded_stderr`` use.
    """
    globals()["lock"] = l
def parse_args(argv):
    """Parse the grepper command line.

    Args:
        argv: Argument list without the program name. Options must precede
            the positional arguments ``GREP [FILE...]``.

    Returns:
        A tuple ``(pattern, paths, beautify, parallel)`` where *paths* is
        the slice of the selected tar files belonging to chunk
        ``taskid`` of ``maxtaskid``.

    Exits (via ``sys.exit``) after printing the help text when the options
    are malformed, a numeric option is not a valid positive integer, the
    task id lies outside ``[1, maxtaskid]``, or the pattern is missing
    (status 2), and when ``-h`` is given (status 0).
    """
    archive = "archive"
    beautify = False
    parallel = 8
    taskid = 1
    maxtaskid = 1
    # -p / -e selections are only recorded here and resolved after all
    # options are parsed, so that -a takes effect no matter where it appears
    # on the command line.
    selectors = []

    try:
        opts, args = getopt.getopt(argv, "ha:p:e:bt:n:N:", [
            "archive=", "prefix=", "extidlistfile=", "beautify", "threads=",
            "taskid=", "maxtaskid="
        ])
    except getopt.GetoptError:
        help()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-p", "--prefix"):
            selectors.append(("prefix", arg))
        elif opt in ("-e", "--extidlistfile"):
            selectors.append(("extidlistfile", arg))
        elif opt in ("-b", "--beautify"):
            beautify = True
        elif opt in ("-t", "--threads", "-n", "--taskid", "-N",
                     "--maxtaskid"):
            try:
                number = int(arg)
            except ValueError:
                # A non-numeric value is a usage error, not a crash.
                help()
                sys.exit(2)
            if opt in ("-t", "--threads"):
                parallel = number
            elif opt in ("-n", "--taskid"):
                taskid = number
            else:
                maxtaskid = number

    # Reject values that would otherwise divide by zero, select a nonsense
    # slice, or make the worker pool refuse to start.
    if parallel < 1 or maxtaskid < 1 or not 1 <= taskid <= maxtaskid:
        help()
        sys.exit(2)

    if not args:
        help()
        sys.exit(2)

    pattern = args[0]

    paths = []
    for kind, value in selectors:
        if kind == "prefix":
            paths += find(archive, value + "*")
        else:
            paths += find_from_file(archive, value)
    paths += args[1:]
    # NOTE: an empty selection (including a prefix / id list that matched
    # nothing) falls back to every tar in the archive, as before.
    if not paths:
        paths = list(find(archive, "*"))

    # Integer division; the last chunk additionally takes the remainder.
    chunksize = len(paths) // maxtaskid
    start = (taskid - 1) * chunksize
    if taskid == maxtaskid:
        paths = paths[start:]
    else:
        paths = paths[start:start + chunksize]

    return pattern, paths, beautify, parallel
2017-08-14 13:40:10 +00:00
2017-07-05 19:55:41 +00:00
2017-08-23 15:52:18 +00:00
def main(argv):
    """Parse *argv* and grep the selected tar archives with a process pool.

    A single ``Lock`` is created and handed to each pool worker through the
    ``init`` initializer; ``process_id`` is then mapped over all paths with
    the pattern and beautify flag pre-bound.
    """
    pattern, paths, beautify, parallel = parse_args(argv)
    output_lock = Lock()
    grep_archive = partial(process_id, pattern, beautify)
    with Pool(
            processes=parallel, initializer=init,
            initargs=(output_lock, )) as pool:
        pool.map(grep_archive, paths)
2017-08-14 13:40:10 +00:00
2017-07-05 19:55:41 +00:00
# Script entry point: run main() with the command-line arguments, excluding
# the program name, only when the file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])