From f37e19f46a448d86d22c84266a13ff24514f093e Mon Sep 17 00:00:00 2001
From: Michael Herzberg
Date: Wed, 27 Sep 2017 14:05:16 +0100
Subject: [PATCH] Updated grepper.

---
Review note: this revision differs from the original commit in two added
lines of grepper. The getopt short-option string now registers "g:" instead
of the stale "p:"; without that, "-g" raises GetoptError even though help()
and the option handler advertise it. The "extenion" typo in the help text is
fixed as well. The diffstat below is updated accordingly; the index hashes
are those of the original commit.

 extfind.py |  1 +
 grepper    | 63 +++++++++++++++++++++++++++++----------------------------------
 2 files changed, 30 insertions(+), 34 deletions(-)
 create mode 120000 extfind.py

diff --git a/extfind.py b/extfind.py
new file mode 120000
index 0000000..0231c4e
--- /dev/null
+++ b/extfind.py
@@ -0,0 +1 @@
+extfind
\ No newline at end of file
diff --git a/grepper b/grepper
index a866dff..ee42bed 100755
--- a/grepper
+++ b/grepper
@@ -28,6 +28,8 @@ from zipfile import ZipFile
 from functools import partial
 import re
 from ExtensionCrawler.config import const_basedir
+from extfind import iter_extension_paths, iter_extension_paths_from_file
+
 
 def help():
     print("grepper [OPTION] GREP [FILE]")
@@ -36,7 +38,7 @@ def help():
     print(" -h print this help text")
     print(" -b beautify JavaScript before matching")
     print(" -a archive directory")
-    print(" -p three-letter-prefix")
+    print(" -g glob on the extension id, don't use with -e")
     print(" -e file with extension ids")
     print(" -t number of threads to use")
     print(" -n process chunk n where n in [1,N]")
@@ -54,6 +56,12 @@ def guarded_stderr(string):
     sys.stderr.write(string)
     lock.release()
 
+def has_at_least_one_match(content, pattern):
+    for line in content.splitlines():
+        if re.search(pattern, line):
+            return True
+    return False
+
 
 def process_crx(ext_id, date, crx, pattern, beautify):
     try:
@@ -64,7 +72,8 @@ def process_crx(ext_id, date, crx, pattern, beautify):
                 with z.open(zip_file_info) as f:
                     content = f.read().decode(errors="surrogateescape")
                     if beautify:
-                        content = jsbeautifier.beautify(content)
+                        if has_at_least_one_match(content, pattern):
+                            content = jsbeautifier.beautify(content)
                     for i, line in enumerate(content.splitlines()):
                         if not re.search(pattern, line):
                             continue
@@ -93,24 +102,6 @@ def process_id(pattern, beautify, path):
             path, traceback.format_exc()))
 
 
-def find(archive, pattern):
-    for root, _, files in os.walk(os.path.join(archive, "data")):
-        for file in files:
-            if fnmatch.fnmatch(file, pattern + ".tar"):
-                yield os.path.join(root, file)
-
-
-def find_from_file(archive, extidlistfile):
-    with open(extidlistfile, 'r') as f:
-        extids = [l.strip() for l in f.readlines()]
-
-    for root, _, files in os.walk(os.path.join(archive, "data")):
-        for file in files:
-            for extid in extids:
-                if fnmatch.fnmatch(file, extid + ".tar"):
-                    yield os.path.join(root, file)
-
-
 def init(l):
     global lock
     lock = l
@@ -120,13 +111,16 @@ def parse_args(argv):
     archive = const_basedir()
     beautify = False
     parallel = 8
+    extidglob = None
+    extidlistfile = None
     taskid = 1
     maxtaskid = 1
     paths = []
 
+
     try:
-        opts, args = getopt.getopt(argv, "ha:p:e:bt:n:N:", [
-            "archive=", "prefix=", "extidlistfile=", "beautify", "threads=",
+        opts, args = getopt.getopt(argv, "ha:g:e:bt:n:N:", [
+            "archive=", "glob=", "extidlistfile=", "beautify", "threads=",
             "taskid=", "maxtaskid="
         ])
     except getopt.GetoptError:
@@ -138,12 +132,10 @@ def parse_args(argv):
             sys.exit()
         elif opt in ("-a", "--archive"):
             archive = arg
-        elif opt in ("-p", "--prefix"):
-            prefix = arg
-            paths += find(archive, prefix + "*")
+        elif opt in ("-g", "--glob"):
+            extidglob = arg
         elif opt in ("-e", "--extidlistfile"):
             extidlistfile = arg
-            paths += find_from_file(archive, extidlistfile)
         elif opt in ("-b", "--beautify"):
             beautify = True
         elif opt in ("-t", "--threads"):
@@ -158,15 +150,18 @@ def parse_args(argv):
         sys.exit(2)
 
     pattern = args[0]
-    paths += args[1:]
-    if paths == []:
-        paths = list(find(archive, "*"))
-
-    chunksize = int(len(paths) / maxtaskid)
-    if taskid == maxtaskid:
-        paths = paths[(taskid - 1) * chunksize:]
+    if len(args) > 1:
+        paths = args[1:]
+    elif extidglob is None and extidlistfile is None:
+        paths = iter_extension_paths(archive, taskid, maxtaskid)
+    elif extidglob is None and extidlistfile is not None:
+        paths = iter_extension_paths_from_file(archive, taskid, maxtaskid,
+                                               extidlistfile)
+    elif extidglob is not None and extidlistfile is None:
+        paths = iter_extension_paths(archive, taskid, maxtaskid, extidglob)
     else:
-        paths = paths[(taskid - 1) * chunksize:taskid * chunksize]
+        help()
+        sys.exit(2)
 
     return pattern, paths, beautify, parallel
 