179 lines
5.6 KiB
Python
Executable File
179 lines
5.6 KiB
Python
Executable File
#!/usr/bin/env python3.5
|
|
#
|
|
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
|
|
import getopt
|
|
import os
|
|
import sys
|
|
import tarfile
|
|
import traceback
|
|
import jsbeautifier
|
|
import fnmatch
|
|
from multiprocessing import Pool, Lock
|
|
from zipfile import ZipFile
|
|
from functools import partial
|
|
import re
|
|
from ExtensionCrawler.config import const_basedir
|
|
from extfind import iter_extension_paths, iter_extension_paths_from_file
|
|
|
|
|
|
def help():
    """Print the command-line usage summary for grepper to stdout.

    Note: this intentionally keeps the name ``help`` (shadowing the builtin)
    because other code in this file calls it by that name.
    """
    usage_lines = (
        "grepper [OPTION] GREP [FILE]",
        " GREP regex pattern",
        " [FILE] path(s) to extension tar",
        " -h print this help text",
        " -b beautify JavaScript before matching",
        " -a <DIR> archive directory",
        # Typo fix: was "extenion id".
        " -g <GLOB> glob on the extension id, don't use with -e",
        " -e <EXTIDFILELIST> file with extension ids",
        " -t <THREADS> number of threads to use",
        " -n <TASKID> process chunk n where n in [1,N]",
        " -N <MAXTASKID> ",
    )
    for usage_line in usage_lines:
        print(usage_line)
|
|
|
|
|
|
def guarded_stdout(string):
    """Write *string* to stdout while holding the module-global ``lock``.

    ``lock`` is expected to have been installed as a module global before
    this is called (see ``init``). The lock is used as a context manager so
    it is released even if the write raises; otherwise a failed write would
    leave the lock held and block every later guarded write.
    """
    with lock:
        sys.stdout.write(string)
|
|
|
|
|
|
def guarded_stderr(string):
    """Write *string* to stderr while holding the module-global ``lock``.

    ``lock`` is expected to have been installed as a module global before
    this is called (see ``init``). The lock is used as a context manager so
    it is released even if the write raises; otherwise a failed write would
    leave the lock held and block every later guarded write.
    """
    with lock:
        sys.stderr.write(string)
|
|
|
|
def has_at_least_one_match(content, pattern):
    """Return True if *pattern* matches on any single line of *content*.

    Matching is done line by line with ``re.search``, so a pattern can never
    match across a line break. Empty content yields False.
    """
    return any(re.search(pattern, text_line) for text_line in content.splitlines())
|
|
|
|
|
|
def process_crx(ext_id, date, crx, pattern, beautify):
    """Grep every ``.js`` member of one CRX (zip) archive for *pattern*.

    *crx* is handed to ``ZipFile``. Each matching line is reported through
    ``guarded_stdout`` as ``ext_id|date|<member> (line <n>)|<line>``.

    When *beautify* is true, a file is run through ``jsbeautifier`` only if
    its raw content already contains at least one match; the reported line
    numbers and line text then refer to the beautified content.

    Any exception is reported via ``guarded_stderr`` (with traceback) and
    not propagated.
    """
    try:
        with ZipFile(crx) as archive:
            for member in archive.infolist():
                member_name = member.filename
                if not member_name.endswith(".js"):
                    continue

                with archive.open(member) as handle:
                    # Undecodable bytes are kept as lone surrogates rather
                    # than raising.
                    text = handle.read().decode(errors="surrogateescape")

                # Beautifying is only worth doing for files that have a hit.
                if beautify and has_at_least_one_match(text, pattern):
                    text = jsbeautifier.beautify(text)

                for lineno, line in enumerate(text.splitlines(), start=1):
                    if re.search(pattern, line):
                        location = member_name + " (line " + str(lineno) + ")"
                        fields = [ext_id, date, location, line]
                        guarded_stdout("|".join(fields) + "\n")
    except Exception:
        guarded_stderr("Exception when handling {}, {}:\n{}".format(
            ext_id, date, traceback.format_exc()))
|
|
|
|
|
|
def process_id(pattern, beautify, path):
    """Grep all non-empty ``.crx`` members of the tar archive at *path*.

    Each member name is split on ``/``; its first two components are used as
    the extension id and the date passed on to ``process_crx``.

    Any exception is reported via ``guarded_stderr`` (with traceback) and
    not propagated; members after the failing one are then not processed.
    """
    try:
        with tarfile.open(path, 'r') as t:
            for tar_info in t.getmembers():
                # Compare by value: ``is 0`` only worked thanks to CPython's
                # small-int caching and is a SyntaxWarning on Python >= 3.8.
                if not tar_info.name.endswith(".crx") or tar_info.size == 0:
                    continue
                extid, date = tar_info.name.split("/")[:2]
                with t.extractfile(tar_info) as crx:
                    process_crx(extid, date, crx, pattern, beautify)
    except Exception:
        guarded_stderr("Exception when handling {}:\n{}".format(
            path, traceback.format_exc()))
|
|
|
|
|
|
def init(l):
    """Install *l* as the module-global ``lock``.

    Passed as the ``initializer`` of the worker ``Pool`` in ``main`` so that
    each worker process has the shared lock available as a global.
    """
    global lock
    lock = l
|
|
|
|
|
|
def parse_args(argv):
    """Parse grepper's command-line arguments.

    *argv* is the argument list without the program name.

    Returns a tuple ``(pattern, paths, beautify, parallel)``:

    * ``pattern``  -- the first positional argument (the regex to grep for).
    * ``paths``    -- the remaining positional arguments if any were given;
      otherwise whatever ``iter_extension_paths`` /
      ``iter_extension_paths_from_file`` returns for the selected archive,
      task chunk and optional id glob / id list file.
    * ``beautify`` -- True if ``-b`` / ``--beautify`` was given.
    * ``parallel`` -- worker count from ``-t`` / ``--threads`` (default 8).

    Prints the help text and exits with status 2 on an unknown option, when
    no pattern is given, or when both a glob and an id list file are given
    without explicit paths. ``-h`` prints the help text and exits normally.
    A non-integer value for ``-t``, ``-n`` or ``-N`` raises ``ValueError``.
    """
    archive = const_basedir()
    beautify = False
    parallel = 8
    extidglob = None
    extidlistfile = None
    taskid = 1
    maxtaskid = 1

    try:
        # "g:" (previously "p:") so the documented and handled -g <GLOB>
        # option is actually accepted instead of raising GetoptError.
        opts, args = getopt.getopt(argv, "ha:g:e:bt:n:N:", [
            "archive=", "glob=", "extidlistfile=", "beautify", "threads=",
            "taskid=", "maxtaskid="
        ])
    except getopt.GetoptError:
        help()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-g", "--glob"):
            extidglob = arg
        elif opt in ("-e", "--extidlistfile"):
            extidlistfile = arg
        elif opt in ("-b", "--beautify"):
            beautify = True
        elif opt in ("-t", "--threads"):
            parallel = int(arg)
        elif opt in ("-n", "--taskid"):
            taskid = int(arg)
        elif opt in ("-N", "--maxtaskid"):
            maxtaskid = int(arg)

    # The regex pattern is mandatory.
    if not args:
        help()
        sys.exit(2)

    pattern = args[0]
    if len(args) > 1:
        # Explicit tar paths take precedence over any archive lookup.
        paths = args[1:]
    elif extidglob is not None and extidlistfile is not None:
        # -g and -e are mutually exclusive.
        help()
        sys.exit(2)
    elif extidlistfile is not None:
        paths = iter_extension_paths_from_file(archive, taskid, maxtaskid,
                                               extidlistfile)
    elif extidglob is not None:
        paths = iter_extension_paths(archive, taskid, maxtaskid, extidglob)
    else:
        paths = iter_extension_paths(archive, taskid, maxtaskid)

    return pattern, paths, beautify, parallel
|
|
|
|
|
|
def main(argv):
    """Run the grep over all selected extension archives in parallel.

    Parses *argv*, then maps ``process_id`` over the selected paths using a
    process pool of ``parallel`` workers. A single ``Lock`` is handed to
    every worker through the pool initializer ``init`` so that output
    written via the guarded writers is serialized.
    """
    pattern, paths, beautify, parallel = parse_args(argv)

    output_lock = Lock()
    # Pre-bind the arguments that are the same for every path.
    worker = partial(process_id, pattern, beautify)
    with Pool(processes=parallel, initializer=init,
              initargs=(output_lock, )) as pool:
        pool.map(worker, paths)
|
|
|
|
|
|
# Script entry point: pass the arguments after the program name to main().
if __name__ == "__main__":
    main(sys.argv[1:])
|