ExtensionCrawler/grepper

179 lines
5.6 KiB
Python
Executable File

#!/usr/bin/env python3.5
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import getopt
import os
import sys
import tarfile
import traceback
import jsbeautifier
import fnmatch
from multiprocessing import Pool, Lock
from zipfile import ZipFile
from functools import partial
import re
from ExtensionCrawler.config import const_basedir
from extfind import iter_extension_paths, iter_extension_paths_from_file
def help():
    """Print the command-line usage summary for grepper to stdout.

    The name shadows the ``help`` builtin inside this module; it is kept
    because other code in this file calls it by this name.
    """
    usage_lines = (
        "grepper [OPTION] GREP [FILE]",
        " GREP regex pattern",
        " [FILE] path(s) to extension tar",
        " -h print this help text",
        " -b beautify JavaScript before matching",
        " -a <DIR> archive directory",
        # Spelling corrected: this line used to read "extenion id".
        " -g <GLOB> glob on the extension id, don't use with -e",
        " -e <EXTIDFILELIST> file with extension ids",
        " -t <THREADS> number of threads to use",
        " -n <TASKID> process chunk n where n in [1,N]",
        " -N <MAXTASKID> ",
    )
    for usage_line in usage_lines:
        print(usage_line)
def guarded_stdout(string):
    """Write ``string`` to ``sys.stdout`` while holding the global ``lock``.

    ``lock`` is the module-level global that ``init`` installs in each worker
    process.  The ``with`` statement releases the lock even when the write
    raises (for example an encoding error on text that was decoded with
    ``surrogateescape``); a bare acquire/release pair would leave the lock
    held and block every later guarded write.
    """
    with lock:
        sys.stdout.write(string)
def guarded_stderr(string):
    """Write ``string`` to ``sys.stderr`` while holding the global ``lock``.

    ``lock`` is the module-level global that ``init`` installs in each worker
    process.  Using ``with`` guarantees the lock is released even if the
    write itself raises, so a failed write cannot leave the lock held and
    stall the other workers.
    """
    with lock:
        sys.stderr.write(string)
def has_at_least_one_match(content, pattern):
    """Return True if ``pattern`` is found (``re.search``) in any line of ``content``.

    ``content`` is split with ``str.splitlines`` and each line is searched on
    its own, so the pattern never sees the line terminators.  Empty content
    has no lines and therefore yields False.
    """
    return any(
        re.search(pattern, text_line) for text_line in content.splitlines())
def process_crx(ext_id, date, crx, pattern, beautify):
    """Grep the ``.js`` members of one crx (zip) archive for ``pattern``.

    ``crx`` is handed to ``ZipFile``.  Every member whose name ends in
    ``.js`` is decoded (with ``surrogateescape`` error handling) and searched
    line by line.  When ``beautify`` is true, a file is run through
    ``jsbeautifier`` first, but only if its raw text already contains a
    match.  Each matching line is written through ``guarded_stdout`` as::

        ext_id|date|<member name> (line <1-based line number>)|<line>

    Any exception is reported through ``guarded_stderr`` with a traceback
    instead of propagating, which ends processing of this archive.
    """
    try:
        with ZipFile(crx) as archive:
            for member in archive.infolist():
                if member.filename.endswith(".js"):
                    with archive.open(member) as handle:
                        source = handle.read().decode(
                            errors="surrogateescape")
                    # Beautifying is costly, so only do it for files that
                    # already contain at least one hit.
                    if beautify and has_at_least_one_match(source, pattern):
                        source = jsbeautifier.beautify(source)
                    numbered = enumerate(source.splitlines(), 1)
                    for lineno, text in numbered:
                        if re.search(pattern, text):
                            location = (member.filename + " (line " +
                                        str(lineno) + ")")
                            fields = [ext_id, date, location, text]
                            guarded_stdout("|".join(fields) + "\n")
    except Exception:
        guarded_stderr("Exception when handling {}, {}:\n{}".format(
            ext_id, date, traceback.format_exc()))
def process_id(pattern, beautify, path):
    """Grep every non-empty ``.crx`` member of the tar archive at ``path``.

    The first two ``/``-separated components of a member's name are used as
    the extension id and the date, and the member's file object is passed to
    ``process_crx`` together with ``pattern`` and ``beautify``.  Members that
    do not end in ``.crx`` or have size zero are skipped.

    Any exception is reported through ``guarded_stderr`` with a traceback
    instead of propagating, which ends processing of this tar archive.
    """
    try:
        with tarfile.open(path, 'r') as t:
            for tar_info in t.getmembers():
                # Compare by value: ``is 0`` tests object identity and only
                # happens to work through CPython's small-integer caching.
                if not tar_info.name.endswith(".crx") or tar_info.size == 0:
                    continue
                extid, date = tar_info.name.split("/")[:2]
                with t.extractfile(tar_info) as crx:
                    process_crx(extid, date, crx, pattern, beautify)
    except Exception:
        guarded_stderr("Exception when handling {}:\n{}".format(
            path, traceback.format_exc()))
def init(l):
    """Store ``l`` in the module-level global ``lock``.

    ``main`` passes this function as the ``Pool`` initializer, so it runs in
    each worker process and makes the shared lock available to
    ``guarded_stdout`` and ``guarded_stderr``.
    """
    global lock
    lock = l
def parse_args(argv):
    """Parse grepper's command-line arguments.

    ``argv`` is the argument list without the program name.  The first
    positional argument is the regex pattern; any further positional
    arguments are taken as explicit archive paths.  Without explicit paths,
    the paths come from ``iter_extension_paths`` (optionally restricted by
    the ``-g`` glob) or ``iter_extension_paths_from_file`` (``-e``), using
    the archive directory and the ``-n``/``-N`` task ids.

    Prints the usage text and exits with status 2 on an unknown option, when
    the pattern is missing, or when ``-g`` and ``-e`` are combined without
    explicit paths.  ``-h`` prints the usage text and exits with status 0.

    Returns a tuple ``(pattern, paths, beautify, parallel)``.
    """
    archive = const_basedir()
    beautify = False
    parallel = 8
    extidglob = None
    extidlistfile = None
    taskid = 1
    maxtaskid = 1
    paths = []

    try:
        # The short option for --glob is "g:".  It used to be spelled "p:",
        # which made the documented -g fail with GetoptError while -p was
        # accepted and then ignored.
        opts, args = getopt.getopt(argv, "ha:g:e:bt:n:N:", [
            "archive=", "glob=", "extidlistfile=", "beautify", "threads=",
            "taskid=", "maxtaskid="
        ])
    except getopt.GetoptError:
        help()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-g", "--glob"):
            extidglob = arg
        elif opt in ("-e", "--extidlistfile"):
            extidlistfile = arg
        elif opt in ("-b", "--beautify"):
            beautify = True
        elif opt in ("-t", "--threads"):
            parallel = int(arg)
        elif opt in ("-n", "--taskid"):
            taskid = int(arg)
        elif opt in ("-N", "--maxtaskid"):
            maxtaskid = int(arg)

    # The pattern is mandatory (truthiness test instead of ``len(args) is 0``,
    # which compared identity rather than value).
    if not args:
        help()
        sys.exit(2)

    pattern = args[0]
    if len(args) > 1:
        # Explicit paths win over any -g / -e selection.
        paths = args[1:]
    elif extidglob is not None and extidlistfile is not None:
        # -g and -e are mutually exclusive.
        help()
        sys.exit(2)
    elif extidlistfile is not None:
        paths = iter_extension_paths_from_file(archive, taskid, maxtaskid,
                                               extidlistfile)
    elif extidglob is not None:
        paths = iter_extension_paths(archive, taskid, maxtaskid, extidglob)
    else:
        paths = iter_extension_paths(archive, taskid, maxtaskid)

    return pattern, paths, beautify, parallel
def main(argv):
    """Parse ``argv`` and grep all selected archives with a process pool.

    A single ``Lock`` is created here and handed to every worker through the
    pool initializer ``init``; ``process_id`` is then mapped over the paths
    with ``pattern`` and ``beautify`` bound in advance.
    """
    pattern, paths, beautify, parallel = parse_args(argv)
    output_lock = Lock()
    grep_archive = partial(process_id, pattern, beautify)
    with Pool(processes=parallel, initializer=init,
              initargs=(output_lock, )) as pool:
        pool.map(grep_archive, paths)
if __name__ == "__main__":
    # Script entry point: pass everything after the program name to main().
    main(sys.argv[1:])