ExtensionCrawler/grepper

184 lines
5.7 KiB
Python
Executable File

#!/usr/bin/env python3.5
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import getopt
import os
import sys
import tarfile
import traceback
import jsbeautifier
import fnmatch
from multiprocessing import Pool, Lock
from zipfile import ZipFile
from functools import partial
import re
from ExtensionCrawler.config import const_basedir
def help():
    """Print the command-line usage summary for grepper to stdout."""
    usage_lines = (
        "grepper [OPTION] GREP [FILE]",
        " GREP regex pattern",
        " [FILE] path(s) to extension tar",
        " -h print this help text",
        " -b beautify JavaScript before matching",
        " -a <DIR> archive directory",
        " -p <PREFIX> three-letter-prefix",
        " -e <EXTIDFILELIST> file with extension ids",
        " -t <THREADS> number of threads to use",
        " -n <TASKID> process chunk n where n in [1,N]",
        " -N <MAXTASKID> ",
    )
    # One print call per line, exactly as the usage text is laid out.
    for usage_line in usage_lines:
        print(usage_line)
def guarded_stdout(string):
    """Write ``string`` to standard output while holding the shared lock.

    The module-level ``lock`` (set by ``init``) is held for the duration of
    the write.

    The lock is taken with a ``with`` block so that it is released even when
    the write raises (for example an encoding error on the output stream).
    With a bare acquire()/release() pair a single failed write would leave
    the lock held and block every later writer.
    """
    with lock:
        sys.stdout.write(string)
def guarded_stderr(string):
    """Write ``string`` to standard error while holding the shared lock.

    The module-level ``lock`` (set by ``init``) is held for the duration of
    the write.

    The lock is taken with a ``with`` block so that it is released even when
    the write raises. With a bare acquire()/release() pair a single failed
    write would leave the lock held and block every later writer.
    """
    with lock:
        sys.stderr.write(string)
def process_crx(ext_id, date, crx, pattern, beautify):
    """Search the JavaScript files of one CRX archive for ``pattern``.

    ``crx`` is opened with ``ZipFile``. Every member whose name ends in
    ".js" is read and decoded, passed through ``jsbeautifier.beautify`` when
    ``beautify`` is true, and searched line by line with ``re.search``.
    Each matching line is written through ``guarded_stdout`` as
    ``ext_id|date|<member name> (line <n>)|<line>`` with 1-based line
    numbers.

    Any exception is reported through ``guarded_stderr`` together with its
    traceback and is not re-raised; the remaining members of the archive are
    not processed in that case.
    """
    try:
        with ZipFile(crx) as archive:
            for member in archive.infolist():
                member_name = member.filename
                if not member_name.endswith(".js"):
                    continue
                with archive.open(member) as js_file:
                    # surrogateescape keeps undecodable bytes as lone
                    # surrogates instead of raising during decoding.
                    source = js_file.read().decode(errors="surrogateescape")
                    if beautify:
                        source = jsbeautifier.beautify(source)
                    numbered = enumerate(source.splitlines(), start=1)
                    for lineno, text in numbered:
                        if re.search(pattern, text):
                            location = "{} (line {})".format(
                                member_name, lineno)
                            fields = [ext_id, date, location, text]
                            guarded_stdout("|".join(fields) + "\n")
    except Exception:
        report = "Exception when handling {}, {}:\n{}".format(
            ext_id, date, traceback.format_exc())
        guarded_stderr(report)
def process_id(pattern, beautify, path):
    """Search every non-empty CRX stored in the tar file at ``path``.

    Members whose name does not end in ".crx", or whose size is zero, are
    skipped. For the others, the first two "/"-separated components of the
    member name are taken as the extension id and the date, and the member
    is handed to ``process_crx`` together with ``pattern`` and ``beautify``.

    Any exception is reported through ``guarded_stderr`` together with its
    traceback and is not re-raised; the remaining members of the tar file
    are not processed in that case.
    """
    try:
        with tarfile.open(path, 'r') as t:
            for tar_info in t.getmembers():
                # Compare the size by value. `size is 0` tested object
                # identity, which only works by accident of small-int
                # caching and is a SyntaxWarning on newer interpreters.
                if not tar_info.name.endswith(".crx") or tar_info.size == 0:
                    continue
                extid, date = tar_info.name.split("/")[:2]
                with t.extractfile(tar_info) as crx:
                    process_crx(extid, date, crx, pattern, beautify)
    except Exception:
        guarded_stderr("Exception when handling {}:\n{}".format(
            path, traceback.format_exc()))
def find(archive, pattern):
    """Yield paths of tar files below ``<archive>/data`` matching a glob.

    A file matches when its name matches ``pattern`` followed by ".tar"
    under fnmatch rules. The directory tree is walked with ``os.walk`` and
    each hit is yielded as the directory path joined with the file name.
    """
    data_dir = os.path.join(archive, "data")
    tar_glob = pattern + ".tar"
    for dirpath, _, filenames in os.walk(data_dir):
        for name in fnmatch.filter(filenames, tar_glob):
            yield os.path.join(dirpath, name)
def find_from_file(archive, extidlistfile):
    """Yield paths of tar files below ``<archive>/data`` named in a list file.

    ``extidlistfile`` is read one entry per line, with surrounding
    whitespace stripped. A file matches an entry when its name matches the
    entry followed by ".tar" under fnmatch rules. Entries are tested in
    file order for every file found, and a path is yielded once per
    matching entry, so an entry listed twice yields its file twice.
    """
    with open(extidlistfile, 'r') as id_file:
        tar_globs = [raw.strip() + ".tar" for raw in id_file]
    data_dir = os.path.join(archive, "data")
    for dirpath, _, filenames in os.walk(data_dir):
        for name in filenames:
            for tar_glob in tar_globs:
                if fnmatch.fnmatch(name, tar_glob):
                    yield os.path.join(dirpath, name)
def init(l):
    """Store ``l`` as the module-level ``lock``.

    ``main`` passes this function as the pool initializer so that the
    ``guarded_stdout`` and ``guarded_stderr`` helpers find the shared lock
    under the global name ``lock``.
    """
    global lock
    lock = l
def parse_args(argv):
    """Parse the grepper command line and select the tar files to search.

    Args:
        argv: command-line arguments without the program name.

    Returns:
        A tuple ``(pattern, paths, beautify, parallel)``: the regex given as
        the first positional argument, the tar file paths assigned to this
        task, whether JavaScript should be beautified, and the number of
        worker processes.

    The usage text is printed and the process exits with status 2 when an
    option is unknown, when no pattern is given, or when the task ids do not
    satisfy ``1 <= taskid <= maxtaskid``. ``-h`` prints the usage text and
    exits with the default status.

    Paths come from ``-p``, ``-e`` and the positional FILE arguments; when
    none of them yields a path, every tar file in the archive is used. The
    resulting list is cut into ``maxtaskid`` chunks and only chunk
    ``taskid`` is returned, the last chunk also taking the remainder.
    """
    archive = const_basedir()
    beautify = False
    parallel = 8
    taskid = 1
    maxtaskid = 1
    paths = []

    try:
        opts, args = getopt.getopt(argv, "ha:p:e:bt:n:N:", [
            "archive=", "prefix=", "extidlistfile=", "beautify", "threads=",
            "taskid=", "maxtaskid="
        ])
    except getopt.GetoptError:
        help()
        sys.exit(2)

    # Options take effect in command-line order: -p and -e search whatever
    # archive directory is current when they are seen, so -a only affects
    # the -p/-e options that follow it.
    for opt, arg in opts:
        if opt == '-h':
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-p", "--prefix"):
            prefix = arg
            paths += find(archive, prefix + "*")
        elif opt in ("-e", "--extidlistfile"):
            extidlistfile = arg
            paths += find_from_file(archive, extidlistfile)
        elif opt in ("-b", "--beautify"):
            beautify = True
        elif opt in ("-t", "--threads"):
            parallel = int(arg)
        elif opt in ("-n", "--taskid"):
            taskid = int(arg)
        elif opt in ("-N", "--maxtaskid"):
            maxtaskid = int(arg)

    # Truthiness test instead of `len(args) is 0`, which compared object
    # identity with an int literal.
    if not args:
        help()
        sys.exit(2)

    # Task ids are 1-based. A maxtaskid below 1 would divide by zero below,
    # and a taskid outside [1, maxtaskid] would select an empty chunk or one
    # overlapping the last task's, so treat both as usage errors.
    if maxtaskid < 1 or not 1 <= taskid <= maxtaskid:
        help()
        sys.exit(2)

    pattern = args[0]
    paths += args[1:]

    if not paths:
        paths = list(find(archive, "*"))

    # Integer division keeps the chunk size exact for any list length.
    chunksize = len(paths) // maxtaskid
    start = (taskid - 1) * chunksize
    if taskid == maxtaskid:
        # The last task also takes the remainder of the division.
        paths = paths[start:]
    else:
        paths = paths[start:start + chunksize]

    return pattern, paths, beautify, parallel
def main(argv):
    """Parse ``argv`` and search the selected tar files in a process pool.

    A single lock is created and passed to every worker through the ``init``
    pool initializer. ``process_id`` is then mapped over the selected paths
    with the pattern and the beautify flag bound as its first arguments.
    """
    pattern, paths, beautify, parallel = parse_args(argv)
    output_lock = Lock()
    search_tar = partial(process_id, pattern, beautify)
    with Pool(
            processes=parallel, initializer=init,
            initargs=(output_lock, )) as pool:
        pool.map(search_tar, paths)
# Script entry point: run main() with the arguments that follow the program
# name, but only when this file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])