diff --git a/grepper b/grepper index 3734bb9..79daef9 100755 --- a/grepper +++ b/grepper @@ -21,7 +21,6 @@ import os import sys import glob import tarfile -import tempfile import traceback import jsbeautifier from multiprocessing import Pool @@ -33,73 +32,38 @@ from ExtensionCrawler.config import * from ExtensionCrawler.util import * -def get_name(overview_path): - if os.path.exists(overview_path): - with open(overview_path) as overview_file: - contents = overview_file.read() - - # Extract extension name - match = re.search("""""", - contents) - if match: - return match.group(1) - - -def get_downloads(overview_path): - if os.path.exists(overview_path): - with open(overview_path) as overview_file: - contents = overview_file.read() - - match = re.search( - """ archive directory") - print(" -p= three-letter-prefix") - print(" -t= number of parallel threads") + print(" -b beautify JavaScript before matching") + print(" -a archive directory") + print(" -p three-letter-prefix") + print(" -t number of threads to use") -def process_date(extdir, ext_id, date, greps, out_f): - overview_path = os.path.join(extdir, "overview.html") - crxpath = next(iter(glob.glob(os.path.join(extdir, "*.crx"))), None) - if crxpath and os.path.getsize(crxpath) > 0: - contents = {} - with ZipFile(crxpath) as f: - for in_zip_f in f.infolist(): - if not in_zip_f.filename.endswith(".js"): - continue - with f.open(in_zip_f) as f2: - content = f2.read().decode(errors="surrogateescape") - contents[in_zip_f.filename] = jsbeautifier.beautify( - content) - jslocs = sum( - [len(content.splitlines()) for content in contents.values()]) - name = get_name(overview_path) - downloads = get_downloads(overview_path) - for filename, content in contents.items(): - for i, line in enumerate(content.splitlines()): - for gr in greps: - if re.search(gr, line): +def process_crx(ext_id, date, crx, greps, beautify, out_f): + with ZipFile(crx) as z: + for zip_file_info in z.infolist(): + if not 
zip_file_info.filename.endswith(".js"): + continue + with z.open(zip_file_info) as f: + content = f.read().decode(errors="surrogateescape") + if beautify: + content = jsbeautifier.beautify(content) + for i, line in enumerate(content.splitlines()): + for gr in greps: + if not re.search(gr, line): + continue args = [ - ext_id, date, - name.replace("|", "") if name is not None else "", - str(downloads) if downloads is not None else "", - str(jslocs), gr, - filename + " (line " + str(i + 1) + ")", - line + ext_id, date, gr, zip_file_info.filename + + " (line " + str(i + 1) + ")", line ] print("|".join(args), file=out_f) -def process_id(archivedir, outdir, greps, verbose, ext_id): +def process_id(archivedir, outdir, greps, beautify, verbose, ext_id): txt = "" txt = logmsg(verbose, txt, "Processing {} ...\n".format(ext_id)) @@ -108,22 +72,19 @@ def process_id(archivedir, outdir, greps, verbose, ext_id): if os.path.exists(greppath): os.remove(greppath) with open(greppath, "w") as out_f: - with tempfile.TemporaryDirectory() as tmpdir: - with tarfile.open(tarpath) as t: - t.extractall(tmpdir) - iddir = os.path.join(tmpdir, ext_id) - - for date in sorted(os.listdir(iddir)): - try: - process_date( - os.path.join(iddir, date), ext_id, date, greps, - out_f) - except Exception: - txt = logmsg( - verbose, txt, - "Exception when handling {} on {}:\n".format( - ext_id, date)) - txt = logmsg(verbose, txt, traceback.format_exc()) + with tarfile.open(tarpath, 'r') as t: + for tar_info in t.getmembers(): + if not tar_info.name.endswith(".crx") or tar_info.size == 0: + continue + date = tar_info.name.split("/")[1] + try: + with t.extractfile(tar_info) as crx: + process_crx(ext_id, date, crx, greps, beautify, out_f) + except Exception: + txt = logmsg( + verbose, txt, + "Exception when handling {}:\n".format(tar_info.name)) + txt = logmsg(verbose, txt, traceback.format_exc()) return txt @@ -131,10 +92,11 @@ def process_id(archivedir, outdir, greps, verbose, ext_id): def main(argv): 
archive = "archive" prefix = "" + beautify = False parallel = 8 try: - opts, args = getopt.getopt(argv, "ha:p:t:", - ["archive=", "prefix=", "threads="]) + opts, args = getopt.getopt( + argv, "ha:p:bt:", ["archive=", "prefix=", "beautify", "threads="]) except getopt.GetoptError: help() sys.exit(2) @@ -146,6 +108,8 @@ def main(argv): archive = arg elif opt in ("-p", "--prefix"): prefix = arg + elif opt in ("-b", "--beautify"): + beautify = True elif opt in ("-t", "--threads"): parallel = int(arg) @@ -158,15 +122,24 @@ def main(argv): archivedir = os.path.join(archive, "data") threeletterdirs = glob.glob(os.path.join(archivedir, prefix + "*")) + + print("Using archive '{}'".format(archive)) + print("Using prefix '{}'".format(prefix)) + print("Using beautifier? '{}'".format(beautify)) + print("Using {} threads".format(parallel)) + print("Found {} three-letter-dirs".format(len(threeletterdirs))) + for threeletterdir in threeletterdirs: ext_ids = list(set([d[:32] for d in os.listdir(threeletterdir)])) with Pool(parallel) as p: for txt in p.imap( - partial(process_id, archivedir, outdir, greps, True), - ext_ids): + partial(process_id, archivedir, outdir, greps, beautify, + True), ext_ids): sys.stdout.write(txt) sys.stdout.flush() + print("Successfully finished grepping") + if __name__ == "__main__": main(sys.argv[1:])