Updated grepper: stream .crx files directly from the tar archive instead of extracting to a temporary directory, make JavaScript beautification optional (-b), and drop the overview-derived name/download-count columns from the output.

This commit is contained in:
Michael Herzberg 2017-08-14 14:40:10 +01:00
parent 68e7e72e93
commit 77ddcc23e4
1 changed file with 49 additions and 76 deletions

grepper

@@ -21,7 +21,6 @@ import os
 import sys
 import glob
 import tarfile
-import tempfile
 import traceback
 import jsbeautifier
 from multiprocessing import Pool
@@ -33,73 +32,38 @@ from ExtensionCrawler.config import *
 from ExtensionCrawler.util import *
 
 
-def get_name(overview_path):
-    if os.path.exists(overview_path):
-        with open(overview_path) as overview_file:
-            contents = overview_file.read()
-            # Extract extension name
-            match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
-                              contents)
-            if match:
-                return match.group(1)
-
-
-def get_downloads(overview_path):
-    if os.path.exists(overview_path):
-        with open(overview_path) as overview_file:
-            contents = overview_file.read()
-            match = re.search(
-                """<meta itemprop="interactionCount" content="UserDownloads:((:?\d|,)+)""",
-                contents)
-            if match:
-                return int(match.group(1).replace(",", ''))
-
-
 def help():
     print("grepper [OPTION] BASEDIR GREP1 [GREP2...]")
     print("    GREP1 [GREP2...]  regex patterns")
     print("    BASEDIR           directory for output")
     print("    -h                print this help text")
-    print("    -a=<DIR>          archive directory")
-    print("    -p=<PREFIX>       three-letter-prefix")
-    print("    -t=<THREADS>      number of parallel threads")
+    print("    -b                beautify JavaScript before matching")
+    print("    -a <DIR>          archive directory")
+    print("    -p <PREFIX>       three-letter-prefix")
+    print("    -t <THREADS>      number of threads to use")
 
 
-def process_date(extdir, ext_id, date, greps, out_f):
-    overview_path = os.path.join(extdir, "overview.html")
-    crxpath = next(iter(glob.glob(os.path.join(extdir, "*.crx"))), None)
-    if crxpath and os.path.getsize(crxpath) > 0:
-        contents = {}
-        with ZipFile(crxpath) as f:
-            for in_zip_f in f.infolist():
-                if not in_zip_f.filename.endswith(".js"):
-                    continue
-                with f.open(in_zip_f) as f2:
-                    content = f2.read().decode(errors="surrogateescape")
-                    contents[in_zip_f.filename] = jsbeautifier.beautify(
-                        content)
-        jslocs = sum(
-            [len(content.splitlines()) for content in contents.values()])
-        name = get_name(overview_path)
-        downloads = get_downloads(overview_path)
-        for filename, content in contents.items():
-            for i, line in enumerate(content.splitlines()):
-                for gr in greps:
-                    if re.search(gr, line):
-                        args = [
-                            ext_id, date,
-                            name.replace("|", "<PIPE>") if name is not None else "",
-                            str(downloads) if downloads is not None else "",
-                            str(jslocs), gr,
-                            filename + " (line " + str(i + 1) + ")",
-                            line
-                        ]
-                        print("|".join(args), file=out_f)
+def process_crx(ext_id, date, crx, greps, beautify, out_f):
+    with ZipFile(crx) as z:
+        for zip_file_info in z.infolist():
+            if not zip_file_info.filename.endswith(".js"):
+                continue
+            with z.open(zip_file_info) as f:
+                content = f.read().decode(errors="surrogateescape")
+                if beautify:
+                    content = jsbeautifier.beautify(content)
+                for i, line in enumerate(content.splitlines()):
+                    for gr in greps:
+                        if not re.search(gr, line):
+                            continue
+                        args = [
+                            ext_id, date, gr, zip_file_info.filename +
+                            " (line " + str(i + 1) + ")", line
+                        ]
+                        print("|".join(args), file=out_f)
 
 
-def process_id(archivedir, outdir, greps, verbose, ext_id):
+def process_id(archivedir, outdir, greps, beautify, verbose, ext_id):
     txt = ""
     txt = logmsg(verbose, txt, "Processing {} ...\n".format(ext_id))
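
Each match that the new process_crx emits is one pipe-separated record: ext_id|date|pattern|file (line N)|matched line. A minimal sketch of a consumer, assuming a hypothetical output file (the real name comes from greppath in process_id):

# Sketch only: read grepper records back in; the path below is invented.
with open("out/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.grep") as records:
    for record in records:
        # Split on the first four '|' only, so pipes inside the matched JS line survive.
        ext_id, date, pattern, location, source = record.rstrip("\n").split("|", 4)
        print(ext_id, date, pattern, location)
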
@@ -108,22 +72,19 @@ def process_id(archivedir, outdir, greps, verbose, ext_id):
     if os.path.exists(greppath):
         os.remove(greppath)
     with open(greppath, "w") as out_f:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            with tarfile.open(tarpath) as t:
-                t.extractall(tmpdir)
-                iddir = os.path.join(tmpdir, ext_id)
-
-                for date in sorted(os.listdir(iddir)):
-                    try:
-                        process_date(
-                            os.path.join(iddir, date), ext_id, date, greps,
-                            out_f)
-                    except Exception:
-                        txt = logmsg(
-                            verbose, txt,
-                            "Exception when handling {} on {}:\n".format(
-                                ext_id, date))
-                        txt = logmsg(verbose, txt, traceback.format_exc())
+        with tarfile.open(tarpath, 'r') as t:
+            for tar_info in t.getmembers():
+                if not tar_info.name.endswith(".crx") or tar_info.size is 0:
+                    continue
+                date = tar_info.name.split("/")[1]
+                try:
+                    with t.extractfile(tar_info) as crx:
+                        process_crx(ext_id, date, crx, greps, beautify, out_f)
+                except Exception:
+                    txt = logmsg(
+                        verbose, txt,
+                        "Exception when handling {}:\n".format(tar_info.name))
+                    txt = logmsg(verbose, txt, traceback.format_exc())
 
     return txt
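
The core of this change: process_id now streams each *.crx member straight out of the tar archive via extractfile() instead of extractall()'ing everything into a temporary directory first. A self-contained sketch of that pattern, with an invented archive path:

import tarfile
from zipfile import ZipFile

# Sketch: walk .crx members of a tar without unpacking it to disk.
# "aaaexample.tar" is a placeholder; the real path is tarpath in process_id.
with tarfile.open("aaaexample.tar", "r") as t:
    for member in t.getmembers():
        if not member.name.endswith(".crx") or member.size == 0:
            continue
        date = member.name.split("/")[1]        # layout: <ext_id>/<date>/<...>.crx
        with t.extractfile(member) as crx:      # seekable file-like object, no temp copy
            with ZipFile(crx) as z:
                print(date, member.name, len(z.infolist()), "entries")
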
@@ -131,10 +92,11 @@ def process_id(archivedir, outdir, greps, verbose, ext_id):
 def main(argv):
     archive = "archive"
     prefix = ""
+    beautify = False
     parallel = 8
     try:
-        opts, args = getopt.getopt(argv, "ha:p:t:",
-                                   ["archive=", "prefix=", "threads="])
+        opts, args = getopt.getopt(
+            argv, "ha:p:bt:", ["archive=", "prefix=", "beautify", "threads="])
     except getopt.GetoptError:
         help()
         sys.exit(2)
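
In the new option string "ha:p:bt:", a letter followed by ':' expects a value, while a bare letter (here 'h' and the added 'b') is a plain flag. A quick sketch with an invented argument list:

import getopt

# Sketch: how the new option string splits flags, valued options, and positionals.
opts, args = getopt.getopt(
    ["-b", "-t", "4", "outdir", "webRequest"],   # hypothetical argv[1:]
    "ha:p:bt:", ["archive=", "prefix=", "beautify", "threads="])
print(opts)  # [('-b', ''), ('-t', '4')]
print(args)  # ['outdir', 'webRequest']  -> BASEDIR and GREP1
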
@@ -146,6 +108,8 @@ def main(argv):
             archive = arg
         elif opt in ("-p", "--prefix"):
             prefix = arg
+        elif opt in ("-b", "--beautify"):
+            beautify = True
         elif opt in ("-t", "--threads"):
             parallel = int(arg)
 
@@ -158,15 +122,24 @@ def main(argv):
     archivedir = os.path.join(archive, "data")
     threeletterdirs = glob.glob(os.path.join(archivedir, prefix + "*"))
 
+    print("Using archive '{}'".format(archive))
+    print("Using prefix '{}'".format(prefix))
+    print("Using beautifier? '{}'".format(beautify))
+    print("Using {} threads".format(parallel))
+    print("Found {} three-letter-dirs".format(len(threeletterdirs)))
+
     for threeletterdir in threeletterdirs:
         ext_ids = list(set([d[:32] for d in os.listdir(threeletterdir)]))
         with Pool(parallel) as p:
             for txt in p.imap(
-                    partial(process_id, archivedir, outdir, greps, True),
-                    ext_ids):
+                    partial(process_id, archivedir, outdir, greps, beautify,
+                            True), ext_ids):
                 sys.stdout.write(txt)
                 sys.stdout.flush()
 
+    print("Sucessfully finished grepping")
+
 
 if __name__ == "__main__":
     main(sys.argv[1:])
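
Putting the new options together, an invocation might look like this (archive location, prefix, thread count, and pattern are all illustrative):

grepper -a archive -p aaa -b -t 8 out "chrome\.webRequest"
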