diff --git a/grepper b/grepper
index 3734bb9..79daef9 100755
--- a/grepper
+++ b/grepper
@@ -21,7 +21,6 @@ import os
import sys
import glob
import tarfile
-import tempfile
import traceback
import jsbeautifier
from multiprocessing import Pool
@@ -33,73 +32,38 @@ from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
-def get_name(overview_path):
- if os.path.exists(overview_path):
- with open(overview_path) as overview_file:
- contents = overview_file.read()
-
- # Extract extension name
- match = re.search("""""",
- contents)
- if match:
- return match.group(1)
-
-
-def get_downloads(overview_path):
- if os.path.exists(overview_path):
- with open(overview_path) as overview_file:
- contents = overview_file.read()
-
- match = re.search(
- """ archive directory")
- print(" -p= three-letter-prefix")
- print(" -t= number of parallel threads")
+ print(" -b beautify JavaScript before matching")
+ print(" -a archive directory")
+ print(" -p three-letter-prefix")
+ print(" -t number of threads to use")
-def process_date(extdir, ext_id, date, greps, out_f):
- overview_path = os.path.join(extdir, "overview.html")
- crxpath = next(iter(glob.glob(os.path.join(extdir, "*.crx"))), None)
- if crxpath and os.path.getsize(crxpath) > 0:
- contents = {}
- with ZipFile(crxpath) as f:
- for in_zip_f in f.infolist():
- if not in_zip_f.filename.endswith(".js"):
- continue
- with f.open(in_zip_f) as f2:
- content = f2.read().decode(errors="surrogateescape")
- contents[in_zip_f.filename] = jsbeautifier.beautify(
- content)
- jslocs = sum(
- [len(content.splitlines()) for content in contents.values()])
- name = get_name(overview_path)
- downloads = get_downloads(overview_path)
- for filename, content in contents.items():
- for i, line in enumerate(content.splitlines()):
- for gr in greps:
- if re.search(gr, line):
+def process_crx(ext_id, date, crx, greps, beautify, out_f):
+ with ZipFile(crx) as z:
+ for zip_file_info in z.infolist():
+ if not zip_file_info.filename.endswith(".js"):
+ continue
+ with z.open(zip_file_info) as f:
+ content = f.read().decode(errors="surrogateescape")
+ if beautify:
+ content = jsbeautifier.beautify(content)
+ for i, line in enumerate(content.splitlines()):
+ for gr in greps:
+ if not re.search(gr, line):
+ continue
args = [
- ext_id, date,
- name.replace("|", "") if name is not None else "",
- str(downloads) if downloads is not None else "",
- str(jslocs), gr,
- filename + " (line " + str(i + 1) + ")",
- line
+ ext_id, date, gr, zip_file_info.filename +
+ " (line " + str(i + 1) + ")", line
]
print("|".join(args), file=out_f)
-def process_id(archivedir, outdir, greps, verbose, ext_id):
+def process_id(archivedir, outdir, greps, beautify, verbose, ext_id):
txt = ""
txt = logmsg(verbose, txt, "Processing {} ...\n".format(ext_id))
@@ -108,22 +72,19 @@ def process_id(archivedir, outdir, greps, verbose, ext_id):
if os.path.exists(greppath):
os.remove(greppath)
with open(greppath, "w") as out_f:
- with tempfile.TemporaryDirectory() as tmpdir:
- with tarfile.open(tarpath) as t:
- t.extractall(tmpdir)
- iddir = os.path.join(tmpdir, ext_id)
-
- for date in sorted(os.listdir(iddir)):
- try:
- process_date(
- os.path.join(iddir, date), ext_id, date, greps,
- out_f)
- except Exception:
- txt = logmsg(
- verbose, txt,
- "Exception when handling {} on {}:\n".format(
- ext_id, date))
- txt = logmsg(verbose, txt, traceback.format_exc())
+ with tarfile.open(tarpath, 'r') as t:
+ for tar_info in t.getmembers():
+                if not tar_info.name.endswith(".crx") or tar_info.size == 0:
+ continue
+ date = tar_info.name.split("/")[1]
+ try:
+ with t.extractfile(tar_info) as crx:
+ process_crx(ext_id, date, crx, greps, beautify, out_f)
+ except Exception:
+ txt = logmsg(
+ verbose, txt,
+ "Exception when handling {}:\n".format(tar_info.name))
+ txt = logmsg(verbose, txt, traceback.format_exc())
return txt
@@ -131,10 +92,11 @@ def process_id(archivedir, outdir, greps, verbose, ext_id):
def main(argv):
archive = "archive"
prefix = ""
+ beautify = False
parallel = 8
try:
- opts, args = getopt.getopt(argv, "ha:p:t:",
- ["archive=", "prefix=", "threads="])
+ opts, args = getopt.getopt(
+ argv, "ha:p:bt:", ["archive=", "prefix=", "beautify", "threads="])
except getopt.GetoptError:
help()
sys.exit(2)
@@ -146,6 +108,8 @@ def main(argv):
archive = arg
elif opt in ("-p", "--prefix"):
prefix = arg
+ elif opt in ("-b", "--beautify"):
+ beautify = True
elif opt in ("-t", "--threads"):
parallel = int(arg)
@@ -158,15 +122,24 @@ def main(argv):
archivedir = os.path.join(archive, "data")
threeletterdirs = glob.glob(os.path.join(archivedir, prefix + "*"))
+
+ print("Using archive '{}'".format(archive))
+ print("Using prefix '{}'".format(prefix))
+ print("Using beautifier? '{}'".format(beautify))
+ print("Using {} threads".format(parallel))
+ print("Found {} three-letter-dirs".format(len(threeletterdirs)))
+
for threeletterdir in threeletterdirs:
ext_ids = list(set([d[:32] for d in os.listdir(threeletterdir)]))
with Pool(parallel) as p:
for txt in p.imap(
- partial(process_id, archivedir, outdir, greps, True),
- ext_ids):
+ partial(process_id, archivedir, outdir, greps, beautify,
+ True), ext_ids):
sys.stdout.write(txt)
sys.stdout.flush()
+    print("Successfully finished grepping")
+
if __name__ == "__main__":
main(sys.argv[1:])