Updated grepper to use the extfind helpers for locating extension archives and splitting them across tasks.

Michael Herzberg 2017-09-27 14:05:16 +01:00
parent 2cfeb9b88c
commit f37e19f46a
2 changed files with 29 additions and 33 deletions

extfind.py (symbolic link, 1 line added)

@@ -0,0 +1 @@
extfind
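The extfind.py symlink points at the extension-less extfind script, presumably so that grepper can import it as a regular Python module. A minimal sketch of that assumption, using only names that appear in this diff:

# extfind itself has no .py suffix, so Python's import machinery cannot load it
# by name; the extfind.py -> extfind symlink makes the import below work.
from extfind import iter_extension_paths, iter_extension_paths_from_file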

grepper (61 lines changed)

@@ -28,6 +28,8 @@ from zipfile import ZipFile
from functools import partial
import re
from ExtensionCrawler.config import const_basedir
from extfind import iter_extension_paths, iter_extension_paths_from_file

def help():
    print("grepper [OPTION] GREP [FILE]")
@@ -36,7 +38,7 @@ def help():
    print(" -h print this help text")
    print(" -b beautify JavaScript before matching")
    print(" -a <DIR> archive directory")
    print(" -p <PREFIX> three-letter-prefix")
    print(" -g <GLOB> glob on the extension id, don't use with -e")
    print(" -e <EXTIDFILELIST> file with extension ids")
    print(" -t <THREADS> number of threads to use")
    print(" -n <TASKID> process chunk n where n in [1,N]")
@@ -54,6 +56,12 @@ def guarded_stderr(string):
    sys.stderr.write(string)
    lock.release()

def has_at_least_one_match(content, pattern):
    for line in content.splitlines():
        if re.search(pattern, line):
            return True
    return False

def process_crx(ext_id, date, crx, pattern, beautify):
    try:
@@ -64,7 +72,8 @@ def process_crx(ext_id, date, crx, pattern, beautify):
                with z.open(zip_file_info) as f:
                    content = f.read().decode(errors="surrogateescape")
                    if beautify:
                        content = jsbeautifier.beautify(content)
                        if has_at_least_one_match(content, pattern):
                            content = jsbeautifier.beautify(content)
                    for i, line in enumerate(content.splitlines()):
                        if not re.search(pattern, line):
                            continue
@@ -93,24 +102,6 @@ def process_id(pattern, beautify, path):
            path, traceback.format_exc()))
def find(archive, pattern):
    for root, _, files in os.walk(os.path.join(archive, "data")):
        for file in files:
            if fnmatch.fnmatch(file, pattern + ".tar"):
                yield os.path.join(root, file)

def find_from_file(archive, extidlistfile):
    with open(extidlistfile, 'r') as f:
        extids = [l.strip() for l in f.readlines()]
    for root, _, files in os.walk(os.path.join(archive, "data")):
        for file in files:
            for extid in extids:
                if fnmatch.fnmatch(file, extid + ".tar"):
                    yield os.path.join(root, file)
def init(l):
    global lock
    lock = l
@@ -120,13 +111,16 @@ def parse_args(argv):
    archive = const_basedir()
    beautify = False
    parallel = 8
    extidglob = None
    extidlistfile = None
    taskid = 1
    maxtaskid = 1
    paths = []
    try:
        opts, args = getopt.getopt(argv, "ha:p:e:bt:n:N:", [
            "archive=", "prefix=", "extidlistfile=", "beautify", "threads=",
            "archive=", "glob=", "extidlistfile=", "beautify", "threads=",
            "taskid=", "maxtaskid="
        ])
    except getopt.GetoptError:
@@ -138,12 +132,10 @@
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-p", "--prefix"):
            prefix = arg
            paths += find(archive, prefix + "*")
        elif opt in ("-g", "--glob"):
            extidglob = arg
        elif opt in ("-e", "--extidlistfile"):
            extidlistfile = arg
            paths += find_from_file(archive, extidlistfile)
        elif opt in ("-b", "--beautify"):
            beautify = True
        elif opt in ("-t", "--threads"):
@@ -158,15 +150,18 @@
        sys.exit(2)
    pattern = args[0]
    paths += args[1:]
    if paths == []:
        paths = list(find(archive, "*"))
    chunksize = int(len(paths) / maxtaskid)
    if taskid == maxtaskid:
        paths = paths[(taskid - 1) * chunksize:]
    if len(args) > 1:
        paths = args[1:]
    elif extidglob is None and extidlistfile is None:
        paths = iter_extension_paths(archive, taskid, maxtaskid)
    elif extidglob is None and extidlistfile is not None:
        paths = iter_extension_paths_from_file(archive, taskid, maxtaskid,
                                               extidlistfile)
    elif extidglob is not None and extidlistfile is None:
        paths = iter_extension_paths(archive, taskid, maxtaskid, extidglob)
    else:
        paths = paths[(taskid - 1) * chunksize:taskid * chunksize]
        help()
        sys.exit(2)
    return pattern, paths, beautify, parallel
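The extfind helpers called above are not shown in this diff. For context, here is a minimal sketch of what they might look like, reconstructed from the removed find()/find_from_file() functions and the old taskid/maxtaskid chunking; the signatures follow the call sites, everything else is an assumption:

# Hypothetical reconstruction of the extfind helpers used by grepper.
# Behaviour is inferred from the removed find()/find_from_file() functions
# and the old taskid/maxtaskid chunking; the real module may differ.
import fnmatch
import os

def _chunk(paths, taskid, maxtaskid):
    # Split the path list into maxtaskid chunks and return chunk number taskid.
    chunksize = int(len(paths) / maxtaskid)
    if taskid == maxtaskid:
        return paths[(taskid - 1) * chunksize:]
    return paths[(taskid - 1) * chunksize:taskid * chunksize]

def iter_extension_paths(archive, taskid, maxtaskid, extidglob="*"):
    # Collect <extid>.tar files under the archive's data directory that match
    # the glob, then hand back this task's share of the work.
    paths = []
    for root, _, files in os.walk(os.path.join(archive, "data")):
        for file in files:
            if fnmatch.fnmatch(file, extidglob + ".tar"):
                paths.append(os.path.join(root, file))
    return _chunk(paths, taskid, maxtaskid)

def iter_extension_paths_from_file(archive, taskid, maxtaskid, extidlistfile):
    # Same idea, restricted to the extension ids listed in extidlistfile.
    with open(extidlistfile) as f:
        extids = [line.strip() for line in f]
    paths = []
    for root, _, files in os.walk(os.path.join(archive, "data")):
        for file in files:
            if any(fnmatch.fnmatch(file, extid + ".tar") for extid in extids):
                paths.append(os.path.join(root, file))
    return _chunk(paths, taskid, maxtaskid)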