Updated extgrep.

Michael Herzberg 2019-02-13 22:51:48 +00:00
parent c60902f0a6
commit 1aab16fe69
1 changed file with 130 additions and 55 deletions

extgrep

@@ -24,6 +24,7 @@ import fnmatch
import os
import logging
import re
import json
import sys
import operator
import tarfile
@@ -36,6 +37,7 @@ from zipfile import ZipFile
import dateutil
import dateutil.parser
import jsbeautifier
import importlib.util
from zipfile import ZipFile
@@ -53,30 +55,102 @@ def is_source_file(zipentry):
            or zipentry.filename.endswith(".css.gz") or zipentry.filename.endswith(".c")
            or zipentry.filename.endswith(".cpp") or zipentry.filename.endswith(".java"))
def import_regexs(path):
    spec = importlib.util.spec_from_file_location("MinerStrings", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
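handle_extid below expects the module loaded from REGEXP_FILE to provide a MinerStrings class whose strings and patterns attributes each map a tag to a list of search terms (literal substrings and regular expressions, respectively). A minimal sketch of such a file, with invented tag names and terms:

class MinerStrings:
    def __init__(self):
        # tag -> literal substrings, matched with `in`
        self.strings = {
            "example-literal": ["example.string.literal"],
        }
        # tag -> regular expressions, matched with re.search
        self.patterns = {
            "example-pattern": [r"example-[0-9]+"],
        }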

def get_etag(header_tarentry):
    headers_content = header_tarentry.read().decode().replace(
        '"', '\\"').replace("'", '"')
    headers_json = json.loads(headers_content)
    if "ETag" in headers_json:
        return headers_json["ETag"]

def get_name_and_version(overview_tarentry):
    contents = overview_tarentry.read().decode()

    # Extract extension name
    match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
                      contents)
    name = match.group(1) if match else None

    # Extract extension version
    match = re.search(
        """<meta itemprop="version" content="(.*?)"\s*/>""", contents)
    version = match.group(1) if match else None

    return name, version
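Both regular expressions above target meta tags in the extension's archived overview.html page. An invented fragment of the markup they are written against:

overview_sample = ('<meta itemprop="name" content="Some Extension"/>'
                   '<meta itemprop="version" content="1.2.3"/>')
# On such content, get_name_and_version would return ("Some Extension", "1.2.3");
# if either tag is missing, the corresponding value is None.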

def handle_extid(conf, extid):
    miner_strings = import_regexs(conf.REGEXP_FILE).MinerStrings()

    results = {}

    still_in_store = None
    crx_etags = [None]
    for tarentry, tarfile in iter_tar_entries(conf.archive_dir, extid):
        if tarentry.name.endswith(".crx"):
        if tarentry.isdir():
            continue

        date = tarentry.name.split("/")[1]
        if conf.from_date and not (conf.from_date <= date):
            continue
        if conf.latest_date and not (date <= conf.latest_date):
            continue

        if date not in results:
            results[date] = {}
            results[date]["crx_etag"] = None
            results[date]["name"] = None
            results[date]["version"] = None
            results[date]["matches"] = []

        tar_file_name = tarentry.name.split("/")[-1]

        if tar_file_name.endswith(".crx.headers"):
            crx_etag = get_etag(tarfile)
            results[date]["crx_etag"] = crx_etag
            if crx_etag:
                crx_etags += [crx_etag]

        if tar_file_name == "overview.html":
            results[date]["name"], results[date]["version"] = get_name_and_version(tarfile)

        if tar_file_name == "overview.html.status":
            still_in_store = tarfile.read().decode().startswith("2")

        if tar_file_name.endswith(".crx"):
            with ZipFile(tarfile) as zf:
                for zipentry in zf.infolist():
                    if is_source_file(zipentry):
                        with zf.open(zipentry) as f:
                            for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
                                merged_strings = "".join(map(lambda x: x[1], block.string_literals))
                                print(merged_strings)
                                # for pattern_group in regex_patterns:
                                #     for pattern in regex_patterns[pattern_group]:
                                #         if re.search(pattern, merged_strings):
                                #             if pattern_group not in matches:
                                #                 matches[pattern_group] = []
                                #             matches[pattern_group] += [match]
                                #             matches.add(pattern_group)
                                # for pattern_group in string_patterns:
                                #     for pattern in string_patterns[pattern_group]:
                                #         if pattern in merged_strings:
                                #             matches.add(pattern_group)
                                file_lines = []
                                file_lines += block.content.splitlines()
                                file_lines += "".join(map(lambda x: x[1], block.string_literals)).splitlines()

                                for search_tag in miner_strings.strings.keys():
                                    for search_string in miner_strings.strings[search_tag]:
                                        for line in file_lines:
                                            if search_string in line:
                                                results[date]["matches"] += [[zipentry.filename, search_tag, search_string]]
                                                break

                                for search_tag in miner_strings.patterns.keys():
                                    for search_pattern in miner_strings.patterns[search_tag]:
                                        for line in file_lines:
                                            m = re.search(search_pattern, line)
                                            if m:
                                                results[date]["matches"] += [[zipentry.filename, search_tag, m.group()]]
                                                break

    # for extid, still_in_store, most_recent_crx_etag, date, crx_etag, name, version, path, tag, match
    for date in sorted(results.keys()):
        result = results[date]
        for match in result["matches"]:
            print("|".join([str(x) for x in ([extid, still_in_store, crx_etags[-1], date, result["crx_etag"], result["name"], result["version"]] + match)]))
def main(conf):

@@ -90,7 +164,8 @@ def main(conf):
        logger.setLevel(logging.WARNING)

    with open(conf.EXTID_FILE) as f:
        for extid in f.readlines():
        print("|".join(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version", "path", "tag", "match"]))
        for extid in [l.strip() for l in f.readlines()]:
            handle_extid(conf, extid)
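The header row printed in main fixes the output format: one pipe-separated line per match, with the columns named in the comment near the end of handle_extid. An invented example row (all values are placeholders):

extid|still_in_store|most_recent_crx_etag|date|crx_etag|name|version|path|tag|match
aaaabbbbccccddddeeeeffffgggghhhh|True|"etag-2"|2019-02-13|"etag-2"|Some Extension|1.2.3|background.js|example-pattern|example-42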
@@ -101,7 +176,7 @@ if __name__ == "__main__":
        description='Grep for extensions.')
    main_parser.add_argument(
        'REGEXP_FILE',
        help='file with regular expressions')
        help='python file with regular expressions')
    main_parser.add_argument(
        'EXTID_FILE',
        help='file with extension ids')
@@ -139,47 +214,47 @@ if __name__ == "__main__":
        default=const_basedir(),
        help='archive directory')

    comment_group = main_parser.add_argument_group('comment blocks')
    comment_group.add_argument(
        '-g',
        '--group-single-line-comments',
        help='Group consecutive singe-line comments into blocks')
    comment_group.add_argument(
        '-c',
        '--reg-exp-comments',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search comments for regular expression')
    # comment_group = main_parser.add_argument_group('comment blocks')
    # comment_group.add_argument(
    #     '-g',
    #     '--group-single-line-comments',
    #     help='Group consecutive singe-line comments into blocks')
    # comment_group.add_argument(
    #     '-c',
    #     '--reg-exp-comments',
    #     metavar='REGEXP',
    #     type=str,
    #     nargs='+',
    #     help='search comments for regular expression')

    source_group = main_parser.add_argument_group('source blocks')
    source_group.add_argument(
        '-b',
        '--beautify',
        action='store_true',
        default=False,
        help='beautify source code')
    source_group.add_argument(
        '-s',
        '--reg-exp-source',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search source for regular expression')
    # source_group = main_parser.add_argument_group('source blocks')
    # source_group.add_argument(
    #     '-b',
    #     '--beautify',
    #     action='store_true',
    #     default=False,
    #     help='beautify source code')
    # source_group.add_argument(
    #     '-s',
    #     '--reg-exp-source',
    #     metavar='REGEXP',
    #     type=str,
    #     nargs='+',
    #     help='search source for regular expression')

    strings_group = main_parser.add_argument_group('string literals')
    strings_group.add_argument(
        '-j',
        '--join-string-literals',
        action='store_true',
        help='join string literals (heuristic)')
    strings_group.add_argument(
        '-l',
        '--reg-exp-string-literals',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search string literals for regular expression')
    # strings_group = main_parser.add_argument_group('string literals')
    # strings_group.add_argument(
    #     '-j',
    #     '--join-string-literals',
    #     action='store_true',
    #     help='join string literals (heuristic)')
    # strings_group.add_argument(
    #     '-l',
    #     '--reg-exp-string-literals',
    #     metavar='REGEXP',
    #     type=str,
    #     nargs='+',
    #     help='search string literals for regular expression')

    main_conf = main_parser.parse_args()
    sys.exit(main(main_conf))
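A run needs the MinerStrings definition file and a newline-separated list of extension ids; the archive directory defaults to const_basedir(). With placeholder file names, an invocation might look like:

./extgrep minerstrings.py extension_ids.txt > matches.txt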