Merge branch 'master' of logicalhacking.com:BrowserSecurity/ExtensionCrawler

Achim D. Brucker 2019-02-27 19:37:13 +00:00
commit 4ffc51e6b9
2 changed files with 135 additions and 95 deletions

ExtensionCrawler/archive.py

@@ -36,6 +36,7 @@ import datetime
import dateutil
import dateutil.parser
import requests
from itertools import groupby
from ExtensionCrawler.config import (
    const_review_payload, const_review_search_url, const_download_url,
@@ -638,7 +639,8 @@ def iter_tar_entries_from_file_ext(archivedir, extid, ext):
    tar = os.path.join(archivedir, get_local_archive_dir(extid), extid + ext)
    with tarfile.open(tar, 'r') as tf:
        for tarentry in tf:
            yield (tarentry, tf.extractfile(tarentry))
            if tarentry.isfile():
                yield (tarentry, tf.extractfile(tarentry))

def iter_tar_entries(archivedir, extid):
    for i in range(1000):
@@ -651,3 +653,6 @@ def iter_tar_entries(archivedir, extid):
        ext = ".tar"
        for (tarentry, tarfile) in iter_tar_entries_from_file_ext(archivedir, extid, ext):
            yield (tarentry, tarfile)

def iter_tar_entries_by_date(archivedir, extid):
    return groupby(iter_tar_entries(archivedir, extid), lambda tup: tup[0].name.split("/")[1])
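
The new iter_tar_entries_by_date groups the per-extension tar members by the date component of their archive path. A minimal usage sketch; the archive directory and extension id below are placeholders, not values from this commit:

from ExtensionCrawler.archive import iter_tar_entries_by_date

archivedir = "/srv/archive"  # hypothetical archive location
extid = "a" * 32             # hypothetical extension id

# Member names have the form "<extid>/<date>/<filename>", so
# tup[0].name.split("/")[1] is the snapshot date; groupby yields one
# (date, entries) pair per consecutive run of members with that date.
for date, entries in iter_tar_entries_by_date(archivedir, extid):
    for tarentry, fileobj in entries:
        print(date, tarentry.name)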

extgrep

@@ -17,66 +17,135 @@
#
# SPDX-License-Identifier: GPL-3.0-or-later
import datetime
import argparse
import io
import fnmatch
import os
import logging
import re
import json
import sys
import operator
import tarfile
import zlib
from functools import partial, reduce
from colorama import init, Fore
from multiprocessing import Pool
from zipfile import ZipFile
import dateutil
import dateutil.parser
import jsbeautifier
import importlib.util
import csv
import math
from zipfile import ZipFile
from ExtensionCrawler.config import (const_log_format, const_basedir)
from ExtensionCrawler.archive import iter_tar_entries
from ExtensionCrawler.config import get_local_archive_dir
from ExtensionCrawler.js_decomposer import init_file_info
from ExtensionCrawler.archive import iter_tar_entries_by_date
from ExtensionCrawler.js_mincer import mince_js
def is_source_file(zipentry):
    """Test if the filename indicates a file with C-style comments."""
    return (zipentry.filename.endswith(".js") or zipentry.filename.endswith(".js.gz")
            or zipentry.filename.endswith(".jgz") or zipentry.filename.endswith(".jsg")
            or zipentry.filename.endswith(".css.gz") or zipentry.filename.endswith(".c")
            or zipentry.filename.endswith(".cpp") or zipentry.filename.endswith(".java"))

def handle_extid(conf, extid):
    for tarentry, tarfile in iter_tar_entries(conf.archive_dir, extid):
        if tarentry.name.endswith(".crx"):
            with ZipFile(tarfile) as zf:
                for zipentry in zf.infolist():
                    if is_source_file(zipentry):
                        with zf.open(zipentry) as f:
                            for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
                                merged_strings = "".join(map(lambda x: x[1], block.string_literals))
                                print(merged_strings)
                                # for pattern_group in regex_patterns:
                                #     for pattern in regex_patterns[pattern_group]:
                                #         if re.search(pattern, merged_strings):
                                #             if pattern_group not in matches:
                                #                 matches[pattern_group] = []
                                #             matches[pattern_group] += [match]
                                #             matches.add(pattern_group)
                                # for pattern_group in string_patterns:
                                #     for pattern in string_patterns[pattern_group]:
                                #         if pattern in merged_strings:
                                #             matches.add(pattern_group)

def get_shannon_entropy(string):
    """
    This code has been borrowed from
    "http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html" and
    "git@github.com:dxa4481/truffleHog.git"
    """
    chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    if not string:
        return 0
    entropy = 0
    for x in chars:
        p_x = float(string.count(x)) / len(string)
        if p_x > 0:
            entropy += -p_x * math.log(p_x, 2)
    return entropy

def is_likely_hash(string):
    return get_shannon_entropy(string) > 2.0 and len([c for c in string if c.isdigit()]) > 4
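
For reference, a few values computed by hand from the two definitions above:

get_shannon_entropy("aaaa")        # 0.0: a single repeated symbol carries no information
get_shannon_entropy("abcd1234")    # 3.0: eight equiprobable symbols = 3 bits per symbol
is_likely_hash("abcd1234")         # False: entropy is high, but only four digits (more than four required)
is_likely_hash("4ffc51e6b9")       # True: five digits and entropy of about 3.1 bits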
def import_regexs(path):
    spec = importlib.util.spec_from_file_location("MinerStrings", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
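
handle_extid below expects REGEXP_FILE to be a Python module defining a MinerStrings class with `strings` and `patterns` dict attributes. The pattern file itself is not part of this commit, so the tags and patterns in this minimal sketch are made up for illustration; only the attribute names are dictated by extgrep:

# hypothetical miner_strings.py, passed as REGEXP_FILE
class MinerStrings:
    def __init__(self):
        # tag -> list of literal substrings to search for
        self.strings = {
            "COINHIVE": ["coinhive.min.js"],
        }
        # tag -> list of regular expressions; matches for this tag are
        # additionally filtered through is_likely_hash() in handle_extid
        self.patterns = {
            "MINING_KEYS_REGEX": ["[a-zA-Z0-9]{32}"],
        }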
def get_etag(headers_content):
    # The .crx.headers file stores the string representation of a Python
    # dict; crudely rewrite it into JSON before parsing.
    headers_content = headers_content.replace(
        '"', '\\"').replace("'", '"')
    headers_json = json.loads(headers_content)
    if "ETag" in headers_json:
        return headers_json["ETag"]

def get_name_and_version(overview_contents):
    # Extract extension name
    match = re.search(r"""<meta itemprop="name" content="(.*?)"\s*/>""",
                      overview_contents)
    name = match.group(1) if match else None

    # Extract extension version
    match = re.search(
        r"""<meta itemprop="version" content="(.*?)"\s*/>""", overview_contents)
    version = match.group(1) if match else None

    return name, version
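
The meta tags these two regexes parse look like the following; with example values, the call behaves like:

# assuming an overview.html fragment with made-up values:
html = ('<meta itemprop="name" content="Some Extension" />'
        '<meta itemprop="version" content="1.2.3" />')
get_name_and_version(html)  # -> ("Some Extension", "1.2.3")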
def handle_extid(conf, extid, csvwriter):
    miner_strings = import_regexs(conf.REGEXP_FILE).MinerStrings()

    results = []

    still_in_store = None
    crx_etags = [None]
    for date, tups in iter_tar_entries_by_date(conf.archive_dir, extid):
        if conf.from_date and not (conf.from_date <= date):
            continue
        if conf.latest_date and not (date <= conf.latest_date):
            continue

        crx_etag = None
        name = None
        version = None
        matches = []
        for tarentry, tarfile in tups:
            tarentry_filename = tarentry.name.split("/")[-1]

            if tarentry_filename.endswith(".crx.headers"):
                crx_etag = get_etag(tarfile.read().decode())
                if crx_etag:
                    crx_etags += [crx_etag]

            if tarentry_filename == "overview.html":
                name, version = get_name_and_version(tarfile.read().decode())

            if tarentry_filename == "overview.html.status":
                still_in_store = tarfile.read().decode().startswith("2")

            if tarentry_filename.endswith(".crx"):
                with ZipFile(tarfile) as zf:
                    for zipentry in zf.infolist():
                        if zipentry.filename.endswith(".js"):
                            with zf.open(zipentry) as f:
                                for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
                                    file_lines = []
                                    file_lines += block.content.splitlines()
                                    file_lines += "".join(map(lambda x: x[1], block.string_literals)).splitlines()

                                    for search_tag in miner_strings.strings.keys():
                                        for search_string in miner_strings.strings[search_tag]:
                                            for line in file_lines:
                                                if search_string in line:
                                                    matches += [[zipentry.filename, search_tag, search_string]]
                                                    break

                                    for search_tag in miner_strings.patterns.keys():
                                        for search_pattern in miner_strings.patterns[search_tag]:
                                            for line in file_lines:
                                                m = re.search(search_pattern, line)
                                                if m:
                                                    matched_string = m.group()
                                                    # for the key regex, require the match to actually look like a key/hash
                                                    if search_tag != "MINING_KEYS_REGEX" or is_likely_hash(matched_string):
                                                        matches += [[zipentry.filename, search_tag, matched_string]]
                                                        break

        for match in matches:
            results += [[date, crx_etag, name, version] + match]

    for result in results:
        csvwriter.writerow([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)])
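
Each match thus becomes one CSV row in the column order of the header written in main(); schematically, with all values made up:

# extid, still_in_store, most_recent_crx_etag, date, crx_etag, name, version, path, tag, match
# "a"*32, True, "xyz", 2018-01-01, "xyz", Some Extension, 1.2.3, js/bg.js, COINHIVE, coinhive.min.js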
def main(conf):
@@ -90,18 +159,19 @@ def main(conf):
    logger.setLevel(logging.WARNING)

    with open(conf.EXTID_FILE) as f:
        for extid in f.readlines():
            handle_extid(conf, extid)
        csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
        csvwriter.writerow(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version", "path", "tag", "match"])
        for extid in [l.strip() for l in f.readlines()]:
            handle_extid(conf, extid, csvwriter)

if __name__ == "__main__":
def build_parser():
    main_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Grep for extensions.')
    main_parser.add_argument(
        'REGEXP_FILE',
        help='file with regular expressions')
        help='Python file with regular expressions')
    main_parser.add_argument(
        'EXTID_FILE',
        help='file with extension ids')
@@ -119,8 +189,8 @@ if __name__ == "__main__":
        metavar='DATE',
        type=str,
        help='select latest crx from tar, released before DATE.\n' +
        'Together with --from-date, specifies all crx released in specified\n'
        + 'date range.')
        'Together with --from-date, specifies all crx released in specified\n' +
        'date range.')
    main_parser.add_argument(
        '-d',
@@ -128,8 +198,8 @@ if __name__ == "__main__":
        metavar='DATE',
        type=str,
        help='select oldest crx from tar released after DATE.\n' +
        'Together with --latest-date, specifies all crx released in specified\n'
        + 'date range.')
        'Together with --latest-date, specifies all crx released in specified\n' +
        'date range.')
    main_parser.add_argument(
        '-a',
@@ -139,47 +209,12 @@ if __name__ == "__main__":
        default=const_basedir(),
        help='archive directory')

    comment_group = main_parser.add_argument_group('comment blocks')
    comment_group.add_argument(
        '-g',
        '--group-single-line-comments',
        help='Group consecutive single-line comments into blocks')
    comment_group.add_argument(
        '-c',
        '--reg-exp-comments',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search comments for regular expression')

    return main_parser

    source_group = main_parser.add_argument_group('source blocks')
    source_group.add_argument(
        '-b',
        '--beautify',
        action='store_true',
        default=False,
        help='beautify source code')
    source_group.add_argument(
        '-s',
        '--reg-exp-source',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search source for regular expression')

    strings_group = main_parser.add_argument_group('string literals')
    strings_group.add_argument(
        '-j',
        '--join-string-literals',
        action='store_true',
        help='join string literals (heuristic)')
    strings_group.add_argument(
        '-l',
        '--reg-exp-string-literals',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search string literals for regular expression')

if __name__ == "__main__":
    main_parser = build_parser()
    main_conf = main_parser.parse_args()
    sys.exit(main(main_conf))