Merge branch 'master' of logicalhacking.com:BrowserSecurity/ExtensionCrawler

Achim D. Brucker 2019-02-27 19:37:13 +00:00
commit 4ffc51e6b9
2 changed files with 135 additions and 95 deletions

ExtensionCrawler/archive.py

@@ -36,6 +36,7 @@ import datetime
 import dateutil
 import dateutil.parser
 import requests
+from itertools import groupby
 from ExtensionCrawler.config import (
     const_review_payload, const_review_search_url, const_download_url,
@@ -638,7 +639,8 @@ def iter_tar_entries_from_file_ext(archivedir, extid, ext):
     tar = os.path.join(archivedir, get_local_archive_dir(extid), extid + ext)
     with tarfile.open(tar, 'r') as tf:
         for tarentry in tf:
-            yield (tarentry, tf.extractfile(tarentry))
+            if tarentry.isfile():
+                yield (tarentry, tf.extractfile(tarentry))


 def iter_tar_entries(archivedir, extid):
     for i in range(1000):
@@ -651,3 +653,6 @@ def iter_tar_entries(archivedir, extid):
         ext = ".tar"
     for (tarentry, tarfile) in iter_tar_entries_from_file_ext(archivedir, extid, ext):
         yield (tarentry, tarfile)
+
+def iter_tar_entries_by_date(archivedir, extid):
+    return groupby(iter_tar_entries(archivedir, extid), lambda tup: tup[0].name.split("/")[1])

extgrep

@@ -17,66 +17,135 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later

+import datetime
 import argparse
 import io
+import fnmatch
+import os
 import logging
 import re
+import json
 import sys
-import operator
-import tarfile
-import zlib
-from functools import partial, reduce
-from colorama import init, Fore
-from multiprocessing import Pool
-from zipfile import ZipFile
-import dateutil
-import dateutil.parser
-import jsbeautifier
+import importlib.util
+import csv
+import math
 from zipfile import ZipFile
 from ExtensionCrawler.config import (const_log_format, const_basedir)
-from ExtensionCrawler.archive import iter_tar_entries
-from ExtensionCrawler.config import get_local_archive_dir
-from ExtensionCrawler.js_decomposer import init_file_info
+from ExtensionCrawler.archive import iter_tar_entries_by_date
 from ExtensionCrawler.js_mincer import mince_js

-def is_source_file(zipentry):
-    """Test if filename indicates file with C-style comment."""
-    return (zipentry.filename.endswith(".js") or zipentry.filename.endswith(".js.gz")
-            or zipentry.filename.endswith(".jgz") or zipentry.filename.endswith(".jsg")
-            or zipentry.filename.endswith(".css.gz") or zipentry.filename.endswith(".c")
-            or zipentry.filename.endswith(".cpp") or zipentry.filename.endswith(".java"))
-
-
-def handle_extid(conf, extid):
-    for tarentry, tarfile in iter_tar_entries(conf.archive_dir, extid):
-        if tarentry.name.endswith(".crx"):
-            with ZipFile(tarfile) as zf:
-                for zipentry in zf.infolist():
-                    if is_source_file(zipentry):
-                        with zf.open(zipentry) as f:
-                            for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
-                                merged_strings = "".join(map(lambda x: x[1], block.string_literals))
-                                print(merged_strings)
-                                # for pattern_group in regex_patterns:
-                                #     for pattern in regex_patterns[pattern_group]:
-                                #         if re.search(pattern, merged_strings):
-                                #             if pattern_group not in matches:
-                                #                 matches[pattern_group] = []
-                                #             matches[pattern_group] += [match]
-                                #             matches.add(pattern_group)
-                                # for pattern_group in string_patterns:
-                                #     for pattern in string_patterns[pattern_group]:
-                                #         if pattern in merged_strings:
-                                #             matches.add(pattern_group)
+
+def get_shannon_entropy(string):
+    """
+    This code has been borrowed from
+    "http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html" and
+    "git@github.com:dxa4481/truffleHog.git"
+    """
+    chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    if not string:
+        return 0
+    entropy = 0
+    for x in chars:
+        p_x = float(string.count(x)) / len(string)
+        if p_x > 0:
+            entropy += -p_x * math.log(p_x, 2)
+    return entropy
+
+
+def is_likely_hash(string):
+    return get_shannon_entropy(string) > 2.0 and len([c for c in string if c.isdigit()]) > 4
+
+
+def import_regexs(path):
+    spec = importlib.util.spec_from_file_location("MinerStrings", path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def get_etag(headers_content):
+    headers_content = headers_content.replace(
+        '"', '\\"').replace("'", '"')
+    headers_json = json.loads(headers_content)
+    if "ETag" in headers_json:
+        return headers_json["ETag"]
+
+
+def get_name_and_version(overview_contents):
+    # Extract extension name
+    match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
+                      overview_contents)
+    name = match.group(1) if match else None
+
+    # Extract extension version
+    match = re.search(
+        """<meta itemprop="version" content="(.*?)"\s*/>""", overview_contents)
+    version = match.group(1) if match else None
+
+    return name, version
+
+
+def handle_extid(conf, extid, csvwriter):
+    miner_strings = import_regexs(conf.REGEXP_FILE).MinerStrings()
+
+    results = []
+    still_in_store = None
+    crx_etags = [None]
+    for date, tups in iter_tar_entries_by_date(conf.archive_dir, extid):
+        if conf.from_date and not (conf.from_date <= date):
+            continue
+        if conf.latest_date and not (date <= conf.latest_date):
+            continue
+
+        crx_etag = None
+        name = None
+        version = None
+        matches = []
+        for tarentry, tarfile in tups:
+            tarentry_filename = tarentry.name.split("/")[-1]
+
+            if tarentry_filename.endswith(".crx.headers"):
+                crx_etag = get_etag(tarfile.read().decode())
+                if crx_etag:
+                    crx_etags += [crx_etag]
+            if tarentry_filename == "overview.html":
+                name, version = get_name_and_version(tarfile.read().decode())
+            if tarentry_filename == "overview.html.status":
+                still_in_store = tarfile.read().decode().startswith("2")
+            if tarentry_filename.endswith(".crx"):
+                with ZipFile(tarfile) as zf:
+                    for zipentry in zf.infolist():
+                        if zipentry.filename.endswith(".js"):
+                            with zf.open(zipentry) as f:
+                                for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
+                                    file_lines = []
+                                    file_lines += block.content.splitlines()
+                                    file_lines += "".join(map(lambda x: x[1], block.string_literals)).splitlines()
+
+                                    for search_tag in miner_strings.strings.keys():
+                                        for search_string in miner_strings.strings[search_tag]:
+                                            for line in file_lines:
+                                                if search_string in line:
+                                                    matches += [[zipentry.filename, search_tag, search_string]]
+                                                    break
+                                    for search_tag in miner_strings.patterns.keys():
+                                        for search_pattern in miner_strings.patterns[search_tag]:
+                                            for line in file_lines:
+                                                m = re.search(search_pattern, line)
+                                                if m:
+                                                    matched_string = m.group()
+                                                    if search_tag != "MINING_KEYS_REGEX" or is_likely_hash(matched_string):
+                                                        matches += [[zipentry.filename, search_tag, matched_string]]
+                                                        break
+
+        for match in matches:
+            results += [[date, crx_etag, name, version] + match]
+
+    for result in results:
+        csvwriter.writerow([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)])


 def main(conf):
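
The new handle_extid loads its search terms from conf.REGEXP_FILE via
import_regexs and instantiates the MinerStrings class defined there, reading
its strings (tag -> plain substrings) and patterns (tag -> regular
expressions) dictionaries; matches for the MINING_KEYS_REGEX tag are
additionally filtered through is_likely_hash. A hypothetical REGEXP_FILE
(tags and values below are illustrative, not the project's actual rule set):

    # miner_strings.py -- example input for import_regexs()
    class MinerStrings:
        def __init__(self):
            # tag -> plain substrings, matched with "in"
            self.strings = {
                "COINHIVE": ["coinhive.min.js", "CoinHive.Anonymous"],
            }
            # tag -> regular expressions, matched with re.search()
            self.patterns = {
                "MINING_KEYS_REGEX": [r"[a-zA-Z0-9]{32}"],
            }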
@@ -90,18 +159,19 @@ def main(conf):
         logger.setLevel(logging.WARNING)

     with open(conf.EXTID_FILE) as f:
-        for extid in f.readlines():
-            handle_extid(conf, extid)
+        csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
+        csvwriter.writerow(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version", "path", "tag", "match"])
+        for extid in [l.strip() for l in f.readlines()]:
+            handle_extid(conf, extid, csvwriter)


-if __name__ == "__main__":
+def build_parser():
     main_parser = argparse.ArgumentParser(
         formatter_class=argparse.RawTextHelpFormatter,
         description='Grep for extensions.')
     main_parser.add_argument(
         'REGEXP_FILE',
-        help='file with regular expressions')
+        help='python file with regular expressions')
     main_parser.add_argument(
         'EXTID_FILE',
         help='file with extension ids')
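
A quick sanity check of the is_likely_hash heuristic used above: a string is
flagged only if its Shannon entropy over [A-Za-z0-9] exceeds 2.0 and it
contains more than four digits, which keeps ordinary identifiers out of the
MINING_KEYS_REGEX hits:

    print(get_shannon_entropy("aaaaaaaa"))     # 0.0 -- one repeated symbol
    print(is_likely_hash("aaaaaaaa"))          # False: zero entropy, no digits
    print(is_likely_hash("4f1e9b27c0d3a8e5"))  # True: high entropy, nine digits
    print(is_likely_hash("mineCryptonight"))   # False: fewer than five digits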
@@ -119,8 +189,8 @@ if __name__ == "__main__":
         metavar='DATE',
         type=str,
         help='select latest crx from tar, released before DATE.\n' +
-        'Together with --from-date, specifies all crx released in specified\n'
-        + 'date range.')
+        'Together with --from-date, specifies all crx released in specified\n' +
+        'date range.')
     main_parser.add_argument(
         '-d',
@@ -128,8 +198,8 @@ if __name__ == "__main__":
         metavar='DATE',
         type=str,
         help='select oldest crx from tar released after DATE.\n' +
-        'Together with --latest-date, specifies all crx released in specified\n'
-        + 'date range.')
+        'Together with --latest-date, specifies all crx released in specified\n' +
+        'date range.')
     main_parser.add_argument(
         '-a',
@@ -139,47 +209,12 @@ if __name__ == "__main__":
         default=const_basedir(),
         help='archive directory')

-    comment_group = main_parser.add_argument_group('comment blocks')
-    comment_group.add_argument(
-        '-g',
-        '--group-single-line-comments',
-        help='Group consecutive singe-line comments into blocks')
-    comment_group.add_argument(
-        '-c',
-        '--reg-exp-comments',
-        metavar='REGEXP',
-        type=str,
-        nargs='+',
-        help='search comments for regular expression')
-
-    source_group = main_parser.add_argument_group('source blocks')
-    source_group.add_argument(
-        '-b',
-        '--beautify',
-        action='store_true',
-        default=False,
-        help='beautify source code')
-    source_group.add_argument(
-        '-s',
-        '--reg-exp-source',
-        metavar='REGEXP',
-        type=str,
-        nargs='+',
-        help='search source for regular expression')
-
-    strings_group = main_parser.add_argument_group('string literals')
-    strings_group.add_argument(
-        '-j',
-        '--join-string-literals',
-        action='store_true',
-        help='join string literals (heuristic)')
-    strings_group.add_argument(
-        '-l',
-        '--reg-exp-string-literals',
-        metavar='REGEXP',
-        type=str,
-        nargs='+',
-        help='search string literals for regular expression')
+    return main_parser
+
+
+if __name__ == "__main__":
+    main_parser = build_parser()
     main_conf = main_parser.parse_args()
     sys.exit(main(main_conf))
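
With these changes extgrep runs as a batch matcher: given a regexp file and a
list of extension ids, it writes one CSV row per match to stdout (columns:
extid, still_in_store, most_recent_crx_etag, date, crx_etag, name, version,
path, tag, match). A hypothetical invocation, with illustrative paths and
date values:

    ./extgrep miner_strings.py extension_ids.txt -a /srv/archive \
        --from-date 2018-01-01 --latest-date 2018-12-31 > matches.csv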