Merge branch 'master' of logicalhacking.com:BrowserSecurity/ExtensionCrawler

Achim D. Brucker 2019-02-27 19:37:13 +00:00
commit 4ffc51e6b9
2 changed files with 135 additions and 95 deletions

ExtensionCrawler/archive.py

@@ -36,6 +36,7 @@ import datetime
 import dateutil
 import dateutil.parser
 import requests
+from itertools import groupby
 from ExtensionCrawler.config import (
     const_review_payload, const_review_search_url, const_download_url,
@@ -638,7 +639,8 @@ def iter_tar_entries_from_file_ext(archivedir, extid, ext):
     tar = os.path.join(archivedir, get_local_archive_dir(extid), extid + ext)
     with tarfile.open(tar, 'r') as tf:
         for tarentry in tf:
-            yield (tarentry, tf.extractfile(tarentry))
+            if tarentry.isfile():
+                yield (tarentry, tf.extractfile(tarentry))


 def iter_tar_entries(archivedir, extid):
     for i in range(1000):
@@ -651,3 +653,6 @@ def iter_tar_entries(archivedir, extid):
         ext = ".tar"
     for (tarentry, tarfile) in iter_tar_entries_from_file_ext(archivedir, extid, ext):
         yield (tarentry, tarfile)
+
+def iter_tar_entries_by_date(archivedir, extid):
+    return groupby(iter_tar_entries(archivedir, extid), lambda tup: tup[0].name.split("/")[1])

extgrep

@@ -17,66 +17,135 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later

+import datetime
 import argparse
 import io
+import fnmatch
+import os
 import logging
 import re
+import json
 import sys
-import operator
-import tarfile
-import zlib
-from functools import partial, reduce
-from colorama import init, Fore
-from multiprocessing import Pool
-from zipfile import ZipFile
-import dateutil
-import dateutil.parser
-import jsbeautifier
+import importlib.util
+import csv
+import math
 from zipfile import ZipFile
 from ExtensionCrawler.config import (const_log_format, const_basedir)
-from ExtensionCrawler.archive import iter_tar_entries
-from ExtensionCrawler.config import get_local_archive_dir
-from ExtensionCrawler.js_decomposer import init_file_info
+from ExtensionCrawler.archive import iter_tar_entries_by_date
 from ExtensionCrawler.js_mincer import mince_js

-def is_source_file(zipentry):
-    """Test if filename indicates file with C-style comment."""
-    return (zipentry.filename.endswith(".js") or zipentry.filename.endswith(".js.gz")
-            or zipentry.filename.endswith(".jgz") or zipentry.filename.endswith(".jsg")
-            or zipentry.filename.endswith(".css.gz") or zipentry.filename.endswith(".c")
-            or zipentry.filename.endswith(".cpp") or zipentry.filename.endswith(".java"))
-
-
-def handle_extid(conf, extid):
-    for tarentry, tarfile in iter_tar_entries(conf.archive_dir, extid):
-        if tarentry.name.endswith(".crx"):
-            with ZipFile(tarfile) as zf:
-                for zipentry in zf.infolist():
-                    if is_source_file(zipentry):
-                        with zf.open(zipentry) as f:
-                            for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
-                                merged_strings = "".join(map(lambda x: x[1], block.string_literals))
-                                print(merged_strings)
-                                # for pattern_group in regex_patterns:
-                                #     for pattern in regex_patterns[pattern_group]:
-                                #         if re.search(pattern, merged_strings):
-                                #             if pattern_group not in matches:
-                                #                 matches[pattern_group] = []
-                                #             matches[pattern_group] += [match]
-                                #             matches.add(pattern_group)
-                                # for pattern_group in string_patterns:
-                                #     for pattern in string_patterns[pattern_group]:
-                                #         if pattern in merged_strings:
-                                #             matches.add(pattern_group)
+
+def get_shannon_entropy(string):
+    """
+    This code has been borrowed from
+    "http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html" and
+    "git@github.com:dxa4481/truffleHog.git"
+    """
+    chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    if not string:
+        return 0
+    entropy = 0
+    for x in chars:
+        p_x = float(string.count(x)) / len(string)
+        if p_x > 0:
+            entropy += -p_x * math.log(p_x, 2)
+    return entropy
+
+
+def is_likely_hash(string):
+    return get_shannon_entropy(string) > 2.0 and len([c for c in string if c.isdigit()]) > 4
+
+
+def import_regexs(path):
+    spec = importlib.util.spec_from_file_location("MinerStrings", path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def get_etag(headers_content):
+    headers_content = headers_content.replace(
+        '"', '\\"').replace("'", '"')
+    headers_json = json.loads(headers_content)
+    if "ETag" in headers_json:
+        return headers_json["ETag"]
+
+
+def get_name_and_version(overview_contents):
+    # Extract extension name
+    match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
+                      overview_contents)
+    name = match.group(1) if match else None
+
+    # Extract extension version
+    match = re.search(
+        """<meta itemprop="version" content="(.*?)"\s*/>""", overview_contents)
+    version = match.group(1) if match else None
+
+    return name, version
+
+
+def handle_extid(conf, extid, csvwriter):
+    miner_strings = import_regexs(conf.REGEXP_FILE).MinerStrings()
+
+    results = []
+    still_in_store = None
+    crx_etags = [None]
+    for date, tups in iter_tar_entries_by_date(conf.archive_dir, extid):
+        if conf.from_date and not (conf.from_date <= date):
+            continue
+        if conf.latest_date and not (date <= conf.latest_date):
+            continue
+
+        crx_etag = None
+        name = None
+        version = None
+        matches = []
+        for tarentry, tarfile in tups:
+            tarentry_filename = tarentry.name.split("/")[-1]
+
+            if tarentry_filename.endswith(".crx.headers"):
+                crx_etag = get_etag(tarfile.read().decode())
+                if crx_etag:
+                    crx_etags += [crx_etag]
+            if tarentry_filename == "overview.html":
+                name, version = get_name_and_version(tarfile.read().decode())
+            if tarentry_filename == "overview.html.status":
+                still_in_store = tarfile.read().decode().startswith("2")
+            if tarentry_filename.endswith(".crx"):
+                with ZipFile(tarfile) as zf:
+                    for zipentry in zf.infolist():
+                        if zipentry.filename.endswith(".js"):
+                            with zf.open(zipentry) as f:
+                                for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
+                                    file_lines = []
+                                    file_lines += block.content.splitlines()
+                                    file_lines += "".join(map(lambda x: x[1], block.string_literals)).splitlines()
+
+                                    for search_tag in miner_strings.strings.keys():
+                                        for search_string in miner_strings.strings[search_tag]:
+                                            for line in file_lines:
+                                                if search_string in line:
+                                                    matches += [[zipentry.filename, search_tag, search_string]]
+                                                    break
+                                    for search_tag in miner_strings.patterns.keys():
+                                        for search_pattern in miner_strings.patterns[search_tag]:
+                                            for line in file_lines:
+                                                m = re.search(search_pattern, line)
+                                                if m:
+                                                    matched_string = m.group()
+                                                    if search_tag != "MINING_KEYS_REGEX" or is_likely_hash(matched_string):
+                                                        matches += [[zipentry.filename, search_tag, matched_string]]
+                                                        break
+
+        for match in matches:
+            results += [[date, crx_etag, name, version] + match]
+
+    for result in results:
+        csvwriter.writerow([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)])


 def main(conf):
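
The new handle_extid loads its search terms from conf.REGEXP_FILE via
import_regexs and instantiates the MinerStrings class defined there, reading
its strings (tag -> plain substrings) and patterns (tag -> regular
expressions) dictionaries; matches for the MINING_KEYS_REGEX tag are
additionally filtered through is_likely_hash. A hypothetical REGEXP_FILE
(tags and values below are illustrative, not the project's actual rule set):

    # miner_strings.py -- example input for import_regexs()
    class MinerStrings:
        def __init__(self):
            # tag -> plain substrings, matched with "in"
            self.strings = {
                "COINHIVE": ["coinhive.min.js", "CoinHive.Anonymous"],
            }
            # tag -> regular expressions, matched with re.search()
            self.patterns = {
                "MINING_KEYS_REGEX": [r"[a-zA-Z0-9]{32}"],
            }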
@@ -90,18 +159,19 @@ def main(conf):
         logger.setLevel(logging.WARNING)

     with open(conf.EXTID_FILE) as f:
-        for extid in f.readlines():
-            handle_extid(conf, extid)
+        csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
+        csvwriter.writerow(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version", "path", "tag", "match"])
+        for extid in [l.strip() for l in f.readlines()]:
+            handle_extid(conf, extid, csvwriter)


-if __name__ == "__main__":
+def build_parser():
     main_parser = argparse.ArgumentParser(
         formatter_class=argparse.RawTextHelpFormatter,
         description='Grep for extensions.')
     main_parser.add_argument(
         'REGEXP_FILE',
-        help='file with regular expressions')
+        help='python file with regular expressions')
     main_parser.add_argument(
         'EXTID_FILE',
         help='file with extension ids')
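
A quick sanity check of the is_likely_hash heuristic used above: a string is
flagged only if its Shannon entropy over [A-Za-z0-9] exceeds 2.0 and it
contains more than four digits, which keeps ordinary identifiers out of the
MINING_KEYS_REGEX hits:

    print(get_shannon_entropy("aaaaaaaa"))     # 0.0 -- one repeated symbol
    print(is_likely_hash("aaaaaaaa"))          # False: zero entropy, no digits
    print(is_likely_hash("4f1e9b27c0d3a8e5"))  # True: high entropy, nine digits
    print(is_likely_hash("mineCryptonight"))   # False: fewer than five digits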
@@ -119,8 +189,8 @@ if __name__ == "__main__":
         metavar='DATE',
         type=str,
         help='select latest crx from tar, released before DATE.\n' +
-        'Together with --from-date, specifies all crx released in specified\n'
-        + 'date range.')
+        'Together with --from-date, specifies all crx released in specified\n' +
+        'date range.')
     main_parser.add_argument(
         '-d',
@@ -128,8 +198,8 @@ if __name__ == "__main__":
         metavar='DATE',
         type=str,
         help='select oldest crx from tar released after DATE.\n' +
-        'Together with --latest-date, specifies all crx released in specified\n'
-        + 'date range.')
+        'Together with --latest-date, specifies all crx released in specified\n' +
+        'date range.')
     main_parser.add_argument(
         '-a',
@@ -139,47 +209,12 @@ if __name__ == "__main__":
         default=const_basedir(),
         help='archive directory')

-    comment_group = main_parser.add_argument_group('comment blocks')
-    comment_group.add_argument(
-        '-g',
-        '--group-single-line-comments',
-        help='Group consecutive singe-line comments into blocks')
-    comment_group.add_argument(
-        '-c',
-        '--reg-exp-comments',
-        metavar='REGEXP',
-        type=str,
-        nargs='+',
-        help='search comments for regular expression')
-
-    source_group = main_parser.add_argument_group('source blocks')
-    source_group.add_argument(
-        '-b',
-        '--beautify',
-        action='store_true',
-        default=False,
-        help='beautify source code')
-    source_group.add_argument(
-        '-s',
-        '--reg-exp-source',
-        metavar='REGEXP',
-        type=str,
-        nargs='+',
-        help='search source for regular expression')
-
-    strings_group = main_parser.add_argument_group('string literals')
-    strings_group.add_argument(
-        '-j',
-        '--join-string-literals',
-        action='store_true',
-        help='join string literals (heuristic)')
-    strings_group.add_argument(
-        '-l',
-        '--reg-exp-string-literals',
-        metavar='REGEXP',
-        type=str,
-        nargs='+',
-        help='search string literals for regular expression')
+    return main_parser
+
+
+if __name__ == "__main__":
+    main_parser = build_parser()
     main_conf = main_parser.parse_args()
     sys.exit(main(main_conf))
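
With these changes extgrep runs as a batch matcher: given a regexp file and a
list of extension ids, it writes one CSV row per match to stdout (columns:
extid, still_in_store, most_recent_crx_etag, date, crx_etag, name, version,
path, tag, match). A hypothetical invocation, with illustrative paths and
date values:

    ./extgrep miner_strings.py extension_ids.txt -a /srv/archive \
        --from-date 2018-01-01 --latest-date 2018-12-31 > matches.csv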