Merge branch 'master' of logicalhacking.com:BrowserSecurity/ExtensionCrawler
This commit is contained in:
commit
4ffc51e6b9
|
@ -36,6 +36,7 @@ import datetime
|
||||||
import dateutil
|
import dateutil
|
||||||
import dateutil.parser
|
import dateutil.parser
|
||||||
import requests
|
import requests
|
||||||
|
from itertools import groupby
|
||||||
|
|
||||||
from ExtensionCrawler.config import (
|
from ExtensionCrawler.config import (
|
||||||
const_review_payload, const_review_search_url, const_download_url,
|
const_review_payload, const_review_search_url, const_download_url,
|
||||||
|
@ -638,7 +639,8 @@ def iter_tar_entries_from_file_ext(archivedir, extid, ext):
|
||||||
tar = os.path.join(archivedir, get_local_archive_dir(extid), extid + ext)
|
tar = os.path.join(archivedir, get_local_archive_dir(extid), extid + ext)
|
||||||
with tarfile.open(tar, 'r') as tf:
|
with tarfile.open(tar, 'r') as tf:
|
||||||
for tarentry in tf:
|
for tarentry in tf:
|
||||||
yield (tarentry, tf.extractfile(tarentry))
|
if tarentry.isfile():
|
||||||
|
yield (tarentry, tf.extractfile(tarentry))
|
||||||
|
|
||||||
def iter_tar_entries(archivedir, extid):
|
def iter_tar_entries(archivedir, extid):
|
||||||
for i in range(1000):
|
for i in range(1000):
|
||||||
|
@ -651,3 +653,6 @@ def iter_tar_entries(archivedir, extid):
|
||||||
ext = ".tar"
|
ext = ".tar"
|
||||||
for (tarentry, tarfile) in iter_tar_entries_from_file_ext(archivedir, extid, ext):
|
for (tarentry, tarfile) in iter_tar_entries_from_file_ext(archivedir, extid, ext):
|
||||||
yield (tarentry, tarfile)
|
yield (tarentry, tarfile)
|
||||||
|
|
||||||
|
def iter_tar_entries_by_date(archivedir, extid):
    """Group the tar entries of an extension by their second path component.

    Wraps iter_tar_entries() in itertools.groupby, keyed on the second
    "/"-separated component of each tar member name (presumably the crawl
    date directory -- verify against the archive layout).

    Yields (key, group) pairs, where each group is an iterator over the
    (tarentry, fileobj) tuples sharing that key. As with any groupby, only
    *consecutive* entries with the same key end up in the same group, and a
    group must be consumed before advancing to the next one.
    """
    def second_path_component(entry_and_file):
        return entry_and_file[0].name.split("/")[1]

    return groupby(iter_tar_entries(archivedir, extid), second_path_component)
|
||||||
|
|
223
extgrep
223
extgrep
|
@ -17,66 +17,135 @@
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
|
||||||
import datetime
|
|
||||||
import argparse
|
import argparse
|
||||||
import io
|
import io
|
||||||
import fnmatch
|
|
||||||
import os
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import json
|
||||||
import sys
|
import sys
|
||||||
import operator
|
import importlib.util
|
||||||
import tarfile
|
import csv
|
||||||
import zlib
|
import math
|
||||||
from functools import partial, reduce
|
|
||||||
from colorama import init, Fore
|
|
||||||
from multiprocessing import Pool
|
|
||||||
from zipfile import ZipFile
|
|
||||||
|
|
||||||
import dateutil
|
|
||||||
import dateutil.parser
|
|
||||||
import jsbeautifier
|
|
||||||
|
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
|
|
||||||
from ExtensionCrawler.config import (const_log_format, const_basedir)
|
from ExtensionCrawler.config import (const_log_format, const_basedir)
|
||||||
from ExtensionCrawler.archive import iter_tar_entries
|
from ExtensionCrawler.archive import iter_tar_entries_by_date
|
||||||
from ExtensionCrawler.config import get_local_archive_dir
|
|
||||||
from ExtensionCrawler.js_decomposer import init_file_info
|
|
||||||
from ExtensionCrawler.js_mincer import mince_js
|
from ExtensionCrawler.js_mincer import mince_js
|
||||||
|
|
||||||
|
|
||||||
def is_source_file(zipentry):
|
def get_shannon_entropy(string):
|
||||||
"""Test if filename indicates file with C-style comment."""
|
"""
|
||||||
return (zipentry.filename.endswith(".js") or zipentry.filename.endswith(".js.gz")
|
This code has been borrowed from
|
||||||
or zipentry.filename.endswith(".jgz") or zipentry.filename.endswith(".jsg")
|
"http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html" and
|
||||||
or zipentry.filename.endswith(".css.gz") or zipentry.filename.endswith(".c")
|
"git@github.com:dxa4481/truffleHog.git"
|
||||||
or zipentry.filename.endswith(".cpp") or zipentry.filename.endswith(".java"))
|
"""
|
||||||
|
chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
|
||||||
def handle_extid(conf, extid):
|
if not string:
|
||||||
for tarentry, tarfile in iter_tar_entries(conf.archive_dir, extid):
|
return 0
|
||||||
if tarentry.name.endswith(".crx"):
|
entropy = 0
|
||||||
with ZipFile(tarfile) as zf:
|
for x in chars:
|
||||||
for zipentry in zf.infolist():
|
p_x = float(string.count(x))/len(string)
|
||||||
if is_source_file(zipentry):
|
if p_x > 0:
|
||||||
with zf.open(zipentry) as f:
|
entropy += - p_x*math.log(p_x, 2)
|
||||||
for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
|
return entropy
|
||||||
merged_strings = "".join(map(lambda x: x[1], block.string_literals))
|
|
||||||
print(merged_strings)
|
|
||||||
|
|
||||||
# for pattern_group in regex_patterns:
|
|
||||||
# for pattern in regex_patterns[pattern_group]:
|
|
||||||
# if re.search(pattern, merged_strings):
|
|
||||||
# if pattern_group not in matches:
|
|
||||||
# matches[pattern_group] = []
|
|
||||||
# matches[pattern_group] += [match]
|
|
||||||
# matches.add(pattern_group)
|
|
||||||
# for pattern_group in string_patterns:
|
|
||||||
# for pattern in string_patterns[pattern_group]:
|
|
||||||
# if pattern in merged_strings:
|
|
||||||
# matches.add(pattern_group)
|
|
||||||
|
|
||||||
|
|
||||||
|
def is_likely_hash(string):
    """Heuristically decide whether *string* looks like a hash or key.

    A string qualifies when get_shannon_entropy() reports more than 2.0 for
    it and it contains more than four decimal digits.
    """
    # Cheap rejection first: low-entropy strings never qualify.
    if not (get_shannon_entropy(string) > 2.0):
        return False
    digit_count = sum(1 for char in string if char.isdigit())
    return digit_count > 4
|
||||||
|
|
||||||
|
|
||||||
|
def import_regexs(path):
    """Load the Python file at *path* as a module and return it.

    The module is created under the name "MinerStrings" and is not
    registered in sys.modules.

    NOTE: the file is executed as ordinary Python code, so only pass paths
    to trusted files.

    Raises:
        ImportError: if no loader can be determined for *path* (for
            example, the file name does not end in a Python source suffix).
    """
    spec = importlib.util.spec_from_file_location("MinerStrings", path)
    # spec_from_file_location() returns None when it cannot pick a loader
    # from the file suffix; fail with a clear message instead of an
    # AttributeError on `spec.loader`.
    if spec is None or spec.loader is None:
        raise ImportError(
            "cannot load {!r} as a Python module".format(path))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
|
||||||
|
|
||||||
|
|
||||||
|
def get_etag(headers_content):
    """Return the "ETag" entry from a textual headers dump, or None.

    *headers_content* is expected to be the text of a dict literal written
    with single-quoted strings (e.g. "{'ETag': '\"abc\"', ...}").

    The text is parsed with ast.literal_eval first, which copes with values
    that contain apostrophes or escaped quotes. If that fails, the text is
    converted to JSON by swapping the quote characters and parsed with
    json.loads, so JSON-only tokens such as `null` keep working.

    Returns None when there is no "ETag" key (the lookup is case-sensitive)
    or when the parsed value is not a dict.

    Raises:
        json.JSONDecodeError: if the text can be parsed by neither method.
    """
    import ast

    try:
        headers = ast.literal_eval(headers_content)
    except (ValueError, SyntaxError):
        # Fallback: escape existing double quotes, then turn the
        # single-quoted literals into JSON strings.
        headers = json.loads(
            headers_content.replace('"', '\\"').replace("'", '"'))

    if not isinstance(headers, dict):
        return None
    return headers.get("ETag")
|
||||||
|
|
||||||
|
|
||||||
|
def get_name_and_version(overview_contents):
    """Extract the extension name and version from overview page HTML.

    Looks for `<meta itemprop="name" content="..." />` and
    `<meta itemprop="version" content="..." />` tags and returns the
    `content` value of the first occurrence of each.

    Returns:
        A (name, version) tuple; either element is None when the
        corresponding tag is not found.
    """
    def meta_content(itemprop):
        # Raw strings keep `\s` a regex escape rather than an (invalid)
        # string escape sequence.
        match = re.search(
            r'<meta itemprop="' + itemprop + r'" content="(.*?)"\s*/>',
            overview_contents)
        return match.group(1) if match else None

    return meta_content("name"), meta_content("version")
|
||||||
|
|
||||||
|
|
||||||
|
def handle_extid(conf, extid, csvwriter):
    """Search all archived crawls of one extension and write matches as CSV.

    For every group returned by iter_tar_entries_by_date() whose key lies
    within the optional conf.from_date / conf.latest_date bounds, the tar
    entries are inspected by file name:

    * "*.crx.headers"         -> ETag of the crx download
    * "overview.html"         -> extension name and version
    * "overview.html.status"  -> still_in_store (status text starts with "2")
    * "*.crx"                 -> every ".js" member is searched for the
                                 strings and patterns of the MinerStrings
                                 object loaded from conf.REGEXP_FILE

    One row is written to *csvwriter* per match, with the columns
    extid, still_in_store, most recent crx ETag, date, crx ETag, name,
    version, path, tag, match. still_in_store and the most recent ETag
    reflect the last values seen over all processed groups. Nothing is
    returned.

    NOTE(review): this listing had lost its indentation; the nesting below
    (in particular the placement of `break` in the pattern search) is
    reconstructed and should be confirmed against the repository.
    """
    miner_strings = import_regexs(conf.REGEXP_FILE).MinerStrings()

    results = []
    still_in_store = None
    # The leading None keeps crx_etags[-1] valid when no ETag is ever seen.
    crx_etags = [None]

    for date, tups in iter_tar_entries_by_date(conf.archive_dir, extid):
        if conf.from_date and not (conf.from_date <= date):
            continue
        if conf.latest_date and not (date <= conf.latest_date):
            continue

        crx_etag = None
        name = None
        version = None
        matches = []
        # `fileobj` rather than `tarfile`, to avoid shadowing the stdlib
        # module of that name.
        for tarentry, fileobj in tups:
            tarentry_filename = tarentry.name.split("/")[-1]

            if tarentry_filename.endswith(".crx.headers"):
                crx_etag = get_etag(fileobj.read().decode())
                if crx_etag:
                    crx_etags.append(crx_etag)

            if tarentry_filename == "overview.html":
                name, version = get_name_and_version(fileobj.read().decode())

            if tarentry_filename == "overview.html.status":
                still_in_store = fileobj.read().decode().startswith("2")

            if tarentry_filename.endswith(".crx"):
                matches.extend(_scan_crx(fileobj, miner_strings))

        # Emit rows only after the whole group was read, so that the ETag,
        # name and version are known regardless of entry order.
        for match in matches:
            results.append([date, crx_etag, name, version] + match)

    for result in results:
        row = [extid, still_in_store, crx_etags[-1]] + result
        csvwriter.writerow([str(x) for x in row])


def _scan_crx(crx_file, miner_strings):
    """Return [path, tag, match] rows for all ".js" members of a crx file."""
    found = []
    with ZipFile(crx_file) as zf:
        for zipentry in zf.infolist():
            if not zipentry.filename.endswith(".js"):
                continue
            with zf.open(zipentry) as f:
                text = io.TextIOWrapper(
                    f, encoding="utf-8", errors="surrogateescape")
                for block in mince_js(text):
                    # Search both the block's content and its concatenated
                    # string literals.
                    file_lines = block.content.splitlines()
                    file_lines += "".join(
                        literal[1] for literal in block.string_literals
                    ).splitlines()
                    found.extend(
                        _search_lines(zipentry.filename, file_lines,
                                      miner_strings))
    return found


def _search_lines(filename, file_lines, miner_strings):
    """Return [filename, tag, match] rows for one list of lines."""
    found = []

    # Plain substring search: at most one row per search string.
    for search_tag, search_strings in miner_strings.strings.items():
        for search_string in search_strings:
            if any(search_string in line for line in file_lines):
                found.append([filename, search_tag, search_string])

    # Regex search: at most one row per pattern, taken from the first line
    # with an accepted match.
    for search_tag, search_patterns in miner_strings.patterns.items():
        for search_pattern in search_patterns:
            for line in file_lines:
                m = re.search(search_pattern, line)
                if not m:
                    continue
                matched_string = m.group()
                # Compare by value (`!=`), not identity: `is not` against a
                # string literal only works by accident of interning.
                if (search_tag != "MINING_KEYS_REGEX"
                        or is_likely_hash(matched_string)):
                    found.append([filename, search_tag, matched_string])
                    break

    return found
|
||||||
|
|
||||||
|
|
||||||
def main(conf):
|
def main(conf):
|
||||||
|
@ -90,18 +159,19 @@ def main(conf):
|
||||||
logger.setLevel(logging.WARNING)
|
logger.setLevel(logging.WARNING)
|
||||||
|
|
||||||
with open(conf.EXTID_FILE) as f:
|
with open(conf.EXTID_FILE) as f:
|
||||||
for extid in f.readlines():
|
csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
|
||||||
handle_extid(conf, extid)
|
csvwriter.writerow(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version", "path", "tag", "match"])
|
||||||
|
for extid in [l.strip() for l in f.readlines()]:
|
||||||
|
handle_extid(conf, extid, csvwriter)
|
||||||
|
|
||||||
|
|
||||||
|
def build_parser():
|
||||||
if __name__ == "__main__":
|
|
||||||
main_parser = argparse.ArgumentParser(
|
main_parser = argparse.ArgumentParser(
|
||||||
formatter_class=argparse.RawTextHelpFormatter,
|
formatter_class=argparse.RawTextHelpFormatter,
|
||||||
description='Grep for extensions.')
|
description='Grep for extensions.')
|
||||||
main_parser.add_argument(
|
main_parser.add_argument(
|
||||||
'REGEXP_FILE',
|
'REGEXP_FILE',
|
||||||
help='file with regular expressions')
|
help='python file with regular expressions')
|
||||||
main_parser.add_argument(
|
main_parser.add_argument(
|
||||||
'EXTID_FILE',
|
'EXTID_FILE',
|
||||||
help='file with extension ids')
|
help='file with extension ids')
|
||||||
|
@ -119,8 +189,8 @@ if __name__ == "__main__":
|
||||||
metavar='DATE',
|
metavar='DATE',
|
||||||
type=str,
|
type=str,
|
||||||
help='select latest crx from tar, released before DATE.\n' +
|
help='select latest crx from tar, released before DATE.\n' +
|
||||||
'Together with --from-date, specifies all crx released in specified\n'
|
'Together with --from-date, specifies all crx released in specified\n' +
|
||||||
+ 'date range.')
|
'date range.')
|
||||||
|
|
||||||
main_parser.add_argument(
|
main_parser.add_argument(
|
||||||
'-d',
|
'-d',
|
||||||
|
@ -128,8 +198,8 @@ if __name__ == "__main__":
|
||||||
metavar='DATE',
|
metavar='DATE',
|
||||||
type=str,
|
type=str,
|
||||||
help='select oldest crx from tar released after DATE.\n' +
|
help='select oldest crx from tar released after DATE.\n' +
|
||||||
'Together with --latest-date, specifies all crx released in specified\n'
|
'Together with --latest-date, specifies all crx released in specified\n' +
|
||||||
+ 'date range.')
|
'date range.')
|
||||||
|
|
||||||
main_parser.add_argument(
|
main_parser.add_argument(
|
||||||
'-a',
|
'-a',
|
||||||
|
@ -139,47 +209,12 @@ if __name__ == "__main__":
|
||||||
default=const_basedir(),
|
default=const_basedir(),
|
||||||
help='archive directory')
|
help='archive directory')
|
||||||
|
|
||||||
comment_group = main_parser.add_argument_group('comment blocks')
|
return main_parser
|
||||||
comment_group.add_argument(
|
|
||||||
'-g',
|
|
||||||
'--group-single-line-comments',
|
|
||||||
help='Group consecutive singe-line comments into blocks')
|
|
||||||
comment_group.add_argument(
|
|
||||||
'-c',
|
|
||||||
'--reg-exp-comments',
|
|
||||||
metavar='REGEXP',
|
|
||||||
type=str,
|
|
||||||
nargs='+',
|
|
||||||
help='search comments for regular expression')
|
|
||||||
|
|
||||||
source_group = main_parser.add_argument_group('source blocks')
|
|
||||||
source_group.add_argument(
|
|
||||||
'-b',
|
|
||||||
'--beautify',
|
|
||||||
action='store_true',
|
|
||||||
default=False,
|
|
||||||
help='beautify source code')
|
|
||||||
source_group.add_argument(
|
|
||||||
'-s',
|
|
||||||
'--reg-exp-source',
|
|
||||||
metavar='REGEXP',
|
|
||||||
type=str,
|
|
||||||
nargs='+',
|
|
||||||
help='search source for regular expression')
|
|
||||||
|
|
||||||
strings_group = main_parser.add_argument_group('string literals')
|
if __name__ == "__main__":
|
||||||
strings_group.add_argument(
|
main_parser = build_parser()
|
||||||
'-j',
|
|
||||||
'--join-string-literals',
|
|
||||||
action='store_true',
|
|
||||||
help='join string literals (heuristic)')
|
|
||||||
strings_group.add_argument(
|
|
||||||
'-l',
|
|
||||||
'--reg-exp-string-literals',
|
|
||||||
metavar='REGEXP',
|
|
||||||
type=str,
|
|
||||||
nargs='+',
|
|
||||||
help='search string literals for regular expression')
|
|
||||||
main_conf = main_parser.parse_args()
|
main_conf = main_parser.parse_args()
|
||||||
|
|
||||||
sys.exit(main(main_conf))
|
sys.exit(main(main_conf))
|
||||||
|
|
Loading…
Reference in New Issue