Merge branch 'master' of logicalhacking.com:BrowserSecurity/ExtensionCrawler

Achim D. Brucker 2019-02-27 19:37:13 +00:00
commit 4ffc51e6b9
2 changed files with 135 additions and 95 deletions

ExtensionCrawler/archive.py

@@ -36,6 +36,7 @@ import datetime
import dateutil
import dateutil.parser
import requests
from itertools import groupby
from ExtensionCrawler.config import (
    const_review_payload, const_review_search_url, const_download_url,
@@ -638,7 +639,8 @@ def iter_tar_entries_from_file_ext(archivedir, extid, ext):
    tar = os.path.join(archivedir, get_local_archive_dir(extid), extid + ext)
    with tarfile.open(tar, 'r') as tf:
        for tarentry in tf:
            yield (tarentry, tf.extractfile(tarentry))
            if tarentry.isfile():
                yield (tarentry, tf.extractfile(tarentry))

def iter_tar_entries(archivedir, extid):
    for i in range(1000):
@@ -651,3 +653,6 @@ def iter_tar_entries(archivedir, extid):
        ext = ".tar"
        for (tarentry, tarfile) in iter_tar_entries_from_file_ext(archivedir, extid, ext):
            yield (tarentry, tarfile)

def iter_tar_entries_by_date(archivedir, extid):
    return groupby(iter_tar_entries(archivedir, extid), lambda tup: tup[0].name.split("/")[1])
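
The new iter_tar_entries_by_date groups the per-extension tar members by the date component of their archive path. A minimal usage sketch; the archive directory and extension id below are placeholders, not values from this commit:

from ExtensionCrawler.archive import iter_tar_entries_by_date

archivedir = "/srv/archive"  # hypothetical archive location
extid = "a" * 32             # hypothetical extension id

# Member names have the form "<extid>/<date>/<filename>", so
# tup[0].name.split("/")[1] is the snapshot date; groupby yields one
# (date, entries) pair per consecutive run of members with that date.
for date, entries in iter_tar_entries_by_date(archivedir, extid):
    for tarentry, fileobj in entries:
        print(date, tarentry.name)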

extgrep

@@ -17,66 +17,135 @@
#
# SPDX-License-Identifier: GPL-3.0-or-later
import datetime
import argparse
import io
import fnmatch
import os
import logging
import re
import json
import sys
import operator
import tarfile
import zlib
from functools import partial, reduce
from colorama import init, Fore
from multiprocessing import Pool
from zipfile import ZipFile
import dateutil
import dateutil.parser
import jsbeautifier
import importlib.util
import csv
import math
from zipfile import ZipFile
from ExtensionCrawler.config import (const_log_format, const_basedir)
from ExtensionCrawler.archive import iter_tar_entries
from ExtensionCrawler.config import get_local_archive_dir
from ExtensionCrawler.js_decomposer import init_file_info
from ExtensionCrawler.archive import iter_tar_entries_by_date
from ExtensionCrawler.js_mincer import mince_js
def is_source_file(zipentry):
    """Test if the filename indicates a file with C-style comments."""
    return (zipentry.filename.endswith(".js") or zipentry.filename.endswith(".js.gz")
            or zipentry.filename.endswith(".jgz") or zipentry.filename.endswith(".jsg")
            or zipentry.filename.endswith(".css.gz") or zipentry.filename.endswith(".c")
            or zipentry.filename.endswith(".cpp") or zipentry.filename.endswith(".java"))

def handle_extid(conf, extid):
    for tarentry, tarfile in iter_tar_entries(conf.archive_dir, extid):
        if tarentry.name.endswith(".crx"):
            with ZipFile(tarfile) as zf:
                for zipentry in zf.infolist():
                    if is_source_file(zipentry):
                        with zf.open(zipentry) as f:
                            for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
                                merged_strings = "".join(map(lambda x: x[1], block.string_literals))
                                print(merged_strings)
                                # for pattern_group in regex_patterns:
                                #     for pattern in regex_patterns[pattern_group]:
                                #         if re.search(pattern, merged_strings):
                                #             if pattern_group not in matches:
                                #                 matches[pattern_group] = []
                                #             matches[pattern_group] += [match]
                                #             matches.add(pattern_group)
                                # for pattern_group in string_patterns:
                                #     for pattern in string_patterns[pattern_group]:
                                #         if pattern in merged_strings:
                                #             matches.add(pattern_group)

def get_shannon_entropy(string):
    """
    This code has been borrowed from
    "http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html" and
    "git@github.com:dxa4481/truffleHog.git"
    """
    chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    if not string:
        return 0
    entropy = 0
    for x in chars:
        p_x = float(string.count(x)) / len(string)
        if p_x > 0:
            entropy += -p_x * math.log(p_x, 2)
    return entropy

def is_likely_hash(string):
    return get_shannon_entropy(string) > 2.0 and len([c for c in string if c.isdigit()]) > 4
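
For reference, a few values computed by hand from the two definitions above:

get_shannon_entropy("aaaa")        # 0.0: a single repeated symbol carries no information
get_shannon_entropy("abcd1234")    # 3.0: eight equiprobable symbols = 3 bits per symbol
is_likely_hash("abcd1234")         # False: entropy is high, but only four digits (more than four required)
is_likely_hash("4ffc51e6b9")       # True: five digits and entropy of about 3.1 bits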
def import_regexs(path):
    spec = importlib.util.spec_from_file_location("MinerStrings", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
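
handle_extid below expects REGEXP_FILE to be a Python module defining a MinerStrings class with `strings` and `patterns` dict attributes. The pattern file itself is not part of this commit, so the tags and patterns in this minimal sketch are made up for illustration; only the attribute names are dictated by extgrep:

# hypothetical miner_strings.py, passed as REGEXP_FILE
class MinerStrings:
    def __init__(self):
        # tag -> list of literal substrings to search for
        self.strings = {
            "COINHIVE": ["coinhive.min.js"],
        }
        # tag -> list of regular expressions; matches for this tag are
        # additionally filtered through is_likely_hash() in handle_extid
        self.patterns = {
            "MINING_KEYS_REGEX": ["[a-zA-Z0-9]{32}"],
        }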
def get_etag(headers_content):
    # The .crx.headers file stores the string representation of a Python
    # dict; crudely rewrite it into JSON before parsing.
    headers_content = headers_content.replace(
        '"', '\\"').replace("'", '"')
    headers_json = json.loads(headers_content)
    if "ETag" in headers_json:
        return headers_json["ETag"]

def get_name_and_version(overview_contents):
    # Extract extension name
    match = re.search(r"""<meta itemprop="name" content="(.*?)"\s*/>""",
                      overview_contents)
    name = match.group(1) if match else None

    # Extract extension version
    match = re.search(
        r"""<meta itemprop="version" content="(.*?)"\s*/>""", overview_contents)
    version = match.group(1) if match else None

    return name, version
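
The meta tags these two regexes parse look like the following; with example values, the call behaves like:

# assuming an overview.html fragment with made-up values:
html = ('<meta itemprop="name" content="Some Extension" />'
        '<meta itemprop="version" content="1.2.3" />')
get_name_and_version(html)  # -> ("Some Extension", "1.2.3")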
def handle_extid(conf, extid, csvwriter):
    miner_strings = import_regexs(conf.REGEXP_FILE).MinerStrings()

    results = []

    still_in_store = None
    crx_etags = [None]
    for date, tups in iter_tar_entries_by_date(conf.archive_dir, extid):
        if conf.from_date and not (conf.from_date <= date):
            continue
        if conf.latest_date and not (date <= conf.latest_date):
            continue

        crx_etag = None
        name = None
        version = None
        matches = []
        for tarentry, tarfile in tups:
            tarentry_filename = tarentry.name.split("/")[-1]

            if tarentry_filename.endswith(".crx.headers"):
                crx_etag = get_etag(tarfile.read().decode())
                if crx_etag:
                    crx_etags += [crx_etag]

            if tarentry_filename == "overview.html":
                name, version = get_name_and_version(tarfile.read().decode())

            if tarentry_filename == "overview.html.status":
                still_in_store = tarfile.read().decode().startswith("2")

            if tarentry_filename.endswith(".crx"):
                with ZipFile(tarfile) as zf:
                    for zipentry in zf.infolist():
                        if zipentry.filename.endswith(".js"):
                            with zf.open(zipentry) as f:
                                for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
                                    file_lines = []
                                    file_lines += block.content.splitlines()
                                    file_lines += "".join(map(lambda x: x[1], block.string_literals)).splitlines()

                                    for search_tag in miner_strings.strings.keys():
                                        for search_string in miner_strings.strings[search_tag]:
                                            for line in file_lines:
                                                if search_string in line:
                                                    matches += [[zipentry.filename, search_tag, search_string]]
                                                    break

                                    for search_tag in miner_strings.patterns.keys():
                                        for search_pattern in miner_strings.patterns[search_tag]:
                                            for line in file_lines:
                                                m = re.search(search_pattern, line)
                                                if m:
                                                    matched_string = m.group()
                                                    # for the key regex, require the match to actually look like a key/hash
                                                    if search_tag != "MINING_KEYS_REGEX" or is_likely_hash(matched_string):
                                                        matches += [[zipentry.filename, search_tag, matched_string]]
                                                        break

        for match in matches:
            results += [[date, crx_etag, name, version] + match]

    for result in results:
        csvwriter.writerow([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)])
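
Each match thus becomes one CSV row in the column order of the header written in main(); schematically, with all values made up:

# extid, still_in_store, most_recent_crx_etag, date, crx_etag, name, version, path, tag, match
# "a"*32, True, "xyz", 2018-01-01, "xyz", Some Extension, 1.2.3, js/bg.js, COINHIVE, coinhive.min.js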
def main(conf):
@@ -90,18 +159,19 @@ def main(conf):
    logger.setLevel(logging.WARNING)

    with open(conf.EXTID_FILE) as f:
        for extid in f.readlines():
            handle_extid(conf, extid)
        csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
        csvwriter.writerow(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version", "path", "tag", "match"])
        for extid in [l.strip() for l in f.readlines()]:
            handle_extid(conf, extid, csvwriter)

if __name__ == "__main__":
def build_parser():
    main_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Grep for extensions.')
    main_parser.add_argument(
        'REGEXP_FILE',
        help='file with regular expressions')
        help='Python file with regular expressions')
    main_parser.add_argument(
        'EXTID_FILE',
        help='file with extension ids')
@@ -119,8 +189,8 @@ if __name__ == "__main__":
        metavar='DATE',
        type=str,
        help='select latest crx from tar, released before DATE.\n' +
        'Together with --from-date, specifies all crx released in specified\n'
        + 'date range.')
        'Together with --from-date, specifies all crx released in specified\n' +
        'date range.')
    main_parser.add_argument(
        '-d',
@@ -128,8 +198,8 @@ if __name__ == "__main__":
        metavar='DATE',
        type=str,
        help='select oldest crx from tar released after DATE.\n' +
        'Together with --latest-date, specifies all crx released in specified\n'
        + 'date range.')
        'Together with --latest-date, specifies all crx released in specified\n' +
        'date range.')
    main_parser.add_argument(
        '-a',
@@ -139,47 +209,12 @@ if __name__ == "__main__":
        default=const_basedir(),
        help='archive directory')

    comment_group = main_parser.add_argument_group('comment blocks')
    comment_group.add_argument(
        '-g',
        '--group-single-line-comments',
        help='Group consecutive single-line comments into blocks')
    comment_group.add_argument(
        '-c',
        '--reg-exp-comments',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search comments for regular expression')

    return main_parser

    source_group = main_parser.add_argument_group('source blocks')
    source_group.add_argument(
        '-b',
        '--beautify',
        action='store_true',
        default=False,
        help='beautify source code')
    source_group.add_argument(
        '-s',
        '--reg-exp-source',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search source for regular expression')

    strings_group = main_parser.add_argument_group('string literals')
    strings_group.add_argument(
        '-j',
        '--join-string-literals',
        action='store_true',
        help='join string literals (heuristic)')
    strings_group.add_argument(
        '-l',
        '--reg-exp-string-literals',
        metavar='REGEXP',
        type=str,
        nargs='+',
        help='search string literals for regular expression')

if __name__ == "__main__":
    main_parser = build_parser()
    main_conf = main_parser.parse_args()
    sys.exit(main(main_conf))