#!/usr/bin/env python3.7
#
# Copyright (C) 2019 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
# Provenance: introduced as PermissionAnalysis/grep-unused-permissions by
# commit 3796ca2a3fd391a35de18081f594ce68cbc7d98c
# ("Added grep-unused-permissions.", Michael Herzberg, 2019-05-15).
"""Search archived extensions for unused permissions.

For every extension id listed in EXTID_FILE, each archived crx is unpacked
and compared against MAP_FILE, a JSON object mapping a permission name to
literal strings ("evidences") whose presence in the extension's ``.js`` /
``.html`` files counts as use of that permission.  One CSV row per archived
crx is written to stdout, stating for every permission in the map whether
it was requested in ``manifest.json`` and whether evidence was found.
"""

import argparse
import csv
import io
import json
import logging
import re
import sys
from zipfile import ZipFile

from jsmin import jsmin

from ExtensionCrawler.archive import iter_tar_entries_by_date
from ExtensionCrawler.config import (const_log_format, const_basedir)
from ExtensionCrawler.js_mincer import mince_js

# NOTE(review): the two patterns below were missing (empty strings) in the
# copy of this script that was reviewed, which made ``match.group(1)`` raise
# IndexError.  They are reconstructed as the store-overview <meta> tags --
# confirm against the upstream repository before relying on them.
_NAME_RE = re.compile(r'<meta itemprop="name" content="(.*?)"\s*/>')
_VERSION_RE = re.compile(r'<meta itemprop="version" content="(.*?)"\s*/>')

# Fixed columns that precede the per-permission columns in every CSV row.
_CSV_FIXED_COLUMNS = [
    "extid",
    "still_in_store",
    "most_recent_crx_etag",
    "date",
    "crx_etag",
    "name",
    "version",
]

# Cell value, keyed by (requested in manifest, evidence found in code).
_STATUS = {
    (True, True): "REQ_AND_FOUND",
    (True, False): "REQ_AND_NOT_FOUND",
    (False, True): "NOT_REQ_AND_FOUND",
    (False, False): "NOT_REQ_AND_NOT_FOUND",
}


def get_etag(headers_content):
    """Return the ``ETag`` entry of a stored headers file, or None.

    The content is turned into JSON by escaping double quotes and then
    replacing single quotes with double quotes before parsing, i.e. it is
    expected to be a single-quoted mapping literal.

    Raises:
        json.JSONDecodeError: if the converted text is not valid JSON.
    """
    as_json = headers_content.replace('"', '\\"').replace("'", '"')
    headers = json.loads(as_json)
    if "ETag" in headers:
        return headers["ETag"]
    return None


def get_name_and_version(overview_contents):
    """Extract ``(name, version)`` from the text of an overview.html.

    Either element is None when its pattern does not match.
    """
    name_match = _NAME_RE.search(overview_contents)
    version_match = _VERSION_RE.search(overview_contents)
    name = name_match.group(1) if name_match else None
    version = version_match.group(1) if version_match else None
    return name, version


def _minced_lines(zf, zipentry):
    """Return the lines of all blocks mince_js yields for one zip member."""
    lines = []
    with zf.open(zipentry) as f:
        # surrogateescape keeps undecodable bytes instead of raising.
        text = io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")
        for block in mince_js(text):
            lines += block.content.splitlines()
    return lines


def _manifest_permissions(zf, zipentry):
    """Return the set of permissions (as str) listed in a manifest.json."""
    with zf.open(zipentry) as m:
        raw_content = m.read()
    # There are some manifests that seem to have weird encodings...
    try:
        content = raw_content.decode("utf-8-sig")
    except UnicodeDecodeError:
        # latin1 maps every byte, so this fallback cannot fail.
        content = raw_content.decode("latin1")

    # jsmin strips comments, which some manifests contain.
    manifest = json.loads(jsmin(content), strict=False)
    if not isinstance(manifest, dict):
        logging.warning("manifest.json is not a JSON object; ignoring it")
        return set()
    declared = manifest.get("permissions", [])
    if not isinstance(declared, list):
        # Iterating e.g. a string would yield one "permission" per character.
        logging.warning("manifest 'permissions' is not a list; ignoring it")
        return set()
    return {str(permission) for permission in declared}


def _scan_crx(crx_file, permission_map, found, requested):
    """Update ``found`` and ``requested`` in place from one crx archive.

    ``found`` maps permission -> bool and is set to True once any evidence
    string occurs in a line of a ``.js``/``.html`` member; ``requested``
    receives the permissions of a top-level ``manifest.json``.
    """
    with ZipFile(crx_file) as zf:
        for zipentry in zf.infolist():
            if zipentry.filename.endswith((".js", ".html")):
                lines = _minced_lines(zf, zipentry)
                for permission, evidences in permission_map.items():
                    if found[permission]:
                        continue  # already proven by an earlier file
                    if any(evidence in line
                           for evidence in evidences
                           for line in lines):
                        found[permission] = True

            if zipentry.filename == "manifest.json":
                requested.update(_manifest_permissions(zf, zipentry))


def handle_extid(conf, extid, permission_map, csvwriter):
    """Write one CSV row per archived crx of ``extid`` to ``csvwriter``.

    Args:
        conf: parsed arguments; ``archive_dir``, ``from_date`` and
            ``latest_date`` are read.
        extid: extension id to look up in the archive.
        permission_map: mapping permission -> iterable of evidence strings.
        csvwriter: object with a ``writerow`` method.

    Rows are written only after all dates were processed, because every row
    carries the last ``still_in_store`` status and the last non-empty crx
    ETag seen for the extension.
    """
    permissions = sorted(permission_map)
    rows = []
    still_in_store = None
    most_recent_crx_etag = None

    for date, tups in iter_tar_entries_by_date(conf.archive_dir, extid):
        # NOTE(review): the bounds are plain strings from the command line;
        # presumably the archive dates are comparable strings -- confirm.
        if conf.from_date and not (conf.from_date <= date):
            continue
        if conf.latest_date and not (date <= conf.latest_date):
            continue

        crx_etag = None
        name = None
        version = None
        has_crx_file = False
        found = dict.fromkeys(permission_map, False)
        requested = set()

        for tarentry, tarfile in tups:
            filename = tarentry.name.split("/")[-1]

            if filename.endswith(".crx.headers"):
                crx_etag = get_etag(tarfile.read().decode())
                if crx_etag:
                    most_recent_crx_etag = crx_etag
            elif filename == "overview.html":
                name, version = get_name_and_version(tarfile.read().decode())
            elif filename == "overview.html.status":
                # A status starting with "2" is treated as still available.
                still_in_store = tarfile.read().decode().startswith("2")
            elif filename.endswith(".crx") and tarentry.size > 0:
                has_crx_file = True
                _scan_crx(tarfile, permission_map, found, requested)

        if has_crx_file:
            statuses = [_STATUS[(permission in requested, found[permission])]
                        for permission in permissions]
            rows.append([date, crx_etag, name, version] + statuses)

    for row in rows:
        prefix = [extid, still_in_store, most_recent_crx_etag]
        csvwriter.writerow([str(x) for x in prefix + row])


def main(conf):
    """Configure logging, then write the CSV report to stdout.

    Reads the permission map from ``conf.MAP_FILE`` and one extension id per
    line from ``conf.EXTID_FILE``; blank lines are skipped.
    """
    logger = logging.getLogger()
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG if conf.verbose else logging.WARNING)

    with open(conf.MAP_FILE) as f:
        permission_map = json.load(f)

    with open(conf.EXTID_FILE) as f:
        csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
        csvwriter.writerow(_CSV_FIXED_COLUMNS + sorted(permission_map))
        for raw_line in f:
            extid = raw_line.strip()
            if not extid:
                continue  # an empty id cannot name an archive
            handle_extid(conf, extid, permission_map, csvwriter)


def build_parser():
    """Return the argparse parser for the command line interface."""
    main_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Search extensions for unused permissions')
    main_parser.add_argument(
        'MAP_FILE',
        help='json file with permission - literal string mapping')
    main_parser.add_argument(
        'EXTID_FILE',
        help='file with extension ids')
    main_parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        default=False,
        help='increase verbosity')

    main_parser.add_argument(
        '-D',
        '--latest-date',
        metavar='DATE',
        type=str,
        help='select latest crx from tar, released before DATE.\n'
        'Together with --from-date, specifies all crx released in specified\n'
        'date range.')

    main_parser.add_argument(
        '-d',
        '--from-date',
        metavar='DATE',
        type=str,
        help='select oldest crx from tar released after DATE.\n'
        'Together with --latest-date, specifies all crx released in specified\n'
        'date range.')

    main_parser.add_argument(
        '-a',
        '--archive-dir',
        metavar='archive',
        type=str,
        default=const_basedir(),
        help='archive directory')

    return main_parser


if __name__ == "__main__":
    main_parser = build_parser()

    main_conf = main_parser.parse_args()

    sys.exit(main(main_conf))