Added grep-unused-permissions.
This commit is contained in:
parent
ce822d8e2d
commit
3796ca2a3f
|
@ -0,0 +1,216 @@
|
|||
#!/usr/bin/env python3.7
|
||||
#
|
||||
# Copyright (C) 2019 The University of Sheffield, UK
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
#
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
import argparse
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import json
|
||||
import sys
|
||||
import csv
|
||||
from jsmin import jsmin
|
||||
|
||||
from zipfile import ZipFile
|
||||
|
||||
from ExtensionCrawler.config import (const_log_format, const_basedir)
|
||||
from ExtensionCrawler.archive import iter_tar_entries_by_date
|
||||
from ExtensionCrawler.js_mincer import mince_js
|
||||
|
||||
|
||||
def get_etag(headers_content):
|
||||
headers_content = headers_content.replace(
|
||||
'"', '\\"').replace("'", '"')
|
||||
headers_json = json.loads(headers_content)
|
||||
if "ETag" in headers_json:
|
||||
return headers_json["ETag"]
|
||||
|
||||
|
||||
def get_name_and_version(overview_contents):
|
||||
# Extract extension name
|
||||
match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
|
||||
overview_contents)
|
||||
name = match.group(1) if match else None
|
||||
|
||||
# Extract extension version
|
||||
match = re.search(
|
||||
"""<meta itemprop="version" content="(.*?)"\s*/>""", overview_contents)
|
||||
version = match.group(1) if match else None
|
||||
|
||||
return name, version
|
||||
|
||||
|
||||
def handle_extid(conf, extid, permission_map, csvwriter):
|
||||
results = []
|
||||
|
||||
still_in_store = None
|
||||
crx_etags = [None]
|
||||
for date, tups in iter_tar_entries_by_date(conf.archive_dir, extid):
|
||||
if conf.from_date and not (conf.from_date <= date):
|
||||
continue
|
||||
if conf.latest_date and not (date <= conf.latest_date):
|
||||
continue
|
||||
|
||||
crx_etag = None
|
||||
name = None
|
||||
version = None
|
||||
date_matches = {}
|
||||
for permission in permission_map.keys():
|
||||
date_matches[permission] = False
|
||||
has_crx_file = False
|
||||
used_permissions = set()
|
||||
|
||||
for tarentry, tarfile in tups:
|
||||
tarentry_filename = tarentry.name.split("/")[-1]
|
||||
|
||||
if tarentry_filename.endswith(".crx.headers"):
|
||||
crx_etag = get_etag(tarfile.read().decode())
|
||||
if crx_etag:
|
||||
crx_etags += [crx_etag]
|
||||
|
||||
if tarentry_filename == "overview.html":
|
||||
name, version = get_name_and_version(tarfile.read().decode())
|
||||
|
||||
if tarentry_filename == "overview.html.status":
|
||||
still_in_store = tarfile.read().decode().startswith("2")
|
||||
|
||||
if tarentry_filename.endswith(".crx") and tarentry.size > 0:
|
||||
has_crx_file = True
|
||||
with ZipFile(tarfile) as zf:
|
||||
for zipentry in zf.infolist():
|
||||
if zipentry.filename.endswith(".js") or zipentry.filename.endswith(".html"):
|
||||
with zf.open(zipentry) as f:
|
||||
verbatim_lines = []
|
||||
for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
|
||||
verbatim_lines += block.content.splitlines()
|
||||
|
||||
for permission, evidences in permission_map.items():
|
||||
for evidence in evidences:
|
||||
for line in verbatim_lines:
|
||||
if evidence in line:
|
||||
date_matches[permission] = True
|
||||
break
|
||||
|
||||
if zipentry.filename == "manifest.json":
|
||||
with zf.open(zipentry) as m:
|
||||
raw_content = m.read()
|
||||
# There are some manifests that seem to have weird encodings...
|
||||
try:
|
||||
content = raw_content.decode("utf-8-sig")
|
||||
except UnicodeDecodeError:
|
||||
# Trying a different encoding, manifests are weird...
|
||||
content = raw_content.decode("latin1")
|
||||
|
||||
manifest = json.loads(jsmin(content), strict=False)
|
||||
if "permissions" in manifest:
|
||||
for permission in manifest["permissions"]:
|
||||
used_permissions.add(str(permission))
|
||||
|
||||
if has_crx_file:
|
||||
line = [date, crx_etag, name, version]
|
||||
for permission in sorted(list(permission_map.keys())):
|
||||
if permission in used_permissions:
|
||||
if date_matches[permission]:
|
||||
line += ["REQ_AND_FOUND"]
|
||||
else:
|
||||
line += ["REQ_AND_NOT_FOUND"]
|
||||
else:
|
||||
if date_matches[permission]:
|
||||
line += ["NOT_REQ_AND_FOUND"]
|
||||
else:
|
||||
line += ["NOT_REQ_AND_NOT_FOUND"]
|
||||
results += [line]
|
||||
|
||||
for result in results:
|
||||
csvwriter.writerow([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)])
|
||||
|
||||
|
||||
def main(conf):
|
||||
logger = logging.getLogger()
|
||||
ch = logging.StreamHandler(sys.stderr)
|
||||
ch.setFormatter(logging.Formatter(const_log_format()))
|
||||
logger.addHandler(ch)
|
||||
if conf.verbose:
|
||||
logger.setLevel(logging.DEBUG)
|
||||
else:
|
||||
logger.setLevel(logging.WARNING)
|
||||
|
||||
with open(conf.MAP_FILE) as f:
|
||||
permission_map = json.load(f)
|
||||
|
||||
with open(conf.EXTID_FILE) as f:
|
||||
csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
|
||||
csvwriter.writerow(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version"]
|
||||
+ sorted(list(permission_map.keys())))
|
||||
for extid in [l.strip() for l in f.readlines()]:
|
||||
handle_extid(conf, extid, permission_map, csvwriter)
|
||||
|
||||
|
||||
def build_parser():
|
||||
main_parser = argparse.ArgumentParser(
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
description='Search extensions for unused permissions')
|
||||
main_parser.add_argument(
|
||||
'MAP_FILE',
|
||||
help='json file with permission - literal string mapping')
|
||||
main_parser.add_argument(
|
||||
'EXTID_FILE',
|
||||
help='file with extension ids')
|
||||
main_parser.add_argument(
|
||||
'-v',
|
||||
'--verbose',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='increase verbosity')
|
||||
|
||||
|
||||
main_parser.add_argument(
|
||||
'-D',
|
||||
'--latest-date',
|
||||
metavar='DATE',
|
||||
type=str,
|
||||
help='select latest crx from tar, released before DATE.\n' +
|
||||
'Together with --from-date, specifies all crx released in specified\n' +
|
||||
'date range.')
|
||||
|
||||
main_parser.add_argument(
|
||||
'-d',
|
||||
'--from-date',
|
||||
metavar='DATE',
|
||||
type=str,
|
||||
help='select oldest crx from tar released after DATE.\n' +
|
||||
'Together with --latest-date, specifies all crx released in specified\n' +
|
||||
'date range.')
|
||||
|
||||
main_parser.add_argument(
|
||||
'-a',
|
||||
'--archive-dir',
|
||||
metavar='archive',
|
||||
type=str,
|
||||
default=const_basedir(),
|
||||
help='archive directory')
|
||||
|
||||
return main_parser
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main_parser = build_parser()
|
||||
|
||||
main_conf = main_parser.parse_args()
|
||||
|
||||
sys.exit(main(main_conf))
|
Loading…
Reference in New Issue