#!/usr/bin/env python3.7
#
# Copyright (C) 2019 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Search archived extensions for unused permissions.

For every extension id and every archived date that contains a non-empty
``.crx`` file, one CSV row is written to stdout stating, per permission of the
permission map, whether the permission is requested in ``manifest.json`` and
whether one of its evidence strings occurs in the packaged ``.js``/``.html``
files.
"""

import argparse
import ast
import csv
import io
import json
import logging
import re
import sys
from zipfile import ZipFile

from jsmin import jsmin

from ExtensionCrawler.archive import iter_tar_entries_by_date
from ExtensionCrawler.config import (const_log_format, const_basedir)
from ExtensionCrawler.js_mincer import mince_js

# NOTE(review): the two patterns were empty strings in the reviewed source
# (which made ``match.group(1)`` raise IndexError on every call).  They were
# reconstructed as the ``<meta itemprop=...>`` tags of the store overview
# page -- confirm against an archived overview.html.
_NAME_RE = re.compile(r'<meta itemprop="name" content="(.*?)"\s*/>')
_VERSION_RE = re.compile(r'<meta itemprop="version" content="(.*?)"\s*/>')

# CSV cell value, indexed by (permission requested in manifest, evidence found).
_VERDICTS = {
    (True, True): "REQ_AND_FOUND",
    (True, False): "REQ_AND_NOT_FOUND",
    (False, True): "NOT_REQ_AND_FOUND",
    (False, False): "NOT_REQ_AND_NOT_FOUND",
}


def get_etag(headers_content):
    """Return the ``ETag`` entry of a stored headers dump, or None if absent.

    *headers_content* is the text of a Python literal.  It is parsed with
    ``ast.literal_eval``, so nothing from the archive is executed; malformed
    content raises ``ValueError`` or ``SyntaxError``.
    """
    headers = ast.literal_eval(headers_content)
    if isinstance(headers, dict):
        return headers.get("ETag")
    return None


def get_name_and_version(overview_contents):
    """Extract ``(name, version)`` from the text of an overview page.

    Each element is None when the corresponding tag is not found.
    """
    name_match = _NAME_RE.search(overview_contents)
    version_match = _VERSION_RE.search(overview_contents)
    name = name_match.group(1) if name_match else None
    version = version_match.group(1) if version_match else None
    return name, version


def _find_evidence(lines, permission_map, already_found):
    """Return the permissions with an evidence string occurring in *lines*.

    Permissions in *already_found* are skipped: a permission only needs to be
    matched once per date, so rescanning cannot change the result.
    """
    found = set()
    for permission, evidences in permission_map.items():
        if permission in already_found:
            continue
        if any(evidence in line for evidence in evidences for line in lines):
            found.add(permission)
    return found


def _manifest_permissions(raw_content):
    """Return the ``permissions`` entries of a raw manifest as a set of str."""
    # There are some manifests that seem to have weird encodings...
    try:
        content = raw_content.decode("utf-8-sig")
    except UnicodeDecodeError:
        # latin1 maps every byte, so this fallback cannot fail.
        content = raw_content.decode("latin1")
    # jsmin strips the comments that strict JSON parsing would reject.
    manifest = json.loads(jsmin(content), strict=False)
    if "permissions" in manifest:
        # Entries are stringified because they are not guaranteed to be str.
        return {str(permission) for permission in manifest["permissions"]}
    return set()


def _scan_crx(crx_file, permission_map, found_permissions, used_permissions):
    """Scan one crx archive, updating both permission sets in place.

    *found_permissions* gains every permission with evidence in a ``.js`` or
    ``.html`` member; *used_permissions* gains every permission listed in a
    top-level ``manifest.json``.
    """
    with ZipFile(crx_file) as zf:
        for zipentry in zf.infolist():
            if zipentry.filename.endswith((".js", ".html")):
                with zf.open(zipentry) as f:
                    text = io.TextIOWrapper(
                        f, encoding="utf-8", errors="surrogateescape")
                    verbatim_lines = []
                    for block in mince_js(text):
                        verbatim_lines += block.content.splitlines()
                found_permissions |= _find_evidence(
                    verbatim_lines, permission_map, found_permissions)
            if zipentry.filename == "manifest.json":
                with zf.open(zipentry) as m:
                    used_permissions |= _manifest_permissions(m.read())


def handle_extid(conf, extid, permission_map, csvwriter):
    """Write one CSV row per archived date of *extid* that holds a crx file.

    Args:
        conf: parsed arguments; ``archive_dir``, ``from_date`` and
            ``latest_date`` are read.
        extid: extension id to look up in the archive.
        permission_map: mapping of permission name to an iterable of literal
            evidence strings.
        csvwriter: ``csv.writer``-like object receiving the rows.

    Rows are buffered until the whole archive has been read, because the
    ``still_in_store`` and ``most_recent_crx_etag`` columns carry the last
    values seen over all dates.
    """
    permissions = sorted(permission_map)
    results = []
    still_in_store = None
    most_recent_crx_etag = None

    for date, tups in iter_tar_entries_by_date(conf.archive_dir, extid):
        if conf.from_date and not (conf.from_date <= date):
            continue
        if conf.latest_date and not (date <= conf.latest_date):
            continue

        crx_etag = None
        name = None
        version = None
        has_crx_file = False
        found_permissions = set()
        used_permissions = set()

        for tarentry, tarfile in tups:
            tarentry_filename = tarentry.name.split("/")[-1]

            if tarentry_filename.endswith(".crx.headers"):
                crx_etag = get_etag(tarfile.read().decode())
                if crx_etag:
                    most_recent_crx_etag = crx_etag
            if tarentry_filename == "overview.html":
                name, version = get_name_and_version(tarfile.read().decode())
            if tarentry_filename == "overview.html.status":
                # Any 2xx status means the store still served the page.
                still_in_store = tarfile.read().decode().startswith("2")
            if tarentry_filename.endswith(".crx") and tarentry.size > 0:
                has_crx_file = True
                _scan_crx(tarfile, permission_map,
                          found_permissions, used_permissions)

        if has_crx_file:
            verdicts = [
                _VERDICTS[(permission in used_permissions,
                           permission in found_permissions)]
                for permission in permissions
            ]
            results.append([date, crx_etag, name, version] + verdicts)

    prefix = [extid, still_in_store, most_recent_crx_etag]
    for result in results:
        csvwriter.writerow([str(x) for x in prefix + result])


def main(conf):
    """Configure logging, then emit the CSV report for all listed extensions.

    The permission map is read from ``conf.MAP_FILE`` (JSON) and the extension
    ids from ``conf.EXTID_FILE`` (one per line); the CSV goes to stdout.
    """
    logger = logging.getLogger()
    ch = logging.StreamHandler(sys.stderr)
    ch.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(ch)
    logger.setLevel(logging.DEBUG if conf.verbose else logging.WARNING)

    with open(conf.MAP_FILE) as f:
        permission_map = json.load(f)

    with open(conf.EXTID_FILE) as f:
        csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
        csvwriter.writerow(["extid", "still_in_store", "most_recent_crx_etag",
                            "date", "crx_etag", "name", "version"]
                           + sorted(permission_map))
        for line in f:
            extid = line.strip()
            if not extid:
                # Blank lines (e.g. a trailing newline) are not extension ids.
                continue
            handle_extid(conf, extid, permission_map, csvwriter)


def build_parser():
    """Build the command-line argument parser of this script."""
    main_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Search extensions for unused permissions')
    main_parser.add_argument(
        'MAP_FILE',
        help='json file with permission - literal string mapping')
    main_parser.add_argument(
        'EXTID_FILE',
        help='file with extension ids')
    main_parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        default=False,
        help='increase verbosity')
    main_parser.add_argument(
        '-D',
        '--latest-date',
        metavar='DATE',
        type=str,
        help='select latest crx from tar, released before DATE.\n' +
        'Together with --from-date, specifies all crx released in specified\n' +
        'date range.')
    main_parser.add_argument(
        '-d',
        '--from-date',
        metavar='DATE',
        type=str,
        help='select oldest crx from tar released after DATE.\n' +
        'Together with --latest-date, specifies all crx released in specified\n' +
        'date range.')
    main_parser.add_argument(
        '-a',
        '--archive-dir',
        metavar='archive',
        type=str,
        default=const_basedir(),
        help='archive directory')
    return main_parser


if __name__ == "__main__":
    main_parser = build_parser()
    main_conf = main_parser.parse_args()
    sys.exit(main(main_conf))