#!/usr/bin/env python3.7
#
# Copyright (C) 2019 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# SPDX-License-Identifier: GPL-3.0-or-later

import argparse
import io
import logging
import re
import json
import sys
import csv
import ast
from zipfile import ZipFile

from jsmin import jsmin

from ExtensionCrawler.config import (const_log_format, const_basedir)
from ExtensionCrawler.archive import iter_tar_entries_by_date
from ExtensionCrawler.js_mincer import mince_js


def get_etag(headers_content):
    """Return the ETag from a headers file stored as a Python dict literal."""
    d = ast.literal_eval(headers_content)
    if "ETag" in d:
        return d["ETag"]


def get_metadata(overview_contents):
    """Extract name, version, categories and download count from an archived
    Chrome Web Store overview page. The patterns below are tied to the page
    markup as archived in 2019 and are best-effort."""

    # Extract extension name
    match = re.search("""<meta itemprop="name" content="(.*?)"/>""",
                      overview_contents)
    name = match.group(1) if match else None

    # Extract extension version
    match = re.search("""<meta itemprop="version" content="(.*?)"/>""",
                      overview_contents)
    version = match.group(1) if match else None

    # Extract extension categories
    match = re.search("""Attribute name="category">(.+?)</Attribute>""",
                      overview_contents)
    categories = match.group(1).split(",") if match else []

    # Extract the number of downloads (user count)
    match = re.search(
        """<meta itemprop="interactionCount" content="UserDownloads:([0-9,]+)""",
        overview_contents)
    downloads = int(match.group(1).replace(",", "")) if match else None

    return name, version, categories, downloads


def handle_extid(conf, extid, permission_map, csvwriter):
    results = []
    still_in_store = None
    crx_etags = [None]

    # Walk over the archived snapshots of this extension, one group of tar
    # entries per crawl date. This assumes iter_tar_entries_by_date(archive_dir,
    # extid, from_date, latest_date) yields (date, [(tarentry, fileobj), ...])
    # pairs restricted to the requested date range.
    for date, tups in iter_tar_entries_by_date(
            conf.archive_dir, extid, conf.from_date, conf.latest_date):
        crx_etag = None
        name = None
        version = None
        categories = []
        downloads = None
        has_crx_file = False
        used_permissions = set()
        date_matches = {permission: False for permission in permission_map}

        for tarentry, tarfile in tups:
            if tarentry.name.endswith("overview.html"):
                # Store overview page: name, version, categories, downloads.
                name, version, categories, downloads = get_metadata(
                    tarfile.read().decode("utf-8", errors="surrogateescape"))
            elif tarentry.name.endswith(".crx.headers"):
                # Assumption: the HTTP headers of the crx download are archived
                # next to the crx as a Python dict literal; the ETag identifies
                # the crx version offered on this date.
                crx_etag = get_etag(
                    tarfile.read().decode("utf-8", errors="surrogateescape"))
                if crx_etag is not None:
                    crx_etags += [crx_etag]
            elif tarentry.name.endswith(".crx") and tarentry.size > 0:
                has_crx_file = True
                with ZipFile(tarfile) as zf:
                    for zipentry in zf.infolist():
                        if (zipentry.filename.endswith(".js")
                                or zipentry.filename.endswith(".html")):
                            with zf.open(zipentry) as f:
                                verbatim_lines = []
                                for block in mince_js(
                                        io.TextIOWrapper(
                                            f,
                                            encoding="utf-8",
                                            errors="surrogateescape")):
                                    verbatim_lines += block.content.splitlines()
                                # A permission counts as "found" if any of its
                                # evidence strings occurs verbatim in the code.
                                for permission, evidences in permission_map.items():
                                    for evidence in evidences:
                                        for line in verbatim_lines:
                                            if evidence in line:
                                                date_matches[permission] = True
                                                break
                        if zipentry.filename == "manifest.json":
                            with zf.open(zipentry) as m:
                                raw_content = m.read()
                                # There are some manifests that seem to have weird encodings...
                                try:
                                    content = raw_content.decode("utf-8-sig")
                                except UnicodeDecodeError:
                                    # Trying a different encoding, manifests are weird...
                                    content = raw_content.decode("latin1")
                                manifest = json.loads(jsmin(content), strict=False)
                                if "permissions" in manifest:
                                    for permission in manifest["permissions"]:
                                        used_permissions.add(str(permission))

        if has_crx_file:
            line = [date, crx_etag, name, version, "+".join(categories), downloads]
            for permission in sorted(list(permission_map.keys())):
                if permission in used_permissions:
                    if date_matches[permission]:
                        line += ["REQ_AND_FOUND"]
                    else:
                        line += ["REQ_AND_NOT_FOUND"]
                else:
                    if date_matches[permission]:
                        line += ["NOT_REQ_AND_FOUND"]
                    else:
                        line += ["NOT_REQ_AND_NOT_FOUND"]
            results += [line]

    for result in results:
        csvwriter.writerow(
            [str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)])


def main(conf):
    logger = logging.getLogger()
    ch = logging.StreamHandler(sys.stderr)
    ch.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(ch)
    if conf.verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.WARNING)

    with open(conf.MAP_FILE) as f:
        permission_map = json.load(f)

    with open(conf.EXTID_FILE) as f:
        csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
        csvwriter.writerow(
            ["extid", "still_in_store", "most_recent_crx_etag", "date",
             "crx_etag", "name", "version", "categories", "downloads"] +
            sorted(list(permission_map.keys())))
        for extid in [l.strip() for l in f.readlines()]:
            try:
                handle_extid(conf, extid, permission_map, csvwriter)
            except Exception:
                logging.exception(f"Fatal error when handling extension '{extid}'")


def build_parser():
    main_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Search extensions for unused permissions')
    main_parser.add_argument(
        'MAP_FILE',
        help='json file with permission - literal string mapping')
    main_parser.add_argument(
        'EXTID_FILE',
        help='file with extension ids')
    main_parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        default=False,
        help='increase verbosity')
    main_parser.add_argument(
        '-D',
        '--latest-date',
        metavar='DATE',
        type=str,
        help='select latest crx from tar, released before DATE.\n' +
        'Together with --from-date, specifies all crx released in the\n' +
        'specified date range.')
    main_parser.add_argument(
        '-d',
        '--from-date',
        metavar='DATE',
        type=str,
        help='select oldest crx from tar released after DATE.\n' +
        'Together with --latest-date, specifies all crx released in the\n' +
        'specified date range.')
    main_parser.add_argument(
        '-a',
        '--archive-dir',
        metavar='archive',
        type=str,
        default=const_basedir(),
        help='archive directory')
    return main_parser


if __name__ == "__main__":
    main_parser = build_parser()
    main_conf = main_parser.parse_args()
    sys.exit(main(main_conf))
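
# Usage sketch (file and directory names below are hypothetical).
#
# MAP_FILE maps each permission to a list of literal strings whose verbatim
# occurrence in a .js or .html file of a crx is taken as evidence that the
# permission is actually used, e.g.:
#
#   {
#       "cookies": ["chrome.cookies"],
#       "tabs": ["chrome.tabs"]
#   }
#
# EXTID_FILE lists one extension id per line. The CSV report is written to
# stdout, one row per archived crx:
#
#   ./unused_permissions.py -a /srv/extension-archive \
#       permission_map.json extension_ids.txt > report.csv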