#!/usr/bin/env python3.7
#
# Copyright (C) 2019 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# SPDX-License-Identifier: GPL-3.0-or-later
import argparse
import io
import logging
import re
import json
import sys
import csv
from jsmin import jsmin
import ast
from zipfile import ZipFile
from ExtensionCrawler.config import (const_log_format, const_basedir)
from ExtensionCrawler.archive import iter_tar_entries_by_date
from ExtensionCrawler.js_mincer import mince_js
def get_etag(headers_content):
    """Return the ``ETag`` entry of a stringified headers mapping.

    Args:
        headers_content: text of a Python literal (parsed with
            ``ast.literal_eval``) that is looked up by the key ``"ETag"``.

    Returns:
        The value stored under ``"ETag"``, or ``None`` when that key is
        absent.
    """
    headers = ast.literal_eval(headers_content)
    # The lookup is an exact, case-sensitive match on "ETag".
    if "ETag" not in headers:
        return None
    return headers["ETag"]
def get_metadata(overview_contents):
# Extract extension name
match = re.search("""""",
overview_contents)
name = match.group(1) if match else None
# Extract extension version
match = re.search(
"""""", overview_contents)
version = match.group(1) if match else None
# Extracts extension categories
match = re.search(
"""Attribute name="category">(.+?)""", overview_contents)
categories = match.group(1).split(",") if match else []
# Extracts the number of downloads
match = re.search(
""" 0:
has_crx_file = True
with ZipFile(tarfile) as zf:
for zipentry in zf.infolist():
if zipentry.filename.endswith(".js") or zipentry.filename.endswith(".html"):
with zf.open(zipentry) as f:
verbatim_lines = []
for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
verbatim_lines += block.content.splitlines()
for permission, evidences in permission_map.items():
for evidence in evidences:
for line in verbatim_lines:
if evidence in line:
date_matches[permission] = True
break
if zipentry.filename == "manifest.json":
with zf.open(zipentry) as m:
raw_content = m.read()
# There are some manifests that seem to have weird encodings...
try:
content = raw_content.decode("utf-8-sig")
except UnicodeDecodeError:
# Trying a different encoding, manifests are weird...
content = raw_content.decode("latin1")
manifest = json.loads(jsmin(content), strict=False)
if "permissions" in manifest:
for permission in manifest["permissions"]:
used_permissions.add(str(permission))
if has_crx_file:
line = [date, crx_etag, name, version, "+".join(categories), downloads]
for permission in sorted(list(permission_map.keys())):
if permission in used_permissions:
if date_matches[permission]:
line += ["REQ_AND_FOUND"]
else:
line += ["REQ_AND_NOT_FOUND"]
else:
if date_matches[permission]:
line += ["NOT_REQ_AND_FOUND"]
else:
line += ["NOT_REQ_AND_NOT_FOUND"]
results += [line]
for result in results:
csvwriter.writerow([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)])
def main(conf):
    """Write the permission-usage CSV report for every listed extension.

    Attaches a stderr handler to the root logger (level DEBUG when
    ``conf.verbose`` is set, WARNING otherwise), loads the JSON permission
    map from ``conf.MAP_FILE``, writes the CSV header to stdout and then
    calls ``handle_extid`` once per extension id read from
    ``conf.EXTID_FILE``.

    Args:
        conf: parsed command-line namespace. This function reads
            ``verbose``, ``MAP_FILE`` and ``EXTID_FILE`` and forwards the
            whole object to ``handle_extid``.

    Returns:
        None.
    """
    logger = logging.getLogger()
    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(stderr_handler)
    logger.setLevel(logging.DEBUG if conf.verbose else logging.WARNING)

    with open(conf.MAP_FILE) as f:
        permission_map = json.load(f)

    with open(conf.EXTID_FILE) as f:
        csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
        # One column per permission, in sorted key order.
        csvwriter.writerow(
            ["extid", "still_in_store", "most_recent_crx_etag", "date",
             "crx_etag", "name", "version", "categories", "downloads"]
            + sorted(permission_map))
        for raw_line in f:
            extid = raw_line.strip()
            if not extid:
                # A blank line is not an extension id; skip it instead of
                # handing an empty id to handle_extid.
                continue
            try:
                handle_extid(conf, extid, permission_map, csvwriter)
            except Exception:
                # Top-level boundary: log the traceback and carry on so one
                # failing extension does not abort the whole batch.
                logging.exception(f"Fatal error when handling extension '{extid}'")
def build_parser():
    """Create the command-line parser for the unused-permission search.

    Positional arguments are ``MAP_FILE`` and ``EXTID_FILE``; options are
    ``-v/--verbose``, ``-D/--latest-date``, ``-d/--from-date`` and
    ``-a/--archive-dir`` (which defaults to ``const_basedir()``).

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description='Search extensions for unused permissions',
        formatter_class=argparse.RawTextHelpFormatter)

    # Positional inputs.
    parser.add_argument(
        'MAP_FILE', help='json file with permission - literal string mapping')
    parser.add_argument('EXTID_FILE', help='file with extension ids')

    # Logging verbosity; a store_true flag is False unless given.
    parser.add_argument(
        '-v', '--verbose', action='store_true', help='increase verbosity')

    # Date bounds; each help text points at the other option.
    latest_date_help = (
        'select latest crx from tar, released before DATE.\n'
        'Together with --from-date, specifies all crx released in specified\n'
        'date range.')
    parser.add_argument(
        '-D', '--latest-date', metavar='DATE', type=str,
        help=latest_date_help)

    from_date_help = (
        'select oldest crx from tar released after DATE.\n'
        'Together with --latest-date, specifies all crx released in specified\n'
        'date range.')
    parser.add_argument(
        '-d', '--from-date', metavar='DATE', type=str, help=from_date_help)

    # Archive location.
    parser.add_argument(
        '-a', '--archive-dir', metavar='archive', type=str,
        default=const_basedir(), help='archive directory')

    return parser
# Script entry point: parse the command line, run main() and hand its
# return value to sys.exit() as the process exit status.
if __name__ == "__main__":
    cli_args = build_parser().parse_args()
    exit_status = main(cli_args)
    sys.exit(exit_status)