diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py
index 9346a95..9f4ed0f 100644
--- a/ExtensionCrawler/archive.py
+++ b/ExtensionCrawler/archive.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 #
 # Copyright (C) 2016,2017 The University of Sheffield, UK
-# 
+#
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
@@ -26,7 +26,7 @@ from random import randint
 import datetime
 from ExtensionCrawler.config import *
 from ExtensionCrawler.util import *
-from ExtensionCrawler.archive import *
+from ExtensionCrawler.config import archive_file
 from ExtensionCrawler.sqlite import *
 import dateutil
 import dateutil.parser
@@ -76,7 +76,7 @@ class RequestResult:
 
 class UpdateResult:
     def __init__(self, id, is_new, exception, res_overview, res_crx,
-                 res_reviews, res_support):
+                 res_reviews, res_support, res_sql, sql_update):
         self.id = id
         self.new = is_new
         self.exception = exception
@@ -84,6 +84,8 @@
         self.res_crx = res_crx
         self.res_reviews = res_reviews
         self.res_support = res_support
+        self.res_sql = res_sql
+        self.sql_update = sql_update
 
     def is_new(self):
         return self.new
@@ -128,9 +130,11 @@
     def corrupt_tar(self):
         return self.exception is not None
 
-
-def get_local_archive_dir(id):
-    return "{}".format(id[:3])
+    def sql_exception(self):
+        return self.res_sql is not None
+
+    def sql_success(self):
+        return self.sql_update
 
 
 def write_text(tardir, date, fname, text):
@@ -262,6 +266,7 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
             timeout=10,
             allow_redirects=True)
         etag = res.headers.get('Etag')
+        write_text(tmptardir, date, extfilename + ".etag", etag)
         logtxt = logmsg(verbose, logtxt, (
             "    - checking etag, last: {}\n" +
             "                  current: {}\n").format(
@@ -287,6 +292,8 @@
             for chunk in res.iter_content(chunk_size=512 * 1024):
                 if chunk:  # filter out keep-alive new chunks
                     f.write(chunk)
+        write_text(tmptardir, date, extfilename + ".etag",
+                   res.headers.get("ETag"))
     except Exception as e:
         logtxt = logmsg(verbose, logtxt, "  - Exception: {}\n".format(str(e)))
@@ -354,6 +361,8 @@ def update_extension(archivedir, verbose, forums, ext_id):
     logtxt = logmsg(verbose, "", "    Updating {}".format(ext_id))
     is_new = False
     tar_exception = None
+    sql_exception = None
+    sql_success = False
     tmptardir = ""
     tmptar = ""
 
@@ -380,7 +389,7 @@
         logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
         tar_exception = e
         return UpdateResult(ext_id, is_new, tar_exception, res_overview,
-                            res_crx, res_reviews, res_support)
+                            res_crx, res_reviews, res_support, sql_exception, False)
 
     res_overview, msg_overview = update_overview(tmptardir, date, verbose,
                                                  ext_id)
@@ -443,10 +452,22 @@
         except Exception:
             pass
 
-    msg_updatesqlite = update_sqlite(archivedir, tmptardir, verbose, ext_id,
-                                     date)
-    log(verbose, logtxt + msg_updatesqlite)
-
+    try:
+        sql_success, msg_updatesqlite = update_sqlite(
+            archivedir, tmptardir, ext_id, date, is_new, verbose, 11 * " ")
+        logtxt = logmsg(verbose, logtxt, msg_updatesqlite)
+    except Exception as e:
+        logtxt = logmsg(verbose, logtxt,
+                        "    * Exception during update of sqlite db ")
+        logtxt = logmsg(verbose, logtxt,
+                        "      / Exception: {}\n".format(str(e)))
+
+        sql_exception = e
+
+        try:
+            write_text(tmptardir, date, ext_id + ".sql.exception", str(e))
+        except Exception:
+            pass
     try:
         shutil.rmtree(path=tmpdir)
     except Exception as e:
@@ -459,11 +480,12 @@
         except Exception:
             pass
 
+    log(verbose, logtxt)
     return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
-                        res_reviews, res_support)
+                        res_reviews, res_support, sql_exception, sql_success)
 
 
-def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
+def update_extensions(archivedir, verbose, parallel, forums_ext_ids, ext_ids):
     ext_with_forums = []
     ext_without_forums = []
     ext_ids = list(set(ext_ids) - set(forums_ext_ids))
@@ -471,7 +493,7 @@
     log(verbose, "Updating {} extensions ({} including forums)\n".format(
         len(ext_ids), len(forums_ext_ids)))
     # First, update extensions with forums sequentially (and with delays) to
-    # avoid running into Googles DDOS detection. 
+    # avoid running into Google's DDoS detection.
     log(verbose,
         "  Updating {} extensions including forums (sequentially))\n".format(
             len(forums_ext_ids)))
@@ -486,7 +508,7 @@
     log(verbose,
         "  Updating {} extensions excluding forums (parallel))\n".format(
             len(parallel_ids)))
-    with Pool(12) as p:
+    with Pool(parallel) as p:
         ext_without_forums = list(
             p.map(
                 partial(update_extension, archivedir, verbose, False),
@@ -506,5 +528,6 @@ def get_existing_ids(archivedir, verbose):
 def get_forum_ext_ids(confdir, verbose):
     with open(os.path.join(confdir, "forums.conf")) as f:
         ids = f.readlines()
+    r = re.compile('^[a-p]+$')
     ids = [x.strip() for x in ids]
-    return ids
+    return list(filter(r.match, ids))
diff --git a/ExtensionCrawler/config.py b/ExtensionCrawler/config.py
index 278dfec..d65f70d 100644
--- a/ExtensionCrawler/config.py
+++ b/ExtensionCrawler/config.py
@@ -16,6 +16,8 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
+import os
+
 
 def const_sitemap_url():
     return "https://chrome.google.com/webstore/sitemap"
@@ -74,3 +76,12 @@ def const_review_payload(ext_id, start, end):
             '"startindex":"{}",' + '"numresults":"{}",' + '"id":"428"}}],' +
             '"internedKeys":[],' + '"internedValues":[]}}').format(ext_id,
                                                                    start, end)
+
+
+def get_local_archive_dir(id):
+    return "{}".format(id[:3])
+
+
+def archive_file(archivedir, ext_id):
+    return os.path.join(
+        str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")
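get_local_archive_dir and archive_file together fix the on-disk layout: each extension's tar lives in a directory named after the first three letters of its ID. A small illustration (the ID is made up; real IDs are 32 characters over the alphabet a-p):

    from ExtensionCrawler.config import archive_file

    ext_id = "a" * 32  # hypothetical extension ID
    print(archive_file("archive/data", ext_id))
    # archive/data/aaa/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.tar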
AND date=?", + (ext_id, linked_date)), None) + if row: + return row[0] + + +def get_overview_status(datepath): + with open(os.path.join(datepath, "overview.html.status")) as f: + return int(f.read()) + + +def get_crx_status(datepath): + statuspath = next( + iter(glob.glob(os.path.join(datepath, "*.crx.status"))), None) + if statuspath: + with open(statuspath) as f: + return int(f.read()) + + +def parse_and_insert_overview(ext_id, date, datepath, con): + overview_path = os.path.join(datepath, "overview.html") + with open(overview_path) as overview_file: + contents = overview_file.read() + + # Extract extension name + match = re.search("""""", + contents) + name = match.group(1) if match else None + + # Extract extension version + match = re.search("""""", + contents) + version = match.group(1) if match else None + + # Extracts extension categories + match = re.search("""Attribute name="category">(.+?)""", + contents) + categories = match.group(1).split(",") if match else None + + # Extracts the number of downloads + match = re.search("""user_count.*?(\d+)""", contents) + downloads = int(match.group(1)) if match else None + + # Extracts the full extension description as it appears on the overview page + doc = BeautifulSoup(contents, 'html.parser') + + description_parent = doc.find('div', itemprop="description") + description = str(description_parent.contents[ + 0]) if description_parent and description_parent.contents else None + full_description = str( + description_parent.parent) if description_parent else None + + developer_parent = doc.find(class_=lambda cls: cls and "e-f-Me" in cls) + developer = str( + developer_parent.contents[0]) if developer_parent else None + + last_updated_parent = doc.find( + class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls) + last_updated = str( + last_updated_parent.contents[0]) if last_updated_parent else None + + etag = get_etag(ext_id, datepath, con) + + overview_status = get_overview_status(datepath) + + crx_status = get_crx_status(datepath) + + con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?)", + (ext_id, date, name, version, description, downloads, + full_description, developer, etag, last_updated, + overview_status, crx_status)) + + if categories: + for category in categories: + con.execute("INSERT INTO category VALUES (?,?,?)", + (ext_id, date, category)) + + +def parse_and_insert_crx(ext_id, date, datepath, con): + etag = get_etag(ext_id, datepath, con) + crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None) + filename = os.path.basename(crx_path) + + with ZipFile(crx_path) as f: + with f.open("manifest.json") as m: + try: + # There are some manifests that seem to have weird encodings... 
+ manifest = json.loads(m.read().decode("utf-8-sig")) + if "permissions" in manifest: + for permission in manifest["permissions"]: + con.execute( + "INSERT OR REPLACE INTO permission VALUES (?,?)", + (etag, str(permission))) + except json.decoder.JSONDecodeError: + pass + + public_key = read_crx(crx_path).pk + + con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename, + public_key)) + + +def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose, + indent): + txt = "" + + txt = logmsg(verbose, txt, + indent + "- updating using {}\n".format(datepath)) + + if not os.path.exists(db_path): + raise SqliteUpdateError("db file not found") + + with sqlite3.connect(db_path) as con: + parse_and_insert_overview(ext_id, date, datepath, con) + + crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None) + + etag = get_etag(ext_id, datepath, con) + etag_already_in_db = next( + con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, )))[ + 0] + if etag and not etag_already_in_db: + if crx_path: + parse_and_insert_crx(ext_id, date, datepath, con) + else: + raise SqliteUpdateError( + "etag not in db and no crx file present") return txt + + +def update_sqlite(archivedir, tmptardir, ext_id, date, is_new, verbose, indent): + update_successful = False + txt = "" + indent2 = indent + 4 * " " + + datepath = os.path.join(tmptardir, date) + + txt = logmsg(verbose, txt, + indent + "* extracting information into SQLite db...\n") + + db_path = os.path.join(archivedir, ext_id[:3], ext_id + ".sqlite") + + txt = logmsg(verbose, txt, + indent2 + "- attempting incremental update...\n") + try: + updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date, + verbose, indent2) + txt = logmsg(verbose, txt, updatetxt) + update_successful = True + except SqliteUpdateError as e: + txt = logmsg( + verbose, txt, + indent2 + "- incremental update failed: {}\n".format(e.reason)) + + return update_successful, txt diff --git a/crawler b/crawler index 6dbb472..45e050e 100755 --- a/crawler +++ b/crawler @@ -33,6 +33,9 @@ import dateutil.parser import time import getopt +# Script should run with python 3.4 or 3.5 +assert sys.version_info >= (3, 4) and sys.version_info < (3, 6) + def write_log(dir, fname, text): os.makedirs(dir, exist_ok=True) @@ -78,6 +81,18 @@ def log_failures_to_file(dir, today, res): sorted(map(lambda x: x.id, filter(lambda x: x.corrupt_tar(), res))), "") write_log(dir, today + "-file-corruption.log", file_corruption) + + sql_exception = reduce( + lambda x, y: x + "\n" + y, + sorted(map(lambda x: x.id, filter(lambda x: x.sql_exception(), res))), + "") + write_log(dir, today + "-sql-exception.log", sql_exception) + + sql_success = reduce( + lambda x, y: x + "\n" + y, + sorted(map(lambda x: x.id, filter(lambda x: not x.sql_success(), res))), + "") + write_log(dir, today + "-sql-not-updated.log", sql_success) def log_summary(verbose, res, stderr=False, runtime=0): @@ -95,6 +110,8 @@ def log_summary(verbose, res, stderr=False, runtime=0): not_in_store = len(list(filter(lambda x: x.not_in_store(), res))) not_modified = len(list(filter(lambda x: x.not_modified(), res))) corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res)) + sql_exception = len(list(filter(lambda x: x.sql_exception(), res))) + sql_success = len(list(filter(lambda x: x.sql_success(), res))) new = len(list(filter(lambda x: x.is_new(), res))) updated = len( @@ -105,6 +122,7 @@ def log_summary(verbose, res, stderr=False, runtime=0): p(" Updated {} out of {} extensions successfully\n".format( 
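The utf-8-sig decode in parse_and_insert_crx is what tolerates the "weird encodings" the comment mentions: some manifest.json files start with a UTF-8 BOM, which plain json.loads rejects on Python 3.4/3.5. A standalone illustration (the byte string is a made-up minimal manifest):

    import json

    raw = b'\xef\xbb\xbf{"permissions": ["tabs"]}'  # BOM-prefixed manifest
    # raw.decode("utf-8") keeps the BOM and json.loads() then fails;
    # "utf-8-sig" strips it before parsing.
    print(json.loads(raw.decode("utf-8-sig")))  # {'permissions': ['tabs']}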
diff --git a/crawler b/crawler
index 6dbb472..45e050e 100755
--- a/crawler
+++ b/crawler
@@ -33,6 +33,9 @@ import dateutil.parser
 import time
 import getopt
 
+# Script should run with Python 3.4 or 3.5
+assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
+
 
 def write_log(dir, fname, text):
     os.makedirs(dir, exist_ok=True)
@@ -78,6 +81,18 @@ def log_failures_to_file(dir, today, res):
         sorted(map(lambda x: x.id, filter(lambda x: x.corrupt_tar(), res))),
         "")
     write_log(dir, today + "-file-corruption.log", file_corruption)
+
+    sql_exception = reduce(
+        lambda x, y: x + "\n" + y,
+        sorted(map(lambda x: x.id, filter(lambda x: x.sql_exception(), res))),
+        "")
+    write_log(dir, today + "-sql-exception.log", sql_exception)
+
+    sql_success = reduce(
+        lambda x, y: x + "\n" + y,
+        sorted(map(lambda x: x.id, filter(lambda x: not x.sql_success(), res))),
+        "")
+    write_log(dir, today + "-sql-not-updated.log", sql_success)
 
 
 def log_summary(verbose, res, stderr=False, runtime=0):
@@ -95,6 +110,8 @@ def log_summary(verbose, res, stderr=False, runtime=0):
     not_in_store = len(list(filter(lambda x: x.not_in_store(), res)))
     not_modified = len(list(filter(lambda x: x.not_modified(), res)))
     corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res))
+    sql_exception = len(list(filter(lambda x: x.sql_exception(), res)))
+    sql_success = len(list(filter(lambda x: x.sql_success(), res)))
 
     new = len(list(filter(lambda x: x.is_new(), res)))
     updated = len(
@@ -105,6 +122,7 @@
     p("    Updated {} out of {} extensions successfully\n".format(
         str(success), str(total)))
     p("    Updated extensions:      {:8d}\n".format(updated))
+    p("    Updated SQL databases:   {:8d}\n".format(sql_success))
     p("    New extensions:          {:8d}\n".format(new))
     p("    Not authorized:          {:8d}\n".format(not_authorized))
     p("    Raised Google DDOS:      {:8d}\n".format(raised_ddos))
@@ -112,6 +130,7 @@
     p("    Extensions not in store: {:8d}\n".format(not_in_store))
     p("    Unknown exception:       {:8d}\n".format(has_exception))
     p("    Corrupt tar archives:    {:8d}\n".format(len(corrupt_tar_archives)))
+    p("    SQL exception:           {:8d}\n".format(sql_exception))
     p("    Total runtime:           {}\n".format(
         str(datetime.timedelta(seconds=int(runtime)))))
 
@@ -135,10 +154,11 @@ def help():
 def main(argv):
     today = datetime.datetime.now(datetime.timezone.utc).isoformat()
     basedir = "archive"
+    parallel = 24
     verbose = True
     discover = False
     try:
-        opts, args = getopt.getopt(argv, "hsda:", ["archive="])
+        opts, args = getopt.getopt(argv, "hsda:p:", ["archive=", "parallel="])
     except getopt.GetoptError:
         help()
         sys.exit(2)
@@ -148,6 +168,8 @@
             sys.exit()
         elif opt in ("-a", "--archive"):
             basedir = arg
+        elif opt in ("-p", "--parallel"):
+            parallel = int(arg)
         elif opt == '-s':
             verbose = False
         elif opt == '-d':
@@ -164,10 +186,11 @@
     start_time = time.time()
 
     log(verbose, "Configuration:\n")
-    log(verbose, "  Base dir: {}\n".format(basedir))
-    log(verbose, "  Archive dir: {}\n".format(archive_dir))
-    log(verbose, "  Conf. dir: {}\n".format(conf_dir))
-    log(verbose, "  Discover new ext.: {}\n".format(discover))
+    log(verbose, "  Base dir:                          {}\n".format(basedir))
+    log(verbose, "  Archive directory:                 {}\n".format(archive_dir))
+    log(verbose, "  Configuration directory:           {}\n".format(conf_dir))
+    log(verbose, "  Discover new extensions:           {}\n".format(discover))
+    log(verbose, "  Max num. of concurrent downloads:  {}\n".format(parallel))
     log(verbose, "\n")
 
     forum_ext_ids = get_forum_ext_ids(conf_dir, verbose)
@@ -178,7 +201,7 @@
         discovered_ids = get_new_ids(verbose, known_ids)
     ext_ids = list(set(discovered_ids) | set(known_ids))
 
-    res = update_extensions(archive_dir, verbose, forum_ext_ids, ext_ids)
+    res = update_extensions(archive_dir, verbose, parallel, forum_ext_ids, ext_ids)
 
     # We re-try (once) the extensions with unknown exceptions, as
     # they are often temporary
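In getopt's short-option syntax, a trailing colon marks an option that expects a value, so both a: and p: need their own colon in "hsda:p:". A quick standalone check of the option spec used above (the argv values are examples only):

    import getopt

    # Same spec as crawler's main().
    opts, args = getopt.getopt(["-a", "archive", "-p", "36", "-d"],
                               "hsda:p:", ["archive=", "parallel="])
    print(opts)  # [('-a', 'archive'), ('-p', '36'), ('-d', '')]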
diff --git a/create_db b/create_db
new file mode 100755
index 0000000..3b22a1f
--- /dev/null
+++ b/create_db
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2016,2017 The University of Sheffield, UK
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import getopt
+import os
+import sys
+import glob
+import tarfile
+import tempfile
+
+from ExtensionCrawler.sqlite import *
+
+
+def setup_tables(con):
+    con.execute("""CREATE TABLE review ("""
+                """id INTEGER PRIMARY KEY,"""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """user TEXT,"""
+                """reviewdate TEXT,"""
+                """rating TEXT,"""
+                """comment TEXT"""
+                """)""")
+    con.execute("""CREATE TABLE category ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """category TEXT,"""
+                """PRIMARY KEY (extid, date, category)"""
+                """)""")
+    con.execute("""CREATE TABLE permission ("""
+                """crx_etag TEXT,"""
+                """permission TEXT,"""
+                """PRIMARY KEY (crx_etag, permission)"""
+                """)""")
+    con.execute("""CREATE TABLE crx ("""
+                """etag TEXT PRIMARY KEY,"""
+                """filename TEXT,"""
+                """publickey BLOB"""
+                """)""")
+    con.execute("""CREATE TABLE extension ("""
+                """extid TEXT,"""
+                """date TEXT,"""
+                """name TEXT,"""
+                """version TEXT,"""
+                """description TEXT,"""
+                """downloads INTEGER,"""
+                """fulldescription TEXT,"""
+                """developer TEXT,"""
+                """crx_etag TEXT,"""
+                """lastupdated TEXT,"""
+                """crx_status INTEGER,"""
+                """overview_status INTEGER,"""
+                """PRIMARY KEY (extid, date),"""
+                """FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
+                """)""")
+
+
+def help():
+    print("create_db [OPTION]")
+    print("    -h           print this help text")
+    print("    -a=<DIR>     archive directory")
+    print("    -p=<PREFIX>  three-letter-prefix")
+
+
+def main(argv):
+    basedir = "archive"
+    prefix = ""
+    try:
+        opts, args = getopt.getopt(argv, "ha:p:", ["archive=", "prefix="])
+    except getopt.GetoptError:
+        help()
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            help()
+            sys.exit()
+        elif opt in ("-a", "--archive"):
+            basedir = arg
+        elif opt in ("-p", "--prefix"):
+            prefix = arg
+
+    archive_dir = os.path.join(basedir, "data")
+    threeletterdirs = glob.glob(os.path.join(archive_dir, prefix + "*"))
+    for threeletterdir in threeletterdirs:
+        for ext_id in set([d[:32] for d in os.listdir(threeletterdir)]):
+            tarpath = os.path.join(threeletterdir, ext_id + ".tar")
+            dbpath = os.path.join(threeletterdir, ext_id + ".sqlite")
+            if os.path.exists(dbpath):
+                os.remove(dbpath)
+            with tempfile.TemporaryDirectory() as tmpdir:
+                with tarfile.open(tarpath) as t:
+                    t.extractall(tmpdir)
+                iddir = os.path.join(tmpdir, ext_id)
+
+                with sqlite3.connect(dbpath) as con:
+                    setup_tables(con)
+
+                for date in sorted(os.listdir(iddir)):
+                    datepath = os.path.join(iddir, date)
+                    print(
+                        update_sqlite_incremental(dbpath, datepath, ext_id,
+                                                  date, True, ""))
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
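Once create_db has populated the per-extension databases, the permission statistics that permstats.py (removed below) used to compute from raw .crx files can be expressed as a query. A sketch against a single database, assuming create_db has already run; the path and ID are made up, and aggregating across all databases is left out:

    import sqlite3

    db = "archive/data/aaa/" + "a" * 32 + ".sqlite"  # hypothetical path
    with sqlite3.connect(db) as con:
        # Permission rows are keyed by crx_etag, so join through the
        # extension table to count crawl dates per requested permission.
        query = ("SELECT p.permission, COUNT(DISTINCT e.date) "
                 "FROM permission AS p "
                 "JOIN extension AS e ON e.crx_etag = p.crx_etag "
                 "GROUP BY p.permission")
        for permission, dates in con.execute(query):
            print("{}: requested on {} crawl date(s)".format(
                permission, dates))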
diff --git a/permstats.py b/permstats.py
deleted file mode 100755
index ab3f7cd..0000000
--- a/permstats.py
+++ /dev/null
@@ -1,173 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright (C) 2016 The University of Sheffield, UK
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-
-from zipfile import ZipFile
-import argparse
-import json
-import sys
-import os
-from jsmin import jsmin
-import re
-
-regex_concrete_url = re.compile(r'^.*://.*[a-z0-9]+\.[a-z]+.*$')
-
-
-class PermissionHandlerPrintNames:
-    def __init__(self, permname):
-        self.permname = permname
-        self.extinfo = {}
-
-    def handle_permission(self, extid, permobj, path):
-        if self.permname in str(permobj):
-            with open(os.path.join(path, 'metadata.json')) as f:
-                metadata = json.load(f)
-                self.extinfo[extid] = '{} | {} | {}'.format(metadata[1],
-                                                            metadata[6], path)
-
-    def print_result(self, fileobj, delim):
-        fileobj.write('Extensions that use permission "{}":\n\n'.format(
-            self.permname))
-        for extid in self.extinfo:
-            fileobj.write('{}\n'.format(self.extinfo[extid]))
-        fileobj.write('\n\n')
-
-
-class PermissionHandler:
-    def __init__(self):
-        self.permissions = {}
-        self.extids = set()
-
-    def handle_permission(self, extid, permobj, path):
-        self.extids.add(extid)
-        perm = str(permobj)
-        if not perm in self.permissions:
-            self.permissions[perm] = 0
-        self.permissions[perm] += 1
-
-    def print_result(self, fileobj, delim):
-        fileobj.write('Total: {} extensions\n'.format(len(self.extids)))
-        for perm in sorted(
-                self.permissions, key=self.permissions.get, reverse=True):
-            fileobj.write('{}{}{}{}{:.2%}\n'.format(
-                perm, delim, self.permissions[perm], delim,
-                float(self.permissions[perm]) / len(self.extids)))
-        fileobj.write('\n\n')
-
-
-class PermissionHandlerCondensed:
-    def __init__(self):
-        self.permissions = {}
-        self.extids = set()
-        self.exts_with_concrete_urls = set()
-
-    def handle_permission(self, extid, permobj, path):
-        self.extids.add(extid)
-
-        perm = str(permobj)
-        if regex_concrete_url.match(perm):
-            if extid in self.exts_with_concrete_urls:
-                return
-            self.exts_with_concrete_urls.add(extid)
-            perm = '<<<{}>>>'.format(regex_concrete_url.pattern)
-        if not perm in self.permissions:
-            self.permissions[perm] = 0
-        self.permissions[perm] += 1
-
-    def print_result(self, fileobj, delim):
-        fileobj.write('Condensed. Total: {} extensions\n'.format(
-            len(self.extids)))
-        for perm in sorted(
-                self.permissions, key=self.permissions.get, reverse=True):
-            fileobj.write('{}{}{}{}{:.2%}\n'.format(
-                perm, delim, self.permissions[perm], delim,
-                float(self.permissions[perm]) / len(self.extids)))
-        fileobj.write('\n\n')
-
-
-class PermissionStatisticGenerator:
-    def run(category_folder, permhandlers):
-        for root, dirs, files in os.walk(category_folder):
-            crxfile = next((f for f in files if f.endswith('.crx')), None)
-            if crxfile:
-                extid = os.path.basename(root)
-                with ZipFile(os.path.join(root, crxfile)) as zipfile:
-                    with zipfile.open('manifest.json') as f:
-                        content = jsmin(f.read().decode())
-
-                        # This is needed to strip weird BOMs ...
-                        first_bracket = content.find('{')
-                        if first_bracket >= 0:
-                            content = content[first_bracket:]
-
-                        manifest = json.loads(content)
-                        if 'permissions' in manifest:
-                            for permobj in manifest['permissions']:
-                                for handler in permhandlers:
-                                    handler.handle_permission(extid, permobj,
-                                                              root)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Prints statistics about the requested permissions of downloaded extensions.'
-    )
-    parser.add_argument(
-        'dir',
-        help='The directory in which the extensions are stored. The directory structure must be {category}/{extid}/*.crx.'
-    )
-    parser.add_argument(
-        '-d',
-        '--delim',
-        default='\t',
-        help='Delimiter used for the statistics output.')
-    parser.add_argument(
-        '-o',
-        '--output',
-        default=sys.stdout,
-        type=argparse.FileType('w'),
-        help='Save the statistics into a file.')
-    parser.add_argument(
-        '-p',
-        '--permission',
-        help='Prints out all extension names and descriptions that use the given permission.'
-    )
-    parser.add_argument(
-        '-c',
-        '--categories',
-        action='store_true',
-        help='Print the results for each category separately.')
-
-    args = parser.parse_args()
-
-    category_folders = [args.dir]
-    if args.categories:
-        category_folders += [
-            os.path.join(args.dir, d) for d in next(os.walk(args.dir))[1]
-        ]
-
-    for category_folder in category_folders:
-        args.output.write('Results for category {}:\n\n'.format(
-            category_folder))
-        if args.permission:
-            handlers = [PermissionHandlerPrintNames(args.permission)]
-        else:
-            handlers = [PermissionHandler(), PermissionHandlerCondensed()]
-        PermissionStatisticGenerator.run(category_folder, handlers)
-
-        for handler in handlers:
-            handler.print_result(args.output, args.delim)