diff --git a/ExtensionCrawler/archive.py b/ExtensionCrawler/archive.py
index 9346a95..9f4ed0f 100644
--- a/ExtensionCrawler/archive.py
+++ b/ExtensionCrawler/archive.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
-#
+#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@@ -26,7 +26,7 @@ from random import randint
import datetime
from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
-from ExtensionCrawler.archive import *
+from ExtensionCrawler.archive import archive_file
from ExtensionCrawler.sqlite import *
import dateutil
import dateutil.parser
@@ -76,7 +76,7 @@ class RequestResult:
class UpdateResult:
def __init__(self, id, is_new, exception, res_overview, res_crx,
- res_reviews, res_support):
+ res_reviews, res_support, res_sql, sql_update):
self.id = id
self.new = is_new
self.exception = exception
@@ -84,6 +84,8 @@ class UpdateResult:
self.res_crx = res_crx
self.res_reviews = res_reviews
self.res_support = res_support
+ self.res_sql = res_sql
+ self.sql_update = sql_update
def is_new(self):
return self.new
@@ -128,9 +130,11 @@ class UpdateResult:
def corrupt_tar(self):
return self.exception is not None
-
-def get_local_archive_dir(id):
- return "{}".format(id[:3])
+ def sql_exception(self):
+ return self.res_sql is not None
+
+ def sql_success(self):
+ return self.sql_update
def write_text(tardir, date, fname, text):
@@ -262,6 +266,7 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
timeout=10,
allow_redirects=True)
etag = res.headers.get('Etag')
+ write_text(tmptardir, date, extfilename + ".etag", etag)
logtxt = logmsg(verbose, logtxt, (
" - checking etag, last: {}\n" +
" current: {}\n").format(
@@ -287,6 +292,8 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
for chunk in res.iter_content(chunk_size=512 * 1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
+ write_text(tmptardir, date, extfilename + ".etag",
+ res.headers.get("ETag"))
except Exception as e:
logtxt = logmsg(verbose, logtxt,
" - Exception: {}\n".format(str(e)))
@@ -354,6 +361,8 @@ def update_extension(archivedir, verbose, forums, ext_id):
logtxt = logmsg(verbose, "", " Updating {}".format(ext_id))
is_new = False
tar_exception = None
+ sql_exception = None
+ sql_success = False
tmptardir = ""
tmptar = ""
@@ -380,7 +389,7 @@ def update_extension(archivedir, verbose, forums, ext_id):
logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
tar_exception = e
return UpdateResult(ext_id, is_new, tar_exception, res_overview,
- res_crx, res_reviews, res_support)
+ res_crx, res_reviews, res_support, sql_exception, False)
res_overview, msg_overview = update_overview(tmptardir, date, verbose,
ext_id)
@@ -443,10 +452,22 @@ def update_extension(archivedir, verbose, forums, ext_id):
except Exception:
pass
- msg_updatesqlite = update_sqlite(archivedir, tmptardir, verbose, ext_id,
- date)
- log(verbose, logtxt + msg_updatesqlite)
+ try:
+ sql_success, msg_updatesqlite = update_sqlite(archivedir, tmptardir, ext_id, date, is_new,
+ verbose, 11 * " ")
+ logtxt = logmsg(verbose, logtxt, msg_updatesqlite)
+ except Exception as e:
+ logtxt = logmsg(verbose, logtxt,
+ " * Exception during update of sqlite db ")
+ logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
+
+ sql_exception = e
+
+ try:
+ write_text(tardir, date, ext_id + ".sql.exception", str(e))
+ except Exception:
+ pass
try:
shutil.rmtree(path=tmpdir)
except Exception as e:
@@ -459,11 +480,12 @@ def update_extension(archivedir, verbose, forums, ext_id):
except Exception:
pass
+ log(verbose, logtxt)
return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
- res_reviews, res_support)
+ res_reviews, res_support, sql_exception, sql_success)
-def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
+def update_extensions(archivedir, verbose, parallel, forums_ext_ids, ext_ids):
ext_with_forums = []
ext_without_forums = []
ext_ids = list(set(ext_ids) - set(forums_ext_ids))
@@ -471,7 +493,7 @@ def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
log(verbose, "Updating {} extensions ({} including forums)\n".format(
len(ext_ids), len(forums_ext_ids)))
# First, update extensions with forums sequentially (and with delays) to
- # avoid running into Googles DDOS detection.
+ # avoid running into Googles DDOS detection.
log(verbose,
" Updating {} extensions including forums (sequentially))\n".format(
len(forums_ext_ids)))
@@ -486,7 +508,7 @@ def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
log(verbose,
" Updating {} extensions excluding forums (parallel))\n".format(
len(parallel_ids)))
- with Pool(12) as p:
+ with Pool(parallel) as p:
ext_without_forums = list(
p.map(
partial(update_extension, archivedir, verbose, False),
@@ -506,5 +528,6 @@ def get_existing_ids(archivedir, verbose):
def get_forum_ext_ids(confdir, verbose):
with open(os.path.join(confdir, "forums.conf")) as f:
ids = f.readlines()
+ r = re.compile('^[a-p]+$')
ids = [x.strip() for x in ids]
- return ids
+ return list(filter(r.match, ids))
diff --git a/ExtensionCrawler/config.py b/ExtensionCrawler/config.py
index 278dfec..d65f70d 100644
--- a/ExtensionCrawler/config.py
+++ b/ExtensionCrawler/config.py
@@ -16,6 +16,8 @@
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
+import os
+
def const_sitemap_url():
return "https://chrome.google.com/webstore/sitemap"
@@ -74,3 +76,12 @@ def const_review_payload(ext_id, start, end):
'"startindex":"{}",' + '"numresults":"{}",' + '"id":"428"}}],' +
'"internedKeys":[],' + '"internedValues":[]}}').format(ext_id, start,
end)
+
+
+def get_local_archive_dir(id):
+ return "{}".format(id[:3])
+
+
+def archive_file(archivedir, ext_id):
+ return os.path.join(
+ str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")
diff --git a/ExtensionCrawler/sqlite.py b/ExtensionCrawler/sqlite.py
index 01786da..e3f7576 100644
--- a/ExtensionCrawler/sqlite.py
+++ b/ExtensionCrawler/sqlite.py
@@ -1,7 +1,6 @@
-#!/usr/bin/env python3
#
# Copyright (C) 2017 The University of Sheffield, UK
-#
+#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@@ -18,11 +17,205 @@
from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
+from ExtensionCrawler.crx import *
+
+from ExtensionCrawler import archive
+
+import sqlite3
+import re
+from bs4 import BeautifulSoup
+from zipfile import ZipFile
+import json
+import os
+import tempfile
+import tarfile
+import glob
-def update_sqlite(archivedir, tmptardir, verbose, ext_id, date):
- indent = " "
- txt = logmsg(verbose, "", indent + "* Updating SQLite ...")
- txt = logmsg(verbose, txt, "")
+class SqliteUpdateError(Exception):
+ def __init__(self, reason="unknown"):
+ self.reason = reason
+
+
+def get_etag(ext_id, datepath, con):
+ #Trying etag file
+ etagpath = next(iter(glob.glob(os.path.join(datepath, "*.etag"))), None)
+ if etagpath:
+ with open(etagpath) as f:
+ return f.read()
+
+ #Trying to parse header file for etag
+ headerpath = next(
+ iter(glob.glob(os.path.join(datepath, "*.crx.headers"))), None)
+ if headerpath:
+ with open(headerpath) as f:
+ headers = eval(f.read())
+ if "ETag" in headers:
+ return headers["ETag"]
+
+ #Trying to look up previous etag in database
+ linkpath = next(
+ iter(glob.glob(os.path.join(datepath, "*.crx.link"))), None)
+ if linkpath:
+ with open(linkpath) as f:
+ link = f.read()
+ linked_date = link[3:].split("/")[0]
+
+ row = next(
+ con.execute(
+ "SELECT crx_etag FROM extension WHERE extid=? AND date=?",
+ (ext_id, linked_date)), None)
+ if row:
+ return row[0]
+
+
+def get_overview_status(datepath):
+ with open(os.path.join(datepath, "overview.html.status")) as f:
+ return int(f.read())
+
+
+def get_crx_status(datepath):
+ statuspath = next(
+ iter(glob.glob(os.path.join(datepath, "*.crx.status"))), None)
+ if statuspath:
+ with open(statuspath) as f:
+ return int(f.read())
+
+
+def parse_and_insert_overview(ext_id, date, datepath, con):
+ overview_path = os.path.join(datepath, "overview.html")
+ with open(overview_path) as overview_file:
+ contents = overview_file.read()
+
+ # Extract extension name
+ match = re.search("""<meta itemprop="name" content="(.*?)"/>""",
+ contents)
+ name = match.group(1) if match else None
+
+ # Extract extension version
+ match = re.search("""<meta itemprop="version" content="(.*?)"/>""",
+ contents)
+ version = match.group(1) if match else None
+
+ # Extracts extension categories
+ match = re.search("""<Attribute name="category">(.+?)</Attribute>""",
+ contents)
+ categories = match.group(1).split(",") if match else None
+
+ # Extracts the number of downloads
+ match = re.search("""user_count.*?(\d+)""", contents)
+ downloads = int(match.group(1)) if match else None
+
+ # Extracts the full extension description as it appears on the overview page
+ doc = BeautifulSoup(contents, 'html.parser')
+
+ description_parent = doc.find('div', itemprop="description")
+ description = str(description_parent.contents[
+ 0]) if description_parent and description_parent.contents else None
+ full_description = str(
+ description_parent.parent) if description_parent else None
+
+ developer_parent = doc.find(class_=lambda cls: cls and "e-f-Me" in cls)
+ developer = str(
+ developer_parent.contents[0]) if developer_parent else None
+
+ last_updated_parent = doc.find(
+ class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
+ last_updated = str(
+ last_updated_parent.contents[0]) if last_updated_parent else None
+
+ etag = get_etag(ext_id, datepath, con)
+
+ overview_status = get_overview_status(datepath)
+
+ crx_status = get_crx_status(datepath)
+
+ con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
+ (ext_id, date, name, version, description, downloads,
+ full_description, developer, etag, last_updated,
+ overview_status, crx_status))
+
+ if categories:
+ for category in categories:
+ con.execute("INSERT INTO category VALUES (?,?,?)",
+ (ext_id, date, category))
+
+
+def parse_and_insert_crx(ext_id, date, datepath, con):
+ etag = get_etag(ext_id, datepath, con)
+ crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
+ filename = os.path.basename(crx_path)
+
+ with ZipFile(crx_path) as f:
+ with f.open("manifest.json") as m:
+ try:
+ # There are some manifests that seem to have weird encodings...
+ manifest = json.loads(m.read().decode("utf-8-sig"))
+ if "permissions" in manifest:
+ for permission in manifest["permissions"]:
+ con.execute(
+ "INSERT OR REPLACE INTO permission VALUES (?,?)",
+ (etag, str(permission)))
+ except json.decoder.JSONDecodeError:
+ pass
+
+ public_key = read_crx(crx_path).pk
+
+ con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename,
+ public_key))
+
+
+def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose,
+ indent):
+ txt = ""
+
+ txt = logmsg(verbose, txt,
+ indent + "- updating using {}\n".format(datepath))
+
+ if not os.path.exists(db_path):
+ raise SqliteUpdateError("db file not found")
+
+ with sqlite3.connect(db_path) as con:
+ parse_and_insert_overview(ext_id, date, datepath, con)
+
+ crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
+
+ etag = get_etag(ext_id, datepath, con)
+ etag_already_in_db = next(
+ con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, )))[
+ 0]
+ if etag and not etag_already_in_db:
+ if crx_path:
+ parse_and_insert_crx(ext_id, date, datepath, con)
+ else:
+ raise SqliteUpdateError(
+ "etag not in db and no crx file present")
return txt
+
+
+def update_sqlite(archivedir, tmptardir, ext_id, date, is_new, verbose, indent):
+ update_successful = False
+ txt = ""
+ indent2 = indent + 4 * " "
+
+ datepath = os.path.join(tmptardir, date)
+
+ txt = logmsg(verbose, txt,
+ indent + "* extracting information into SQLite db...\n")
+
+ db_path = os.path.join(archivedir, ext_id[:3], ext_id + ".sqlite")
+
+ txt = logmsg(verbose, txt,
+ indent2 + "- attempting incremental update...\n")
+ try:
+ updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date,
+ verbose, indent2)
+ txt = logmsg(verbose, txt, updatetxt)
+ update_successful = True
+ except SqliteUpdateError as e:
+ txt = logmsg(
+ verbose, txt,
+ indent2 + "- incremental update failed: {}\n".format(e.reason))
+
+ return update_successful, txt
diff --git a/crawler b/crawler
index 6dbb472..45e050e 100755
--- a/crawler
+++ b/crawler
@@ -33,6 +33,9 @@ import dateutil.parser
import time
import getopt
+# Script should run with python 3.4 or 3.5
+assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
+
def write_log(dir, fname, text):
os.makedirs(dir, exist_ok=True)
@@ -78,6 +81,18 @@ def log_failures_to_file(dir, today, res):
sorted(map(lambda x: x.id, filter(lambda x: x.corrupt_tar(), res))),
"")
write_log(dir, today + "-file-corruption.log", file_corruption)
+
+ sql_exception = reduce(
+ lambda x, y: x + "\n" + y,
+ sorted(map(lambda x: x.id, filter(lambda x: x.sql_exception(), res))),
+ "")
+ write_log(dir, today + "-sql-exception.log", sql_exception)
+
+ sql_success = reduce(
+ lambda x, y: x + "\n" + y,
+ sorted(map(lambda x: x.id, filter(lambda x: not x.sql_success(), res))),
+ "")
+ write_log(dir, today + "-sql-not-updated.log", sql_success)
def log_summary(verbose, res, stderr=False, runtime=0):
@@ -95,6 +110,8 @@ def log_summary(verbose, res, stderr=False, runtime=0):
not_in_store = len(list(filter(lambda x: x.not_in_store(), res)))
not_modified = len(list(filter(lambda x: x.not_modified(), res)))
corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res))
+ sql_exception = len(list(filter(lambda x: x.sql_exception(), res)))
+ sql_success = len(list(filter(lambda x: x.sql_success(), res)))
new = len(list(filter(lambda x: x.is_new(), res)))
updated = len(
@@ -105,6 +122,7 @@ def log_summary(verbose, res, stderr=False, runtime=0):
p(" Updated {} out of {} extensions successfully\n".format(
str(success), str(total)))
p(" Updated extensions: {:8d}\n".format(updated))
+ p(" Updated SQL databases: {:8d}\n".format(sql_success))
p(" New extensions: {:8d}\n".format(new))
p(" Not authorized: {:8d}\n".format(not_authorized))
p(" Raised Google DDOS: {:8d}\n".format(raised_ddos))
@@ -112,6 +130,7 @@ def log_summary(verbose, res, stderr=False, runtime=0):
p(" Extensions not in store: {:8d}\n".format(not_in_store))
p(" Unknown exception: {:8d}\n".format(has_exception))
p(" Corrupt tar archives: {:8d}\n".format(len(corrupt_tar_archives)))
+ p(" SQL exception: {:8d}\n".format(sql_exception))
p(" Total runtime: {}\n".format(
str(datetime.timedelta(seconds=int(runtime)))))
@@ -135,10 +154,11 @@ def help():
def main(argv):
today = datetime.datetime.now(datetime.timezone.utc).isoformat()
basedir = "archive"
+ parallel = 24
verbose = True
discover = False
try:
- opts, args = getopt.getopt(argv, "hsda:", ["archive="])
+ opts, args = getopt.getopt(argv, "hsda:p:", ["archive=", "parallel="])
except getopt.GetoptError:
help()
sys.exit(2)
@@ -148,6 +168,8 @@ def main(argv):
sys.exit()
elif opt in ("-a", "--archive"):
basedir = arg
+ elif opt in ("-p", "--parallel"):
+ parallel = int(arg)
elif opt == '-s':
verbose = False
elif opt == '-d':
@@ -164,10 +186,11 @@ def main(argv):
start_time = time.time()
log(verbose, "Configuration:\n")
- log(verbose, " Base dir: {}\n".format(basedir))
- log(verbose, " Archive dir: {}\n".format(archive_dir))
- log(verbose, " Conf. dir: {}\n".format(conf_dir))
- log(verbose, " Discover new ext.: {}\n".format(discover))
+ log(verbose, " Base dir: {}\n".format(basedir))
+ log(verbose, " Archive directory: {}\n".format(archive_dir))
+ log(verbose, " Configuration directory: {}\n".format(conf_dir))
+ log(verbose, " Discover new extensions: {}\n".format(discover))
+ log(verbose, " Max num. of concurrent downloads: {}\n".format(parallel))
log(verbose, "\n")
forum_ext_ids = get_forum_ext_ids(conf_dir, verbose)
@@ -178,7 +201,7 @@ def main(argv):
discovered_ids = get_new_ids(verbose, known_ids)
ext_ids = list(set(discovered_ids) | set(known_ids))
- res = update_extensions(archive_dir, verbose, forum_ext_ids, ext_ids)
+ res = update_extensions(archive_dir, verbose, parallel, forum_ext_ids, ext_ids)
# We re-try (once) the extensions with unknown exceptions, as
# they are often temporary
diff --git a/create_db b/create_db
new file mode 100755
index 0000000..3b22a1f
--- /dev/null
+++ b/create_db
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2016,2017 The University of Sheffield, UK
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import getopt
+import os
+import sys
+import glob
+import tarfile
+import tempfile
+
+from ExtensionCrawler.sqlite import *
+
+
+def setup_tables(con):
+ con.execute("""CREATE TABLE review ("""
+ """id INTEGER PRIMARY KEY,"""
+ """extid TEXT,"""
+ """date TEXT,"""
+ """user TEXT,"""
+ """reviewdate TEXT,"""
+ """rating TEXT,"""
+ """comment TEXT"""
+ """)""")
+ con.execute("""CREATE TABLE category ("""
+ """extid TEXT,"""
+ """date TEXT,"""
+ """category TEXT,"""
+ """PRIMARY KEY (extid, date, category)"""
+ """)""")
+ con.execute("""CREATE TABLE permission ("""
+ """crx_etag TEXT,"""
+ """permission TEXT,"""
+ """PRIMARY KEY (crx_etag, permission)"""
+ """)""")
+ con.execute("""CREATE TABLE crx ("""
+ """etag TEXT PRIMARY KEY,"""
+ """filename TEXT,"""
+ """publickey BLOB"""
+ """)""")
+ con.execute("""CREATE TABLE extension ("""
+ """extid TEXT,"""
+ """date TEXT,"""
+ """name TEXT,"""
+ """version TEXT,"""
+ """description TEXT,"""
+ """downloads INTEGER,"""
+ """fulldescription TEXT,"""
+ """developer TEXT,"""
+ """crx_etag TEXT,"""
+ """lastupdated TEXT,"""
+ """crx_status INTEGER,"""
+ """overview_status INTEGER,"""
+ """PRIMARY KEY (extid, date),"""
+ """FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
+ """)""")
+
+
+def help():
+ print("create_db [OPTION]")
+ print(" -h print this help text")
+ print(" -a=
archive directory")
+ print(" -p= three-letter-prefix")
+
+
+def main(argv):
+ basedir = "archive"
+ prefix = ""
+ try:
+ opts, args = getopt.getopt(argv, "ha:p:", ["archive=", "prefix="])
+ except getopt.GetoptError:
+ help()
+ sys.exit(2)
+ for opt, arg in opts:
+ if opt == '-h':
+ help()
+ sys.exit()
+ elif opt in ("-a", "--archive"):
+ basedir = arg
+ elif opt in ("-p", "--prefix"):
+ prefix = arg
+
+ archive_dir = os.path.join(basedir, "data")
+ threeletterdirs = glob.glob(os.path.join(archive_dir, prefix + "*"))
+ for threeletterdir in threeletterdirs:
+ for ext_id in set([d[:32] for d in os.listdir(threeletterdir)]):
+ tarpath = os.path.join(threeletterdir, ext_id + ".tar")
+ dbpath = os.path.join(threeletterdir, ext_id + ".sqlite")
+ if os.path.exists(dbpath):
+ os.remove(dbpath)
+ with tempfile.TemporaryDirectory() as tmpdir:
+ with tarfile.open(tarpath) as t:
+ t.extractall(tmpdir)
+ iddir = os.path.join(tmpdir, ext_id)
+
+ with sqlite3.connect(dbpath) as con:
+ setup_tables(con)
+ for date in sorted(os.listdir(iddir)):
+ datepath = os.path.join(iddir, date)
+ print(
+ update_sqlite_incremental(dbpath, datepath, ext_id,
+ date, True, ""))
+
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
diff --git a/permstats.py b/permstats.py
deleted file mode 100755
index ab3f7cd..0000000
--- a/permstats.py
+++ /dev/null
@@ -1,173 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright (C) 2016 The University of Sheffield, UK
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see .
-#
-
-from zipfile import ZipFile
-import argparse
-import json
-import sys
-import os
-from jsmin import jsmin
-import re
-
-regex_concrete_url = re.compile(r'^.*://.*[a-z0-9]+\.[a-z]+.*$')
-
-
-class PermissionHandlerPrintNames:
- def __init__(self, permname):
- self.permname = permname
- self.extinfo = {}
-
- def handle_permission(self, extid, permobj, path):
- if self.permname in str(permobj):
- with open(os.path.join(path, 'metadata.json')) as f:
- metadata = json.load(f)
- self.extinfo[extid] = '{} | {} | {}'.format(metadata[1],
- metadata[6], path)
-
- def print_result(self, fileobj, delim):
- fileobj.write('Extensions that use permission "{}":\n\n'.format(
- self.permname))
- for extid in self.extinfo:
- fileobj.write('{}\n'.format(self.extinfo[extid]))
- fileobj.write('\n\n')
-
-
-class PermissionHandler:
- def __init__(self):
- self.permissions = {}
- self.extids = set()
-
- def handle_permission(self, extid, permobj, path):
- self.extids.add(extid)
- perm = str(permobj)
- if not perm in self.permissions:
- self.permissions[perm] = 0
- self.permissions[perm] += 1
-
- def print_result(self, fileobj, delim):
- fileobj.write('Total: {} extensions\n'.format(len(self.extids)))
- for perm in sorted(
- self.permissions, key=self.permissions.get, reverse=True):
- fileobj.write('{}{}{}{}{:.2%}\n'.format(
- perm, delim, self.permissions[perm], delim,
- float(self.permissions[perm]) / len(self.extids)))
- fileobj.write('\n\n')
-
-
-class PermissionHandlerCondensed:
- def __init__(self):
- self.permissions = {}
- self.extids = set()
- self.exts_with_concrete_urls = set()
-
- def handle_permission(self, extid, permobj, path):
- self.extids.add(extid)
-
- perm = str(permobj)
- if regex_concrete_url.match(perm):
- if extid in self.exts_with_concrete_urls:
- return
- self.exts_with_concrete_urls.add(extid)
- perm = '<<<{}>>>'.format(regex_concrete_url.pattern)
- if not perm in self.permissions:
- self.permissions[perm] = 0
- self.permissions[perm] += 1
-
- def print_result(self, fileobj, delim):
- fileobj.write('Condensed. Total: {} extensions\n'.format(
- len(self.extids)))
- for perm in sorted(
- self.permissions, key=self.permissions.get, reverse=True):
- fileobj.write('{}{}{}{}{:.2%}\n'.format(
- perm, delim, self.permissions[perm], delim,
- float(self.permissions[perm]) / len(self.extids)))
- fileobj.write('\n\n')
-
-
-class PermissionStatisticGenerator:
- def run(category_folder, permhandlers):
- for root, dirs, files in os.walk(category_folder):
- crxfile = next((f for f in files if f.endswith('.crx')), None)
- if crxfile:
- extid = os.path.basename(root)
- with ZipFile(os.path.join(root, crxfile)) as zipfile:
- with zipfile.open('manifest.json') as f:
- content = jsmin(f.read().decode())
-
- # This is needed to strip weird BOMs ...
- first_bracket = content.find('{')
- if first_bracket >= 0:
- content = content[first_bracket:]
-
- manifest = json.loads(content)
- if 'permissions' in manifest:
- for permobj in manifest['permissions']:
- for handler in permhandlers:
- handler.handle_permission(extid, permobj,
- root)
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(
- description='Prints statistics about the requested permissions of downloaded extensions.'
- )
- parser.add_argument(
- 'dir',
- help='The directory in which the extensions are stored. The directory structure must be {category}/{extid}/*.crx.'
- )
- parser.add_argument(
- '-d',
- '--delim',
- default='\t',
- help='Delimiter used for the statistics output.')
- parser.add_argument(
- '-o',
- '--output',
- default=sys.stdout,
- type=argparse.FileType('w'),
- help='Save the statistics into a file.')
- parser.add_argument(
- '-p',
- '--permission',
- help='Prints out all extension names and descriptions that use the given permission.'
- )
- parser.add_argument(
- '-c',
- '--categories',
- action='store_true',
- help='Print the results for each category separately.')
-
- args = parser.parse_args()
-
- category_folders = [args.dir]
- if args.categories:
- category_folders += [
- os.path.join(args.dir, d) for d in next(os.walk(args.dir))[1]
- ]
-
- for category_folder in category_folders:
- args.output.write('Results for category {}:\n\n'.format(
- category_folder))
- if args.permission:
- handlers = [PermissionHandlerPrintNames(args.permission)]
- else:
- handlers = [PermissionHandler(), PermissionHandlerCondensed()]
- PermissionStatisticGenerator.run(category_folder, handlers)
-
- for handler in handlers:
- handler.print_result(args.output, args.delim)