Merge branch 'master' into production

Author: Achim D. Brucker
Date:   2017-06-18 15:38:13 +01:00
Commit: f95619670c
6 changed files with 397 additions and 200 deletions

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@@ -26,7 +26,7 @@ from random import randint
import datetime
from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
from ExtensionCrawler.archive import *
from ExtensionCrawler.archive import archive_file
from ExtensionCrawler.sqlite import *
import dateutil
import dateutil.parser
@@ -76,7 +76,7 @@ class RequestResult:
class UpdateResult:
def __init__(self, id, is_new, exception, res_overview, res_crx,
res_reviews, res_support):
res_reviews, res_support, res_sql, sql_update):
self.id = id
self.new = is_new
self.exception = exception
@@ -84,6 +84,8 @@ class UpdateResult:
self.res_crx = res_crx
self.res_reviews = res_reviews
self.res_support = res_support
self.res_sql = res_sql
self.sql_update = sql_update
def is_new(self):
return self.new
@@ -128,9 +130,11 @@ class UpdateResult:
def corrupt_tar(self):
return self.exception is not None
def get_local_archive_dir(id):
return "{}".format(id[:3])
def sql_exception(self):
return self.res_sql is not None
def sql_success(self):
return self.sql_update
def write_text(tardir, date, fname, text):
@@ -262,6 +266,7 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
timeout=10,
allow_redirects=True)
etag = res.headers.get('Etag')
write_text(tmptardir, date, extfilename + ".etag", etag)
logtxt = logmsg(verbose, logtxt, (
" - checking etag, last: {}\n" +
" current: {}\n").format(
@@ -287,6 +292,8 @@ def update_crx(archivedir, tmptardir, verbose, ext_id, date):
for chunk in res.iter_content(chunk_size=512 * 1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
write_text(tmptardir, date, extfilename + ".etag",
res.headers.get("ETag"))
except Exception as e:
logtxt = logmsg(verbose, logtxt,
" - Exception: {}\n".format(str(e)))
@@ -354,6 +361,8 @@ def update_extension(archivedir, verbose, forums, ext_id):
logtxt = logmsg(verbose, "", " Updating {}".format(ext_id))
is_new = False
tar_exception = None
sql_exception = None
sql_success = False
tmptardir = ""
tmptar = ""
@@ -380,7 +389,7 @@ def update_extension(archivedir, verbose, forums, ext_id):
logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
tar_exception = e
return UpdateResult(ext_id, is_new, tar_exception, res_overview,
res_crx, res_reviews, res_support)
res_crx, res_reviews, res_support, sql_exception, False)
res_overview, msg_overview = update_overview(tmptardir, date, verbose,
ext_id)
@@ -443,10 +452,22 @@ def update_extension(archivedir, verbose, forums, ext_id):
except Exception:
pass
msg_updatesqlite = update_sqlite(archivedir, tmptardir, verbose, ext_id,
date)
log(verbose, logtxt + msg_updatesqlite)
try:
sql_success, msg_updatesqlite = update_sqlite(archivedir, tmptardir, ext_id, date, is_new,
verbose, 11 * " ")
logtxt = logmsg(verbose, logtxt, msg_updatesqlite)
except Exception as e:
logtxt = logmsg(verbose, logtxt,
" * Exception during update of sqlite db ")
logtxt = logmsg(verbose, logtxt, " / Exception: {}\n".format(str(e)))
sql_exception = e
try:
write_text(tardir, date, ext_id + ".sql.exception", str(e))
except Exception:
pass
try:
shutil.rmtree(path=tmpdir)
except Exception as e:
@@ -459,11 +480,12 @@ def update_extension(archivedir, verbose, forums, ext_id):
except Exception:
pass
log(verbose, logtxt)
return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
res_reviews, res_support)
res_reviews, res_support, sql_exception, sql_success)
def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
def update_extensions(archivedir, verbose, parallel, forums_ext_ids, ext_ids):
ext_with_forums = []
ext_without_forums = []
ext_ids = list(set(ext_ids) - set(forums_ext_ids))
@@ -471,7 +493,7 @@ def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
log(verbose, "Updating {} extensions ({} including forums)\n".format(
len(ext_ids), len(forums_ext_ids)))
# First, update extensions with forums sequentially (and with delays) to
# avoid running into Google's DDOS detection.
log(verbose,
" Updating {} extensions including forums (sequentially))\n".format(
len(forums_ext_ids)))
@@ -486,7 +508,7 @@ def update_extensions(archivedir, verbose, forums_ext_ids, ext_ids):
log(verbose,
" Updating {} extensions excluding forums (parallel))\n".format(
len(parallel_ids)))
with Pool(12) as p:
with Pool(parallel) as p:
ext_without_forums = list(
p.map(
partial(update_extension, archivedir, verbose, False),
@@ -506,5 +528,6 @@ def get_existing_ids(archivedir, verbose):
def get_forum_ext_ids(confdir, verbose):
with open(os.path.join(confdir, "forums.conf")) as f:
ids = f.readlines()
r = re.compile('^[a-p]+$')
ids = [x.strip() for x in ids]
return ids
return list(filter(r.match, ids))
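
Chrome extension IDs are 32-character strings over the letters a to p, so the new '^[a-p]+$' filter keeps only plain extension IDs from forums.conf and drops blank lines, comments, and the like. A minimal sketch of the effect (the sample ID is made up):

import re

r = re.compile('^[a-p]+$')
lines = ["# forums to watch", "", "aaaabbbbccccddddeeeeffffgggghhhh"]
ids = [x.strip() for x in lines]
print(list(filter(r.match, ids)))
# -> ['aaaabbbbccccddddeeeeffffgggghhhh']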

View File

@@ -16,6 +16,8 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
def const_sitemap_url():
return "https://chrome.google.com/webstore/sitemap"
@@ -74,3 +76,12 @@ def const_review_payload(ext_id, start, end):
'"startindex":"{}",' + '"numresults":"{}",' + '"id":"428"}}],' +
'"internedKeys":[],' + '"internedValues":[]}}').format(ext_id, start,
end)
def get_local_archive_dir(id):
return "{}".format(id[:3])
def archive_file(archivedir, ext_id):
return os.path.join(
str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")
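
The two helpers added here shard the archive by the first three letters of the extension ID, so no single directory has to hold the tar files of every extension. A quick sketch of the resulting layout (the ID is made up):

from ExtensionCrawler.config import archive_file

print(archive_file("archive/data", "aaaabbbbccccddddeeeeffffgggghhhh"))
# -> archive/data/aaa/aaaabbbbccccddddeeeeffffgggghhhh.tar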

View File

@@ -1,7 +1,6 @@
#!/usr/bin/env python3
#
# Copyright (C) 2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@@ -18,11 +17,205 @@
from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
from ExtensionCrawler.crx import *
from ExtensionCrawler import archive
import sqlite3
import re
from bs4 import BeautifulSoup
from zipfile import ZipFile
import json
import os
import tempfile
import tarfile
import glob
def update_sqlite(archivedir, tmptardir, verbose, ext_id, date):
indent = " "
txt = logmsg(verbose, "", indent + "* Updating SQLite ...")
txt = logmsg(verbose, txt, "")
class SqliteUpdateError(Exception):
def __init__(self, reason="unknown"):
self.reason = reason
def get_etag(ext_id, datepath, con):
# Trying etag file
etagpath = next(iter(glob.glob(os.path.join(datepath, "*.etag"))), None)
if etagpath:
with open(etagpath) as f:
return f.read()
# Trying to parse header file for etag
headerpath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.headers"))), None)
if headerpath:
with open(headerpath) as f:
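# Note: the *.crx.headers file is expected to hold the repr() of a dict,
# which eval() round-trips here; ast.literal_eval would be a safer choice
# if the archive contents are not fully trusted.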
headers = eval(f.read())
if "ETag" in headers:
return headers["ETag"]
# Trying to look up previous etag in database
linkpath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.link"))), None)
if linkpath:
with open(linkpath) as f:
link = f.read()
linked_date = link[3:].split("/")[0]
row = next(
con.execute(
"SELECT crx_etag FROM extension WHERE extid=? AND date=?",
(ext_id, linked_date)), None)
if row:
return row[0]
def get_overview_status(datepath):
with open(os.path.join(datepath, "overview.html.status")) as f:
return int(f.read())
def get_crx_status(datepath):
statuspath = next(
iter(glob.glob(os.path.join(datepath, "*.crx.status"))), None)
if statuspath:
with open(statuspath) as f:
return int(f.read())
def parse_and_insert_overview(ext_id, date, datepath, con):
overview_path = os.path.join(datepath, "overview.html")
with open(overview_path) as overview_file:
contents = overview_file.read()
# Extract extension name
match = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""",
contents)
name = match.group(1) if match else None
# Extract extension version
match = re.search("""<meta itemprop="version" content="(.*?)"\s*/>""",
contents)
version = match.group(1) if match else None
# Extracts extension categories
match = re.search("""Attribute name="category">(.+?)</Attribute>""",
contents)
categories = match.group(1).split(",") if match else None
# Extracts the number of downloads
match = re.search("""user_count.*?(\d+)""", contents)
downloads = int(match.group(1)) if match else None
# Extracts the full extension description as it appears on the overview page
doc = BeautifulSoup(contents, 'html.parser')
description_parent = doc.find('div', itemprop="description")
description = str(description_parent.contents[
0]) if description_parent and description_parent.contents else None
full_description = str(
description_parent.parent) if description_parent else None
developer_parent = doc.find(class_=lambda cls: cls and "e-f-Me" in cls)
developer = str(
developer_parent.contents[0]) if developer_parent else None
last_updated_parent = doc.find(
class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
last_updated = str(
last_updated_parent.contents[0]) if last_updated_parent else None
etag = get_etag(ext_id, datepath, con)
overview_status = get_overview_status(datepath)
crx_status = get_crx_status(datepath)
con.execute("INSERT INTO extension VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
(ext_id, date, name, version, description, downloads,
full_description, developer, etag, last_updated,
overview_status, crx_status))
if categories:
for category in categories:
con.execute("INSERT INTO category VALUES (?,?,?)",
(ext_id, date, category))
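# Illustrative sketch (not part of this commit) of how the name regex
# above behaves; the HTML fragment is invented for the example:
#
#   import re
#   contents = '<meta itemprop="name" content="Sample Extension"/>'
#   m = re.search("""<meta itemprop="name" content="(.*?)"\s*/>""", contents)
#   print(m.group(1) if m else None)  # -> Sample Extension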
def parse_and_insert_crx(ext_id, date, datepath, con):
etag = get_etag(ext_id, datepath, con)
crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
filename = os.path.basename(crx_path)
with ZipFile(crx_path) as f:
with f.open("manifest.json") as m:
try:
# There are some manifests that seem to have weird encodings...
manifest = json.loads(m.read().decode("utf-8-sig"))
if "permissions" in manifest:
for permission in manifest["permissions"]:
con.execute(
"INSERT OR REPLACE INTO permission VALUES (?,?)",
(etag, str(permission)))
except json.decoder.JSONDecodeError:
pass
public_key = read_crx(crx_path).pk
con.execute("INSERT INTO crx VALUES (?,?,?)", (etag, filename,
public_key))
def update_sqlite_incremental(db_path, datepath, ext_id, date, verbose,
indent):
txt = ""
txt = logmsg(verbose, txt,
indent + "- updating using {}\n".format(datepath))
if not os.path.exists(db_path):
raise SqliteUpdateError("db file not found")
with sqlite3.connect(db_path) as con:
parse_and_insert_overview(ext_id, date, datepath, con)
crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
etag = get_etag(ext_id, datepath, con)
etag_already_in_db = next(
con.execute("SELECT COUNT(etag) FROM crx WHERE etag=?", (etag, )))[
0]
if etag and not etag_already_in_db:
if crx_path:
parse_and_insert_crx(ext_id, date, datepath, con)
else:
raise SqliteUpdateError(
"etag not in db and no crx file present")
return txt
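# Note the guard above: a crx is parsed only when its etag is not yet in
# the db, so an extension that stays unchanged across many crawl dates has
# its manifest and public key stored exactly once; if the etag is unknown
# but no crx file was archived for the date, the incremental update bails
# out with SqliteUpdateError.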
def update_sqlite(archivedir, tmptardir, ext_id, date, is_new, verbose, indent):
update_successful = False
txt = ""
indent2 = indent + 4 * " "
datepath = os.path.join(tmptardir, date)
txt = logmsg(verbose, txt,
indent + "* extracting information into SQLite db...\n")
db_path = os.path.join(archivedir, ext_id[:3], ext_id + ".sqlite")
txt = logmsg(verbose, txt,
indent2 + "- attempting incremental update...\n")
try:
updatetxt = update_sqlite_incremental(db_path, datepath, ext_id, date,
verbose, indent2)
txt = logmsg(verbose, txt, updatetxt)
update_successful = True
except SqliteUpdateError as e:
txt = logmsg(
verbose, txt,
indent2 + "- incremental update failed: {}\n".format(e.reason))
return update_successful, txt
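
A rough usage sketch of the new entry point, mirroring the call site in update_extension (paths, ID, and date are illustrative):

from ExtensionCrawler.sqlite import update_sqlite

ok, log_txt = update_sqlite(
    "archive/data",                      # archivedir, holds aaa/<id>.sqlite
    "/tmp/ext-tmptar",                   # tmptardir with the unpacked crawl
    "aaaabbbbccccddddeeeeffffgggghhhh",  # ext_id (made up)
    "2017-06-18T15:00:00",               # date directory inside the tar
    False,                               # is_new
    True,                                # verbose
    11 * " ")                            # indent for log output
print(ok)        # True only if the incremental update succeeded
print(log_txt)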

crawler (35 changed lines)
View File

@@ -33,6 +33,9 @@ import dateutil.parser
import time
import getopt
# Script should run with Python 3.4 or 3.5
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
def write_log(dir, fname, text):
os.makedirs(dir, exist_ok=True)
@@ -78,6 +81,18 @@ def log_failures_to_file(dir, today, res):
sorted(map(lambda x: x.id, filter(lambda x: x.corrupt_tar(), res))),
"")
write_log(dir, today + "-file-corruption.log", file_corruption)
sql_exception = reduce(
lambda x, y: x + "\n" + y,
sorted(map(lambda x: x.id, filter(lambda x: x.sql_exception(), res))),
"")
write_log(dir, today + "-sql-exception.log", sql_exception)
sql_not_updated = reduce(
lambda x, y: x + "\n" + y,
sorted(map(lambda x: x.id, filter(lambda x: not x.sql_success(), res))),
"")
write_log(dir, today + "-sql-not-updated.log", sql_not_updated)
def log_summary(verbose, res, stderr=False, runtime=0):
@@ -95,6 +110,8 @@ def log_summary(verbose, res, stderr=False, runtime=0):
not_in_store = len(list(filter(lambda x: x.not_in_store(), res)))
not_modified = len(list(filter(lambda x: x.not_modified(), res)))
corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res))
sql_exception = len(list(filter(lambda x: x.sql_exception(), res)))
sql_success = len(list(filter(lambda x: x.sql_success(), res)))
new = len(list(filter(lambda x: x.is_new(), res)))
updated = len(
@@ -105,6 +122,7 @@ def log_summary(verbose, res, stderr=False, runtime=0):
p(" Updated {} out of {} extensions successfully\n".format(
str(success), str(total)))
p(" Updated extensions: {:8d}\n".format(updated))
p(" Updated SQL databases: {:8d}\n".format(sql_success))
p(" New extensions: {:8d}\n".format(new))
p(" Not authorized: {:8d}\n".format(not_authorized))
p(" Raised Google DDOS: {:8d}\n".format(raised_ddos))
@@ -112,6 +130,7 @@
p(" Extensions not in store: {:8d}\n".format(not_in_store))
p(" Unknown exception: {:8d}\n".format(has_exception))
p(" Corrupt tar archives: {:8d}\n".format(len(corrupt_tar_archives)))
p(" SQL exception: {:8d}\n".format(sql_exception))
p(" Total runtime: {}\n".format(
str(datetime.timedelta(seconds=int(runtime)))))
@@ -135,10 +154,11 @@ def help():
def main(argv):
today = datetime.datetime.now(datetime.timezone.utc).isoformat()
basedir = "archive"
parallel = 24
verbose = True
discover = False
try:
opts, args = getopt.getopt(argv, "hsda:", ["archive="])
opts, args = getopt.getopt(argv, "hsdap:", ["archive=",'parallel='])
except getopt.GetoptError:
help()
sys.exit(2)
@@ -148,6 +168,8 @@
sys.exit()
elif opt in ("-a", "--archive"):
basedir = arg
elif opt in ("-p", "--parallel"):
parallel = int(arg)
elif opt == '-s':
verbose = False
elif opt == '-d':
@@ -164,10 +186,11 @@ def main(argv):
start_time = time.time()
log(verbose, "Configuration:\n")
log(verbose, " Base dir: {}\n".format(basedir))
log(verbose, " Archive dir: {}\n".format(archive_dir))
log(verbose, " Conf. dir: {}\n".format(conf_dir))
log(verbose, " Discover new ext.: {}\n".format(discover))
log(verbose, " Base dir: {}\n".format(basedir))
log(verbose, " Archive directory: {}\n".format(archive_dir))
log(verbose, " Configuration directory: {}\n".format(conf_dir))
log(verbose, " Discover new extensions: {}\n".format(discover))
log(verbose, " Max num. of concurrent downloads: {}\n".format(parallel))
log(verbose, "\n")
forum_ext_ids = get_forum_ext_ids(conf_dir, verbose)
@@ -178,7 +201,7 @@ def main(argv):
discovered_ids = get_new_ids(verbose, known_ids)
ext_ids = list(set(discovered_ids) | set(known_ids))
res = update_extensions(archive_dir, verbose, forum_ext_ids, ext_ids)
res = update_extensions(archive_dir, verbose, parallel, forum_ext_ids, ext_ids)
# We re-try (once) the extensions with unknown exceptions, as
# they are often temporary
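
With the new option, the size of the worker pool used for the forum-less extensions is tunable from the command line; for example (path illustrative):

./crawler -a /srv/archive -p 12

caps the parallel phase at a 12-process pool instead of the default of 24.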

create_db (new executable file, 120 lines)
View File

@@ -0,0 +1,120 @@
#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import getopt
import os
import sys
import glob
import tarfile
import tempfile
from ExtensionCrawler.sqlite import *
def setup_tables(con):
con.execute("""CREATE TABLE review ("""
"""id INTEGER PRIMARY KEY,"""
"""extid TEXT,"""
"""date TEXT,"""
"""user TEXT,"""
"""reviewdate TEXT,"""
"""rating TEXT,"""
"""comment TEXT"""
""")""")
con.execute("""CREATE TABLE category ("""
"""extid TEXT,"""
"""date TEXT,"""
"""category TEXT,"""
"""PRIMARY KEY (extid, date, category)"""
""")""")
con.execute("""CREATE TABLE permission ("""
"""crx_etag TEXT,"""
"""permission TEXT,"""
"""PRIMARY KEY (crx_etag, permission)"""
""")""")
con.execute("""CREATE TABLE crx ("""
"""etag TEXT PRIMARY KEY,"""
"""filename TEXT,"""
"""publickey BLOB"""
""")""")
con.execute("""CREATE TABLE extension ("""
"""extid TEXT,"""
"""date TEXT,"""
"""name TEXT,"""
"""version TEXT,"""
"""description TEXT,"""
"""downloads INTEGER,"""
"""fulldescription TEXT,"""
"""developer TEXT,"""
"""crx_etag TEXT,"""
"""lastupdated TEXT,"""
"""crx_status INTEGER,"""
"""overview_status INTEGER,"""
"""PRIMARY KEY (extid, date),"""
"""FOREIGN KEY (crx_etag) REFERENCES crx(etag)"""
""")""")
def help():
print("create_db [OPTION]")
print(" -h print this help text")
print(" -a=<DIR> archive directory")
print(" -p=<PREFIX> three-letter-prefix")
def main(argv):
basedir = "archive"
prefix = ""
try:
opts, args = getopt.getopt(argv, "ha:p:", ["archive=", "prefix="])
except getopt.GetoptError:
help()
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
help()
sys.exit()
elif opt in ("-a", "--archive"):
basedir = arg
elif opt in ("-p", "--prefix"):
prefix = arg
archive_dir = os.path.join(basedir, "data")
threeletterdirs = glob.glob(os.path.join(archive_dir, prefix + "*"))
for threeletterdir in threeletterdirs:
for ext_id in set([d[:32] for d in os.listdir(threeletterdir)]):
tarpath = os.path.join(threeletterdir, ext_id + ".tar")
dbpath = os.path.join(threeletterdir, ext_id + ".sqlite")
if os.path.exists(dbpath):
os.remove(dbpath)
with tempfile.TemporaryDirectory() as tmpdir:
with tarfile.open(tarpath) as t:
t.extractall(tmpdir)
iddir = os.path.join(tmpdir, ext_id)
with sqlite3.connect(dbpath) as con:
setup_tables(con)
for date in sorted(os.listdir(iddir)):
datepath = os.path.join(iddir, date)
print(
update_sqlite_incremental(dbpath, datepath, ext_id,
date, True, ""))
if __name__ == "__main__":
main(sys.argv[1:])
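
After a run, every three-letter directory holds one SQLite file per extension; a quick way to inspect one (path and ID are made up):

import sqlite3

with sqlite3.connect(
        "archive/data/aaa/aaaabbbbccccddddeeeeffffgggghhhh.sqlite") as con:
    for date, name, version in con.execute(
            "SELECT date, name, version FROM extension ORDER BY date"):
        print(date, name, version)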

View File

@@ -1,173 +0,0 @@
#!/usr/bin/env python3
#
# Copyright (C) 2016 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from zipfile import ZipFile
import argparse
import json
import sys
import os
from jsmin import jsmin
import re
regex_concrete_url = re.compile(r'^.*://.*[a-z0-9]+\.[a-z]+.*$')
class PermissionHandlerPrintNames:
def __init__(self, permname):
self.permname = permname
self.extinfo = {}
def handle_permission(self, extid, permobj, path):
if self.permname in str(permobj):
with open(os.path.join(path, 'metadata.json')) as f:
metadata = json.load(f)
self.extinfo[extid] = '{} | {} | {}'.format(metadata[1],
metadata[6], path)
def print_result(self, fileobj, delim):
fileobj.write('Extensions that use permission "{}":\n\n'.format(
self.permname))
for extid in self.extinfo:
fileobj.write('{}\n'.format(self.extinfo[extid]))
fileobj.write('\n\n')
class PermissionHandler:
def __init__(self):
self.permissions = {}
self.extids = set()
def handle_permission(self, extid, permobj, path):
self.extids.add(extid)
perm = str(permobj)
if not perm in self.permissions:
self.permissions[perm] = 0
self.permissions[perm] += 1
def print_result(self, fileobj, delim):
fileobj.write('Total: {} extensions\n'.format(len(self.extids)))
for perm in sorted(
self.permissions, key=self.permissions.get, reverse=True):
fileobj.write('{}{}{}{}{:.2%}\n'.format(
perm, delim, self.permissions[perm], delim,
float(self.permissions[perm]) / len(self.extids)))
fileobj.write('\n\n')
class PermissionHandlerCondensed:
def __init__(self):
self.permissions = {}
self.extids = set()
self.exts_with_concrete_urls = set()
def handle_permission(self, extid, permobj, path):
self.extids.add(extid)
perm = str(permobj)
if regex_concrete_url.match(perm):
if extid in self.exts_with_concrete_urls:
return
self.exts_with_concrete_urls.add(extid)
perm = '<<<{}>>>'.format(regex_concrete_url.pattern)
if not perm in self.permissions:
self.permissions[perm] = 0
self.permissions[perm] += 1
def print_result(self, fileobj, delim):
fileobj.write('Condensed. Total: {} extensions\n'.format(
len(self.extids)))
for perm in sorted(
self.permissions, key=self.permissions.get, reverse=True):
fileobj.write('{}{}{}{}{:.2%}\n'.format(
perm, delim, self.permissions[perm], delim,
float(self.permissions[perm]) / len(self.extids)))
fileobj.write('\n\n')
class PermissionStatisticGenerator:
def run(category_folder, permhandlers):
for root, dirs, files in os.walk(category_folder):
crxfile = next((f for f in files if f.endswith('.crx')), None)
if crxfile:
extid = os.path.basename(root)
with ZipFile(os.path.join(root, crxfile)) as zipfile:
with zipfile.open('manifest.json') as f:
content = jsmin(f.read().decode())
# This is needed to strip weird BOMs ...
first_bracket = content.find('{')
if first_bracket >= 0:
content = content[first_bracket:]
manifest = json.loads(content)
if 'permissions' in manifest:
for permobj in manifest['permissions']:
for handler in permhandlers:
handler.handle_permission(extid, permobj,
root)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Prints statistics about the requested permissions of downloaded extensions.'
)
parser.add_argument(
'dir',
help='The directory in which the extensions are stored. The directory structure must be {category}/{extid}/*.crx.'
)
parser.add_argument(
'-d',
'--delim',
default='\t',
help='Delimiter used for the statistics output.')
parser.add_argument(
'-o',
'--output',
default=sys.stdout,
type=argparse.FileType('w'),
help='Save the statistics into a file.')
parser.add_argument(
'-p',
'--permission',
help='Prints out all extension names and descriptions that use the given permission.'
)
parser.add_argument(
'-c',
'--categories',
action='store_true',
help='Print the results for each category separately.')
args = parser.parse_args()
category_folders = [args.dir]
if args.categories:
category_folders += [
os.path.join(args.dir, d) for d in next(os.walk(args.dir))[1]
]
for category_folder in category_folders:
args.output.write('Results for category {}:\n\n'.format(
category_folder))
if args.permission:
handlers = [PermissionHandlerPrintNames(args.permission)]
else:
handlers = [PermissionHandler(), PermissionHandlerCondensed()]
PermissionStatisticGenerator.run(category_folder, handlers)
for handler in handlers:
handler.print_result(args.output, args.delim)