diff --git a/ExtensionCrawler/cdnjs.py b/ExtensionCrawler/cdnjs.py
new file mode 100644
index 0000000..be88c85
--- /dev/null
+++ b/ExtensionCrawler/cdnjs.py
@@ -0,0 +1,286 @@
+#!/usr/bin/env python3.5
+#
+# Copyright (C) 2016,2017 The University of Sheffield, UK
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+""" Module for obtaining md5/sha1/sha256 hashes for all files available
+ at CDNJS.com."""
+
+import datetime
+import glob
+import hashlib
+import json
+import logging
+import os
+import re
+import sys
+from functools import partial
+from multiprocessing import Pool
+
+import requests
+
+# Refuse to load on interpreters other than Python 3.4.x or 3.5.x.
+assert (3, 4) <= sys.version_info < (3, 6)
+
+
+def get_cdnjs_all_libs_url():
+    """Return the CDNJS API endpoint listing all available libraries.
+
+    See https://cdnjs.com/api for details.
+    """
+    all_libs_url = "https://api.cdnjs.com/libraries"
+    return all_libs_url
+
+
+def get_jsfile_url(lib, version, jsfile):
+    """Return the download URL of one file of one version of a library.
+
+    The three arguments are substituted, in order, into the
+    cdnjs.cloudflare.com "ajax/libs" path.
+    """
+    url_template = "https://cdnjs.cloudflare.com/ajax/libs/{}/{}/{}"
+    return url_template.format(lib, version, jsfile)
+
+
+def get_local_libs(archive):
+    """Return the names of all libraries that have a local JSON record.
+
+    Every "*.json" file in <archive>/fileinfo/cdnjs/lib contributes its
+    base name with the trailing ".json" removed.
+    """
+    lib_dir = os.path.join(archive, "fileinfo", "cdnjs", "lib")
+    json_paths = glob.glob(os.path.join(lib_dir, "*.json"))
+    return [
+        re.sub(".json$", "", os.path.basename(path)) for path in json_paths
+    ]
+
+
+def _fetch_file_record(name, jsfile, jsfile_url):
+    """Download one library file and build its hash record.
+
+    Returns a dict describing the file (with md5/sha1/sha256 and size for a
+    200 response, without them for 403/404), or None when the download
+    raised or returned any other status code.
+    """
+    try:
+        res_jsfile = requests.get(jsfile_url, timeout=10)
+    except Exception as e:
+        logging.error("Exception during download of assets of " + name +
+                      " from " + jsfile_url + ":")
+        logging.error(str(e))
+        return None
+
+    status = res_jsfile.status_code
+    if status in (403, 404):
+        # Unavailable files are still recorded, just without hashes.
+        logging.warning("Access denied: cannot access assets of " + name +
+                        " (status code: " + str(status) + ") " +
+                        str(res_jsfile.url))
+        return {
+            'filename': jsfile,
+            'url': jsfile_url,
+            'first_seen': datetime.datetime.utcnow().isoformat(),
+            'http_status_code': status
+        }
+    if status == 200:
+        data = res_jsfile.content
+        return {
+            'filename': jsfile,
+            'md5': hashlib.md5(data).hexdigest(),
+            'sha1': hashlib.sha1(data).hexdigest(),
+            'sha256': hashlib.sha256(data).hexdigest(),
+            'url': jsfile_url,
+            'first_seen': datetime.datetime.utcnow().isoformat(),
+            'size': len(data),
+            'http_status_code': status
+        }
+
+    logging.error("Unknown error: cannot access assets of " + name +
+                  " (status code: " + str(status) + ") " +
+                  str(res_jsfile.url))
+    logging.error(str(res_jsfile.content))
+    return None
+
+
+def update_lib(force, archive, lib):
+    """Update the local JSON record of one JavaScript library.
+
+    force   -- when true, re-download every file even if the version is
+               already present in the local record.
+    archive -- archive base directory; the record is stored as
+               <archive>/fileinfo/cdnjs/lib/<name>.json.
+    lib     -- catalog entry of the library; only lib['name'] is read.
+
+    Returns None. Every failure (download exception, unexpected HTTP
+    status, undecodable overview) is logged and aborts the update of this
+    library without writing the record.
+    """
+    name = lib['name']
+    lib_url = get_cdnjs_all_libs_url() + "/" + name
+    try:
+        lib_res = requests.get(lib_url, timeout=10)
+    except Exception as e:
+        logging.error("Exception during download of library overview for " +
+                      name + " from " + lib_url + ":")
+        logging.error(str(e))
+        return
+
+    if lib_res.status_code != 200:
+        logging.error(" Cannot access overview for " + name +
+                      " (status code: " + str(lib_res.status_code) + ") " +
+                      str(lib_res.url))
+        logging.error(str(lib_res.content))
+        return
+    try:
+        cdnjs_lib_json = lib_res.json()
+    except ValueError as e:
+        # A malformed body must not escape the worker: the caller runs this
+        # function through Pool.map, which would abort the whole update.
+        logging.error(" Cannot decode overview for " + name + ": " + str(e))
+        return
+
+    dirname = os.path.join(archive, "fileinfo", "cdnjs", "lib")
+    os.makedirs(str(dirname), exist_ok=True)
+    json_path = os.path.join(dirname, name + ".json")
+
+    try:
+        with open(json_path, "r") as json_file:
+            local_lib_json = json.load(json_file)
+    except IOError:
+        local_lib_json = None
+    except ValueError:
+        # json.JSONDecodeError only exists from Python 3.5 on and is a
+        # subclass of ValueError, which Python 3.4 raises directly.
+        local_lib_json = None
+        logging.warning(" JSON file (" + json_path +
+                        ") defect, re-downloading.")
+        os.rename(json_path, os.path.join(dirname, name + ".backup.json"))
+
+    local_assets = []
+    if local_lib_json is not None:
+        local_assets = local_lib_json['assets']
+
+    # First local record per version, for constant-time lookup below.
+    local_records = {}
+    for lib_ver in local_assets:
+        local_records.setdefault(lib_ver['version'], lib_ver)
+
+    cdnjs_versions = set(
+        lib_ver['version'] for lib_ver in cdnjs_lib_json['assets'])
+
+    for lib_ver in cdnjs_lib_json['assets']:
+        version = lib_ver['version']
+        logging.info(" Checking " + str(name) + " " + str(version))
+        if not force and version in local_records:
+            logging.info(" Updating from local record.")
+            files_with_hashes = local_records[version]['files']
+        else:
+            logging.warning(" Updating from remote record (" + name + " " +
+                            str(version) + ").")
+            files_with_hashes = []
+            for jsfile in lib_ver['files']:
+                jsfile_url = get_jsfile_url(name, version, jsfile)
+                logging.info(" " + jsfile_url)
+                record = _fetch_file_record(name, jsfile, jsfile_url)
+                if record is None:
+                    # Error already logged; give up on this library.
+                    return
+                files_with_hashes.append(record)
+
+        lib_ver['files'] = files_with_hashes
+
+    # Versions known locally but no longer listed by CDNJS are kept and
+    # stamped with the time they were first found missing.
+    outphased = []
+    for lib_ver in local_assets:
+        version = lib_ver['version']
+        if version in cdnjs_versions:
+            continue
+        logging.warning("Found outphased versions for " + name + " " +
+                        str(version) + " , preserving from archive.")
+        if 'outphased' not in lib_ver:
+            lib_ver['outphased'] = datetime.datetime.utcnow().isoformat()
+        outphased.append(lib_ver)
+    if outphased:
+        cdnjs_lib_json['assets'] = cdnjs_lib_json['assets'] + outphased
+
+    logging.info(" Saving " + str(json_path))
+    with open(json_path, "w") as json_file:
+        json.dump(cdnjs_lib_json, json_file)
+
+
+def build_hash_map_of_lib(hashalg, archive, lib):
+    """Build a dictionary of file information keyed by file hash.
+
+    hashalg -- key of the hash to index by (e.g. "sha1" or "md5").
+    archive -- archive base directory.
+    lib     -- library name; <archive>/fileinfo/cdnjs/lib/<lib>.json is read.
+
+    Returns the dictionary, or None when the record cannot be read or is
+    not valid JSON. File records without the requested hash are skipped.
+    """
+    dirname = os.path.join(archive, "fileinfo", "cdnjs", "lib")
+    json_path = os.path.join(dirname, lib + ".json")
+    try:
+        with open(json_path, "r") as json_file:
+            local_lib_json = json.load(json_file)
+    except IOError:
+        return None
+    except ValueError:
+        # Corrupt record: treat it like a missing one instead of crashing.
+        logging.warning(" JSON file (" + json_path + ") defect, skipping.")
+        return None
+
+    hash_map = {}
+    for lib_ver in local_lib_json['assets']:
+        version = lib_ver['version']
+        for jsfile in lib_ver['files']:
+            # Records of files that answered 403/404 carry no hashes.
+            if hashalg not in jsfile:
+                continue
+            entry = {
+                'library': lib,
+                'version': version,
+                'file': jsfile['filename'],
+                'first_seen': jsfile['first_seen']
+            }
+            # update_lib stores 'outphased' on the version record; a value
+            # on the file record itself still takes precedence.
+            if 'outphased' in jsfile:
+                entry['outphased'] = jsfile['outphased']
+            elif 'outphased' in lib_ver:
+                entry['outphased'] = lib_ver['outphased']
+            hash_map[jsfile[hashalg]] = entry
+    return hash_map
+
+
+def build_sha1_map_of_lib(archive, lib):
+    """Return the file information of one library keyed by sha1 hash.
+
+    Delegates to build_hash_map_of_lib with the hash fixed to "sha1".
+    """
+    sha1_map = build_hash_map_of_lib("sha1", archive, lib)
+    return sha1_map
+
+
+def build_md5_map_of_lib(archive, lib):
+    """Return the file information of one library keyed by md5 hash.
+
+    Delegates to build_hash_map_of_lib with the hash fixed to "md5".
+    """
+    md5_map = build_hash_map_of_lib("md5", archive, lib)
+    return md5_map
+
+
+def build_hash_map(hashalg, archive):
+    """Build the file information dictionary of all local libraries.
+
+    hashalg -- key of the hash to index by (e.g. "sha1" or "md5").
+    archive -- archive base directory.
+
+    Returns the merged per-library dictionaries, or None when no local
+    library produced one.
+    """
+    hash_map = None
+    for lib in get_local_libs(archive):
+        lib_map = build_hash_map_of_lib(hashalg, archive, lib)
+        if lib_map is None:
+            # An unreadable library must not discard what was collected
+            # from the libraries before it.
+            continue
+        if hash_map is None:
+            hash_map = lib_map
+        else:
+            hash_map.update(lib_map)
+    return hash_map
+
+
+def build_sha1_map(archive):
+    """Return the file information of all local libraries keyed by sha1.
+
+    Delegates to build_hash_map with the hash fixed to "sha1".
+    """
+    sha1_map = build_hash_map("sha1", archive)
+    return sha1_map
+
+
+def build_md5_map(archive):
+    """Return the file information of all local libraries keyed by md5.
+
+    Delegates to build_hash_map with the hash fixed to "md5".
+    """
+    md5_map = build_hash_map("md5", archive)
+    return md5_map
+
+
+def update_md5_map_file(archive):
+    """Rewrite <archive>/fileinfo/cdnjs-md5.json with the current md5 map."""
+    target = os.path.join(archive, "fileinfo", "cdnjs-md5.json")
+    # The file is opened (and truncated) before the map is built.
+    with open(target, "w") as json_file:
+        json.dump(build_md5_map(archive), json_file)
+
+
+def update_sha1_map_file(archive):
+    """Rewrite <archive>/fileinfo/cdnjs-sha1.json with the current sha1 map."""
+    target = os.path.join(archive, "fileinfo", "cdnjs-sha1.json")
+    # The file is opened (and truncated) before the map is built.
+    with open(target, "w") as json_file:
+        json.dump(build_sha1_map(archive), json_file)
+
+
+def delete_orphaned(archive, local_libs, cdnjs_current_libs):
+    """Delete the local records of libraries no longer listed by CDNJS.
+
+    archive            -- archive base directory.
+    local_libs         -- names of the locally available libraries.
+    cdnjs_current_libs -- current catalog; each item is either a library
+                          name or a catalog entry (dict with a 'name' key).
+    """
+    dirname = os.path.join(archive, "fileinfo", "cdnjs", "lib")
+    # Local names are plain strings, so reduce catalog entries to names
+    # first; comparing a name against the dicts never matches and would
+    # delete every local library.
+    current_names = set()
+    for entry in cdnjs_current_libs:
+        current_names.add(entry['name'] if isinstance(entry, dict) else entry)
+    for lib in local_libs:
+        if lib not in current_names:
+            os.remove(os.path.join(dirname, lib + ".json"))
+
+
+def update_jslib_archive(force, clean, archive):
+    """Update the local records of all libraries available at CDNJS.
+
+    force   -- passed to update_lib: re-download files of known versions.
+    clean   -- when true, delete local records of libraries that are no
+               longer part of the CDNJS catalog.
+    archive -- archive base directory.
+
+    Exits the process with status 1 when the library catalog cannot be
+    downloaded.
+    """
+    cdnjs_all_libs_url = get_cdnjs_all_libs_url()
+    try:
+        res = requests.get(cdnjs_all_libs_url, timeout=10)
+    except Exception as e:
+        logging.error("Exception during download of library overview from " +
+                      cdnjs_all_libs_url + ":")
+        logging.error(str(e))
+        sys.exit(1)
+
+    if res.status_code != 200:
+        logging.error("Could not obtain library overview (http status code: " +
+                      str(res.status_code) + ")")
+        logging.error(str(res.content))
+        sys.exit(1)
+
+    # Decode the response once; it is needed for the catalog and the dump.
+    catalog_json = res.json()
+    cdnjs_lib_catalog = catalog_json['results']
+    if clean:
+        local_lib_catalog = get_local_libs(archive)
+        # Local libraries are identified by name, so compare against the
+        # names of the catalog entries rather than the entries themselves.
+        current_names = [entry['name'] for entry in cdnjs_lib_catalog]
+        delete_orphaned(archive, local_lib_catalog, current_names)
+    dirname = os.path.join(archive, "fileinfo", "cdnjs")
+    os.makedirs(str(dirname), exist_ok=True)
+    with open(os.path.join(dirname, "cdnjs-libraries.json"), "w") as json_file:
+        json.dump(catalog_json, json_file)
+    logging.info("Found " + str(len(cdnjs_lib_catalog)) +
+                 " different libraries")
+
+    # One update_lib call per catalog entry, spread over 32 processes.
+    with Pool(32) as p:
+        p.map(partial(update_lib, force, archive), cdnjs_lib_catalog)
diff --git a/ExtensionCrawler/discover.py b/ExtensionCrawler/discover.py
old mode 100755
new mode 100644
diff --git a/ExtensionCrawler/js_decomposer.py b/ExtensionCrawler/js_decomposer.py
old mode 100755
new mode 100644
diff --git a/ExtensionCrawler/js_mincer.py b/ExtensionCrawler/js_mincer.py
old mode 100755
new mode 100644
diff --git a/cdnjs-crawler b/cdnjs-crawler
new file mode 100755
index 0000000..459ce42
--- /dev/null
+++ b/cdnjs-crawler
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3.5
+#
+# Copyright (C) 2016,2017 The University of Sheffield, UK
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+""" Tool for obtaining md5/sha1/sha256 hashes for all files available
+ at CDNJS.com."""
+
+import getopt
+import logging
+import sys
+
+from ExtensionCrawler.cdnjs import (update_jslib_archive, update_md5_map_file,
+ update_sha1_map_file)
+from ExtensionCrawler.config import const_log_format
+
+# Refuse to run on interpreters other than Python 3.4.x or 3.5.x.
+assert (3, 4) <= sys.version_info < (3, 6)
+
+
+def helpmsg():
+    """Print the command line help text to standard output."""
+    print("cdnjs-crawler [OPTION]")
+    print(" -h print this help text")
+    print(" -s silent (no log messages)")
+    print(" -f force full download (default: update of json files)")
+    print(" -c delete outdated (no longer available) libraries")
+    # getopt takes the value of a short option as "-a DIR"; with "-a=DIR"
+    # the "=" would become part of the directory name.
+    print(" -a <DIR> archive directory")
+
+
+def main(argv):
+    """Main function of the cdnjs crawler.
+
+    argv -- command line arguments without the program name.
+
+    Parses the options, configures logging to standard output, updates the
+    library archive and then rewrites the sha1 and md5 map files. Exits
+    with status 2 on an unknown option.
+    """
+    basedir = "archive"
+    verbose = True
+    force = False
+    clean = False
+    try:
+        # The option string must name every flag handled below; without
+        # "f" and "c" getopt rejects the documented -f and -c options.
+        opts, _ = getopt.getopt(argv, "hsfca:", ["archive="])
+    except getopt.GetoptError:
+        helpmsg()
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            helpmsg()
+            sys.exit()
+        elif opt in ("-a", "--archive"):
+            basedir = arg
+        elif opt == '-s':
+            verbose = False
+        elif opt == '-f':
+            force = True
+        elif opt == '-c':
+            clean = True
+
+    loglevel = logging.INFO if verbose else logging.WARNING
+
+    logger = logging.getLogger()
+    ch = logging.StreamHandler(sys.stdout)
+    ch.setFormatter(logging.Formatter(const_log_format()))
+    logger.addHandler(ch)
+    logger.setLevel(loglevel)
+
+    update_jslib_archive(force, clean, basedir)
+    update_sha1_map_file(basedir)
+    update_md5_map_file(basedir)
+
+
+# Run the crawler only when executed as a script, not when imported.
+if __name__ == "__main__":
+    cli_args = sys.argv[1:]
+    main(cli_args)