297 lines
11 KiB
Python
297 lines
11 KiB
Python
#!/usr/bin/env python3.7
|
|
#
|
|
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
""" Module for obtaining md5/sha1/sha256 hashes for all files available
|
|
at CDNJS.com."""
|
|
|
|
import datetime
|
|
import glob
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import sys
|
|
from functools import partial
|
|
from multiprocessing import Pool
|
|
|
|
import requests
|
|
|
|
|
|
def get_cdnjs_all_libs_url():
    """Return the cdnjs API endpoint that lists all available libraries.

    See https://cdnjs.com/api for details.
    """
    libraries_endpoint = "https://api.cdnjs.com/libraries"
    return libraries_endpoint
|
|
|
|
|
|
def get_jsfile_url(lib, version, jsfile):
    """Return the download URL of one file of a given library version.

    The URL is composed of the fixed cdnjs prefix followed by the library
    name, the version and the file name, separated by slashes.
    See https://cdnjs.com/api for details.
    """
    url_template = "https://cdnjs.cloudflare.com/ajax/libs/{}/{}/{}"
    return url_template.format(lib, version, jsfile)
|
|
|
|
|
|
def get_local_libs(archive):
    """Get list of locally available libraries.

    A library is considered available when a ``<name>.json`` file exists in
    ``<archive>/filedb/cdnjs/lib``. The returned list contains the file names
    with the ``.json`` suffix removed, in the order reported by ``glob``.
    An empty list is returned when the directory has no such files (or does
    not exist).
    """
    suffix = ".json"
    dirname = os.path.join(archive, "filedb", "cdnjs", "lib")
    # Escape the directory part so that glob metacharacters in the archive
    # path ("[", "*", "?") are matched literally; only the file name part is
    # a pattern.
    pattern = os.path.join(glob.escape(dirname), "*" + suffix)
    # Every match ends with the suffix, so it can simply be sliced off; this
    # avoids the previous regex whose unescaped "." matched any character.
    return [
        os.path.basename(path)[:-len(suffix)] for path in glob.glob(pattern)
    ]
|
|
|
|
|
|
def _load_local_lib_json(dirname, name):
    """Load the stored record of library `name`; None if absent or corrupt."""
    json_path = os.path.join(dirname, name + ".json")
    try:
        with open(json_path, "r") as json_file:
            return json.load(json_file)
    except IOError:
        return None
    except json.decoder.JSONDecodeError:
        logging.warning(" JSON file (%s) defect, re-downloading.", json_path)
        # Keep the broken file for inspection instead of overwriting it.
        os.rename(json_path, os.path.join(dirname, name + ".backup.json"))
        return None


def _fetch_file_record(name, jsfile, jsfile_url):
    """Download one asset and describe it; None if the update must abort."""
    try:
        res_jsfile = requests.get(jsfile_url, timeout=10)
    except Exception as e:
        logging.error("Exception during download of assets of %s from %s:",
                      name, jsfile_url)
        logging.error(str(e))
        return None

    status = res_jsfile.status_code
    if status in (403, 404):
        # Inaccessible files are still recorded, but without hashes or size.
        logging.warning(
            "Access denied: cannot access assets of %s (status code: %s) %s",
            name, status, res_jsfile.url)
        return {
            'filename': jsfile,
            'url': jsfile_url,
            'first_seen': datetime.datetime.utcnow().isoformat(),
            'http_status_code': status,
        }
    if status == 200:
        data = res_jsfile.content
        return {
            'filename': jsfile,
            'md5': hashlib.md5(data).hexdigest(),
            'sha1': hashlib.sha1(data).hexdigest(),
            'sha256': hashlib.sha256(data).hexdigest(),
            'url': jsfile_url,
            'first_seen': datetime.datetime.utcnow().isoformat(),
            'size': len(data),
            'http_status_code': status,
        }

    logging.error(
        "Unknown error: cannot access assets of %s (status code: %s) %s",
        name, status, res_jsfile.url)
    logging.error(str(res_jsfile.content))
    return None


def update_lib(force, archive, lib):
    """Update information for a JavaScript library.

    Downloads the library overview from the cdnjs API, computes (or reuses)
    per-file hash records for every version listed there, appends versions
    that only exist in the local record (marking them with an 'outphased'
    timestamp) and writes the result to
    ``<archive>/filedb/cdnjs/lib/<name>.json``.

    Parameters:
        force: when true, every file is downloaded again even if the version
            is already present in the local record.
        archive: root directory of the archive.
        lib: mapping describing the library; only ``lib['name']`` is read.

    Returns None. On any download failure, or an unexpected HTTP status, the
    error is logged and the function returns without writing the record.
    """
    name = lib['name']
    lib_url = get_cdnjs_all_libs_url() + "/" + name
    try:
        lib_res = requests.get(lib_url, timeout=10)
    except Exception as e:
        logging.error(
            "Exception during download of library overview for %s from %s:",
            name, lib_url)
        logging.error(str(e))
        return

    if lib_res.status_code != 200:
        logging.error(" Cannot access overview for %s (status code: %s) %s",
                      name, lib_res.status_code, lib_res.url)
        logging.error(str(lib_res.content))
        return
    cdnjs_lib_json = lib_res.json()
    dirname = os.path.join(archive, "filedb", "cdnjs", "lib")
    os.makedirs(str(dirname), exist_ok=True)

    local_lib_json = _load_local_lib_json(dirname, name)

    # Index local version records once; setdefault keeps the first record of
    # a version, which is the one a linear search would have found.
    local_records = {}
    if local_lib_json is not None:
        for lib_ver in local_lib_json['assets']:
            local_records.setdefault(lib_ver['version'], lib_ver)

    cdnjs_versions = {
        lib_ver['version'] for lib_ver in cdnjs_lib_json['assets']
    }

    for lib_ver in cdnjs_lib_json['assets']:
        version = lib_ver['version']
        logging.info(" Checking %s %s", name, version)
        if not force and version in local_records:
            logging.info(" Updating from local record.")
            files_with_hashes = local_records[version]['files']
        else:
            logging.warning(" Updating from remote record (%s %s).", name,
                            version)
            files_with_hashes = []
            for jsfile in lib_ver['files']:
                jsfile_url = get_jsfile_url(name, version, jsfile)
                logging.info(" %s", jsfile_url)
                record = _fetch_file_record(name, jsfile, jsfile_url)
                if record is None:
                    # Abort the whole library; nothing is written to disk.
                    return
                files_with_hashes.append(record)

        # Replace the plain file-name list by the detailed file records.
        lib_ver['files'] = files_with_hashes

    if local_lib_json is not None:
        outphased = []
        for lib_ver in local_lib_json['assets']:
            version = lib_ver['version']
            if version not in cdnjs_versions:
                logging.warning(
                    "Found outphased versions for %s %s , preserving from "
                    "archive.", name, version)
                # Only stamp the first time the version is found missing.
                if 'outphased' not in lib_ver:
                    lib_ver['outphased'] = (
                        datetime.datetime.utcnow().isoformat())
                outphased.append(lib_ver)
        if outphased:
            cdnjs_lib_json['assets'] = cdnjs_lib_json['assets'] + outphased

    output = os.path.join(dirname, name + ".json")
    logging.info(" Saving %s", output)
    with open(output, "w") as json_file:
        json.dump(cdnjs_lib_json, json_file)
|
|
|
|
|
|
def build_hash_map_of_lib(hashalg, archive, lib):
    """Build dictionary with file information using the file hash as key.

    Parameters:
        hashalg: key of the hash to index by, as stored in the file records
            (e.g. "md5", "sha1").
        archive: root directory of the archive.
        lib: library name; ``<archive>/filedb/cdnjs/lib/<lib>.json`` is read.

    Returns a dict mapping each hash value to a dict with the keys 'library',
    'version' and 'file', plus 'outphased' and 'first_seen' when recorded.
    Returns None when the library file cannot be opened.
    """
    dirname = os.path.join(archive, "filedb", "cdnjs", "lib")
    try:
        with open(os.path.join(dirname, lib + ".json"), "r") as json_file:
            local_lib_json = json.load(json_file)
    except IOError:
        return None

    hash_map = {}
    for lib_ver in local_lib_json['assets']:
        version = lib_ver['version']
        for jsfile in lib_ver['files']:
            # Records of files that could not be downloaded carry no hashes;
            # skip them instead of failing with a KeyError.
            hashvalue = jsfile.get(hashalg)
            if hashvalue is None:
                continue
            entry = {
                'library': lib,
                'version': version,
                'file': jsfile['filename'],
            }
            # A file-level marker wins; otherwise fall back to the marker on
            # the enclosing version record.
            if 'outphased' in jsfile:
                entry['outphased'] = jsfile['outphased']
            elif 'outphased' in lib_ver:
                entry['outphased'] = lib_ver['outphased']
            if 'first_seen' in jsfile:
                entry['first_seen'] = jsfile['first_seen']
            hash_map[hashvalue] = entry
    return hash_map
|
|
|
|
|
|
def build_sha1_map_of_lib(archive, lib):
    """Return the file information dictionary of one library, keyed by sha1.

    Thin wrapper that delegates to build_hash_map_of_lib with "sha1" as the
    hash algorithm.
    """
    sha1_map = build_hash_map_of_lib("sha1", archive, lib)
    return sha1_map
|
|
|
|
|
|
def build_md5_map_of_lib(archive, lib):
    """Return the file information dictionary of one library, keyed by md5.

    Thin wrapper that delegates to build_hash_map_of_lib with "md5" as the
    hash algorithm.
    """
    md5_map = build_hash_map_of_lib("md5", archive, lib)
    return md5_map
|
|
|
|
|
|
def build_hash_map(hashalg, archive):
    """Build file information dictionary using the file hash as key.

    Merges the per-library maps of all locally available libraries; when the
    same hash occurs in several libraries the library processed last wins.

    Parameters:
        hashalg: key of the hash to index by (e.g. "md5", "sha1").
        archive: root directory of the archive.

    Returns the merged dict, or None when no library map could be built.
    """
    hash_map = None
    for lib in get_local_libs(archive):
        lib_map = build_hash_map_of_lib(hashalg, archive, lib)
        if lib_map is None:
            # An unreadable library must not discard what was merged so far.
            continue
        if hash_map is None:
            hash_map = lib_map
        else:
            hash_map.update(lib_map)
    return hash_map
|
|
|
|
|
|
def build_sha1_map(archive):
    """Return the file information dictionary of all libraries, keyed by sha1.

    Thin wrapper that delegates to build_hash_map with "sha1" as the hash
    algorithm.
    """
    sha1_map = build_hash_map("sha1", archive)
    return sha1_map
|
|
|
|
|
|
def build_md5_map(archive):
    """Return the file information dictionary of all libraries, keyed by md5.

    Thin wrapper that delegates to build_hash_map with "md5" as the hash
    algorithm.
    """
    md5_map = build_hash_map("md5", archive)
    return md5_map
|
|
|
|
|
|
def update_md5_map_file(archive):
    """Update file containing md5 information for all files.

    Writes the map returned by build_md5_map as JSON to
    ``<archive>/filedb/cdnjs-md5.json``.
    """
    # Build the map before opening the target: opening with "w" truncates
    # the file, so a failure while building would otherwise leave an empty
    # file behind.
    md5_map = build_md5_map(archive)
    target = os.path.join(archive, "filedb", "cdnjs-md5.json")
    with open(target, "w") as json_file:
        json.dump(md5_map, json_file)
|
|
|
|
|
|
def update_sha1_map_file(archive):
    """Update file containing sha1 information for all files.

    Writes the map returned by build_sha1_map as JSON to
    ``<archive>/filedb/cdnjs-sha1.json``.
    """
    # Build the map before opening the target: opening with "w" truncates
    # the file, so a failure while building would otherwise leave an empty
    # file behind.
    sha1_map = build_sha1_map(archive)
    target = os.path.join(archive, "filedb", "cdnjs-sha1.json")
    with open(target, "w") as json_file:
        json.dump(sha1_map, json_file)
|
|
|
|
|
|
def delete_orphaned(archive, local_libs, cdnjs_current_libs):
    """Delete all orphaned local libraries.

    A local library is orphaned when its name does not occur in
    `cdnjs_current_libs`; its ``<name>.json`` file below
    ``<archive>/filedb/cdnjs/lib`` is removed.

    Parameters:
        archive: root directory of the archive.
        local_libs: iterable of local library names (without ".json").
        cdnjs_current_libs: iterable of the libraries currently offered;
            each entry is either a library name or a dict with a 'name' key
            (the form of the catalog entries passed by update_jslib_archive).
    """
    dirname = os.path.join(archive, "filedb", "cdnjs", "lib")
    # Normalise to a set of names. Comparing the string names against raw
    # catalog dicts never matches and would delete every local library.
    current_names = {
        entry['name'] if isinstance(entry, dict) else entry
        for entry in cdnjs_current_libs
    }
    for lib in local_libs:
        if lib not in current_names:
            logging.info("Deleting orphaned library record %s", lib)
            os.remove(os.path.join(dirname, lib + ".json"))
|
|
|
|
|
|
def update_jslib_archive(force, clean, archive):
    """Update information for all available JavaScript libraries.

    Downloads the library catalog from the cdnjs API, optionally removes
    local records of libraries that are no longer listed, stores the catalog
    as ``<archive>/filedb/cdnjs/cdnjs-libraries.json`` and then updates every
    listed library in a pool of 32 worker processes.

    Parameters:
        force: passed through to update_lib (re-download known versions).
        clean: when true, local library records not present in the catalog
            are deleted before updating.
        archive: root directory of the archive.

    Exits the process with status 1 when the catalog cannot be downloaded or
    the server does not answer with HTTP status 200.
    """
    cdnjs_all_libs_url = get_cdnjs_all_libs_url()
    try:
        res = requests.get(cdnjs_all_libs_url, timeout=10)
    except Exception as e:
        logging.error("Exception during download of library overview from " +
                      cdnjs_all_libs_url + ":")
        logging.error(str(e))
        sys.exit(1)

    if res.status_code != 200:
        logging.error("Could not obtain library overview (http status code: " +
                      str(res.status_code) + ")")
        logging.error(str(res.content))
        sys.exit(1)

    # Parse the response body once and reuse it for catalog and dump.
    cdnjs_catalog_json = res.json()
    cdnjs_lib_catalog = cdnjs_catalog_json['results']
    if clean:
        local_lib_catalog = get_local_libs(archive)
        # Local libraries are identified by name, so compare against the
        # names of the catalog entries rather than the entry dicts.
        cdnjs_lib_names = [entry['name'] for entry in cdnjs_lib_catalog]
        delete_orphaned(archive, local_lib_catalog, cdnjs_lib_names)
    dirname = os.path.join(archive, "filedb", "cdnjs")
    os.makedirs(str(dirname), exist_ok=True)
    with open(os.path.join(dirname, "cdnjs-libraries.json"), "w") as json_file:
        json.dump(cdnjs_catalog_json, json_file)
    logging.info("Found " + str(len(cdnjs_lib_catalog)) +
                 " different libraries")

    with Pool(32) as p:
        p.map(partial(update_lib, force, archive), cdnjs_lib_catalog)
|