#!/usr/bin/env python3.7
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
""" Module for obtaining md5/sha1/sha256 hashes for all files available
at CDNJS.com."""
import datetime
import glob
import hashlib
import json
import logging
import os
import re
import sys

from functools import partial
from multiprocessing import Pool

import requests


def get_cdnjs_all_libs_url():
"""URL for obtaining list of all available libraries, see https://cdnjs.com/api for details."""
return "https://api.cdnjs.com/libraries"


def get_jsfile_url(lib, version, jsfile):
    """URL for downloading a single file of a given version of a JavaScript
    library from the cdnjs CDN, see https://cdnjs.com/api for details."""
    return "https://cdnjs.cloudflare.com/ajax/libs/{}/{}/{}".format(
        lib, version, jsfile)
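
# Example (illustrative values, not taken from the live cdnjs catalogue):
#   get_jsfile_url("jquery", "3.3.1", "jquery.min.js")
#   == "https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.min.js"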


def get_local_libs(archive):
"""Get list of locally available libraries."""
dirname = os.path.join(archive, "filedb", "cdnjs", "lib")
return (list(
map(lambda f: re.sub(".json$", "", os.path.basename(f)),
glob.glob(os.path.join(dirname, "*.json")))))
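
# Rough usage sketch (the archive path is an assumption; the function simply
# lists the <archive>/filedb/cdnjs/lib/*.json records written by update_lib):
#   get_local_libs("/srv/archive")  # e.g. ['jquery', 'lodash.js', ...]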


def update_lib(force, archive, lib):
"""Update information for a JavaScript library."""
name = lib['name']
try:
lib_res = requests.get(
get_cdnjs_all_libs_url() + "/" + lib['name'], timeout=10)
except Exception as e:
logging.error("Exception during download of library overview for " +
name + "from " + get_cdnjs_all_libs_url() + "/" +
lib['name'] + ":")
logging.error(str(e))
return
    if lib_res.status_code != 200:
        logging.error(
            " Cannot access overview for " + name + " (status code: " + str(
                lib_res.status_code) + ") " + str(lib_res.url))
        logging.error(str(lib_res.content))
return
cdnjs_lib_json = lib_res.json()
dirname = os.path.join(archive, "filedb", "cdnjs", "lib")
os.makedirs(str(dirname), exist_ok=True)
try:
with open(os.path.join(dirname, name + ".json"), "r") as json_file:
local_lib_json = json.load(json_file)
except IOError:
local_lib_json = None
except json.decoder.JSONDecodeError:
local_lib_json = None
logging.warning(" JSON file (" + os.path.join(dirname, name + ".json")
+ ") defect, re-downloading.")
os.rename(
os.path.join(dirname, name + ".json"),
os.path.join(dirname, name + ".backup.json"))
local_versions = []
if local_lib_json is not None:
for lib_ver in local_lib_json['assets']:
local_versions.append(lib_ver['version'])
cdnjs_versions = []
for lib_ver in cdnjs_lib_json['assets']:
cdnjs_versions.append(lib_ver['version'])
for lib_ver in cdnjs_lib_json['assets']:
version = lib_ver['version']
logging.info(" Checking " + str(lib['name']) + " " + str(version))
files_with_hashes = []
if not force and version in local_versions:
logging.info(" Updating from local record.")
old_record = next(x for x in local_lib_json['assets']
if x['version'] == lib_ver['version'])
files_with_hashes = old_record['files']
else:
logging.warning(" Updating from remote record (" + name + " " +
version + ").")
for jsfile in lib_ver['files']:
jsfile_url = get_jsfile_url(name, version, jsfile)
logging.info(" " + jsfile_url)
try:
res_jsfile = requests.get(jsfile_url, timeout=10)
except Exception as e:
logging.error("Exception during download of assets of " +
name + " from " + jsfile_url + ":")
logging.error(str(e))
return
                if res_jsfile.status_code in (403, 404):
                    logging.warning(
                        "Access denied: cannot access assets of " + name +
                        " (status code: " + str(res_jsfile.status_code) +
                        ") " + str(res_jsfile.url))
                    files_with_hashes.append({
                        'filename': jsfile,
                        'url': jsfile_url,
                        'first_seen': datetime.datetime.utcnow().isoformat(),
                        'http_status_code': res_jsfile.status_code
                    })
                elif res_jsfile.status_code == 200:
                    data = res_jsfile.content
                    files_with_hashes.append({
                        'filename': jsfile,
                        'md5': hashlib.md5(data).hexdigest(),
                        'sha1': hashlib.sha1(data).hexdigest(),
                        'sha256': hashlib.sha256(data).hexdigest(),
                        'url': jsfile_url,
                        'first_seen': datetime.datetime.utcnow().isoformat(),
                        'size': len(data),
                        'http_status_code': res_jsfile.status_code
                    })
                else:
                    logging.error(
                        "Unknown error: cannot access assets of " + name +
                        " (status code: " + str(res_jsfile.status_code) +
                        ") " + str(res_jsfile.url))
                    logging.error(str(res_jsfile.content))
                    return
lib_ver['files'] = files_with_hashes
if local_lib_json is not None:
outphased = []
for lib_ver in local_lib_json['assets']:
version = lib_ver['version']
if version not in cdnjs_versions:
logging.warning("Found outphased versions for " + name + " " +
str(version) + " , preserving from archive.")
if 'outphased' not in lib_ver:
                    lib_ver['outphased'] = datetime.datetime.utcnow().isoformat()
outphased.append(lib_ver)
if outphased:
cdnjs_lib_json['assets'] = cdnjs_lib_json['assets'] + outphased
output = os.path.join(dirname, name + ".json")
logging.info(" Saving " + str(output))
with open(output, "w") as json_file:
json.dump(cdnjs_lib_json, json_file)
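
# Rough usage sketch (archive path and library entry are illustrative; in the
# crawler this function is normally driven by the pool in update_jslib_archive):
#   update_lib(False, "/srv/archive", {'name': 'jquery'})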


def build_hash_map_of_lib(hashalg, archive, lib):
"""Build dictionary with file information using the file hash as key."""
dirname = os.path.join(archive, "filedb", "cdnjs", "lib")
hash_map = {}
try:
with open(os.path.join(dirname, lib + ".json"), "r") as json_file:
local_lib_json = json.load(json_file)
except IOError:
return None
for lib_ver in local_lib_json['assets']:
version = lib_ver['version']
        for jsfile in lib_ver['files']:
            if hashalg not in jsfile:
                # Entries recorded with a 403/404 status carry no hash values.
                continue
            hashvalue = jsfile[hashalg]
            hash_map[hashvalue] = {
                'library': lib,
                'version': version,
                'file': jsfile['filename'],
            }
            if 'outphased' in jsfile:
                hash_map[hashvalue]['outphased'] = jsfile['outphased']
            if 'first_seen' in jsfile:
                hash_map[hashvalue]['first_seen'] = jsfile['first_seen']
return hash_map
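
# Shape of the returned mapping (illustrative key and values, not real data):
#   {'<md5-or-sha1-hex>': {'library': 'jquery', 'version': '3.3.1',
#                          'file': 'jquery.min.js',
#                          'first_seen': '2017-09-02T20:44:20'}, ...}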


def build_sha1_map_of_lib(archive, lib):
"""Build dictionary with file information using the file sha1 as key."""
return build_hash_map_of_lib("sha1", archive, lib)


def build_md5_map_of_lib(archive, lib):
"""Build dictionary with file information using the file md5 as key."""
return build_hash_map_of_lib("md5", archive, lib)


def build_hash_map(hashalg, archive):
    """Build file information dictionary using the file hash as key."""
    hash_map = None
    for lib in get_local_libs(archive):
        lib_map = build_hash_map_of_lib(hashalg, archive, lib)
        if lib_map is None:
            continue
        if hash_map is None:
            hash_map = lib_map
        else:
            hash_map.update(lib_map)
    return hash_map


def build_sha1_map(archive):
"""Build file information dictionary using the sha1 hash as key"""
return build_hash_map("sha1", archive)


def build_md5_map(archive):
"""Build file information dictionary using the md5 hash as key"""
return build_hash_map("md5", archive)


def update_md5_map_file(archive):
"""Update file containing md5 information for all files."""
with open(os.path.join(archive, "filedb", "cdnjs-md5.json"),
"w") as json_file:
json.dump(build_md5_map(archive), json_file)


def update_sha1_map_file(archive):
"""Update file containing sha1 information for all files."""
with open(os.path.join(archive, "filedb", "cdnjs-sha1.json"),
"w") as json_file:
json.dump(build_sha1_map(archive), json_file)
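
# The two functions above write the aggregated maps to
# <archive>/filedb/cdnjs-md5.json and <archive>/filedb/cdnjs-sha1.json; a
# consumer can then identify a downloaded file by its hash, e.g. (sketch,
# assuming `archive` and `data` are defined by the caller):
#   with open(os.path.join(archive, "filedb", "cdnjs-sha1.json")) as f:
#       sha1_map = json.load(f)
#   info = sha1_map.get(hashlib.sha1(data).hexdigest())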


def delete_orphaned(archive, local_libs, cdnjs_current_libs):
    """Delete all orphaned local libraries."""
dirname = os.path.join(archive, "filedb", "cdnjs", "lib")
for lib in local_libs:
if lib not in cdnjs_current_libs:
os.remove(os.path.join(dirname, lib + ".json"))
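
# Only invoked from update_jslib_archive when its `clean` flag is set: local
# records whose library no longer appears in the cdnjs catalogue are removed.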


def update_jslib_archive(force, clean, archive):
"""Update information for all available JavaScript libraries."""
cdnjs_all_libs_url = get_cdnjs_all_libs_url()
try:
res = requests.get(cdnjs_all_libs_url, timeout=10)
except Exception as e:
logging.error("Exception during download of library overview from " +
cdnjs_all_libs_url + ":")
logging.error(str(e))
sys.exit(1)
    if res.status_code != 200:
logging.error("Could not obtain library overview (http status code: " +
str(res.status_code) + ")")
logging.error(str(res.content))
sys.exit(1)
cdnjs_lib_catalog = res.json()['results']
if clean:
local_lib_catalog = get_local_libs(archive)
delete_orphaned(archive, local_lib_catalog, cdnjs_lib_catalog)
dirname = os.path.join(archive, "filedb", "cdnjs")
os.makedirs(str(dirname), exist_ok=True)
with open(os.path.join(dirname, "cdnjs-libraries.json"), "w") as json_file:
json.dump(res.json(), json_file)
logging.info("Found " + str(len(cdnjs_lib_catalog)) +
" different libraries")
with Pool(32) as p:
p.map(partial(update_lib, force, archive), cdnjs_lib_catalog)
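

# Minimal stand-alone driver sketch. The repository provides its own
# entry-point script, so this is purely illustrative; the default archive
# path and flag values below are assumptions, not the project's defaults.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    archive_dir = sys.argv[1] if len(sys.argv) > 1 else "archive"
    update_jslib_archive(force=False, clean=False, archive=archive_dir)
    update_md5_map_file(archive_dir)
    update_sha1_map_file(archive_dir)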