Merge branch 'master' of logicalhacking.com:BrowserSecurity/ExtensionCrawler

This commit is contained in:
Michael Herzberg 2017-09-04 15:54:38 +01:00
commit a9173345e8
5 changed files with 371 additions and 0 deletions

286
ExtensionCrawler/cdnjs.py Normal file
View File

@ -0,0 +1,286 @@
#!/usr/bin/env python3.5
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
""" Module for obtaining md5/sha1/sha256 hashes for all files available
at CDNJS.com."""
import datetime
import glob
import hashlib
import json
import logging
import os
import re
import sys
from functools import partial
from multiprocessing import Pool
import requests
# Script should run with python 3.4 or 3.5
# NOTE(review): the upper bound makes importing this module fail with an
# AssertionError on Python 3.6 and newer; the check is skipped entirely
# when the interpreter runs with -O.
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
def get_cdnjs_all_libs_url():
    """Return the CDNJS API endpoint that lists all available libraries.

    See https://cdnjs.com/api for details.
    """
    all_libs_url = "https://api.cdnjs.com/libraries"
    return all_libs_url
def get_jsfile_url(lib, version, jsfile):
    """Return the CDN download URL of a single file of a library version.

    The URL is composed of the library name, the version and the file
    path, see https://cdnjs.com/api for details.
    """
    url_template = "https://cdnjs.cloudflare.com/ajax/libs/{}/{}/{}"
    url_parts = (lib, version, jsfile)
    return url_template.format(*url_parts)
def get_local_libs(archive):
    """Return the names of all libraries that have a local JSON record.

    The names are derived from the ``*.json`` files found in
    ``<archive>/fileinfo/cdnjs/lib`` by dropping the ``.json`` suffix.
    """
    pattern = os.path.join(archive, "fileinfo", "cdnjs", "lib", "*.json")
    lib_names = []
    for json_path in glob.glob(pattern):
        lib_names.append(re.sub(".json$", "", os.path.basename(json_path)))
    return lib_names
def _load_local_lib_json(dirname, name):
    """Return the locally stored record of library *name*, or None.

    None is returned when the record cannot be opened or cannot be parsed.
    An unparsable record is moved aside to ``<name>.backup.json`` so that
    the library is downloaded again.
    """
    path = os.path.join(dirname, name + ".json")
    try:
        with open(path, "r") as json_file:
            return json.load(json_file)
    except IOError:
        return None
    except ValueError:
        # json.JSONDecodeError derives from ValueError; catching the base
        # class also works on Python 3.4, which has no JSONDecodeError.
        logging.warning(" JSON file (" + path + ") defect, re-downloading.")
        os.rename(path, os.path.join(dirname, name + ".backup.json"))
        return None


def _fetch_file_record(name, jsfile, jsfile_url):
    """Download one file of library *name* and return its metadata record.

    For HTTP 200 the record contains the md5/sha1/sha256 hashes and the
    size of the content; for HTTP 403/404 it only contains the file name,
    URL, timestamp and status code. None is returned (after logging) when
    the request raised or the server answered with any other status code.
    """
    try:
        res_jsfile = requests.get(jsfile_url, timeout=10)
    except Exception as e:
        logging.error("Exception during download of assets of " + name +
                      " from " + jsfile_url + ":")
        logging.error(str(e))
        return None

    status = res_jsfile.status_code
    record = {
        'filename': jsfile,
        'url': jsfile_url,
        'first_seen': datetime.datetime.utcnow().isoformat(),
        'http_status_code': status
    }
    if status in (403, 404):
        logging.warning("Access denied: cannot access assets of " + name +
                        " (status code: " + str(status) + ") " +
                        str(res_jsfile.url))
        return record
    if status == 200:
        data = res_jsfile.content
        record['md5'] = hashlib.md5(data).hexdigest()
        record['sha1'] = hashlib.sha1(data).hexdigest()
        record['sha256'] = hashlib.sha256(data).hexdigest()
        record['size'] = len(data)
        return record

    logging.error("Unknown error: cannot access assets of " + name +
                  " (status code: " + str(status) + ") " +
                  str(res_jsfile.url))
    logging.error(str(res_jsfile.content))
    return None


def update_lib(force, archive, lib):
    """Update information for a JavaScript library.

    Downloads the overview of the library from the CDNJS API, computes the
    hashes of the files of every version and stores the result in
    ``<archive>/fileinfo/cdnjs/lib/<name>.json``.

    force   -- if true, download every file again even when the version
               is already part of the local record
    archive -- base directory of the archive
    lib     -- catalog entry of the library; only ``lib['name']`` is read

    Versions found in the local record but no longer listed by CDNJS are
    kept and stamped with an ``outphased`` timestamp. Any download problem
    is logged and aborts the update of this library without writing the
    record; nothing is raised for such failures.
    """
    name = lib['name']
    lib_url = get_cdnjs_all_libs_url() + "/" + name
    try:
        lib_res = requests.get(lib_url, timeout=10)
    except Exception as e:
        logging.error("Exception during download of library overview for " +
                      name + " from " + lib_url + ":")
        logging.error(str(e))
        return
    if lib_res.status_code != 200:
        logging.error(" Cannot access overview for " + name +
                      " (status code: " + str(lib_res.status_code) + ") " +
                      str(lib_res.url))
        logging.error(str(lib_res.content))
        return
    try:
        cdnjs_lib_json = lib_res.json()
    except ValueError as e:
        # A malformed answer must not raise inside the worker process.
        logging.error("Invalid JSON in library overview for " + name + ":")
        logging.error(str(e))
        return

    dirname = os.path.join(archive, "fileinfo", "cdnjs", "lib")
    os.makedirs(str(dirname), exist_ok=True)
    local_lib_json = _load_local_lib_json(dirname, name)
    if local_lib_json is not None:
        local_assets = local_lib_json['assets']
    else:
        local_assets = []

    # The first local record of a version wins.
    local_records = {}
    for lib_ver in local_assets:
        local_records.setdefault(lib_ver['version'], lib_ver)
    cdnjs_versions = {lib_ver['version']
                      for lib_ver in cdnjs_lib_json['assets']}

    for lib_ver in cdnjs_lib_json['assets']:
        version = lib_ver['version']
        logging.info(" Checking " + str(name) + " " + str(version))
        if not force and version in local_records:
            logging.info(" Updating from local record.")
            files_with_hashes = local_records[version]['files']
        else:
            logging.warning(" Updating from remote record (" + name + " " +
                            str(version) + ").")
            files_with_hashes = []
            for jsfile in lib_ver['files']:
                jsfile_url = get_jsfile_url(name, version, jsfile)
                logging.info(" " + jsfile_url)
                record = _fetch_file_record(name, jsfile, jsfile_url)
                if record is None:
                    # Give up on the whole library; the existing local
                    # record (if any) stays untouched.
                    return
                files_with_hashes.append(record)
        lib_ver['files'] = files_with_hashes

    # Preserve versions that CDNJS no longer lists.
    outphased = []
    for lib_ver in local_assets:
        version = lib_ver['version']
        if version not in cdnjs_versions:
            logging.warning("Found outphased versions for " + name + " " +
                            str(version) + " , preserving from archive.")
            if 'outphased' not in lib_ver:
                lib_ver['outphased'] = datetime.datetime.utcnow().isoformat()
            outphased.append(lib_ver)
    if outphased:
        cdnjs_lib_json['assets'] = cdnjs_lib_json['assets'] + outphased

    output = os.path.join(dirname, name + ".json")
    logging.info(" Saving " + str(output))
    with open(output, "w") as json_file:
        json.dump(cdnjs_lib_json, json_file)
def build_hash_map_of_lib(hashalg, archive, lib):
    """Build dictionary with file information using the file hash as key.

    hashalg -- key of the hash to index by, e.g. "md5" or "sha1"
    archive -- base directory of the archive
    lib     -- name of the library (file name without ``.json``)

    Returns a dictionary mapping each hash value to the library name,
    version, file name and ``first_seen`` timestamp (plus ``outphased``
    when the file or its version carries that stamp). Returns None when
    the local record of the library cannot be read or parsed.
    """
    dirname = os.path.join(archive, "fileinfo", "cdnjs", "lib")
    json_path = os.path.join(dirname, lib + ".json")
    try:
        with open(json_path, "r") as json_file:
            local_lib_json = json.load(json_file)
    except IOError:
        return None
    except ValueError:
        # Corrupt record: treat it like a missing one instead of crashing.
        logging.warning("JSON file (" + json_path + ") defect, skipping.")
        return None

    hash_map = {}
    for lib_ver in local_lib_json['assets']:
        version = lib_ver['version']
        for jsfile in lib_ver['files']:
            if hashalg not in jsfile:
                # Files that could not be downloaded (e.g. HTTP 403/404)
                # are recorded without hashes and cannot be indexed.
                continue
            file_info = {
                'library': lib,
                'version': version,
                'file': jsfile['filename'],
                'first_seen': jsfile['first_seen']
            }
            # The stamp is normally set on the version record; a stamp on
            # the file record itself takes precedence.
            if 'outphased' in jsfile:
                file_info['outphased'] = jsfile['outphased']
            elif 'outphased' in lib_ver:
                file_info['outphased'] = lib_ver['outphased']
            hash_map[jsfile[hashalg]] = file_info
    return hash_map
def build_sha1_map_of_lib(archive, lib):
    """Return the file information of one library keyed by sha1 hash."""
    sha1_map = build_hash_map_of_lib("sha1", archive, lib)
    return sha1_map
def build_md5_map_of_lib(archive, lib):
    """Return the file information of one library keyed by md5 hash."""
    md5_map = build_hash_map_of_lib("md5", archive, lib)
    return md5_map
def build_hash_map(hashalg, archive):
    """Build file information dictionary using the file hash as key.

    Merges the per-library maps of all locally available libraries. When
    the same hash occurs in several libraries, the library processed last
    wins. Returns None when no library record could be loaded at all.
    """
    hash_map = None
    for lib in get_local_libs(archive):
        lib_map = build_hash_map_of_lib(hashalg, archive, lib)
        if lib_map is None:
            # An unreadable library must not discard what was collected
            # from the libraries processed before it.
            continue
        if hash_map is None:
            hash_map = lib_map
        else:
            hash_map.update(lib_map)
    return hash_map
def build_sha1_map(archive):
    """Return the file information of all local libraries keyed by sha1."""
    sha1_map = build_hash_map("sha1", archive)
    return sha1_map
def build_md5_map(archive):
    """Return the file information of all local libraries keyed by md5."""
    md5_map = build_hash_map("md5", archive)
    return md5_map
def update_md5_map_file(archive):
    """Rewrite ``<archive>/fileinfo/cdnjs-md5.json`` with the md5 map.

    The file is opened for writing first and then filled with the map
    built from all local library records.
    """
    target = os.path.join(archive, "fileinfo", "cdnjs-md5.json")
    with open(target, "w") as json_file:
        md5_map = build_md5_map(archive)
        json.dump(md5_map, json_file)
def update_sha1_map_file(archive):
    """Rewrite ``<archive>/fileinfo/cdnjs-sha1.json`` with the sha1 map.

    The file is opened for writing first and then filled with the map
    built from all local library records.
    """
    target = os.path.join(archive, "fileinfo", "cdnjs-sha1.json")
    with open(target, "w") as json_file:
        sha1_map = build_sha1_map(archive)
        json.dump(sha1_map, json_file)
def delete_orphaned(archive, local_libs, cdnjs_current_libs):
    """Delete all orphaned local libraries.

    archive            -- base directory of the archive
    local_libs         -- names of the libraries with a local record
    cdnjs_current_libs -- libraries currently offered by CDNJS, given
                          either as names or as catalog entries (dicts
                          with a ``name`` key)

    The record ``<name>.json`` of every local library that is not among
    the current libraries is removed from ``<archive>/fileinfo/cdnjs/lib``.
    """
    dirname = os.path.join(archive, "fileinfo", "cdnjs", "lib")
    # Catalog entries are dicts; comparing a name against them directly
    # never matches and would delete every local library.
    current_names = set()
    for entry in cdnjs_current_libs:
        if isinstance(entry, dict):
            current_names.add(entry['name'])
        else:
            current_names.add(entry)
    for lib in local_libs:
        if lib not in current_names:
            os.remove(os.path.join(dirname, lib + ".json"))
def update_jslib_archive(force, clean, archive):
    """Update information for all available JavaScript libraries.

    force   -- passed on to update_lib: download all files again instead
               of reusing the local records
    clean   -- if true, delete local records of libraries that are no
               longer part of the CDNJS catalog
    archive -- base directory of the archive

    The catalog is stored in ``<archive>/fileinfo/cdnjs/cdnjs-libraries.json``
    and every library is then updated by a pool of 32 worker processes.
    Exits the process with status 1 when the catalog cannot be obtained.
    """
    cdnjs_all_libs_url = get_cdnjs_all_libs_url()
    try:
        res = requests.get(cdnjs_all_libs_url, timeout=10)
    except Exception as e:
        logging.error("Exception during download of library overview from " +
                      cdnjs_all_libs_url + ":")
        logging.error(str(e))
        sys.exit(1)
    if res.status_code != 200:
        logging.error("Could not obtain library overview (http status code: " +
                      str(res.status_code) + ")")
        logging.error(str(res.content))
        sys.exit(1)

    # Parse the response once and reuse it for the catalog file below.
    catalog_json = res.json()
    cdnjs_lib_catalog = catalog_json['results']

    if clean:
        local_lib_catalog = get_local_libs(archive)
        # Local libraries are identified by name, so compare against the
        # names of the catalog entries rather than the entries themselves.
        current_lib_names = [entry['name'] for entry in cdnjs_lib_catalog]
        delete_orphaned(archive, local_lib_catalog, current_lib_names)

    dirname = os.path.join(archive, "fileinfo", "cdnjs")
    os.makedirs(str(dirname), exist_ok=True)
    with open(os.path.join(dirname, "cdnjs-libraries.json"), "w") as json_file:
        json.dump(catalog_json, json_file)
    logging.info("Found " + str(len(cdnjs_lib_catalog)) +
                 " different libraries")

    with Pool(32) as p:
        p.map(partial(update_lib, force, archive), cdnjs_lib_catalog)

0
ExtensionCrawler/discover.py Executable file → Normal file
View File

0
ExtensionCrawler/js_decomposer.py Executable file → Normal file
View File

0
ExtensionCrawler/js_mincer.py Executable file → Normal file
View File

85
cdnjs-crawler Executable file
View File

@ -0,0 +1,85 @@
#!/usr/bin/env python3.5
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
""" Tool for obtaining md5/sha1/sha256 hashes for all files available
at CDNJS.com."""
import getopt
import logging
import sys
from ExtensionCrawler.cdnjs import (update_jslib_archive, update_md5_map_file,
update_sha1_map_file)
from ExtensionCrawler.config import const_log_format
# Script should run with python 3.4 or 3.5
# NOTE(review): the upper bound makes the script abort with an
# AssertionError on Python 3.6 and newer; the check is skipped entirely
# when the interpreter runs with -O.
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
def helpmsg():
    """Print help message.

    The archive option is shown as ``-a <DIR>``: with getopt, writing
    ``-a=<DIR>`` would make the ``=`` part of the directory name.
    """
    print("cdnjs-crawler [OPTION]")
    print(" -h print this help text")
    print(" -s silent (no log messages)")
    print(" -f force full download (default: update of json files)")
    print(" -c delete outdated (no longer available) libraries")
    print(" -a <DIR> archive directory")
def main(argv):
    """Main function of the cdnjs crawler.

    argv -- command line arguments without the program name

    Parses the options, configures logging to stdout and then updates the
    library archive followed by the sha1 and md5 map files. Prints the
    help message and exits with status 2 on an unknown option.
    """
    basedir = "archive"
    verbose = True
    force = False
    clean = False
    try:
        # "f" and "c" are handled below and documented in the help text,
        # so they have to be part of the option string. The remaining
        # options (-e, -d, -o, -w, --date, --output) are not evaluated
        # and are only still accepted for backward compatibility.
        opts, _args = getopt.getopt(argv, "hsfced:a:o:w",
                                    ["date=", "archive=", "output="])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt in ("-a", "--archive"):
            basedir = arg
        elif opt == '-s':
            verbose = False
        elif opt == '-f':
            force = True
        elif opt == '-c':
            clean = True

    loglevel = logging.INFO if verbose else logging.WARNING

    logger = logging.getLogger()
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(ch)
    logger.setLevel(loglevel)

    update_jslib_archive(force, clean, basedir)
    update_sha1_map_file(basedir)
    update_md5_map_file(basedir)
# Run the crawler only when this file is executed as a script; the command
# line arguments are passed on without the program name.
if __name__ == "__main__":
    main(sys.argv[1:])