372 lines
14 KiB
Python
372 lines
14 KiB
Python
#!/usr/bin/env python3.7
|
|
#
|
|
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
""" Module for obtaining md5/sha1/sha256 hashes for all files available
|
|
at CDNJS.com by mining the cdnjs git repository."""
|
|
|
|
import csv
|
|
import gc
|
|
import glob
|
|
import logging
|
|
import os
|
|
import re
|
|
import sys
|
|
from functools import reduce
|
|
|
|
import dateutil.parser
|
|
import git
|
|
|
|
import ExtensionCrawler.config as config
|
|
from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
|
|
from ExtensionCrawler.file_identifiers import get_file_identifiers
|
|
|
|
|
|
def get_add_date(git_path, filename):
    """Method for getting the initial add/commit date of a file."""
    try:
        # Oldest commit first thanks to --reverse; the first line of the
        # author-date log is therefore the add date.
        log_output = git.Git(git_path).log("--follow", "--format=%aD",
                                           "--reverse", filename)
        first_date = log_output.splitlines()[0]
        gc.collect()
        logging.info(filename + " was added on " + first_date)
        return dateutil.parser.parse(first_date)
    except Exception as err:
        # Best effort: files without history yield None.
        logging.debug("Exception during git log for " + filename + ":\n" +
                      (str(err)))
        return None
|
|
|
|
|
|
def pull_list_changed_files(git_path):
    """Pull new updates from remote origin."""
    repo = git.Repo(git_path)
    logging.info(" HEAD: " + str(repo.head.commit))
    logging.info(" is detached: " + str(repo.head.is_detached))
    logging.info(" is dirty: " + str(repo.is_dirty()))
    if repo.head.is_detached:
        raise Exception("Detached head")
    if repo.is_dirty():
        raise Exception("Dirty repository")

    # Collect the paths of all blobs touched by the pulled commits,
    # preserving first-seen order.
    changed = []
    for fetch_info in repo.remotes.origin.pull():
        for diff in fetch_info.commit.diff(fetch_info.old_commit):
            logging.debug("Found diff: " + str(diff))
            blob = diff.a_blob
            if blob is not None and blob.path not in changed:
                changed.append(blob.path)
    return changed
|
|
|
|
|
|
def hackish_pull_list_changed_files(git_path):
    """Pull new updates from remote origin (hack, using git binary -
    faster but not as safe as GitPython)."""
    repo = git.Repo(git_path)
    logging.info(" HEAD: " + str(repo.head.commit))
    logging.info(" is detached: " + str(repo.head.is_detached))
    logging.info(" is dirty: " + str(repo.is_dirty()))
    if repo.head.is_detached:
        raise Exception("Detached head")
    if repo.is_dirty():
        raise Exception("Dirty repository")
    del repo
    gc.collect()

    pull_output = git.Git(git_path).pull().splitlines()
    gc.collect()

    # Each changed file appears in the pull summary as " <path> | <stats>".
    # Renames show up as "old => new", hence the extra split on '=>'.
    changed = set()
    for summary_line in pull_output:
        stat_match = re.search(r'^ (.+) \| .*$', summary_line)
        if stat_match is None:
            continue
        for part in stat_match.group(1).split('=>'):
            changed.add(part.strip())
    return list(changed)
|
|
|
|
|
|
def path_to_list(path):
    """Convert a path (string) to a list of folders/files."""
    parts = []
    head, tail = os.path.split(path)
    while head or tail:
        if tail:
            parts.append(tail)
            if not head:
                break
        else:
            # No basename left (e.g. trailing slash or filesystem root):
            # keep the remaining head as the final component.
            parts.append(head)
            break
        head, tail = os.path.split(head)
    parts.reverse()
    return parts
|
|
|
|
|
|
def get_file_libinfo(release_dic, git_path, libfile):
    """Compute file identifiers and library information of libfile.

    Args:
        release_dic: dict mapping (library, version) tuples to add dates.
        git_path: path of the cdnjs git repository.
        libfile: path of the library file inside the repository.

    Returns:
        The file-identifier dict of libfile, augmented with 'path'
        (relative to <git_path>/ajax/libs), 'library', 'version' and
        'add_date' entries.
    """
    logging.info("Computing file info for " + libfile)
    file_info = get_file_identifiers(libfile)
    plist = path_to_list(libfile)
    # Library name and version are the two path components following the
    # "libs" directory (.../ajax/libs/<library>/<version>/...).
    idx = plist.index("libs")
    lib = plist[idx + 1]
    version = plist[idx + 2]
    file_info['path'] = os.path.relpath(file_info['path'],
                                        git_path + "/ajax/libs")
    file_info['library'] = lib
    file_info['version'] = version
    file_info['add_date'] = release_dic[(lib, version)]
    # Removed the dead computation of the package.json path (the old
    # "TODO: why is package not used?"): its result was never used.
    return file_info
|
|
|
|
|
|
def pull_get_updated_lib_files(cdnjs_git_path):
    """Pull repository and determine updated libraries."""
    logging.info("Building file list (only updates)")
    libvers = set()
    files = []
    for changed in hackish_pull_list_changed_files(cdnjs_git_path):
        # Skip metadata files and anything outside the ajax tree.
        if os.path.basename(changed) in ["package.json", ".gitkeep"]:
            continue
        if not changed.startswith("ajax"):
            continue
        files.append(os.path.join(cdnjs_git_path, changed))
        # First four components are ajax/libs/<library>/<version>.
        libvers.add(reduce(os.path.join, path_to_list(changed)[:4]))
    logging.info("Found " + str(len(files)) + " files")
    logging.info("Found " + str(len(libvers)) +
                 " unique library/version combinations.")
    return files, list(libvers)
|
|
|
|
|
|
def get_all_lib_files(cdnjs_git_path, localpath=None):
    """Return all libraries stored in cdnjs git repo."""
    libvers = set()
    files = []
    # A library/version directory sits four path components below the
    # repository root: <repo>/ajax/libs/<library>/<version>.
    versionidx = len(path_to_list(cdnjs_git_path)) + 4

    if localpath is None:
        paths = os.path.join(cdnjs_git_path, 'ajax/libs/**/*')
    else:
        paths = os.path.join(cdnjs_git_path, localpath)

    logging.info("Building file list for: " + str(paths))

    for fname in glob.iglob(paths, recursive=True):
        if os.path.isdir(fname):
            if len(path_to_list(fname)) == versionidx:
                libvers.add(fname)
            gc.collect()
        elif os.path.basename(fname) not in ["package.json", ".gitkeep"]:
            files.append(fname)

    logging.info("Found " + str(len(files)) + " files")
    logging.info("Found " + str(len(libvers)) +
                 " unique library/version combinations.")
    return files, list(libvers)
|
|
|
|
|
|
def update_database_for_file(create_csv, release_dic, cdnjs_git_path, filename,
                             con):
    """Update database (or CSV output) for a single file.

    Args:
        create_csv: if true, write a CSV record to stdout instead of
            inserting into the database.
        release_dic: dict mapping (library, version) to the release date.
        cdnjs_git_path: path of the cdnjs git repository.
        filename: path of the file to process.
        con: open database backend (used only when create_csv is false).
    """
    if os.path.isfile(filename):
        logging.info("Updating database for file " + filename)
        file_info = get_file_libinfo(release_dic, cdnjs_git_path, filename)
        if file_info is not None:
            if create_csv:
                # Fix: removed the stray debug print() calls that used to
                # precede the CSV output here - they went to stdout as well
                # and corrupted the generated CSV stream.
                file_info['path'] = re.sub(r'^.*\/ajax\/', 'ajax/',
                                           file_info['path'])
                # Hash values are raw bytes; hex-encode them for CSV.
                for key in [
                        'md5', 'sha1', 'sha256', 'normalized_md5',
                        'normalized_sha1', 'normalized_sha256',
                        'dec_normalized_md5', 'dec_normalized_sha1',
                        'dec_normalized_sha256', 'dec_md5', 'dec_sha1',
                        'dec_sha256'
                ]:
                    if not file_info[key] is None:
                        file_info[key] = (file_info[key]).hex()
                csv_writer = csv.DictWriter(sys.stdout, file_info.keys())
                csv_writer.writeheader()
                csv_writer.writerow(file_info)
            else:
                logging.info("Updating database ...")
                # One database row per available representation of the file
                # (as-is, normalized, decompressed, decompressed+normalized);
                # a representation exists iff its md5 entry is not None.
                for prefix, typ in [("", "AS_IS"), ("normalized_",
                                                    "NORMALIZED"),
                                    ("dec_", "DECOMPRESSED"),
                                    ("dec_normalized_",
                                     "DECOMPRESSED_NORMALIZED")]:
                    if file_info[prefix + "md5"] is not None:
                        con.insert(
                            "cdnjs",
                            md5=file_info[prefix + "md5"],
                            sha1=file_info[prefix + "sha1"],
                            sha256=file_info[prefix + "sha256"],
                            simhash=file_info[prefix + "simhash"],
                            size=file_info[prefix + "size"],
                            loc=file_info[prefix + "loc"],
                            description=file_info[prefix + "description"],
                            encoding=file_info[prefix + "encoding"],
                            mimetype=file_info["mimetype"][0]
                            if "mimetype" in file_info else None,
                            mimetype_detail=file_info["mimetype"][1]
                            if "mimetype" in file_info else None,
                            path=file_info["path"],
                            filename=file_info["filename"],
                            add_date=file_info["add_date"],
                            library=file_info["library"],
                            version=file_info["version"],
                            typ=typ)
    else:
        logging.error("Skipping update for deleted file " + filename)
|
|
|
|
|
|
def update_database_for_file_chunked_timeout(create_csv, release_dic,
                                             cdnjs_git_path, filenames):
    """Open one MariaDB connection and update the database for filenames."""
    logging.info("Creating MariaDB Connection")
    with MysqlBackend(
            None,
            read_default_file=config.const_mysql_config_file(),
            charset='utf8mb4',
            compress=True) as con:
        logging.info("Created MariaDB connection - start to update data base ("
                     + str(len(filenames)) + " files)")
        # All files of the chunk share the same connection.
        for fname in filenames:
            update_database_for_file(create_csv, release_dic, cdnjs_git_path,
                                     fname, con)
|
|
|
|
|
|
def update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path,
                                     filenames):
    """Update the database for a chunk of files, retrying on failure."""
    logging.info("Creating MariaDB Connection")
    max_retries = 4
    # Up to max_retries attempts; return on the first success.
    for attempt in range(1, max_retries + 1):
        try:
            update_database_for_file_chunked_timeout(create_csv, release_dic,
                                                     cdnjs_git_path, filenames)
            logging.info("Updated data base chunk successfully")
            return
        except Exception as err:
            logging.warning("Exception during data base chunk update: " +
                            (str(err)))
            if attempt < max_retries:
                logging.warning(" Retrying")
            else:
                logging.warning(" Giving up")
|
|
|
|
|
|
def chunks(l, n):
    """Yield successive n-sized chunks from l.

    Args:
        l: sequence to split.
        n: maximum chunk size (the last chunk may be shorter).

    Yields:
        Consecutive slices of l of length at most n.
    """
    # Fixed log-message typo: "Computing junk" -> "Computing chunk".
    logging.info("Computing chunk " + str(n))
    for i in range(0, len(l), n):
        yield l[i:i + n]
|
|
|
|
|
|
def update_database(create_csv, release_dic, cdnjs_git_path, files):
    """Update database for all files in files."""
    logging.info("Updating data base")
    # Process in chunks of 200 so each chunk gets a fresh DB connection.
    all_files = list(files)
    for file_chunk in chunks(all_files, 200):
        update_database_for_file_chunked(create_csv, release_dic,
                                         cdnjs_git_path, file_chunk)
|
|
|
|
|
|
def get_release_triple(git_path, libver):
    """Return (library, version, add-date) for a library/version path."""
    components = path_to_list(libver)
    # .../<library>/<version> are the last two path components.
    lib = components[-2]
    ver = components[-1]
    date = get_add_date(git_path, libver)
    logging.info("Release information:" + lib + " " + ver + ": " + str(date))
    return lib, ver, date
|
|
|
|
|
|
def build_release_date_dic(git_path, libvers):
    """Build dictionary of release date with the tuple (library, version) as key."""
    logging.info("Building release dictionary")
    triples = [get_release_triple(git_path, libver) for libver in libvers]
    return {(lib, ver): date for (lib, ver, date) in triples}
|
|
|
|
|
|
def pull_and_update_db(cdnjs_git_path, create_csv):
    """Pull repo and update database.

    Pulls the cdnjs git repository, computes release dates for the
    updated library/version combinations, and updates the database
    (or writes CSV, if create_csv is true) for the changed files.
    """
    logging.info("Pulling and updating data base")
    files, libvers = pull_get_updated_lib_files(cdnjs_git_path)
    release_dic = build_release_date_dic(cdnjs_git_path, libvers)
    # Release info is folded into release_dic; free the list before the
    # (potentially long-running) database update.
    del libvers
    gc.collect()
    update_database(create_csv, release_dic, cdnjs_git_path, files)
|
|
|
|
|
|
def update_db_from_listfile(cdnjs_git_path, listfile, create_csv):
    """Update database (without pull) for files in listfile.

    Args:
        cdnjs_git_path: path of the cdnjs git repository.
        listfile: text file with one repository-relative path per line.
        create_csv: if true, write CSV to stdout instead of updating the DB.
    """
    with open(listfile) as listfileobj:
        paths = listfileobj.read().splitlines()
    files = []
    libvers = []
    for path in paths:
        path_files, path_libvers = get_all_lib_files(cdnjs_git_path, path)
        libvers.extend(path_libvers)
        files.extend(path_files)
    # Fixed log-message typo: "liberies" -> "libraries".
    logging.info("In total, found " + str(len(files)) + " files in " + str(
        len(libvers)) + " libraries/versions.")
    release_dic = build_release_date_dic(cdnjs_git_path, libvers)
    update_database(create_csv, release_dic, cdnjs_git_path, files)
|
|
|
|
|
|
def update_db_all_libs(cdnjs_git_path, create_csv, taskid=1, maxtaskid=1):
    """Update database entries for all libs in git repo.

    When maxtaskid > 1, the file list is split into maxtaskid slices and
    only slice number taskid (1-based) is processed, so the work can be
    distributed over several parallel task invocations.
    """
    files, libvers = get_all_lib_files(cdnjs_git_path)

    if maxtaskid > 1:
        logging.info("Running task " + str(taskid) + " of " + str(maxtaskid))
        chunksize = int(len(files) / maxtaskid)
        # The last task also takes the division remainder.
        if taskid == maxtaskid:
            files = files[(taskid - 1) * chunksize:]
        else:
            files = files[(taskid - 1) * chunksize:taskid * chunksize]
        # Recompute the library/version set for this slice only; a
        # library/version directory is versionidx path components deep
        # (<repo>/ajax/libs/<library>/<version>).
        libvers = set()
        versionidx = len(path_to_list(cdnjs_git_path)) + 4
        for path in files:
            libvers.add(reduce(os.path.join, path_to_list(path)[:versionidx]))
        libvers = list(libvers)
        logging.info("This task has " + str(len(files)) + " files from " +
                     str(len(libvers)) + " library version(s).")

    release_dic = build_release_date_dic(cdnjs_git_path, libvers)
    # Free the (possibly large) list before the long-running update.
    del libvers
    gc.collect()
    update_database(create_csv, release_dic, cdnjs_git_path, files)
|