#!/usr/bin/env python3.6
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see http://www.gnu.org/licenses/.
#
"""Module for obtaining md5/sha1/sha256 hashes for all files available
   at CDNJS.com by mining the cdnjs git repository."""

import csv
import gc
import glob
import logging
import os
import re
import sys
from functools import partial, reduce
from multiprocessing import Pool

import dateutil.parser
import git

import ExtensionCrawler.config as config
from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
from ExtensionCrawler.file_identifiers import get_file_identifiers

# Basenames that are skipped when collecting files from the repository.
_IGNORED_BASENAMES = ("package.json", ".gitkeep")

# Number of path components of "ajax/libs/<library>/<version>".
_LIBVER_DEPTH = 4

# Number of files handled per database connection.
_DB_CHUNK_SIZE = 200

# Number of attempts made for one chunk before giving up.
_DB_MAX_RETRIES = 4

# Digest fields that are converted with .hex() before being written as CSV.
_HASH_KEYS = ('md5', 'sha1', 'sha256', 'normalized_md5', 'normalized_sha1',
              'normalized_sha256', 'dec_normalized_md5',
              'dec_normalized_sha1', 'dec_normalized_sha256', 'dec_md5',
              'dec_sha1', 'dec_sha256')

# (key prefix in the file info dictionary, value of the "typ" column).
_DB_VARIANTS = (("", "AS_IS"), ("normalized_", "NORMALIZED"),
                ("dec_", "DECOMPRESSED"), ("dec_normalized_",
                                           "DECOMPRESSED_NORMALIZED"))


def get_add_date(git_path, filename):
    """Return the author date of the first commit that touched filename.

    Runs ``git log --follow --format=%aD --reverse`` in git_path and parses
    the first output line.  Returns None (after logging at debug level) if
    git fails, prints nothing, or the date cannot be parsed.
    """
    try:
        gitobj = git.Git(git_path)
        add_date_string = gitobj.log("--follow", "--format=%aD", "--reverse",
                                     filename).splitlines()[0]
        # Release the git wrapper eagerly; this runs once per
        # library/version.
        del gitobj
        gc.collect()
        logging.info("%s was added on %s", filename, add_date_string)
        return dateutil.parser.parse(add_date_string)
    except Exception as e:
        # Deliberately best effort: a missing date must not abort the run.
        logging.debug("Exception during git log for %s:\n%s", filename, e)
        return None


def _log_and_check_repo_state(git_repo):
    """Log the HEAD state of git_repo and refuse to continue if unusable.

    Raises Exception("Detached head") or Exception("Dirty repository").
    """
    logging.info(" HEAD: %s", git_repo.head.commit)
    logging.info(" is detached: %s", git_repo.head.is_detached)
    logging.info(" is dirty: %s", git_repo.is_dirty())

    if git_repo.head.is_detached:
        raise Exception("Detached head")
    if git_repo.is_dirty():
        raise Exception("Dirty repository")


def pull_list_changed_files(git_path):
    """Pull new updates from remote origin.

    Returns the a-side blob paths of all diffs between each fetched commit
    and its old commit, without duplicates and in order of first
    appearance.  Raises Exception if HEAD is detached or the work tree is
    dirty.
    """
    git_repo = git.Repo(git_path)
    _log_and_check_repo_state(git_repo)

    files = []
    seen = set()  # O(1) duplicate check; `files` keeps the order
    for single_fetch_info in git_repo.remotes.origin.pull():
        for diff in single_fetch_info.commit.diff(
                single_fetch_info.old_commit):
            logging.debug("Found diff: %s", diff)
            if diff.a_blob is None:
                continue
            changed_path = diff.a_blob.path
            if changed_path not in seen:
                seen.add(changed_path)
                files.append(changed_path)
    return files


def hackish_pull_list_changed_files(git_path):
    """Pull new updates from remote origin (hack, using git binary -
       faster but not as safe as GitPython).

    The changed files are taken from the output of ``git pull``: every line
    of the form `` <name> | <anything>`` contributes <name>.  A name
    containing ``=>`` is split there and every part is reported separately
    (presumably the notation git uses for renames).  Returns the names as a
    list in no particular order.  Raises Exception if HEAD is detached or
    the work tree is dirty.
    """
    git_repo = git.Repo(git_path)
    _log_and_check_repo_state(git_repo)
    del git_repo
    gc.collect()

    files = set()
    git_obj = git.Git(git_path)
    pull_lines = git_obj.pull().splitlines()
    del git_obj
    gc.collect()

    for line in pull_lines:
        match = re.search(r'^ (.+) \| .*$', line)
        if match is not None:
            for changed_file in match.group(1).split('=>'):
                files.add(changed_file.strip())
    return list(files)


def path_to_list(path):
    """Convert a path (string) to a list of folders/files.

    An absolute path keeps its root as first element ("/a/b" yields
    ["/", "a", "b"]); the empty string yields [].  Trailing separators are
    ignored, i.e., "a/b/" yields ["a", "b"].
    """
    plist = []
    while True:
        head, tail = os.path.split(path)
        if tail:
            plist.append(tail)
            if not head:
                break
            path = head
        elif not head:
            break
        elif head == path:
            # Only the root is left; it cannot be split any further.
            plist.append(head)
            break
        else:
            # Trailing separator(s): split returned the path without them.
            path = head
    plist.reverse()
    return plist


def get_file_libinfo(release_dic, git_path, libfile):
    """Compute file identifiers and library information of libfile.

    The library name and version are the two path components that follow
    the first "libs" component of libfile.  The dictionary returned by
    get_file_identifiers is extended by 'library', 'version' and
    'add_date', and its 'path' is made relative to <git_path>/ajax/libs.

    'add_date' is None if release_dic has no entry for (library, version).
    Raises ValueError if libfile has no "libs" component and IndexError if
    fewer than two components follow it.
    """
    logging.info("Computing file info for %s", libfile)
    file_info = get_file_identifiers(libfile)
    plist = path_to_list(libfile)
    idx = plist.index("libs")
    lib = plist[idx + 1]
    version = plist[idx + 2]
    file_info['path'] = os.path.relpath(file_info['path'],
                                        git_path + "/ajax/libs")
    file_info['library'] = lib
    file_info['version'] = version
    if (lib, version) not in release_dic:
        # A missing date is stored as None (like a failed git lookup)
        # instead of raising KeyError, which would make the caller's retry
        # loop abandon the remaining files of the chunk.
        logging.warning("No release date known for %s %s", lib, version)
    file_info['add_date'] = release_dic.get((lib, version))
    return file_info


def pull_get_updated_lib_files(cdnjs_git_path):
    """Pull repository and determine updated libraries.

    Returns (files, libvers): files are the changed paths below "ajax"
    (joined with cdnjs_git_path, ignoring package.json and .gitkeep);
    libvers are the first four components of those paths relative to the
    repository, i.e., ajax/libs/<library>/<version>.
    """
    logging.info("Building file list (only updates)")
    libvers = set()
    files = []
    for update in hackish_pull_list_changed_files(cdnjs_git_path):
        if os.path.basename(update) in _IGNORED_BASENAMES:
            continue
        if not update.startswith("ajax"):
            continue
        files.append(os.path.join(cdnjs_git_path, update))
        libvers.add(os.path.join(*path_to_list(update)[:_LIBVER_DEPTH]))
    logging.info("Found %s files", len(files))
    logging.info("Found %s unique library/version combinations.",
                 len(libvers))
    return files, list(libvers)


def get_all_lib_files(cdnjs_git_path, localpath=None):
    """Return all libraries stored in cdnjs git repo.

    localpath is a glob pattern relative to cdnjs_git_path; it defaults to
    everything below ajax/libs.  Returns (files, libvers): files are all
    matching non-directories except package.json and .gitkeep; libvers are
    all matching directories that are exactly four levels below
    cdnjs_git_path.
    """
    libvers = set()
    files = []
    versionidx = len(path_to_list(cdnjs_git_path)) + _LIBVER_DEPTH
    if localpath is not None:
        paths = os.path.join(cdnjs_git_path, localpath)
    else:
        paths = os.path.join(cdnjs_git_path, 'ajax/libs/**/*')
    logging.info("Building file list for: %s", paths)
    for fname in glob.iglob(paths, recursive=True):
        if os.path.isdir(fname):
            if len(path_to_list(fname)) == versionidx:
                libvers.add(fname)
        elif os.path.basename(fname) not in _IGNORED_BASENAMES:
            files.append(fname)
    gc.collect()
    logging.info("Found %s files", len(files))
    logging.info("Found %s unique library/version combinations.",
                 len(libvers))
    return files, list(libvers)


def update_database_for_file(create_csv, release_dic, cdnjs_git_path, filename,
                             con):
    """Update database for a single file.

    If create_csv is true, the file information is written to stdout as a
    CSV header plus one row and con is not used.  Otherwise one row is
    inserted into table "cdnjs" via con.insert for every variant (as is,
    normalized, decompressed, decompressed and normalized) that has an md5
    value.  A filename that is not an existing file is logged and skipped.
    """
    if not os.path.isfile(filename):
        logging.error("Skipping update for deleted file %s", filename)
        return

    logging.info("Updating database for file %s", filename)
    file_info = get_file_libinfo(release_dic, cdnjs_git_path, filename)
    if file_info is None:
        return

    if create_csv:
        # NOTE(review): these two lines go to the same stream as the CSV
        # data; they look like debugging output - confirm that no consumer
        # relies on them before removing.
        print(file_info['path'])
        print(cdnjs_git_path)
        file_info['path'] = re.sub(r'^.*\/ajax\/', 'ajax/', file_info['path'])
        for key in _HASH_KEYS:
            if file_info[key] is not None:
                file_info[key] = (file_info[key]).hex()
        csv_writer = csv.DictWriter(sys.stdout, file_info.keys())
        csv_writer.writeheader()
        csv_writer.writerow(file_info)
        return

    logging.info("Updating database ...")
    for prefix, typ in _DB_VARIANTS:
        if file_info[prefix + "md5"] is None:
            continue
        con.insert(
            "cdnjs",
            md5=file_info[prefix + "md5"],
            sha1=file_info[prefix + "sha1"],
            sha256=file_info[prefix + "sha256"],
            size=file_info[prefix + "size"],
            loc=file_info[prefix + "loc"],
            description=file_info[prefix + "description"],
            encoding=file_info[prefix + "encoding"],
            mimetype=file_info["mimetype"][0]
            if "mimetype" in file_info else None,
            mimetype_detail=file_info["mimetype"][1]
            if "mimetype" in file_info else None,
            path=file_info["path"],
            filename=file_info["filename"],
            add_date=file_info["add_date"],
            library=file_info["library"],
            version=file_info["version"],
            typ=typ)


def update_database_for_file_chunked_timeout(create_csv, release_dic,
                                             cdnjs_git_path, filenames):
    """Process all filenames using one freshly created database connection.

    Exceptions are not handled here; see update_database_for_file_chunked
    for the retrying wrapper.
    """
    # NOTE(review): the connection is also opened if create_csv is set,
    # although the CSV path never uses it.
    logging.info("Creating MariaDB Connection")
    with MysqlBackend(
            None,
            read_default_file=config.const_mysql_config_file(),
            charset='utf8mb4',
            compress=True) as con:
        logging.info(
            "Created MariaDB connection - start to update data base (%s files)",
            len(filenames))
        for filename in filenames:
            update_database_for_file(create_csv, release_dic, cdnjs_git_path,
                                     filename, con)


def update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path,
                                     filenames):
    """Process filenames as one chunk, retrying the whole chunk on errors.

    Up to four attempts are made.  Every exception is logged as a warning;
    after the last failed attempt the chunk is given up without raising.
    """
    logging.info("Creating MariaDB Connection")
    for attempt in range(1, _DB_MAX_RETRIES + 1):
        try:
            update_database_for_file_chunked_timeout(create_csv, release_dic,
                                                     cdnjs_git_path, filenames)
        except Exception as e:
            logging.warning("Exception during data base chunk update: %s", e)
            if attempt < _DB_MAX_RETRIES:
                logging.warning(" Retrying")
            else:
                logging.warning(" Giving up")
        else:
            logging.info("Updated data base chunk successfully")
            return


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    logging.info("Computing chunks of size %s", n)
    for i in range(0, len(l), n):
        yield l[i:i + n]


def update_database(create_csv, release_dic, cdnjs_git_path, files):
    """Update database for all files in files."""
    logging.info("Updating data base")
    for chunk in chunks(list(files), _DB_CHUNK_SIZE):
        update_database_for_file_chunked(create_csv, release_dic,
                                         cdnjs_git_path, chunk)


def get_release_triple(git_path, libver):
    """Return (library, version, add date) for the path libver.

    Library and version are the last two components of libver; the date is
    the result of get_add_date and may be None.
    """
    plist = path_to_list(libver)
    ver = plist[-1]
    lib = plist[-2]
    date = get_add_date(git_path, libver)
    logging.info("Release information:%s %s: %s", lib, ver, date)
    return (lib, ver, date)


def build_release_date_dic(git_path, libvers):
    """Build dictionary of release date with the tuple (library, version) as
       key."""
    logging.info("Building release dictionary")
    release_date_dic = {}
    for libver in libvers:
        lib, ver, date = get_release_triple(git_path, libver)
        release_date_dic[(lib, ver)] = date
    return release_date_dic


def pull_and_update_db(cdnjs_git_path, create_csv):
    """Pull repo and update database."""
    logging.info("Pulling and updating data base")
    files, libvers = pull_get_updated_lib_files(cdnjs_git_path)
    release_dic = build_release_date_dic(cdnjs_git_path, libvers)
    del libvers
    gc.collect()
    update_database(create_csv, release_dic, cdnjs_git_path, files)


def update_db_from_listfile(cdnjs_git_path, listfile, create_csv):
    """Update database (without pull) for files in listfile.

    listfile contains one glob pattern per line, relative to
    cdnjs_git_path.  Blank lines are ignored.
    """
    with open(listfile) as listfileobj:
        paths = [line for line in listfileobj.read().splitlines()
                 if line.strip()]
    files = []
    libvers = []
    for path in paths:
        path_files, path_libvers = get_all_lib_files(cdnjs_git_path, path)
        libvers.extend(path_libvers)
        files.extend(path_files)
    # Overlapping patterns may report a library/version more than once;
    # each entry costs one "git log" call, so only keep the first.
    libvers = list(dict.fromkeys(libvers))
    logging.info("In total, found %s files in %s liberies/versions.",
                 len(files), len(libvers))
    release_dic = build_release_date_dic(cdnjs_git_path, libvers)
    update_database(create_csv, release_dic, cdnjs_git_path, files)


def update_db_all_libs(cdnjs_git_path, create_csv, taskid=1, maxtaskid=1):
    """Update database entries for all libs in git repo.

    If maxtaskid > 1, the file list is divided into maxtaskid consecutive
    slices and only slice number taskid (counted from 1) is processed; the
    last slice also takes the remainder.
    """
    files, libvers = get_all_lib_files(cdnjs_git_path)

    if maxtaskid > 1:
        logging.info("Running task %s of %s", taskid, maxtaskid)
        # The order in which glob reports files is not specified; sort so
        # that all tasks slice the same sequence.
        files.sort()
        chunksize = len(files) // maxtaskid
        start = (taskid - 1) * chunksize
        if taskid == maxtaskid:
            files = files[start:]
        else:
            files = files[start:start + chunksize]
        # Only look up the release dates needed by this task's files.
        versionidx = len(path_to_list(cdnjs_git_path)) + _LIBVER_DEPTH
        libvers = list({
            os.path.join(*path_to_list(path)[:versionidx])
            for path in files
        })
        logging.info("This task has %s files from %s library version(s).",
                     len(files), len(libvers))

    release_dic = build_release_date_dic(cdnjs_git_path, libvers)
    del libvers
    gc.collect()
    update_database(create_csv, release_dic, cdnjs_git_path, files)