Added csv output for debugging.

This commit is contained in:
Achim D. Brucker 2017-09-16 13:21:49 +01:00
parent de6dde5269
commit c274b96f66
2 changed files with 54 additions and 22 deletions

View File

@ -29,6 +29,8 @@ import zlib
from functools import partial, reduce from functools import partial, reduce
from io import StringIO from io import StringIO
from multiprocessing import Pool from multiprocessing import Pool
import csv
import sys
import cchardet as chardet import cchardet as chardet
import dateutil.parser import dateutil.parser
@ -296,23 +298,46 @@ def get_all_lib_files(cdnjs_git_path):
return files, list(libvers) return files, list(libvers)
# Fields of a file-info record that are hex-encoded before CSV output.
_HASH_KEYS = (
    'md5', 'sha1', 'sha256', 'normalized_md5', 'normalized_sha1',
    'normalized_sha256', 'dec_normalized_md5', 'dec_normalized_sha1',
    'dec_normalized_sha256', 'dec_md5', 'dec_sha1', 'dec_sha256',
)


def update_database_for_file(create_csv, release_dic, cdnjs_git_path,
                             filename):
    """Update the database for a single file, or print it as a CSV record.

    Args:
        create_csv: When true, write the file's record to standard output
            as CSV instead of updating the database.
        release_dic: Passed through unchanged to ``get_file_libinfo``.
        cdnjs_git_path: Path of the cdnjs git checkout; passed through to
            ``get_file_libinfo``.
        filename: Path of the file to process. If it is not an existing
            regular file it is treated as deleted and skipped.

    Returns:
        None. The function only logs and (optionally) writes CSV.
    """
    if not os.path.isfile(filename):
        logging.info("Skipping update for deleted file %s", filename)
        return

    logging.info("Updating database for file %s", filename)
    file_info = get_file_libinfo(release_dic, cdnjs_git_path, filename)
    if file_info is None:
        return

    if not create_csv:
        logging.info("Updating database (TODO) ...")
        return

    # Diagnostics go through logging rather than print(): standard output
    # carries the CSV stream and must not be interleaved with other text.
    logging.debug("CSV export of %s (repository %s)", file_info['path'],
                  cdnjs_git_path)

    # Strip everything in front of the repository-relative "ajax/" prefix.
    file_info['path'] = re.sub(r'^.*\/ajax\/', 'ajax/', file_info['path'])

    # Digests are presumably raw bytes (they expose .hex()); convert them to
    # text so that they can be written as CSV fields.
    for key in _HASH_KEYS:
        if file_info[key] is not None:
            file_info[key] = (file_info[key]).hex()

    # NOTE(review): a header row is written in front of every record, and
    # this function is mapped over a process pool by update_database, so the
    # output repeats the header once per file -- confirm that consumers of
    # the CSV expect this.
    csv_writer = csv.DictWriter(sys.stdout, file_info.keys())
    csv_writer.writeheader()
    csv_writer.writerow(file_info)
def update_database(create_csv,
                    release_dic,
                    cdnjs_git_path,
                    files,
                    poolsize=16):
    """Run ``update_database_for_file`` on every entry of ``files``.

    The work is distributed over a pool of ``poolsize`` worker processes.
    ``create_csv``, ``release_dic`` and ``cdnjs_git_path`` are forwarded
    unchanged to each per-file call.
    """
    # Bind the arguments shared by all files; the pool supplies the filename.
    process_file = partial(update_database_for_file, create_csv, release_dic,
                           cdnjs_git_path)
    with Pool(poolsize) as workers:
        workers.map(process_file, files)
def get_release_triple(git_path, libver): def get_release_triple(git_path, libver):
@ -335,16 +360,21 @@ def build_release_date_dic(git_path, libvers, poolsize=16):
return release_date_dic return release_date_dic
def pull_and_update_db(cdnjs_git_path, create_csv, poolsize=16):
    """Pull the repository and process the files reported as updated.

    ``create_csv`` is forwarded to ``update_database``; ``poolsize`` is used
    both for building the release-date dictionary and for the update itself.
    """
    updated_files, lib_versions = pull_get_updated_lib_files(cdnjs_git_path)
    release_dates = build_release_date_dic(
        cdnjs_git_path, lib_versions, poolsize)

    # The version list is no longer needed once the release dates are built;
    # drop it and collect before the update starts.
    del lib_versions
    gc.collect()

    update_database(
        create_csv, release_dates, cdnjs_git_path, updated_files, poolsize)
def update_db_all_libs(cdnjs_git_path, taskid=1, maxtaskid=1, poolsize=16): def update_db_all_libs(cdnjs_git_path,
create_csv,
taskid=1,
maxtaskid=1,
poolsize=16):
"""Update database entries for all libs in git repo.""" """Update database entries for all libs in git repo."""
files, libvers = get_all_lib_files(cdnjs_git_path) files, libvers = get_all_lib_files(cdnjs_git_path)
@ -366,4 +396,4 @@ def update_db_all_libs(cdnjs_git_path, taskid=1, maxtaskid=1, poolsize=16):
release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize) release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize)
del libvers del libvers
gc.collect() gc.collect()
update_database(release_dic, cdnjs_git_path, files, poolsize) update_database(create_csv, release_dic, cdnjs_git_path, files, poolsize)

View File

@ -32,15 +32,14 @@ assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
def helpmsg():
    """Print the command-line usage summary to standard output."""
    usage_lines = (
        "cdnjs-git-miner [OPTION]",
        " -i initialize/update database with all libraries in the repository",
        " -u update: pull repository and update database",
        " -p n update n files in parallel",
        " -n <TASKID> process chunk n where n in [1,N]",
        " -N <MAXTASKID> ",
        " -v verbose",
        " -c print csv format to stdout instead of writing to database",
        " -h print this help text",
    )
    for usage_line in usage_lines:
        print(usage_line)
def main(argv): def main(argv):
@ -52,9 +51,10 @@ def main(argv):
parallel_updates = 5 parallel_updates = 5
taskid = 1 taskid = 1
maxtaskid = 1 maxtaskid = 1
csv = False
try: try:
opts, args = getopt.getopt(argv, "hviup:n:N:", opts, args = getopt.getopt(argv, "hvicup:n:N:",
["taskid=", "maxtaskid="]) ["taskid=", "maxtaskid="])
except getopt.GetoptError: except getopt.GetoptError:
helpmsg() helpmsg()
@ -69,6 +69,8 @@ def main(argv):
initialize = True initialize = True
elif opt == '-u': elif opt == '-u':
update = True update = True
elif opt == '-c':
csv = True
elif opt == "-p": elif opt == "-p":
parallel_updates = int(arg) parallel_updates = int(arg)
elif opt in ("-n", "--taskid"): elif opt in ("-n", "--taskid"):
@ -90,9 +92,9 @@ def main(argv):
cdnjs_git_path = os.path.join(os.path.join(basedir, "filedb"), "cdnjs-git") cdnjs_git_path = os.path.join(os.path.join(basedir, "filedb"), "cdnjs-git")
if initialize: if initialize:
update_db_all_libs(cdnjs_git_path, taskid, maxtaskid, parallel_updates) update_db_all_libs(cdnjs_git_path, csv, taskid, maxtaskid, parallel_updates)
if update: if update:
pull_and_update_db(cdnjs_git_path, parallel_updates) pull_and_update_db(cdnjs_git_path, csv, parallel_updates)
if __name__ == "__main__": if __name__ == "__main__":