Added csv output for debugging.
This commit is contained in:
parent
de6dde5269
commit
c274b96f66
|
@ -29,6 +29,8 @@ import zlib
|
||||||
from functools import partial, reduce
|
from functools import partial, reduce
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
|
||||||
import cchardet as chardet
|
import cchardet as chardet
|
||||||
import dateutil.parser
|
import dateutil.parser
|
||||||
|
@ -296,23 +298,46 @@ def get_all_lib_files(cdnjs_git_path):
|
||||||
return files, list(libvers)
|
return files, list(libvers)
|
||||||
|
|
||||||
|
|
||||||
def _file_info_to_csv_row(file_info):
    """Return a copy of ``file_info`` prepared for CSV output.

    The ``path`` entry is rewritten so that everything up to and including
    the first ``/ajax/`` component is replaced by ``ajax/``, and every digest
    entry that is present and not ``None`` is replaced by its ``.hex()``
    representation (presumably the digests are ``bytes`` -- TODO confirm
    against ``get_file_libinfo``).  The input mapping is left unmodified.
    """
    row = dict(file_info)
    row['path'] = re.sub(r'^.*\/ajax\/', 'ajax/', row['path'])
    for key in ('md5', 'sha1', 'sha256', 'normalized_md5',
                'normalized_sha1', 'normalized_sha256',
                'dec_normalized_md5', 'dec_normalized_sha1',
                'dec_normalized_sha256', 'dec_md5', 'dec_sha1',
                'dec_sha256'):
        # .get(): a missing digest key is treated like a None digest instead
        # of aborting the whole record with a KeyError.
        if row.get(key) is not None:
            row[key] = row[key].hex()
    return row


def update_database_for_file(create_csv, release_dic, cdnjs_git_path,
                             filename):
    """Update the database (or emit a CSV record) for a single file.

    Args:
        create_csv: if true, write the file's record as CSV to stdout
            instead of updating the database.
        release_dic: passed through unchanged to ``get_file_libinfo``.
        cdnjs_git_path: passed through unchanged to ``get_file_libinfo``.
        filename: path of the file to process; if it is not an existing
            regular file, the file is skipped.

    Returns:
        None.
    """
    if not os.path.isfile(filename):
        logging.info("Skipping update for deleted file " + filename)
        return

    logging.info("Updating database for file " + filename)
    file_info = get_file_libinfo(release_dic, cdnjs_git_path, filename)
    if file_info is None:
        return

    if not create_csv:
        logging.info("Updating database (TODO) ...")
        return

    # Diagnostics go to the log, not to stdout: stdout carries the CSV data
    # and must not be polluted with free-form text.
    logging.debug("CSV export for %s (repository %s)", file_info['path'],
                  cdnjs_git_path)

    # Render header and row into memory first and hand them to stdout in a
    # single write, so the two lines of one record stay together.
    # NOTE(review): a header line is still emitted for every record, as
    # before; emitting it only once needs coordination in the caller.
    buf = StringIO()
    csv_writer = csv.DictWriter(buf, list(file_info.keys()))
    csv_writer.writeheader()
    csv_writer.writerow(_file_info_to_csv_row(file_info))
    sys.stdout.write(buf.getvalue())
    # Flush explicitly: this function is run in pool worker processes, which
    # may be stopped before their buffered stdout is written out.
    sys.stdout.flush()
|
||||||
|
|
||||||
def update_database(create_csv,
                    release_dic,
                    cdnjs_git_path,
                    files,
                    poolsize=16):
    """Run ``update_database_for_file`` for every entry of ``files``.

    The work is distributed over a pool of ``poolsize`` worker processes.

    Args:
        create_csv: forwarded to ``update_database_for_file``.
        release_dic: forwarded to ``update_database_for_file``.
        cdnjs_git_path: forwarded to ``update_database_for_file``.
        files: iterable of file names to process.
        poolsize: number of worker processes.

    Returns:
        None.
    """
    # Nothing to do: do not spawn a whole process pool for an empty input.
    if hasattr(files, '__len__') and len(files) == 0:
        return

    worker = partial(update_database_for_file, create_csv, release_dic,
                     cdnjs_git_path)
    with Pool(poolsize) as pool:
        pool.map(worker, files)
        # Leaving the ``with`` block calls pool.terminate(), which stops the
        # workers immediately.  Close and join first so that, on the success
        # path, workers exit normally and can write out whatever they still
        # have buffered (e.g. CSV records on stdout).
        pool.close()
        pool.join()
|
||||||
|
|
||||||
|
|
||||||
def get_release_triple(git_path, libver):
|
def get_release_triple(git_path, libver):
|
||||||
|
@ -335,16 +360,21 @@ def build_release_date_dic(git_path, libvers, poolsize=16):
|
||||||
return release_date_dic
|
return release_date_dic
|
||||||
|
|
||||||
|
|
||||||
def pull_and_update_db(cdnjs_git_path, create_csv, poolsize=16):
    """Pull the repository and process the files reported as updated.

    Args:
        cdnjs_git_path: path handed to ``pull_get_updated_lib_files``,
            ``build_release_date_dic`` and ``update_database``.
        create_csv: forwarded to ``update_database``.
        poolsize: forwarded to ``build_release_date_dic`` and
            ``update_database``.
    """
    changed_files, libvers = pull_get_updated_lib_files(cdnjs_git_path)
    release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize)

    # The library/version list is not needed any more once the release
    # dictionary exists; drop it and collect before the update step starts.
    del libvers
    gc.collect()

    update_database(
        create_csv,
        release_dic,
        cdnjs_git_path,
        changed_files,
        poolsize,
    )
|
||||||
|
|
||||||
|
|
||||||
def update_db_all_libs(cdnjs_git_path, taskid=1, maxtaskid=1, poolsize=16):
|
def update_db_all_libs(cdnjs_git_path,
|
||||||
|
create_csv,
|
||||||
|
taskid=1,
|
||||||
|
maxtaskid=1,
|
||||||
|
poolsize=16):
|
||||||
"""Update database entries for all libs in git repo."""
|
"""Update database entries for all libs in git repo."""
|
||||||
files, libvers = get_all_lib_files(cdnjs_git_path)
|
files, libvers = get_all_lib_files(cdnjs_git_path)
|
||||||
|
|
||||||
|
@ -366,4 +396,4 @@ def update_db_all_libs(cdnjs_git_path, taskid=1, maxtaskid=1, poolsize=16):
|
||||||
release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize)
|
release_dic = build_release_date_dic(cdnjs_git_path, libvers, poolsize)
|
||||||
del libvers
|
del libvers
|
||||||
gc.collect()
|
gc.collect()
|
||||||
update_database(release_dic, cdnjs_git_path, files, poolsize)
|
update_database(create_csv, release_dic, cdnjs_git_path, files, poolsize)
|
||||||
|
|
|
@ -32,15 +32,14 @@ assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
|
||||||
def helpmsg():
    """Print the command-line usage summary to stdout.

    The first line is the program synopsis; it is followed by one line per
    supported option.
    """
    usage = (
        "cdnjs-git-miner [OPTION]",
        " -i initialize/update database with all libraries in the repository",
        " -u update: pull repository and update database",
        " -p n update n files in parallel",
        " -n <TASKID> process chunk n where n in [1,N]",
        # -N used to be listed without any description; N is the upper
        # bound of the chunk index accepted by -n.
        " -N <MAXTASKID> total number of chunks N",
        " -v verbose",
        " -c print csv format to stdout instead of writing to database",
        " -h print this help text",
    )
    for line in usage:
        print(line)
|
|
||||||
|
|
||||||
|
|
||||||
def main(argv):
|
def main(argv):
|
||||||
|
@ -52,9 +51,10 @@ def main(argv):
|
||||||
parallel_updates = 5
|
parallel_updates = 5
|
||||||
taskid = 1
|
taskid = 1
|
||||||
maxtaskid = 1
|
maxtaskid = 1
|
||||||
|
csv = False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
opts, args = getopt.getopt(argv, "hviup:n:N:",
|
opts, args = getopt.getopt(argv, "hvicup:n:N:",
|
||||||
["taskid=", "maxtaskid="])
|
["taskid=", "maxtaskid="])
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
helpmsg()
|
helpmsg()
|
||||||
|
@ -69,6 +69,8 @@ def main(argv):
|
||||||
initialize = True
|
initialize = True
|
||||||
elif opt == '-u':
|
elif opt == '-u':
|
||||||
update = True
|
update = True
|
||||||
|
elif opt == '-c':
|
||||||
|
csv = True
|
||||||
elif opt == "-p":
|
elif opt == "-p":
|
||||||
parallel_updates = int(arg)
|
parallel_updates = int(arg)
|
||||||
elif opt in ("-n", "--taskid"):
|
elif opt in ("-n", "--taskid"):
|
||||||
|
@ -90,9 +92,9 @@ def main(argv):
|
||||||
cdnjs_git_path = os.path.join(os.path.join(basedir, "filedb"), "cdnjs-git")
|
cdnjs_git_path = os.path.join(os.path.join(basedir, "filedb"), "cdnjs-git")
|
||||||
|
|
||||||
if initialize:
|
if initialize:
|
||||||
update_db_all_libs(cdnjs_git_path, taskid, maxtaskid, parallel_updates)
|
update_db_all_libs(cdnjs_git_path, csv, taskid, maxtaskid, parallel_updates)
|
||||||
if update:
|
if update:
|
||||||
pull_and_update_db(cdnjs_git_path, parallel_updates)
|
pull_and_update_db(cdnjs_git_path, csv, parallel_updates)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Loading…
Reference in New Issue