Removed extfind.py - superseeded by new hpc scripts.
This commit is contained in:
parent
4ffc51e6b9
commit
49bb2d4690
110
extfind
110
extfind
|
@ -1,110 +0,0 @@
|
|||
#!/usr/bin/env python3.7
|
||||
#
|
||||
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
import getopt
|
||||
import glob
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
import re
|
||||
|
||||
from ExtensionCrawler import config
|
||||
|
||||
|
||||
def print_help():
|
||||
print("""extfind [OPTION]""")
|
||||
print(""" -h print this help text""")
|
||||
print(""" -a <DIR> archive directory""")
|
||||
print(""" -g <GLOB> glob on the extension id, don't use with -e """)
|
||||
print(""" -e <EXTIDFILELIST> file with extension ids, don't use with -g""")
|
||||
print(""" -n <TASKID> process chunk n where n in [1,N]""")
|
||||
print(""" -N <MAXTASKID> """)
|
||||
|
||||
|
||||
def split(l, n, N):
|
||||
if n < 1 or n > N:
|
||||
raise ValueError("n must be between 1 and N")
|
||||
chunksize = int(len(l) / N) + 1
|
||||
|
||||
# Slicing beyond the list contents returns the empty list
|
||||
return l[chunksize * (n - 1):chunksize * n]
|
||||
|
||||
|
||||
def iter_extension_paths_from_file(archive, n, N, extidlistfile):
|
||||
paths = []
|
||||
with open(extidlistfile, 'r') as f:
|
||||
for line in f.readlines():
|
||||
path = os.path.join(archive, "data", line[:3], line + ".tar")
|
||||
if re.fullmatch("[a-p]{32}", line) and os.path.exists(path):
|
||||
paths += [path]
|
||||
else:
|
||||
logging.warning("WARNING: {} is not a valid extension path!".format(path))
|
||||
return split(paths, n, N)
|
||||
|
||||
|
||||
def iter_extension_paths(archive, n, N, extidglob="[a-p]"*32):
|
||||
paths = glob.glob(os.path.join(archive, "data", "[a-p]" * 3, extidglob + ".tar"))
|
||||
return split(paths, n, N)
|
||||
|
||||
|
||||
def main(argv):
|
||||
archive = config.const_basedir()
|
||||
extidglob = None
|
||||
extidlistfile = None
|
||||
taskid = 1
|
||||
maxtaskid = 1
|
||||
|
||||
try:
|
||||
opts, args = getopt.getopt(argv, "ha:g:e:n:N:", [
|
||||
"archive=", "glob=", "extidlistfile=", "taskid=",
|
||||
"maxtaskid=", "help"
|
||||
])
|
||||
except getopt.GetoptError:
|
||||
print_help()
|
||||
sys.exit(2)
|
||||
for opt, arg in opts:
|
||||
if opt in ("-h", "--help"):
|
||||
print_help()
|
||||
sys.exit()
|
||||
elif opt in ("-a", "--archive"):
|
||||
archive = arg
|
||||
elif opt in ("-g", "--glob"):
|
||||
extidglob = arg
|
||||
elif opt in ("-e", "--extidlistfile"):
|
||||
extidlistfile = arg
|
||||
elif opt in ("-n", "--taskid"):
|
||||
taskid = int(arg)
|
||||
elif opt in ("-N", "--maxtaskid"):
|
||||
maxtaskid = int(arg)
|
||||
|
||||
if extidglob is None and extidlistfile is None:
|
||||
paths = iter_extension_paths(archive, taskid, maxtaskid)
|
||||
elif extidglob is None and extidlistfile is not None:
|
||||
paths = iter_extension_paths_from_file(archive, taskid, maxtaskid, extidlistfile)
|
||||
elif extidglob is not None and extidlistfile is None:
|
||||
paths = iter_extension_paths(archive, taskid, maxtaskid, extidglob)
|
||||
else:
|
||||
print_help()
|
||||
sys.exit(2)
|
||||
|
||||
for path in paths:
|
||||
print(path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(sys.argv[1:])
|
|
@ -1 +0,0 @@
|
|||
extfind
|
Loading…
Reference in New Issue