2017-06-17 16:10:18 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
#
|
|
|
|
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
#
|
|
|
|
|
|
|
|
import getopt
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import tarfile
|
2017-08-27 22:04:38 +00:00
|
|
|
import time
|
2017-06-17 16:10:18 +00:00
|
|
|
import tempfile
|
2017-06-19 15:42:35 +00:00
|
|
|
import traceback
|
2017-08-23 17:04:33 +00:00
|
|
|
import fnmatch
|
|
|
|
from multiprocessing import Pool, Lock
|
|
|
|
from functools import partial
|
2017-06-17 16:10:18 +00:00
|
|
|
|
2017-08-23 17:04:33 +00:00
|
|
|
from ExtensionCrawler.archive import update_sqlite_incremental
|
2017-06-17 16:10:18 +00:00
|
|
|
|
|
|
|
|
|
|
|
def help():
    """Print command-line usage information to stdout."""
    # Emit the usage text as one print; output is identical to printing
    # each line separately.
    usage_lines = [
        "create-db [OPTION] DBBASEDIR",
        " DBBASEDIR directory for generated db files",
        " -h print this help text",
        " -a <DIR> archive directory",
        " -p <PREFIX> three-letter-prefix",
        " -e <EXTIDFILELIST> file with extension ids",
        " -t <THREADS> number of parallel threads",
        " -n <TASKID> process chunk n where n in [1,N]",
        " -N <MAXTASKID> ",
    ]
    print("\n".join(usage_lines))
|
2017-06-20 14:10:32 +00:00
|
|
|
|
|
|
|
|
2017-08-23 17:04:33 +00:00
|
|
|
def guarded_stdout(string):
    """Write *string* to stdout while holding the shared worker lock.

    The lock (installed into this module by init()) keeps the output of
    parallel worker processes from interleaving.
    """
    # 'with' releases the lock even if the write raises; the previous bare
    # acquire()/release() pair would leave the lock held forever on error.
    with lock:
        sys.stdout.write(string)
|
2017-06-20 14:10:32 +00:00
|
|
|
|
2017-08-23 17:04:33 +00:00
|
|
|
|
|
|
|
def guarded_stderr(string):
    """Write *string* to stderr while holding the shared worker lock.

    The lock (installed into this module by init()) keeps the output of
    parallel worker processes from interleaving.
    """
    # 'with' releases the lock even if the write raises; the previous bare
    # acquire()/release() pair would leave the lock held forever on error.
    with lock:
        sys.stderr.write(string)
|
|
|
|
|
|
|
|
|
|
|
|
def process_id(dbbasedir, path):
    """Extract one extension tar archive and rebuild its sqlite database.

    Args:
        dbbasedir: directory where the generated <extid>.sqlite is written.
        path: path to an <extid>.tar file inside the crawler archive tree.
    """
    # time.perf_counter() replaces time.clock(), which was deprecated in
    # Python 3.3 and removed in 3.8.
    start = time.perf_counter()

    with tempfile.TemporaryDirectory() as tmpdir:
        with tarfile.open(path) as t:
            # NOTE(review): extractall() is unsafe on untrusted tars (path
            # traversal via "../" members); archives here are assumed to be
            # produced by the crawler itself.
            t.extractall(tmpdir)

        # The tar is expected to contain a single top-level directory named
        # after the extension id — TODO confirm against archive layout.
        extid = os.listdir(tmpdir)[0]
        guarded_stdout("Processing {}\n".format(extid))

        dbpath = os.path.join(dbbasedir, extid + ".sqlite")
        # Rebuild from scratch: drop any previously generated database.
        if os.path.exists(dbpath):
            os.remove(dbpath)

        iddir = os.path.join(tmpdir, extid)

        # Replay every crawl date in chronological order.
        for date in sorted(os.listdir(iddir)):
            try:
                update_sqlite_incremental(dbpath, iddir, extid, date, True,
                                          "")
            except Exception:
                # A bad date must not abort the whole extension; report the
                # full traceback and continue with the next date.
                guarded_stderr("Exception when handling {} on {}:\n{}\n".
                               format(extid, date, traceback.format_exc()))

    guarded_stdout("Finished {} in {}s\n".format(extid, time.perf_counter() - start))
|
2017-06-20 14:10:32 +00:00
|
|
|
|
|
|
|
|
2017-08-23 17:04:33 +00:00
|
|
|
def find(archive, pattern):
    """Yield paths of all tar files under <archive>/data whose basename
    matches pattern + ".tar"."""
    data_root = os.path.join(archive, "data")
    tar_pattern = pattern + ".tar"
    for dirpath, _, filenames in os.walk(data_root):
        # fnmatch.filter applies the same normalization as fnmatch.fnmatch
        # and preserves the order of filenames.
        for name in fnmatch.filter(filenames, tar_pattern):
            yield os.path.join(dirpath, name)
|
2017-06-17 16:10:18 +00:00
|
|
|
|
|
|
|
|
2017-08-23 17:04:33 +00:00
|
|
|
def find_from_file(archive, extidlistfile):
    """Yield tar paths under <archive>/data for every extension id listed
    (one id per line) in extidlistfile."""
    with open(extidlistfile, 'r') as handle:
        extids = [line.strip() for line in handle]

    data_root = os.path.join(archive, "data")
    for dirpath, _, filenames in os.walk(data_root):
        for filename in filenames:
            # A file is yielded once per id that matches it, mirroring the
            # id-by-id matching order.
            for extid in extids:
                if fnmatch.fnmatch(filename, extid + ".tar"):
                    yield os.path.join(dirpath, filename)
|
|
|
|
|
|
|
|
|
|
|
|
def init(l):
    # Pool initializer (multiprocessing): runs once in each worker process.
    # Stores the shared Lock in a module-level global so guarded_stdout /
    # guarded_stderr can serialize output across workers.
    global lock
    lock = l
|
|
|
|
|
|
|
|
|
|
|
|
def parse_args(argv):
    """Parse command-line arguments.

    Args:
        argv: argument list (without the program name).

    Returns:
        (dbbasedir, paths, parallel): the output directory for generated
        db files, the list of tar paths belonging to this task's chunk,
        and the number of worker processes.

    Exits with status 2 on bad options or a missing DBBASEDIR argument,
    and with status 0 after printing help for -h.
    """
    archive = "archive"
    parallel = 8
    taskid = 1
    maxtaskid = 1

    paths = []

    try:
        opts, args = getopt.getopt(argv, "ha:p:e:t:n:N:", [
            "archive=", "prefix=", "extidlistfile=", "threads=", "taskid=",
            "maxtaskid="
        ])
    except getopt.GetoptError:
        help()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-p", "--prefix"):
            # Select all extensions whose id starts with the given prefix.
            paths += find(archive, arg + "*")
        elif opt in ("-e", "--extidlistfile"):
            paths += find_from_file(archive, arg)
        elif opt in ("-t", "--threads"):
            parallel = int(arg)
        elif opt in ("-n", "--taskid"):
            taskid = int(arg)
        elif opt in ("-N", "--maxtaskid"):
            maxtaskid = int(arg)

    if len(args) < 1:
        help()
        sys.exit(2)

    dbbasedir = args[0]

    # Explicit tar paths may also be given as extra positional arguments.
    paths += args[1:]
    if not paths:
        # Nothing selected: process the whole archive.
        paths = list(find(archive, "*"))

    # Split the work into maxtaskid chunks and keep only this task's chunk.
    # Floor division replaces int(len(paths) / maxtaskid): identical for
    # these non-negative values but avoids the float round trip, which
    # could lose precision for very large path lists.
    chunksize = len(paths) // maxtaskid
    if taskid == maxtaskid:
        # The last task also picks up the division remainder.
        paths = paths[(taskid - 1) * chunksize:]
    else:
        paths = paths[(taskid - 1) * chunksize:taskid * chunksize]

    return dbbasedir, paths, parallel
|
|
|
|
|
|
|
|
|
|
|
|
def main(argv):
    """Entry point: rebuild one sqlite database per selected tar archive,
    distributing the work over a process pool."""
    dbbasedir, paths, parallel = parse_args(argv)

    # Each worker process receives the shared lock through init() so that
    # its log output does not interleave with other workers'.
    worker = partial(process_id, dbbasedir)
    with Pool(initializer=init, initargs=(Lock(), ), processes=parallel) as pool:
        pool.map(worker, paths)
|
2017-06-17 16:10:18 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Drop argv[0] (the script name) before handing over to main().
    main(sys.argv[1:])
|