#!/usr/bin/env python3.5
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import getopt
import os
import sys
import tarfile
import time
import tempfile
import fnmatch
from multiprocessing import Pool
import logging
import datetime
from ExtensionCrawler.archive import update_db_incremental
from ExtensionCrawler.config import *
from ExtensionCrawler.util import log_info, log_warning, log_error, log_exception
def help():
    """Print command-line usage information for the create-db tool.

    NOTE(review): intentionally shadows the builtin help(); kept for
    backward compatibility with existing callers.
    """
    print("create-db [OPTION]")
    print(" -h print this help text")
    print(" -a <DIR> archive directory")
    print(" -p <PREFIX> three-letter-prefix")
    print(" -e <EXTIDFILELIST> file with extension ids")
    print(" -t <THREADS> number of parallel threads")
    print(" -n <TASKID> process chunk n where n in [1,N]")
    print(" -N <MAXTASKID> ")
def process_id(path):
    """Unpack one <extid>.tar archive and import all its snapshots into the DB.

    path: path to a tar archive containing a single top-level <extid>
    directory with one subdirectory per crawl date.
    """
    start = time.time()
    with tempfile.TemporaryDirectory() as tmpdir:
        # NOTE(review): extractall() trusts archive member paths (path
        # traversal risk for untrusted tars) — archives here come from our
        # own crawler, so input is assumed trusted.
        with tarfile.open(path) as t:
            t.extractall(tmpdir)
        # The archive contains exactly one top-level directory named after
        # the extension id — TODO confirm for malformed archives.
        extid = os.listdir(tmpdir)[0]
        log_info("Start processing extension", 0, extid)
        iddir = os.path.join(tmpdir, extid)
        # Apply snapshots in chronological order so each incremental DB
        # update builds on the previous one.
        for date in sorted(os.listdir(iddir)):
            try:
                update_db_incremental(iddir, extid, date)
            except Exception:
                # Best effort: one broken snapshot must not abort the
                # whole extension; log and continue with the next date.
                log_exception(
                    "Exception when handling data from {}".format(date), 0,
                    extid)
        log_info(
            "Finished extension in {}".format(
                str(datetime.timedelta(seconds=int(time.time() - start)))),
            0,
            extid)
2017-08-23 17:04:33 +00:00
def find(archive, pattern):
    """Yield paths of all "<pattern>.tar" files below <archive>/data."""
    data_root = os.path.join(archive, "data")
    for dirpath, _, filenames in os.walk(data_root):
        for name in fnmatch.filter(filenames, pattern + ".tar"):
            yield os.path.join(dirpath, name)
def find_from_file(archive, extidlistfile):
    """Yield tar paths below <archive>/data whose names match an id from a file.

    extidlistfile contains one extension id per line; each id <X> selects
    files matching the fnmatch pattern "<X>.tar".
    """
    with open(extidlistfile, 'r') as handle:
        patterns = [line.strip() + ".tar" for line in handle]
    data_root = os.path.join(archive, "data")
    for dirpath, _, filenames in os.walk(data_root):
        for filename in filenames:
            for pat in patterns:
                if fnmatch.fnmatch(filename, pat):
                    yield os.path.join(dirpath, filename)
def parse_args(argv):
    """Parse command-line options and select this task's share of the work.

    Returns a tuple (paths, parallel): the list of tar archive paths this
    task should process and the number of worker processes to use. When
    -n/-N are given, the full path list is split into MAXTASKID chunks and
    only chunk TASKID (1-based) is returned; the last task also takes any
    remainder. Exits with status 2 on unknown options.
    """
    archive = "archive"
    parallel = 8
    taskid = 1
    maxtaskid = 1
    paths = []
    try:
        # Positional arguments are not used by this tool.
        opts, _ = getopt.getopt(argv, "ha:p:e:t:n:N:", [
            "archive=", "prefix=", "extidlistfile=", "threads=", "taskid=",
            "maxtaskid="
        ])
    except getopt.GetoptError:
        help()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-p", "--prefix"):
            paths += find(archive, arg + "*")
        elif opt in ("-e", "--extidlistfile"):
            paths += find_from_file(archive, arg)
        elif opt in ("-t", "--threads"):
            parallel = int(arg)
        elif opt in ("-n", "--taskid"):
            taskid = int(arg)
        elif opt in ("-N", "--maxtaskid"):
            maxtaskid = int(arg)

    # Without an explicit selection, process every archive.
    if paths == []:
        paths = list(find(archive, "*"))

    # Floor division (was int(a / b)) — same result, clearer intent.
    chunksize = len(paths) // maxtaskid
    if taskid == maxtaskid:
        # Last task picks up the remainder of the division.
        paths = paths[(taskid - 1) * chunksize:]
    else:
        paths = paths[(taskid - 1) * chunksize:taskid * chunksize]
    return paths, parallel
def main(argv):
    """Entry point: configure logging, pick archives, process them in parallel."""
    logging.basicConfig(level=logging.INFO, format=const_log_format())
    paths, parallel = parse_args(argv)
    # One worker process per archive chunk; Pool.map blocks until all finish.
    with Pool(processes=parallel) as p:
        p.map(process_id, paths)


if __name__ == "__main__":
    main(sys.argv[1:])