#!/usr/bin/env python3.5
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import getopt
import os
import sys
import tarfile
import time
import tempfile
import fnmatch
from multiprocessing import Pool
import logging
import datetime
from ExtensionCrawler.archive import update_db_incremental
from ExtensionCrawler.config import *
from ExtensionCrawler.util import log_info, log_warning, log_error, log_exception
def help():
    """Print command-line usage information for the create-db tool.

    NOTE(review): intentionally shadows the builtin help(); kept for
    backward compatibility with existing callers.
    """
    print("create-db [OPTION]")
    print(" -h print this help text")
    print(" -a <DIR> archive directory")
    print(" -p <PREFIX> three-letter-prefix")
    print(" -e <EXTIDFILELIST> file with extension ids")
    print(" -t <THREADS> number of parallel threads")
    print(" -n <TASKID> process chunk n where n in [1,N]")
    print(" -N <MAXTASKID> ")
def process_id(path):
    """Unpack one <extid>.tar archive and import all its snapshots into the DB.

    path: path to a tar archive containing a single top-level <extid>
    directory with one subdirectory per crawl date.
    """
    start = time.time()
    with tempfile.TemporaryDirectory() as tmpdir:
        # NOTE(review): extractall() trusts archive member paths (path
        # traversal risk for untrusted tars) — archives here come from our
        # own crawler, so input is assumed trusted.
        with tarfile.open(path) as t:
            t.extractall(tmpdir)
        # The archive contains exactly one top-level directory named after
        # the extension id — TODO confirm for malformed archives.
        extid = os.listdir(tmpdir)[0]
        log_info("Start processing extension", 0, extid)
        iddir = os.path.join(tmpdir, extid)
        # Apply snapshots in chronological order so each incremental DB
        # update builds on the previous one.
        for date in sorted(os.listdir(iddir)):
            try:
                update_db_incremental(iddir, extid, date)
            except Exception:
                # Best effort: one broken snapshot must not abort the
                # whole extension; log and continue with the next date.
                log_exception(
                    "Exception when handling data from {}".format(date), 0,
                    extid)
        log_info(
            "Finished extension in {}".format(
                str(datetime.timedelta(seconds=int(time.time() - start)))),
            0,
            extid)
2017-08-23 17:04:33 +00:00
def find(archive, pattern):
    """Yield paths of all "<pattern>.tar" files below <archive>/data."""
    data_root = os.path.join(archive, "data")
    for dirpath, _, filenames in os.walk(data_root):
        for name in fnmatch.filter(filenames, pattern + ".tar"):
            yield os.path.join(dirpath, name)
def find_from_file(archive, extidlistfile):
    """Yield tar paths below <archive>/data whose names match an id from a file.

    extidlistfile contains one extension id per line; each id <X> selects
    files matching the fnmatch pattern "<X>.tar".
    """
    with open(extidlistfile, 'r') as handle:
        patterns = [line.strip() + ".tar" for line in handle]
    data_root = os.path.join(archive, "data")
    for dirpath, _, filenames in os.walk(data_root):
        for filename in filenames:
            for pat in patterns:
                if fnmatch.fnmatch(filename, pat):
                    yield os.path.join(dirpath, filename)
def parse_args(argv):
    """Parse command-line options and select this task's share of the work.

    Returns a tuple (paths, parallel): the list of tar archive paths this
    task should process and the number of worker processes to use. When
    -n/-N are given, the full path list is split into MAXTASKID chunks and
    only chunk TASKID (1-based) is returned; the last task also takes any
    remainder. Exits with status 2 on unknown options.
    """
    archive = "archive"
    parallel = 8
    taskid = 1
    maxtaskid = 1
    paths = []
    try:
        # Positional arguments are not used by this tool.
        opts, _ = getopt.getopt(argv, "ha:p:e:t:n:N:", [
            "archive=", "prefix=", "extidlistfile=", "threads=", "taskid=",
            "maxtaskid="
        ])
    except getopt.GetoptError:
        help()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-p", "--prefix"):
            paths += find(archive, arg + "*")
        elif opt in ("-e", "--extidlistfile"):
            paths += find_from_file(archive, arg)
        elif opt in ("-t", "--threads"):
            parallel = int(arg)
        elif opt in ("-n", "--taskid"):
            taskid = int(arg)
        elif opt in ("-N", "--maxtaskid"):
            maxtaskid = int(arg)

    # Without an explicit selection, process every archive.
    if paths == []:
        paths = list(find(archive, "*"))

    # Floor division (was int(a / b)) — same result, clearer intent.
    chunksize = len(paths) // maxtaskid
    if taskid == maxtaskid:
        # Last task picks up the remainder of the division.
        paths = paths[(taskid - 1) * chunksize:]
    else:
        paths = paths[(taskid - 1) * chunksize:taskid * chunksize]
    return paths, parallel
def main(argv):
    """Entry point: configure logging, pick archives, process them in parallel."""
    logging.basicConfig(level=logging.INFO, format=const_log_format())
    paths, parallel = parse_args(argv)
    # One worker process per archive chunk; Pool.map blocks until all finish.
    with Pool(processes=parallel) as p:
        p.map(process_id, paths)


if __name__ == "__main__":
    main(sys.argv[1:])