Implemented skeleton of main routine.

This commit is contained in:
Achim D. Brucker 2017-09-13 02:56:13 +01:00
parent a8a5534be1
commit c30f7fdd7c
2 changed files with 75 additions and 7 deletions

View File

@ -190,3 +190,44 @@ def get_file_libinfo(gitobj, libfile):
return file_info
except Exception:
return None
def pull_get_updated_lib_files(cdnjs_repo):
    """Pull the repository and list the library files that changed.

    The changed paths are obtained from ``pull_get_list_changed_files``.
    A path is kept when its base name is neither ``package.json`` nor
    ``.gitkeep`` and the path itself starts with ``"ajax"``.

    Returns a list of the kept paths, in the order they were reported.
    """
    skipped_names = ("package.json", ".gitkeep")
    return [
        changed
        for changed in pull_get_list_changed_files(cdnjs_repo)
        if os.path.basename(changed) not in skipped_names
        and changed.startswith("ajax")
    ]
def get_all_lib_files(cdnjs_git_path):
    """Return all library files stored in the cdnjs git checkout.

    Walks the ``ajax`` directory below ``cdnjs_git_path`` recursively and
    collects every file except the ``package.json`` and ``.gitkeep``
    bookkeeping files.

    Args:
        cdnjs_git_path: Path of the local cdnjs git working tree.

    Returns:
        A list of file paths, each one joined onto the directory reported by
        ``os.walk`` (so every path is prefixed with ``cdnjs_git_path``).
        The list is empty when the ``ajax`` directory does not exist.
    """
    ignored_names = ("package.json", ".gitkeep")
    lib_files = []
    ajax_root = os.path.join(cdnjs_git_path, "ajax")
    # The per-directory file list must not share a name with the result
    # accumulator: appending to the list that is being iterated would never
    # terminate and would discard the results of earlier directories.
    for dirpath, _dirnames, filenames in os.walk(ajax_root):
        for filename in filenames:
            if filename not in ignored_names:
                lib_files.append(os.path.join(dirpath, filename))
    return lib_files
def update_database(cdnjs_git, files):
    """Update the database for every entry in ``files``.

    Each entry is looked up with ``get_file_libinfo``; entries for which
    that lookup yields ``None`` are skipped. The actual database write is
    not implemented yet, so a placeholder message is printed instead.
    """
    # Candidate for conversion to a parallel map.
    for path in files:
        if get_file_libinfo(cdnjs_git, path) is None:
            continue
        print("TODO: Updating data base: " + path)
def pull_and_update_db(cdnjs_git_path):
    """Pull the cdnjs repository and update the database with the changes.

    Opens the checkout at ``cdnjs_git_path`` both as a ``git.Git`` command
    wrapper and as a ``git.Repo``, determines the updated library files via
    ``pull_get_updated_lib_files`` and hands them to ``update_database``.
    """
    git_cmd = git.Git(cdnjs_git_path)
    updated_files = pull_get_updated_lib_files(git.Repo(cdnjs_git_path))
    update_database(git_cmd, updated_files)
def update_db_all_libs(cdnjs_git_path):
    """Update the database entries for all library files in the git repo.

    Collects every library file below ``cdnjs_git_path`` with
    ``get_all_lib_files`` and passes the list to ``update_database``.
    """
    git_cmd = git.Git(cdnjs_git_path)
    update_database(git_cmd, get_all_lib_files(cdnjs_git_path))

View File

@ -20,8 +20,11 @@
import getopt
import logging
import sys
import os
from ExtensionCrawler.config import (const_log_format, const_basedir)
from ExtensionCrawler.cdnjs_git import (pull_and_update_db, update_db_all_libs)
from ExtensionCrawler.config import const_log_format
# Script should run with python 3.4 or 3.5
assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
@ -30,17 +33,24 @@ assert sys.version_info >= (3, 4) and sys.version_info < (3, 6)
def helpmsg():
    """Print the command-line usage summary to standard output.

    Every supported option (-i, -u, -v, -h) is listed exactly once.
    """
    print("cdnjs-git-miner [OPTION]")
    print(
        " -i initialize database: update database with all libraries in the repository"
    )
    print(
        " -u update: pull repository and update database with new/updated libraries"
    )
    print(" -v verbose")
    print(" -h print this help text")
def main(argv):
"""Main function of the extension crawler."""
basedir = "archive"
verbose = True
force = False
clean = False
basedir = const_basedir()
verbose = False
initialize = False
update = False
try:
opts, args = getopt.getopt(argv, "h")
opts, args = getopt.getopt(argv, "hviu")
except getopt.GetoptError:
helpmsg()
sys.exit(2)
@ -48,6 +58,17 @@ def main(argv):
if opt == '-h':
helpmsg()
sys.exit()
elif opt == '-v':
verbose = True
elif opt == '-i':
initialize = True
elif opt == '-u':
update = True
if verbose:
loglevel = logging.INFO
else:
loglevel = logging.WARNING
logger = logging.getLogger()
ch = logging.StreamHandler(sys.stdout)
@ -55,6 +76,12 @@ def main(argv):
logger.addHandler(ch)
logger.setLevel(loglevel)
cdnjs_git_path = os.path.join(os.path.join(basedir, "filedb"), "cdnjs-git")
if initialize:
update_db_all_libs(cdnjs_git_path)
if update:
pull_and_update_db(cdnjs_git_path)
if __name__ == "__main__":
main(sys.argv[1:])